OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_codestream_vsx.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2022, Aous Naman
6// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2022, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_codestream_vsx.cpp
34// Author: Aous Naman
35// Date: 15 May 2022
36//***************************************************************************/
37
38#include <climits>
39#include <cstddef>
40#include "ojph_simd_vsx.h"
41
42#include "ojph_defs.h"
43
44namespace ojph {
45 namespace local {
46
48 void vsx_mem_clear(void* addr, size_t count)
49 {
50 v128_t zero = vsx_i32x4_splat(0);
51 for (size_t i = 0; i < count; i += 16, addr = (char*)addr + 16)
52 vsx_v128_store(addr, zero);
53 }
54
57 {
58 v128_t x1, x0 = vsx_v128_load(address);
59 x1 = vsx_i32x4_shuffle(x0, x0, 2, 3, 2, 3); // x1 = x0[2,3,2,3]
60 x0 = vsx_v128_or(x0, x1);
61 x1 = vsx_i32x4_shuffle(x0, x0, 1, 1, 1, 1); // x1 = x0[1,1,1,1]
62 x0 = vsx_v128_or(x0, x1);
64 return t;
65 }
66
69 {
70 v128_t x1, x0 = vsx_v128_load(address);
71 x1 = vsx_i64x2_shuffle(x0, x0, 1, 1); // x1 = x0[2,3,2,3]
72 x0 = vsx_v128_or(x0, x1);
74 return t;
75 }
76
78 void vsx_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
79 float delta_inv, ui32 count, ui32* max_val)
80 {
81 ojph_unused(delta_inv);
82
83 // convert to sign and magnitude and keep max_val
84 ui32 shift = 31 - K_max;
85 v128_t m0 = vsx_i32x4_splat(INT_MIN);
86 v128_t zero = vsx_i32x4_splat(0);
87 v128_t one = vsx_i32x4_splat(1);
88 v128_t tmax = vsx_v128_load(max_val);
89 si32 *p = (si32*)sp;
90 for ( ; count >= 4; count -= 4, p += 4, dp += 4)
91 {
92 v128_t v = vsx_v128_load(p);
93 v128_t sign = vsx_i32x4_lt(v, zero);
94 v128_t val = vsx_v128_xor(v, sign); // negate 1's complement
95 v128_t ones = vsx_v128_and(sign, one);
96 val = vsx_i32x4_add(val, ones); // 2's complement
97 sign = vsx_v128_and(sign, m0);
98 val = vsx_i32x4_shl(val, shift);
99 tmax = vsx_v128_or(tmax, val);
100 val = vsx_v128_or(val, sign);
101 vsx_v128_store(dp, val);
102 }
103 if (count)
104 {
105 v128_t v = vsx_v128_load(p);
106 v128_t sign = vsx_i32x4_lt(v, zero);
107 v128_t val = vsx_v128_xor(v, sign); // negate 1's complement
108 v128_t ones = vsx_v128_and(sign, one);
109 val = vsx_i32x4_add(val, ones); // 2's complement
110 sign = vsx_v128_and(sign, m0);
111 val = vsx_i32x4_shl(val, shift);
112
113 v128_t c = vsx_i32x4_splat((si32)count);
114 v128_t idx = vsx_i32x4_make(0, 1, 2, 3);
115 v128_t mask = vsx_i32x4_gt(c, idx);
116 c = vsx_v128_and(val, mask);
117 tmax = vsx_v128_or(tmax, c);
118
119 val = vsx_v128_or(val, sign);
120 vsx_v128_store(dp, val);
121 }
122 vsx_v128_store(max_val, tmax);
123 }
124
126 void vsx_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
127 float delta_inv, ui32 count, ui32* max_val)
128 {
129 ojph_unused(K_max);
130
131 //quantize and convert to sign and magnitude and keep max_val
132
133 v128_t d = vsx_f32x4_splat(delta_inv);
134 v128_t zero = vsx_i32x4_splat(0);
135 v128_t one = vsx_i32x4_splat(1);
136 v128_t tmax = vsx_v128_load(max_val);
137 float *p = (float*)sp;
138 for ( ; count >= 4; count -= 4, p += 4, dp += 4)
139 {
140 v128_t vf = vsx_v128_load(p);
141 vf = vsx_f32x4_mul(vf, d); // multiply
142 v128_t val = vsx_i32x4_trunc_sat_f32x4(vf); // convert to signed int
143 v128_t sign = vsx_i32x4_lt(val, zero); // get sign
144 val = vsx_v128_xor(val, sign); // negate 1's complement
145 v128_t ones = vsx_v128_and(sign, one);
146 val = vsx_i32x4_add(val, ones); // 2's complement
147 tmax = vsx_v128_or(tmax, val);
148 sign = vsx_i32x4_shl(sign, 31);
149 val = vsx_v128_or(val, sign);
150 vsx_v128_store(dp, val);
151 }
152 if (count)
153 {
154 v128_t vf = vsx_v128_load(p);
155 vf = vsx_f32x4_mul(vf, d); // multiply
156 v128_t val = vsx_i32x4_trunc_sat_f32x4(vf); // convert to signed int
157 v128_t sign = vsx_i32x4_lt(val, zero); // get sign
158 val = vsx_v128_xor(val, sign); // negate 1's complement
159 v128_t ones = vsx_v128_and(sign, one);
160 val = vsx_i32x4_add(val, ones); // 2's complement
161
162 v128_t c = vsx_i32x4_splat((si32)count);
163 v128_t idx = vsx_i32x4_make(0, 1, 2, 3);
164 v128_t mask = vsx_i32x4_gt(c, idx);
165 c = vsx_v128_and(val, mask);
166 tmax = vsx_v128_or(tmax, c);
167
168 sign = vsx_i32x4_shl(sign, 31);
169 val = vsx_v128_or(val, sign);
170 vsx_v128_store(dp, val);
171 }
172 vsx_v128_store(max_val, tmax);
173 }
174
176 void vsx_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
177 float delta, ui32 count)
178 {
179 ojph_unused(delta);
180 ui32 shift = 31 - K_max;
181 v128_t m1 = vsx_i32x4_splat(INT_MAX);
182 v128_t zero = vsx_i32x4_splat(0);
183 v128_t one = vsx_i32x4_splat(1);
184 si32 *p = (si32*)dp;
185 for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
186 {
187 v128_t v = vsx_v128_load((v128_t*)sp);
188 v128_t val = vsx_v128_and(v, m1);
189 val = vsx_i32x4_shr(val, shift);
190 v128_t sign = vsx_i32x4_lt(v, zero);
191 val = vsx_v128_xor(val, sign); // negate 1's complement
192 v128_t ones = vsx_v128_and(sign, one);
193 val = vsx_i32x4_add(val, ones); // 2's complement
194 vsx_v128_store(p, val);
195 }
196 }
197
199 void vsx_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
200 float delta, ui32 count)
201 {
202 ojph_unused(K_max);
203 v128_t m1 = vsx_i32x4_splat(INT_MAX);
204 v128_t d = vsx_f32x4_splat(delta);
205 float *p = (float*)dp;
206 for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
207 {
208 v128_t v = vsx_v128_load((v128_t*)sp);
209 v128_t vali = vsx_v128_and(v, m1);
210 v128_t valf = vsx_f32x4_convert_i32x4(vali);
211 valf = vsx_f32x4_mul(valf, d);
212 v128_t sign = vsx_v128_andnot(v, m1);
213 valf = vsx_v128_or(valf, sign);
214 vsx_v128_store(p, valf);
215 }
216 }
217
219 void vsx_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
220 float delta_inv, ui32 count, ui64* max_val)
221 {
222 ojph_unused(delta_inv);
223
224 // convert to sign and magnitude and keep max_val
225 ui32 shift = 63 - K_max;
226 v128_t m0 = vsx_i64x2_splat(LLONG_MIN);
227 v128_t zero = vsx_i64x2_splat(0);
228 v128_t one = vsx_i64x2_splat(1);
229 v128_t tmax = vsx_v128_load(max_val);
230 si64 *p = (si64*)sp;
231 for ( ; count >= 2; count -= 2, p += 2, dp += 2)
232 {
233 v128_t v = vsx_v128_load(p);
234 v128_t sign = vsx_i64x2_lt(v, zero);
235 v128_t val = vsx_v128_xor(v, sign); // negate 1's complement
236 v128_t ones = vsx_v128_and(sign, one);
237 val = vsx_i64x2_add(val, ones); // 2's complement
238 sign = vsx_v128_and(sign, m0);
239 val = vsx_i64x2_shl(val, shift);
240 tmax = vsx_v128_or(tmax, val);
241 val = vsx_v128_or(val, sign);
242 vsx_v128_store(dp, val);
243 }
244 if (count)
245 {
246 v128_t v = vsx_v128_load(p);
247 v128_t sign = vsx_i64x2_lt(v, zero);
248 v128_t val = vsx_v128_xor(v, sign); // negate 1's complement
249 v128_t ones = vsx_v128_and(sign, one);
250 val = vsx_i64x2_add(val, ones); // 2's complement
251 sign = vsx_v128_and(sign, m0);
252 val = vsx_i64x2_shl(val, shift);
253
254 v128_t c = vsx_i32x4_make((si32)0xFFFFFFFF, (si32)0xFFFFFFFF, 0, 0);
255 c = vsx_v128_and(val, c);
256 tmax = vsx_v128_or(tmax, c);
257
258 val = vsx_v128_or(val, sign);
259 vsx_v128_store(dp, val);
260 }
261
262 vsx_v128_store(max_val, tmax);
263 }
264
266 void vsx_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
267 float delta, ui32 count)
268 {
269 ojph_unused(delta);
270 ui32 shift = 63 - K_max;
271 v128_t m1 = vsx_i64x2_splat(LLONG_MAX);
272 v128_t zero = vsx_i64x2_splat(0);
273 v128_t one = vsx_i64x2_splat(1);
274 si64 *p = (si64*)dp;
275 for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2)
276 {
277 v128_t v = vsx_v128_load((v128_t*)sp);
278 v128_t val = vsx_v128_and(v, m1);
279 val = vsx_i64x2_shr(val, shift);
280 v128_t sign = vsx_i64x2_lt(v, zero);
281 val = vsx_v128_xor(val, sign); // negate 1's complement
282 v128_t ones = vsx_v128_and(sign, one);
283 val = vsx_i64x2_add(val, ones); // 2's complement
284 vsx_v128_store(p, val);
285 }
286 }
287 }
288}
ui32 vsx_find_max_val32(ui32 *address)
void vsx_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, float delta_inv, ui32 count, ui64 *max_val)
void vsx_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
ui64 vsx_find_max_val64(ui64 *address)
void vsx_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
void vsx_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void vsx_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void vsx_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
void vsx_mem_clear(void *addr, size_t count)
int64_t si64
Definition ojph_defs.h:57
uint64_t ui64
Definition ojph_defs.h:56
int32_t si32
Definition ojph_defs.h:55
uint32_t ui32
Definition ojph_defs.h:54
#define ojph_unused(x)
Definition ojph_defs.h:78
static v128_t vsx_i32x4_make(int a, int b, int c, int d)
static v128_t vsx_f32x4_mul(v128_t a, v128_t b)
static v128_t vsx_i64x2_lt(v128_t a, v128_t b)
static v128_t vsx_f32x4_convert_i32x4(v128_t a)
static v128_t vsx_v128_xor(v128_t a, v128_t b)
static v128_t vsx_f32x4_splat(float x)
#define vsx_i64x2_extract_lane(a, i)
static v128_t vsx_i32x4_shl(v128_t a, int n)
static v128_t vsx_i32x4_add(v128_t a, v128_t b)
__vector unsigned char v128_t
static v128_t vsx_v128_andnot(v128_t a, v128_t b)
static void vsx_v128_store(void *p, v128_t a)
static v128_t vsx_v128_and(v128_t a, v128_t b)
static v128_t vsx_i64x2_shl(v128_t a, int n)
static v128_t vsx_v128_or(v128_t a, v128_t b)
static v128_t vsx_i32x4_lt(v128_t a, v128_t b)
#define vsx_i64x2_shuffle(a, b, c0, c1)
#define vsx_i32x4_shuffle(a, b, c0, c1, c2, c3)
static v128_t vsx_i64x2_splat(long long x)
static v128_t vsx_i64x2_shr(v128_t a, int n)
static v128_t vsx_i64x2_add(v128_t a, v128_t b)
static v128_t vsx_i32x4_splat(int x)
static v128_t vsx_v128_load(const void *p)
static v128_t vsx_i32x4_shr(v128_t a, int n)
#define vsx_i32x4_extract_lane(a, i)
static v128_t vsx_i32x4_trunc_sat_f32x4(v128_t a)
static v128_t vsx_i32x4_gt(v128_t a, v128_t b)