OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_arch.h
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8// Copyright (c) 2026, Osamu Watanabe
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32//***************************************************************************/
33// This file is part of the OpenJPH software implementation.
34// File: ojph_arch.h
35// Author: Aous Naman
36// Date: 28 August 2019
37//***************************************************************************/
38
39
40#ifndef OJPH_ARCH_H
41#define OJPH_ARCH_H
42
43#include <cstring>
44#include <cstdio>
45#include <cstdint>
46#include <cmath>
47
48#include "ojph_defs.h"
49
50
52// preprocessor directives for compiler
54#ifdef _MSC_VER
55#define OJPH_COMPILER_MSVC
56#elif (defined __GNUC__)
57#define OJPH_COMPILER_GNUC
58#endif
59
60#ifdef __EMSCRIPTEN__
61#define OJPH_EMSCRIPTEN
62#endif
63
64#ifdef OJPH_COMPILER_MSVC
65#include <intrin.h>
66#endif
67
69 // portable force-inline / no-inline function qualifiers
71#ifdef OJPH_COMPILER_MSVC
72 #define OJPH_FORCE_INLINE static __forceinline
73 #define OJPH_NO_INLINE static __declspec(noinline)
74#else
75 #define OJPH_FORCE_INLINE static inline __attribute__((always_inline))
76 #define OJPH_NO_INLINE static __attribute__((noinline))
77#endif
78
80// preprocessor directives for architecture
82#if defined(__arm__) || defined(__TARGET_ARCH_ARM) \
83 || defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
84 #define OJPH_ARCH_ARM
85#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
86 #define OJPH_ARCH_I386
87#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) \
88 || defined(_M_X64)
89 #define OJPH_ARCH_X86_64
90#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
91 #define OJPH_ARCH_IA64
92#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \
93 || defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \
94 || defined(_M_MPPC) || defined(_M_PPC)
95 #if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
96 #define OJPH_ARCH_PPC64
97 #else
98 #define OJPH_ARCH_PPC
99 #endif
100#else
101 #define OJPH_ARCH_UNKNOWN
102#endif
103
104// Only little-endian POWER (ppc64le) is supported for SIMD
105#if defined(OJPH_ARCH_PPC64) && \
106 (defined(__LITTLE_ENDIAN__) || \
107 (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
108 #define OJPH_ARCH_PPC64LE
109#endif
110
111namespace ojph {
113 // disable SIMD for unknown architecture
115#if !defined(OJPH_ARCH_X86_64) && !defined(OJPH_ARCH_I386) && \
116 !defined(OJPH_ARCH_ARM) && !defined(OJPH_ARCH_PPC64LE) && \
117 !defined(OJPH_DISABLE_SIMD)
118#define OJPH_DISABLE_SIMD
119#endif // !OJPH_ARCH_UNKNOWN
120
122 // OS detection definitions
124#if (defined WIN32) || (defined _WIN32) || (defined _WIN64)
125#define OJPH_OS_WINDOWS
126#elif (defined __APPLE__)
127#define OJPH_OS_APPLE
128#elif (defined __ANDROID__)
129#define OJPH_OS_ANDROID
130#elif (defined __linux)
131#define OJPH_OS_LINUX
132#elif (defined __FreeBSD__)
133#define OJPH_OS_FREEBSD
134#elif (defined __OpenBSD__)
135#define OJPH_OS_OPENBSD
136#endif
137
139 // defines for dll
141#if defined(OJPH_OS_WINDOWS) && defined(OJPH_BUILD_SHARED_LIBRARY)
142#define OJPH_EXPORT __declspec(dllexport)
143#else
144#define OJPH_EXPORT
145#endif
146
148 // cpu features
151 int get_cpu_ext_level();
152
153 enum : int {
166 };
167
168 enum : int {
174 };
175
176 // POWER9 (ISA 3.0) is the minimum supported SIMD level; older CPUs
177 // (POWER8 and earlier) use the generic code paths
178 enum : int {
180 PPC_CPU_EXT_LEVEL_ARCH_3_00 = 1, // ISA 3.0 (POWER9)
181 PPC_CPU_EXT_LEVEL_ARCH_3_1 = 2, // ISA 3.1 (POWER10)
182 };
183
185 static inline ui32 population_count(ui32 val)
186 {
187 #if defined(OJPH_COMPILER_MSVC) \
188 && (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
189 return (ui32)__popcnt(val);
190 #elif (defined OJPH_COMPILER_GNUC)
191 return (ui32)__builtin_popcount(val);
192 #else
193 val -= ((val >> 1) & 0x55555555);
194 val = (((val >> 2) & 0x33333333) + (val & 0x33333333));
195 val = (((val >> 4) + val) & 0x0f0f0f0f);
196 val += (val >> 8);
197 val += (val >> 16);
198 return (int)(val & 0x0000003f);
199 #endif
200 }
201
203#ifdef OJPH_COMPILER_MSVC
204 #pragma intrinsic(_BitScanReverse)
205#endif
206 static inline ui32 count_leading_zeros(ui32 val)
207 {
208 #ifdef OJPH_COMPILER_MSVC
209 unsigned long result = 0;
210 _BitScanReverse(&result, val);
211 return 31 ^ (ui32)result;
212 #elif (defined OJPH_COMPILER_GNUC)
213 return (ui32)__builtin_clz(val);
214 #else
215 val |= (val >> 1);
216 val |= (val >> 2);
217 val |= (val >> 4);
218 val |= (val >> 8);
219 val |= (val >> 16);
220 return 32 - population_count(val);
221 #endif
222 }
223
225#ifdef OJPH_COMPILER_MSVC
226 #if (defined OJPH_ARCH_X86_64 || defined OJPH_ARCH_ARM)
227 #pragma intrinsic(_BitScanReverse64)
228 #elif (defined OJPH_ARCH_I386)
229 #pragma intrinsic(_BitScanReverse)
230 #else
231 #error Error unsupport MSVC version
232 #endif
233#endif
234 static inline ui32 count_leading_zeros(ui64 val)
235 {
236 #ifdef OJPH_COMPILER_MSVC
237 unsigned long result = 0;
238 #if (defined OJPH_ARCH_X86_64) || (defined OJPH_ARCH_ARM)
239 _BitScanReverse64(&result, val);
240 #elif (defined OJPH_ARCH_I386)
241 ui32 msb = (ui32)(val >> 32), lsb = (ui32)val;
242 if (msb == 0)
243 _BitScanReverse(&result, lsb);
244 else {
245 _BitScanReverse(&result, msb);
246 result += 32;
247 }
248 #else
249 #error Error unsupport MSVC version
250 #endif
251 return 63 ^ (ui32)result;
252 #elif (defined OJPH_COMPILER_GNUC)
253 return (ui32)__builtin_clzll(val);
254 #else
255 val |= (val >> 1);
256 val |= (val >> 2);
257 val |= (val >> 4);
258 val |= (val >> 8);
259 val |= (val >> 16);
260 val |= (val >> 32);
261 return 64 - population_count64(val);
262 #endif
263 }
264
266#ifdef OJPH_COMPILER_MSVC
267 #pragma intrinsic(_BitScanForward)
268#endif
269 static inline ui32 count_trailing_zeros(ui32 val)
270 {
271 #ifdef OJPH_COMPILER_MSVC
272 unsigned long result = 0;
273 _BitScanForward(&result, val);
274 return (ui32)result;
275 #elif (defined OJPH_COMPILER_GNUC)
276 return (ui32)__builtin_ctz(val);
277 #else
278 val |= (val << 1);
279 val |= (val << 2);
280 val |= (val << 4);
281 val |= (val << 8);
282 val |= (val << 16);
283 return 32 - population_count(val);
284 #endif
285 }
286
288#ifdef OJPH_COMPILER_MSVC
289 #pragma intrinsic(_BitScanForward64)
290#endif
291 static inline ui32 count_trailing_zeros(ui64 val)
292 {
293 #ifdef OJPH_COMPILER_MSVC
294 unsigned long result = 0;
295 #if (defined OJPH_ARCH_X86_64) || (defined OJPH_ARCH_ARM)
296 _BitScanForward64(&result, val);
297 #elif (defined OJPH_ARCH_I386)
298 ui32 lsb = (ui32)val, msb = (ui32)(val >> 32);
299 if (lsb != 0)
300 _BitScanForward(&result, lsb);
301 else {
302 _BitScanForward(&result, msb);
303 result += 32;
304 }
305 #endif
306 return (ui32)result;
307 #elif (defined OJPH_COMPILER_GNUC)
308 return (ui32)__builtin_ctzll(val);
309 #else
310 if ((ui32)val != 0)
311 return count_trailing_zeros((ui32)val);
312 return 32 + count_trailing_zeros((ui32)(val >> 32));
313 #endif
314 }
315
317 static inline si32 ojph_round(float val)
318 {
319 #ifdef OJPH_COMPILER_MSVC
320 return (si32)(val + (val >= 0.0f ? 0.5f : -0.5f));
321 #elif (defined OJPH_COMPILER_GNUC)
322 return (si32)(val + (val >= 0.0f ? 0.5f : -0.5f));
323 #else
324 return (si32)round(val);
325 #endif
326 }
327
329 static inline si32 ojph_trunc(float val)
330 {
331 #ifdef OJPH_COMPILER_MSVC
332 return (si32)(val);
333 #elif (defined OJPH_COMPILER_GNUC)
334 return (si32)(val);
335 #else
336 return (si32)trunc(val);
337 #endif
338 }
339
341 // constants
343 #ifndef OJPH_EMSCRIPTEN
344 const ui32 byte_alignment = 64; // 64 bytes == 512 bits
347 #else
348 const ui32 byte_alignment = 16; // 16 bytes == 128 bits
350 const ui32 object_alignment = 8;
351 #endif
352
354 // templates for alignment
356
358 // finds the size such that it is a multiple of byte_alignment
359 template <typename T, ui32 N>
360 size_t calc_aligned_size(size_t size) {
361 size = size * sizeof(T) + N - 1;
362 size &= ~((1ULL << (31 - count_leading_zeros(N))) - 1);
363 size >>= (63 - count_leading_zeros((ui64)sizeof(T)));
364 return size;
365 }
366
368 // moves the pointer to first address that is a multiple of byte_alignment
369 template <typename T, ui32 N>
370 inline T *align_ptr(T *ptr) {
371 intptr_t p = reinterpret_cast<intptr_t>(ptr);
372 p += N - 1;
373 p &= ~((1ULL << (31 - count_leading_zeros(N))) - 1);
374 return reinterpret_cast<T *>(p);
375 }
376
378 // Determine the byte order of the target at compile time when possible,
379 // so that the compiler can remove the branches for the other byte order.
380 // __BYTE_ORDER__ is a predefined macro that describes the target
381 // architecture, not the machine running the compiler, so it is also
382 // correct when cross-compiling.
383 // All MSVC targets (x86, x64, ARM64 Windows) are little endian.
384#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
385 constexpr bool is_machine_little_endian = false;
386#elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
387 constexpr bool is_machine_little_endian = true;
388#elif defined(OJPH_COMPILER_MSVC)
389 constexpr bool is_machine_little_endian = true;
390#else
391 // fallback in case macro __BYTE_ORDER__ is not defined
392 // If the first byte in memory is 0x01, the machine is Little Endian.
393 // If the first byte in memory is 0x00, the machine is Big Endian.
395 {
396 const uint16_t n = 0x0001;
397 bool is_machine_little_endian = (*((uint8_t *)&n) == 0x01);
399 }
401#endif
402
404 // swap bytes 1 2 --> 2 1 on big-endian machines
405 static inline ui16 swap_bytes_if_be(ui16 t)
406 {
408 return t;
409 else
410 return (ui16)((t << 8) | (t >> 8));
411 }
412
413 // swap bytes 1 2 --> 2 1 on little-endian machines
414 static inline ui16 swap_bytes_if_le(ui16 t)
415 {
417 return (ui16)((t << 8) | (t >> 8));
418 else
419 return t;
420 }
421
422 // swap bytes 1 2 3 4 --> 4 3 2 1 on big-endian machines
423 static inline ui32 swap_bytes_if_be(ui32 t)
424 {
426 return t;
427 else
428 {
429 ui32 u = swap_bytes_if_be((ui16)(t & 0xFFFFu));
430 u <<= 16;
431 u |= swap_bytes_if_be((ui16)(t >> 16));
432 return u;
433 }
434 }
435
436 // swap bytes 1 2 3 4 --> 4 3 2 1 on little-endian machines
437 static inline ui32 swap_bytes_if_le(ui32 t)
438 {
440 {
441 ui32 u = swap_bytes_if_le((ui16)(t & 0xFFFFu));
442 u <<= 16;
443 u |= swap_bytes_if_le((ui16)(t >> 16));
444 return u;
445 }
446 else
447 return t;
448 }
449
450 // swap bytes 1 2 3 4 5 6 7 8 --> 8 7 6 5 4 3 2 1 on little-endian machines
451 static inline ui64 swap_bytes_if_le(ui64 t)
452 {
454 {
455 ui64 u =
456 swap_bytes_if_le((ui32)(t & 0xFFFFFFFFu));
457 u <<= 32;
458 u |= swap_bytes_if_le((ui32)(t >> 32));
459 return u;
460 }
461 else
462 return t;
463 }
464
466 // loads 4 bytes from p as a little-endian 32-bit integer; that is, the
467 // byte at the lowest address goes into the least-significant byte of the
468 // result, irrespective of the machine's endianness
469 static inline ui32 load_le_ui32(const ui8 *p)
470 {
472 ui32 val;
473 std::memcpy(&val, p, sizeof(val));
474 return val;
475 }
476 else
477 return (ui32)p[0] | ((ui32)p[1] << 8)
478 | ((ui32)p[2] << 16) | ((ui32)p[3] << 24);
479 }
480
482 // loads two consecutive ui16 values from p, placing the one at the lower
483 // address in the least-significant 16 bits of the result, irrespective
484 // of the machine's endianness
485 static inline ui32 load_le_ui16x2(const ui16 *p)
486 {
488 ui32 val;
489 std::memcpy(&val, p, sizeof(val));
490 return val;
491 }
492 else
493 return (ui32)p[0] | ((ui32)p[1] << 16);
494 }
495}
496
497#endif // !OJPH_ARCH_H
const ui32 object_alignment
Definition ojph_arch.h:346
static ui16 swap_bytes_if_le(ui16 t)
Definition ojph_arch.h:414
@ ARM_CPU_EXT_LEVEL_SVE
Definition ojph_arch.h:172
@ ARM_CPU_EXT_LEVEL_SVE2
Definition ojph_arch.h:173
@ ARM_CPU_EXT_LEVEL_NEON
Definition ojph_arch.h:170
@ ARM_CPU_EXT_LEVEL_GENERIC
Definition ojph_arch.h:169
@ ARM_CPU_EXT_LEVEL_ASIMD
Definition ojph_arch.h:171
const ui32 byte_alignment
Definition ojph_arch.h:344
const bool is_machine_little_endian
Definition ojph_arch.h:400
uint64_t ui64
Definition ojph_defs.h:56
static si32 ojph_round(float val)
Definition ojph_arch.h:317
static bool check_if_machine_is_little_endian()
Definition ojph_arch.h:394
static ui32 load_le_ui16x2(const ui16 *p)
Definition ojph_arch.h:485
static ui32 load_le_ui32(const ui8 *p)
Definition ojph_arch.h:469
size_t calc_aligned_size(size_t size)
Definition ojph_arch.h:360
uint16_t ui16
Definition ojph_defs.h:52
T * align_ptr(T *ptr)
Definition ojph_arch.h:370
@ PPC_CPU_EXT_LEVEL_ARCH_3_1
Definition ojph_arch.h:181
@ PPC_CPU_EXT_LEVEL_GENERIC
Definition ojph_arch.h:179
@ PPC_CPU_EXT_LEVEL_ARCH_3_00
Definition ojph_arch.h:180
static ui32 population_count(ui32 val)
Definition ojph_arch.h:185
static ui16 swap_bytes_if_be(ui16 t)
Definition ojph_arch.h:405
OJPH_EXPORT int get_cpu_ext_level()
static si32 ojph_trunc(float val)
Definition ojph_arch.h:329
static ui32 count_trailing_zeros(ui32 val)
Definition ojph_arch.h:269
static ui32 count_leading_zeros(ui32 val)
Definition ojph_arch.h:206
int32_t si32
Definition ojph_defs.h:55
const ui32 log_byte_alignment
Definition ojph_arch.h:345
uint32_t ui32
Definition ojph_defs.h:54
uint8_t ui8
Definition ojph_defs.h:50
@ X86_CPU_EXT_LEVEL_AVX2
Definition ojph_arch.h:163
@ X86_CPU_EXT_LEVEL_AVX
Definition ojph_arch.h:162
@ X86_CPU_EXT_LEVEL_AVX512
Definition ojph_arch.h:165
@ X86_CPU_EXT_LEVEL_GENERIC
Definition ojph_arch.h:154
@ X86_CPU_EXT_LEVEL_SSE2
Definition ojph_arch.h:157
@ X86_CPU_EXT_LEVEL_SSE41
Definition ojph_arch.h:160
@ X86_CPU_EXT_LEVEL_SSE
Definition ojph_arch.h:156
@ X86_CPU_EXT_LEVEL_MMX
Definition ojph_arch.h:155
@ X86_CPU_EXT_LEVEL_SSE42
Definition ojph_arch.h:161
@ X86_CPU_EXT_LEVEL_SSSE3
Definition ojph_arch.h:159
@ X86_CPU_EXT_LEVEL_SSE3
Definition ojph_arch.h:158
@ X86_CPU_EXT_LEVEL_AVX2FMA
Definition ojph_arch.h:164
#define OJPH_EXPORT
Definition ojph_arch.h:144