OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_block_encoder_avx2.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8// Copyright (c) 2024, Intel Corporation
9// Copyright (c) 2026, Osamu Watanabe
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
23// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
25// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
28// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33//***************************************************************************/
34// This file is part of the OpenJPH software implementation.
35// File: ojph_block_encoder_avx2.cpp
36//***************************************************************************/
37
38// Apple Clang on Intel produces corrupt bitstreams with several of the
39// scalar optimizations below (branchless VLC drain, 64-bit MagSgn, etc.)
40// for reasons not yet identified. Since this file is x86-only, the guard
41// does not affect Apple Silicon builds (which use NEON, not AVX2).
42#if defined(__apple_build_version__)
44#else
45
46#include "ojph_arch.h"
47#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
48
49#include <cassert>
50#include <cstring>
51#include <cstdint>
52#include <climits>
53#include <immintrin.h>
54#include <mutex>
55
56#include "ojph_mem.h"
57#include "ojph_arch.h"
58#include "ojph_block_encoder.h"
59#include "ojph_message.h"
60
// Branch-prediction hint macros. When OJPH_COMPILER_MSVC is defined they
// expand to the bare condition; otherwise they wrap the condition in
// __builtin_expect with an expected value of 1 (likely) or 0 (unlikely).
61#ifdef OJPH_COMPILER_MSVC
62 #define likely(x) (x)
63 #define unlikely(x) (x)
64#else
65 #define likely(x) __builtin_expect((x), 1)
66 #define unlikely(x) __builtin_expect((x), 0)
67#endif
68
69namespace ojph {
70 namespace local {
71
73 // tables
75
// File-scope lookup tables used by the encoder. They are filled at run time
// by vlc_init_tables(), uvlc_init_tables() and uvlc_init_pair_tables().
76 //VLC encoding
77 // index is (c_q << 8) + (rho << 4) + eps
78 // data is (cwd << 8) + (cwd_len << 4) + eps
// NOTE(review): vlc_init_tables() stores the selected row's e_k in the low
// four bits, so "eps" in the data layout above refers to that e_k value.
79 // table 0 is for the initial line of quads
80 static ui32 vlc_tbl0[2048];
81 static ui32 vlc_tbl1[2048];
82
83 //UVLC encoding
// uvlc_tbl_pair1/2 are indexed by uq0 * 33 + uq1 and hold (cwd << 5) | len.
// ulvc_cwd_pre/suf and their *_len companions hold the per-value prefix and
// suffix bits and bit counts from which the pair tables are assembled.
84 static ui32 uvlc_tbl_pair1[33 * 33];
85 static ui32 uvlc_tbl_pair2[33 * 33];
86 static ui32 ulvc_cwd_pre[33];
87 static int ulvc_cwd_pre_len[33];
88 static ui32 ulvc_cwd_suf[33];
89 static int ulvc_cwd_suf_len[33];
90
92 static bool vlc_init_tables()
93 {
94 struct vlc_src_table { int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; };
95 vlc_src_table tbl0[] = {
96 #include "table0.h"
97 };
98 size_t tbl0_size = sizeof(tbl0) / sizeof(vlc_src_table);
99
100 si32 pattern_popcnt[16];
101 for (ui32 i = 0; i < 16; ++i)
102 pattern_popcnt[i] = (si32)population_count(i);
103
104 vlc_src_table* src_tbl = tbl0;
105 ui32 *tgt_tbl = vlc_tbl0;
106 size_t tbl_size = tbl0_size;
107 for (int i = 0; i < 2048; ++i)
108 {
109 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
110 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
111 tgt_tbl[i] = 0;
112 else
113 {
114 vlc_src_table *best_entry = NULL;
115 if (emb) // u_off = 1
116 {
117 int best_e_k = -1;
118 for (size_t j = 0; j < tbl_size; ++j)
119 {
120 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
121 if (src_tbl[j].u_off == 1)
122 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
123 {
124 //now we need to find the smallest cwd with the highest
125 // number of bits set in e_k
126 int ones_count = pattern_popcnt[src_tbl[j].e_k];
127 if (ones_count >= best_e_k)
128 {
129 best_entry = src_tbl + j;
130 best_e_k = ones_count;
131 }
132 }
133 }
134 }
135 else // u_off = 0
136 {
137 for (size_t j = 0; j < tbl_size; ++j)
138 {
139 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
140 if (src_tbl[j].u_off == 0)
141 {
142 best_entry = src_tbl + j;
143 break;
144 }
145 }
146 }
147 assert(best_entry);
148 tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
149 + best_entry->e_k);
150 }
151 }
152
153 vlc_src_table tbl1[] = {
154 #include "table1.h"
155 };
156 size_t tbl1_size = sizeof(tbl1) / sizeof(vlc_src_table);
157
158 src_tbl = tbl1;
159 tgt_tbl = vlc_tbl1;
160 tbl_size = tbl1_size;
161 for (int i = 0; i < 2048; ++i)
162 {
163 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
164 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
165 tgt_tbl[i] = 0;
166 else
167 {
168 vlc_src_table *best_entry = NULL;
169 if (emb) // u_off = 1
170 {
171 int best_e_k = -1;
172 for (size_t j = 0; j < tbl_size; ++j)
173 {
174 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
175 if (src_tbl[j].u_off == 1)
176 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
177 {
178 //now we need to find the smallest cwd with the highest
179 // number of bits set in e_k
180 int ones_count = pattern_popcnt[src_tbl[j].e_k];
181 if (ones_count >= best_e_k)
182 {
183 best_entry = src_tbl + j;
184 best_e_k = ones_count;
185 }
186 }
187 }
188 }
189 else // u_off = 0
190 {
191 for (size_t j = 0; j < tbl_size; ++j)
192 {
193 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
194 if (src_tbl[j].u_off == 0)
195 {
196 best_entry = src_tbl + j;
197 break;
198 }
199 }
200 }
201 assert(best_entry);
202 tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
203 + best_entry->e_k);
204 }
205 }
206
207
208 return true;
209 }
210
212 static bool uvlc_init_tables()
213 {
214 //code goes from 0 to 31, extension and 32 are not supported here
215 ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2;
216 ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4;
217 ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1;
218 ulvc_cwd_pre_len[2] = 2;
219 ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3;
220 ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0;
221 ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1;
222 ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0;
223 ulvc_cwd_suf_len[2] = 0;
224 ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1;
225 for (int i = 5; i < 33; ++i)
226 {
227 ulvc_cwd_pre[i] = 0;
228 ulvc_cwd_pre_len[i] = 3;
229 ulvc_cwd_suf[i] = (ui32)(i-5);
230 ulvc_cwd_suf_len[i] = 5;
231 }
232 return true;
233 }
234
236 static void uvlc_init_pair_tables()
237 {
238 for (int uq0 = 0; uq0 < 33; ++uq0) {
239 for (int uq1 = 0; uq1 < 33; ++uq1) {
240 ui32 cwd; int len;
241
242 cwd = 0; len = 0;
243 if (uq0 > 2 && uq1 > 2) {
244 cwd |= ulvc_cwd_pre[uq0 - 2];
245 len += ulvc_cwd_pre_len[uq0 - 2];
246 cwd |= ulvc_cwd_pre[uq1 - 2] << len;
247 len += ulvc_cwd_pre_len[uq1 - 2];
248 cwd |= ulvc_cwd_suf[uq0 - 2] << len;
249 len += ulvc_cwd_suf_len[uq0 - 2];
250 cwd |= ulvc_cwd_suf[uq1 - 2] << len;
251 len += ulvc_cwd_suf_len[uq1 - 2];
252 } else if (uq0 > 2 && uq1 > 0) {
253 cwd |= ulvc_cwd_pre[uq0];
254 len += ulvc_cwd_pre_len[uq0];
255 cwd |= (ui32)(uq1 - 1) << len;
256 len += 1;
257 cwd |= ulvc_cwd_suf[uq0] << len;
258 len += ulvc_cwd_suf_len[uq0];
259 } else {
260 cwd |= ulvc_cwd_pre[uq0];
261 len += ulvc_cwd_pre_len[uq0];
262 cwd |= ulvc_cwd_pre[uq1] << len;
263 len += ulvc_cwd_pre_len[uq1];
264 cwd |= ulvc_cwd_suf[uq0] << len;
265 len += ulvc_cwd_suf_len[uq0];
266 cwd |= ulvc_cwd_suf[uq1] << len;
267 len += ulvc_cwd_suf_len[uq1];
268 }
269 uvlc_tbl_pair1[uq0 * 33 + uq1] = (cwd << 5) | (ui32)len;
270
271 cwd = 0; len = 0;
272 cwd |= ulvc_cwd_pre[uq0];
273 len += ulvc_cwd_pre_len[uq0];
274 cwd |= ulvc_cwd_pre[uq1] << len;
275 len += ulvc_cwd_pre_len[uq1];
276 cwd |= ulvc_cwd_suf[uq0] << len;
277 len += ulvc_cwd_suf_len[uq0];
278 cwd |= ulvc_cwd_suf[uq1] << len;
279 len += ulvc_cwd_suf_len[uq1];
280 uvlc_tbl_pair2[uq0 * 33 + uq1] = (cwd << 5) | (ui32)len;
281 }
282 }
283 }
284
// NOTE(review): the line carrying this function's name and opening brace is
// absent from this listing; only the body is visible here.
//
// One-time table setup: the lambda passed to std::call_once zeroes both VLC
// tables, builds them, builds the UVLC prefix/suffix tables, then builds the
// UVLC pair tables. tables_initialized records the combined result of
// vlc_init_tables() and uvlc_init_tables() and is what the function returns.
287 static bool tables_initialized = false;
288 static std::once_flag tables_initialized_flag;
289 std::call_once(tables_initialized_flag, []() {
290 memset(vlc_tbl0, 0, 2048 * sizeof(ui32));
291 memset(vlc_tbl1, 0, 2048 * sizeof(ui32));
292 tables_initialized = vlc_init_tables();
293 tables_initialized = tables_initialized && uvlc_init_tables();
// the pair tables are derived from the arrays uvlc_init_tables() just filled
294 uvlc_init_pair_tables();
295 });
296 return tables_initialized;
297 }
298
300 //
// State of the MEL bit writer: an output buffer written forwards from
// buf[0], a small bit accumulator, and the adaptive run-length state that
// mel_encode() updates.
302 struct mel_struct {
303 //storage
304 ui8* buf; //pointer to data buffer
305 ui32 pos; //position of next writing within buf
306 ui32 buf_size; //size of buffer, which we must not exceed
307
308 // all these can be replaced by bytes
309 int remaining_bits; //number of empty bits in tmp
310 int tmp; //temporary storage of coded bits
311 int run; //number of 0 run
312 int k; //state; index into mel_exp, kept within 0..12
313 int threshold; //threshold where one bit must be coded; 1 << mel_exp[k]
314 };
315
317 static inline void
318 mel_init(mel_struct* melp, ui32 buffer_size, ui8* data)
319 {
320 melp->buf = data;
321 melp->pos = 0;
322 melp->buf_size = buffer_size;
323 melp->remaining_bits = 8;
324 melp->tmp = 0;
325 melp->run = 0;
326 melp->k = 0;
327 melp->threshold = 1; // this is 1 << mel_exp[melp->k];
328 }
329
// Exponent for each MEL state k (0..12); mel_encode() uses 1 << mel_exp[k]
// as the run threshold and mel_exp[k] as the number of run bits it emits.
330 static const int mel_exp[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5};
331
333 static inline void
334 mel_emit_bits(mel_struct* melp, ui32 bits, int num_bits)
335 {
336 melp->tmp = (melp->tmp << num_bits) | (int)bits;
337 melp->remaining_bits -= num_bits;
338 if (melp->remaining_bits <= 0) {
339 int excess = -melp->remaining_bits;
340 ui8 byte = (ui8)(melp->tmp >> excess);
341 melp->buf[melp->pos++] = byte;
342 melp->tmp &= (1 << excess) - 1;
343 melp->remaining_bits += 8 - (byte == 0xFF);
344 }
345 }
346
348 static inline void
349 mel_encode(mel_struct* melp, bool bit)
350 {
351 if (bit == false) {
352 ++melp->run;
353 if (melp->run >= melp->threshold) {
354 mel_emit_bits(melp, 1, 1);
355 melp->run = 0;
356 melp->k = ojph_min(12, melp->k + 1);
357 melp->threshold = 1 << mel_exp[melp->k];
358 }
359 } else {
360 int t = mel_exp[melp->k];
361 mel_emit_bits(melp, (ui32)melp->run & ((1u << t) - 1), t + 1);
362 melp->run = 0;
363 melp->k = ojph_max(0, melp->k - 1);
364 melp->threshold = 1 << mel_exp[melp->k];
365 }
366 }
367
369 // static inline void
370 // mel_advance_run(mel_struct* melp, ui32 n)
371 // {
372 // ui32 remaining = n;
373 // while (remaining > 0) {
374 // ui32 space = (ui32)melp->threshold - (ui32)melp->run;
375 // if (remaining >= space) {
376 // remaining -= space;
377 // mel_emit_bits(melp, 1, 1);
378 // melp->run = 0;
379 // melp->k = ojph_min(12, melp->k + 1);
380 // melp->threshold = 1 << mel_exp[melp->k];
381 // } else {
382 // melp->run += (int)remaining;
383 // remaining = 0;
384 // }
385 // }
386 // }
387
389 // static inline void
390 // mel_encode_significance(mel_struct* melp)
391 // {
392 // int t = mel_exp[melp->k];
393 // mel_emit_bits(melp, melp->run & ((1u << t) - 1), t + 1);
394 // melp->run = 0;
395 // melp->k = ojph_max(0, melp->k - 1);
396 // melp->threshold = 1 << mel_exp[melp->k];
397 // }
398
400 //
402
// State of the VLC bit writer. vlc_init() points buf at the last byte of the
// region and bytes are stored at buf - pos, i.e. the stream grows backwards.
403 struct vlc_struct {
404 //storage
405 ui8* buf; //pointer to data buffer
406 ui32 pos; //position of next writing within buf
407 ui32 buf_size; //size of buffer, which we must not exceed
408
409 int used_bits; //number of occupied bits in tmp
410 ui64 tmp; //temporary storage of coded bits
411 bool last_greater_than_8F; //true if last byte is greater than 0x8F
412 };
413
415 static inline void
416 vlc_init(vlc_struct* vlcp, ui32 buffer_size, ui8* data)
417 {
418 vlcp->buf = data + buffer_size - 1; //points to last byte
419 vlcp->pos = 1; //locations will be all -pos
420 vlcp->buf_size = buffer_size;
421
422 vlcp->buf[0] = 0xFF;
423 vlcp->used_bits = 4;
424 vlcp->tmp = 0xF;
425 vlcp->last_greater_than_8F = true;
426 }
427
429 static inline void
430 vlc_drain(vlc_struct* vlcp)
431 {
432 while (vlcp->used_bits >= 8) {
433 int escape = (int)vlcp->last_greater_than_8F;
434 int is_7f = (int)((vlcp->tmp & 0x7F) == 0x7F);
435 int need_stuff = (escape & is_7f) != 0 ? 1 : 0;
436 int bits = 8 - need_stuff;
437
438 ui8 byte = (ui8)(vlcp->tmp & ((1u << bits) - 1));
439 *(vlcp->buf - vlcp->pos) = byte;
440 vlcp->pos++;
441 vlcp->tmp >>= bits;
442 vlcp->used_bits -= bits;
443 vlcp->last_greater_than_8F = byte > 0x8F;
444 }
445 }
446
448 static inline void
449 vlc_encode(vlc_struct* vlcp, ui64 cwd, int cwd_len)
450 {
451 while (true) {
452 int avail = 64 - vlcp->used_bits;
453 if (likely(avail > 0 && cwd_len <= avail)) {
454 vlcp->tmp |= cwd << vlcp->used_bits;
455 vlcp->used_bits += cwd_len;
456 return;
457 }
458 if (likely(avail > 0)) // available space smaller than needed
459 vlcp->tmp |= cwd << vlcp->used_bits;
460 vlcp->used_bits = 64;
461 vlc_drain(vlcp);
462 cwd >>= avail;
463 cwd_len -= avail;
464 }
465 }
466
468 //
// NOTE(review): the line carrying this function's name and parameter list is
// absent from this listing. The body uses `melp` and `vlcp` as pointers to
// the MEL and VLC writer states -- confirm against the real declaration.
//
// Finishes the MEL and VLC streams. When the last partial MEL byte and the
// last partial VLC byte can share one byte, a single fused byte is written
// to the MEL buffer; otherwise each stream gets its own final byte.
470 static inline void
472 {
// a pending run is closed with a single 1 bit
473 if (melp->run > 0)
474 mel_emit_bits(melp, 1, 1);
475
// same 7-bit rule as vlc_drain(): after a byte above 0x8F, seven 1 bits are
// written out as 0x7f
476 if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) {
477 *(vlcp->buf - vlcp->pos) = 0x7f;
478 vlcp->pos++;
479 vlcp->tmp >>= 7;
480 vlcp->used_bits -= 7;
481 }
482
// left-align the pending MEL bits; mel_mask covers the bits MEL occupies in
// its byte and vlc_mask the low used_bits bits VLC occupies
// NOTE(review): the vlc_mask shift assumes used_bits is at most 8 here,
// i.e. that the VLC accumulator was drained before this point -- confirm.
483 melp->tmp = melp->tmp << melp->remaining_bits;
484 int mel_mask = (0xFF << melp->remaining_bits) & 0xFF;
485 int vlc_mask = 0xFF >> (8 - vlcp->used_bits);
486 if ((mel_mask | vlc_mask) == 0)
487 return; //last mel byte cannot be 0xFF, since then
488 //melp->remaining_bits would be < 8
489 if (melp->pos >= melp->buf_size)
490 OJPH_ERROR(0x00020003, "mel encoder's buffer is full");
491 ui8 vlcp_tmp = (ui8)vlcp->tmp;
492 int fuse = melp->tmp | vlcp_tmp;
// fusing is allowed only if OR-ing the two bytes changes neither stream's
// own bits, the result is not 0xFF, and vlcp->pos is greater than 1
493 if ( ( ((fuse ^ melp->tmp) & mel_mask)
494 | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0
495 && (fuse != 0xFF) && vlcp->pos > 1)
496 {
497 melp->buf[melp->pos++] = (ui8)fuse;
498 }
499 else
500 {
501 if (vlcp->pos >= vlcp->buf_size)
502 OJPH_ERROR(0x00020004, "vlc encoder's buffer is full");
503 melp->buf[melp->pos++] = (ui8)melp->tmp; //melp->tmp cannot be 0xFF
504 *(vlcp->buf - vlcp->pos) = (ui8)vlcp_tmp;
505 vlcp->pos++;
506 }
507 }
508
510//
512
// State of the magnitude-sign (MagSgn) bit writer: a forward-growing output
// buffer plus a 64-bit accumulator filled from the least significant bit.
513 struct ms_struct {
514 //storage
515 ui8* buf; //pointer to data buffer
516 ui32 pos; //position of next writing within buf
517 ui32 buf_size; //size of buffer, which we must not exceed
518
519 int used_bits; //number of occupied bits in tmp
520 ui64 tmp; //temporary storage of coded bits (64-bit accumulator)
521 bool last_was_ff;//true if the last written byte was 0xFF
522 };
523
525 static inline void
526 ms_init(ms_struct* msp, ui32 buffer_size, ui8* data)
527 {
528 msp->buf = data;
529 msp->pos = 0;
530 msp->buf_size = buffer_size;
531 msp->used_bits = 0;
532 msp->tmp = 0;
533 msp->last_was_ff = false;
534 }
535
537 static inline void
538 ms_drain(ms_struct* msp)
539 {
540 if (msp->last_was_ff) {
541 if (msp->used_bits < 7)
542 return;
543 msp->buf[msp->pos++] = (ui8)(msp->tmp & 0x7F);
544 msp->tmp >>= 7;
545 msp->used_bits -= 7;
546 msp->last_was_ff = false;
547 }
548
549 while (msp->used_bits >= 8) {
550 int n_bytes = msp->used_bits >> 3;
551 if (n_bytes > 8) n_bytes = 8;
552
553 ui64 word = msp->tmp;
554 ui64 valid_mask = (n_bytes < 8)
555 ? (1ULL << (n_bytes * 8)) - 1 : ~(ui64)0;
556
557 ui64 w = ~word;
558 ui64 ff_detect = (w - 0x0101010101010101ULL) & ~w
559 & 0x8080808080808080ULL;
560 ff_detect &= valid_mask;
561
562 if (likely(ff_detect == 0)) {
563 memcpy(msp->buf + msp->pos, &word, (size_t)n_bytes);
564 msp->pos += (ui32)n_bytes;
565 if (n_bytes < 8)
566 msp->tmp >>= (n_bytes * 8);
567 else
568 msp->tmp = 0;
569 msp->used_bits -= n_bytes * 8;
570 } else {
571 int ff_pos = (int)(count_trailing_zeros(ff_detect) >> 3);
572 int safe = ff_pos + 1;
573 memcpy(msp->buf + msp->pos, &word, (size_t)safe);
574 msp->pos += (ui32)safe;
575 int bits = safe * 8;
576 if (bits < 64)
577 msp->tmp >>= bits;
578 else
579 msp->tmp = 0;
580 msp->used_bits -= bits;
581
582 if (msp->used_bits >= 7) {
583 msp->buf[msp->pos++] = (ui8)(msp->tmp & 0x7F);
584 msp->tmp >>= 7;
585 msp->used_bits -= 7;
586 msp->last_was_ff = false;
587 } else {
588 msp->last_was_ff = true;
589 return;
590 }
591 }
592 }
593 }
594
596 static inline void
597 ms_encode_nodefer(ms_struct* msp, ui64 cwd, int cwd_len)
598 {
599 while (true) {
600 int avail = 64 - msp->used_bits;
601 if (likely(avail > 0 && cwd_len <= avail)) {
602 msp->tmp |= cwd << msp->used_bits;
603 msp->used_bits += cwd_len;
604 return;
605 }
606 if (likely(avail > 0)) // available space smaller than needed
607 msp->tmp |= cwd << msp->used_bits;
608 msp->used_bits = 64;
609 ms_drain(msp);
610 cwd >>= avail;
611 cwd_len -= avail;
612 }
613 }
614
616 // static inline void
617 // ms_encode(ms_struct* msp, ui64 cwd, int cwd_len)
618 // {
619 // int avail = 64 - msp->used_bits;
620 // if (likely(cwd_len <= avail)) {
621 // msp->tmp |= cwd << msp->used_bits;
622 // msp->used_bits += cwd_len;
623 // } else {
624 // msp->tmp |= (cwd & ((1ULL << avail) - 1)) << msp->used_bits;
625 // msp->used_bits = 64;
626 // ms_drain(msp);
627 // cwd >>= avail;
628 // cwd_len -= avail;
629 // msp->tmp |= cwd << msp->used_bits;
630 // msp->used_bits += cwd_len;
631 // }
632 // ms_drain(msp);
633 // }
634
// NOTE(review): the line carrying this function's name and parameter list is
// absent from this listing. The body uses `msp` as a pointer to the MagSgn
// writer state -- confirm against the real declaration.
//
// Finishes the MagSgn stream: drains whole bytes, then pads any leftover
// bits with 1s up to 8 bits (7 after a 0xFF byte) and writes that byte
// unless it would be 0xFF. With no leftover bits, a trailing 0xFF byte is
// dropped by stepping pos back.
636 static inline void
638 {
639 ms_drain(msp);
640 if (msp->used_bits)
641 {
642 int max_bits = msp->last_was_ff ? 7 : 8;
// t is the number of padding bits, all set to 1 above the used bits
643 int t = max_bits - msp->used_bits;
644 ui32 byte = (ui32)(msp->tmp & ((1ULL << msp->used_bits) - 1));
645 byte |= (0xFFu & ((1u << t) - 1)) << msp->used_bits;
646 if (byte != 0xFF)
647 {
648 if (msp->pos >= msp->buf_size)
649 OJPH_ERROR(0x00020006, "magnitude sign encoder's buffer is full");
650 msp->buf[msp->pos++] = (ui8)byte;
651 }
652 }
653 else if (msp->last_was_ff)
654 msp->pos--;
655 }
656
// Shorthand for two constant vectors: all lanes 0, and all 32-bit lanes 1.
// Each use expands to the intrinsic call itself.
657#define ZERO _mm256_setzero_si256()
658#define ONE _mm256_set1_epi32(1)
659
660// https://stackoverflow.com/a/58827596
// Counts leading zeros in each 32-bit lane using the exponent produced by an
// integer-to-float conversion. The result is clamped to 32.
inline __m256i avx2_lzcnt_epi32(__m256i v) {
  const __m256i exponent_bias = _mm256_set1_epi32(158);
  const __m256i max_count = _mm256_set1_epi32(32);

  // prevent value from being rounded up to the next power of two:
  // keep 8 MSB
  const __m256i trimmed = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v);

  // convert an integer to float and shift down the exponent
  const __m256i float_bits = _mm256_castps_si256(_mm256_cvtepi32_ps(trimmed));
  const __m256i exponent = _mm256_srli_epi32(float_bits, 23);

  // undo bias, then clamp at 32
  const __m256i count = _mm256_subs_epu16(exponent_bias, exponent);
  return _mm256_min_epi16(count, max_count);
}
672
// Per-lane "not equal" for 32-bit lanes: all ones where v != v2, else zero.
// Implemented as the bitwise complement of the equality mask.
inline __m256i avx2_cmpneq_epi32(__m256i v, __m256i v2) {
  const __m256i all_ones = _mm256_set1_epi32((int32_t)0xffffffff);
  const __m256i equal_mask = _mm256_cmpeq_epi32(v, v2);
  return _mm256_xor_si256(equal_mask, all_ones);
}
676
677static void proc_pixel(__m256i *src_vec, ui32 p,
678 __m256i *eq_vec, __m256i *s_vec,
679 __m256i &rho_vec, __m256i &e_qmax_vec)
680{
681 __m256i val_vec[4];
682 __m256i _eq_vec[4];
683 __m256i _s_vec[4];
684 __m256i _rho_vec[4];
685
686 for (ui32 i = 0; i < 4; ++i) {
687 /* val = t + t; //multiply by 2 and get rid of sign */
688 val_vec[i] = _mm256_add_epi32(src_vec[i], src_vec[i]);
689
690 /* val >>= p; // 2 \mu_p + x */
691 val_vec[i] = _mm256_srli_epi32(val_vec[i], (int)p);
692
693 /* val &= ~1u; // 2 \mu_p */
694 val_vec[i] = _mm256_and_si256(val_vec[i], _mm256_set1_epi32((int)~1u));
695
696 /* if (val) { */
697 const __m256i val_notmask = avx2_cmpneq_epi32(val_vec[i], ZERO);
698
699 /* rho[i] = 1 << i;
700 * rho is processed below.
701 */
702
703 /* e_q[i] = 32 - (int)count_leading_ZEROs(--val); //2\mu_p - 1 */
704 val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
705 _eq_vec[i] = avx2_lzcnt_epi32(val_vec[i]);
706 _eq_vec[i] = _mm256_sub_epi32(_mm256_set1_epi32(32), _eq_vec[i]);
707
708 /* e_qmax[i] = ojph_max(e_qmax[i], e_q[j]);
709 * e_qmax is processed below
710 */
711
712 /* s[0] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */
713 val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
714 _s_vec[i] = _mm256_srli_epi32(src_vec[i], 31);
715 _s_vec[i] = _mm256_add_epi32(_s_vec[i], val_vec[i]);
716
717 _eq_vec[i] = _mm256_and_si256(_eq_vec[i], val_notmask);
718 _s_vec[i] = _mm256_and_si256(_s_vec[i], val_notmask);
719 val_vec[i] = _mm256_srli_epi32(val_notmask, 31);
720 /* } */
721 }
722
723 const __m256i idx = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
724
725 /* Reorder from
726 * *_vec[0]:[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7]
727 * *_vec[1]:[1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5],.[1, 6], [1, 7]
728 * *_vec[2]:[0, 8], [0, 9], [0,10], [0,11], [0,12], [0,13], [0,14], [0,15]
729 * *_vec[3]:[1, 8], [1, 9], [1,10], [1,11], [1,12], [1,13], [1,14], [1,15]
730 * to
731 * *_vec[0]:[0, 0], [0, 2], [0, 4], [0, 6], [0, 8], [0,10], [0,12], [0,14]
732 * *_vec[1]:[1, 0], [1, 2], [1, 4], [1, 6], [1, 8], [1,10], [1,12], [1,14]
733 * *_vec[2]:[0, 1], [0, 3], [0, 5], [0, 7], [0, 9], [0,11], [0,13], [0,15]
734 * *_vec[3]:[1, 1], [1, 3], [1, 5], [1, 7], [1, 9], [1,11], [1,13], [1,15]
735 */
736 __m256i tmp1, tmp2;
737 for (ui32 i = 0; i < 2; ++i) {
738 tmp1 = _mm256_permutevar8x32_epi32(_eq_vec[0 + i], idx);
739 tmp2 = _mm256_permutevar8x32_epi32(_eq_vec[2 + i], idx);
740 eq_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
741 eq_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
742
743 tmp1 = _mm256_permutevar8x32_epi32(_s_vec[0 + i], idx);
744 tmp2 = _mm256_permutevar8x32_epi32(_s_vec[2 + i], idx);
745 s_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
746 s_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
747
748 tmp1 = _mm256_permutevar8x32_epi32(val_vec[0 + i], idx);
749 tmp2 = _mm256_permutevar8x32_epi32(val_vec[2 + i], idx);
750 _rho_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
751 _rho_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
752 }
753
754 e_qmax_vec = _mm256_max_epi32(eq_vec[0], eq_vec[1]);
755 e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[2]);
756 e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[3]);
757 _rho_vec[1] = _mm256_slli_epi32(_rho_vec[1], 1);
758 _rho_vec[2] = _mm256_slli_epi32(_rho_vec[2], 2);
759 _rho_vec[3] = _mm256_slli_epi32(_rho_vec[3], 3);
760 rho_vec = _mm256_or_si256(_rho_vec[0], _rho_vec[1]);
761 rho_vec = _mm256_or_si256(rho_vec, _rho_vec[2]);
762 rho_vec = _mm256_or_si256(rho_vec, _rho_vec[3]);
763}
764
765/* from [0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, ...]
766 * [0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, ...]
767 * [0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, ...]
768 * [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, ...]
769 *
770 * to [0x00, 0x10, 0x20, 0x30, 0x01, 0x11, 0x21, 0x31,
771 * 0x02, 0x12, 0x22, 0x32, 0x03, 0x13, 0x23, 0x33]
772 *
773 * [0x04, 0x14, 0x24, 0x34, 0x05, 0x15, 0x25, 0x35,
774 * 0x06, 0x16, 0x26, 0x36, 0x07, 0x17, 0x27, 0x37]
775 *
776 * [..]
777 */
// Transposes, in place, the 4 x 8 block of 32-bit values held in matrix[0..3]
// so that the four values of each column become adjacent: matrix[0] receives
// columns 0-1, matrix[1] columns 2-3, matrix[2] columns 4-5 and matrix[3]
// columns 6-7, each column as four consecutive lanes.
static void rotate_matrix(__m256i *matrix)
{
  // interleave rows 0/1 and rows 2/3 at 32-bit granularity
  const __m256i lo01 = _mm256_unpacklo_epi32(matrix[0], matrix[1]);
  const __m256i lo23 = _mm256_unpacklo_epi32(matrix[2], matrix[3]);
  const __m256i hi01 = _mm256_unpackhi_epi32(matrix[0], matrix[1]);
  const __m256i hi23 = _mm256_unpackhi_epi32(matrix[2], matrix[3]);

  // combine the pairs at 64-bit granularity: one full column per 128 bits
  const __m256i cols_0_4 = _mm256_unpacklo_epi64(lo01, lo23);
  const __m256i cols_2_6 = _mm256_unpacklo_epi64(hi01, hi23);
  const __m256i cols_1_5 = _mm256_unpackhi_epi64(lo01, lo23);
  const __m256i cols_3_7 = _mm256_unpackhi_epi64(hi01, hi23);

  // gather the low halves, then the high halves, across 128-bit lanes
  matrix[0] = _mm256_permute2x128_si256(cols_0_4, cols_1_5, 0x20);
  matrix[2] = _mm256_permute2x128_si256(cols_0_4, cols_1_5, 0x31);
  matrix[1] = _mm256_permute2x128_si256(cols_2_6, cols_3_7, 0x20);
  matrix[3] = _mm256_permute2x128_si256(cols_2_6, cols_3_7, 0x31);
}
798
799static void proc_ms_encode(ms_struct *msp,
800 __m256i &tuple_vec,
801 __m256i &uq_vec,
802 __m256i &rho_vec,
803 __m256i *s_vec)
804{
805 __m256i m_vec[4];
806
807 /* Prepare parameters for ms_encode */
808 /* m = (rho[i] & 1) ? Uq[i] - ((tuple[i] & 1) >> 0) : 0; */
809 auto tmp = _mm256_and_si256(tuple_vec, ONE);
810 tmp = _mm256_sub_epi32(uq_vec, tmp);
811 auto tmp1 = _mm256_and_si256(rho_vec, ONE);
812 auto mask = avx2_cmpneq_epi32(tmp1, ZERO);
813 m_vec[0] = _mm256_and_si256(mask, tmp);
814
815 /* m = (rho[i] & 2) ? Uq[i] - ((tuple[i] & 2) >> 1) : 0; */
816 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(2));
817 tmp = _mm256_srli_epi32(tmp, 1);
818 tmp = _mm256_sub_epi32(uq_vec, tmp);
819 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
820 mask = avx2_cmpneq_epi32(tmp1, ZERO);
821 m_vec[1] = _mm256_and_si256(mask, tmp);
822
823 /* m = (rho[i] & 4) ? Uq[i] - ((tuple[i] & 4) >> 2) : 0; */
824 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(4));
825 tmp = _mm256_srli_epi32(tmp, 2);
826 tmp = _mm256_sub_epi32(uq_vec, tmp);
827 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
828 mask = avx2_cmpneq_epi32(tmp1, ZERO);
829 m_vec[2] = _mm256_and_si256(mask, tmp);
830
831 /* m = (rho[i] & 8) ? Uq[i] - ((tuple[i] & 8) >> 3) : 0; */
832 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(8));
833 tmp = _mm256_srli_epi32(tmp, 3);
834 tmp = _mm256_sub_epi32(uq_vec, tmp);
835 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
836 mask = avx2_cmpneq_epi32(tmp1, ZERO);
837 m_vec[3] = _mm256_and_si256(mask, tmp);
838
839 rotate_matrix(m_vec);
840 rotate_matrix(s_vec);
841
842 ui32 cwd[8];
843 int cwd_len[8];
844
845 /* Each iteration process 8 bytes * 2 lines */
846 for (ui32 i = 0; i < 4; ++i) {
847 /* cwd = s[i * 4 + 0] & ((1U << m) - 1)
848 * cwd_len = m
849 */
850 _mm256_storeu_si256((__m256i *)cwd_len, m_vec[i]);
851 tmp = _mm256_sllv_epi32(ONE, m_vec[i]);
852 tmp = _mm256_sub_epi32(tmp, ONE);
853 tmp = _mm256_and_si256(tmp, s_vec[i]);
854 _mm256_storeu_si256((__m256i*)cwd, tmp);
855
856 for (ui32 j = 0; j < 4; j += 2) {
857 ui32 idx0 = j * 2;
858 ui64 _cwd = cwd[idx0];
859 int _cwd_len = cwd_len[idx0];
860 _cwd |= ((ui64)cwd[idx0 + 1]) << _cwd_len;
861 _cwd_len += cwd_len[idx0 + 1];
862
863 ui32 idx1 = (j + 1) * 2;
864 int len1 = cwd_len[idx1] + cwd_len[idx1 + 1];
865 if (likely(_cwd_len + len1 <= 64)) {
866 _cwd |= ((ui64)cwd[idx1]) << _cwd_len;
867 _cwd_len += cwd_len[idx1];
868 _cwd |= ((ui64)cwd[idx1 + 1]) << _cwd_len;
869 _cwd_len += cwd_len[idx1 + 1];
870 ms_encode_nodefer(msp, _cwd, _cwd_len);
871 } else {
872 ms_encode_nodefer(msp, _cwd, _cwd_len);
873 _cwd = cwd[idx1];
874 _cwd_len = cwd_len[idx1];
875 _cwd |= ((ui64)cwd[idx1 + 1]) << _cwd_len;
876 _cwd_len += cwd_len[idx1 + 1];
877 ms_encode_nodefer(msp, _cwd, _cwd_len);
878 }
879 }
880 }
881 ms_drain(msp);
882}
883
884static __m256i cal_eps_vec(__m256i *eq_vec, __m256i &u_q_vec,
885 __m256i &e_qmax_vec)
886{
887 /* if (u_q[i] > 0) {
888 * eps[i] |= (e_q[i * 4 + 0] == e_qmax[i]);
889 * eps[i] |= (e_q[i * 4 + 1] == e_qmax[i]) << 1;
890 * eps[i] |= (e_q[i * 4 + 2] == e_qmax[i]) << 2;
891 * eps[i] |= (e_q[i * 4 + 3] == e_qmax[i]) << 3;
892 * }
893 */
894 auto u_q_mask = _mm256_cmpgt_epi32(u_q_vec, ZERO);
895
896 auto mask = _mm256_cmpeq_epi32(eq_vec[0], e_qmax_vec);
897 auto eps_vec = _mm256_srli_epi32(mask, 31);
898
899 mask = _mm256_cmpeq_epi32(eq_vec[1], e_qmax_vec);
900 auto tmp = _mm256_srli_epi32(mask, 31);
901 tmp = _mm256_slli_epi32(tmp, 1);
902 eps_vec = _mm256_or_si256(eps_vec, tmp);
903
904 mask = _mm256_cmpeq_epi32(eq_vec[2], e_qmax_vec);
905 tmp = _mm256_srli_epi32(mask, 31);
906 tmp = _mm256_slli_epi32(tmp, 2);
907 eps_vec = _mm256_or_si256(eps_vec, tmp);
908
909 mask = _mm256_cmpeq_epi32(eq_vec[3], e_qmax_vec);
910 tmp = _mm256_srli_epi32(mask, 31);
911 tmp = _mm256_slli_epi32(tmp, 3);
912 eps_vec = _mm256_or_si256(eps_vec, tmp);
913
914 return _mm256_and_si256(u_q_mask, eps_vec);
915}
916
917static void update_lep(ui32 x, __m256i &prev_e_val_vec,
918 __m256i *eq_vec, __m256i *e_val_vec,
919 const __m256i left_shift)
920{
921 /* lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++;
922 * lep[0] = (ui8)e_q[3];
923 * Compare e_q[1] with e_q[3] of the prevous round.
924 */
925 auto tmp = _mm256_permutevar8x32_epi32(eq_vec[3], left_shift);
926 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_e_val_vec)), 0);
927 prev_e_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(eq_vec[3], 7), 0);
928 e_val_vec[x] = _mm256_max_epi32(eq_vec[1], tmp);
929}
930
931
932static void update_lcxp(ui32 x, __m256i &prev_cx_val_vec,
933 __m256i &rho_vec, __m256i *cx_val_vec,
934 const __m256i left_shift)
935{
936 /* lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++;
937 * lcxp[0] = (ui8)((rho[0] & 8) >> 3);
938 * Or (rho[0] & 2) and (rho[0] of the previous round & 8).
939 */
940 auto tmp = _mm256_permutevar8x32_epi32(rho_vec, left_shift);
941 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_cx_val_vec)), 0);
942 prev_cx_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(rho_vec, 7), 0);
943
944 tmp = _mm256_and_si256(tmp, _mm256_set1_epi32(8));
945 tmp = _mm256_srli_epi32(tmp, 3);
946
947 auto tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
948 tmp1 = _mm256_srli_epi32(tmp1, 1);
949 cx_val_vec[x] = _mm256_or_si256(tmp, tmp1);
950}
951
952static __m256i cal_tuple(__m256i &cq_vec, __m256i &rho_vec,
953 __m256i &eps_vec, ui32 *vlc_tbl)
954{
955 /* tuple[i] = vlc_tbl1[(c_q[i] << 8) + (rho[i] << 4) + eps[i]]; */
956 auto tmp = _mm256_slli_epi32(cq_vec, 8);
957 auto tmp1 = _mm256_slli_epi32(rho_vec, 4);
958 tmp = _mm256_add_epi32(tmp, tmp1);
959 tmp = _mm256_add_epi32(tmp, eps_vec);
960 return _mm256_i32gather_epi32((const int *)vlc_tbl, tmp, 4);
961}
962
963static __m256i proc_cq1(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
964 const __m256i right_shift)
965{
966 ojph_unused(x);
967 ojph_unused(cx_val_vec);
968 ojph_unused(right_shift);
969
970 /* c_q[i + 1] = (rho[i] >> 1) | (rho[i] & 1); */
971 auto tmp = _mm256_srli_epi32(rho_vec, 1);
972 auto tmp1 = _mm256_and_si256(rho_vec, ONE);
973 return _mm256_or_si256(tmp, tmp1);
974}
975
976static __m256i proc_cq2(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
977 const __m256i right_shift)
978{
979 // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
980 // | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
981 auto lcxp1_vec = _mm256_permutevar8x32_epi32(cx_val_vec[x], right_shift);
982 auto tmp = _mm256_permutevar8x32_epi32(lcxp1_vec, right_shift);
983
984#ifdef OJPH_ARCH_X86_64
985 tmp = _mm256_insert_epi64(tmp,
986 _mm_cvtsi128_si64(_mm256_castsi256_si128(cx_val_vec[x + 1])), 3);
987#elif (defined OJPH_ARCH_I386)
988 int lsb = _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1]));
989 tmp = _mm256_insert_epi32(tmp, lsb, 6);
990 int msb = _mm_extract_epi32(_mm256_castsi256_si128(cx_val_vec[x + 1]), 1);
991 tmp = _mm256_insert_epi32(tmp, msb, 7);
992#else
993 #error Error unsupport compiler
994#endif
995 tmp = _mm256_slli_epi32(tmp, 2);
996 auto tmp1 = _mm256_insert_epi32(lcxp1_vec,
997 _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1])), 7);
998 tmp = _mm256_add_epi32(tmp1, tmp);
999
1000 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
1001 tmp1 = _mm256_srli_epi32(tmp1, 1);
1002 tmp = _mm256_or_si256(tmp, tmp1);
1003
1004 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
1005 tmp1 = _mm256_srli_epi32(tmp1, 2);
1006
1007 return _mm256_or_si256(tmp, tmp1);
1008}
1009
1010static void proc_mel_encode1(mel_struct *melp, __m256i &cq_vec,
1011 __m256i &rho_vec, __m256i u_q_vec, ui32 ignore,
1012 const __m256i right_shift)
1013{
1014 int32_t mel_need_encode[8];
1015 int32_t mel_need_encode2[8];
1016 int32_t mel_bit[8];
1017 int32_t mel_bit2[8];
1018 /* Prepare mel_encode params */
1019 /* if (c_q[i] == 0) { */
1020 _mm256_storeu_si256((__m256i *)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO));
1021 /* mel_encode(&mel, rho[i] != 0); */
1022 _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
1023 /* } */
1024
1025 /* mel_encode(&mel, ojph_min(u_q[i], u_q[i + 1]) > 2); */
1026 auto tmp = _mm256_permutevar8x32_epi32(u_q_vec, right_shift);
1027 auto tmp1 = _mm256_min_epi32(u_q_vec, tmp);
1028 _mm256_storeu_si256((__m256i*)mel_bit2, _mm256_srli_epi32(_mm256_cmpgt_epi32(tmp1, _mm256_set1_epi32(2)), 31));
1029
1030 /* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */
1031 auto need_encode2 = _mm256_cmpgt_epi32(u_q_vec, ZERO);
1032 _mm256_storeu_si256((__m256i*)mel_need_encode2, _mm256_and_si256(need_encode2, _mm256_cmpgt_epi32(tmp, ZERO)));
1033
1034 ui32 i_max = 8 - (ignore / 2);
1035
1036 for (ui32 i = 0; i < i_max; i += 2) {
1037 if (mel_need_encode[i]) {
1038 mel_encode(melp, mel_bit[i]);
1039 }
1040
1041 if (i + 1 < i_max) {
1042 if (mel_need_encode[i + 1]) {
1043 mel_encode(melp, mel_bit[i + 1]);
1044 }
1045 }
1046
1047 if (mel_need_encode2[i]) {
1048 mel_encode(melp, mel_bit2[i]);
1049 }
1050 }
1051}
1052
1053static void proc_mel_encode2(mel_struct *melp, __m256i &cq_vec,
1054 __m256i &rho_vec, __m256i u_q_vec, ui32 ignore,
1055 const __m256i right_shift)
1056{
1057 ojph_unused(u_q_vec);
1058 ojph_unused(right_shift);
1059
1060 __m256i need = _mm256_cmpeq_epi32(cq_vec, ZERO);
1061 ui32 mask = (ui32)_mm256_movemask_epi8(need);
1062 mask &= 0x88888888;
1063
1064 ui32 i_max = 8 - (ignore / 2);
1065 if (i_max < 8)
1066 mask &= (1u << (i_max * 4)) - 1;
1067
1068 if (mask == 0)
1069 return;
1070
1071 int32_t mel_bit[8];
1072 _mm256_storeu_si256((__m256i*)mel_bit,
1073 _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
1074
1075 while (mask) {
1076 ui32 bit_pos = (ui32)count_trailing_zeros(mask);
1077 ui32 i = bit_pos / 4;
1078 mel_encode(melp, mel_bit[i]);
1079 mask &= mask - 1;
1080 }
1081}
1082
1083using fn_proc_mel_encode = void (*)(mel_struct *, __m256i &, __m256i &,
1084 __m256i, ui32, const __m256i);
1085
1086static inline void
1087build_vlc_uvlc_pair(ui32 *tuple, ui32 *u_q, ui32 i,
1088 const ui32 *uvlc_tbl, ui64 &val, int &size)
1089{
1090 val = tuple[i + 0] >> 4;
1091 size = tuple[i + 0] & 7;
1092
1093 val |= (ui64)(tuple[i + 1] >> 4) << size;
1094 size += tuple[i + 1] & 7;
1095
1096 ui32 entry = uvlc_tbl[u_q[i] * 33 + u_q[i + 1]];
1097 val |= (ui64)(entry >> 5) << size;
1098 size += entry & 0x1F;
1099}
1100
1101static void proc_vlc_encode(vlc_struct *vlcp, ui32 *tuple,
1102 ui32 *u_q, ui32 ignore, const ui32 *uvlc_tbl)
1103{
1104 ui32 i_max = 8 - (ignore / 2);
1105
1106 ui32 i = 0;
1107 for (; i + 2 < i_max; i += 4) {
1108 ui64 val1; int size1;
1109 build_vlc_uvlc_pair(tuple, u_q, i, uvlc_tbl, val1, size1);
1110 ui64 val2; int size2;
1111 build_vlc_uvlc_pair(tuple, u_q, i + 2, uvlc_tbl, val2, size2);
1112 vlc_encode(vlcp, val1 | (val2 << size1), size1 + size2);
1113 }
1114 if (i < i_max) {
1115 ui64 val; int size;
1116 build_vlc_uvlc_pair(tuple, u_q, i, uvlc_tbl, val, size);
1117 vlc_encode(vlcp, val, size);
1118 }
1119}
1120
// Encodes two sample rows of a codeblock, 16 columns (8 lanes) per iteration,
// feeding the MEL, VLC and MS encoder states.
//
// Template parameter:
//   PASS  1 selects vlc_tbl0, proc_cq1, proc_mel_encode1 and uvlc_tbl_pair1;
//         any other value selects vlc_tbl1, proc_cq2, proc_mel_encode2 and
//         uvlc_tbl_pair2.  The visible caller uses 1 for y == 0 and 2 for
//         every later value of y.
// Parameters:
//   sp               first sample of the upper row; the lower row starts at
//                    sp + stride.
//   height, y        the lower row is loaded only when y + 1 < height;
//                    otherwise zeros are used for it.
//   n_loop           number of 16-column iterations.
//   _width           number of valid columns; decides whether the last
//                    iteration is a partial one.
//   ignore           forwarded to the MEL/VLC stages on the last iteration
//                    only (0 on all others); those stages process
//                    8 - ignore / 2 lanes.
//   p                forwarded to proc_pixel.
//   mel, vlc, ms     encoder states that receive the output.
//   e_val_vec, prev_e_val_vec, cx_val_vec, prev_cx_val_vec
//                    line state; entries x and x + 1 are read here and the
//                    state is handed to update_lep / update_lcxp.
//   prev_cq          context value carried from lane 7 of one iteration into
//                    lane 0 of the next; updated in place.
//   right_shift, left_shift
//                    lane permutation indices supplied by the caller.
1121template<int PASS>
1122OJPH_FORCE_INLINE void encode_x_loop(
1123 ui32 *sp, ui32 stride, ui32 height, ui32 y,
1124 ui32 n_loop, ui32 _width, ui32 ignore, ui32 p,
1125 mel_struct &mel, vlc_struct &vlc, ms_struct &ms,
1126 __m256i *e_val_vec, __m256i &prev_e_val_vec,
1127 __m256i *cx_val_vec, __m256i &prev_cx_val_vec,
1128 ui32 &prev_cq,
1129 const __m256i &right_shift, const __m256i &left_shift)
1130{
1131 ui32 *vlc_tbl = (PASS == 1) ? vlc_tbl0 : vlc_tbl1;
1132
1133 __m256i tmp, tmp1;
1134 __m256i eq_vec[4];
1135 __m256i s_vec[4];
1136 __m256i src_vec[4];
1137
1138 /* 16 samples (ui32) from each of the two rows per iteration */
1139 for (ui32 x = 0; x < n_loop; ++x) {
1140
1141 /* t = sp[i]; */
// src_vec[0] / src_vec[2] hold columns 0..7 / 8..15 of the upper row and
// src_vec[1] / src_vec[3] the same columns of the lower row.
// Partial last iteration: only _width % 16 samples are valid, so they are
// copied into a zero-filled buffer instead of loading past the row's end.
1142 if ((x == (n_loop - 1)) && (_width % 16)) {
1143 ui32 tmp_buf[16] = { 0 };
1144 memcpy(tmp_buf, sp, (_width % 16) * sizeof(ui32));
1145 src_vec[0] = _mm256_loadu_si256((__m256i*)(tmp_buf));
1146 src_vec[2] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
1147 if (y + 1 < height) {
// Same sample count as above, so the zero tail of tmp_buf is untouched.
1148 memcpy(tmp_buf, sp + stride, (_width % 16) * sizeof(ui32));
1149 src_vec[1] = _mm256_loadu_si256((__m256i*)(tmp_buf));
1150 src_vec[3] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
1151 }
1152 else {
1153 src_vec[1] = ZERO;
1154 src_vec[3] = ZERO;
1155 }
1156 }
1157 else {
1158 src_vec[0] = _mm256_loadu_si256((__m256i*)(sp));
1159 src_vec[2] = _mm256_loadu_si256((__m256i*)(sp + 8));
1160
1161 if (y + 1 < height) {
1162 src_vec[1] = _mm256_loadu_si256((__m256i*)(sp + stride));
1163 src_vec[3] = _mm256_loadu_si256((__m256i*)(sp + 8 + stride));
1164 }
1165 else {
1166 src_vec[1] = ZERO;
1167 src_vec[3] = ZERO;
1168 }
1169 sp += 16;
1170 }
1171
1172 __m256i rho_vec, e_qmax_vec;
1173 proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec);
1174
1175 // max_e[(i + 1) % num] = ojph_max(lep[i + 1], lep[i + 2]) - 1;
// Lane 7 of the permuted vector is replaced by lane 0 of e_val_vec[x + 1].
// e_val_vec[x] is read here, before update_lep is called further down.
1176 tmp = _mm256_permutevar8x32_epi32(e_val_vec[x], right_shift);
1177 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(e_val_vec[x + 1])), 7);
1178
1179 auto max_e_vec = _mm256_max_epi32(tmp, e_val_vec[x]);
1180 max_e_vec = _mm256_sub_epi32(max_e_vec, ONE);
1181
1182 // kappa[i] = (rho[i] & (rho[i] - 1)) ? ojph_max(1, max_e[i]) : 1;
// cmp is all-ones where rho & (rho - 1) == 0; those lanes get 1 and the
// remaining lanes get max(max_e, 1).
1183 tmp = _mm256_max_epi32(max_e_vec, ONE);
1184 tmp1 = _mm256_sub_epi32(rho_vec, ONE);
1185 tmp1 = _mm256_and_si256(rho_vec, tmp1);
1186
1187 auto cmp = _mm256_cmpeq_epi32(tmp1, ZERO);
1188 auto kappa_vec1_ = _mm256_and_si256(cmp, ONE);
1189 auto kappa_vec2_ = _mm256_and_si256(_mm256_xor_si256(cmp, _mm256_set1_epi32((int32_t)0xffffffff)), tmp);
1190 const __m256i kappa_vec = _mm256_max_epi32(kappa_vec1_, kappa_vec2_);
1191
// Context computation; cx_val_vec is read here, before update_lcxp is
// called further down.
1192 if (PASS == 1)
1193 tmp = proc_cq1(x, cx_val_vec, rho_vec, right_shift);
1194 else
1195 tmp = proc_cq2(x, cx_val_vec, rho_vec, right_shift);
1196
// tmp holds c_q[i + 1] per lane: permute it with left_shift, put the value
// carried in prev_cq into lane 0, and carry lane 7 of tmp to the next
// iteration.
1197 auto cq_vec = _mm256_permutevar8x32_epi32(tmp, left_shift);
1198 cq_vec = _mm256_insert_epi32(cq_vec, prev_cq, 0);
1199 prev_cq = (ui32)_mm256_extract_epi32(tmp, 7);
1200
1201 update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
1202 update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift);
1203
1204 /* Uq[i] = ojph_max(e_qmax[i], kappa[i]); */
1205 /* u_q[i] = Uq[i] - kappa[i]; */
1206 auto uq_vec = _mm256_max_epi32(kappa_vec, e_qmax_vec);
1207 auto u_q_vec = _mm256_sub_epi32(uq_vec, kappa_vec);
1208
1209 auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec);
1210 __m256i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
// Only the last iteration can contain lanes to be skipped.
1211 ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
1212
1213 if (PASS == 1)
1214 proc_mel_encode1(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1215 right_shift);
1216 else
1217 proc_mel_encode2(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1218 right_shift);
1219
1220 proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
1221
// Spill tuple (shifted right by 4 first) and u_q to memory for the scalar
// VLC stage.  The arrays have 10 entries although only 8 are stored from
// the vectors.
1222 ui32 u_q[10];
1223 ui32 tuple[10];
1224 tuple_vec = _mm256_srli_epi32(tuple_vec, 4);
1225 _mm256_storeu_si256((__m256i*)tuple, tuple_vec);
1226 _mm256_storeu_si256((__m256i*)u_q, u_q_vec);
1227 {
// proc_vlc_encode consumes lanes in pairs; when i_max is odd it also reads
// entry i_max, so that entry is zeroed.  Entry 8 is zeroed unconditionally.
1228 ui32 i_max = 8 - (_ignore / 2);
1229 if (i_max & 1) { tuple[i_max] = 0; u_q[i_max] = 0; }
1230 tuple[8] = 0; u_q[8] = 0;
1231 }
1232 proc_vlc_encode(&vlc, tuple, u_q, _ignore,
1233 (PASS == 1) ? uvlc_tbl_pair1 : uvlc_tbl_pair2);
1234 }
1235}
1236
1237void ojph_encode_codeblock_avx2(ui32* buf, ui32 missing_msbs,
1238 ui32 num_passes, ui32 _width, ui32 height,
1239 ui32 stride, ui32* lengths,
1240 ojph::mem_elastic_allocator *elastic,
1241 ojph::coded_lists *& coded)
1242{
1243 ojph_unused(num_passes); //currently not used
1244
1245 ui32 width = (_width + 15) & ~15u;
1246 ui32 ignore = width - _width;
1247 const int ms_size = (16384 * 16 + 14) / 15; //more than enough
1248 const int mel_vlc_size = 3072; //more than enough
1249 const int mel_size = 192;
1250 const int vlc_size = mel_vlc_size - mel_size;
1251
1252 ui8 ms_buf[ms_size];
1253 ui8 mel_vlc_buf[mel_vlc_size];
1254 ui8 *mel_buf = mel_vlc_buf;
1255 ui8 *vlc_buf = mel_vlc_buf + mel_size;
1256
1257 mel_struct mel;
1258 mel_init(&mel, mel_size, mel_buf);
1259 vlc_struct vlc;
1260 vlc_init(&vlc, vlc_size, vlc_buf);
1261 ms_struct ms;
1262 ms_init(&ms, ms_size, ms_buf);
1263
1264 const ui32 p = 30 - missing_msbs;
1265
1266 //e_val: E values for a line (these are the highest set bit)
1267 //cx_val: is the context values
1268 //Each byte stores the info for the 2 sample. For E, it is maximum
1269 // of the two samples, while for cx, it is the OR of these two samples.
1270 //The maximum is between the pixel at the bottom left of one quad
1271 // and the bottom right of the earlier quad. The same is true for cx.
1272 //For a 1024 pixels, we need 512 bytes, the 2 extra,
1273 // one for the non-existing earlier quad, and one for beyond the
1274 // the end
1275 const __m256i right_shift = _mm256_set_epi32(
1276 0, 7, 6, 5, 4, 3, 2, 1
1277 );
1278
1279 const __m256i left_shift = _mm256_set_epi32(
1280 6, 5, 4, 3, 2, 1, 0, 7
1281 );
1282
1283 ui32 n_loop = (width + 15) / 16;
1284
1285 __m256i e_val_vec[65];
1286 for (ui32 i = 0; i < ojph_min(64, n_loop); ++i)
1287 e_val_vec[i] = ZERO;
1288
1289 __m256i prev_e_val_vec = ZERO;
1290
1291 __m256i cx_val_vec[65];
1292 __m256i prev_cx_val_vec = ZERO;
1293
1294 ui32 prev_cq = 0;
1295
1296 __m256i tmp;
1297
1298 /* 2 lines per iteration */
1299 for (ui32 y = 0; y < height; y += 2)
1300 {
1301 e_val_vec[n_loop] = prev_e_val_vec;
1302 /* lcxp[0] = (ui8)((rho[0] & 8) >> 3); */
1303 tmp = _mm256_and_si256(prev_cx_val_vec, _mm256_set1_epi32(8));
1304 cx_val_vec[n_loop] = _mm256_srli_epi32(tmp, 3);
1305
1306 prev_e_val_vec = ZERO;
1307 prev_cx_val_vec = ZERO;
1308
1309 ui32 *sp = buf + y * stride;
1310
1311 if (y == 0)
1312 encode_x_loop<1>(sp, stride, height, y, n_loop, _width,
1313 ignore, p, mel, vlc, ms,
1314 e_val_vec, prev_e_val_vec,
1315 cx_val_vec, prev_cx_val_vec, prev_cq,
1316 right_shift, left_shift);
1317 else
1318 encode_x_loop<2>(sp, stride, height, y, n_loop, _width,
1319 ignore, p, mel, vlc, ms,
1320 e_val_vec, prev_e_val_vec,
1321 cx_val_vec, prev_cx_val_vec, prev_cq,
1322 right_shift, left_shift);
1323
1324 tmp = _mm256_permutevar8x32_epi32(cx_val_vec[0], right_shift);
1325 tmp = _mm256_slli_epi32(tmp, 2);
1326 tmp = _mm256_add_epi32(tmp, cx_val_vec[0]);
1327 prev_cq = (ui32)_mm_cvtsi128_si32(_mm256_castsi256_si128(tmp));
1328 }
1329
1330 ms_terminate(&ms);
1331 vlc_drain(&vlc);
1332 terminate_mel_vlc(&mel, &vlc);
1333
1334 //copy to elastic
1335 lengths[0] = mel.pos + vlc.pos + ms.pos;
1336 elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded);
1337 memcpy(coded->buf, ms.buf, ms.pos);
1338 memcpy(coded->buf + ms.pos, mel.buf, mel.pos);
1339 memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);
1340
1341 // put in the interface locator word
1342 ui32 num_bytes = mel.pos + vlc.pos;
1343 coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4);
1344 coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0;
1345 coded->buf[lengths[0]-2] =
1346 (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF));
1347
1348 coded->avail_size -= lengths[0];
1349}
1350
1351} /* namespace local */
1352} /* namespace ojph */
1353
1354#endif
1355#endif // !defined(__apple_build_version__)
void get_buffer(ui32 needed_bytes, coded_lists *&p)
Definition ojph_mem.cpp:113
static bool uvlc_init_tables()
Initializes uvlc_tbl0 and uvlc_tbl1 tables.
static bool vlc_init_tables()
Initializes vlc_tbl0 and vlc_tbl1 tables, from table0.h and table1.h.
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
static void ms_terminate(ms_struct *msp)
static void vlc_encode(vlc_struct *vlcp, int cwd, int cwd_len)
static void terminate_mel_vlc(mel_struct *melp, vlc_struct *vlcp)
void ojph_encode_codeblock_avx2(ui32 *buf, ui32 missing_msbs, ui32 num_passes, ui32 width, ui32 height, ui32 stride, ui32 *lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *&coded)
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static void ms_init(ms_struct *msp, ui32 buffer_size, ui8 *data)
static void mel_encode(mel_struct *melp, bool bit)
bool initialize_block_encoder_tables_avx2()
static void vlc_init(vlc_struct *vlcp, ui32 buffer_size, ui8 *data)
static uvlc_tbl_struct uvlc_tbl[num_uvlc_entries]
uint64_t ui64
Definition ojph_defs.h:56
uint16_t ui16
Definition ojph_defs.h:52
static ui32 population_count(ui32 val)
Definition ojph_arch.h:185
static ui32 count_trailing_zeros(ui32 val)
Definition ojph_arch.h:269
int32_t si32
Definition ojph_defs.h:55
uint32_t ui32
Definition ojph_defs.h:54
uint8_t ui8
Definition ojph_defs.h:50
#define OJPH_FORCE_INLINE
Definition ojph_arch.h:75
#define ojph_max(a, b)
Definition ojph_defs.h:73
#define ojph_min(a, b)
Definition ojph_defs.h:76
#define ojph_unused(x)
Definition ojph_defs.h:78
#define OJPH_ERROR(t,...)