OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_block_decoder_vsx.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2022, Aous Naman
6// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2022, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_block_decoder_vsx.cpp
34// Author: Aous Naman
35// Date: 13 May 2022
36//***************************************************************************/
37
38//***************************************************************************/
42
43#include <string>
44#include <iostream>
45
46#include <cassert>
47#include <cstring>
48#include "ojph_block_common.h"
49#include "ojph_block_decoder.h"
50#include "ojph_arch.h"
51#include "ojph_message.h"
52
53#include "ojph_simd_vsx.h"
54
55namespace ojph {
56 namespace local {
57
58 //************************************************************************/
// Expand the argument into a comma-separated list of 2, 4, 8 or 16 copies.
// Used further down to spell vector constants whose lanes are all equal,
// e.g. vsx_i8x16_const(OJPH_REPEAT16(-1)).
// The argument is pasted textually and is not parenthesized, so pass
// simple literals only.
61 #define OJPH_REPEAT2(a) a,a
62 #define OJPH_REPEAT4(a) a,a,a,a
63 #define OJPH_REPEAT8(a) a,a,a,a,a,a,a,a
64 #define OJPH_REPEAT16(a) a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a
65
66 //************************************************************************/
73 struct dec_mel_st {
74 dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
75 k(0), num_runs(0), runs(0)
76 {}
77 // data decoding machinery
78 ui8* data;
79 ui64 tmp;
80 int bits;
81 int size;
82 bool unstuff;
83 int k;
84
85 // queue of decoded runs
86 int num_runs;
87 ui64 runs;
88 };
89
90 //************************************************************************/
// Refill step of the MEL reader (called below as mel_read(melp)).
//
// Does nothing when melp->tmp already holds more than 32 bits. Otherwise
// it takes four bytes from melp->data, removes stuffing bits, and places
// the result in melp->tmp directly below the bits that are already there,
// so that decoding always proceeds from the MSB of tmp.
//  - Bytes past the end of the segment are replaced by 0xFF.
//  - The last byte of the segment has its low four bits forced to 1,
//    because the MEL and VLC segments can overlap in that byte.
//  - A byte that follows a 0xFF byte contributes only 7 bits; the flag
//    melp->unstuff carries this condition across calls.
// The 32-bit word is interpreted with its first byte in the low 8 bits,
// i.e. the multi-byte read assumes a little-endian layout.
102 static inline
104 {
105 if (melp->bits > 32) //there are enough bits in the tmp variable
106 return; // return without reading new data
107
108 ui32 val = 0xFFFFFFFF; // feed in 0xFF if buffer is exhausted
109 if (melp->size > 4) { // if there is data in the MEL segment
110 memcpy(&val, melp->data, sizeof(val)); // read 32 bits from MEL data
111 melp->data += 4; // advance pointer
112 melp->size -= 4; // reduce counter
113 }
114 else if (melp->size > 0)
115 { // 4 or less
116 int i = 0;
117 while (melp->size > 1) {
118 ui32 v = *melp->data++; // read one byte at a time
119 ui32 m = ~(0xFFu << i); // mask of location
120 val = (val & m) | (v << i);// put one byte in its correct location
121 --melp->size;
122 i += 8;
123 }
124 // size equal to 1
125 ui32 v = *melp->data++; // the one before the last is different
126 v |= 0xF; // MEL and VLC segments can overlap
127 ui32 m = ~(0xFFu << i);
128 val = (val & m) | (v << i);
129 --melp->size;
130 }
131
132 // next we unstuff them before adding them to the buffer
133 int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
134 // the previously read byte requires
135 // unstuffing
136
137 // data is unstuffed and accumulated in t
138 // bits has the number of bits in t
139 ui32 t = val & 0xFF;
140 bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
141 bits -= unstuff; // there is one less bit in t if unstuffing is needed
142 t = t << (8 - unstuff); // move up to make room for the next byte
143
144 //this is a repeat of the above
145 t |= (val>>8) & 0xFF;
146 unstuff = (((val >> 8) & 0xFF) == 0xFF);
147 bits -= unstuff;
148 t = t << (8 - unstuff);
149
150 t |= (val>>16) & 0xFF;
151 unstuff = (((val >> 16) & 0xFF) == 0xFF);
152 bits -= unstuff;
153 t = t << (8 - unstuff);
154
155 t |= (val>>24) & 0xFF;
156 melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
157
158 // move t to tmp, and push the result all the way up, so we read from
159 // the MSB
160 melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
161 melp->bits += bits; //increment the number of bits in tmp
162 }
163
164 //************************************************************************/
// Decoding step of the MEL reader (called below as mel_decode(melp)).
//
// Refills melp->tmp through mel_read() when fewer than 6 bits are
// buffered, then decodes codewords from the MSB end of tmp for as long as
// at least 6 bits remain and fewer than 8 runs are queued.
// The state melp->k selects the exponent from mel_exp[]:
//  - a leading 1 bit consumes one bit, yields (1 << exp) - 1, and moves
//    the state up (capped at 12);
//  - a leading 0 bit consumes exp + 1 bits, yields the exp bits that
//    follow it, and moves the state down (floored at 0).
// Each result is stored doubled, with the low bit set only in the second
// case (a stretch of zeros terminating with a one), in a 7-bit slot of
// melp->runs at index melp->num_runs.
179 static inline
181 {
182 static const int mel_exp[13] = { //MEL exponents
183 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
184 };
185
186 if (melp->bits < 6) // if there are less than 6 bits in tmp
187 mel_read(melp); // then read from the MEL bitstream
188 // 6 bits is the largest decodable MEL cwd
189
190 //repeat so long that there is enough decodable bits in tmp,
191 // and the runs store is not full (num_runs < 8)
192 while (melp->bits >= 6 && melp->num_runs < 8)
193 {
194 int eval = mel_exp[melp->k]; // number of bits associated with state
195 int run = 0;
196 if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB)
197 { //one is found
198 run = 1 << eval;
199 run--; // consecutive runs of 0 events - 1
200 melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
201 melp->tmp <<= 1; // consume one bit from tmp
202 melp->bits -= 1;
203 run = run << 1; // a stretch of zeros not terminating in one
204 }
205 else
206 { //0 is found
207 run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
208 melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0
209 melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
210 melp->bits -= eval + 1;
211 run = (run << 1) + 1; // a stretch of zeros terminating with one
212 }
213 eval = melp->num_runs * 7; // 7 bits per run
214 melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient
215 melp->runs |= ((ui64)run) << eval; // store the value in runs
216 melp->num_runs++; // increment count
217 }
218 }
219
220 //************************************************************************/
230 static inline
231 void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup)
232 {
233 melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
234 melp->bits = 0; // 0 bits in tmp
235 melp->tmp = 0; //
236 melp->unstuff = false; // no unstuffing
237 melp->size = scup - 1; // size is the length of MEL+VLC-1
238 melp->k = 0; // 0 for state
239 melp->num_runs = 0; // num_runs is 0
240 melp->runs = 0; //
241
242 //This code is borrowed; original is for a different architecture
243 //These few lines take care of the case where data is not at a multiple
244 // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MEL segment
245 int num = 4 - (int)(intptr_t(melp->data) & 0x3);
246 for (int i = 0; i < num; ++i) { // this code is similar to mel_read
247 assert(melp->unstuff == false || melp->data[0] <= 0x8F);
248 ui64 d = (melp->size > 0) ? *melp->data : 0xFF;//if buffer is consumed
249 //set data to 0xFF
250 if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
251 // see the standard
252 melp->data += melp->size-- > 0; //increment if the end is not reached
253 int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
254 melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
255 melp->bits += d_bits; //increment tmp by number of bits
256 melp->unstuff = ((d & 0xFF) == 0xFF); //true of next byte needs
257 //unstuffing
258 }
259 melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
260 // is the MSB
261 }
262
263 //************************************************************************/
// Pops the oldest entry from the queue of decoded MEL runs.
// When the queue is empty, mel_decode() is called first to produce more.
// Returns the low 7 bits of melp->runs, after which the queue is shifted
// down by 7 bits and the entry count is decremented.
269 static inline
271 {
272 if (melp->num_runs == 0) //if no runs, decode more bit from MEL segment
273 mel_decode(melp);
274
275 int t = melp->runs & 0x7F; //retrieve one run
276 melp->runs >>= 7; // remove the retrieved run
277 melp->num_runs--;
278 return t; // return run
279 }
280
281 //************************************************************************/
285 struct rev_struct {
286 rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
287 {}
288 //storage
289 ui8* data;
290 ui64 tmp;
291 ui32 bits;
292 int size;
293 bool unstuff;
295 };
296
297 //************************************************************************/
// Refill step of the backward VLC reader (called below as rev_read(vlcp)).
//
// Does nothing when vlcp->tmp already holds more than 32 bits, because
// adding another 32 could overflow the 64-bit buffer. Otherwise up to
// four bytes are read from vlcp->data toward lower addresses:
//  - with more than 3 bytes left, one 32-bit word is read from data - 3;
//  - with 1 to 3 bytes left, they are read one at a time into the upper
//    bytes of val, and the missing bytes stay zero.
// Bytes are then appended to tmp starting from the top byte of val. A
// byte whose low 7 bits are all ones contributes only 7 bits when the
// byte read before it was above 0x8F; vlcp->unstuff carries that state
// across calls.
// The 32-bit read assumes a little-endian layout (last byte of the word
// in the top 8 bits of val).
317 static inline
319 {
320 //process 4 bytes at a time
321 if (vlcp->bits > 32) // if there are more than 32 bits in tmp, then
322 return; // reading 32 bits can overflow vlcp->tmp
323 ui32 val = 0;
324 //the next line (the if statement) needs to be tested first
325 if (vlcp->size > 3) // if there are more than 3 bytes left in VLC
326 {
327 // (vlcp->data - 3) move pointer back to read 32 bits at once
328 memcpy(&val, vlcp->data - 3, sizeof(val)); // then read 32 bits
329 vlcp->data -= 4; // move data pointer back by 4
330 vlcp->size -= 4; // reduce available byte by 4
331 }
332 else if (vlcp->size > 0)
333 { // 4 or less
334 int i = 24;
335 while (vlcp->size > 0) {
336 ui32 v = *vlcp->data--; // read one byte at a time
337 val |= (v << i); // put byte in its correct location
338 --vlcp->size;
339 i -= 8;
340 }
341 }
342
343 //accumulate in tmp, number of bits in tmp are stored in bits
344 ui32 tmp = val >> 24; //start with the MSB byte
345 ui32 bits;
346
347 // test unstuff (previous byte is >0x8F), and this byte is 0x7F
348 bits = 8 - ((vlcp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
349 bool unstuff = (val >> 24) > 0x8F; //this is for the next byte
350
351 tmp |= ((val >> 16) & 0xFF) << bits; //process the next byte
352 bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
353 unstuff = ((val >> 16) & 0xFF) > 0x8F;
354
355 tmp |= ((val >> 8) & 0xFF) << bits;
356 bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
357 unstuff = ((val >> 8) & 0xFF) > 0x8F;
358
359 tmp |= (val & 0xFF) << bits;
360 bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
361 unstuff = (val & 0xFF) > 0x8F;
362
363 // now move the read and unstuffed bits into vlcp->tmp
364 vlcp->tmp |= (ui64)tmp << vlcp->bits;
365 vlcp->bits += bits;
366 vlcp->unstuff = unstuff; // this for the next read
367 }
368
369 //************************************************************************/
383 static inline
384 void rev_init(rev_struct *vlcp, ui8* data, int lcup, int scup)
385 {
386 //first byte has only the upper 4 bits
387 vlcp->data = data + lcup - 2;
388
389 //size can not be larger than this, in fact it should be smaller
390 vlcp->size = scup - 2;
391
392 ui32 d = *vlcp->data--; // read one byte (this is a half byte)
393 vlcp->tmp = d >> 4; // both initialize and set
394 vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); //check standard
395 vlcp->unstuff = (d | 0xF) > 0x8F; //this is useful for the next byte
396
397 //This code is designed for an architecture that read address should
398 // align to the read size (address multiple of 4 if read size is 4)
399 //These few lines take care of the case where data is not at a multiple
400 // of 4 boundary. It reads 1,2,3 up to 4 bytes from the VLC bitstream.
401 // To read 32 bits, read from (vlcp->data - 3)
402 int num = 1 + (int)(intptr_t(vlcp->data) & 0x3);
403 int tnum = num < vlcp->size ? num : vlcp->size;
404 for (int i = 0; i < tnum; ++i) {
405 ui64 d;
406 d = *vlcp->data--; // read one byte and move read pointer
407 //check if the last byte was >0x8F (unstuff == true) and this is 0x7F
408 ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
409 vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp
410 vlcp->bits += d_bits;
411 vlcp->unstuff = d > 0x8F; // for next byte
412 }
413 vlcp->size -= tnum;
414 rev_read(vlcp); // read another 32 buts
415 }
416
417 //************************************************************************/
// Returns the low 32 bits of vlcp->tmp without consuming them.
// When fewer than 32 bits are buffered, rev_read() is called, and called
// a second time if the count is still below 32 (a refill can deliver
// fewer than 32 bits because of unstuffing).
424 static inline
426 {
427 if (vlcp->bits < 32) // if there are less then 32 bits, read more
428 {
429 rev_read(vlcp); // read 32 bits, but unstuffing might reduce this
430 if (vlcp->bits < 32)// if there is still space in vlcp->tmp for 32 bits
431 rev_read(vlcp); // read another 32
432 }
433 return (ui32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp
434 }
435
436 //************************************************************************/
// Consumes num_bits from the low end of vlcp->tmp and returns the low
// 32 bits of what remains. The caller must not ask for more bits than
// are buffered; this is checked by an assertion only.
442 static inline
444 {
445 assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits
446 vlcp->tmp >>= num_bits; // remove bits
447 vlcp->bits -= num_bits; // decrement the number of bits
448 return (ui32)vlcp->tmp;
449 }
450
451 //************************************************************************/
// Refill step of the backward MRP reader (called below as
// rev_read_mrp(mrp)).
//
// Does nothing when mrp->tmp already holds more than 32 bits. Otherwise
// up to four bytes are read from mrp->data toward lower addresses:
//  - with more than 3 bytes left, one 32-bit word is read from data - 3;
//  - with 1 to 3 bytes left, they are read one at a time into the upper
//    bytes of val, and the missing bytes stay zero;
//  - with nothing left, val stays zero, so zero bits are appended.
// A byte whose low 7 bits are all ones contributes only 7 bits when the
// byte read before it was above 0x8F; mrp->unstuff carries that state
// across calls.
462 static inline
464 {
465 //process 4 bytes at a time
466 if (mrp->bits > 32)
467 return;
468 ui32 val = 0;
469 if (mrp->size > 3) // If there are 3 byte or more
470 { // (mrp->data - 3) move pointer back to read 32 bits at once
471 memcpy(&val, mrp->data - 3, sizeof(val)); // read 32 bits
472 mrp->data -= 4; // move back pointer
473 mrp->size -= 4; // reduce count
474 }
475 else if (mrp->size > 0)
476 {
477 int i = 24;
478 while (mrp->size > 0) {
479 ui32 v = *mrp->data--; // read one byte at a time
480 val |= (v << i); // put byte in its correct location
481 --mrp->size;
482 i -= 8;
483 }
484 }
485
486 //accumulate in tmp, and keep count in bits
487 ui32 bits, tmp = val >> 24;
488
489 //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
490 bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
491 bool unstuff = (val >> 24) > 0x8F;
492
493 //process the next byte
494 tmp |= ((val >> 16) & 0xFF) << bits;
495 bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
496 unstuff = ((val >> 16) & 0xFF) > 0x8F;
497
498 tmp |= ((val >> 8) & 0xFF) << bits;
499 bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
500 unstuff = ((val >> 8) & 0xFF) > 0x8F;
501
502 tmp |= (val & 0xFF) << bits;
503 bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
504 unstuff = (val & 0xFF) > 0x8F;
505
506 mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp pointer
507 mrp->bits += bits;
508 mrp->unstuff = unstuff; // next byte
509 }
510
511 //************************************************************************/
526 static inline
527 void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2)
528 {
529 mrp->data = data + lcup + len2 - 1;
530 mrp->size = len2;
531 mrp->unstuff = true;
532 mrp->bits = 0;
533 mrp->tmp = 0;
534
535 //This code is designed for an architecture that read address should
536 // align to the read size (address multiple of 4 if read size is 4)
537 //These few lines take care of the case where data is not at a multiple
538 // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MRP stream
539 int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
540 for (int i = 0; i < num; ++i) {
541 ui64 d;
542 //read a byte, 0 if no more data
543 d = (mrp->size-- > 0) ? *mrp->data-- : 0;
544 //check if unstuffing is needed
545 ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
546 mrp->tmp |= d << mrp->bits; // move data to vlcp->tmp
547 mrp->bits += d_bits;
548 mrp->unstuff = d > 0x8F; // for next byte
549 }
550 rev_read_mrp(mrp);
551 }
552
553 //************************************************************************/
// Returns the low 32 bits of mrp->tmp without consuming them.
// When fewer than 32 bits are buffered, rev_read_mrp() is called, and
// called a second time if the count is still below 32 (a refill can
// deliver fewer than 32 bits because of unstuffing).
560 static inline
562 {
563 if (mrp->bits < 32) // if there are less than 32 bits in mrp->tmp
564 {
565 rev_read_mrp(mrp); // read 30-32 bits from mrp
566 if (mrp->bits < 32) // if there is a space of 32 bits
567 rev_read_mrp(mrp); // read more
568 }
569 return (ui32)mrp->tmp; // return the head of mrp->tmp
570 }
571
572 //************************************************************************/
578 inline ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
579 {
580 assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
581 mrp->tmp >>= num_bits; // discard the lowest num_bits bits
582 mrp->bits -= num_bits;
583 return (ui32)mrp->tmp; // return data after consumption
584 }
585
586 //************************************************************************/
597
598 //************************************************************************/
// Refill step of the forward reader (called below as frwd_read<X>(msp)).
//
// Template parameter X is the fill value used for bytes beyond the end
// of the data; only 0xFF and 0 are handled, anything else asserts.
//
// Steps, as performed below:
//  1. Load 16 bytes from msp->data; at most msp->size of them are valid,
//     and the pointer/size advance by that many. The load itself always
//     covers 16 bytes, so the source must be readable that far.
//  2. Replace the bytes that are not valid with X.
//  3. Build a bit mask of valid bytes equal to 0xFF and shift it by one,
//     so that each set bit marks the byte that follows a 0xFF byte;
//     msp->unstuff supplies the bit for byte 0, and the bit that falls
//     off the top becomes the unstuff state for the next call.
//  4. For every marked byte, highest first, shift all higher bytes down
//     by one bit, which removes one bit and reduces the bit count.
//  5. OR the result into msp->tmp at bit position msp->bits and add the
//     number of bits obtained to msp->bits.
616 template<int X>
617 static inline
619 {
620 assert(msp->bits <= 128);
621
622 v128_t offset, val, validity, all_xff;
623 val = vsx_v128_load(msp->data);
624 int bytes = msp->size >= 16 ? 16 : msp->size;
625 validity = vsx_i8x16_splat((char)bytes);
626 msp->data += bytes;
627 msp->size -= bytes;
628 ui32 bits = 128;
629 offset = vsx_i64x2_const(0x0706050403020100,0x0F0E0D0C0B0A0908);
630 validity = vsx_i8x16_gt(validity, offset);
631 all_xff = vsx_i8x16_const(OJPH_REPEAT16(-1));
632 if (X == 0xFF) // the compiler should remove this if statement
633 {
634 v128_t t = vsx_v128_xor(validity, all_xff); // complement
635 val = vsx_v128_or(t, val); // fill with 0xFF
636 }
637 else if (X == 0)
638 val = vsx_v128_and(validity, val); // fill with zeros
639 else
640 assert(0);
641
642 v128_t ff_bytes;
643 ff_bytes = vsx_i8x16_eq(val, all_xff);
644 ff_bytes = vsx_v128_and(ff_bytes, validity);
645 ui32 flags = vsx_i8x16_bitmask(ff_bytes);
646 flags <<= 1; // unstuff following byte
647 ui32 next_unstuff = flags >> 16;
648 flags |= msp->unstuff;
649 flags &= 0xFFFF;
650 while (flags)
651 { // bit unstuffing occurs on average once every 256 bytes
652 // therefore it is not an issue if it is a bit slow
653 // here we process 16 bytes
654 --bits; // consuming one stuffing bit
655
656 ui32 loc = 31 - count_leading_zeros(flags);
657 flags ^= 1 << loc;
658
659 v128_t m, t, c;
660 t = vsx_i8x16_splat((char)loc);
661 m = vsx_i8x16_gt(offset, t);
662
663 t = vsx_v128_and(m, val); // keep bits at locations larger than loc
664 c = vsx_u64x2_shr(t, 1); // 1 bits left
665 t = vsx_i64x2_shuffle(t, vsx_i64x2_const(0, 0), 1, 2);
666 t = vsx_i64x2_shl(t, 63); // keep the MSB only
667 t = vsx_v128_or(t, c); // combine the above 3 steps
668
669 val = vsx_v128_or(t, vsx_v128_andnot(val, m));
670 }
671
672 // combine with earlier data
673 assert(msp->bits >= 0 && msp->bits <= 128);
674 int cur_bytes = msp->bits >> 3;
675 ui32 cur_bits = msp->bits & 7;
676 v128_t b1, b2;
677 b1 = vsx_i64x2_shl(val, cur_bits);
678 //next shift 8 bytes right
679 b2 = vsx_i64x2_shuffle(vsx_i64x2_const(0, 0), val, 1, 2);
680 b2 = vsx_u64x2_shr(b2, 64u - cur_bits);
681 b2 = (cur_bits > 0) ? b2 : vsx_i64x2_const(0, 0);
682 b1 = vsx_v128_or(b1, b2);
683 b2 = vsx_v128_load(msp->tmp + cur_bytes);
684 b2 = vsx_v128_or(b1, b2);
685 vsx_v128_store(msp->tmp + cur_bytes, b2);
686
687 ui32 consumed_bits = bits < 128u - cur_bits ? bits : 128u - cur_bits;
688 cur_bytes = (msp->bits + consumed_bits + 7) >> 3; // round up
689 int upper = vsx_u16x8_extract_lane(val, 7);
690 upper >>= consumed_bits + 16 - 128;
691 msp->tmp[cur_bytes] = (ui8)upper; // copy byte
692
693 msp->bits += bits;
694 msp->unstuff = next_unstuff; // next unstuff
695 assert(msp->unstuff == 0 || msp->unstuff == 1);
696 }
697
698 //************************************************************************/
// Prepares the forward reader.
//   X     fill value passed on to frwd_read (0xFF or 0)
//   msp   reader state to initialize
//   data  first byte of the stream to read
//   size  number of valid bytes at data
// Resets the bit count and the unstuff flag, clears the buffer, and
// primes it with one call to frwd_read<X>().
// NOTE(review): the stores visible here clear bytes 16..47 of msp->tmp,
// while frwd_read ORs new data into msp->tmp starting at byte 0 when
// msp->bits is 0 -- confirm that bytes 0..15 are cleared as well.
707 template<int X>
708 static inline
709 void frwd_init(frwd_struct *msp, const ui8* data, int size)
710 {
711 msp->data = data;
713 vsx_v128_store(msp->tmp + 16, vsx_i64x2_const(0, 0));
714 vsx_v128_store(msp->tmp + 32, vsx_i64x2_const(0, 0));
715
716 msp->bits = 0;
717 msp->unstuff = 0;
718 msp->size = size;
719
720 frwd_read<X>(msp); // read 128 bits more
721 }
722
723 //************************************************************************/
729 static inline
730 void frwd_advance(frwd_struct *msp, ui32 num_bits)
731 {
732 assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128);
733 msp->bits -= num_bits;
734
735 v128_t *p = (v128_t*)(msp->tmp + ((num_bits >> 3) & 0x18));
736 num_bits &= 63;
737
738 v128_t v0, v1, c0, c1, t;
739 v0 = vsx_v128_load(p);
740 v1 = vsx_v128_load(p + 1);
741
742 // shift right by num_bits
743 c0 = vsx_u64x2_shr(v0, num_bits);
744 t = vsx_i64x2_shuffle(v0, vsx_i64x2_const(0, 0), 1, 2);
745 t = vsx_i64x2_shl(t, 64 - num_bits);
746 t = (num_bits > 0) ? t : vsx_i64x2_const(0, 0);
747 c0 = vsx_v128_or(c0, t);
748 t = vsx_i64x2_shuffle(vsx_i64x2_const(0, 0), v1, 1, 2);
749 t = vsx_i64x2_shl(t, 64 - num_bits);
750 t = (num_bits > 0) ? t : vsx_i64x2_const(0, 0);
751 c0 = vsx_v128_or(c0, t);
752
753 vsx_v128_store(msp->tmp, c0);
754
755 c1 = vsx_u64x2_shr(v1, num_bits);
756 t = vsx_i64x2_shuffle(v1, vsx_i64x2_const(0, 0), 1, 2);
757 t = vsx_i64x2_shl(t, 64 - num_bits);
758 t = (num_bits > 0) ? t : vsx_i64x2_const(0, 0);
759 c1 = vsx_v128_or(c1, t);
760
761 vsx_v128_store(msp->tmp + 16, c1);
762 }
763
764 //************************************************************************/
// Returns the first 128 bits of the forward reader's buffer without
// consuming them (used below as frwd_fetch<0xFF>(magsgn)).
// When 128 bits or fewer are buffered, frwd_read<X>() is called, and
// called a second time if the count is still not above 128.
771 template<int X>
772 static inline
774 {
775 if (msp->bits <= 128)
776 {
777 frwd_read<X>(msp);
778 if (msp->bits <= 128) //need to test
779 frwd_read<X>(msp);
780 }
781 v128_t t = vsx_v128_load(msp->tmp);
782 return t;
783 }
784
785 //************************************************************************/
802 template<int X>
803 static inline
804 ui32 destuff_frwd(const ui8* src, int size, ui8* dst, ui32 cap)
805 {
806 if (size < 0)
807 size = 0;
808 ui8* o = dst;
809 ui8* o_end = dst + cap;
810 const ui8* s = src;
811 const ui8* s_end = src + size;
812 ui64 acc = 0; // partial output byte, low nb bits are valid
813 ui32 nb = 0; // number of valid bits in acc; always < 8
814 bool prev_ff = false;
815
816 // fast path; 16 source bytes at a time when they contain no 0xFF
817 while (s + 16 <= s_end && o + 24 <= o_end)
818 {
819 v128_t v = vsx_v128_load(s);
820 if (vec_any_eq((vsx_v_u8)v, vec_splats((unsigned char)0xFF))
821 || prev_ff)
822 { // process these 16 bytes one at a time
823 for (int i = 0; i < 16; ++i) {
824 ui8 b = *s++;
825 acc |= (ui64)b << nb;
826 nb += prev_ff ? 7u : 8u;
827 prev_ff = (b == 0xFFu);
828 if (nb >= 8) { *o++ = (ui8)acc; acc >>= 8; nb -= 8; }
829 }
830 continue;
831 }
832 ui64 v0, v1;
833 memcpy(&v0, s, 8);
834 memcpy(&v1, s + 8, 8);
835 ui64 w0 = acc | (v0 << nb);
836 ui64 w1 = (v1 << nb) | (nb ? (v0 >> (64 - nb)) : 0);
837 memcpy(o, &w0, 8);
838 memcpy(o + 8, &w1, 8);
839 acc = nb ? (v1 >> (64 - nb)) : 0;
840 o += 16;
841 s += 16;
842 }
843 // tail; one byte at a time
844 while (s < s_end && o < o_end)
845 {
846 ui8 b = *s++;
847 acc |= (ui64)b << nb;
848 nb += prev_ff ? 7u : 8u;
849 prev_ff = (b == 0xFFu);
850 if (nb >= 8) { *o++ = (ui8)acc; acc >>= 8; nb -= 8; }
851 }
852 // fill the bits above nb with X, and pad with X bytes
853 ui32 fill = (X == 0xFF) ? (0xFFu << nb) : 0;
854 *o = (ui8)((ui32)acc | fill);
855 memset(o + 1, X, 64);
856 return (ui32)(o - dst) + 1;
857 }
858
859 //************************************************************************/
872 static inline
873 v128_t vsx_dfetch(const ui8* dbuf, ui32 limit, ui32 pos)
874 {
875 ui32 off = pos >> 3;
876 off = off < limit ? off : limit;
877 const ui8* p = dbuf + off;
878 v128_t v = vsx_v128_load(p);
879 v128_t w = vsx_v128_load(p + 8);
880 int k = (int)(pos & 7);
881 v128_t r = vsx_u64x2_shr(v, k);
882 // shift left by 64 - k without branching on k == 0; vector shifts
883 // are modulo 64, so the shift is split into 1 and 63 - k
884 v128_t c = vsx_i64x2_shl(vsx_i64x2_shl(w, 1), 63 - k);
885 return vsx_v128_or(r, c);
886 }
887
888 //************************************************************************/
// Decodes the magnitude and sign of the four samples of one quad into
// 32-bit lanes.
//   N        which 32-bit lane of inf_u_q and U_q describes this quad;
//            only 0 and 1 are handled, anything else asserts
//   inf_u_q  per-quad information word; the bits under the masks
//            0x1110, 0x2220, 0x4440 and 0x8880 belong to samples 0..3
//   U_q      per-quad U_q values
//   magsgn   forward reader supplying the MagSgn bits
//   p        the decoded value is shifted left by p - 1
//   vn       accumulator that receives (by OR) values taken from
//            samples 1 and 3 of this quad, at a lane position that
//            depends on N
// Returns the decoded row; lanes of insignificant samples are zero.
// When all four samples are insignificant, nothing is read from magsgn
// and vn is left unchanged. Otherwise the total number of bits used by
// the quad is consumed from magsgn through frwd_advance().
900 template <int N>
901 static inline
903 frwd_struct* magsgn, ui32 p, v128_t& vn)
904 {
905 v128_t w0; // workers
906 v128_t insig; // lanes hold FF's if samples are insignificant
907 v128_t flags; // lanes hold e_k, e_1, and rho
908 v128_t row; // decoded row
909
910 row = vsx_i64x2_const(0, 0);
911 w0 = vsx_i32x4_shuffle(inf_u_q, inf_u_q, N, N, N, N);
912 // we keeps e_k, e_1, and rho in w2
913 flags = vsx_v128_and(w0, vsx_i32x4_const(0x1110,0x2220,0x4440,0x8880));
914 insig = vsx_i32x4_eq(flags, vsx_i64x2_const(0, 0));
915 if (vsx_i8x16_bitmask(insig) != 0xFFFF) //are all insignificant?
916 {
917 U_q = vsx_i32x4_shuffle(U_q, U_q, N, N, N, N);
918 flags = vsx_i16x8_mul(flags, vsx_i16x8_const(8,8,4,4,2,2,1,1));
919 v128_t ms_vec = frwd_fetch<0xFF>(magsgn);
920
921 // U_q holds U_q for this quad
922 // flags has e_k, e_1, and rho such that e_k is sitting in the
923 // 0x8000, e_1 in 0x800, and rho in 0x80
924
925 // next e_k and m_n
926 v128_t m_n;
927 w0 = vsx_u32x4_shr(flags, 15); // e_k
928 m_n = vsx_i32x4_sub(U_q, w0);
929 m_n = vsx_v128_andnot(m_n, insig);
930
931 // find cumulative sums
932 // to find at which bit in ms_vec the sample starts
933 v128_t ex_sum, shfl, inc_sum = m_n; // inclusive scan
934 shfl = vsx_i32x4_shuffle(vsx_i64x2_const(0,0), inc_sum, 3, 4, 5, 6);
935 inc_sum = vsx_i32x4_add(inc_sum, shfl);
936 shfl = vsx_i64x2_shuffle(vsx_i64x2_const(0,0), inc_sum, 1, 2);
937 inc_sum = vsx_i32x4_add(inc_sum, shfl);
938 int total_mn = vsx_u16x8_extract_lane(inc_sum, 6);
939 ex_sum = vsx_i32x4_shuffle(vsx_i64x2_const(0,0), inc_sum, 3, 4, 5, 6);
940
941 // find the starting byte and starting bit
942 v128_t byte_idx = vsx_u32x4_shr(ex_sum, 3);
943 v128_t bit_idx =
945 byte_idx = vsx_i8x16_swizzle(byte_idx,
946 vsx_i32x4_const(0x00000000, 0x04040404, 0x08080808, 0x0C0C0C0C));
947 byte_idx =
948 vsx_i32x4_add(byte_idx, vsx_i32x4_const(OJPH_REPEAT4(0x03020100)));
949 v128_t d0 = vsx_i8x16_swizzle(ms_vec, byte_idx);
950 byte_idx =
951 vsx_i32x4_add(byte_idx, vsx_i32x4_const(OJPH_REPEAT4(0x01010101)));
952 v128_t d1 = vsx_i8x16_swizzle(ms_vec, byte_idx);
953
954 // shift samples values to correct location
955 bit_idx = vsx_v128_or(bit_idx, vsx_i32x4_shl(bit_idx, 16));
956 v128_t bit_shift = vsx_i8x16_swizzle(
957 vsx_i8x16_const(-1, 127, 63, 31, 15, 7, 3, 1,
958 -1, 127, 63, 31, 15, 7, 3, 1), bit_idx);
959 bit_shift =
960 vsx_i16x8_add(bit_shift, vsx_i16x8_const(OJPH_REPEAT8(0x0101)));
961 d0 = vsx_i16x8_mul(d0, bit_shift);
962 d0 = vsx_u16x8_shr(d0, 8); // we should have 8 bits in the LSB
963 d1 = vsx_i16x8_mul(d1, bit_shift);
964 d1 = // 8 in MSB
965 vsx_v128_and(d1, vsx_u32x4_const(OJPH_REPEAT4(0xFF00FF00)));
966 d0 = vsx_v128_or(d0, d1);
967
968 // find location of e_k and mask
969 v128_t shift;
972 ui32 U_q_m1 = vsx_u32x4_extract_lane(U_q, 0) - 1u;
973 w0 = vsx_i32x4_sub(twos, w0);
974 shift = vsx_i32x4_shl(w0, U_q_m1);
975 ms_vec = vsx_v128_and(d0, vsx_i32x4_sub(shift, ones));
976
977 // next e_1
978 w0 = vsx_v128_and(flags, vsx_i32x4_const(OJPH_REPEAT4(0x800)));
979 w0 = vsx_i32x4_eq(w0, vsx_i64x2_const(0, 0));
980 w0 = vsx_v128_andnot(shift, w0); // e_1 in correct position
981 ms_vec = vsx_v128_or(ms_vec, w0); // e_1
982 w0 = vsx_i32x4_shl(ms_vec, 31); // sign
983 ms_vec = vsx_v128_or(ms_vec, ones); // bin center
984 v128_t tvn = ms_vec;
985 ms_vec = vsx_i32x4_add(ms_vec, twos);// + 2
986 ms_vec = vsx_i32x4_shl(ms_vec, p - 1);
987 ms_vec = vsx_v128_or(ms_vec, w0); // sign
988 row = vsx_v128_andnot(ms_vec, insig); // significant only
989
990 ms_vec = vsx_v128_andnot(tvn, insig); // significant only
991 if (N == 0) // the compiler should remove one
992 tvn = vsx_i8x16_swizzle(ms_vec,
993 vsx_i32x4_const(0x07060504, 0x0F0E0D0C, -1, -1));
994 else if (N == 1)
995 tvn = vsx_i8x16_swizzle(ms_vec,
996 vsx_i32x4_const(-1, 0x07060504, 0x0F0E0D0C, -1));
997 else
998 assert(0);
999 vn = vsx_v128_or(vn, tvn);
1000
1001 if (total_mn)
1002 frwd_advance(magsgn, (ui32)total_mn);
1003 }
1004 return row;
1005 }
1006
1007 //************************************************************************/
// Decodes the magnitude and sign of the eight samples of two quads into
// 16-bit lanes.
//   inf_u_q  per-quad information words (used in the body; declared on a
//            line that is not visible here). 16-bit lanes 0 and 2 are
//            broadcast to the first and second quad; the bits under the
//            masks 0x1110, 0x2220, 0x4440 and 0x8880 belong to samples
//            0..3 of each quad
//   U_q      per-quad U_q values, taken from the same lanes
//   dbuf     byte buffer holding the MagSgn bits, read via vsx_dfetch()
//   limit    largest byte offset vsx_dfetch() may use in dbuf
//   pos      current bit position in dbuf; advanced by the number of
//            bits used by the two quads
//   p        the decoded value is shifted left by p - 1
//   vn       accumulator that receives (by OR) values taken from
//            samples 1 and 3 of each quad
// Returns the decoded row; lanes of insignificant samples are zero.
// When all eight samples are insignificant, dbuf is not read and both
// pos and vn are left unchanged.
1017 static inline
1019 const ui8* dbuf, ui32 limit, ui32& pos,
1020 ui32 p, v128_t& vn)
1021 {
1022 v128_t w0; // workers
1023 v128_t insig; // lanes hold FF's if samples are insignificant
1024 v128_t flags; // lanes hold e_k, e_1, and rho
1025 v128_t row; // decoded row
1026
1027 row = vsx_i64x2_const(0, 0);
1028 w0 = vsx_i8x16_swizzle(inf_u_q,
1029 vsx_i16x8_const(0x0100, 0x0100, 0x0100, 0x0100,
1030 0x0504, 0x0504, 0x0504, 0x0504));
1031 // we keeps e_k, e_1, and rho in w2
1032 flags = vsx_v128_and(w0,
1033 vsx_u16x8_const(0x1110, 0x2220, 0x4440, 0x8880,
1034 0x1110, 0x2220, 0x4440, 0x8880));
1035 insig = vsx_i16x8_eq(flags, vsx_i64x2_const(0, 0));
1036 if (vsx_i8x16_bitmask(insig) != 0xFFFF) //are all insignificant?
1037 {
1038 U_q = vsx_i8x16_swizzle(U_q,
1039 vsx_i16x8_const(0x0100, 0x0100, 0x0100, 0x0100,
1040 0x0504, 0x0504, 0x0504, 0x0504));
1041 flags = vsx_i16x8_mul(flags, vsx_i16x8_const(8,4,2,1,8,4,2,1));
1042 v128_t ms_vec = vsx_dfetch(dbuf, limit, pos);
1043
1044 // U_q holds U_q for this quad
1045 // flags has e_k, e_1, and rho such that e_k is sitting in the
1046 // 0x8000, e_1 in 0x800, and rho in 0x80
1047
1048 // next e_k and m_n
1049 v128_t m_n;
1050 w0 = vsx_u16x8_shr(flags, 15); // e_k
1051 m_n = vsx_i16x8_sub(U_q, w0);
1052 m_n = vsx_v128_andnot(m_n, insig);
1053
1054 // find cumulative sums
1055 // to find at which bit in ms_vec the sample starts
1056 v128_t ex_sum, shfl, inc_sum = m_n; // inclusive scan
1058 inc_sum, 7, 8, 9, 10, 11, 12, 13, 14);
1059 inc_sum = vsx_i16x8_add(inc_sum, shfl);
1060 shfl = vsx_i32x4_shuffle(vsx_i64x2_const(0,0), inc_sum, 3, 4, 5, 6);
1061 inc_sum = vsx_i16x8_add(inc_sum, shfl);
1062 shfl = vsx_i64x2_shuffle(vsx_i64x2_const(0,0), inc_sum, 1, 2);
1063 inc_sum = vsx_i16x8_add(inc_sum, shfl);
1064 int total_mn = vsx_u16x8_extract_lane(inc_sum, 7);
1065 ex_sum = vsx_i16x8_shuffle(vsx_i64x2_const(0,0),
1066 inc_sum, 7, 8, 9, 10, 11, 12, 13, 14);
1067
1068 // find the starting byte and starting bit
1069 v128_t byte_idx = vsx_u16x8_shr(ex_sum, 3);
1070 v128_t bit_idx =
1072 byte_idx = vsx_i8x16_swizzle(byte_idx,
1073 vsx_i16x8_const(0x0000, 0x0202, 0x0404, 0x0606,
1074 0x0808, 0x0A0A, 0x0C0C, 0x0E0E));
1075 byte_idx =
1076 vsx_i16x8_add(byte_idx, vsx_i16x8_const(OJPH_REPEAT8(0x0100)));
1077 v128_t d0 = vsx_i8x16_swizzle(ms_vec, byte_idx);
1078 byte_idx =
1079 vsx_i16x8_add(byte_idx, vsx_i16x8_const(OJPH_REPEAT8(0x0101)));
1080 v128_t d1 = vsx_i8x16_swizzle(ms_vec, byte_idx);
1081
1082 // shift samples values to correct location
1083 v128_t bit_shift = vsx_i8x16_swizzle(
1084 vsx_i8x16_const(-1, 127, 63, 31, 15, 7, 3, 1,
1085 -1, 127, 63, 31, 15, 7, 3, 1), bit_idx);
1086 bit_shift =
1087 vsx_i16x8_add(bit_shift, vsx_i16x8_const(OJPH_REPEAT8(0x0101)));
1088 d0 = vsx_i16x8_mul(d0, bit_shift);
1089 d0 = vsx_u16x8_shr(d0, 8); // we should have 8 bits in the LSB
1090 d1 = vsx_i16x8_mul(d1, bit_shift);
1091 d1 = // 8 in MSB
1093 d0 = vsx_v128_or(d0, d1);
1094
1095 // find location of e_k and mask
1096 v128_t shift, t0, t1;
1099 v128_t U_q_m1 = vsx_i32x4_sub(U_q, ones);
1100 ui32 Uq0 = vsx_u16x8_extract_lane(U_q_m1, 0);
1101 ui32 Uq1 = vsx_u16x8_extract_lane(U_q_m1, 4);
1102 w0 = vsx_i16x8_sub(twos, w0);
1103 t0 = vsx_v128_and(w0, vsx_i64x2_const(-1, 0));
1104 t1 = vsx_v128_and(w0, vsx_i64x2_const(0, -1));
1105 t0 = vsx_i32x4_shl(t0, Uq0);
1106 t1 = vsx_i32x4_shl(t1, Uq1);
1107 shift = vsx_v128_or(t0, t1);
1108 ms_vec = vsx_v128_and(d0, vsx_i16x8_sub(shift, ones));
1109
1110 // next e_1
1111 w0 = vsx_v128_and(flags, vsx_i16x8_const(OJPH_REPEAT8(0x800)));
1112 w0 = vsx_i16x8_eq(w0, vsx_i64x2_const(0, 0));
1113 w0 = vsx_v128_andnot(shift, w0); // e_1 in correct position
1114 ms_vec = vsx_v128_or(ms_vec, w0); // e_1
1115 w0 = vsx_i16x8_shl(ms_vec, 15); // sign
1116 ms_vec = vsx_v128_or(ms_vec, ones); // bin center
1117 v128_t tvn = ms_vec;
1118 ms_vec = vsx_i16x8_add(ms_vec, twos);// + 2
1119 ms_vec = vsx_i16x8_shl(ms_vec, p - 1);
1120 ms_vec = vsx_v128_or(ms_vec, w0); // sign
1121 row = vsx_v128_andnot(ms_vec, insig); // significant only
1122
1123 ms_vec = vsx_v128_andnot(tvn, insig); // significant only
1124 w0 = vsx_i8x16_swizzle(ms_vec,
1125 vsx_i16x8_const(0x0302, 0x0706, -1, -1, -1, -1, -1, -1));
1126 vn = vsx_v128_or(vn, w0);
1127 w0 = vsx_i8x16_swizzle(ms_vec,
1128 vsx_i16x8_const(-1, 0x0B0A, 0x0F0E, -1, -1, -1, -1, -1));
1129 vn = vsx_v128_or(vn, w0);
1130
1131 pos += (ui32)total_mn;
1132 }
1133 return row;
1134 }
1135
1136
1137 //************************************************************************/
1154 bool ojph_decode_codeblock_vsx(ui8* coded_data, ui32* decoded_data,
1155 ui32 missing_msbs, ui32 num_passes,
1156 ui32 lengths1, ui32 lengths2,
1157 ui32 width, ui32 height, ui32 stride,
1158 bool stripe_causal)
1159 {
1160 static bool insufficient_precision = false;
1161 static bool modify_code = false;
1162 static bool truncate_spp_mrp = false;
1163
1164 if (num_passes > 1 && lengths2 == 0)
1165 {
1166 OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
1167 "one coding pass, but zero length for "
1168 "2nd and potential 3rd pass.\n");
1169 num_passes = 1;
1170 }
1171
1172 if (num_passes > 3)
1173 {
1174 OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
1175 "This codeblocks has %d passes.\n",
1176 num_passes);
1177 return false;
1178 }
1179
1180 if (missing_msbs > 30) // p < 0
1181 {
1182 if (insufficient_precision == false)
1183 {
1184 insufficient_precision = true;
1185 OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
1186 "codeblock. This message will not be "
1187 "displayed again.\n");
1188 }
1189 return false;
1190 }
1191 else if (missing_msbs == 30) // p == 0
1192 { // not enough precision to decode and set the bin center to 1
1193 if (modify_code == false) {
1194 modify_code = true;
1195 OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
1196 "pass. The code can be modified to support "
1197 "this case. This message will not be "
1198 "displayed again.\n");
1199 }
1200 return false; // 32 bits are not enough to decode this
1201 }
1202 else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
1203 {
1204 if (num_passes > 1) {
1205 num_passes = 1;
1206 if (truncate_spp_mrp == false) {
1207 truncate_spp_mrp = true;
1208 OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
1209 "nor MagRef passes; both will be skipped. "
1210 "This message will not be displayed "
1211 "again.\n");
1212 }
1213 }
1214 }
1215 ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP
1216 // There is a way to handle the case of p == 0, but a different path
1217 // is required
1218
1219 if (lengths1 < 2)
1220 {
1221 OJPH_WARN(0x00010006, "Wrong codeblock length.\n");
1222 return false;
1223 }
1224
1225 // read scup and fix the bytes there
1226 int lcup, scup;
1227 lcup = (int)lengths1; // length of CUP
1228 //scup is the length of MEL + VLC
1229 scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
1230 if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
1231 return false;
1232
1233 // The temporary storage scratch holds two types of data in an
1234 // interleaved fashion. The interleaving allows us to use one
1235 // memory pointer.
1236 // We have one entry for a decoded VLC code, and one entry for UVLC.
1237 // Entries are 16 bits each, corresponding to one quad,
1238 // but since we want to use XMM registers of the SSE family
1239 // of SIMD; we allocated 16 bytes or more per quad row; that is,
1240 // the width is no smaller than 16 bytes (or 8 entries), and the
1241 // height is 512 quads
1242 // Each VLC entry contains, in the following order, starting
1243 // from MSB
1244 // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
1245 // Each entry in UVLC contains u_q
1246 // One extra row to handle the case of SPP propagating downwards
1247 // when codeblock width is 4
1248 ui16 scratch[8 * 513] = {0}; // 8+ kB
1249
1250 // We need an extra two entries (one inf and one u_q) beyond
1251 // the last column.
1252 // If the block width is 4 (2 quads), then we use sstr of 8
1253 // (enough for 4 quads). If width is 8 (4 quads) we use
1254 // an sstr of 16 (enough for 8 quads). For a width of 16 (8
1255 // quads), we use 24 (enough for 12 quads).
1256 ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
1257
1258 assert((stride & 0x3) == 0);
1259
1260 ui32 mmsbp2 = missing_msbs + 2;
1261
1262 // The cleanup pass is decoded in two steps; in step one,
1263 // the VLC and MEL segments are decoded, generating a record that
1264 // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k.
1265 // This information should be sufficient for the next step.
1266 // In step 2, we decode the MagSgn segment.
1267
1268 // step 1 decoding VLC and MEL segments
1269 {
1270 // init structures
1271 dec_mel_st mel;
1272 mel_init(&mel, coded_data, lcup, scup);
1273 rev_struct vlc;
1274 rev_init(&vlc, coded_data, lcup, scup);
1275
1276 int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm
1277 // data represented as runs of 0 events
1278 // See mel_decode description
1279
1280 ui32 vlc_val;
1281 ui32 c_q = 0;
1282 ui16 *sp = scratch;
1283 //initial quad row
1284 for (ui32 x = 0; x < width; sp += 4)
1285 {
1286 // decode VLC
1288
1289 // first quad
1290 vlc_val = rev_fetch(&vlc);
1291
1292 //decode VLC using the context c_q and the head of VLC bitstream
1293 ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
1294
1295 // if context is zero, use one MEL event
1296 if (c_q == 0) //zero context
1297 {
1298 run -= 2; //subtract 2, since events number if multiplied by 2
1299
1300 // Is the run terminated in 1? if so, use decoded VLC code,
1301 // otherwise, discard decoded data, since we will decode again
1302 // using a different context
1303 t0 = (run == -1) ? t0 : 0;
1304
1305 // is run -1 or -2? this means a run has been consumed
1306 if (run < 0)
1307 run = mel_get_run(&mel); // get another run
1308 }
1309 //run -= (c_q == 0) ? 2 : 0;
1310 //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1311 //if (run < 0)
1312 // run = mel_get_run(&mel); // get another run
1313 sp[0] = t0;
1314 x += 2;
1315
1316 // prepare context for the next quad; eqn. 1 in ITU T.814
1317 c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
1318
1319 //remove data from vlc stream (0 bits are removed if vlc is not used)
1320 vlc_val = rev_advance(&vlc, t0 & 0x7);
1321
1322 //second quad
1323 ui16 t1 = 0;
1324
1325 //decode VLC using the context c_q and the head of VLC bitstream
1326 t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
1327
1328 // if context is zero, use one MEL event
1329 if (c_q == 0 && x < width) //zero context
1330 {
1331 run -= 2; //subtract 2, since events number if multiplied by 2
1332
1333 // if event is 0, discard decoded t1
1334 t1 = (run == -1) ? t1 : 0;
1335
1336 if (run < 0) // have we consumed all events in a run
1337 run = mel_get_run(&mel); // if yes, then get another run
1338 }
1339 t1 = x < width ? t1 : 0;
1340 //run -= (c_q == 0 && x < width) ? 2 : 0;
1341 //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1342 //if (run < 0)
1343 // run = mel_get_run(&mel); // get another run
1344 sp[2] = t1;
1345 x += 2;
1346
1347 //prepare context for the next quad, eqn. 1 in ITU T.814
1348 c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
1349
1350 //remove data from vlc stream, if qinf is not used, cwdlen is 0
1351 vlc_val = rev_advance(&vlc, t1 & 0x7);
1352
1353 // decode u
1355 // uvlc_mode is made up of u_offset bits from the quad pair
1356 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1357 if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
1358 { // the MEL run of events
1359 run -= 2; //subtract 2, since events number if multiplied by 2
1360
1361 uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by
1362 // 0x40 if the MEL event is 1
1363
1364 if (run < 0)//if run is consumed (run is -1 or -2), get another run
1365 run = mel_get_run(&mel);
1366 }
1367 //run -= (uvlc_mode == 0xc0) ? 2 : 0;
1368 //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
1369 //if (run < 0)
1370 // run = mel_get_run(&mel); // get another run
1371
1372 //decode uvlc_mode to get u for both quads
1373 ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)];
1374 //remove total prefix length
1375 vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
1376 uvlc_entry >>= 3;
1377 //extract suffixes for quad 0 and 1
1378 ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1379 ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
1380 vlc_val = rev_advance(&vlc, len);
1381 uvlc_entry >>= 4;
1382 // quad 0 length
1383 len = uvlc_entry & 0x7; // quad 0 suffix length
1384 uvlc_entry >>= 3;
1385 ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len))); //kap. 1
1386 sp[1] = u_q;
1387 u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len)); //kappa == 1
1388 sp[3] = u_q;
1389 }
1390 sp[0] = sp[1] = 0;
1391
1392 //non initial quad rows
1393 for (ui32 y = 2; y < height; y += 2)
1394 {
1395 c_q = 0; // context
1396 ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads
1397
1398 for (ui32 x = 0; x < width; sp += 4)
1399 {
1400 // decode VLC
1402
1403 // sigma_q (n, ne, nf)
1404 c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
1405 c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
1406
1407 // first quad
1408 vlc_val = rev_fetch(&vlc);
1409
1410 //decode VLC using the context c_q and the head of VLC bitstream
1411 ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
1412
1413 // if context is zero, use one MEL event
1414 if (c_q == 0) //zero context
1415 {
1416 run -= 2; //subtract 2, since events number is multiplied by 2
1417
1418 // Is the run terminated in 1? if so, use decoded VLC code,
1419 // otherwise, discard decoded data, since we will decode again
1420 // using a different context
1421 t0 = (run == -1) ? t0 : 0;
1422
1423 // is run -1 or -2? this means a run has been consumed
1424 if (run < 0)
1425 run = mel_get_run(&mel); // get another run
1426 }
1427 //run -= (c_q == 0) ? 2 : 0;
1428 //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1429 //if (run < 0)
1430 // run = mel_get_run(&mel); // get another run
1431 sp[0] = t0;
1432 x += 2;
1433
1434 // prepare context for the next quad; eqn. 2 in ITU T.814
1435 // sigma_q (w, sw)
1436 c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
1437 // sigma_q (nw)
1438 c_q |= sp[0 - (si32)sstr] & 0x80;
1439 // sigma_q (n, ne, nf)
1440 c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
1441 c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
1442
1443 //remove data from vlc stream (0 bits are removed if vlc is unused)
1444 vlc_val = rev_advance(&vlc, t0 & 0x7);
1445
1446 //second quad
1447 ui16 t1 = 0;
1448
1449 //decode VLC using the context c_q and the head of VLC bitstream
1450 t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)];
1451
1452 // if context is zero, use one MEL event
1453 if (c_q == 0 && x < width) //zero context
1454 {
1455 run -= 2; //subtract 2, since events number if multiplied by 2
1456
1457 // if event is 0, discard decoded t1
1458 t1 = (run == -1) ? t1 : 0;
1459
1460 if (run < 0) // have we consumed all events in a run
1461 run = mel_get_run(&mel); // if yes, then get another run
1462 }
1463 t1 = x < width ? t1 : 0;
1464 //run -= (c_q == 0 && x < width) ? 2 : 0;
1465 //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1466 //if (run < 0)
1467 // run = mel_get_run(&mel); // get another run
1468 sp[2] = t1;
1469 x += 2;
1470
1471 // partial c_q, will be completed when we process the next quad
1472 // sigma_q (w, sw)
1473 c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
1474 // sigma_q (nw)
1475 c_q |= sp[2 - (si32)sstr] & 0x80;
1476
1477 //remove data from vlc stream, if qinf is not used, cwdlen is 0
1478 vlc_val = rev_advance(&vlc, t1 & 0x7);
1479
1480 // decode u
1482 // uvlc_mode is made up of u_offset bits from the quad pair
1483 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1484 ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
1485 //remove total prefix length
1486 vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
1487 uvlc_entry >>= 3;
1488 //extract suffixes for quad 0 and 1
1489 ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1490 ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
1491 vlc_val = rev_advance(&vlc, len);
1492 uvlc_entry >>= 4;
1493 // quad 0 length
1494 len = uvlc_entry & 0x7; // quad 0 suffix length
1495 uvlc_entry >>= 3;
1496 ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q
1497 sp[1] = u_q;
1498 u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
1499 sp[3] = u_q;
1500 }
1501 sp[0] = sp[1] = 0;
1502 }
1503 }
1504
1505 // step2 we decode magsgn
1506 // mmsbp2 equals K_max + 1 (we decode up to K_max bits + 1 sign bit)
1507 // The 32 bit path decodes 16 bit data, for which one would think
1508 // 16 bits are enough, because we want to put the value in the
1509 // center of the bin.
1510 // If mmsbp2 equals 16, the coding is reversible, and
1511 // no bitplanes are missing, then we can decode using the 16 bit
1512 // path, but we are not doing this here.
1513 if (mmsbp2 >= 16)
1514 {
1515 // We allocate a scratch row for storing v_n values.
1516 // We have 512 quads horizontally.
1517 // We may go beyond the last entry by up to 4 entries.
1518 // Here we allocate additional 8 entries.
1519 // There are two rows in this structure, the bottom
1520 // row is used to store processed entries.
1521 const int v_n_size = 512 + 8;
1522 ui32 v_n_scratch[2 * v_n_size] = {0}; // 4+ kB
1523
1524 frwd_struct magsgn;
1525 frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
1526
1527 {
1528 ui16 *sp = scratch;
1529 ui32 *vp = v_n_scratch;
1530 ui32 *dp = decoded_data;
1531 vp[0] = 2; // for easy calculation of emax
1532
1533 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1534 {
1535 //here we process two quads
1536 v128_t w0, w1; // workers
1537 v128_t inf_u_q, U_q;
1538 // determine U_q
1539 {
1540 inf_u_q = vsx_v128_load(sp);
1541 U_q = vsx_u32x4_shr(inf_u_q, 16);
1542
1543 w0 = vsx_i32x4_gt(U_q, vsx_u32x4_splat(mmsbp2));
1544 ui32 i = vsx_i8x16_bitmask(w0);
1545 if (i & 0xFF) // only the lower two U_q
1546 return false;
1547 }
1548
1550 v128_t row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
1551 v128_t row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
1552 w0 = vsx_v128_load(vp);
1553 w0 = vsx_v128_and(w0, vsx_i32x4_const(-1,0,0,0));
1554 w0 = vsx_v128_or(w0, vn);
1555 vsx_v128_store(vp, w0);
1556
1557 //interleave in ssse3 style
1558
1559 w0 = vsx_i32x4_shuffle(row0, row1, 0, 4, 1, 5);
1560 w1 = vsx_i32x4_shuffle(row0, row1, 2, 6, 3, 7);
1561 row0 = vsx_i32x4_shuffle(w0, w1, 0, 4, 1, 5);
1562 row1 = vsx_i32x4_shuffle(w0, w1, 2, 6, 3, 7);
1563 vsx_v128_store(dp, row0);
1564 vsx_v128_store(dp + stride, row1);
1565 }
1566 }
1567
1568 for (ui32 y = 2; y < height; y += 2)
1569 {
1570 {
1571 // perform 31 - count_leading_zeros(*vp) here
1572 ui32 *vp = v_n_scratch;
1573 const v128_t lut_lo = vsx_i8x16_const(
1574 31, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
1575 );
1576 const v128_t lut_hi = vsx_i8x16_const(
1577 31, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
1578 );
1579 const v128_t nibble_mask = vsx_i8x16_const(OJPH_REPEAT16(0x0F));
1580 const v128_t byte_offset8 = vsx_i16x8_const(OJPH_REPEAT8(8));
1581 const v128_t byte_offset16 = vsx_i16x8_const(OJPH_REPEAT8(16));
1582 const v128_t cc = vsx_i32x4_const(OJPH_REPEAT4(31));
1583 for (ui32 x = 0; x <= width; x += 8, vp += 4)
1584 {
1585 v128_t v, t; // workers
1586 v = vsx_v128_load(vp);
1587
1588 t = vsx_v128_and(nibble_mask, v);
1589 v = vsx_v128_and(vsx_u16x8_shr(v, 4), nibble_mask);
1590 t = vsx_i8x16_swizzle(lut_lo, t);
1591 v = vsx_i8x16_swizzle(lut_hi, v);
1592 v = vsx_u8x16_min(v, t);
1593
1594 t = vsx_u16x8_shr(v, 8);
1595 v = vsx_v128_or(v, byte_offset8);
1596 v = vsx_u8x16_min(v, t);
1597
1598 t = vsx_u32x4_shr(v, 16);
1599 v = vsx_v128_or(v, byte_offset16);
1600 v = vsx_u8x16_min(v, t);
1601
1602 v = vsx_i16x8_sub(cc, v);
1603 vsx_v128_store(vp + v_n_size, v);
1604 }
1605 }
1606
1607 ui32 *vp = v_n_scratch;
1608 ui16 *sp = scratch + (y >> 1) * sstr;
1609 ui32 *dp = decoded_data + y * stride;
1610 vp[0] = 2; // for easy calculation of emax
1611
1612 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1613 {
1614 //process two quads
1615 v128_t w0, w1; // workers
1616 v128_t inf_u_q, U_q;
1617 // determine U_q
1618 {
1619 v128_t gamma, emax, kappa, u_q; // needed locally
1620
1621 inf_u_q = vsx_v128_load(sp);
1622 gamma =
1623 vsx_v128_and(inf_u_q, vsx_i32x4_const(OJPH_REPEAT4(0xF0)));
1625 gamma = vsx_v128_and(gamma, w0);
1626 gamma = vsx_i32x4_eq(gamma, vsx_i64x2_const(0, 0));
1627
1628 emax = vsx_v128_load(vp + v_n_size);
1629 w0 = vsx_i32x4_shuffle(emax, vsx_i64x2_const(0,0), 1, 2, 3, 4);
1630 emax = vsx_i16x8_max(w0, emax); // no max_epi32 in ssse3
1631 emax = vsx_v128_andnot(emax, gamma);
1632
1633 kappa = vsx_i32x4_const(OJPH_REPEAT4(1));
1634 kappa = vsx_i16x8_max(emax, kappa); // no max_epi32 in ssse3
1635
1636 u_q = vsx_u32x4_shr(inf_u_q, 16);
1637 U_q = vsx_i32x4_add(u_q, kappa);
1638
1639 w0 = vsx_i32x4_gt(U_q, vsx_u32x4_splat(mmsbp2));
1640 ui32 i = vsx_i8x16_bitmask(w0);
1641 if (i & 0xFF) // only the lower two U_q
1642 return false;
1643 }
1644
1646 v128_t row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
1647 v128_t row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
1648 w0 = vsx_v128_load(vp);
1649 w0 = vsx_v128_and(w0, vsx_i32x4_const(-1,0,0,0));
1650 w0 = vsx_v128_or(w0, vn);
1651 vsx_v128_store(vp, w0);
1652
1653 //interleave in ssse3 style
1654 w0 = vsx_i32x4_shuffle(row0, row1, 0, 4, 1, 5);
1655 w1 = vsx_i32x4_shuffle(row0, row1, 2, 6, 3, 7);
1656 row0 = vsx_i32x4_shuffle(w0, w1, 0, 4, 1, 5);
1657 row1 = vsx_i32x4_shuffle(w0, w1, 2, 6, 3, 7);
1658 vsx_v128_store(dp, row0);
1659 vsx_v128_store(dp + stride, row1);
1660 }
1661 }
1662 }
1663 else
1664 {
1665 // reduce bitplane by 16 because we now have 16 bits instead of 32
1666 p -= 16;
1667
1668 // We allocate a scratch row for storing v_n values.
1669 // We have 512 quads horizontally.
1670 // We may go beyond the last entry by up to 8 entries.
1671 // Therefore we allocate additional 8 entries.
1672 // There are two rows in this structure, the bottom
1673 // row is used to store processed entries.
1674 const int v_n_size = 512 + 8;
1675 ui16 v_n_scratch[2 * v_n_size] = {0}; // 2+ kB
1676
1677 // destuff the MagSgn bitstream upfront; per-quad consumption then
1678 // advances a bit position in a GPR (see destuff_frwd)
1679 const ui32 dbuf_cap = 4096 * 15 / 8;
1680 ui8 dbuf[dbuf_cap + 72];
1681 ui32 limit = destuff_frwd<0xFF>(coded_data, lcup - scup,
1682 dbuf, dbuf_cap);
1683 ui32 pos = 0;
1684
1685 {
1686 ui16 *sp = scratch;
1687 ui16 *vp = v_n_scratch;
1688 ui32 *dp = decoded_data;
1689 vp[0] = 2; // for easy calculation of emax
1690
1691 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1692 {
1693 //here we process two quads
1694 v128_t w0, w1; // workers
1695 v128_t inf_u_q, U_q;
1696 // determine U_q
1697 {
1698 inf_u_q = vsx_v128_load(sp);
1699 U_q = vsx_u32x4_shr(inf_u_q, 16);
1700
1701 w0 = vsx_i32x4_gt(U_q, vsx_u32x4_splat(mmsbp2));
1702 ui32 i = vsx_i8x16_bitmask(w0);
1703 if (i & 0xFF) // only the lower two U_q
1704 return false;
1705 }
1706
1708 v128_t row = decode_two_quad16(inf_u_q, U_q, dbuf, limit, pos, p, vn);
1709 w0 = vsx_v128_load(vp);
1710 w0 = vsx_v128_and(w0, vsx_i16x8_const(-1,0,0,0,0,0,0,0));
1711 w0 = vsx_v128_or(w0, vn);
1712 vsx_v128_store(vp, w0);
1713
1714 //interleave in ssse3 style
1715 w0 = vsx_i8x16_swizzle(row,
1716 vsx_i16x8_const(-1, 0x0100, -1, 0x0504,
1717 -1, 0x0908, -1, 0x0D0C));
1718 vsx_v128_store(dp, w0);
1719 w1 = vsx_i8x16_swizzle(row,
1720 vsx_i16x8_const(-1, 0x0302, -1, 0x0706,
1721 -1, 0x0B0A, -1, 0x0F0E));
1722 vsx_v128_store(dp + stride, w1);
1723 }
1724 }
1725
1726 for (ui32 y = 2; y < height; y += 2)
1727 {
1728 {
1729 // perform 15 - count_leading_zeros(*vp) here
1730 ui16 *vp = v_n_scratch;
1731 const v128_t lut_lo = vsx_i8x16_const(
1732 15, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
1733 );
1734 const v128_t lut_hi = vsx_i8x16_const(
1735 15, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
1736 );
1737 const v128_t nibble_mask = vsx_i8x16_const(OJPH_REPEAT16(0x0F));
1738 const v128_t byte_offset8 = vsx_i16x8_const(OJPH_REPEAT8(8));
1739 const v128_t cc = vsx_i16x8_const(OJPH_REPEAT8(15));
1740 for (ui32 x = 0; x <= width; x += 16, vp += 8)
1741 {
1742 v128_t v, t; // workers
1743 v = vsx_v128_load(vp);
1744
1745 t = vsx_v128_and(nibble_mask, v);
1746 v = vsx_v128_and(vsx_u16x8_shr(v, 4), nibble_mask);
1747 t = vsx_i8x16_swizzle(lut_lo, t);
1748 v = vsx_i8x16_swizzle(lut_hi, v);
1749 v = vsx_u8x16_min(v, t);
1750
1751 t = vsx_u16x8_shr(v, 8);
1752 v = vsx_v128_or(v, byte_offset8);
1753 v = vsx_u8x16_min(v, t);
1754
1755 v = vsx_i16x8_sub(cc, v);
1756 vsx_v128_store(vp + v_n_size, v);
1757 }
1758 }
1759
1760 ui16 *vp = v_n_scratch;
1761 ui16 *sp = scratch + (y >> 1) * sstr;
1762 ui32 *dp = decoded_data + y * stride;
1763 vp[0] = 2; // for easy calculation of emax
1764
1765 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1766 {
1767 //process two quads
1768 v128_t w0, w1; // workers
1769 v128_t inf_u_q, U_q;
1770 // determine U_q
1771 {
1772 v128_t gamma, emax, kappa, u_q; // needed locally
1773
1774 inf_u_q = vsx_v128_load(sp);
1775 gamma =
1776 vsx_v128_and(inf_u_q, vsx_i32x4_const(OJPH_REPEAT4(0xF0)));
1778 gamma = vsx_v128_and(gamma, w0);
1779 gamma = vsx_i32x4_eq(gamma, vsx_i64x2_const(0, 0));
1780
1781 emax = vsx_v128_load(vp + v_n_size);
1782 w0 = vsx_i16x8_shuffle(emax,
1783 vsx_i64x2_const(0, 0), 1, 2, 3, 4, 5, 6, 7, 8);
1784 emax = vsx_i16x8_max(w0, emax); // no max_epi32 in ssse3
1785 emax = vsx_i8x16_swizzle(emax,
1786 vsx_i16x8_const(0x0100, -1, 0x0302, -1,
1787 0x0504, -1, 0x0706, -1));
1788 emax = vsx_v128_andnot(emax, gamma);
1789
1790 kappa = vsx_i32x4_const(OJPH_REPEAT4(1));
1791 kappa = vsx_i16x8_max(emax, kappa); // no max_epi32 in ssse3
1792
1793 u_q = vsx_u32x4_shr(inf_u_q, 16);
1794 U_q = vsx_i32x4_add(u_q, kappa);
1795
1796 w0 = vsx_i32x4_gt(U_q, vsx_u32x4_splat(mmsbp2));
1797 ui32 i = vsx_i8x16_bitmask(w0);
1798 if (i & 0xFF) // only the lower two U_q
1799 return false;
1800 }
1801
1803 v128_t row = decode_two_quad16(inf_u_q, U_q, dbuf, limit, pos, p, vn);
1804 w0 = vsx_v128_load(vp);
1805 w0 = vsx_v128_and(w0, vsx_i16x8_const(-1,0,0,0,0,0,0,0));
1806 w0 = vsx_v128_or(w0, vn);
1807 vsx_v128_store(vp, w0);
1808
1809 w0 = vsx_i8x16_swizzle(row,
1810 vsx_i16x8_const(-1, 0x0100, -1, 0x0504,
1811 -1, 0x0908, -1, 0x0D0C));
1812 vsx_v128_store(dp, w0);
1813 w1 = vsx_i8x16_swizzle(row,
1814 vsx_i16x8_const(-1, 0x0302, -1, 0x0706,
1815 -1, 0x0B0A, -1, 0x0F0E));
1816 vsx_v128_store(dp + stride, w1);
1817 }
1818 }
1819
1820 // increase bitplane back by 16 because we need to process 32 bits
1821 p += 16;
1822 }
1823
1824 if (num_passes > 1)
1825 {
1826 // We use scratch again, we can divide it into multiple regions
1827 // sigma holds all the significant samples, and it cannot
1828 // be modified after it is set. it will be used during the
1829 // Magnitude Refinement Pass
1830 ui16* const sigma = scratch;
1831
1832 ui32 mstr = (width + 3u) >> 2; // divide by 4, since each
1833 // ui16 contains 4 columns
1834 mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
1835
1836 // We re-arrange quad significance, where each 4 consecutive
1837 // bits represent one quad, into column significance, where,
1838 // each 4 consecutive bits represent one column of 4 rows
1839 {
1840 ui32 y;
1841
1842 const v128_t mask_3 = vsx_i32x4_const(OJPH_REPEAT4(0x30));
1843 const v128_t mask_C = vsx_i32x4_const(OJPH_REPEAT4(0xC0));
1844 const v128_t shuffle_mask = vsx_i32x4_const(0x0C080400,-1,-1,-1);
1845 for (y = 0; y < height; y += 4)
1846 {
1847 ui16* sp = scratch + (y >> 1) * sstr;
1848 ui16* dp = sigma + (y >> 2) * mstr;
1849 for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
1850 {
1851 v128_t s0, s1, u3, uC, t0, t1;
1852
1853 s0 = vsx_v128_load(sp);
1854 u3 = vsx_v128_and(s0, mask_3);
1855 u3 = vsx_u32x4_shr(u3, 4);
1856 uC = vsx_v128_and(s0, mask_C);
1857 uC = vsx_u32x4_shr(uC, 2);
1858 t0 = vsx_v128_or(u3, uC);
1859
1860 s1 = vsx_v128_load(sp + sstr);
1861 u3 = vsx_v128_and(s1, mask_3);
1862 u3 = vsx_u32x4_shr(u3, 2);
1863 uC = vsx_v128_and(s1, mask_C);
1864 t1 = vsx_v128_or(u3, uC);
1865
1866 v128_t r = vsx_v128_or(t0, t1);
1867 r = vsx_i8x16_swizzle(r, shuffle_mask);
1868
1869 vsx_v128_store32_lane(dp, r, 0);
1870 }
1871 dp[0] = 0; // set an extra entry on the right with 0
1872 }
1873 {
1874 // reset one row after the codeblock
1875 ui16* dp = sigma + (y >> 2) * mstr;
1876 v128_t zero = vsx_i64x2_const(0, 0);
1877 for (ui32 x = 0; x < width; x += 32, dp += 8)
1878 vsx_v128_store(dp, zero);
1879 dp[0] = 0; // set an extra entry on the right with 0
1880 }
1881 }
1882
1883 // We perform Significance Propagation Pass here
1884 {
1885 // This stores significance information of the previous
1886 // 4 rows. Significance information in this array includes
1887 // all significant samples in bitplane p - 1; that is,
1888 // significant samples for bitplane p (discovered during the
1889 // cleanup pass and stored in sigma) and samples that have recently
1890 // become significant (during the SPP) in bitplane p-1.
1891 // We store enough for the widest row, containing 1024 columns,
1892 // which is equivalent to 256 of ui16, since each stores 4 columns.
1893 // We add an extra 8 entries, just in case we need more
1894 ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
1895
1896 frwd_struct sigprop;
1897 frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
1898
1899 for (ui32 y = 0; y < height; y += 4)
1900 {
1901 ui32 pattern = 0xFFFFu; // a pattern needed samples
1902 if (height - y < 4) {
1903 pattern = 0x7777u;
1904 if (height - y < 3) {
1905 pattern = 0x3333u;
1906 if (height - y < 2)
1907 pattern = 0x1111u;
1908 }
1909 }
1910
1911 // prev holds sign. info. for the previous quad, together
1912 // with the rows on top of it and below it.
1913 ui32 prev = 0;
1914 ui16 *prev_sig = prev_row_sig;
1915 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1916 ui32 *dpp = decoded_data + y * stride;
1917 for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
1918 {
1919 // only rows and columns inside the stripe are included
1920 si32 s = (si32)x + 4 - (si32)width;
1921 s = ojph_max(s, 0);
1922 pattern = pattern >> (s * 4);
1923
1924 // We first find locations that need to be tested (potential
1925 // SPP members); these location will end up in mbr
1926 // In each iteration, we produce 16 bits because cwd can have
1927 // up to 16 bits of significance information, followed by the
1928 // corresponding 16 bits of sign information; therefore, it is
1929 // sufficient to fetch 32 bit data per loop.
1930
1931 // Although we are interested in 16 bits only, we load 32 bits.
1932 // For the 16 bits we are producing, we need the next 4 bits --
1933 // We need data for at least 5 columns out of 8.
1934 // Therefore loading 32 bits is easier than loading 16 bits
1935 // twice.
1936 ui32 ps; memcpy(&ps, prev_sig, sizeof(ps));
1937 ui32 ns; memcpy(&ns, cur_sig + mstr, sizeof(ns));
1938 ui32 u = (ps & 0x88888888) >> 3; // the row on top
1939 if (!stripe_causal)
1940 u |= (ns & 0x11111111) << 3; // the row below
1941
1942 ui32 cs; memcpy(&cs, cur_sig, sizeof(cs));
1943 // vertical integration
1944 ui32 mbr = cs; // this sig. info.
1945 mbr |= (cs & 0x77777777) << 1; //above neighbors
1946 mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors
1947 mbr |= u;
1948 // horizontal integration
1949 ui32 t = mbr;
1950 mbr |= t << 4; // neighbors on the left
1951 mbr |= t >> 4; // neighbors on the right
1952 mbr |= prev >> 12; // significance of previous group
1953
1954 // remove outside samples, and already significant samples
1955 mbr &= pattern;
1956 mbr &= ~cs;
1957
1958 // find samples that become significant during the SPP
1959 ui32 new_sig = mbr;
1960 if (new_sig)
1961 {
1962 v128_t cwd_vec = frwd_fetch<0>(&sigprop);
1963 ui32 cwd = vsx_u32x4_extract_lane(cwd_vec, 0);
1964
1965 ui32 cnt = 0;
1966 ui32 col_mask = 0xFu;
1967 ui32 inv_sig = ~cs & pattern;
1968 for (int i = 0; i < 16; i += 4, col_mask <<= 4)
1969 {
1970 if ((col_mask & new_sig) == 0)
1971 continue;
1972
1973 //scan one column
1974 ui32 sample_mask = 0x1111u & col_mask;
1975 if (new_sig & sample_mask)
1976 {
1977 new_sig &= ~sample_mask;
1978 if (cwd & 1)
1979 {
1980 ui32 t = 0x33u << i;
1981 new_sig |= t & inv_sig;
1982 }
1983 cwd >>= 1; ++cnt;
1984 }
1985
1986 sample_mask <<= 1;
1987 if (new_sig & sample_mask)
1988 {
1989 new_sig &= ~sample_mask;
1990 if (cwd & 1)
1991 {
1992 ui32 t = 0x76u << i;
1993 new_sig |= t & inv_sig;
1994 }
1995 cwd >>= 1; ++cnt;
1996 }
1997
1998 sample_mask <<= 1;
1999 if (new_sig & sample_mask)
2000 {
2001 new_sig &= ~sample_mask;
2002 if (cwd & 1)
2003 {
2004 ui32 t = 0xECu << i;
2005 new_sig |= t & inv_sig;
2006 }
2007 cwd >>= 1; ++cnt;
2008 }
2009
2010 sample_mask <<= 1;
2011 if (new_sig & sample_mask)
2012 {
2013 new_sig &= ~sample_mask;
2014 if (cwd & 1)
2015 {
2016 ui32 t = 0xC8u << i;
2017 new_sig |= t & inv_sig;
2018 }
2019 cwd >>= 1; ++cnt;
2020 }
2021 }
2022
2023 if (new_sig)
2024 {
2025 // Spread new_sig, such that each bit is in one byte with a
2026 // value of 0 if new_sig bit is 0, and 0xFF if new_sig is 1
2027 v128_t new_sig_vec = vsx_i16x8_splat((si16)new_sig);
2028 new_sig_vec = vsx_i8x16_swizzle(new_sig_vec,
2029 vsx_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1));
2030 new_sig_vec = vsx_v128_and(new_sig_vec,
2031 vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
2032 new_sig_vec = vsx_i8x16_eq(new_sig_vec,
2033 vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
2034
2035 // find cumulative sums
2036 // to find which bit in cwd we should extract
2037 v128_t ex_sum, shfl, inc_sum = new_sig_vec; // inclusive scan
2038 inc_sum = vsx_i8x16_abs(inc_sum); // cvrt to 0 or 1
2039 shfl = vsx_i8x16_shuffle(vsx_i64x2_const(0,0), inc_sum,
2040 15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
2041 inc_sum = vsx_i8x16_add(inc_sum, shfl);
2042 shfl = vsx_i16x8_shuffle(vsx_i64x2_const(0,0), inc_sum,
2043 7, 8, 9, 10, 11, 12, 13, 14);
2044 inc_sum = vsx_i8x16_add(inc_sum, shfl);
2045 shfl = vsx_i32x4_shuffle(vsx_i64x2_const(0,0), inc_sum,
2046 3, 4, 5, 6);
2047 inc_sum = vsx_i8x16_add(inc_sum, shfl);
2048 shfl = vsx_i64x2_shuffle(vsx_i64x2_const(0,0), inc_sum,
2049 1, 2);
2050 inc_sum = vsx_i8x16_add(inc_sum, shfl);
2051 cnt += vsx_u8x16_extract_lane(inc_sum, 15);
2052 // exclusive scan
2053 ex_sum = vsx_i8x16_shuffle(vsx_i64x2_const(0,0), inc_sum,
2054 15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
2055
2056 // Spread cwd, such that each bit is in one byte
2057 // with a value of 0 or 1.
2058 cwd_vec = vsx_i16x8_splat((si16)cwd);
2059 cwd_vec = vsx_i8x16_swizzle(cwd_vec,
2060 vsx_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1));
2061 cwd_vec = vsx_v128_and(cwd_vec,
2062 vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
2063 cwd_vec = vsx_i8x16_eq(cwd_vec,
2064 vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
2065 cwd_vec = vsx_i8x16_abs(cwd_vec);
2066
2067 // Obtain bit from cwd_vec corresponding to ex_sum
2068 // Basically, collect needed bits from cwd_vec
2069 v128_t v = vsx_i8x16_swizzle(cwd_vec, ex_sum);
2070
2071 // load data and set spp coefficients
2073 0,-1,-1,-1,4,-1,-1,-1,8,-1,-1,-1,12,-1,-1,-1);
2074 v128_t val = vsx_i32x4_splat(3 << (p - 2));
2075 ui32 *dp = dpp;
2076 for (int c = 0; c < 4; ++ c) {
2077 v128_t s0, s0_ns, s0_val;
2078 // load coefficients
2079 s0 = vsx_v128_load(dp);
2080
2081 // epi32 is -1 only for coefficient that
2082 // are changed during the SPP
2083 s0_ns = vsx_i8x16_swizzle(new_sig_vec, m);
2084 s0_ns = vsx_i32x4_eq(s0_ns,
2086
2087 // obtain sign for coefficients in SPP
2088 s0_val = vsx_i8x16_swizzle(v, m);
2089 s0_val = vsx_i32x4_shl(s0_val, 31);
2090 s0_val = vsx_v128_or(s0_val, val);
2091 s0_val = vsx_v128_and(s0_val, s0_ns);
2092
2093 // update vector
2094 s0 = vsx_v128_or(s0, s0_val);
2095 // store coefficients
2096 vsx_v128_store(dp, s0);
2097 // prepare for next row
2098 dp += stride;
2100 }
2101 }
2102 frwd_advance(&sigprop, cnt);
2103 }
2104
2105 new_sig |= cs;
2106 *prev_sig = (ui16)(new_sig);
2107
2108 // vertical integration for the new sig. info.
2109 t = new_sig;
2110 new_sig |= (t & 0x7777) << 1; //above neighbors
2111 new_sig |= (t & 0xEEEE) >> 1; //below neighbors
2112 // add sig. info. from the row on top and below
2113 prev = new_sig | u;
2114 // we need only the bits in 0xF000
2115 prev &= 0xF000;
2116 }
2117 }
2118 }
2119
2120 // We perform Magnitude Refinement Pass here
2121 if (num_passes > 2)
2122 {
2123 rev_struct magref;
2124 rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
2125
2126 for (ui32 y = 0; y < height; y += 4)
2127 {
2128 ui16 *cur_sig = sigma + (y >> 2) * mstr;
2129 ui32 *dpp = decoded_data + y * stride;
2130 for (ui32 i = 0; i < width; i += 4, dpp += 4)
2131 {
2132 //Process one entry from sigma array at a time
2133 // Each nibble (4 bits) in the sigma array represents 4 rows,
2134 ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data
2135 ui16 sig = *cur_sig++; // 16 bit that will be processed now
2136 int total_bits = 0;
2137 if (sig) // if any of the 32 bits are set
2138 {
2139 // We work on 4 rows, with 4 samples each, since
2140 // data is 32 bit (4 bytes)
2141
2142 // spread the 16 bits in sig to 0 or 1 bytes in sig_vec
2143 v128_t sig_vec = vsx_i16x8_splat((si16)sig);
2144 sig_vec = vsx_i8x16_swizzle(sig_vec,
2145 vsx_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1));
2146 sig_vec = vsx_v128_and(sig_vec,
2147 vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
2148 sig_vec = vsx_i8x16_eq(sig_vec,
2149 vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
2150 sig_vec = vsx_i8x16_abs(sig_vec);
2151
2152 // find cumulative sums
2153 // to find which bit in cwd we should extract
2154 v128_t ex_sum, shfl, inc_sum = sig_vec; // inclusive scan
2155 shfl = vsx_i8x16_shuffle(vsx_i64x2_const(0,0), inc_sum,
2156 15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
2157 inc_sum = vsx_i8x16_add(inc_sum, shfl);
2158 shfl = vsx_i16x8_shuffle(vsx_i64x2_const(0,0), inc_sum,
2159 7, 8, 9, 10, 11, 12, 13, 14);
2160 inc_sum = vsx_i8x16_add(inc_sum, shfl);
2161 shfl = vsx_i32x4_shuffle(vsx_i64x2_const(0,0), inc_sum,
2162 3, 4, 5, 6);
2163 inc_sum = vsx_i8x16_add(inc_sum, shfl);
2164 shfl = vsx_i64x2_shuffle(vsx_i64x2_const(0,0), inc_sum,
2165 1, 2);
2166 inc_sum = vsx_i8x16_add(inc_sum, shfl);
2167 total_bits = vsx_u8x16_extract_lane(inc_sum, 15);
2168 // exclusive scan
2169 ex_sum = vsx_i8x16_shuffle(vsx_i64x2_const(0,0), inc_sum,
2170 15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
2171
2172 // Spread the 16 bits in cwd to inverted 0 or 1 bytes in
2173 // cwd_vec. Then, convert these to a form suitable
2174 // for coefficient modifications; in particular, a value
2175 // of 0 is presented as binary 11, and a value of 1 is
2176 // represented as binary 01
2177 v128_t cwd_vec = vsx_i16x8_splat((si16)cwd);
2178 cwd_vec = vsx_i8x16_swizzle(cwd_vec,
2179 vsx_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1));
2180 cwd_vec = vsx_v128_and(cwd_vec,
2181 vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
2182 cwd_vec = vsx_i8x16_eq(cwd_vec,
2183 vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
2184 cwd_vec =
2186 cwd_vec = vsx_i8x16_add(cwd_vec, cwd_vec);
2187 cwd_vec =
2189
2190 // load data and insert the mrp bit
2191 v128_t m = vsx_i8x16_const(0,-1,-1,-1,4,-1,-1,-1,
2192 8,-1,-1,-1,12,-1,-1,-1);
2193 ui32 *dp = dpp;
2194 for (int c = 0; c < 4; ++c) {
2195 v128_t s0, s0_sig, s0_idx, s0_val;
2196 // load coefficients
2197 s0 = vsx_v128_load(dp);
2198 // find significant samples in this row
2199 s0_sig = vsx_i8x16_swizzle(sig_vec, m);
2200 s0_sig = vsx_i8x16_eq(s0_sig, vsx_i64x2_const(0, 0));
2201 // get MRP bit index, and MRP pattern
2202 s0_idx = vsx_i8x16_swizzle(ex_sum, m);
2203 s0_val = vsx_i8x16_swizzle(cwd_vec, s0_idx);
2204 // keep data from significant samples only
2205 s0_val = vsx_v128_andnot(s0_val, s0_sig);
2206 // move mrp bits to correct position, and employ
2207 s0_val = vsx_i32x4_shl(s0_val, p - 2);
2208 s0 = vsx_v128_xor(s0, s0_val);
2209 // store coefficients
2210 vsx_v128_store(dp, s0);
2211 // prepare for next row
2212 dp += stride;
2214 }
2215 }
2216 // consume data according to the number of bits set
2217 rev_advance_mrp(&magref, (ui32)total_bits);
2218 }
2219 }
2220 }
2221 }
2222
2223 return true;
2224 }
2225 }
2226}
ui16 uvlc_tbl0[256+64]
uvlc_tbl0 contains decoding information for initial row of quads
ui16 uvlc_tbl1[256]
uvlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
static ui32 rev_fetch(rev_struct *vlcp)
Retrieves 32 bits from the head of a rev_struct structure.
static void rev_init_mrp(rev_struct *mrp, ui8 *data, int lcup, int len2)
Initializes the rev_struct structure for the MRP segment, and reads a number of bytes such that the next 32 b...
static void mel_read(dec_mel_st *melp)
Reads and unstuffs the MEL bitstream.
static ui32 destuff_frwd(const ui8 *src, int size, ui8 *dst, ui32 cap)
Destuffs a bitstream into a contiguous buffer, upfront.
static void frwd_advance(frwd_struct32 *msp, ui32 num_bits)
Consume num_bits bits from the bitstream of frwd_struct32.
static void rev_read_mrp(rev_struct *mrp)
Reads and unstuffs from rev_struct.
static ui32 rev_fetch_mrp(rev_struct *mrp)
Retrieves 32 bits from the head of a rev_struct structure.
static void frwd_read(frwd_struct32 *msp)
Reads and unstuffs 32 bits from a forward-growing bitstream.
static void rev_read(rev_struct *vlcp)
Read and unstuff data from a backwardly-growing segment.
static int mel_get_run(dec_mel_st *melp)
Retrieves one run from dec_mel_st; if there are no runs stored, the MEL segment is decoded.
static void rev_init(rev_struct *vlcp, ui8 *data, int lcup, int scup)
Initiates the rev_struct structure and reads a few bytes to move the read address to a multiple of 4.
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
static v128_t decode_one_quad32(const v128_t inf_u_q, v128_t U_q, frwd_struct *magsgn, ui32 p, v128_t &vn)
Decodes one quad, using 32-bit data
static v128_t decode_two_quad16(const v128_t inf_u_q, v128_t U_q, const ui8 *dbuf, ui32 limit, ui32 &pos, ui32 p, v128_t &vn)
Decodes two consecutive quads (one octet), using 16-bit data
bool ojph_decode_codeblock_vsx(ui8 *coded_data, ui32 *decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal)
Decodes one codeblock, processing the cleanup, significance propagation, and magnitude refinement pa...
static ui32 frwd_fetch(frwd_struct32 *msp)
Fetches 32 bits from the frwd_struct32 bitstream.
static ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
static void frwd_init(frwd_struct32 *msp, const ui8 *data, int size)
Initialize frwd_struct32 struct and reads some bytes.
static v128_t vsx_dfetch(const ui8 *dbuf, ui32 limit, ui32 pos)
Fetches 128 bits from a destuffed bitstream buffer.
static void mel_decode(dec_mel_st *melp)
Decodes unstuffed MEL segment bits stored in tmp to runs.
uint64_t ui64
Definition ojph_defs.h:56
uint16_t ui16
Definition ojph_defs.h:52
static ui32 count_leading_zeros(ui32 val)
Definition ojph_arch.h:206
int32_t si32
Definition ojph_defs.h:55
int16_t si16
Definition ojph_defs.h:53
uint32_t ui32
Definition ojph_defs.h:54
uint8_t ui8
Definition ojph_defs.h:50
#define OJPH_REPEAT2(a)
Macros that help with typing and space.
#define OJPH_REPEAT4(a)
#define OJPH_REPEAT16(a)
#define OJPH_REPEAT8(a)
#define ojph_max(a, b)
Definition ojph_defs.h:73
#define OJPH_WARN(t,...)
static v128_t vsx_i8x16_splat(signed char x)
static v128_t vsx_i32x4_sub(v128_t a, v128_t b)
static v128_t vsx_i16x8_sub(v128_t a, v128_t b)
#define vsx_i8x16_shuffle(a, b, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15)
static v128_t vsx_u16x8_shr(v128_t a, int n)
static int vsx_i8x16_bitmask(v128_t a)
static v128_t vsx_i8x16_swizzle(v128_t a, v128_t idx)
static v128_t vsx_i8x16_const(signed char c0, signed char c1, signed char c2, signed char c3, signed char c4, signed char c5, signed char c6, signed char c7, signed char c8, signed char c9, signed char c10, signed char c11, signed char c12, signed char c13, signed char c14, signed char c15)
static v128_t vsx_v128_xor(v128_t a, v128_t b)
static v128_t vsx_i16x8_mul(v128_t a, v128_t b)
static v128_t vsx_u32x4_shr(v128_t a, int n)
#define vsx_i16x8_shuffle(a, b, c0, c1, c2, c3, c4, c5, c6, c7)
static v128_t vsx_i16x8_splat(short x)
static v128_t vsx_u32x4_const(unsigned int c0, unsigned int c1, unsigned int c2, unsigned int c3)
#define vsx_v128_store32_lane(p, a, i)
static v128_t vsx_i32x4_const(int c0, int c1, int c2, int c3)
static v128_t vsx_u8x16_min(v128_t a, v128_t b)
static v128_t vsx_u64x2_const(unsigned long long c0, unsigned long long c1)
static v128_t vsx_i32x4_shl(v128_t a, int n)
static v128_t vsx_i16x8_max(v128_t a, v128_t b)
static v128_t vsx_i64x2_const(long long c0, long long c1)
static v128_t vsx_i8x16_abs(v128_t a)
static v128_t vsx_i32x4_add(v128_t a, v128_t b)
#define vsx_u32x4_extract_lane(a, i)
__vector unsigned char v128_t
__vector unsigned char vsx_v_u8
static v128_t vsx_i8x16_add(v128_t a, v128_t b)
static v128_t vsx_u16x8_const(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7)
static v128_t vsx_i16x8_add(v128_t a, v128_t b)
static v128_t vsx_v128_andnot(v128_t a, v128_t b)
static void vsx_v128_store(void *p, v128_t a)
static v128_t vsx_u64x2_shr(v128_t a, int n)
static v128_t vsx_v128_and(v128_t a, v128_t b)
static v128_t vsx_i64x2_shl(v128_t a, int n)
static v128_t vsx_i16x8_eq(v128_t a, v128_t b)
static v128_t vsx_v128_or(v128_t a, v128_t b)
#define vsx_i64x2_shuffle(a, b, c0, c1)
static v128_t vsx_i8x16_eq(v128_t a, v128_t b)
#define vsx_i32x4_shuffle(a, b, c0, c1, c2, c3)
static v128_t vsx_i32x4_splat(int x)
static v128_t vsx_v128_load(const void *p)
#define vsx_u16x8_extract_lane(a, i)
static v128_t vsx_i16x8_const(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7)
static v128_t vsx_i16x8_shl(v128_t a, int n)
static v128_t vsx_u32x4_splat(unsigned int x)
#define vsx_u8x16_extract_lane(a, i)
static v128_t vsx_i8x16_gt(v128_t a, v128_t b)
static v128_t vsx_i32x4_eq(v128_t a, v128_t b)
static v128_t vsx_i32x4_gt(v128_t a, v128_t b)
MEL state structure for reading and decoding the MEL bitstream.
bool unstuff
true if the next bit needs to be unstuffed
int num_runs
number of decoded runs left in runs (maximum 8)
int size
number of bytes in MEL code
ui8 * data
the address of data (or bitstream)
int k
state of MEL decoder
int bits
number of bits stored in tmp
ui64 tmp
temporary buffer for read data
ui64 runs
runs of decoded MEL codewords (7 bits/run)
State structure for reading and unstuffing of forward-growing bitstreams; these are: MagSgn and SPP b...
ui8 tmp[48]
temporary buffer of read data + 16 extra
const ui8 * data
pointer to bitstream
ui32 bits
number of bits stored in tmp
ui32 unstuff
1 if a bit needs to be unstuffed from next byte
A structure for reading and unstuffing a segment that grows backward, such as VLC and MRP.
ui32 bits
number of bits stored in tmp
int size
number of bytes left
ui8 * data
pointer to where to read data
ui64 tmp
temporary buffer of read data