OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_img_io_sse41.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_img_io_sse41.cpp
34// Author: Aous Naman
35// Date: 23 May 2022
36//***************************************************************************/
37
38
39#include <cstdlib>
40#include <cstring>
41#include <immintrin.h>
42
43#include "ojph_file.h"
44#include "ojph_img_io.h"
45#include "ojph_mem.h"
46#include "ojph_message.h"
47
48namespace ojph {
49
51 static
52 ui16 be2le(const ui16 v)
53 {
54 return (ui16)((v<<8) | (v>>8));
55 }
56
58 void sse41_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1,
59 const line_buf *ln2, void *dp,
60 int bit_depth, int count)
61 {
62 ojph_unused(ln1);
63 ojph_unused(ln2);
64
65 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
66 __m128i zero = _mm_setzero_si128();
67 __m128i mask = _mm_set_epi64x(0x0F0B07030E0A0602, 0x0D0905010C080400);
68 const si32 *sp = ln0->i32;
69 ui8* p = (ui8 *)dp;
70
71 // 16 bytes or entries in each loop
72 for ( ; count >= 16; count -= 16, sp += 16, p += 16)
73 {
74 __m128i a, t;
75 a = _mm_load_si128((__m128i*)sp);
76 a = _mm_max_epi32(a, zero);
77 t = _mm_min_epi32(a, max_val_vec);
78
79 a = _mm_load_si128((__m128i*)sp + 1);
80 a = _mm_max_epi32(a, zero);
81 a = _mm_min_epi32(a, max_val_vec);
82 a = _mm_slli_epi32(a, 8);
83 t = _mm_or_si128(t, a);
84
85 a = _mm_load_si128((__m128i*)sp + 2);
86 a = _mm_max_epi32(a, zero);
87 a = _mm_min_epi32(a, max_val_vec);
88 a = _mm_slli_epi32(a, 16);
89 t = _mm_or_si128(t, a);
90
91 a = _mm_load_si128((__m128i*)sp + 3);
92 a = _mm_max_epi32(a, zero);
93 a = _mm_min_epi32(a, max_val_vec);
94 a = _mm_slli_epi32(a, 24);
95 t = _mm_or_si128(t, a);
96
97 t = _mm_shuffle_epi8(t, mask);
98 _mm_storeu_si128((__m128i*)p, t);
99 }
100
101 int max_val = (1 << bit_depth) - 1;
102 for ( ; count > 0; --count)
103 {
104 int val = *sp++;
105 val = val >= 0 ? val : 0;
106 val = val <= max_val ? val : max_val;
107 *p++ = (ui8)val;
108 }
109 }
110
112 void sse41_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1,
113 const line_buf *ln2, void *dp,
114 int bit_depth, int count)
115 {
116 const si32 *sp0 = ln0->i32;
117 const si32 *sp1 = ln1->i32;
118 const si32 *sp2 = ln2->i32;
119 ui8* p = (ui8 *)dp;
120
121 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
122 __m128i zero = _mm_setzero_si128();
123 __m128i m0 = _mm_set_epi64x(0xFFFFFFFF0E0D0C0A, 0x0908060504020100);
124
125 // 16 entries in each loop
126 for ( ; count >= 16; count -= 16, sp0 += 16, sp1 += 16, sp2 += 16, p += 48)
127 {
128 __m128i a, t, u, v, w;
129 a = _mm_load_si128((__m128i*)sp0);
130 a = _mm_max_epi32(a, zero);
131 t = _mm_min_epi32(a, max_val_vec);
132
133 a = _mm_load_si128((__m128i*)sp1);
134 a = _mm_max_epi32(a, zero);
135 a = _mm_min_epi32(a, max_val_vec);
136 a = _mm_slli_epi32(a, 8);
137 t = _mm_or_si128(t, a);
138
139 a = _mm_load_si128((__m128i*)sp2);
140 a = _mm_max_epi32(a, zero);
141 a = _mm_min_epi32(a, max_val_vec);
142 a = _mm_slli_epi32(a, 16);
143 t = _mm_or_si128(t, a);
144 t = _mm_shuffle_epi8(t, m0);
145
146 a = _mm_load_si128((__m128i*)sp0 + 1);
147 a = _mm_max_epi32(a, zero);
148 u = _mm_min_epi32(a, max_val_vec);
149
150 a = _mm_load_si128((__m128i*)sp1 + 1);
151 a = _mm_max_epi32(a, zero);
152 a = _mm_min_epi32(a, max_val_vec);
153 a = _mm_slli_epi32(a, 8);
154 u = _mm_or_si128(u, a);
155
156 a = _mm_load_si128((__m128i*)sp2 + 1);
157 a = _mm_max_epi32(a, zero);
158 a = _mm_min_epi32(a, max_val_vec);
159 a = _mm_slli_epi32(a, 16);
160 u = _mm_or_si128(u, a);
161 u = _mm_shuffle_epi8(u, m0);
162
163 a = _mm_load_si128((__m128i*)sp0 + 2);
164 a = _mm_max_epi32(a, zero);
165 v = _mm_min_epi32(a, max_val_vec);
166
167 a = _mm_load_si128((__m128i*)sp1 + 2);
168 a = _mm_max_epi32(a, zero);
169 a = _mm_min_epi32(a, max_val_vec);
170 a = _mm_slli_epi32(a, 8);
171 v = _mm_or_si128(v, a);
172
173 a = _mm_load_si128((__m128i*)sp2 + 2);
174 a = _mm_max_epi32(a, zero);
175 a = _mm_min_epi32(a, max_val_vec);
176 a = _mm_slli_epi32(a, 16);
177 v = _mm_or_si128(v, a);
178 v = _mm_shuffle_epi8(v, m0);
179
180 a = _mm_load_si128((__m128i*)sp0 + 3);
181 a = _mm_max_epi32(a, zero);
182 w = _mm_min_epi32(a, max_val_vec);
183
184 a = _mm_load_si128((__m128i*)sp1 + 3);
185 a = _mm_max_epi32(a, zero);
186 a = _mm_min_epi32(a, max_val_vec);
187 a = _mm_slli_epi32(a, 8);
188 w = _mm_or_si128(w, a);
189
190 a = _mm_load_si128((__m128i*)sp2 + 3);
191 a = _mm_max_epi32(a, zero);
192 a = _mm_min_epi32(a, max_val_vec);
193 a = _mm_slli_epi32(a, 16);
194 w = _mm_or_si128(w, a);
195 w = _mm_shuffle_epi8(w, m0);
196
197 t = _mm_or_si128(t, _mm_bslli_si128(u, 12));
198 u = _mm_or_si128(_mm_bsrli_si128(u, 4), _mm_bslli_si128(v, 8));
199 v = _mm_or_si128(_mm_bsrli_si128(v, 8), _mm_bslli_si128(w, 4));
200
201 _mm_storeu_si128((__m128i*)p + 0, t);
202 _mm_storeu_si128((__m128i*)p + 1, u);
203 _mm_storeu_si128((__m128i*)p + 2, v);
204 }
205
206 int max_val = (1<<bit_depth) - 1;
207 for ( ; count > 0; --count)
208 {
209 int val;
210 val = *sp0++;
211 val = val >= 0 ? val : 0;
212 val = val <= max_val ? val : max_val;
213 *p++ = (ui8) val;
214 val = *sp1++;
215 val = val >= 0 ? val : 0;
216 val = val <= max_val ? val : max_val;
217 *p++ = (ui8) val;
218 val = *sp2++;
219 val = val >= 0 ? val : 0;
220 val = val <= max_val ? val : max_val;
221 *p++ = (ui8) val;
222 }
223 }
224
226 void sse41_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1,
227 const line_buf *ln2, void *dp,
228 int bit_depth, int count)
229 {
230 ojph_unused(ln1);
231 ojph_unused(ln2);
232
233 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
234 __m128i zero = _mm_setzero_si128();
235 __m128i mask = _mm_set_epi64x(0x0F0E0B0A07060302, 0x0D0C090805040100);
236 const si32 *sp = ln0->i32;
237 ui16* p = (ui16 *)dp;
238
239 // 8 entries in each loop
240 for ( ; count >= 8; count -= 8, sp += 8, p += 8)
241 {
242 __m128i a, t;
243 a = _mm_load_si128((__m128i*)sp);
244 a = _mm_max_epi32(a, zero);
245 t = _mm_min_epi32(a, max_val_vec);
246
247 a = _mm_load_si128((__m128i*)sp + 1);
248 a = _mm_max_epi32(a, zero);
249 a = _mm_min_epi32(a, max_val_vec);
250 a = _mm_slli_epi32(a, 16);
251 t = _mm_or_si128(t, a);
252
253 t = _mm_shuffle_epi8(t, mask);
254 _mm_storeu_si128((__m128i*)p, t);
255 }
256
257 int max_val = (1<<bit_depth) - 1;
258 for ( ; count > 0; --count)
259 {
260 int val = *sp++;
261 val = val >= 0 ? val : 0;
262 val = val <= max_val ? val : max_val;
263 *p++ = (ui16) val;
264 }
265 }
266
268 void sse41_cvrt_32b3c_to_16ub3c_le(const line_buf *ln0, const line_buf *ln1,
269 const line_buf *ln2, void *dp,
270 int bit_depth, int count)
271 {
272 const si32 *sp0 = ln0->i32;
273 const si32 *sp1 = ln1->i32;
274 const si32 *sp2 = ln2->i32;
275 ui16* p = (ui16*)dp;
276
277 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
278 __m128i zero = _mm_setzero_si128();
279
280 __m128i m0 = _mm_set_epi64x(0x0B0A0908FFFF0706, 0x0504FFFF03020100);
281 __m128i m1 = _mm_set_epi64x(0xFFFFFFFF0504FFFF, 0xFFFF0100FFFFFFFF);
282 __m128i m2 = _mm_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFF0F0E0D0CFFFF);
283 __m128i m3 = _mm_set_epi64x(0x0706FFFFFFFF0302, 0x0D0CFFFFFFFF0908);
284 __m128i m4 = _mm_set_epi64x(0xFFFF03020100FFFF, 0xFFFFFFFFFFFFFFFF);
285 __m128i m5 = _mm_set_epi64x(0xFFFFFFFF0F0EFFFF, 0xFFFF0B0AFFFFFFFF);
286 __m128i m6 = _mm_set_epi64x(0x0F0E0D0CFFFF0B0A, 0x0908FFFF07060504);
287
288 // 24 entries in each loop
289 for ( ; count >= 8; count -= 8, sp0 += 8, sp1 += 8, sp2 += 8, p += 24)
290 {
291 __m128i a, b, t, u, v;
292 a = _mm_load_si128((__m128i*)sp0);
293 a = _mm_max_epi32(a, zero);
294 t = _mm_min_epi32(a, max_val_vec);
295
296 a = _mm_load_si128((__m128i*)sp1);
297 a = _mm_max_epi32(a, zero);
298 a = _mm_min_epi32(a, max_val_vec);
299 a = _mm_slli_epi32(a, 16);
300 t = _mm_or_si128(t, a);
301
302 a = _mm_load_si128((__m128i*)sp2);
303 a = _mm_max_epi32(a, zero);
304 u = _mm_min_epi32(a, max_val_vec);
305
306 a = _mm_load_si128((__m128i*)sp0 + 1);
307 a = _mm_max_epi32(a, zero);
308 a = _mm_min_epi32(a, max_val_vec);
309 a = _mm_slli_epi32(a, 16);
310 u = _mm_or_si128(u, a);
311
312 a = _mm_load_si128((__m128i*)sp1 + 1);
313 a = _mm_max_epi32(a, zero);
314 v = _mm_min_epi32(a, max_val_vec);
315
316 a = _mm_load_si128((__m128i*)sp2 + 1);
317 a = _mm_max_epi32(a, zero);
318 a = _mm_min_epi32(a, max_val_vec);
319 a = _mm_slli_epi32(a, 16);
320 v = _mm_or_si128(v, a);
321
322 a = _mm_shuffle_epi8(t, m0);
323 b = _mm_shuffle_epi8(u, m1);
324 a = _mm_or_si128(a, b);
325 _mm_storeu_si128((__m128i*)p, a);
326
327 a = _mm_shuffle_epi8(t, m2);
328 b = _mm_shuffle_epi8(u, m3);
329 a = _mm_or_si128(a, b);
330 b = _mm_shuffle_epi8(v, m4);
331 a = _mm_or_si128(a, b);
332 _mm_storeu_si128((__m128i*)p + 1, a);
333
334 a = _mm_shuffle_epi8(u, m5);
335 b = _mm_shuffle_epi8(v, m6);
336 a = _mm_or_si128(a, b);
337 _mm_storeu_si128((__m128i*)p + 2, a);
338 }
339
340 int max_val = (1<<bit_depth) - 1;
341 for ( ; count > 0; --count)
342 {
343 int val;
344 val = *sp0++;
345 val = val >= 0 ? val : 0;
346 val = val <= max_val ? val : max_val;
347 *p++ = be2le((ui16) val);
348 val = *sp1++;
349 val = val >= 0 ? val : 0;
350 val = val <= max_val ? val : max_val;
351 *p++ = be2le((ui16) val);
352 val = *sp2++;
353 val = val >= 0 ? val : 0;
354 val = val <= max_val ? val : max_val;
355 *p++ = (ui16) val;
356 }
357 }
358
360 void sse41_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1,
361 const line_buf *ln2, void *dp,
362 int bit_depth, int count)
363 {
364 ojph_unused(ln1);
365 ojph_unused(ln2);
366
367 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
368 __m128i zero = _mm_setzero_si128();
369 __m128i mask = _mm_set_epi64x(0x0E0F0A0B06070203, 0x0C0D080904050001);
370 const si32 *sp = ln0->i32;
371 ui16* p = (ui16 *)dp;
372
373 // 8 entries in each loop
374 for ( ; count >= 8; count -= 8, sp += 8, p += 8)
375 {
376 __m128i a, t;
377 a = _mm_load_si128((__m128i*)sp);
378 a = _mm_max_epi32(a, zero);
379 t = _mm_min_epi32(a, max_val_vec);
380
381 a = _mm_load_si128((__m128i*)sp + 1);
382 a = _mm_max_epi32(a, zero);
383 a = _mm_min_epi32(a, max_val_vec);
384 a = _mm_slli_epi32(a, 16);
385 t = _mm_or_si128(t, a);
386
387 t = _mm_shuffle_epi8(t, mask);
388 _mm_storeu_si128((__m128i*)p, t);
389 }
390
391 int max_val = (1<<bit_depth) - 1;
392 for ( ; count > 0; --count)
393 {
394 int val = *sp++;
395 val = val >= 0 ? val : 0;
396 val = val <= max_val ? val : max_val;
397 *p++ = be2le((ui16) val);
398 }
399 }
400
402 void sse41_cvrt_32b3c_to_16ub3c_be(const line_buf *ln0, const line_buf *ln1,
403 const line_buf *ln2, void *dp,
404 int bit_depth, int count)
405 {
406 const si32 *sp0 = ln0->i32;
407 const si32 *sp1 = ln1->i32;
408 const si32 *sp2 = ln2->i32;
409 ui16* p = (ui16*)dp;
410
411 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
412 __m128i zero = _mm_setzero_si128();
413
414 __m128i m0 = _mm_set_epi64x(0x0A0B0809FFFF0607, 0x0405FFFF02030001);
415 __m128i m1 = _mm_set_epi64x(0xFFFFFFFF0405FFFF, 0xFFFF0001FFFFFFFF);
416 __m128i m2 = _mm_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFF0E0F0C0DFFFF);
417 __m128i m3 = _mm_set_epi64x(0x0607FFFFFFFF0203, 0x0C0DFFFFFFFF0809);
418 __m128i m4 = _mm_set_epi64x(0xFFFF02030001FFFF, 0xFFFFFFFFFFFFFFFF);
419 __m128i m5 = _mm_set_epi64x(0xFFFFFFFF0E0FFFFF, 0xFFFF0A0BFFFFFFFF);
420 __m128i m6 = _mm_set_epi64x(0x0E0F0C0DFFFF0A0B, 0x0809FFFF06070405);
421
422 // 24 entries in each loop
423 for ( ; count >= 8; count -= 8, sp0 += 8, sp1 += 8, sp2 += 8, p += 24)
424 {
425 __m128i a, b, t, u, v;
426 a = _mm_load_si128((__m128i*)sp0);
427 a = _mm_max_epi32(a, zero);
428 t = _mm_min_epi32(a, max_val_vec);
429
430 a = _mm_load_si128((__m128i*)sp1);
431 a = _mm_max_epi32(a, zero);
432 a = _mm_min_epi32(a, max_val_vec);
433 a = _mm_slli_epi32(a, 16);
434 t = _mm_or_si128(t, a);
435
436 a = _mm_load_si128((__m128i*)sp2);
437 a = _mm_max_epi32(a, zero);
438 u = _mm_min_epi32(a, max_val_vec);
439
440 a = _mm_load_si128((__m128i*)sp0 + 1);
441 a = _mm_max_epi32(a, zero);
442 a = _mm_min_epi32(a, max_val_vec);
443 a = _mm_slli_epi32(a, 16);
444 u = _mm_or_si128(u, a);
445
446 a = _mm_load_si128((__m128i*)sp1 + 1);
447 a = _mm_max_epi32(a, zero);
448 v = _mm_min_epi32(a, max_val_vec);
449
450 a = _mm_load_si128((__m128i*)sp2 + 1);
451 a = _mm_max_epi32(a, zero);
452 a = _mm_min_epi32(a, max_val_vec);
453 a = _mm_slli_epi32(a, 16);
454 v = _mm_or_si128(v, a);
455
456 a = _mm_shuffle_epi8(t, m0);
457 b = _mm_shuffle_epi8(u, m1);
458 a = _mm_or_si128(a, b);
459 _mm_storeu_si128((__m128i*)p, a);
460
461 a = _mm_shuffle_epi8(t, m2);
462 b = _mm_shuffle_epi8(u, m3);
463 a = _mm_or_si128(a, b);
464 b = _mm_shuffle_epi8(v, m4);
465 a = _mm_or_si128(a, b);
466 _mm_storeu_si128((__m128i*)p + 1, a);
467
468 a = _mm_shuffle_epi8(u, m5);
469 b = _mm_shuffle_epi8(v, m6);
470 a = _mm_or_si128(a, b);
471 _mm_storeu_si128((__m128i*)p + 2, a);
472 }
473
474 int max_val = (1<<bit_depth) - 1;
475 for ( ; count > 0; --count)
476 {
477 int val;
478 val = *sp0++;
479 val = val >= 0 ? val : 0;
480 val = val <= max_val ? val : max_val;
481 *p++ = be2le((ui16) val);
482 val = *sp1++;
483 val = val >= 0 ? val : 0;
484 val = val <= max_val ? val : max_val;
485 *p++ = be2le((ui16) val);
486 val = *sp2++;
487 val = val >= 0 ? val : 0;
488 val = val <= max_val ? val : max_val;
489 *p++ = be2le((ui16) val);
490 }
491 }
492}
void sse41_cvrt_32b3c_to_16ub3c_be(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
void sse41_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
uint16_t ui16
Definition: ojph_defs.h:52
void sse41_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
static ui16 be2le(const ui16 v)
Definition: ojph_img_io.cpp:55
void sse41_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
void sse41_cvrt_32b3c_to_16ub3c_le(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
int32_t si32
Definition: ojph_defs.h:55
void sse41_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
uint8_t ui8
Definition: ojph_defs.h:50
#define ojph_unused(x)
Definition: ojph_defs.h:78
si32 * i32
Definition: ojph_mem.h:155