// Swaps the two bytes of the 16-bit value v: v<<8 moves the low byte into
// the high position, v>>8 moves the high byte into the low position, and
// the cast to ui16 drops whatever v<<8 pushed above bit 15.
// NOTE(review): assumes ui16 is an unsigned 16-bit type -- its declaration
// is not visible in this fragment.
54 return (
ui16)((v<<8) | (v>>8));
60 int bit_depth,
int count)
// Fragment of a single-source converter; the function header and the
// declarations of sp, p, a and t are not visible here.
// Each iteration of the vector loop below consumes 16 values from sp,
// clamps them to [0, (1 << bit_depth) - 1] and writes 16 packed bytes to p.
// NOTE(review): presumably sp points to 32-bit integers and p to bytes
// (sp += 16 pairs with four 16-byte loads, p += 16 with one 16-byte
// store) -- confirm against the declarations.
// NOTE(review): the shift/OR packing only works if every clamped value fits
// in 8 bits, i.e. bit_depth <= 8 -- confirm callers guarantee this.
65 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
66 __m128i zero = _mm_setzero_si128();
// Shuffle control: gathers byte 0 of each 32-bit lane first, then byte 1,
// byte 2 and byte 3 of each lane, which restores source order after the
// four groups have been packed into the lanes of t.
67 __m128i mask = _mm_set_epi64x(0x0F0B07030E0A0602, 0x0D0905010C080400);
72 for ( ; count >= 16; count -= 16, sp += 16, p += 16)
// Values 0..3: clamp and keep in byte 0 of each lane.
// _mm_load_si128 is an aligned load, so sp must be 16-byte aligned.
75 a = _mm_load_si128((__m128i*)sp);
76 a = _mm_max_epi32(a, zero);
77 t = _mm_min_epi32(a, max_val_vec);
// Values 4..7: clamp, then move into byte 1 of each lane.
79 a = _mm_load_si128((__m128i*)sp + 1);
80 a = _mm_max_epi32(a, zero);
81 a = _mm_min_epi32(a, max_val_vec);
82 a = _mm_slli_epi32(a, 8);
83 t = _mm_or_si128(t, a);
// Values 8..11: clamp, then move into byte 2 of each lane.
85 a = _mm_load_si128((__m128i*)sp + 2);
86 a = _mm_max_epi32(a, zero);
87 a = _mm_min_epi32(a, max_val_vec);
88 a = _mm_slli_epi32(a, 16);
89 t = _mm_or_si128(t, a);
// Values 12..15: clamp, then move into byte 3 of each lane.
91 a = _mm_load_si128((__m128i*)sp + 3);
92 a = _mm_max_epi32(a, zero);
93 a = _mm_min_epi32(a, max_val_vec);
94 a = _mm_slli_epi32(a, 24);
95 t = _mm_or_si128(t, a);
// Reorder the 16 packed bytes into source order and write them with an
// unaligned store.
97 t = _mm_shuffle_epi8(t, mask);
98 _mm_storeu_si128((__m128i*)p, t);
// Scalar loop over whatever count is left, applying the same clamp to one
// value at a time (the load of val and the store are not visible here).
101 int max_val = (1 << bit_depth) - 1;
102 for ( ; count > 0; --count)
// Clamp from below at 0, then from above at max_val.
105 val = val >= 0 ? val : 0;
106 val = val <= max_val ? val : max_val;
114 int bit_depth,
int count)
// Fragment of a three-source converter; the function header and the
// declarations of sp0, sp1, sp2 and p are not visible here.
// Each iteration of the vector loop below consumes 16 values from each of
// sp0, sp1 and sp2, clamps them to [0, (1 << bit_depth) - 1] and writes 48
// interleaved bytes (sp0, sp1, sp2, sp0, sp1, sp2, ...) to p.
// NOTE(review): presumably sp0/sp1/sp2 point to 32-bit integers and p to
// bytes -- confirm against the declarations.
// NOTE(review): the shift/OR packing only works if every clamped value fits
// in 8 bits, i.e. bit_depth <= 8 -- confirm callers guarantee this.
121 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
122 __m128i zero = _mm_setzero_si128();
// Shuffle control: keeps bytes 0..2 of each 32-bit lane (12 bytes in total)
// packed into the low 12 bytes of the result; the 0xFF selectors zero the
// top 4 bytes.
123 __m128i m0 = _mm_set_epi64x(0xFFFFFFFF0E0D0C0A, 0x0908060504020100);
126 for ( ; count >= 16; count -= 16, sp0 += 16, sp1 += 16, sp2 += 16, p += 48)
128 __m128i a, t, u, v, w;
// t: values 0..3 -- sp0 in byte 0, sp1 in byte 1, sp2 in byte 2 of each
// lane, then compacted to 12 bytes by m0.
// _mm_load_si128 is an aligned load, so the sources must be 16-byte aligned.
129 a = _mm_load_si128((__m128i*)sp0);
130 a = _mm_max_epi32(a, zero);
131 t = _mm_min_epi32(a, max_val_vec);
133 a = _mm_load_si128((__m128i*)sp1);
134 a = _mm_max_epi32(a, zero);
135 a = _mm_min_epi32(a, max_val_vec);
136 a = _mm_slli_epi32(a, 8);
137 t = _mm_or_si128(t, a);
139 a = _mm_load_si128((__m128i*)sp2);
140 a = _mm_max_epi32(a, zero);
141 a = _mm_min_epi32(a, max_val_vec);
142 a = _mm_slli_epi32(a, 16);
143 t = _mm_or_si128(t, a);
144 t = _mm_shuffle_epi8(t, m0);
// u: same packing for values 4..7.
146 a = _mm_load_si128((__m128i*)sp0 + 1);
147 a = _mm_max_epi32(a, zero);
148 u = _mm_min_epi32(a, max_val_vec);
150 a = _mm_load_si128((__m128i*)sp1 + 1);
151 a = _mm_max_epi32(a, zero);
152 a = _mm_min_epi32(a, max_val_vec);
153 a = _mm_slli_epi32(a, 8);
154 u = _mm_or_si128(u, a);
156 a = _mm_load_si128((__m128i*)sp2 + 1);
157 a = _mm_max_epi32(a, zero);
158 a = _mm_min_epi32(a, max_val_vec);
159 a = _mm_slli_epi32(a, 16);
160 u = _mm_or_si128(u, a);
161 u = _mm_shuffle_epi8(u, m0);
// v: same packing for values 8..11.
163 a = _mm_load_si128((__m128i*)sp0 + 2);
164 a = _mm_max_epi32(a, zero);
165 v = _mm_min_epi32(a, max_val_vec);
167 a = _mm_load_si128((__m128i*)sp1 + 2);
168 a = _mm_max_epi32(a, zero);
169 a = _mm_min_epi32(a, max_val_vec);
170 a = _mm_slli_epi32(a, 8);
171 v = _mm_or_si128(v, a);
173 a = _mm_load_si128((__m128i*)sp2 + 2);
174 a = _mm_max_epi32(a, zero);
175 a = _mm_min_epi32(a, max_val_vec);
176 a = _mm_slli_epi32(a, 16);
177 v = _mm_or_si128(v, a);
178 v = _mm_shuffle_epi8(v, m0);
// w: same packing for values 12..15.
180 a = _mm_load_si128((__m128i*)sp0 + 3);
181 a = _mm_max_epi32(a, zero);
182 w = _mm_min_epi32(a, max_val_vec);
184 a = _mm_load_si128((__m128i*)sp1 + 3);
185 a = _mm_max_epi32(a, zero);
186 a = _mm_min_epi32(a, max_val_vec);
187 a = _mm_slli_epi32(a, 8);
188 w = _mm_or_si128(w, a);
190 a = _mm_load_si128((__m128i*)sp2 + 3);
191 a = _mm_max_epi32(a, zero);
192 a = _mm_min_epi32(a, max_val_vec);
193 a = _mm_slli_epi32(a, 16);
194 w = _mm_or_si128(w, a);
195 w = _mm_shuffle_epi8(w, m0);
// Merge four 12-byte groups into three full 16-byte registers using byte
// shifts: t = t[0..11] + u[0..3]; u = u[4..11] + v[0..7];
// v = v[8..11] + w[0..11].
197 t = _mm_or_si128(t, _mm_bslli_si128(u, 12));
198 u = _mm_or_si128(_mm_bsrli_si128(u, 4), _mm_bslli_si128(v, 8));
199 v = _mm_or_si128(_mm_bsrli_si128(v, 8), _mm_bslli_si128(w, 4));
// Write the 48 output bytes with unaligned stores.
201 _mm_storeu_si128((__m128i*)p + 0, t);
202 _mm_storeu_si128((__m128i*)p + 1, u);
203 _mm_storeu_si128((__m128i*)p + 2, v);
// Scalar loop over whatever count is left; the same clamp is applied three
// times per iteration, presumably once per source (the loads of val and the
// stores are not visible here).
206 int max_val = (1<<bit_depth) - 1;
207 for ( ; count > 0; --count)
// Clamp from below at 0, then from above at max_val.
211 val = val >= 0 ? val : 0;
212 val = val <= max_val ? val : max_val;
215 val = val >= 0 ? val : 0;
216 val = val <= max_val ? val : max_val;
219 val = val >= 0 ? val : 0;
220 val = val <= max_val ? val : max_val;
228 int bit_depth,
int count)
// Fragment of a single-source converter; the function header and the
// declarations of sp, p, a and t are not visible here.
// Each iteration of the vector loop below consumes 8 values from sp, clamps
// them to [0, (1 << bit_depth) - 1] and writes 8 packed 16-bit values to p
// with the two bytes of each value kept in register order.
// NOTE(review): presumably sp points to 32-bit integers and p to 16-bit
// values (p += 8 pairs with one 16-byte store) -- confirm.
// NOTE(review): the shift/OR packing only works if every clamped value fits
// in 16 bits, i.e. bit_depth <= 16 -- confirm callers guarantee this.
233 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
234 __m128i zero = _mm_setzero_si128();
// Shuffle control: gathers the low 16 bits of each 32-bit lane first, then
// the high 16 bits of each lane, without swapping bytes inside a pair.
235 __m128i mask = _mm_set_epi64x(0x0F0E0B0A07060302, 0x0D0C090805040100);
240 for ( ; count >= 8; count -= 8, sp += 8, p += 8)
// Values 0..3: clamp and keep in the low 16 bits of each lane.
// _mm_load_si128 is an aligned load, so sp must be 16-byte aligned.
243 a = _mm_load_si128((__m128i*)sp);
244 a = _mm_max_epi32(a, zero);
245 t = _mm_min_epi32(a, max_val_vec);
// Values 4..7: clamp, then move into the high 16 bits of each lane.
247 a = _mm_load_si128((__m128i*)sp + 1);
248 a = _mm_max_epi32(a, zero);
249 a = _mm_min_epi32(a, max_val_vec);
250 a = _mm_slli_epi32(a, 16);
251 t = _mm_or_si128(t, a);
// Reorder the eight 16-bit values into source order and write them with an
// unaligned store.
253 t = _mm_shuffle_epi8(t, mask);
254 _mm_storeu_si128((__m128i*)p, t);
// Scalar loop over whatever count is left, applying the same clamp to one
// value at a time (the load of val and the store are not visible here).
257 int max_val = (1<<bit_depth) - 1;
258 for ( ; count > 0; --count)
// Clamp from below at 0, then from above at max_val.
261 val = val >= 0 ? val : 0;
262 val = val <= max_val ? val : max_val;
270 int bit_depth,
int count)
// Fragment of a three-source converter; the function header and the
// declarations of sp0, sp1, sp2 and p are not visible here.
// Each iteration of the vector loop below consumes 8 values from each of
// sp0, sp1 and sp2, clamps them to [0, (1 << bit_depth) - 1] and writes 24
// interleaved 16-bit values (sp0, sp1, sp2, sp0, sp1, sp2, ...) to p, with
// the two bytes of each value kept in register order.
// NOTE(review): presumably the sources are 32-bit integers and p points to
// 16-bit values (p += 24 pairs with three 16-byte stores) -- confirm.
// NOTE(review): the shift/OR packing only works if every clamped value fits
// in 16 bits, i.e. bit_depth <= 16 -- confirm callers guarantee this.
277 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
278 __m128i zero = _mm_setzero_si128();
// Shuffle controls. A 0xFF selector byte makes _mm_shuffle_epi8 write zero
// to that output byte, so the shuffled pieces can be merged with OR.
// m0/m1 build output bytes 0..15, m2/m3/m4 bytes 16..31, m5/m6 bytes 32..47.
280 __m128i m0 = _mm_set_epi64x(0x0B0A0908FFFF0706, 0x0504FFFF03020100);
281 __m128i m1 = _mm_set_epi64x(0xFFFFFFFF0504FFFF, 0xFFFF0100FFFFFFFF);
282 __m128i m2 = _mm_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFF0F0E0D0CFFFF);
283 __m128i m3 = _mm_set_epi64x(0x0706FFFFFFFF0302, 0x0D0CFFFFFFFF0908);
284 __m128i m4 = _mm_set_epi64x(0xFFFF03020100FFFF, 0xFFFFFFFFFFFFFFFF);
285 __m128i m5 = _mm_set_epi64x(0xFFFFFFFF0F0EFFFF, 0xFFFF0B0AFFFFFFFF);
286 __m128i m6 = _mm_set_epi64x(0x0F0E0D0CFFFF0B0A, 0x0908FFFF07060504);
289 for ( ; count >= 8; count -= 8, sp0 += 8, sp1 += 8, sp2 += 8, p += 24)
291 __m128i a, b, t, u, v;
// t: sp0[0..3] in the low 16 bits of each lane, sp1[0..3] in the high 16.
// _mm_load_si128 is an aligned load, so the sources must be 16-byte aligned.
292 a = _mm_load_si128((__m128i*)sp0);
293 a = _mm_max_epi32(a, zero);
294 t = _mm_min_epi32(a, max_val_vec);
296 a = _mm_load_si128((__m128i*)sp1);
297 a = _mm_max_epi32(a, zero);
298 a = _mm_min_epi32(a, max_val_vec);
299 a = _mm_slli_epi32(a, 16);
300 t = _mm_or_si128(t, a);
// u: sp2[0..3] in the low 16 bits of each lane, sp0[4..7] in the high 16.
302 a = _mm_load_si128((__m128i*)sp2);
303 a = _mm_max_epi32(a, zero);
304 u = _mm_min_epi32(a, max_val_vec);
306 a = _mm_load_si128((__m128i*)sp0 + 1);
307 a = _mm_max_epi32(a, zero);
308 a = _mm_min_epi32(a, max_val_vec);
309 a = _mm_slli_epi32(a, 16);
310 u = _mm_or_si128(u, a);
// v: sp1[4..7] in the low 16 bits of each lane, sp2[4..7] in the high 16.
312 a = _mm_load_si128((__m128i*)sp1 + 1);
313 a = _mm_max_epi32(a, zero);
314 v = _mm_min_epi32(a, max_val_vec);
316 a = _mm_load_si128((__m128i*)sp2 + 1);
317 a = _mm_max_epi32(a, zero);
318 a = _mm_min_epi32(a, max_val_vec);
319 a = _mm_slli_epi32(a, 16);
320 v = _mm_or_si128(v, a);
// Output values 0..7: triplets 0 and 1 plus the first two entries of
// triplet 2 (sp0/sp1 entries come from t, sp2 entries from u).
322 a = _mm_shuffle_epi8(t, m0);
323 b = _mm_shuffle_epi8(u, m1);
324 a = _mm_or_si128(a, b);
325 _mm_storeu_si128((__m128i*)p, a);
// Output values 8..15: the rest of triplet 2, triplets 3 and 4, and the
// sp0 entry of triplet 5 (pieces taken from t, u and v).
327 a = _mm_shuffle_epi8(t, m2);
328 b = _mm_shuffle_epi8(u, m3);
329 a = _mm_or_si128(a, b);
330 b = _mm_shuffle_epi8(v, m4);
331 a = _mm_or_si128(a, b);
332 _mm_storeu_si128((__m128i*)p + 1, a);
// Output values 16..23: the rest of triplet 5 and triplets 6 and 7
// (sp0 entries come from u, sp1/sp2 entries from v).
334 a = _mm_shuffle_epi8(u, m5);
335 b = _mm_shuffle_epi8(v, m6);
336 a = _mm_or_si128(a, b);
337 _mm_storeu_si128((__m128i*)p + 2, a);
// Scalar loop over whatever count is left; the same clamp is applied three
// times per iteration, presumably once per source (the loads of val and the
// stores are not visible here).
340 int max_val = (1<<bit_depth) - 1;
341 for ( ; count > 0; --count)
// Clamp from below at 0, then from above at max_val.
345 val = val >= 0 ? val : 0;
346 val = val <= max_val ? val : max_val;
349 val = val >= 0 ? val : 0;
350 val = val <= max_val ? val : max_val;
353 val = val >= 0 ? val : 0;
354 val = val <= max_val ? val : max_val;
362 int bit_depth,
int count)
// Fragment of a single-source converter; the function header and the
// declarations of sp, p, a and t are not visible here.
// Each iteration of the vector loop below consumes 8 values from sp, clamps
// them to [0, (1 << bit_depth) - 1] and writes 8 packed 16-bit values to p
// with the two bytes of each value swapped.
// NOTE(review): presumably sp points to 32-bit integers and p to 16-bit
// values (p += 8 pairs with one 16-byte store) -- confirm.
// NOTE(review): the shift/OR packing only works if every clamped value fits
// in 16 bits, i.e. bit_depth <= 16 -- confirm callers guarantee this.
367 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
368 __m128i zero = _mm_setzero_si128();
// Shuffle control: gathers the low 16 bits of each 32-bit lane first, then
// the high 16 bits of each lane, and swaps the two bytes inside every pair
// (selectors 01,00 / 05,04 / ... instead of 00,01 / 04,05 / ...).
369 __m128i mask = _mm_set_epi64x(0x0E0F0A0B06070203, 0x0C0D080904050001);
374 for ( ; count >= 8; count -= 8, sp += 8, p += 8)
// Values 0..3: clamp and keep in the low 16 bits of each lane.
// _mm_load_si128 is an aligned load, so sp must be 16-byte aligned.
377 a = _mm_load_si128((__m128i*)sp);
378 a = _mm_max_epi32(a, zero);
379 t = _mm_min_epi32(a, max_val_vec);
// Values 4..7: clamp, then move into the high 16 bits of each lane.
381 a = _mm_load_si128((__m128i*)sp + 1);
382 a = _mm_max_epi32(a, zero);
383 a = _mm_min_epi32(a, max_val_vec);
384 a = _mm_slli_epi32(a, 16);
385 t = _mm_or_si128(t, a);
// Reorder into source order with byte-swapped pairs and write with an
// unaligned store.
387 t = _mm_shuffle_epi8(t, mask);
388 _mm_storeu_si128((__m128i*)p, t);
// Scalar loop over whatever count is left, applying the same clamp to one
// value at a time (the load of val, any byte swap and the store are not
// visible here).
391 int max_val = (1<<bit_depth) - 1;
392 for ( ; count > 0; --count)
// Clamp from below at 0, then from above at max_val.
395 val = val >= 0 ? val : 0;
396 val = val <= max_val ? val : max_val;
404 int bit_depth,
int count)
// Fragment of a three-source converter; the function header and the
// declarations of sp0, sp1, sp2 and p are not visible here.
// Each iteration of the vector loop below consumes 8 values from each of
// sp0, sp1 and sp2, clamps them to [0, (1 << bit_depth) - 1] and writes 24
// interleaved 16-bit values (sp0, sp1, sp2, sp0, sp1, sp2, ...) to p, with
// the two bytes of each value swapped.
// NOTE(review): presumably the sources are 32-bit integers and p points to
// 16-bit values (p += 24 pairs with three 16-byte stores) -- confirm.
// NOTE(review): the shift/OR packing only works if every clamped value fits
// in 16 bits, i.e. bit_depth <= 16 -- confirm callers guarantee this.
411 __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
412 __m128i zero = _mm_setzero_si128();
// Shuffle controls. A 0xFF selector byte makes _mm_shuffle_epi8 write zero
// to that output byte, so the shuffled pieces can be merged with OR.
// Every selected pair is listed high byte first (01,00 / 03,02 / ...),
// which swaps the bytes of each 16-bit value.
// m0/m1 build output bytes 0..15, m2/m3/m4 bytes 16..31, m5/m6 bytes 32..47.
414 __m128i m0 = _mm_set_epi64x(0x0A0B0809FFFF0607, 0x0405FFFF02030001);
415 __m128i m1 = _mm_set_epi64x(0xFFFFFFFF0405FFFF, 0xFFFF0001FFFFFFFF);
416 __m128i m2 = _mm_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFF0E0F0C0DFFFF);
417 __m128i m3 = _mm_set_epi64x(0x0607FFFFFFFF0203, 0x0C0DFFFFFFFF0809);
418 __m128i m4 = _mm_set_epi64x(0xFFFF02030001FFFF, 0xFFFFFFFFFFFFFFFF);
419 __m128i m5 = _mm_set_epi64x(0xFFFFFFFF0E0FFFFF, 0xFFFF0A0BFFFFFFFF);
420 __m128i m6 = _mm_set_epi64x(0x0E0F0C0DFFFF0A0B, 0x0809FFFF06070405);
423 for ( ; count >= 8; count -= 8, sp0 += 8, sp1 += 8, sp2 += 8, p += 24)
425 __m128i a, b, t, u, v;
// t: sp0[0..3] in the low 16 bits of each lane, sp1[0..3] in the high 16.
// _mm_load_si128 is an aligned load, so the sources must be 16-byte aligned.
426 a = _mm_load_si128((__m128i*)sp0);
427 a = _mm_max_epi32(a, zero);
428 t = _mm_min_epi32(a, max_val_vec);
430 a = _mm_load_si128((__m128i*)sp1);
431 a = _mm_max_epi32(a, zero);
432 a = _mm_min_epi32(a, max_val_vec);
433 a = _mm_slli_epi32(a, 16);
434 t = _mm_or_si128(t, a);
// u: sp2[0..3] in the low 16 bits of each lane, sp0[4..7] in the high 16.
436 a = _mm_load_si128((__m128i*)sp2);
437 a = _mm_max_epi32(a, zero);
438 u = _mm_min_epi32(a, max_val_vec);
440 a = _mm_load_si128((__m128i*)sp0 + 1);
441 a = _mm_max_epi32(a, zero);
442 a = _mm_min_epi32(a, max_val_vec);
443 a = _mm_slli_epi32(a, 16);
444 u = _mm_or_si128(u, a);
// v: sp1[4..7] in the low 16 bits of each lane, sp2[4..7] in the high 16.
446 a = _mm_load_si128((__m128i*)sp1 + 1);
447 a = _mm_max_epi32(a, zero);
448 v = _mm_min_epi32(a, max_val_vec);
450 a = _mm_load_si128((__m128i*)sp2 + 1);
451 a = _mm_max_epi32(a, zero);
452 a = _mm_min_epi32(a, max_val_vec);
453 a = _mm_slli_epi32(a, 16);
454 v = _mm_or_si128(v, a);
// Output values 0..7: triplets 0 and 1 plus the first two entries of
// triplet 2 (sp0/sp1 entries come from t, sp2 entries from u).
456 a = _mm_shuffle_epi8(t, m0);
457 b = _mm_shuffle_epi8(u, m1);
458 a = _mm_or_si128(a, b);
459 _mm_storeu_si128((__m128i*)p, a);
// Output values 8..15: the rest of triplet 2, triplets 3 and 4, and the
// sp0 entry of triplet 5 (pieces taken from t, u and v).
461 a = _mm_shuffle_epi8(t, m2);
462 b = _mm_shuffle_epi8(u, m3);
463 a = _mm_or_si128(a, b);
464 b = _mm_shuffle_epi8(v, m4);
465 a = _mm_or_si128(a, b);
466 _mm_storeu_si128((__m128i*)p + 1, a);
// Output values 16..23: the rest of triplet 5 and triplets 6 and 7
// (sp0 entries come from u, sp1/sp2 entries from v).
468 a = _mm_shuffle_epi8(u, m5);
469 b = _mm_shuffle_epi8(v, m6);
470 a = _mm_or_si128(a, b);
471 _mm_storeu_si128((__m128i*)p + 2, a);
// Scalar loop over whatever count is left; the same clamp is applied three
// times per iteration, presumably once per source (the loads of val, any
// byte swap and the stores are not visible here).
474 int max_val = (1<<bit_depth) - 1;
475 for ( ; count > 0; --count)
// Clamp from below at 0, then from above at max_val.
479 val = val >= 0 ? val : 0;
480 val = val <= max_val ? val : max_val;
483 val = val >= 0 ? val : 0;
484 val = val <= max_val ? val : max_val;
487 val = val >= 0 ? val : 0;
488 val = val <= max_val ? val : max_val;
void sse41_cvrt_32b3c_to_16ub3c_be(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
void sse41_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
void sse41_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
static ui16 be2le(const ui16 v)
void sse41_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
void sse41_cvrt_32b3c_to_16ub3c_le(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
void sse41_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)