static inline ui16 be2le(const ui16 v)
{
  return (ui16)((v << 8) | (v >> 8));
}
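// Illustration only (not part of the original file): be2le swaps the two
// bytes of a 16-bit value, converting between big- and little-endian
// representations in either direction, e.g.
//
//   be2le(0x1234) == 0x3412
//   be2le(be2le(x)) == x   // the swap is its own inverse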
void avx2_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1,
                              const line_buf *ln2, void *dp,
                              int bit_depth, int count)
{
  (void)ln1; (void)ln2;  // unused: single-component conversion

  __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
  __m256i zero = _mm256_setzero_si256();
  // gathers the packed bytes of each 128-bit lane into memory order
  __m256i mask = _mm256_set_epi64x(0x0F0B07030E0A0602, 0x0D0905010C080400,
                                   0x0F0B07030E0A0602, 0x0D0905010C080400);
  const si32 *sp = ln0->i32;
  ui8 *p = (ui8 *)dp;

  // 32 samples per iteration: four 8x32-bit loads become 32 output bytes
  for ( ; count >= 32; count -= 32, sp += 32, p += 32)
  {
    __m256i a, t, u, v0, v1;
    // clamp samples 0..7 to [0, max_val]
    a = _mm256_load_si256((__m256i*)sp);
    a = _mm256_max_epi32(a, zero);
    t = _mm256_min_epi32(a, max_val_vec);
    // clamp samples 8..15 and pack them into the high 16 bits of t
    a = _mm256_load_si256((__m256i*)sp + 1);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 16);
    t = _mm256_or_si256(t, a);
    // samples 16..23 and 24..31 likewise into u
    a = _mm256_load_si256((__m256i*)sp + 2);
    a = _mm256_max_epi32(a, zero);
    u = _mm256_min_epi32(a, max_val_vec);

    a = _mm256_load_si256((__m256i*)sp + 3);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 16);
    u = _mm256_or_si256(u, a);
    // with bit_depth <= 8 each clamped sample fits the low byte of its
    // 16-bit slot; pair the 128-bit lanes of t and u, then shift v1 so
    // all four loads occupy distinct byte positions of v0
    v0 = _mm256_permute2x128_si256(t, u, 0x20);
    v1 = _mm256_permute2x128_si256(t, u, 0x31);
    v1 = _mm256_slli_epi32(v1, 8);
    v0 = _mm256_or_si256(v0, v1);
    // gather the 32 bytes into memory order and store
    v0 = _mm256_shuffle_epi8(v0, mask);
    _mm256_storeu_si256((__m256i*)p, v0);
  }

  // scalar tail for the remaining (count % 32) samples
  int max_val = (1 << bit_depth) - 1;
  for ( ; count > 0; --count)
  {
    int val = *sp++;
    val = val >= 0 ? val : 0;
    val = val <= max_val ? val : max_val;
    *p++ = (ui8)val;
  }
}
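// Illustration only (not part of the original file): after the permute/
// shift/OR sequence above, byte j of 32-bit element i in each 128-bit
// lane holds sample 4*j + i of that lane's 16 consecutive samples, so
// the gather mask used above can be reproduced with
//
//   ui8 mask_bytes[16];
//   for (int k = 0; k < 16; ++k)   // output byte k <- sample k
//     mask_bytes[k] = (ui8)(4 * (k % 4) + (k / 4));
//
// which yields the 0x0D0905010C080400 / 0x0F0B07030E0A0602 pattern per
// lane.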
void avx2_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1,
                              const line_buf *ln2, void *dp,
                              int bit_depth, int count)
{
  __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
  __m256i zero = _mm256_setzero_si256();
  // per 128-bit lane, gathers four interleaved 3-byte pixels (12 bytes);
  // the 0xFF indices zero the top 4 bytes of each lane
  __m256i m0 = _mm256_set_epi64x(0xFFFFFFFF0E0D0C0A, 0x0908060504020100,
                                 0xFFFFFFFF0E0D0C0A, 0x0908060504020100);
  const si32 *sp0 = ln0->i32;
  const si32 *sp1 = ln1->i32;
  const si32 *sp2 = ln2->i32;
  ui8 *p = (ui8 *)dp;

  // 32 pixels (96 interleaved output bytes) per iteration
  for ( ; count >= 32; count -= 32, sp0 += 32, sp1 += 32, sp2 += 32, p += 96)
  {
    __m256i a, t, u, v, w;
    // pixels 0..7: clamp each component to [0, max_val] and pack the
    // three components into bytes 0, 1 and 2 of every 32-bit element
    a = _mm256_load_si256((__m256i*)sp0);
    a = _mm256_max_epi32(a, zero);
    t = _mm256_min_epi32(a, max_val_vec);

    a = _mm256_load_si256((__m256i*)sp1);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 8);
    t = _mm256_or_si256(t, a);

    a = _mm256_load_si256((__m256i*)sp2);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 16);
    t = _mm256_or_si256(t, a);
    t = _mm256_shuffle_epi8(t, m0);

    // pixels 8..15
    a = _mm256_load_si256((__m256i*)sp0 + 1);
    a = _mm256_max_epi32(a, zero);
    u = _mm256_min_epi32(a, max_val_vec);

    a = _mm256_load_si256((__m256i*)sp1 + 1);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 8);
    u = _mm256_or_si256(u, a);

    a = _mm256_load_si256((__m256i*)sp2 + 1);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 16);
    u = _mm256_or_si256(u, a);
    u = _mm256_shuffle_epi8(u, m0);

    // pixels 16..23
    a = _mm256_load_si256((__m256i*)sp0 + 2);
    a = _mm256_max_epi32(a, zero);
    v = _mm256_min_epi32(a, max_val_vec);

    a = _mm256_load_si256((__m256i*)sp1 + 2);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 8);
    v = _mm256_or_si256(v, a);

    a = _mm256_load_si256((__m256i*)sp2 + 2);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 16);
    v = _mm256_or_si256(v, a);
    v = _mm256_shuffle_epi8(v, m0);

    // pixels 24..31
    a = _mm256_load_si256((__m256i*)sp0 + 3);
    a = _mm256_max_epi32(a, zero);
    w = _mm256_min_epi32(a, max_val_vec);

    a = _mm256_load_si256((__m256i*)sp1 + 3);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 8);
    w = _mm256_or_si256(w, a);

    a = _mm256_load_si256((__m256i*)sp2 + 3);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 16);
    w = _mm256_or_si256(w, a);
    w = _mm256_shuffle_epi8(w, m0);

    // each lane holds 12 payload bytes followed by 4 zeros, so stores
    // advance by 12; the 4 spilled zero bytes of each 16-byte store are
    // overwritten by the next one, and the final store's last 4 bytes
    // land just past the 96 payload bytes, so the destination buffer is
    // assumed to have a little slack after the last pixel
    _mm_storeu_si128((__m128i*)(p     ), _mm256_castsi256_si128(t));
    _mm_storeu_si128((__m128i*)(p + 12), _mm256_extracti128_si256(t, 1));
    _mm_storeu_si128((__m128i*)(p + 24), _mm256_castsi256_si128(u));
    _mm_storeu_si128((__m128i*)(p + 36), _mm256_extracti128_si256(u, 1));
    _mm_storeu_si128((__m128i*)(p + 48), _mm256_castsi256_si128(v));
    _mm_storeu_si128((__m128i*)(p + 60), _mm256_extracti128_si256(v, 1));
    _mm_storeu_si128((__m128i*)(p + 72), _mm256_castsi256_si128(w));
    _mm_storeu_si128((__m128i*)(p + 84), _mm256_extracti128_si256(w, 1));
  }

  // scalar tail, one interleaved pixel at a time
  int max_val = (1 << bit_depth) - 1;
  for ( ; count > 0; --count)
  {
    int val;
    val = *sp0++;
    val = val >= 0 ? val : 0;
    val = val <= max_val ? val : max_val;
    *p++ = (ui8)val;
    val = *sp1++;
    val = val >= 0 ? val : 0;
    val = val <= max_val ? val : max_val;
    *p++ = (ui8)val;
    val = *sp2++;
    val = val >= 0 ? val : 0;
    val = val <= max_val ? val : max_val;
    *p++ = (ui8)val;
  }
}
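// Illustration only (not part of the original file): each 32-bit element
// entering the shuffle above holds one pixel as bytes [c0, c1, c2, 0],
// so the m0 gather simply drops every fourth byte:
//
//   ui8 m0_bytes[16];
//   for (int k = 0; k < 12; ++k)   // 4 pixels x 3 bytes
//     m0_bytes[k] = (ui8)(4 * (k / 3) + (k % 3));
//   for (int k = 12; k < 16; ++k)
//     m0_bytes[k] = 0xFF;          // zero the 4 tail bytes of the lane
//
// which reproduces 0x0908060504020100 / 0xFFFFFFFF0E0D0C0A per lane.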
void avx2_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1,
                                  const line_buf *ln2, void *dp,
                                  int bit_depth, int count)
{
  (void)ln1; (void)ln2;  // unused: single-component conversion
  __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
  __m256i zero = _mm256_setzero_si256();
  // gathers the 16-bit samples of each lane in order, keeping the
  // native little-endian byte order
  __m256i mask = _mm256_set_epi64x(0x0F0E0B0A07060302, 0x0D0C090805040100,
                                   0x0F0E0B0A07060302, 0x0D0C090805040100);
  const si32 *sp = ln0->i32;
  ui16 *p = (ui16 *)dp;

  // 16 samples per iteration: two 8x32-bit loads become 16 output words
  for ( ; count >= 16; count -= 16, sp += 16, p += 16)
  {
    __m256i a, t;
    // clamp samples 0..7 to [0, max_val]
    a = _mm256_load_si256((__m256i*)sp);
    a = _mm256_max_epi32(a, zero);
    t = _mm256_min_epi32(a, max_val_vec);
    // clamp samples 8..15 and pack them into the high 16 bits of t
    a = _mm256_load_si256((__m256i*)sp + 1);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 16);
    t = _mm256_or_si256(t, a);
    // sort words within each lane, then fix up across lanes
    t = _mm256_shuffle_epi8(t, mask);
    t = _mm256_permute4x64_epi64(t, 0xD8);
    _mm256_storeu_si256((__m256i*)p, t);
  }

  // scalar tail for the remaining (count % 16) samples
  int max_val = (1 << bit_depth) - 1;
  for ( ; count > 0; --count)
  {
    int val = *sp++;
    val = val >= 0 ? val : 0;
    val = val <= max_val ? val : max_val;
    *p++ = (ui16)val;
  }
}
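// Illustration only (not part of the original file): the in-lane byte
// shuffle above leaves the 64-bit quarters of t holding samples
// [0..3 | 8..11 | 4..7 | 12..15]. The control byte 0xD8 == 0b11011000
// selects quarters in the order 0, 2, 1, 3, i.e. it swaps the two middle
// qwords, so the store writes samples 0..15 in memory order.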
void avx2_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1,
                                  const line_buf *ln2, void *dp,
                                  int bit_depth, int count)
{
  (void)ln1; (void)ln2;  // unused: single-component conversion
  __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
  __m256i zero = _mm256_setzero_si256();
  // same gather as the _le variant, but with the two bytes of every
  // 16-bit sample swapped to produce big-endian output
  __m256i mask = _mm256_set_epi64x(0x0E0F0A0B06070203, 0x0C0D080904050001,
                                   0x0E0F0A0B06070203, 0x0C0D080904050001);
  const si32 *sp = ln0->i32;
  ui16 *p = (ui16 *)dp;

  // 16 samples per iteration
  for ( ; count >= 16; count -= 16, sp += 16, p += 16)
  {
    __m256i a, t;
    a = _mm256_load_si256((__m256i*)sp);
    a = _mm256_max_epi32(a, zero);
    t = _mm256_min_epi32(a, max_val_vec);

    a = _mm256_load_si256((__m256i*)sp + 1);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 16);
    t = _mm256_or_si256(t, a);

    t = _mm256_shuffle_epi8(t, mask);
    t = _mm256_permute4x64_epi64(t, 0xD8);
    _mm256_storeu_si256((__m256i*)p, t);
  }

  // scalar tail; be2le swaps each clamped value into big-endian order
  int max_val = (1 << bit_depth) - 1;
  for ( ; count > 0; --count)
  {
    int val = *sp++;
    val = val >= 0 ? val : 0;
    val = val <= max_val ? val : max_val;
    *p++ = be2le((ui16)val);
  }
}
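// Illustration only (the call below is an assumption, not code from this
// file): converters of this shape are typically invoked once per decoded
// image line, along the lines of
//
//   avx2_cvrt_32b1c_to_16ub1c_le(line, NULL, NULL, dst, 12, width);
//
// Note that the aligned _mm256_load_si256 loads require each line_buf's
// sample pointer to be 32-byte aligned.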