#if defined(QT_COMPILER_SUPPORTS_AVX2)
inline static void Q_DECL_VECTORCALL
BYTE_MUL_AVX2(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half)
{
    __m256i pixelVectorAG = _mm256_srli_epi16(pixelVector, 8);
    __m256i pixelVectorRB = _mm256_and_si256(pixelVector, colorMask);

    pixelVectorAG = _mm256_mullo_epi16(pixelVectorAG, alphaChannel);
    pixelVectorRB = _mm256_mullo_epi16(pixelVectorRB, alphaChannel);

    pixelVectorRB = _mm256_add_epi16(pixelVectorRB, _mm256_srli_epi16(pixelVectorRB, 8));
    pixelVectorAG = _mm256_add_epi16(pixelVectorAG, _mm256_srli_epi16(pixelVectorAG, 8));
    pixelVectorRB = _mm256_add_epi16(pixelVectorRB, half);
    pixelVectorAG = _mm256_add_epi16(pixelVectorAG, half);

    pixelVectorRB = _mm256_srli_epi16(pixelVectorRB, 8);
    pixelVectorAG = _mm256_andnot_si256(colorMask, pixelVectorAG);

    pixelVector = _mm256_or_si256(pixelVectorAG, pixelVectorRB);
}
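
// [Editor's sketch, not part of the original file] The helper above vectorizes the
// classic 8-bit "multiply and divide by 255" trick: with t = x*a, x*a/255 is
// approximated by (t + (t >> 8) + 0x80) >> 8. A scalar reference, assuming the
// usual 0xAARRGGBB layout, mirroring the BYTE_MUL() helper from qdrawhelper_p.h:
static inline uint byte_mul_scalar_sketch(uint pixel, uint alpha)
{
    uint rb = (pixel & 0x00ff00ff) * alpha;          // red and blue lanes
    uint ag = ((pixel >> 8) & 0x00ff00ff) * alpha;   // alpha and green lanes
    rb = ((rb + ((rb >> 8) & 0x00ff00ff) + 0x00800080) >> 8) & 0x00ff00ff;
    ag =  (ag + ((ag >> 8) & 0x00ff00ff) + 0x00800080)       & 0xff00ff00;
    return ag | rb;
}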
inline static void Q_DECL_VECTORCALL
BYTE_MUL_RGB64_AVX2(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half)
{
    __m256i pixelVectorAG = _mm256_srli_epi32(pixelVector, 16);
    __m256i pixelVectorRB = _mm256_and_si256(pixelVector, colorMask);

    pixelVectorAG = _mm256_mullo_epi32(pixelVectorAG, alphaChannel);
    pixelVectorRB = _mm256_mullo_epi32(pixelVectorRB, alphaChannel);

    pixelVectorRB = _mm256_add_epi32(pixelVectorRB, _mm256_srli_epi32(pixelVectorRB, 16));
    pixelVectorAG = _mm256_add_epi32(pixelVectorAG, _mm256_srli_epi32(pixelVectorAG, 16));
    pixelVectorRB = _mm256_add_epi32(pixelVectorRB, half);
    pixelVectorAG = _mm256_add_epi32(pixelVectorAG, half);

    pixelVectorRB = _mm256_srli_epi32(pixelVectorRB, 16);
    pixelVectorAG = _mm256_andnot_si256(colorMask, pixelVectorAG);

    pixelVector = _mm256_or_si256(pixelVectorAG, pixelVectorRB);
}
inline static void Q_DECL_VECTORCALL
INTERPOLATE_PIXEL_255_AVX2(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel, __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half)
{
    const __m256i srcVectorAG = _mm256_srli_epi16(srcVector, 8);
    const __m256i dstVectorAG = _mm256_srli_epi16(dstVector, 8);
    const __m256i srcVectorRB = _mm256_and_si256(srcVector, colorMask);
    const __m256i dstVectorRB = _mm256_and_si256(dstVector, colorMask);
    const __m256i srcVectorAGalpha = _mm256_mullo_epi16(srcVectorAG, alphaChannel);
    const __m256i srcVectorRBalpha = _mm256_mullo_epi16(srcVectorRB, alphaChannel);
    const __m256i dstVectorAGoneMinusAlpha = _mm256_mullo_epi16(dstVectorAG, oneMinusAlphaChannel);
    const __m256i dstVectorRBoneMinusAlpha = _mm256_mullo_epi16(dstVectorRB, oneMinusAlphaChannel);
    __m256i finalAG = _mm256_add_epi16(srcVectorAGalpha, dstVectorAGoneMinusAlpha);
    __m256i finalRB = _mm256_add_epi16(srcVectorRBalpha, dstVectorRBoneMinusAlpha);
    finalAG = _mm256_add_epi16(finalAG, _mm256_srli_epi16(finalAG, 8));
    finalRB = _mm256_add_epi16(finalRB, _mm256_srli_epi16(finalRB, 8));
    finalAG = _mm256_add_epi16(finalAG, half);
    finalRB = _mm256_add_epi16(finalRB, half);
    finalAG = _mm256_andnot_si256(colorMask, finalAG);
    finalRB = _mm256_srli_epi16(finalRB, 8);

    dstVector = _mm256_or_si256(finalAG, finalRB);
}
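
// [Editor's sketch, not part of the original file] Scalar equivalent of the
// interpolation above, in the spirit of INTERPOLATE_PIXEL_255() from
// qdrawhelper_p.h: result = x*a/255 + y*b/255 per channel, with the same
// (t + (t >> 8) + 0x80) >> 8 rounding. Assumes a + b <= 255 (complementary
// weights), so the 16-bit lanes cannot overflow.
static inline uint interpolate_pixel_255_scalar_sketch(uint x, uint a, uint y, uint b)
{
    uint rb = (x & 0x00ff00ff) * a + (y & 0x00ff00ff) * b;
    uint ag = ((x >> 8) & 0x00ff00ff) * a + ((y >> 8) & 0x00ff00ff) * b;
    rb = ((rb + ((rb >> 8) & 0x00ff00ff) + 0x00800080) >> 8) & 0x00ff00ff;
    ag =  (ag + ((ag >> 8) & 0x00ff00ff) + 0x00800080)       & 0xff00ff00;
    return ag | rb;
}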
inline static void Q_DECL_VECTORCALL
INTERPOLATE_PIXEL_RGB64_AVX2(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel, __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half)
{
    const __m256i srcVectorAG = _mm256_srli_epi32(srcVector, 16);
    const __m256i dstVectorAG = _mm256_srli_epi32(dstVector, 16);
    const __m256i srcVectorRB = _mm256_and_si256(srcVector, colorMask);
    const __m256i dstVectorRB = _mm256_and_si256(dstVector, colorMask);
    const __m256i srcVectorAGalpha = _mm256_mullo_epi32(srcVectorAG, alphaChannel);
    const __m256i srcVectorRBalpha = _mm256_mullo_epi32(srcVectorRB, alphaChannel);
    const __m256i dstVectorAGoneMinusAlpha = _mm256_mullo_epi32(dstVectorAG, oneMinusAlphaChannel);
    const __m256i dstVectorRBoneMinusAlpha = _mm256_mullo_epi32(dstVectorRB, oneMinusAlphaChannel);
    __m256i finalAG = _mm256_add_epi32(srcVectorAGalpha, dstVectorAGoneMinusAlpha);
    __m256i finalRB = _mm256_add_epi32(srcVectorRBalpha, dstVectorRBoneMinusAlpha);
    finalAG = _mm256_add_epi32(finalAG, _mm256_srli_epi32(finalAG, 16));
    finalRB = _mm256_add_epi32(finalRB, _mm256_srli_epi32(finalRB, 16));
    finalAG = _mm256_add_epi32(finalAG, half);
    finalRB = _mm256_add_epi32(finalRB, half);
    finalAG = _mm256_andnot_si256(colorMask, finalAG);
    finalRB = _mm256_srli_epi32(finalRB, 16);

    dstVector = _mm256_or_si256(finalAG, finalRB);
}
// Source-over blend of ARGB32 premultiplied pixels, eight at a time.
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_AVX2(quint32 *dst, const quint32 *src, const int length)
{
    const __m256i half = _mm256_set1_epi16(0x80);
    const __m256i one = _mm256_set1_epi16(0xff);
    const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
    const __m256i alphaMask = _mm256_set1_epi32(0xff000000);
    const __m256i offsetMask = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    const __m256i alphaShuffleMask = _mm256_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3,
                                                     char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);

    const int minusOffsetToAlignDstOn32Bytes = (reinterpret_cast<quintptr>(dst) >> 2) & 0x7;

    int x = 0;

    // Prologue: one masked load/store handles the pixels before dst is 32-byte aligned.
    if (minusOffsetToAlignDstOn32Bytes != 0 && x < (length - 7)) {
        const __m256i prologueMask = _mm256_sub_epi32(_mm256_set1_epi32(minusOffsetToAlignDstOn32Bytes - 1), offsetMask);
        const __m256i srcVector = _mm256_maskload_epi32((const int *)&src[x - minusOffsetToAlignDstOn32Bytes], prologueMask);
        const __m256i prologueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, prologueMask);
        if (!_mm256_testz_si256(srcVector, prologueAlphaMask)) {
            if (_mm256_testc_si256(srcVector, prologueAlphaMask)) {
                _mm256_maskstore_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask, srcVector);
            } else {
                __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
                alphaChannel = _mm256_sub_epi16(one, alphaChannel);
                __m256i dstVector = _mm256_maskload_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask);
                BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
                dstVector = _mm256_add_epi8(dstVector, srcVector);
                _mm256_maskstore_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask, dstVector);
            }
        }
        x += (8 - minusOffsetToAlignDstOn32Bytes);
    }

    // Main loop: eight pixels per iteration, with fast paths for fully
    // transparent (testz) and fully opaque (testc) source blocks.
    for (; x < (length - 7); x += 8) {
        const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
        if (!_mm256_testz_si256(srcVector, alphaMask)) {
            if (_mm256_testc_si256(srcVector, alphaMask)) {
                _mm256_store_si256((__m256i *)&dst[x], srcVector);
            } else {
                __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
                alphaChannel = _mm256_sub_epi16(one, alphaChannel);
                __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
                BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
                dstVector = _mm256_add_epi8(dstVector, srcVector);
                _mm256_store_si256((__m256i *)&dst[x], dstVector);
            }
        }
    }

    // Epilogue: the remaining (length - x) pixels in one masked step.
    if (x < length) {
        const __m256i epilogueMask = _mm256_add_epi32(offsetMask, _mm256_set1_epi32(x - length));
        const __m256i srcVector = _mm256_maskload_epi32((const int *)&src[x], epilogueMask);
        const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask);
        if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) {
            if (_mm256_testc_si256(srcVector, epilogueAlphaMask)) {
                _mm256_maskstore_epi32((int *)&dst[x], epilogueMask, srcVector);
            } else {
                __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
                alphaChannel = _mm256_sub_epi16(one, alphaChannel);
                __m256i dstVector = _mm256_maskload_epi32((int *)&dst[x], epilogueMask);
                BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
                dstVector = _mm256_add_epi8(dstVector, srcVector);
                _mm256_maskstore_epi32((int *)&dst[x], epilogueMask, dstVector);
            }
        }
    }
}
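
// [Editor's sketch, not part of the original file] The per-pixel decision that the
// masked prologue, the aligned main loop and the masked epilogue above all make in
// vector form (mirroring blend_pixel() from qdrawhelper_p.h):
static inline void blend_pixel_scalar_sketch(quint32 &dst, quint32 src)
{
    if (src >= 0xff000000)          // fully opaque source: plain copy
        dst = src;
    else if (src != 0)              // translucent: dst = src + dst * (1 - alpha(src))
        dst = src + byte_mul_scalar_sketch(dst, 255 - (src >> 24));
}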
// Source-over with an extra constant alpha applied to the source first.
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(quint32 *dst, const quint32 *src, const int length, const int const_alpha)
{
    int x = 0;

    ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
        blend_pixel(dst[x], src[x], const_alpha);

    const __m256i half = _mm256_set1_epi16(0x80);
    const __m256i one = _mm256_set1_epi16(0xff);
    const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
    const __m256i alphaMask = _mm256_set1_epi32(0xff000000);
    const __m256i alphaShuffleMask = _mm256_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3,
                                                     char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
    const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha);
    for (; x < (length - 7); x += 8) {
        __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
        if (!_mm256_testz_si256(srcVector, alphaMask)) {
            BYTE_MUL_AVX2(srcVector, constAlphaVector, colorMask, half);

            __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
            alphaChannel = _mm256_sub_epi16(one, alphaChannel);
            __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
            BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
            dstVector = _mm256_add_epi8(dstVector, srcVector);
            _mm256_store_si256((__m256i *)&dst[x], dstVector);
        }
    }
    SIMD_EPILOGUE(x, length, 7)
        blend_pixel(dst[x], src[x], const_alpha);
}
void qt_blend_argb32_on_argb32_avx2(uchar *destPixels, int dbpl,
                                    const uchar *srcPixels, int sbpl,
                                    int w, int h, int const_alpha)
{
    if (const_alpha == 256) {
        for (int y = 0; y < h; ++y) {
            const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
            quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
            BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, w);
            destPixels += dbpl;
            srcPixels += sbpl;
        }
    } else if (const_alpha != 0) {
        const_alpha = (const_alpha * 255) >> 8;
        for (int y = 0; y < h; ++y) {
            const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
            quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, w, const_alpha);
            destPixels += dbpl;
            srcPixels += sbpl;
        }
    }
}
void qt_blend_rgb32_on_rgb32_avx2(uchar *destPixels, int dbpl,
                                  const uchar *srcPixels, int sbpl,
                                  int w, int h, int const_alpha)
{
    if (const_alpha == 256) {
        // Fully opaque: each scanline is a plain copy.
        for (int y = 0; y < h; ++y) {
            const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
            quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
            ::memcpy(dst, src, w * sizeof(uint));
            srcPixels += sbpl;
            destPixels += dbpl;
        }
        return;
    }
    if (const_alpha == 0)
        return;

    const __m256i half = _mm256_set1_epi16(0x80);
    const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);

    const_alpha = (const_alpha * 255) >> 8;
    int one_minus_const_alpha = 255 - const_alpha;
    const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha);
    const __m256i oneMinusConstAlpha = _mm256_set1_epi16(one_minus_const_alpha);
    for (int y = 0; y < h; ++y) {
        const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
        quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
        int x = 0;

        // 1) Align the destination on 32 bytes with scalar interpolation.
        ALIGNMENT_PROLOGUE_32BYTES(dst, x, w)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);

        // 2) Interpolate eight pixels per iteration with AVX2.
        for (; x < (w - 7); x += 8) {
            const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
            __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
            INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
            _mm256_store_si256((__m256i *)&dst[x], dstVector);
        }

        // 3) Epilogue: the last few pixels with scalar interpolation.
        SIMD_EPILOGUE(x, w, 7)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);

        srcPixels += sbpl;
        destPixels += dbpl;
    }
}
static void qt_memfillXX_avx2(uchar *dest, __m256i value256, qsizetype bytes)
{
    __m128i value128 = _mm256_castsi256_si128(value256);

    // Fill 128 bytes per iteration (4-way unrolled 32-byte stores).
    __m256i *dst256 = reinterpret_cast<__m256i *>(dest);
    uchar *end = dest + bytes;
    while (reinterpret_cast<uchar *>(dst256 + 4) <= end) {
        _mm256_storeu_si256(dst256 + 0, value256);
        _mm256_storeu_si256(dst256 + 1, value256);
        _mm256_storeu_si256(dst256 + 2, value256);
        _mm256_storeu_si256(dst256 + 3, value256);
        dst256 += 4;
    }

    // Store the remaining full 32-byte chunks.
    bytes = end - reinterpret_cast<uchar *>(dst256);
    switch (bytes / sizeof(value256)) {
    case 3: _mm256_storeu_si256(dst256++, value256); Q_FALLTHROUGH();
    case 2: _mm256_storeu_si256(dst256++, value256); Q_FALLTHROUGH();
    case 1: _mm256_storeu_si256(dst256++, value256);
    }

    // Then a possible 16-byte chunk ...
    __m128i *dst128 = reinterpret_cast<__m128i *>(dst256);
    if (bytes & sizeof(value128))
        _mm_storeu_si128(dst128++, value128);

    // ... and finally the last 8 bytes, written at (end - 8) so it may overlap
    // bytes already stored; the pattern repeats every 8 bytes, so that is harmless.
    _mm_storel_epi64(reinterpret_cast<__m128i *>(end - 8), value128);
}
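
// [Editor's note] A plain scalar equivalent of the routine above (sketch, assuming
// count > 0); the AVX2 version trades this loop for wide, possibly overlapping stores.
static void memfill64_scalar_sketch(quint64 *dest, quint64 value, qsizetype count)
{
    for (qsizetype i = 0; i < count; ++i)
        dest[i] = value;
}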
void qt_memfill64_avx2(quint64 *dest, quint64 value, qsizetype count)
{
#if defined(Q_CC_GNU) && !defined(Q_CC_CLANG)
    // Keep the value in an XMM register so the broadcast below does not bounce
    // through memory (works around a GCC missed optimization).
    __m128i value64 = _mm_set_epi64x(0, value);
# ifdef Q_PROCESSOR_X86_64
    asm ("" : "+x" (value64));
# endif
    __m256i value256 = _mm256_broadcastq_epi64(value64);
#else
    __m256i value256 = _mm256_set1_epi64x(value);
#endif

    qt_memfillXX_avx2(reinterpret_cast<uchar *>(dest), value256, count * sizeof(quint64));
}
    if (const_alpha == 255)
        BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, length);
    else
        BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, length, const_alpha);
#if QT_CONFIG(raster_64bit)
    // RGBA64 source-over: four 16-bit-per-channel pixels per 256-bit vector,
    // with fast paths for fully transparent and fully opaque source blocks.
    const __m256i half = _mm256_set1_epi32(0x8000);
    const __m256i one = _mm256_set1_epi32(0xffff);
    const __m256i colorMask = _mm256_set1_epi32(0x0000ffff);
    __m256i alphaMask = _mm256_set1_epi32(0xff000000);
    alphaMask = _mm256_unpacklo_epi8(alphaMask, alphaMask);
    const __m256i alphaShuffleMask = _mm256_set_epi8(char(0xff),char(0xff),15,14,char(0xff),char(0xff),15,14,char(0xff),char(0xff),7,6,char(0xff),char(0xff),7,6,
                                                     char(0xff),char(0xff),15,14,char(0xff),char(0xff),15,14,char(0xff),char(0xff),7,6,char(0xff),char(0xff),7,6);

    if (const_alpha == 255) {
        for (; x < length - 3; x += 4) {
            const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
            if (!_mm256_testz_si256(srcVector, alphaMask)) {
                // not all transparent
                if (_mm256_testc_si256(srcVector, alphaMask)) {
                    // all opaque: plain copy
                    _mm256_store_si256((__m256i *)&dst[x], srcVector);
                } else {
                    __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
                    alphaChannel = _mm256_sub_epi32(one, alphaChannel);
                    __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
                    BYTE_MUL_RGB64_AVX2(dstVector, alphaChannel, colorMask, half);
                    dstVector = _mm256_add_epi16(dstVector, srcVector);
                    _mm256_store_si256((__m256i *)&dst[x], dstVector);
                }
            }
        }
    } else {
        const __m256i constAlphaVector = _mm256_set1_epi32(const_alpha | (const_alpha << 8));
        for (; x < length - 3; x += 4) {
            __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
            if (!_mm256_testz_si256(srcVector, alphaMask)) {
                // not all transparent
                BYTE_MUL_RGB64_AVX2(srcVector, constAlphaVector, colorMask, half);

                __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
                alphaChannel = _mm256_sub_epi32(one, alphaChannel);
                __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
                BYTE_MUL_RGB64_AVX2(dstVector, alphaChannel, colorMask, half);
                dstVector = _mm256_add_epi16(dstVector, srcVector);
                _mm256_store_si256((__m256i *)&dst[x], dstVector);
            }
        }
    }
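
// [Editor's sketch, not part of the original file] One 16-bit channel of the RGBA64
// source-over above: 65535 plays the role of 255 and the rounding constant is 0x8000.
static inline quint16 srcover_channel16_sketch(quint16 srcChannel, quint16 srcAlpha, quint16 dstChannel)
{
    const quint32 t = quint32(dstChannel) * (65535 - srcAlpha);   // dst * (1 - alpha)
    return srcChannel + quint16((t + (t >> 16) + 0x8000) >> 16);  // + premultiplied src
}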
#if QT_CONFIG(raster_fp)
    // Source-over with constant alpha on four-float pixels: two pixels per
    // 256-bit iteration, then a single-pixel 128-bit tail.
    const float a = const_alpha / 255.0f;
    const __m128 one = _mm_set1_ps(1.0f);
    const __m128 constAlphaVector = _mm_set1_ps(a);
    const __m256 one256 = _mm256_set1_ps(1.0f);
    const __m256 constAlphaVector256 = _mm256_set1_ps(a);
    int x = 0;
    for (; x < length - 1; x += 2) {
        __m256 srcVector = _mm256_loadu_ps((const float *)&src[x]);
        __m256 dstVector = _mm256_loadu_ps((const float *)&dst[x]);
        srcVector = _mm256_mul_ps(srcVector, constAlphaVector256);
        __m256 alphaChannel = _mm256_permute_ps(srcVector, _MM_SHUFFLE(3, 3, 3, 3));
        alphaChannel = _mm256_sub_ps(one256, alphaChannel);
        dstVector = _mm256_mul_ps(dstVector, alphaChannel);
        dstVector = _mm256_add_ps(dstVector, srcVector);
        _mm256_storeu_ps((float *)(dst + x), dstVector);
    }
    if (x < length) {
        __m128 srcVector = _mm_load_ps((float *)(src + x));
        __m128 dstVector = _mm_load_ps((const float *)(dst + x));
        srcVector = _mm_mul_ps(srcVector, constAlphaVector);
        __m128 alphaChannel = _mm_permute_ps(srcVector, _MM_SHUFFLE(3, 3, 3, 3));
        alphaChannel = _mm_sub_ps(one, alphaChannel);
        dstVector = _mm_mul_ps(dstVector, alphaChannel);
        dstVector = _mm_add_ps(dstVector, srcVector);
        _mm_store_ps((float *)(dst + x), dstVector);
    }
    // Source (copy) with constant alpha on ARGB32: memcpy when fully opaque,
    // otherwise dst = src*ca + dst*(1-ca).
    if (const_alpha == 255) {
        ::memcpy(dst, src, length * sizeof(uint));
    } else {
        const int ialpha = 255 - const_alpha;

        const __m256i half = _mm256_set1_epi16(0x80);
        const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
        const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha);
        const __m256i oneMinusConstAlpha = _mm256_set1_epi16(ialpha);
        for (; x < length - 7; x += 8) {
            const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
            __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
            INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
            _mm256_store_si256((__m256i *)&dst[x], dstVector);
        }
    }
#if QT_CONFIG(raster_64bit)
    // Source (copy) with constant alpha on RGBA64 pixels.
    if (const_alpha == 255) {
        ::memcpy(dst, src, length * sizeof(QRgba64));
    } else {
        const uint ca = const_alpha | (const_alpha << 8); // expand to the 16-bit range
        const uint cia = 65535 - ca;

        const __m256i half = _mm256_set1_epi32(0x8000);
        const __m256i colorMask = _mm256_set1_epi32(0x0000ffff);
        const __m256i constAlphaVector = _mm256_set1_epi32(ca);
        const __m256i oneMinusConstAlpha = _mm256_set1_epi32(cia);
        for (; x < length - 3; x += 4) {
            const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
            __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
            INTERPOLATE_PIXEL_RGB64_AVX2(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
            _mm256_store_si256((__m256i *)&dst[x], dstVector);
        }
    }
#if QT_CONFIG(raster_fp)
    // Source (copy) with constant alpha on four-float pixels.
    if (const_alpha == 255) {
        ::memcpy(dst, src, length * sizeof(QRgbaFloat32));
    } else {
        const float ca = const_alpha / 255.f;
        const float cia = 1.0f - ca;

        const __m128 constAlphaVector = _mm_set1_ps(ca);
        const __m128 oneMinusConstAlpha = _mm_set1_ps(cia);
        const __m256 constAlphaVector256 = _mm256_set1_ps(ca);
        const __m256 oneMinusConstAlpha256 = _mm256_set1_ps(cia);
        int x = 0;
        for (; x < length - 1; x += 2) {
            __m256 srcVector = _mm256_loadu_ps((const float *)&src[x]);
            __m256 dstVector = _mm256_loadu_ps((const float *)&dst[x]);
            srcVector = _mm256_mul_ps(srcVector, constAlphaVector256);
            dstVector = _mm256_mul_ps(dstVector, oneMinusConstAlpha256);
            dstVector = _mm256_add_ps(dstVector, srcVector);
            _mm256_storeu_ps((float *)&dst[x], dstVector);
        }
        if (x < length) {
            __m128 srcVector = _mm_load_ps((const float *)&src[x]);
            __m128 dstVector = _mm_load_ps((const float *)&dst[x]);
            srcVector = _mm_mul_ps(srcVector, constAlphaVector);
            dstVector = _mm_mul_ps(dstVector, oneMinusConstAlpha);
            dstVector = _mm_add_ps(dstVector, srcVector);
            _mm_store_ps((float *)&dst[x], dstVector);
        }
    }
    // Solid source-over on ARGB32: dst = color + dst * (1 - alpha(color)).
    if (const_alpha != 255)
        color = BYTE_MUL(color, const_alpha);

    const quint32 minusAlphaOfColor = qAlpha(~color);

    const __m256i colorVector = _mm256_set1_epi32(color);
    const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
    const __m256i half = _mm256_set1_epi16(0x80);
    const __m256i minusAlphaOfColorVector = _mm256_set1_epi16(minusAlphaOfColor);

    for (; x < length - 7; x += 8) {
        __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
        BYTE_MUL_AVX2(dstVector, minusAlphaOfColorVector, colorMask, half);
        dstVector = _mm256_add_epi8(colorVector, dstVector);
        _mm256_store_si256((__m256i *)&dst[x], dstVector);
    }
#if QT_CONFIG(raster_64bit)
    // Solid source-over on RGBA64: dst = color + dst * (1 - alpha(color)).
    if (const_alpha == 255 && color.isOpaque()) {
        qt_memfill64((quint64 *)destPixels, color, length);
    } else {
        if (const_alpha != 255)
            color = multiplyAlpha255(color, const_alpha);

        const uint minusAlphaOfColor = 65535 - color.alpha();

        const __m256i colorVector = _mm256_set1_epi64x(color);
        const __m256i colorMask = _mm256_set1_epi32(0x0000ffff);
        const __m256i half = _mm256_set1_epi32(0x8000);
        const __m256i minusAlphaOfColorVector = _mm256_set1_epi32(minusAlphaOfColor);

        for (; x < length - 3; x += 4) {
            __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
            BYTE_MUL_RGB64_AVX2(dstVector, minusAlphaOfColorVector, colorMask, half);
            dstVector = _mm256_add_epi16(colorVector, dstVector);
            _mm256_store_si256((__m256i *)&dst[x], dstVector);
        }
    }
#if QT_CONFIG(raster_fp)
    // Solid Source on four-float pixels: dst = color*a + dst*(1-a).
    if (const_alpha == 255) {
        for (int i = 0; i < length; ++i)
            dst[i] = color;
    } else {
        const float a = const_alpha / 255.0f;
        const __m128 alphaVector = _mm_set1_ps(a);
        const __m128 minusAlphaVector = _mm_set1_ps(1.0f - a);
        __m128 colorVector = _mm_load_ps((const float *)&color);
        colorVector = _mm_mul_ps(colorVector, alphaVector);
        const __m256 colorVector256 = _mm256_insertf128_ps(_mm256_castps128_ps256(colorVector), colorVector, 1);
        const __m256 minusAlphaVector256 = _mm256_set1_ps(1.0f - a);
        int x = 0;
        for (; x < length - 1; x += 2) {
            __m256 dstVector = _mm256_loadu_ps((const float *)&dst[x]);
            dstVector = _mm256_mul_ps(dstVector, minusAlphaVector256);
            dstVector = _mm256_add_ps(dstVector, colorVector256);
            _mm256_storeu_ps((float *)&dst[x], dstVector);
        }
        if (x < length) {
            __m128 dstVector = _mm_load_ps((const float *)&dst[x]);
            dstVector = _mm_mul_ps(dstVector, minusAlphaVector);
            dstVector = _mm_add_ps(dstVector, colorVector);
            _mm_store_ps((float *)&dst[x], dstVector);
        }
    }
    // Solid source-over on four-float pixels: dst = color + dst * (1 - alpha(color)).
    if (const_alpha == 255 && color.a >= 1.0f) {
        for (int i = 0; i < length; ++i)
            dst[i] = color;
    } else {
        __m128 colorVector = _mm_load_ps((const float *)&color);
        if (const_alpha != 255)
            colorVector = _mm_mul_ps(colorVector, _mm_set1_ps(const_alpha / 255.f));
        __m128 minusAlphaOfColorVector =
                _mm_sub_ps(_mm_set1_ps(1.0f), _mm_permute_ps(colorVector, _MM_SHUFFLE(3, 3, 3, 3)));
        const __m256 colorVector256 = _mm256_insertf128_ps(_mm256_castps128_ps256(colorVector), colorVector, 1);
        const __m256 minusAlphaVector256 = _mm256_insertf128_ps(_mm256_castps128_ps256(minusAlphaOfColorVector),
                                                                minusAlphaOfColorVector, 1);
        int x = 0;
        for (; x < length - 1; x += 2) {
            __m256 dstVector = _mm256_loadu_ps((const float *)&dst[x]);
            dstVector = _mm256_mul_ps(dstVector, minusAlphaVector256);
            dstVector = _mm256_add_ps(dstVector, colorVector256);
            _mm256_storeu_ps((float *)&dst[x], dstVector);
        }
        if (x < length) {
            __m128 dstVector = _mm_load_ps((const float *)&dst[x]);
            dstVector = _mm_mul_ps(dstVector, minusAlphaOfColorVector);
            dstVector = _mm_add_ps(dstVector, colorVector);
            _mm_store_ps((float *)&dst[x], dstVector);
        }
    }
#define interpolate_4_pixels_16_avx2(tlr1, tlr2, blr1, blr2, distx, disty, colorMask, v_256, b)  \
{ \
    /* permute so that the 32-bit unpacks below pair the right pixels */ \
    const __m256i vdistx = _mm256_permute4x64_epi64(distx, _MM_SHUFFLE(3, 1, 2, 0)); \
    const __m256i vdisty = _mm256_permute4x64_epi64(disty, _MM_SHUFFLE(3, 1, 2, 0)); \
    \
    /* bilinear weights: (16-dx)(16-dy), dx(16-dy), (16-dx)dy, dx*dy */ \
    __m256i dxdy = _mm256_mullo_epi16 (vdistx, vdisty); \
    const __m256i distx_ = _mm256_slli_epi16(vdistx, 4); \
    const __m256i disty_ = _mm256_slli_epi16(vdisty, 4); \
    __m256i idxidy = _mm256_add_epi16(dxdy, _mm256_sub_epi16(v_256, _mm256_add_epi16(distx_, disty_))); \
    __m256i dxidy = _mm256_sub_epi16(distx_, dxdy); \
    __m256i idxdy = _mm256_sub_epi16(disty_, dxdy); \
    \
    __m256i tlr1AG = _mm256_srli_epi16(tlr1, 8); \
    __m256i tlr1RB = _mm256_and_si256(tlr1, colorMask); \
    __m256i tlr2AG = _mm256_srli_epi16(tlr2, 8); \
    __m256i tlr2RB = _mm256_and_si256(tlr2, colorMask); \
    __m256i blr1AG = _mm256_srli_epi16(blr1, 8); \
    __m256i blr1RB = _mm256_and_si256(blr1, colorMask); \
    __m256i blr2AG = _mm256_srli_epi16(blr2, 8); \
    __m256i blr2RB = _mm256_and_si256(blr2, colorMask); \
    \
    __m256i odxidy1 = _mm256_unpacklo_epi32(idxidy, dxidy); \
    __m256i odxidy2 = _mm256_unpackhi_epi32(idxidy, dxidy); \
    tlr1AG = _mm256_mullo_epi16(tlr1AG, odxidy1); \
    tlr1RB = _mm256_mullo_epi16(tlr1RB, odxidy1); \
    tlr2AG = _mm256_mullo_epi16(tlr2AG, odxidy2); \
    tlr2RB = _mm256_mullo_epi16(tlr2RB, odxidy2); \
    __m256i odxdy1 = _mm256_unpacklo_epi32(idxdy, dxdy); \
    __m256i odxdy2 = _mm256_unpackhi_epi32(idxdy, dxdy); \
    blr1AG = _mm256_mullo_epi16(blr1AG, odxdy1); \
    blr1RB = _mm256_mullo_epi16(blr1RB, odxdy1); \
    blr2AG = _mm256_mullo_epi16(blr2AG, odxdy2); \
    blr2RB = _mm256_mullo_epi16(blr2RB, odxdy2); \
    \
    /* add the weighted terms and keep the 8 significant bits per channel */ \
    __m256i topAG = _mm256_hadd_epi32(tlr1AG, tlr2AG); \
    __m256i topRB = _mm256_hadd_epi32(tlr1RB, tlr2RB); \
    __m256i botAG = _mm256_hadd_epi32(blr1AG, blr2AG); \
    __m256i botRB = _mm256_hadd_epi32(blr1RB, blr2RB); \
    __m256i rAG = _mm256_add_epi16(topAG, botAG); \
    __m256i rRB = _mm256_add_epi16(topRB, botRB); \
    rRB = _mm256_srli_epi16(rRB, 8); \
    /* correct for the lane order of hadd */ \
    rAG = _mm256_permute4x64_epi64(rAG, _MM_SHUFFLE(3, 1, 2, 0)); \
    rRB = _mm256_permute4x64_epi64(rRB, _MM_SHUFFLE(3, 1, 2, 0)); \
    _mm256_storeu_si256((__m256i*)(b), _mm256_blendv_epi8(rAG, rRB, colorMask)); \
}
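
// [Editor's sketch, not part of the original file] What the macro above computes for
// a single pixel: a bilinear blend with 4-bit fractional weights (0..16), so the four
// weights sum to 256 and the final shift by 8 renormalizes each channel.
static inline uint interpolate_4_pixels_16_scalar_sketch(uint tl, uint tr, uint bl, uint br,
                                                         uint distx, uint disty)
{
    const uint idistx = 16 - distx, idisty = 16 - disty;
    const uint wtl = idistx * idisty, wtr = distx * idisty;
    const uint wbl = idistx * disty,  wbr = distx * disty;
    const uint rb = (((tl & 0xff00ff) * wtl + (tr & 0xff00ff) * wtr
                    + (bl & 0xff00ff) * wbl + (br & 0xff00ff) * wbr) >> 8) & 0xff00ff;
    const uint ag = ((((tl >> 8) & 0xff00ff) * wtl + ((tr >> 8) & 0xff00ff) * wtr
                    + ((bl >> 8) & 0xff00ff) * wbl + ((br >> 8) & 0xff00ff) * wbr) >> 8) & 0xff00ff;
    return (ag << 8) | rb;
}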
                                        int &fx, int &fy, int fdx, int)
{
    const int disty = (fy & 0x0000ffff) >> 8;
    const int idisty = 256 - disty;

    const int adjust = (fdx < 0) ? fdx * length : 0;
    const int offset = (fx + adjust) >> 16;

        // Scalar prologue: blend the top (t) and bottom (b) source rows into the
        // intermediate buffer one pixel at a time.
        quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
        quint32 ag = ((((t >> 8) & 0xff00ff) * idisty + ((b >> 8) & 0xff00ff) * disty) >> 8) & 0xff00ff;

    } while (x < image.x1 && f < lim);

    const __m256i disty_ = _mm256_set1_epi16(disty);
    const __m256i idisty_ = _mm256_set1_epi16(idisty);
    const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);

    // Vectorized vertical interpolation: eight pixels of the top and bottom rows
    // per iteration, split into AG and RB halves.
    for (; f < lim; x += 8, f += 8) {
        __m256i top = _mm256_loadu_si256((const __m256i*)((const uint *)(s1) + x));
        __m256i topAG = _mm256_srli_epi16(top, 8);
        __m256i topRB = _mm256_and_si256(top, colorMask);
        topAG = _mm256_mullo_epi16(topAG, idisty_);
        topRB = _mm256_mullo_epi16(topRB, idisty_);

        __m256i bottom = _mm256_loadu_si256((const __m256i*)((const uint *)(s2) + x));
        __m256i bottomAG = _mm256_srli_epi16(bottom, 8);
        __m256i bottomRB = _mm256_and_si256(bottom, colorMask);
        bottomAG = _mm256_mullo_epi16(bottomAG, disty_);
        bottomRB = _mm256_mullo_epi16(bottomRB, disty_);

        __m256i rAG = _mm256_add_epi16(topAG, bottomAG);
        rAG = _mm256_srli_epi16(rAG, 8);
        _mm256_storeu_si256((__m256i*)(&intermediate.buffer_ag[f]), rAG);
        __m256i rRB = _mm256_add_epi16(topRB, bottomRB);
        rRB = _mm256_srli_epi16(rRB, 8);
        _mm256_storeu_si256((__m256i*)(&intermediate.buffer_rb[f]), rRB);
    }

        // Scalar epilogue for the last few columns.
        intermediate.buffer_rb[f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
        intermediate.buffer_ag[f] = ((((t >> 8) & 0xff00ff) * idisty + ((b >> 8) & 0xff00ff) * disty) >> 8) & 0xff00ff;

    // Now interpolate horizontally from the intermediate buffer into the output.
    intermediate_adder_avx2(b, end, intermediate, offset, fx, fdx);
    // intermediate_adder_avx2: horizontal pass over the prescaled intermediate
    // buffer, walking fx in 16.16 fixed point, four destination pixels per iteration.
    fx -= offset * FixedScale;

    const __m128i v_fdx = _mm_set1_epi32(fdx * 4);
    const __m128i v_blend = _mm_set1_epi32(0x00800080);
    const __m128i vdx_shuffle = _mm_set_epi8(char(0x80), 13, char(0x80), 13, char(0x80), 9, char(0x80), 9,
                                             char(0x80), 5, char(0x80), 5, char(0x80), 1, char(0x80), 1);
    __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);

    while (b < end - 3) {
        const __m128i offset = _mm_srli_epi32(v_fx, 16);
        __m256i vrb = _mm256_i32gather_epi64((const long long *)intermediate.buffer_rb, offset, 4);
        __m256i vag = _mm256_i32gather_epi64((const long long *)intermediate.buffer_ag, offset, 4);

        __m128i vdx = _mm_shuffle_epi8(v_fx, vdx_shuffle);
        __m128i vidx = _mm_sub_epi16(_mm_set1_epi16(256), vdx);
        __m256i vmulx = _mm256_castsi128_si256(_mm_unpacklo_epi32(vidx, vdx));
        vmulx = _mm256_inserti128_si256(vmulx, _mm_unpackhi_epi32(vidx, vdx), 1);

        vrb = _mm256_mullo_epi16(vrb, vmulx);
        vag = _mm256_mullo_epi16(vag, vmulx);

        __m256i vrbag = _mm256_hadd_epi32(vrb, vag);
        vrbag = _mm256_permute4x64_epi64(vrbag, _MM_SHUFFLE(3, 1, 2, 0));

        __m128i rb = _mm256_castsi256_si128(vrbag);
        __m128i ag = _mm256_extracti128_si256(vrbag, 1);
        rb = _mm_srli_epi16(rb, 8);

        _mm_storeu_si128((__m128i*)b, _mm_blendv_epi8(ag, rb, v_blend));

        b += 4;
        v_fx = _mm_add_epi32(v_fx, v_fdx);
    }
    fx = _mm_cvtsi128_si32(v_fx);

    // Scalar tail: one pixel at a time from the intermediate buffer.
    while (b < end) {
        const int x = (fx >> 16);

        const uint distx = (fx & 0x0000ffff) >> 8;
        const uint idistx = 256 - distx;
        const uint rb = (intermediate.buffer_rb[x] * idistx + intermediate.buffer_rb[x + 1] * distx) & 0xff00ff00;
        const uint ag = (intermediate.buffer_ag[x] * idistx + intermediate.buffer_ag[x + 1] * distx) & 0xff00ff00;
        *b = (rb >> 8) | ag;
        b++;
        fx += fdx;
    }
    fx += offset * FixedScale;
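
// [Editor's note] Texture coordinates in the helpers above and below are 16.16 fixed
// point. A sketch of the per-pixel bookkeeping the SIMD loops replicate four or
// eight lanes at a time (names here are illustrative only):
static inline void advance_fixed_point_sketch(int &fx, int fdx, int &texelX, uint &distx8)
{
    texelX = fx >> 16;            // integer texel index
    distx8 = (fx & 0xffff) >> 8;  // top 8 bits of the fraction, used as the blend weight
    fx += fdx;                    // step to the next destination pixel
}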
                                        int &fx, int &fy, int fdx, int)
{
    const int disty8 = (fy & 0x0000ffff) >> 8;
    const int disty4 = (disty8 + 0x08) >> 4;

    // boundedEnd is clamped so fx stays inside [min_fx, max_fx] (depending on the sign of fdx).
    boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx);

    boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx);

    const __m256i vdistShuffle =
        _mm256_setr_epi8(0, char(0x80), 0, char(0x80), 4, char(0x80), 4, char(0x80), 8, char(0x80), 8, char(0x80), 12, char(0x80), 12, char(0x80),
                         0, char(0x80), 0, char(0x80), 4, char(0x80), 4, char(0x80), 8, char(0x80), 8, char(0x80), 12, char(0x80), 12, char(0x80));
    const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
    const __m256i v_256 = _mm256_set1_epi16(256);
    const __m256i v_disty = _mm256_set1_epi16(disty4);
    const __m256i v_fdx = _mm256_set1_epi32(fdx * 8);
    const __m256i v_fx_r = _mm256_set1_epi32(0x08);
    const __m256i v_index = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i v_fx = _mm256_set1_epi32(fx);
    v_fx = _mm256_add_epi32(v_fx, _mm256_mullo_epi32(_mm256_set1_epi32(fdx), v_index));

    while (b < boundedEnd - 7) {
        const __m256i offset = _mm256_srli_epi32(v_fx, 16);
        const __m128i offsetLo = _mm256_castsi256_si128(offset);
        const __m128i offsetHi = _mm256_extracti128_si256(offset, 1);
        const __m256i toplo = _mm256_i32gather_epi64((const long long *)s1, offsetLo, 4);
        const __m256i tophi = _mm256_i32gather_epi64((const long long *)s1, offsetHi, 4);
        const __m256i botlo = _mm256_i32gather_epi64((const long long *)s2, offsetLo, 4);
        const __m256i bothi = _mm256_i32gather_epi64((const long long *)s2, offsetHi, 4);

        __m256i v_distx = _mm256_srli_epi16(v_fx, 8);
        v_distx = _mm256_srli_epi16(_mm256_add_epi32(v_distx, v_fx_r), 4);
        v_distx = _mm256_shuffle_epi8(v_distx, vdistShuffle);

        interpolate_4_pixels_16_avx2(toplo, tophi, botlo, bothi, v_distx, v_disty, colorMask, v_256, b);
        b += 8;
        v_fx = _mm256_add_epi32(v_fx, v_fdx);
    }
    fx = _mm_extract_epi32(_mm256_castsi256_si128(v_fx), 0);

    // Scalar tail while still inside the bounded fast path ...
    while (b < boundedEnd) {
        int distx8 = (fx & 0x0000ffff) >> 8;

    // ... and the generic per-pixel fallback near the texture edges.
        int distx8 = (fx & 0x0000ffff) >> 8;
                                        int &fx, int &fy, int fdx, int fdy)
{
    int distx = (fx & 0x0000ffff) >> 8;
    int disty = (fy & 0x0000ffff) >> 8;

    // boundedEnd is clamped so fx and fy stay inside their valid ranges
    // (depending on the signs of fdx and fdy).
    boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx);

    boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx);

    boundedEnd = qMin(boundedEnd, b + (max_fy - fy) / fdy);

    boundedEnd = qMin(boundedEnd, b + (min_fy - fy) / fdy);

    const __m256i vdistShuffle =
        _mm256_setr_epi8(0, char(0x80), 0, char(0x80), 4, char(0x80), 4, char(0x80), 8, char(0x80), 8, char(0x80), 12, char(0x80), 12, char(0x80),
                         0, char(0x80), 0, char(0x80), 4, char(0x80), 4, char(0x80), 8, char(0x80), 8, char(0x80), 12, char(0x80), 12, char(0x80));
    const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
    const __m256i v_256 = _mm256_set1_epi16(256);
    const __m256i v_fdx = _mm256_set1_epi32(fdx * 8);
    const __m256i v_fdy = _mm256_set1_epi32(fdy * 8);
    const __m256i v_fxy_r = _mm256_set1_epi32(0x08);
    const __m256i v_index = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i v_fx = _mm256_set1_epi32(fx);
    __m256i v_fy = _mm256_set1_epi32(fy);
    v_fx = _mm256_add_epi32(v_fx, _mm256_mullo_epi32(_mm256_set1_epi32(fdx), v_index));
    v_fy = _mm256_add_epi32(v_fy, _mm256_mullo_epi32(_mm256_set1_epi32(fdy), v_index));

    const __m256i vbpl = _mm256_set1_epi16(bytesPerLine / 4);

    while (b < boundedEnd - 7) {
        const __m256i vy = _mm256_packs_epi32(_mm256_srli_epi32(v_fy, 16), _mm256_setzero_si256());
        // 32-bit offset = y * (bytesPerLine / 4) + x, built from 16-bit multiplies.
        __m256i offset = _mm256_unpacklo_epi16(_mm256_mullo_epi16(vy, vbpl), _mm256_mulhi_epi16(vy, vbpl));
        offset = _mm256_add_epi32(offset, _mm256_srli_epi32(v_fx, 16));
        const __m128i offsetLo = _mm256_castsi256_si128(offset);
        const __m128i offsetHi = _mm256_extracti128_si256(offset, 1);
        const uint *topData = (const uint *)(textureData);
        const uint *botData = (const uint *)(textureData + bytesPerLine);
        const __m256i toplo = _mm256_i32gather_epi64((const long long *)topData, offsetLo, 4);
        const __m256i tophi = _mm256_i32gather_epi64((const long long *)topData, offsetHi, 4);
        const __m256i botlo = _mm256_i32gather_epi64((const long long *)botData, offsetLo, 4);
        const __m256i bothi = _mm256_i32gather_epi64((const long long *)botData, offsetHi, 4);

        __m256i v_distx = _mm256_srli_epi16(v_fx, 8);
        __m256i v_disty = _mm256_srli_epi16(v_fy, 8);
        v_distx = _mm256_srli_epi16(_mm256_add_epi32(v_distx, v_fxy_r), 4);
        v_disty = _mm256_srli_epi16(_mm256_add_epi32(v_disty, v_fxy_r), 4);
        v_distx = _mm256_shuffle_epi8(v_distx, vdistShuffle);
        v_disty = _mm256_shuffle_epi8(v_disty, vdistShuffle);

        interpolate_4_pixels_16_avx2(toplo, tophi, botlo, bothi, v_distx, v_disty, colorMask, v_256, b);
        b += 8;
        v_fx = _mm256_add_epi32(v_fx, v_fdx);
        v_fy = _mm256_add_epi32(v_fy, v_fdy);
    }
    fx = _mm_extract_epi32(_mm256_castsi256_si128(v_fx), 0);
    fy = _mm_extract_epi32(_mm256_castsi256_si128(v_fy), 0);
    // Scalar tail while still inside the bounded fast path ...
    while (b < boundedEnd) {

        int distx = (fx & 0x0000ffff) >> 8;
        int disty = (fy & 0x0000ffff) >> 8;

    // ... and the generic per-pixel fallback that clamps against the texture edges.
        int x1 = (fx >> 16);

        int y1 = (fy >> 16);

        int distx = (fx & 0x0000ffff) >> 8;
        int disty = (fy & 0x0000ffff) >> 8;
static inline __m256i epilogueMaskFromCount(qsizetype count)
{
    static const __m256i offsetMask = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    return _mm256_add_epi32(offsetMask, _mm256_set1_epi32(-count));
}
    const __m256i alphaMask = _mm256_set1_epi32(0xff000000);
    const __m256i rgbaMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15));
    const __m256i shuffleMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15));
    const __m256i half = _mm256_set1_epi16(0x0080);
    const __m256i zero = _mm256_setzero_si256();

    for (; i < count - 7; i += 8) {
        __m256i srcVector = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
        if (!_mm256_testz_si256(srcVector, alphaMask)) {
            // Not fully transparent: check whether every pixel is fully opaque.
            bool cf = _mm256_testc_si256(srcVector, alphaMask);

            srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
            if (!cf) {
                __m256i src1 = _mm256_unpacklo_epi8(srcVector, zero);
                __m256i src2 = _mm256_unpackhi_epi8(srcVector, zero);
                __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
                __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask);
                src1 = _mm256_mullo_epi16(src1, alpha1);
                src2 = _mm256_mullo_epi16(src2, alpha2);
                src1 = _mm256_add_epi16(src1, _mm256_srli_epi16(src1, 8));
                src2 = _mm256_add_epi16(src2, _mm256_srli_epi16(src2, 8));
                src1 = _mm256_add_epi16(src1, half);
                src2 = _mm256_add_epi16(src2, half);
                src1 = _mm256_srli_epi16(src1, 8);
                src2 = _mm256_srli_epi16(src2, 8);
                src1 = _mm256_blend_epi16(src1, alpha1, 0x88);
                src2 = _mm256_blend_epi16(src2, alpha2, 0x88);
                srcVector = _mm256_packus_epi16(src1, src2);
                _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), srcVector);
            } else {
                // Every pixel opaque: store the (possibly swizzled) source as-is.
                _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), srcVector);
            }
        } else {
            // Fully transparent block.
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), zero);
        }
    }

    // Masked epilogue for the last (count - i) pixels.
    if (i < count) {
        const __m256i epilogueMask = epilogueMaskFromCount(count - i);
        __m256i srcVector = _mm256_maskload_epi32(reinterpret_cast<const int *>(src + i), epilogueMask);
        const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask);

        if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) {
            bool cf = _mm256_testc_si256(srcVector, epilogueAlphaMask);

            srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
            if (!cf) {
                __m256i src1 = _mm256_unpacklo_epi8(srcVector, zero);
                __m256i src2 = _mm256_unpackhi_epi8(srcVector, zero);
                __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
                __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask);
                src1 = _mm256_mullo_epi16(src1, alpha1);
                src2 = _mm256_mullo_epi16(src2, alpha2);
                src1 = _mm256_add_epi16(src1, _mm256_srli_epi16(src1, 8));
                src2 = _mm256_add_epi16(src2, _mm256_srli_epi16(src2, 8));
                src1 = _mm256_add_epi16(src1, half);
                src2 = _mm256_add_epi16(src2, half);
                src1 = _mm256_srli_epi16(src1, 8);
                src2 = _mm256_srli_epi16(src2, 8);
                src1 = _mm256_blend_epi16(src1, alpha1, 0x88);
                src2 = _mm256_blend_epi16(src2, alpha2, 0x88);
                srcVector = _mm256_packus_epi16(src1, src2);
                _mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, srcVector);
            } else {
                _mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, srcVector);
            }
        } else {
            _mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, zero);
        }
    }
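
// [Editor's sketch, not part of the original file] The per-pixel job of the
// conversion above: reorder RGBA to ARGB (the rgbaMask shuffle) and premultiply the
// colour channels by alpha with the usual (x + (x >> 8) + 0x80) >> 8 rounding;
// compare qPremultiply() in QtGui.
static inline uint premultiply_argb_scalar_sketch(uint argb)
{
    const uint a = argb >> 24;
    if (a == 255)
        return argb;
    if (a == 0)
        return 0;
    uint rb = (argb & 0x00ff00ff) * a;
    uint g  = ((argb >> 8) & 0xff) * a;
    rb = ((rb + ((rb >> 8) & 0x00ff00ff) + 0x00800080) >> 8) & 0x00ff00ff;
    g  = ((g + (g >> 8) + 0x80) >> 8) & 0xff;
    return (a << 24) | (g << 8) | rb;
}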
    const __m256i alphaMask = _mm256_set1_epi32(0xff000000);
    const __m256i rgbaMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15));
    const __m256i shuffleMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15));
    const __m256i zero = _mm256_setzero_si256();

    for (; i < count - 7; i += 8) {
        __m256i dst1, dst2;
        __m256i srcVector = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
        if (!_mm256_testz_si256(srcVector, alphaMask)) {
            bool cf = _mm256_testc_si256(srcVector, alphaMask);

            srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
            // Permute so that the 8-bit unpacks below widen the pixels in order.
            srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(3, 1, 2, 0));

            const __m256i src1 = _mm256_unpacklo_epi8(srcVector, srcVector);
            const __m256i src2 = _mm256_unpackhi_epi8(srcVector, srcVector);
            if (!cf) {
                const __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
                const __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask);
                dst1 = _mm256_mulhi_epu16(src1, alpha1);
                dst2 = _mm256_mulhi_epu16(src2, alpha2);
                dst1 = _mm256_add_epi16(dst1, _mm256_srli_epi16(dst1, 15));
                dst2 = _mm256_add_epi16(dst2, _mm256_srli_epi16(dst2, 15));
                dst1 = _mm256_blend_epi16(dst1, src1, 0x88);
                dst2 = _mm256_blend_epi16(dst2, src2, 0x88);
            } else {
                dst1 = src1;
                dst2 = src2;
            }
        } else {
            dst1 = dst2 = zero;
        }
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), dst1);
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i) + 1, dst2);
    }

    // Masked epilogue for the last (count - i) pixels.
    if (i < count) {
        __m256i dst1, dst2;
        __m256i epilogueMask = epilogueMaskFromCount(count - i);
        const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask);

        __m256i srcVector = _mm256_maskload_epi32(reinterpret_cast<const int *>(src + i), epilogueMask);

        if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) {
            bool cf = _mm256_testc_si256(srcVector, epilogueAlphaMask);

            srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
            srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(3, 1, 2, 0));
            const __m256i src1 = _mm256_unpacklo_epi8(srcVector, srcVector);
            const __m256i src2 = _mm256_unpackhi_epi8(srcVector, srcVector);
            if (!cf) {
                const __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
                const __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask);
                dst1 = _mm256_mulhi_epu16(src1, alpha1);
                dst2 = _mm256_mulhi_epu16(src2, alpha2);
                dst1 = _mm256_add_epi16(dst1, _mm256_srli_epi16(dst1, 15));
                dst2 = _mm256_add_epi16(dst2, _mm256_srli_epi16(dst2, 15));
                dst1 = _mm256_blend_epi16(dst1, src1, 0x88);
                dst2 = _mm256_blend_epi16(dst2, src2, 0x88);
            } else {
                dst1 = src1;
                dst2 = src2;
            }
        } else {
            dst1 = dst2 = zero;
        }
        // The 64-bit destination pixels need 64-bit masks, expanded from the 32-bit one.
        epilogueMask = _mm256_permute4x64_epi64(epilogueMask, _MM_SHUFFLE(3, 1, 2, 0));
        _mm256_maskstore_epi64(reinterpret_cast<qint64 *>(buffer + i),
                               _mm256_unpacklo_epi32(epilogueMask, epilogueMask), dst1);
        _mm256_maskstore_epi64(reinterpret_cast<qint64 *>(buffer + i + 4),
                               _mm256_unpackhi_epi32(epilogueMask, epilogueMask), dst2);
    }
    // Premultiply RGBA64 pixels, four per 256-bit iteration, with 0x8000 rounding.
    const __m256i vh = _mm256_set1_epi32(0x8000);
    for (; i < count - 3; i += 4) {
        __m256i vs256 = _mm256_loadu_si256((const __m256i *)(s + i));
        __m256i va256 = _mm256_shufflelo_epi16(vs256, _MM_SHUFFLE(3, 3, 3, 3));
        va256 = _mm256_shufflehi_epi16(va256, _MM_SHUFFLE(3, 3, 3, 3));
        const __m256i vmullo = _mm256_mullo_epi16(vs256, va256);
        const __m256i vmulhi = _mm256_mulhi_epu16(vs256, va256);
        __m256i vslo = _mm256_unpacklo_epi16(vmullo, vmulhi);
        __m256i vshi = _mm256_unpackhi_epi16(vmullo, vmulhi);
        vslo = _mm256_add_epi32(vslo, _mm256_srli_epi32(vslo, 16));
        vshi = _mm256_add_epi32(vshi, _mm256_srli_epi32(vshi, 16));
        vslo = _mm256_add_epi32(vslo, vh);
        vshi = _mm256_add_epi32(vshi, vh);
        vslo = _mm256_srli_epi32(vslo, 16);
        vshi = _mm256_srli_epi32(vshi, 16);
        vs256 = _mm256_packus_epi32(vslo, vshi);
        _mm256_storeu_si256((__m256i *)(buffer + i), vs256);
    }

    // Scalar (SSE) tail, one pixel at a time.
        __m128i vs = _mm_loadl_epi64((const __m128i *)(s + i));
        __m128i va = _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3));

        _mm_storel_epi64((__m128i *)(buffer + i), vs);
    // FP16 (RGBA16F) pixels scaled by 255, rounded, and packed down to 8-bit
    // ARGB32 order; two pixels per 256-bit iteration.
    const __m256 vf = _mm256_set1_ps(255.0f);
    const __m256 vh = _mm256_set1_ps(0.5f);

    for (; i + 1 < count; i += 2) {
        __m256 vsf = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(s + i)));
        vsf = _mm256_mul_ps(vsf, vf);
        vsf = _mm256_add_ps(vsf, vh);
        __m256i vsi = _mm256_cvttps_epi32(vsf);
        vsi = _mm256_packs_epi32(vsi, vsi);
        vsi = _mm256_shufflelo_epi16(vsi, _MM_SHUFFLE(3, 0, 1, 2));
        vsi = _mm256_permute4x64_epi64(vsi, _MM_SHUFFLE(3, 1, 2, 0));
        __m128i vsi128 = _mm256_castsi256_si128(vsi);
        vsi128 = _mm_packus_epi16(vsi128, vsi128);
        _mm_storel_epi64((__m128i *)(buffer + i), vsi128);
    }

    // One-pixel tail.
        __m128 vsf = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(s + i)));
        vsf = _mm_mul_ps(vsf, _mm_set1_ps(255.0f));
        vsf = _mm_add_ps(vsf, _mm_set1_ps(0.5f));
        __m128i vsi = _mm_cvttps_epi32(vsf);
        vsi = _mm_packs_epi32(vsi, vsi);
        vsi = _mm_shufflelo_epi16(vsi, _MM_SHUFFLE(3, 0, 1, 2));
        vsi = _mm_packus_epi16(vsi, vsi);
        buffer[i] = _mm_cvtsi128_si32(vsi);
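
// [Editor's note] The half-float paths above and below share one scalar recipe:
// widen FP16 to float, scale to the integer range, add 0.5 so the truncating
// conversion rounds, then swizzle RGBA into ARGB order while packing to bytes.
static inline uchar float_channel_to_byte_sketch(float channel)
{
    return uchar(channel * 255.0f + 0.5f);   // mirrors the vf/vh constants used above
}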
    // Same as above, but premultiplying by the FP16 alpha before the 255 scaling.
    const __m256 vf = _mm256_set1_ps(255.0f);
    const __m256 vh = _mm256_set1_ps(0.5f);

    for (; i + 1 < count; i += 2) {
        __m256 vsf = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(s + i)));
        __m256 vsa = _mm256_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
        vsf = _mm256_mul_ps(vsf, vsa);
        vsf = _mm256_blend_ps(vsf, vsa, 0x88);
        vsf = _mm256_mul_ps(vsf, vf);
        vsf = _mm256_add_ps(vsf, vh);
        __m256i vsi = _mm256_cvttps_epi32(vsf);
        vsi = _mm256_packus_epi32(vsi, vsi);
        vsi = _mm256_shufflelo_epi16(vsi, _MM_SHUFFLE(3, 0, 1, 2));
        vsi = _mm256_permute4x64_epi64(vsi, _MM_SHUFFLE(3, 1, 2, 0));
        __m128i vsi128 = _mm256_castsi256_si128(vsi);
        vsi128 = _mm_packus_epi16(vsi128, vsi128);
        _mm_storel_epi64((__m128i *)(buffer + i), vsi128);
    }

    // One-pixel tail.
        __m128 vsf = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(s + i)));
        __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
        vsf = _mm_mul_ps(vsf, vsa);
        vsf = _mm_insert_ps(vsf, vsa, 0x30);
        vsf = _mm_mul_ps(vsf, _mm_set1_ps(255.0f));
        vsf = _mm_add_ps(vsf, _mm_set1_ps(0.5f));
        __m128i vsi = _mm_cvttps_epi32(vsf);
        vsi = _mm_packus_epi32(vsi, vsi);
        vsi = _mm_shufflelo_epi16(vsi, _MM_SHUFFLE(3, 0, 1, 2));
        vsi = _mm_packus_epi16(vsi, vsi);
        buffer[i] = _mm_cvtsi128_si32(vsi);
    // FP16 to RGBA64: scale by 65535 with rounding, two pixels per iteration.
    const __m256 vf = _mm256_set1_ps(65535.0f);
    const __m256 vh = _mm256_set1_ps(0.5f);

    for (; i + 1 < count; i += 2) {
        __m256 vsf = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(s + i)));
        vsf = _mm256_mul_ps(vsf, vf);
        vsf = _mm256_add_ps(vsf, vh);
        __m256i vsi = _mm256_cvttps_epi32(vsf);
        vsi = _mm256_packus_epi32(vsi, vsi);
        vsi = _mm256_permute4x64_epi64(vsi, _MM_SHUFFLE(3, 1, 2, 0));
        _mm_storeu_si128((__m128i *)(buffer + i), _mm256_castsi256_si128(vsi));
    }

    // One-pixel tail.
        __m128 vsf = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(s + i)));
        vsf = _mm_mul_ps(vsf, _mm_set1_ps(65535.0f));
        vsf = _mm_add_ps(vsf, _mm_set1_ps(0.5f));
        __m128i vsi = _mm_cvttps_epi32(vsf);
        vsi = _mm_packus_epi32(vsi, vsi);
        _mm_storel_epi64((__m128i *)(buffer + i), vsi);
    // FP16 to premultiplied RGBA64: multiply by alpha, then scale by 65535.
    const __m256 vf = _mm256_set1_ps(65535.0f);
    const __m256 vh = _mm256_set1_ps(0.5f);

    for (; i + 1 < count; i += 2) {
        __m256 vsf = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(s + i)));
        __m256 vsa = _mm256_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
        vsf = _mm256_mul_ps(vsf, vsa);
        vsf = _mm256_blend_ps(vsf, vsa, 0x88);
        vsf = _mm256_mul_ps(vsf, vf);
        vsf = _mm256_add_ps(vsf, vh);
        __m256i vsi = _mm256_cvttps_epi32(vsf);
        vsi = _mm256_packus_epi32(vsi, vsi);
        vsi = _mm256_permute4x64_epi64(vsi, _MM_SHUFFLE(3, 1, 2, 0));
        _mm_storeu_si128((__m128i *)(buffer + i), _mm256_castsi256_si128(vsi));
    }

    // One-pixel tail.
        __m128 vsf = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(s + i)));
        __m128 vsa = _mm_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
        vsf = _mm_mul_ps(vsf, vsa);
        vsf = _mm_insert_ps(vsf, vsa, 0x30);
        vsf = _mm_mul_ps(vsf, _mm_set1_ps(65535.0f));
        vsf = _mm_add_ps(vsf, _mm_set1_ps(0.5f));
        __m128i vsi = _mm_cvttps_epi32(vsf);
        vsi = _mm_packus_epi32(vsi, vsi);
        _mm_storel_epi64((__m128i *)(buffer + i), vsi);
    // ARGB32 to FP16: widen bytes to float, swizzle to RGBA order, scale by 1/255.
    const __m256 vf = _mm256_set1_ps(1.0f / 255.0f);

    for (; i + 1 < count; i += 2) {
        __m256i vsi = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)(src + i)));
        vsi = _mm256_shuffle_epi32(vsi, _MM_SHUFFLE(3, 0, 1, 2));
        __m256 vsf = _mm256_cvtepi32_ps(vsi);
        vsf = _mm256_mul_ps(vsf, vf);
        _mm_storeu_si128((__m128i *)(d + i), _mm256_cvtps_ph(vsf, 0));
    }

    // One-pixel tail.
        __m128i vsi = _mm_cvtsi32_si128(src[i]);
        vsi = _mm_cvtepu8_epi32(vsi);
        vsi = _mm_shuffle_epi32(vsi, _MM_SHUFFLE(3, 0, 1, 2));
        __m128 vsf = _mm_cvtepi32_ps(vsi);
        vsf = _mm_mul_ps(vsf, _mm_set1_ps(1.0f / 255.0f));
        _mm_storel_epi64((__m128i *)(d + i), _mm_cvtps_ph(vsf, 0));
    // ARGB32PM to FP16, unpremultiplying each pixel. The three cases below are:
    // fully opaque (just scale by 1/255), fully transparent (zero), and the
    // general case (divide by alpha via a Newton-refined reciprocal).
    const __m128 vf = _mm_set1_ps(1.0f / 255.0f);

        __m128i vsi = _mm_cvtsi32_si128(s);
        vsi = _mm_cvtepu8_epi32(vsi);
        vsi = _mm_shuffle_epi32(vsi, _MM_SHUFFLE(3, 0, 1, 2));
        __m128 vsf = _mm_cvtepi32_ps(vsi);
        const uint8_t a = (s >> 24);

            vsf = _mm_mul_ps(vsf, vf);

            vsf = _mm_set1_ps(0.0f);

            const __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
            __m128 vsr = _mm_rcp_ps(vsa);
            vsr = _mm_sub_ps(_mm_add_ps(vsr, vsr), _mm_mul_ps(vsr, _mm_mul_ps(vsr, vsa)));
            vsr = _mm_insert_ps(vsr, _mm_set_ss(1.0f), 0x30);
            vsf = _mm_mul_ps(vsf, vsr);

        _mm_storel_epi64((__m128i *)(d + i), _mm_cvtps_ph(vsf, 0));
#if QT_CONFIG(raster_fp)
    // FP16 to premultiplied FP32, two pixels per 256-bit iteration.
    for (; i + 1 < count; i += 2) {
        __m256 vsf = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(s + i)));
        __m256 vsa = _mm256_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
        vsf = _mm256_mul_ps(vsf, vsa);
        vsf = _mm256_blend_ps(vsf, vsa, 0x88);
        _mm256_storeu_ps((float *)(buffer + i), vsf);
    }

    // One-pixel tail.
        __m128 vsf = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(s + i)));
        __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
        vsf = _mm_mul_ps(vsf, vsa);
        vsf = _mm_insert_ps(vsf, vsa, 0x30);
        _mm_store_ps((float *)(buffer + i), vsf);
    // Premultiplied four-float pixels to FP16, dividing the colour channels by
    // alpha and forcing the stored alpha to 1.0.
    const __m128 *s = reinterpret_cast<const __m128 *>(src);
    const __m128 zero = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);

        __m128 vsf = _mm_load_ps(reinterpret_cast<const float *>(s + i));
        const __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
        const float a = _mm_cvtss_f32(vsa);

            // General case: divide by alpha with a Newton-refined reciprocal.
            __m128 vsr = _mm_rcp_ps(vsa);
            vsr = _mm_sub_ps(_mm_add_ps(vsr, vsr), _mm_mul_ps(vsr, _mm_mul_ps(vsr, vsa)));
            vsf = _mm_mul_ps(vsf, vsr);
            vsf = _mm_insert_ps(vsf, _mm_set_ss(1.0f), 0x30);

        _mm_storel_epi64((__m128i *)(d + i), _mm_cvtps_ph(vsf, 0));
    // Premultiplied four-float pixels to FP16, dividing the colour channels by
    // alpha while keeping the original alpha in place.
    const __m128 *s = reinterpret_cast<const __m128 *>(src);
    const __m128 zero = _mm_set1_ps(0.0f);

        __m128 vsf = _mm_load_ps(reinterpret_cast<const float *>(s + i));
        const __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
        const float a = _mm_cvtss_f32(vsa);

            // General case: divide by alpha with a Newton-refined reciprocal,
            // with the reciprocal's alpha lane pinned to 1.0 so alpha is preserved.
            __m128 vsr = _mm_rcp_ps(vsa);
            vsr = _mm_sub_ps(_mm_add_ps(vsr, vsr), _mm_mul_ps(vsr, _mm_mul_ps(vsr, vsa)));
            vsr = _mm_insert_ps(vsr, _mm_set_ss(1.0f), 0x30);
            vsf = _mm_mul_ps(vsf, vsr);

        _mm_storel_epi64((__m128i *)(d + i), _mm_cvtps_ph(vsf, 0));