4#include <private/qdrawhelper_p.h>
5#include <private/qdrawingprimitive_sse2_p.h>
6#include <private/qpaintengine_raster_p.h>
7#include <private/qpixellayout_p.h>
9#if defined(QT_COMPILER_SUPPORTS_SSE4_1)
// NOTE(review): this listing prefixes each statement with the original file's line
// number and omits several lines (function signature, loop header, `else` lines,
// closing braces). Only the statements that are actually visible are described.
//
// Vector constants:
//  - alphaMask selects the top byte of every 32-bit lane.
//  - rgbaMask is a byte shuffle that swaps bytes 0 and 2 inside each 4-byte group.
//  - shuffleMask broadcasts 16-bit lane 3 over lanes 0-3 and lane 7 over lanes 4-7.
//  - half holds 0x80 in every 16-bit lane (rounding bias used below).
18 const __m128i alphaMask = _mm_set1_epi32(0xff000000);
19 const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
20 const __m128i shuffleMask = _mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15);
21 const __m128i half = _mm_set1_epi16(0x0080);
22 const __m128i
zero = _mm_setzero_si128();
// Unaligned load of 16 bytes starting at src[i].
25 __m128i srcVector = _mm_loadu_si128((
const __m128i *)&
src[
i]);
// _mm_testz_si128 yields 1 when (srcVector & alphaMask) == 0, so this branch is taken
// when at least one of the four top bytes is non-zero.
26 if (!_mm_testz_si128(srcVector, alphaMask)) {
// _mm_testc_si128 yields 1 when every alphaMask bit is set in srcVector, so this
// branch is taken when not all four top bytes are 0xff.
27 if (!_mm_testc_si128(srcVector, alphaMask)) {
// Swap bytes 0 and 2 of each pixel. Embedded line 28 is missing from this listing,
// so any condition guarding this shuffle is not visible here.
29 srcVector = _mm_shuffle_epi8(srcVector, rgbaMask);
// Zero-extend the 16 bytes to 16-bit lanes: src1 = low 8 bytes, src2 = high 8 bytes.
30 __m128i src1 = _mm_unpacklo_epi8(srcVector,
zero);
31 __m128i src2 = _mm_unpackhi_epi8(srcVector,
zero);
// Broadcast each pixel's top (alpha) lane across that pixel's four 16-bit lanes.
32 __m128i alpha1 = _mm_shuffle_epi8(src1, shuffleMask);
33 __m128i alpha2 = _mm_shuffle_epi8(src2, shuffleMask);
// channel * alpha, kept as a 16-bit product.
34 src1 = _mm_mullo_epi16(src1, alpha1);
35 src2 = _mm_mullo_epi16(src2, alpha2);
// (x + (x >> 8) + 0x80) >> 8 : rounded approximation of x / 255.
36 src1 = _mm_add_epi16(src1, _mm_srli_epi16(src1, 8));
37 src2 = _mm_add_epi16(src2, _mm_srli_epi16(src2, 8));
38 src1 = _mm_add_epi16(src1, half);
39 src2 = _mm_add_epi16(src2, half);
40 src1 = _mm_srli_epi16(src1, 8);
41 src2 = _mm_srli_epi16(src2, 8);
// Blend mask 0x88 takes 16-bit lanes 3 and 7 from the alpha vectors, i.e. the alpha
// lane of each pixel is replaced by the unmultiplied alpha.
42 src1 = _mm_blend_epi16(src1, alpha1, 0x88);
43 src2 = _mm_blend_epi16(src2, alpha2, 0x88);
// Narrow back to 8-bit lanes with unsigned saturation and store at buffer[i].
44 srcVector = _mm_packus_epi16(src1, src2);
45 _mm_storeu_si128((__m128i *)&
buffer[
i], srcVector);
// The two stores below write either the byte-swapped or the unchanged source vector.
// The lines selecting between them (embedded 46-47 and 49) are missing from this
// listing; presumably they are the `else` paths of the tests above - TODO confirm.
48 _mm_storeu_si128((__m128i *)&
buffer[
i], _mm_shuffle_epi8(srcVector, rgbaMask));
50 _mm_storeu_si128((__m128i *)&
buffer[
i], srcVector);
// NOTE(review): fragment only - the function signature, loop header, several branch
// lines and the closing braces are missing from this listing.
//
// Vector constants:
//  - alphaMask selects the top byte of every 32-bit lane.
//  - rgbaMask is a byte shuffle that swaps bytes 0 and 2 inside each 4-byte group.
//  - shuffleMask broadcasts 16-bit lane 3 over lanes 0-3 and lane 7 over lanes 4-7.
67 const __m128i alphaMask = _mm_set1_epi32(0xff000000);
68 const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
69 const __m128i shuffleMask = _mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15);
70 const __m128i
zero = _mm_setzero_si128();
// Unaligned load of 16 bytes starting at src[i].
73 __m128i srcVector = _mm_loadu_si128((
const __m128i *)&
src[
i]);
// Taken when at least one of the four top bytes is non-zero.
74 if (!_mm_testz_si128(srcVector, alphaMask)) {
// cf is true when all four top bytes are 0xff; the statement that consumes it is
// not present in this listing.
75 bool cf = _mm_testc_si128(srcVector, alphaMask);
// Swap bytes 0 and 2 of each pixel (embedded lines 76-77 are missing, so any guard
// on this shuffle is not visible).
78 srcVector = _mm_shuffle_epi8(srcVector, rgbaMask);
// Interleaving the vector with itself duplicates every byte into both halves of a
// 16-bit lane, widening each 8-bit value v to v * 257.
79 const __m128i src1 = _mm_unpacklo_epi8(srcVector, srcVector);
80 const __m128i src2 = _mm_unpackhi_epi8(srcVector, srcVector);
// Broadcast each pixel's top (alpha) lane across that pixel's four 16-bit lanes.
82 __m128i alpha1 = _mm_shuffle_epi8(src1, shuffleMask);
83 __m128i alpha2 = _mm_shuffle_epi8(src2, shuffleMask);
// High 16 bits of the unsigned 16x16 product, i.e. (channel * alpha) >> 16.
84 __m128i dst1 = _mm_mulhi_epu16(src1, alpha1);
85 __m128i dst2 = _mm_mulhi_epu16(src2, alpha2);
// Add 1 to every lane whose top bit is set (rounding correction after the >> 16).
87 dst1 = _mm_add_epi16(dst1, _mm_srli_epi16(dst1, 15));
88 dst2 = _mm_add_epi16(dst2, _mm_srli_epi16(dst2, 15));
// Blend mask 0x88 takes 16-bit lanes 3 and 7 from the widened source, restoring
// each pixel's unmultiplied alpha lane.
90 dst1 = _mm_blend_epi16(dst1, src1, 0x88);
91 dst2 = _mm_blend_epi16(dst2, src2, 0x88);
// Two 16-byte stores at buffer[i] and buffer[i + 2]; presumably buffer elements are
// 8 bytes wide so each store covers two of them - TODO confirm against the signature.
92 _mm_storeu_si128((__m128i *)&
buffer[
i], dst1);
93 _mm_storeu_si128((__m128i *)&
buffer[
i + 2], dst2);
// Alternative path that stores the widened source without multiplying; the line
// selecting it (embedded 94) is missing from this listing.
95 _mm_storeu_si128((__m128i *)&
buffer[
i], src1);
96 _mm_storeu_si128((__m128i *)&
buffer[
i + 2], src2);
// NOTE(review): fragment only - the signature and return statement are missing from
// this listing; `a` and `mul` are presumably the function's parameters - TODO confirm.
//
// Start from the hardware's approximate reciprocal of a.
113 __m128 ia = _mm_rcp_ps(
a);
// One Newton-Raphson refinement step: ia = 2*ia - ia*ia*a.
115 ia = _mm_sub_ps(_mm_add_ps(ia, ia), _mm_mul_ps(ia, _mm_mul_ps(ia,
a)));
// Scale the refined reciprocal, giving mul / a in every lane.
116 ia = _mm_mul_ps(ia, _mm_set1_ps(mul));
// NOTE(review): fragment only - the function signature, several branch lines and the
// closing braces are missing from this listing. The visible statements divide each
// colour channel of four 32-bit pixels by that pixel's alpha (scaled by 255).
120template<
bool RGBA,
bool RGBx>
// True when the floating-point "invalid operation" exception is NOT masked; the
// statements guarded by this test (embedded lines 125-134) are not in this listing.
124 if ((_MM_GET_EXCEPTION_MASK() & _MM_MASK_INVALID) == 0) {
// alphaMask selects the top byte of every 32-bit lane; rgbaMask swaps bytes 0 and 2
// inside each 4-byte group.
135 const __m128i alphaMask = _mm_set1_epi32(0xff000000);
136 const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
137 const __m128i
zero = _mm_setzero_si128();
// Vector loop: four 32-bit pixels per iteration while at least four remain.
139 for (;
i <
count - 3;
i += 4) {
140 __m128i srcVector = _mm_loadu_si128((
const __m128i *)&
src[
i]);
// Taken when at least one of the four alpha bytes is non-zero.
141 if (!_mm_testz_si128(srcVector, alphaMask)) {
// Taken when not all four alpha bytes are 0xff.
142 if (!_mm_testc_si128(srcVector, alphaMask)) {
// Alpha of each pixel as a 32-bit integer.
143 __m128i srcVectorAlpha = _mm_srli_epi32(srcVector, 24);
// Swap bytes 0 and 2 of each pixel (embedded line 144 is missing, so any guard on
// this shuffle is not visible).
145 srcVector = _mm_shuffle_epi8(srcVector, rgbaMask);
// ia = 255 / alpha per pixel, computed in single precision.
146 const __m128
a = _mm_cvtepi32_ps(srcVectorAlpha);
147 const __m128 ia = reciprocal_mul_ps(
a, 255.0f);
// Zero-extend bytes to 32-bit lanes so that src1..src4 each hold the four channels
// of pixel 0..3 respectively.
148 __m128i src1 = _mm_unpacklo_epi8(srcVector,
zero);
149 __m128i src3 = _mm_unpackhi_epi8(srcVector,
zero);
150 __m128i src2 = _mm_unpackhi_epi16(src1,
zero);
151 __m128i src4 = _mm_unpackhi_epi16(src3,
zero);
152 src1 = _mm_unpacklo_epi16(src1,
zero);
153 src3 = _mm_unpacklo_epi16(src3,
zero);
// Broadcast each pixel's 255/alpha factor to all four lanes.
154 __m128 ia1 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0, 0, 0, 0));
155 __m128 ia2 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(1, 1, 1, 1));
156 __m128 ia3 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(2, 2, 2, 2));
157 __m128 ia4 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(3, 3, 3, 3));
// channel * (255 / alpha), converted back to 32-bit integers.
158 src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src1), ia1));
159 src2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src2), ia2));
160 src3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src3), ia3));
161 src4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src4), ia4));
// Narrow 32 -> 16 -> 8 bits with unsigned saturation, back to four packed pixels.
162 src1 = _mm_packus_epi32(src1, src2);
163 src3 = _mm_packus_epi32(src3, src4);
164 src1 = _mm_packus_epi16(src1, src3);
// Zero every pixel whose alpha is 0 (its reciprocal above is not meaningful).
166 __m128i srcVectorAlphaMask = _mm_cmpeq_epi32(srcVectorAlpha,
zero);
167 src1 = _mm_andnot_si128(srcVectorAlphaMask, src1);
// Two alternative ways of producing the alpha byte are visible: line 170 forces it to
// 0xff, line 172 copies it from srcVector. The lines choosing between them (embedded
// 168-169 and 171) are missing; presumably the choice depends on RGBx - TODO confirm.
170 srcVector = _mm_or_si128(src1, alphaMask);
172 srcVector = _mm_blendv_epi8(src1, srcVector, alphaMask);
173 _mm_storeu_si128((__m128i *)&
buffer[
i], srcVector);
// Alternative stores: byte-swapped source, unchanged source, and the constant
// 0xff000000 per pixel. The branch lines selecting each one are missing here.
176 _mm_storeu_si128((__m128i *)&
buffer[
i], _mm_shuffle_epi8(srcVector, rgbaMask));
178 _mm_storeu_si128((__m128i *)&
buffer[
i], srcVector);
182 _mm_storeu_si128((__m128i *)&
buffer[
i], alphaMask);
// NOTE(review): fragment only - the template header, function signature, several
// branch lines and the closing braces are missing from this listing. The visible
// statements turn four 64-bit pixels (16 bits per channel) into four 32-bit pixels.
//
// True when the floating-point "invalid operation" exception is NOT masked; the
// statements guarded by this test (embedded lines 203-208) are not in this listing.
202 if ((_MM_GET_EXCEPTION_MASK() & _MM_MASK_INVALID) == 0) {
// alphaMask selects the top 16 bits of every 64-bit lane; alphaMask32 selects the top
// byte of every 32-bit lane; rgbaMask swaps bytes 0 and 2 inside each 4-byte group.
209 const __m128i alphaMask = _mm_set1_epi64x(
qint64(
Q_UINT64_C(0xffff) << 48));
210 const __m128i alphaMask32 = _mm_set1_epi32(0xff000000);
211 const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
212 const __m128i
zero = _mm_setzero_si128();
// Vector loop: four pixels per iteration, loaded as two 16-byte vectors.
214 for (;
i <
count - 3;
i += 4) {
215 __m128i srcVector1 = _mm_loadu_si128((
const __m128i *)&
src[
i]);
216 __m128i srcVector2 = _mm_loadu_si128((
const __m128i *)&
src[
i + 2]);
// transparentN: both alphas in vector N are 0; opaqueN: both are 0xffff.
217 bool transparent1 = _mm_testz_si128(srcVector1, alphaMask);
218 bool opaque1 = _mm_testc_si128(srcVector1, alphaMask);
219 bool transparent2 = _mm_testz_si128(srcVector2, alphaMask);
220 bool opaque2 = _mm_testc_si128(srcVector2, alphaMask);
222 if (!(transparent1 && transparent2)) {
223 if (!(opaque1 && opaque2)) {
// Move each 16-bit alpha to the bottom of its 64-bit lane, then pack so that the four
// alphas end up in the four 32-bit lanes of srcVectorAlpha.
224 __m128i srcVector1Alpha = _mm_srli_epi64(srcVector1, 48);
225 __m128i srcVector2Alpha = _mm_srli_epi64(srcVector2, 48);
226 __m128i srcVectorAlpha = _mm_packus_epi32(srcVector1Alpha, srcVector2Alpha);
// Keep a float copy of the 16-bit alpha for the division below.
227 const __m128
a = _mm_cvtepi32_ps(srcVectorAlpha);
// Scale the alpha from 16 to 8 bits with rounding: t = x + 128; (t - (t >> 8)) >> 8,
// then move it to the top byte of each 32-bit lane.
229 srcVectorAlpha = _mm_add_epi32(srcVectorAlpha, _mm_set1_epi32(128));
230 srcVectorAlpha = _mm_sub_epi32(srcVectorAlpha, _mm_srli_epi32(srcVectorAlpha, 8));
231 srcVectorAlpha = _mm_srli_epi32(srcVectorAlpha, 8);
232 srcVectorAlpha = _mm_slli_epi32(srcVectorAlpha, 24);
// ia = 255 / alpha16 per pixel, so the products below are already 8-bit scaled.
233 const __m128 ia = reciprocal_mul_ps(
a, 255.0f);
// Zero-extend the 16-bit channels to 32-bit lanes: src1..src4 hold pixel 0..3.
234 __m128i src1 = _mm_unpacklo_epi16(srcVector1,
zero);
235 __m128i src2 = _mm_unpackhi_epi16(srcVector1,
zero);
236 __m128i src3 = _mm_unpacklo_epi16(srcVector2,
zero);
237 __m128i src4 = _mm_unpackhi_epi16(srcVector2,
zero);
// Broadcast each pixel's factor to all four lanes.
238 __m128 ia1 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0, 0, 0, 0));
239 __m128 ia2 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(1, 1, 1, 1));
240 __m128 ia3 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(2, 2, 2, 2));
241 __m128 ia4 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(3, 3, 3, 3));
// channel * (255 / alpha), converted back to 32-bit integers.
242 src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src1), ia1));
243 src2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src2), ia2));
244 src3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src3), ia3));
245 src4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src4), ia4));
// Narrow to 16-bit lanes with unsigned saturation (two pixels per vector).
246 src1 = _mm_packus_epi32(src1, src2);
247 src3 = _mm_packus_epi32(src3, src4);
// Zero every pixel whose alpha is 0 (compared per 64-bit lane, i.e. per pixel).
249 __m128i srcVector1AlphaMask = _mm_cmpeq_epi64(srcVector1Alpha,
zero);
250 __m128i srcVector2AlphaMask = _mm_cmpeq_epi64(srcVector2Alpha,
zero);
251 src1 = _mm_andnot_si128(srcVector1AlphaMask, src1);
252 src3 = _mm_andnot_si128(srcVector2AlphaMask, src3);
// Narrow to bytes: four packed 32-bit pixels.
253 src1 = _mm_packus_epi16(src1, src3);
// Replace the top byte of each pixel with the 8-bit alpha computed above.
255 src1 = _mm_blendv_epi8(src1, srcVectorAlpha, alphaMask32);
// Swap bytes 0 and 2 of each pixel (embedded lines 256-257 are missing, so any guard
// on this shuffle is not visible).
258 src1 = _mm_shuffle_epi8(src1, rgbaMask);
259 _mm_storeu_si128((__m128i *)&
buffer[
i], src1);
// Second visible path (selecting branch line, embedded 260, is missing): no division,
// every 16-bit channel including alpha is only scaled down to 8 bits.
261 __m128i src1 = _mm_unpacklo_epi16(srcVector1,
zero);
262 __m128i src2 = _mm_unpackhi_epi16(srcVector1,
zero);
263 __m128i src3 = _mm_unpacklo_epi16(srcVector2,
zero);
264 __m128i src4 = _mm_unpackhi_epi16(srcVector2,
zero);
// Same rounded 16 -> 8 bit scaling as above: t = x + 128; (t - (t >> 8)) >> 8.
265 src1 = _mm_add_epi32(src1, _mm_set1_epi32(128));
266 src2 = _mm_add_epi32(src2, _mm_set1_epi32(128));
267 src3 = _mm_add_epi32(src3, _mm_set1_epi32(128));
268 src4 = _mm_add_epi32(src4, _mm_set1_epi32(128));
269 src1 = _mm_sub_epi32(src1, _mm_srli_epi32(src1, 8));
270 src2 = _mm_sub_epi32(src2, _mm_srli_epi32(src2, 8));
271 src3 = _mm_sub_epi32(src3, _mm_srli_epi32(src3, 8));
272 src4 = _mm_sub_epi32(src4, _mm_srli_epi32(src4, 8));
273 src1 = _mm_srli_epi32(src1, 8);
274 src2 = _mm_srli_epi32(src2, 8);
275 src3 = _mm_srli_epi32(src3, 8);
276 src4 = _mm_srli_epi32(src4, 8);
// Narrow 32 -> 16 -> 8 bits with unsigned saturation.
277 src1 = _mm_packus_epi32(src1, src2);
278 src3 = _mm_packus_epi32(src3, src4);
279 src1 = _mm_packus_epi16(src1, src3);
// Swap bytes 0 and 2 of each pixel (embedded line 280 is missing, so any guard on
// this shuffle is not visible), then store.
281 src1 = _mm_shuffle_epi8(src1, rgbaMask);
282 _mm_storeu_si128((__m128i *)&
buffer[
i], src1);
// Per-pixel conversion through the scalar helper; the pixel order passed to it is
// chosen by RGBA. The surrounding loop is not present in this listing - presumably
// this handles the pixels left over after the vector loop (TODO confirm).
290 buffer[
i] = qConvertRgba64ToRgb32_sse4<RGBA ? PixelOrderRGB : PixelOrderBGR>(
src[
i]);
// NOTE(review): fragment only - the template header, function signature, several
// branch lines and the closing braces are missing from this listing. The visible
// statements divide the 16-bit channels of four 64-bit pixels by the pixel's alpha
// (scaled by 65535) and keep the 64-bit layout.
//
// True when the floating-point "invalid operation" exception is NOT masked; the
// statements guarded by this test (embedded lines 299-306) are not in this listing.
298 if ((_MM_GET_EXCEPTION_MASK() & _MM_MASK_INVALID) == 0) {
// alphaMask selects the top 16 bits of every 64-bit lane.
307 const __m128i alphaMask = _mm_set1_epi64x(
qint64(
Q_UINT64_C(0xffff) << 48));
308 const __m128i
zero = _mm_setzero_si128();
// Vector loop: four pixels per iteration, loaded as two 16-byte vectors.
310 for (;
i <
count - 3;
i += 4) {
311 __m128i srcVector1 = _mm_loadu_si128((
const __m128i *)&
src[
i + 0]);
312 __m128i srcVector2 = _mm_loadu_si128((
const __m128i *)&
src[
i + 2]);
// transparentN: both alphas in vector N are 0; opaqueN: both are 0xffff.
313 bool transparent1 = _mm_testz_si128(srcVector1, alphaMask);
314 bool opaque1 = _mm_testc_si128(srcVector1, alphaMask);
315 bool transparent2 = _mm_testz_si128(srcVector2, alphaMask);
316 bool opaque2 = _mm_testc_si128(srcVector2, alphaMask);
318 if (!(transparent1 && transparent2)) {
319 if (!(opaque1 && opaque2)) {
// Gather the four 16-bit alphas into the four 32-bit lanes of srcVectorAlpha.
320 __m128i srcVector1Alpha = _mm_srli_epi64(srcVector1, 48);
321 __m128i srcVector2Alpha = _mm_srli_epi64(srcVector2, 48);
322 __m128i srcVectorAlpha = _mm_packus_epi32(srcVector1Alpha, srcVector2Alpha);
// ia = 65535 / alpha per pixel, computed in single precision.
323 const __m128
a = _mm_cvtepi32_ps(srcVectorAlpha);
324 const __m128 ia = reciprocal_mul_ps(
a, 65535.0f);
// Zero-extend the 16-bit channels to 32-bit lanes: src1..src4 hold pixel 0..3.
325 __m128i src1 = _mm_unpacklo_epi16(srcVector1,
zero);
326 __m128i src2 = _mm_unpackhi_epi16(srcVector1,
zero);
327 __m128i src3 = _mm_unpacklo_epi16(srcVector2,
zero);
328 __m128i src4 = _mm_unpackhi_epi16(srcVector2,
zero);
// Broadcast each pixel's factor to all four lanes.
329 __m128 ia1 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0, 0, 0, 0));
330 __m128 ia2 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(1, 1, 1, 1));
331 __m128 ia3 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(2, 2, 2, 2));
332 __m128 ia4 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(3, 3, 3, 3));
// channel * (65535 / alpha), converted back to 32-bit integers.
333 src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src1), ia1));
334 src2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src2), ia2));
335 src3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src3), ia3));
336 src4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src4), ia4));
// Narrow to 16-bit lanes with unsigned saturation (two pixels per vector).
337 src1 = _mm_packus_epi32(src1, src2);
338 src3 = _mm_packus_epi32(src3, src4);
// Zero every pixel whose alpha is 0 (compared per 64-bit lane, i.e. per pixel).
340 __m128i srcVector1AlphaMask = _mm_cmpeq_epi64(srcVector1Alpha,
zero);
341 __m128i srcVector2AlphaMask = _mm_cmpeq_epi64(srcVector2Alpha,
zero);
342 src1 = _mm_andnot_si128(srcVector1AlphaMask, src1);
343 src3 = _mm_andnot_si128(srcVector2AlphaMask, src3);
// Two alternative ways of producing the alpha lane are visible: lines 346-347 force
// it to 0xffff, lines 349-350 copy it from the source vectors. The lines choosing
// between them (embedded 344-345 and 348) are missing from this listing.
346 src1 = _mm_or_si128(src1, alphaMask);
347 src3 = _mm_or_si128(src3, alphaMask);
349 src1 = _mm_blendv_epi8(src1, srcVector1, alphaMask);
350 src3 = _mm_blendv_epi8(src3, srcVector2, alphaMask);
352 _mm_storeu_si128((__m128i *)&
buffer[
i + 0], src1);
353 _mm_storeu_si128((__m128i *)&
buffer[
i + 2], src3);
// Second visible path: force alpha to 0xffff on the unmodified source vectors, then
// store them. The branch lines around these statements (embedded 354-355 and
// 358-359) are missing, so when each statement runs is not visible here.
356 srcVector1 = _mm_or_si128(srcVector1, alphaMask);
357 srcVector2 = _mm_or_si128(srcVector2, alphaMask);
360 _mm_storeu_si128((__m128i *)&
buffer[
i + 0], srcVector1);
361 _mm_storeu_si128((__m128i *)&
buffer[
i + 2], srcVector2);
// NOTE(review): these are four call statements taken from four different functions
// whose signatures and remaining bodies are missing from this listing. Each one
// forwards (d, src, count) to convertARGBFromARGB32PM_sse4 with a different
// combination of its two bool template arguments (declared as RGBA, RGBx above).
//
// RGBA = false, RGBx = true
436 convertARGBFromARGB32PM_sse4<false,true>(
d,
src,
count);
// RGBA = false, RGBx = false
443 convertARGBFromARGB32PM_sse4<false,false>(
d,
src,
count);
// RGBA = true, RGBx = false
450 convertARGBFromARGB32PM_sse4<true,false>(
d,
src,
count);
// RGBA = true, RGBx = true
457 convertARGBFromARGB32PM_sse4<true,true>(
d,
src,
count);
// NOTE(review): fragment only - the function signature and loop header that follow
// this template header are missing from this listing.
460template<QtPixelOrder PixelOrder>
// Converts one element: src[i] goes through qConvertArgb32ToA2rgb30_sse4, instantiated
// with the caller-selected PixelOrder, and the result is written to d[i].
466 d[
i] = qConvertArgb32ToA2rgb30_sse4<PixelOrder>(
src[
i]);
476#if QT_CONFIG(raster_64bit)
// NOTE(review): these are six call statements taken from six different functions
// whose signatures and remaining bodies are missing from this listing. Each one
// forwards its destination, source and element count to one of the 64-bit converters
// with a fixed bool template argument.
//
// convertARGBFromRGBA64PM_sse4 with template argument false / true, (dest, buffer, length).
480 convertARGBFromRGBA64PM_sse4<false>(dest,
buffer,
length);
486 convertARGBFromRGBA64PM_sse4<true>(dest,
buffer,
length);
// convertARGBFromRGBA64PM_sse4 with template argument false / true, (d, src, count).
494 convertARGBFromRGBA64PM_sse4<false>(
d,
src,
count);
501 convertARGBFromRGBA64PM_sse4<true>(
d,
src,
count);
// convertRGBA64FromRGBA64PM_sse4 with template argument false / true, (d, src, count).
508 convertRGBA64FromRGBA64PM_sse4<false>(
d,
src,
count);
515 convertRGBA64FromRGBA64PM_sse4<true>(
d,
src,
count);
518#if QT_CONFIG(raster_fp)
// NOTE(review): fragment only - the function signature, loop header and return are
// missing from this listing.
//
// Aligned load of four floats (one pixel) from s + i; _mm_load_ps requires the
// address to be 16-byte aligned.
524 __m128 vsf = _mm_load_ps(
reinterpret_cast<const float *
>(
s +
i));
// Broadcast lane 3 (the last component, used as alpha) to all four lanes.
525 __m128 vsa = _mm_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
// Multiply every component by alpha ...
526 vsf = _mm_mul_ps(vsf, vsa);
// ... then put the original alpha back into lane 3 (immediate 0x30: source lane 0,
// destination lane 3, no lanes zeroed), undoing the alpha * alpha above.
527 vsf = _mm_insert_ps(vsf, vsa, 0x30);
// Aligned store of the result to buffer + i.
528 _mm_store_ps(
reinterpret_cast<float *
>(
buffer +
i), vsf);
// NOTE(review): fragment only - the function signature, loop header and the branch
// lines between embedded 541 and 547 are missing from this listing.
//
// Despite its name, `zero` is (0, 0, 0, 1): _mm_set_ps takes lanes from high to low,
// so lane 3 is 1.0f and lanes 0-2 are 0.0f. Its use is not visible in this listing.
537 const __m128
zero = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
// Aligned load of four floats (one pixel) from src + i.
539 __m128 vsf = _mm_load_ps(
reinterpret_cast<const float *
>(
src +
i));
// Broadcast lane 3 (alpha) to all lanes and also extract it as a scalar.
540 const __m128 vsa = _mm_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
541 const float a = _mm_cvtss_f32(vsa);
// Approximate 1 / alpha, refined with one Newton-Raphson step: r = 2*r - r*r*alpha.
547 __m128 vsr = _mm_rcp_ps(vsa);
548 vsr = _mm_sub_ps(_mm_add_ps(vsr, vsr), _mm_mul_ps(vsr, _mm_mul_ps(vsr, vsa)));
// Divide every component by alpha, then overwrite lane 3 with 1.0f.
549 vsf = _mm_mul_ps(vsf, vsr);
550 vsf = _mm_insert_ps(vsf, _mm_set_ss(1.0f), 0x30);
// Aligned store of the result to d + i.
552 _mm_store_ps(
reinterpret_cast<float *
>(
d +
i), vsf);
// NOTE(review): fragment only - the function signature, loop header and the branch
// lines between embedded 564 and 570 are missing from this listing.
//
// All-zero vector; its use is not visible in this listing.
560 const __m128
zero = _mm_set1_ps(0.0f);
// Aligned load of four floats (one pixel) from src + i.
562 __m128 vsf = _mm_load_ps(
reinterpret_cast<const float *
>(
src +
i));
// Broadcast lane 3 (alpha) to all lanes and also extract it as a scalar.
563 const __m128 vsa = _mm_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
564 const float a = _mm_cvtss_f32(vsa);
// Approximate 1 / alpha, refined with one Newton-Raphson step: r = 2*r - r*r*alpha.
570 __m128 vsr = _mm_rcp_ps(vsa);
571 vsr = _mm_sub_ps(_mm_add_ps(vsr, vsr), _mm_mul_ps(vsr, _mm_mul_ps(vsr, vsa)));
// Set the factor for lane 3 to 1.0f so the multiply leaves alpha unchanged while
// lanes 0-2 are divided by alpha.
572 vsr = _mm_insert_ps(vsr, _mm_set_ss(1.0f), 0x30);
573 vsf = _mm_mul_ps(vsf, vsr);
// Aligned store of the result to d + i.
575 _mm_store_ps(
reinterpret_cast<float *
>(
d +
i), vsf);
constexpr QRgba64 unpremultiplied() const
void setAlpha(quint16 _alpha)
static constexpr QRgba64 fromArgb32(uint rgb)
constexpr QRgba64 premultiplied() const
Combined button and popup list for selecting options.
#define Q_DECL_VECTORCALL
GLsizei const GLfloat * v
[13]
GLint GLint GLint GLint GLint x
[0]
GLboolean GLboolean GLboolean GLboolean a
[7]
GLenum GLuint GLenum GLsizei length
GLenum GLenum GLsizei count
GLint GLint GLint GLint GLint GLint GLint GLbitfield mask
static quint32 RGBA2ARGB(quint32 x)
static quint32 ARGB2RGBA(quint32 x)
QRgb qUnpremultiply(QRgb p)
constexpr QRgb qPremultiply(QRgb x)
static uint toArgb32(QRgba64 rgba64)
static uint toRgba8888(QRgba64 rgba64)
#define SIMD_EPILOGUE(i, length, max)