Qt 6.x
The Qt SDK
Loading...
Searching...
No Matches
qfloat16.cpp
Go to the documentation of this file.
1// Copyright (C) 2020 The Qt Company Ltd.
2// Copyright (C) 2016 by Southwest Research Institute (R)
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#include "qfloat16.h"
6#include "private/qsimd_p.h"
7#include <cmath> // for fpclassify()'s return values
8
9#include <QtCore/qdatastream.h>
10#include <QtCore/qmetatype.h>
11#include <QtCore/qtextstream.h>
12
15
17
18
132int qfloat16::fpClassify() const noexcept
133{
134 return isInf() ? FP_INFINITE : isNaN() ? FP_NAN
135 : !(b16 & 0x7fff) ? FP_ZERO : isNormal() ? FP_NORMAL : FP_SUBNORMAL;
136}
137
163#if QT_COMPILER_SUPPORTS_HERE(F16C)
164static inline bool hasFastF16()
165{
166 // qsimd.cpp:detectProcessorFeatures() turns off this feature if AVX
167 // state-saving is not enabled by the OS
168 return qCpuHasFeature(F16C);
169}
170
171#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
172static bool hasFastF16Avx256()
173{
174 // 256-bit AVX512 don't have a performance penalty (see qstring.cpp for more info)
175 return qCpuHasFeature(ArchSkylakeAvx512);
176}
177
178static QT_FUNCTION_TARGET(ARCH_SKYLAKE_AVX512)
179void qFloatToFloat16_tail_avx256(quint16 *out, const float *in, qsizetype len) noexcept
180{
181 __mmask16 mask = _bzhi_u32(-1, len);
182 __m256 f32 = _mm256_maskz_loadu_ps(mask, in );
183 __m128i f16 = _mm256_maskz_cvtps_ph(mask, f32, _MM_FROUND_TO_NEAREST_INT);
184 _mm_mask_storeu_epi16(out, mask, f16);
185};
186
187static QT_FUNCTION_TARGET(ARCH_SKYLAKE_AVX512)
188void qFloatFromFloat16_tail_avx256(float *out, const quint16 *in, qsizetype len) noexcept
189{
190 __mmask16 mask = _bzhi_u32(-1, len);
191 __m128i f16 = _mm_maskz_loadu_epi16(mask, in);
192 __m256 f32 = _mm256_cvtph_ps(f16);
193 _mm256_mask_storeu_ps(out, mask, f32);
194};
195#endif
196
198static void qFloatToFloat16_fast(quint16 *out, const float *in, qsizetype len) noexcept
199{
200 constexpr qsizetype Step = sizeof(__m256i) / sizeof(float);
201 constexpr qsizetype HalfStep = sizeof(__m128i) / sizeof(float);
202 qsizetype i = 0;
203
204 if (len >= Step) {
205 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
206 __m256 f32 = _mm256_loadu_ps(in + offset);
207 __m128i f16 = _mm256_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT);
208 _mm_storeu_si128(reinterpret_cast<__m128i *>(out + offset), f16);
209 };
210
211 // main loop: convert Step (8) floats per iteration
212 for ( ; i + Step < len; i += Step)
213 convertOneChunk(i);
214
215 // epilogue: convert the last chunk, possibly overlapping with the last
216 // iteration of the loop
217 return convertOneChunk(len - Step);
218 }
219
220#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
221 if (hasFastF16Avx256())
222 return qFloatToFloat16_tail_avx256(out, in, len);
223#endif
224
225 if (len >= HalfStep) {
226 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
227 __m128 f32 = _mm_loadu_ps(in + offset);
228 __m128i f16 = _mm_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT);
229 _mm_storel_epi64(reinterpret_cast<__m128i *>(out + offset), f16);
230 };
231
232 // two conversions, possibly overlapping
233 convertOneChunk(0);
234 return convertOneChunk(len - HalfStep);
235 }
236
237 // Inlining "qfloat16::qfloat16(float f)":
238 for ( ; i < len; ++i)
239 out[i] = _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(in[i]), 0), 0);
240}
241
243static void qFloatFromFloat16_fast(float *out, const quint16 *in, qsizetype len) noexcept
244{
245 constexpr qsizetype Step = sizeof(__m256i) / sizeof(float);
246 constexpr qsizetype HalfStep = sizeof(__m128i) / sizeof(float);
247 qsizetype i = 0;
248
249 if (len >= Step) {
250 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
251 __m128i f16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + offset));
252 __m256 f32 = _mm256_cvtph_ps(f16);
253 _mm256_storeu_ps(out + offset, f32);
254 };
255
256 // main loop: convert Step (8) floats per iteration
257 for ( ; i + Step < len; i += Step)
258 convertOneChunk(i);
259
260 // epilogue: convert the last chunk, possibly overlapping with the last
261 // iteration of the loop
262 return convertOneChunk(len - Step);
263 }
264
265#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
266 if (hasFastF16Avx256())
267 return qFloatFromFloat16_tail_avx256(out, in, len);
268#endif
269
270 if (len >= HalfStep) {
271 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
272 __m128i f16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(in + offset));
273 __m128 f32 = _mm_cvtph_ps(f16);
274 _mm_storeu_ps(out + offset, f32);
275 };
276
277 // two conversions, possibly overlapping
278 convertOneChunk(0);
279 return convertOneChunk(len - HalfStep);
280 }
281
282 // Inlining "qfloat16::operator float()":
283 for ( ; i < len; ++i)
284 out[i] = _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(in[i])));
285}
286
287#elif defined(__ARM_FP16_FORMAT_IEEE) && defined(__ARM_NEON__) && (__ARM_FP & 2)
// This preprocessor branch is only compiled when the ARM half-precision
// format and NEON macros are defined, so the fast path is always usable and
// no runtime detection is performed.
static inline bool hasFastF16()
{
    constexpr bool alwaysAvailable = true;
    return alwaysAvailable;
}
292
293static void qFloatToFloat16_fast(quint16 *out, const float *in, qsizetype len) noexcept
294{
295 __fp16 *out_f16 = reinterpret_cast<__fp16 *>(out);
296 qsizetype i = 0;
297 for (; i < len - 3; i += 4)
298 vst1_f16(out_f16 + i, vcvt_f16_f32(vld1q_f32(in + i)));
299 SIMD_EPILOGUE(i, len, 3)
300 out_f16[i] = __fp16(in[i]);
301}
302
303static void qFloatFromFloat16_fast(float *out, const quint16 *in, qsizetype len) noexcept
304{
305 const __fp16 *in_f16 = reinterpret_cast<const __fp16 *>(in);
306 qsizetype i = 0;
307 for (; i < len - 3; i += 4)
308 vst1q_f32(out + i, vcvt_f32_f16(vld1_f16(in_f16 + i)));
309 SIMD_EPILOGUE(i, len, 3)
310 out[i] = float(in_f16[i]);
311}
312#else
// Fallback for builds where neither the F16C nor the ARM NEON branch above is
// compiled in: there is no accelerated conversion, so callers must use their
// portable element-by-element loop.
static inline bool hasFastF16()
{
    constexpr bool fastPathAvailable = false;
    return fastPathAvailable;
}
317
318static void qFloatToFloat16_fast(quint16 *, const float *, qsizetype) noexcept
319{
320 Q_UNREACHABLE();
321}
322
323static void qFloatFromFloat16_fast(float *, const quint16 *, qsizetype) noexcept
324{
325 Q_UNREACHABLE();
326}
327#endif
338Q_CORE_EXPORT void qFloatToFloat16(qfloat16 *out, const float *in, qsizetype len) noexcept
339{
340 if (hasFastF16())
341 return qFloatToFloat16_fast(reinterpret_cast<quint16 *>(out), in, len);
342
343 for (qsizetype i = 0; i < len; ++i)
344 out[i] = qfloat16(in[i]);
345}
346
357Q_CORE_EXPORT void qFloatFromFloat16(float *out, const qfloat16 *in, qsizetype len) noexcept
358{
359 if (hasFastF16())
360 return qFloatFromFloat16_fast(out, reinterpret_cast<const quint16 *>(in), len);
361
362 for (qsizetype i = 0; i < len; ++i)
363 out[i] = float(in[i]);
364}
365
366#ifndef QT_NO_DATASTREAM
379{
380 return ds << f.b16;
381}
382
396{
397 return ds >> f.b16;
398}
399#endif
400
402{
403 float f;
404 ts >> f;
405 f16 = qfloat16(f);
406 return ts;
407}
408
410{
411 return ts << float(f);
412}
413
415
416#include "qfloat16tables.cpp"
\inmodule QtCore\reentrant
Definition qdatastream.h:30
\inmodule QtCore
\keyword 16-bit Floating Point Support\inmodule QtCore \inheaderfile QFloat16
Definition qfloat16.h:46
Q_CORE_EXPORT void qFloatFromFloat16(float *out, const qfloat16 *in, qsizetype len) noexcept
Definition qfloat16.cpp:357
Q_CORE_EXPORT void qFloatToFloat16(qfloat16 *out, const float *in, qsizetype len) noexcept
Definition qfloat16.cpp:338
Combined button and popup list for selecting options.
QDataStream & operator<<(QDataStream &ds, qfloat16 f)
Definition qfloat16.cpp:378
static bool hasFastF16()
Definition qfloat16.cpp:313
static void qFloatToFloat16_fast(quint16 *, const float *, qsizetype) noexcept
Definition qfloat16.cpp:318
static void qFloatFromFloat16_fast(float *, const quint16 *, qsizetype) noexcept
Definition qfloat16.cpp:323
QDataStream & operator>>(QDataStream &ds, qfloat16 &f)
Definition qfloat16.cpp:395
#define QT_DECL_METATYPE_EXTERN(TYPE, EXPORT)
Definition qmetatype.h:1367
#define QT_IMPL_METATYPE_EXTERN(TYPE)
Definition qmetatype.h:1369
GLfloat GLfloat f
GLenum GLuint GLintptr offset
GLint GLint GLint GLint GLint GLint GLint GLbitfield mask
GLenum GLsizei len
GLuint in
#define qCpuHasFeature(feature)
Definition qsimd_p.h:378
#define QT_FUNCTION_TARGET(x)
Definition qsimd_p.h:133
#define SIMD_EPILOGUE(i, length, max)
Definition qsimd_p.h:33
unsigned short quint16
Definition qtypes.h:43
ptrdiff_t qsizetype
Definition qtypes.h:70
QTextStream out(stdout)
[7]