Qt 6.x
The Qt SDK
Loading...
Searching...
No Matches
qunicodetools.cpp
Go to the documentation of this file.
1// Copyright (C) 2020 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4#include "qunicodetools_p.h"
5
6#include "qunicodetables_p.h"
7#include "qvarlengtharray.h"
8#if QT_CONFIG(library)
9#include "qlibrary.h"
10#endif
11
12#include <limits.h>
13
14#define FLAG(x) (1 << (x))
15
17
18using namespace Qt::StringLiterals;
19
20#ifdef QT_BUILD_INTERNAL
21Q_CONSTINIT Q_AUTOTEST_EXPORT
22#else
23constexpr
24#endif
26
27namespace QUnicodeTools {
28
29// -----------------------------------------------------------------------------------------------------
30//
31// The text boundaries determination algorithm.
32// See https://www.unicode.org/reports/tr29/tr29-37.html
33//
34// -----------------------------------------------------------------------------------------------------
35
36namespace GB {
37
38// This table is indexed by the grapheme break classes of two
39// (adjacent) code points.
40// The class of the first code point selects an entry.
41// If the entry's bit at position second_cp_class is set
42// (in other words: if entry & (1u << second_cp_class) is non-zero)
43// then there is NO grapheme break between the two code points.
44
46
47// Check that we have enough bits in the table (in case
48// NumGraphemeBreakClasses grows too much).
49static_assert(sizeof(GBTableEntryType) * CHAR_BIT >= QUnicodeTables::NumGraphemeBreakClasses,
50 "Internal error: increase the size in bits of GBTableEntryType");
51
52// GB9, GB9a
57
58static const GBTableEntryType HardBreak = 0u;
59
63 HardBreak, // LF
64 HardBreak, // Control
65 Extend_SpacingMark_ZWJ, // Extend
67 Extend_SpacingMark_ZWJ, // RegionalIndicator
78 ), // Prepend
79 Extend_SpacingMark_ZWJ, // SpacingMark
85 ), // L
89 ), // V
92 ), // T
96 ), // LV
99 ), // LVT
100 Extend_SpacingMark_ZWJ // Extended_Pictographic
101};
102
105{
106 return (breakTable[first] & FLAG(second)) == 0;
107}
108
109// Some rules (GB11, GB12, GB13) cannot be represented by the table alone,
110// so we need to store some local state.
// State carried from one code point to the next for the rules that the
// pair table cannot express on its own. Each non-Normal value is named
// after the UAX #29 rule(s) it tracks.
111enum class State : uchar {
112 Normal,
113 GB11_ExtPicExt, // saw an Extend after an Extended_Pictographic
114 GB11_ExtPicExtZWJ, // saw a ZWJ after an Extended_Pictographic and zero or more Extend
115 GB12_13_RI, // saw a RegionalIndicator following a non-RegionalIndicator
116};
117
118} // namespace GB
119
// Marks grapheme cluster boundaries (UAX #29) in the UTF-16 text string[0..len).
//
// The text is walked one code point at a time; a valid surrogate pair is
// combined into a single code point and reported at the index of its high
// surrogate (pos). attributes[pos].graphemeBoundary is set to true when a
// break is found before that code point; this function never clears the flag.
// attributes[len] is always marked (GB2), so attributes must provide at
// least len + 1 entries.
120static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
121{
124 for (qsizetype i = 0; i != len; ++i) {
// pos keeps the index of the first code unit; i may advance onto the low surrogate below
125 qsizetype pos = i;
126 char32_t ucs4 = string[i];
127 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
128 ushort low = string[i + 1];
129 if (QChar::isLowSurrogate(low)) {
130 ucs4 = QChar::surrogateToUcs4(ucs4, low);
131 ++i;
132 }
133 }
134
137
// table-driven verdict for the (previous class, current class) pair;
// the state handling below may override it
138 bool shouldBreak = GB::shouldBreakBetweenClasses(lcls, cls);
139 bool handled = false;
140
141 switch (state) {
143 break; // will deal with it below
144
148 // keep going in the current state
149 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
150 handled = true;
151 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
153 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
154 handled = true;
155 } else {
157 }
158 break;
159
163 shouldBreak = false;
164 handled = true;
165 }
166
168 break;
169
173 shouldBreak = false;
174 handled = true;
175 }
176
178 break;
179 }
180
181 if (!handled) {
186 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
187 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
189 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
190 }
191 } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
193 }
194 }
195
// the boundary is recorded at the first code unit of the current code point
196 if (shouldBreak)
197 attributes[pos].graphemeBoundary = true;
198
199 lcls = cls;
200 }
201
202 attributes[len].graphemeBoundary = true; // GB2
203}
204
205
206namespace WB {
207
208enum Action {
212 LookupW
214
216// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet WSeg
217 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
218 { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
219 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
220 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
221 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
222 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // ZWJ
223 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
224 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
225 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break }, // Katakana
228 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
229 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
230 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
231 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
232 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
235 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // WSegSpace
236};
237
238} // namespace WB
239
// Marks word boundaries (UAX #29) in the UTF-16 text string[0..len).
//
// The text is walked one code point at a time (valid surrogate pairs are
// combined and reported at the index of the high surrogate, pos). For every
// position where the pair table, refined by the rule-specific code below,
// yields WB::Break:
//  - attributes[pos].wordBreak is set;
//  - attributes[pos].wordEnd is set if a word (alphanumeric or
//    Hiragana/Katakana) was open;
//  - attributes[pos].wordStart is set if the class of the code point at pos
//    opens a new word of one of those two types.
// attributes[len].wordBreak is always set (WB2), and attributes[len].wordEnd is
// set when a word is still open at the end, so attributes must provide at
// least len + 1 entries.
240static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
241{
// kind of word that is currently open, if any; drives wordStart/wordEnd
242 enum WordType {
243 WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
244 } currentWordType = WordTypeNone;
245
247 auto real_cls = cls; // Unaffected by WB4
248
249 for (qsizetype i = 0; i != len; ++i) {
// pos keeps the index of the first code unit; i may advance onto the low surrogate below
250 qsizetype pos = i;
251 char32_t ucs4 = string[i];
252 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
253 ushort low = string[i + 1];
254 if (QChar::isLowSurrogate(low)) {
255 ucs4 = QChar::surrogateToUcs4(ucs4, low);
256 ++i;
257 }
258 }
259
263 // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
264 // which caused "hi.there" to be treated as if it were just a single word;
265 // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
266 // and this code is needed to pass the coverage tests; remove once the issue is fixed.
267 if (ucs4 == 0x002E) // FULL STOP
269 else if (ucs4 == 0x003A) // COLON
271 }
272
// table lookup for the (previous class, current class) pair; the cases
// below refine the result for rules the table cannot express
273 uchar action = WB::breakTable[cls][ncls];
274 switch (action) {
275 case WB::Break:
277 && prop->graphemeBreakClass
279 // WB3c: ZWJ × \p{Extended_Pictographic}
280 action = WB::NoBreak;
281 }
282 break;
283 case WB::NoBreak:
285 // WB4: X(Extend|Format)* -> X
286 real_cls = ncls;
287 continue;
288 }
290 // WB15/WB16: break between pairs of Regional indicator
292 }
294 && real_cls != QUnicodeTables::WordBreak_WSegSpace)) {
295 // WB3d should not be affected by WB4
296 action = WB::Break;
297 }
298 break;
299 case WB::Lookup:
300 case WB::LookupW:
// the decision depends on what follows: scan ahead, decoding surrogate
// pairs the same way as the outer loop
301 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
302 ucs4 = string[lookahead];
303 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
304 ushort low = string[lookahead + 1];
305 if (QChar::isLowSurrogate(low)) {
306 ucs4 = QChar::surrogateToUcs4(ucs4, low);
307 ++lookahead;
308 }
309 }
310
311 prop = QUnicodeTables::properties(ucs4);
313
315 // WB4: X(Extend|Format)* -> X
316 continue;
317 }
318
// on a match the outer loop resumes after the looked-ahead code point
319 if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
320 || tcls == QUnicodeTables::WordBreak_ALetter)))) {
321 i = lookahead;
322 ncls = tcls;
323 action = WB::NoBreak;
324 }
325 break;
326 }
327 if (action != WB::NoBreak) {
328 action = WB::Break;
330 action = WB::NoBreak; // WB7a
331 }
332 break;
333 }
334
335 cls = ncls;
336 real_cls = ncls;
337
338 if (action == WB::Break) {
339 attributes[pos].wordBreak = true;
// a break closes whatever word was open ...
340 if (currentWordType != WordTypeNone)
341 attributes[pos].wordEnd = true;
// ... and may open a new one, depending on the class at pos
342 switch (cls) {
344 currentWordType = WordTypeHiraganaKatakana;
345 attributes[pos].wordStart = true;
346 break;
350 currentWordType = WordTypeAlphaNumeric;
351 attributes[pos].wordStart = true;
352 break;
353 default:
354 currentWordType = WordTypeNone;
355 break;
356 }
357 }
358 }
359
360 if (currentWordType != WordTypeNone)
361 attributes[len].wordEnd = true;
362 attributes[len].wordBreak = true; // WB2
363}
364
365
366namespace SB {
367
368enum State {
382 Lookup
384
386// Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
390
391 { Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
392 { Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
393 { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
394 { Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
395
396 { Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
397 { Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
398 { Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
399 { Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
400 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
401};
402
403} // namespace SB
404
// Marks sentence boundaries (UAX #29) in the UTF-16 text string[0..len).
//
// A state machine driven by SB::breakTable is advanced once per code point
// (valid surrogate pairs are combined and reported at the index of the high
// surrogate, pos). Whenever the machine reaches SB::Break,
// attributes[pos].sentenceBoundary is set. attributes[len] is always marked
// (SB2), so attributes must provide at least len + 1 entries.
405static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
406{
407 uchar state = SB::BAfter; // to meet SB1
408 for (qsizetype i = 0; i != len; ++i) {
// pos keeps the index of the first code unit; i may advance onto the low surrogate below
409 qsizetype pos = i;
410 char32_t ucs4 = string[i];
411 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
412 ushort low = string[i + 1];
413 if (QChar::isLowSurrogate(low)) {
414 ucs4 = QChar::surrogateToUcs4(ucs4, low);
415 ++i;
416 }
417 }
418
421
423 state = SB::breakTable[state][ncls];
424 if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
// the table cannot decide by itself: scan ahead, decoding surrogate
// pairs the same way as the outer loop
426 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
427 ucs4 = string[lookahead];
428 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
429 ushort low = string[lookahead + 1];
430 if (QChar::isLowSurrogate(low)) {
431 ucs4 = QChar::surrogateToUcs4(ucs4, low);
432 ++lookahead;
433 }
434 }
435
436 prop = QUnicodeTables::properties(ucs4);
438 switch (tcls) {
445 continue;
447 i = lookahead;
449 break;
450 default:
451 break;
452 }
453 break;
454 }
455 }
456 if (Q_UNLIKELY(state == SB::Break)) {
457 attributes[pos].sentenceBoundary = true;
459 }
460 }
461
462 attributes[len].sentenceBoundary = true; // SB2
463}
464
465
466// -----------------------------------------------------------------------------------------------------
467//
468// The line breaking algorithm.
469// See http://www.unicode.org/reports/tr14/tr14-39.html
470//
471// -----------------------------------------------------------------------------------------------------
472
473namespace LB {
474
475namespace NS { // Number Sequence
476
477// LB25 recommends to not break lines inside numbers of the form
478// described by the following regular expression:
479// (PR|PO)?(OP|HY)?NU(NU|SY|IS)*(CL|CP)?(PR|PO)?
480
481enum Action {
485 Break
487
488enum Class {
494 CLCP
496
// Transition table for tracking LB25 number sequences.
// The first index is the class of the previous element (nelast in
// getLineBreaks()), the second the class of the current one (necur); the
// entry is the Action to take for that pair.
497static const uchar actionTable[CLCP + 1][CLCP + 1] = {
498// XX PRPO OPHY NU SYIS CLCP
499 { None , Start , Start , Start , None , None }, // XX
500 { None , Start , Continue, Continue, None , None }, // PRPO
501 { None , Start , Start , Continue, None , None }, // OPHY
502 { Break , Break , Break , Continue, Continue, Continue }, // NU
503 { Break , Break , Break , Continue, Continue, Continue }, // SYIS
504 { Break , Continue, Break , Break , Break , Break }, // CLCP
505};
506
508{
509 switch (lbc) {
510 case QUnicodeTables::LineBreak_AL:// case QUnicodeTables::LineBreak_AI:
511 // resolve AI math symbols in numerical context to IS
513 return SYIS;
514 break;
516 return PRPO;
518 return OPHY;
520 return NU;
522 return SYIS;
524 return CLCP;
525 default:
526 break;
527 }
528 return XX;
529}
530
531} // namespace NS
532
533/* In order to support the tailored implementation of LB25 properly
534 the following changes were made in the pair table to allow breaks
535 where the numeric expression doesn't match the template (i.e. [^NU](IS|SY)NU):
536 (CL)(PO) from IB to DB
537 (CP)(PO) from IB to DB
538 (CL)(PR) from IB to DB
539 (CP)(PR) from IB to DB
540 (PO)(OP) from IB to DB
541 (PR)(OP) from IB to DB
542 (IS)(NU) from IB to DB
543 (SY)(NU) from IB to DB
544*/
545
546/* In order to implement LB21a properly a special rule HH has been introduced and
547 the following changes were made in the pair table to disallow breaks after Hebrew + Hyphen:
548 (HL)(HY|BA) from IB to CI
549 (HY|BA)(!CB) from DB to HH
550*/
551
552enum Action {
560};
561
563/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM*/
564/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
565/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
566/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
567/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
568/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
569/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
570/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
571/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
572/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
573/* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB },
574/* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
575/* NU */ { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
576/* AL */ { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
577/* HL */ { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
578/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
579/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
580/* HY */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
581/* BA */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
582/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB },
583/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
584/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
585/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
586/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
587/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
588/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
589/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB },
590/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
591/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
592/* RI */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB },
593/* CB */ { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
594/* EB */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
595/* EM */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
596};
597
598// The following line break classes are not treated by the pair table
599// and must be resolved outside:
600// AI, BK, CB, CJ, CR, LF, NL, ZWJ, SA, SG, SP, XX
601
602} // namespace LB
603
// Marks line break opportunities (UAX #14) in the UTF-16 text string[0..len).
//
// The text is walked one code point at a time (valid surrogate pairs are
// combined and reported at the index of the high surrogate, pos).
// attributes[pos].lineBreak is set where a break is allowed before the code
// point at pos, and attributes[pos].mandatoryBreak additionally where the
// break is forced (LB4/LB5). Break opportunities that fall inside a number
// sequence are cleared again (LB25).
// On return attributes[0] carries no break (LB2) and attributes[len] carries a
// mandatory break (LB3), so attributes must provide at least len + 1 entries.
604static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
605{
// LB25 bookkeeping: where the current number sequence started and the
// class of its most recent element
606 qsizetype nestart = 0;
607 LB::NS::Class nelast = LB::NS::XX;
608
612
613 for (qsizetype i = 0; i != len; ++i) {
// pos keeps the index of the first code unit; i may advance onto the low surrogate below
614 qsizetype pos = i;
615 char32_t ucs4 = string[i];
616 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
617 ushort low = string[i + 1];
618 if (QChar::isLowSurrogate(low)) {
619 ucs4 = QChar::surrogateToUcs4(ucs4, low);
620 ++i;
621 }
622 }
623
627
631 || (ucs4 >= 0x3130 && ucs4 <= 0x318F && ncls == QUnicodeTables::LineBreak_ID))
632 ) {
633 // LB27: use SPACE for line breaking
634 // "When Korean uses SPACE for line breaking, the classes in rule LB26,
635 // as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
636 // In case of Korean syllables: "3130..318F HANGUL COMPATIBILITY JAMO"
638 } else {
640 // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
642 if (FLAG(prop->category) & test)
644 }
646 // LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL
649 }
650 }
651 }
652
654 // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
656 if (FLAG(prop->category) & test)
658 }
659
661 // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
663 attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
666 goto next_no_cls_update;
667 }
668 goto next;
669 }
670
673 goto next; // LB6: x(BK|CR|LF|NL)
674 goto next_no_cls_update; // LB7: xSP
675 }
676
678 // LB9: treat CM that doesn't follow SP, BK, CR, LF, NL, or ZW as X
680 // don't update anything
681 goto next_no_cls_update;
682 }
683
685 // LB8a: ZWJ x
686 goto next;
687 }
688
689 // LB25: do not break lines inside numbers
690 {
692 switch (LB::NS::actionTable[nelast][necur]) {
693 case LB::NS::Break:
694 // do not change breaks before and after the expression
695 for (qsizetype j = nestart + 1; j < pos; ++j)
696 attributes[j].lineBreak = false;
698 case LB::NS::None:
699 nelast = LB::NS::XX; // reset state
700 break;
701 case LB::NS::Start:
702 nestart = i;
704 default:
705 nelast = necur;
706 break;
707 }
708 }
709
711 // LB30a
713 goto next;
714 }
715
717 && lastProp->category == QChar::Other_NotAssigned
718 && lastProp->graphemeBreakClass
720 // LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM
721 goto next;
722 }
723
724 // for South East Asian chars that require a complex analysis, the Unicode
725 // standard recommends to treat them as AL. tailoring that do dictionary analysis can override
728
729 tcls = cls;
731 // LB10
734 case LB::DirectBreak:
735 attributes[pos].lineBreak = true;
736 break;
739 attributes[pos].lineBreak = true;
740 break;
743 goto next_no_cls_update;
744 attributes[pos].lineBreak = true;
745 break;
748 goto next_no_cls_update;
749 break;
752 attributes[pos].lineBreak = true;
753 break;
755 switch (static_cast<QUnicodeTables::EastAsianWidth>(prop->eastAsianWidth)) {
756 default:
758 break;
763 attributes[pos].lineBreak = true;
764 break;
765 }
766 break;
768 // nothing to do
769 default:
770 break;
771 }
772
// next: the current code point becomes the "previous" one for the pair table.
// next_no_cls_update: cls and lastProp are left untouched; only lcls advances.
773 next:
774 cls = ncls;
775 lastProp = prop;
776 next_no_cls_update:
777 lcls = ncls;
778 }
779
781 // LB25: do not break lines inside numbers
782 for (qsizetype j = nestart + 1; j < len; ++j)
783 attributes[j].lineBreak = false;
784 }
785
786 attributes[0].lineBreak = attributes[0].mandatoryBreak = false; // LB2
787 attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
788}
789
790
791static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
792{
793 for (qsizetype i = 0; i != len; ++i) {
794 uint ucs4 = string[i];
795 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
796 ushort low = string[i + 1];
797 if (QChar::isLowSurrogate(low)) {
798 ucs4 = QChar::surrogateToUcs4(ucs4, low);
799 ++i;
800 }
801 }
802
803 if (Q_UNLIKELY(QChar::isSpace(ucs4)))
804 attributes[i].whiteSpace = true;
805 }
806}
807
808namespace Tailored {
809
810using CharAttributeFunction = void (*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes);
811
812
813enum Form {
814 Invalid = 0x0,
825 Other
827
828static const unsigned char indicForms[0xe00-0x900] = {
829 // Devanagari
834
839
844
849
854
859
864
869
870 // Bengali
875
880
885
890
895
900
905
910
911 // Gurmukhi
916
921
926
931
936
941
946
951
952 // Gujarati
957
962
967
972
977
982
987
992
993 // Oriya
998
1003
1008
1013
1018
1023
1028
1033
1034 //Tamil
1039
1044
1049
1054
1059
1064
1069
1074
1075 // Telugu
1080
1085
1090
1095
1100
1105
1110
1115
1116 // Kannada
1121
1126
1131
1136
1141
1146
1151
1156
1157 // Malayalam
1162
1167
1172
1177
1182
1187
1192
1197
1198 // Sinhala
1203
1208
1213
1218
1223
1228
1233
1238};
1239
1240static inline Form form(unsigned short uc) {
1241 if (uc < 0x900 || uc > 0xdff) {
1242 if (uc == 0x25cc)
1243 return Consonant;
1244 if (uc == 0x200c || uc == 0x200d)
1245 return Control;
1246 return Other;
1247 }
1248 return (Form)indicForms[uc-0x900];
1249}
1250
1251// #define INDIC_DEBUG
1252#ifdef INDIC_DEBUG
1253#define IDEBUG qDebug
1254#else
1255#define IDEBUG if constexpr (1) ; else qDebug
1256#endif
1257
1258/* syllables are of the form:
1259
1260 (Consonant Nukta? Halant)* Consonant Matra? VowelMark? StressMark?
1261 (Consonant Nukta? Halant)* Consonant Halant
1262 IndependentVowel VowelMark? StressMark?
1263
1264 We return syllable boundaries on invalid combinations as well
1265*/
1266static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1267{
1268 *invalid = false;
1269 IDEBUG("indic_nextSyllableBoundary: start=%lld, end=%lld", qlonglong(start), qlonglong(end));
1270 const char16_t *uc = s+start;
1271
1272 qsizetype pos = 0;
1273 Form state = form(uc[pos]);
1274 IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), state, uc[pos]);
1275 pos++;
1276
1277 if (state != Consonant && state != IndependentVowel) {
1278 if (state != Other)
1279 *invalid = true;
1280 goto finish;
1281 }
1282
1283 while (pos < end - start) {
1284 Form newState = form(uc[pos]);
1285 IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), newState, uc[pos]);
1286 switch (newState) {
1287 case Control:
1288 newState = state;
1289 if (state == Halant && uc[pos] == 0x200d /* ZWJ */)
1290 break;
1291 // the control character should be the last char in the item
1292 if (state == Consonant && script == QChar::Script_Bengali && uc[pos-1] == 0x09B0 && uc[pos] == 0x200d /* ZWJ */)
1293 break;
1294 if (state == Consonant && script == QChar::Script_Kannada && uc[pos-1] == 0x0CB0 && uc[pos] == 0x200d /* ZWJ */)
1295 break;
1296 // Bengali and Kannada has a special exception for rendering yaphala with ra (to avoid reph) see http://www.unicode.org/faq/indic.html#15
1297 ++pos;
1298 goto finish;
1299 case Consonant:
1300 if (state == Halant && (script != QChar::Script_Sinhala || uc[pos-1] == 0x200d /* ZWJ */))
1301 break;
1302 goto finish;
1303 case Halant:
1304 if (state == Nukta || state == Consonant)
1305 break;
1306 // Bengali has a special exception allowing the combination Vowel_A/E + Halant + Ya
1307 if (script == QChar::Script_Bengali && pos == 1 &&
1308 (uc[0] == 0x0985 || uc[0] == 0x098f))
1309 break;
1310 // Sinhala uses the Halant as a component of certain matras. Allow these, but keep the state on Matra.
1311 if (script == QChar::Script_Sinhala && state == Matra) {
1312 ++pos;
1313 continue;
1314 }
1315 if (script == QChar::Script_Malayalam && state == Matra && uc[pos-1] == 0x0d41) {
1316 ++pos;
1317 continue;
1318 }
1319 goto finish;
1320 case Nukta:
1321 if (state == Consonant)
1322 break;
1323 goto finish;
1324 case StressMark:
1325 if (state == VowelMark)
1326 break;
1327 Q_FALLTHROUGH();
1328 case VowelMark:
1329 if (state == Matra || state == LengthMark || state == IndependentVowel)
1330 break;
1331 Q_FALLTHROUGH();
1332 case Matra:
1333 if (state == Consonant || state == Nukta)
1334 break;
1335 if (state == Matra) {
1336 // ### needs proper testing for correct two/three part matras
1337 break;
1338 }
1339 // ### not sure if this is correct. If it is, does it apply only to Bengali or should
1340 // it work for all Indic languages?
1341 // the combination Independent_A + Vowel Sign AA is allowed.
1342 if (script == QChar::Script_Bengali && uc[pos] == 0x9be && uc[pos-1] == 0x985)
1343 break;
1344 if (script == QChar::Script_Tamil && state == Matra) {
1345 if (uc[pos-1] == 0x0bc6 &&
1346 (uc[pos] == 0xbbe || uc[pos] == 0xbd7))
1347 break;
1348 if (uc[pos-1] == 0x0bc7 && uc[pos] == 0xbbe)
1349 break;
1350 }
1351 goto finish;
1352
1353 case LengthMark:
1354 if (state == Matra) {
1355 // ### needs proper testing for correct two/three part matras
1356 break;
1357 }
1358 case IndependentVowel:
1359 case Invalid:
1360 case Other:
1361 goto finish;
1362 }
1363 state = newState;
1364 pos++;
1365 }
1366 finish:
1367 return pos+start;
1368}
1369
1370static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1371{
1372 qsizetype end = from + len;
1373 attributes += from;
1374 qsizetype i = 0;
1375 while (i < len) {
1376 bool invalid;
1377 qsizetype boundary = indic_nextSyllableBoundary(script, text, from+i, end, &invalid) - from;
1378 attributes[i].graphemeBoundary = true;
1379
1380 if (boundary > len-1) boundary = len;
1381 i++;
1382 while (i < boundary) {
1383 attributes[i].graphemeBoundary = false;
1384 ++i;
1385 }
1386 assert(i == boundary);
1387 }
1388
1389
1390}
1391
1392#if QT_CONFIG(library)
1393
1394#define LIBTHAI_MAJOR 0
1395
1396/*
1397 * if libthai changed please update these codes too.
1398 */
// Local declaration of the cell structure that th_next_cell() fills in
// (see LibThai::next_cell()); libthai's own headers are not included here.
// NOTE(review): field meanings are defined by libthai and are not used in this
// file — presumably base / upper-or-lower / top characters of a cell; confirm
// against the libthai headers before relying on them.
1399struct thcell_t {
1400 unsigned char base;
1401 unsigned char hilo;
1402 unsigned char top;
1403};
1404
1405using ThBrk = struct _ThBrk;
1406
1407namespace {
1408
1409class LibThai final
1410{
1411 Q_DISABLE_COPY_MOVE(LibThai)
1412
1413 using th_brk_new_def = ThBrk *(*)(const char *);
1414 using th_brk_delete_def = void (*)(ThBrk *);
1415 using th_brk_find_breaks_def = int (*)(ThBrk *, const unsigned char *, int *, size_t);
1416 using th_next_cell_def = size_t (*)(const unsigned char *, size_t, struct thcell_t *, int);
1417
1418public:
1419 LibThai() : m_library("thai"_L1, LIBTHAI_MAJOR)
1420 {
1421 m_th_brk_find_breaks =
1422 reinterpret_cast<th_brk_find_breaks_def>(m_library.resolve("th_brk_find_breaks"));
1423 m_th_next_cell = reinterpret_cast<th_next_cell_def>(m_library.resolve("th_next_cell"));
1424
1425 auto th_brk_new = reinterpret_cast<th_brk_new_def>(m_library.resolve("th_brk_new"));
1426 if (th_brk_new) {
1427 m_state = th_brk_new(nullptr);
1428 m_th_brk_delete =
1429 reinterpret_cast<th_brk_delete_def>(m_library.resolve("th_brk_delete"));
1430 }
1431 }
1432
1433 ~LibThai()
1434 {
1435 if (m_state && m_th_brk_delete)
1436 m_th_brk_delete(m_state);
1437 m_library.unload();
1438 }
1439
1440 bool isInitialized() const { return m_th_brk_find_breaks && m_th_next_cell && m_state; }
1441
1442 int brk_find_breaks(const unsigned char *s, int *pos, size_t pos_sz) const
1443 {
1444 Q_ASSERT(m_state);
1445 Q_ASSERT(m_th_brk_find_breaks);
1446 return m_th_brk_find_breaks(m_state, s, pos, pos_sz);
1447 }
1448
1449 size_t next_cell(const unsigned char *s, size_t len, struct thcell_t *cell, int is_decomp_am)
1450 {
1451 Q_ASSERT(m_th_next_cell);
1452 return m_th_next_cell(s, len, cell, is_decomp_am);
1453 }
1454
1455private:
1456 QLibrary m_library;
1457
1458 // Global state for th_brk_find_breaks().
1459 // Note: even if signature for th_brk_find_breaks() suggests otherwise, the
1460 // state is read-only, and so it is safe to use it from multiple threads after
1461 // initialization. This is also stated in the libthai documentation.
1462 ThBrk *m_state = nullptr;
1463
1464 th_brk_find_breaks_def m_th_brk_find_breaks = nullptr;
1465 th_next_cell_def m_th_next_cell = nullptr;
1466 th_brk_delete_def m_th_brk_delete = nullptr;
1467};
1468
1469} // unnamed namespace
1470
1471Q_GLOBAL_STATIC(LibThai, g_libThai)
1472
// Converts len UTF-16 code units to a NUL-terminated TIS-620 byte string.
//
// Code units up to 0xa0 are copied unchanged, U+0E01..U+0E5B are mapped to
// 0xa1..0xfb, and everything else becomes 0xff. cstr must have room for
// len + 1 bytes; cstr[len] receives the terminating 0.
static void to_tis620(const char16_t *string, qsizetype len, char *cstr)
{
    auto *out = reinterpret_cast<unsigned char *>(cstr);

    for (qsizetype idx = 0; idx < len; ++idx) {
        const char16_t unit = string[idx];

        // Same encoding as libthai uses for invalid chars
        unsigned char encoded = static_cast<unsigned char>(~0);
        if (unit <= 0xa0)
            encoded = static_cast<unsigned char>(unit);
        else if (unit >= 0xe01 && unit <= 0xe5b)
            encoded = static_cast<unsigned char>(unit - 0xe00 + 0xa0);

        out[idx] = encoded;
    }

    out[len] = 0;
}
1489
1490/*
1491 * Thai Attributes: computes Word Break, Word Boundary and Char stop for THAI.
1492 */
// Fills in word-break, line-break and grapheme-boundary attributes for a run
// of `len` UTF-16 code units starting at `string`, using libthai.
// `attributes` is indexed relative to the start of the run.
// Returns without touching `attributes` when the global LibThai object is
// unavailable or not initialized.
// NOTE(review): attributes[0] is written unconditionally below, so callers
// are presumably expected to pass len > 0 - confirm.
1493static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1494{
1495 constexpr qsizetype Prealloc = 128;
// NOTE(review): the buffer `s` used below is declared on a line that is not
// visible in this view; it receives the TIS-620 form of the text.
1497 QVarLengthArray<int, Prealloc> break_positions(len);
1498 qsizetype numbreaks, i;
1499 struct thcell_t tis_cell;
1500
1501 LibThai *libThai = g_libThai;
1502 if (!libThai || !libThai->isInitialized())
1503 return;
1504
1505 to_tis620(string, len, s.data());
1506
// Clear the word/line-break flags for the whole run before applying the
// libthai results.
1507 for (i = 0; i < len; ++i) {
1508 attributes[i].wordBreak = false;
1509 attributes[i].wordStart = false;
1510 attributes[i].wordEnd = false;
1511 attributes[i].lineBreak = false;
1512 }
1513
// The first position of the run is always a word break and a word start.
1514 attributes[0].wordBreak = true;
1515 attributes[0].wordStart = true;
1516 attributes[0].wordEnd = false;
1517 numbreaks = libThai->brk_find_breaks(reinterpret_cast<const unsigned char *>(s.data()),
1518 break_positions.data(),
1519 static_cast<size_t>(break_positions.size()));
// Every reported break position is flagged as a word break, word start,
// word end and line-break opportunity.
1520 for (i = 0; i < numbreaks; ++i) {
1521 attributes[break_positions[i]].wordBreak = true;
1522 attributes[break_positions[i]].wordStart = true;
1523 attributes[break_positions[i]].wordEnd = true;
1524 attributes[break_positions[i]].lineBreak = true;
1525 }
// The last reported break position does not start a new word.
1526 if (numbreaks > 0)
1527 attributes[break_positions[numbreaks - 1]].wordStart = false;
1528
1529 /* manage grapheme boundaries */
// Walk the text cell by cell: the first unit of each cell is a grapheme
// boundary, the remaining units of the cell are not.
// NOTE(review): the loop advances by cell_length only; it relies on
// next_cell() returning a non-zero length for non-empty input - confirm
// against the libthai documentation.
1530 i = 0;
1531 while (i < len) {
1532 size_t cell_length =
1533 libThai->next_cell(reinterpret_cast<const unsigned char *>(s.data()) + i,
1534 size_t(len - i), &tis_cell, true);
1535
1536 attributes[i].graphemeBoundary = true;
1537 for (size_t j = 1; j < cell_length; ++j)
1538 attributes[i + j].graphemeBoundary = false;
1539
1540 i += cell_length;
1541 }
1542}
1543
1544#endif // QT_CONFIG(library)
1545
1546static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1547{
1548 assert(script == QChar::Script_Thai);
1549#if QT_CONFIG(library)
1550 const char16_t *uc = text + from;
1551 attributes += from;
1552 Q_UNUSED(script);
1553 thaiAssignAttributes(uc, len, attributes);
1554#else
1555 Q_UNUSED(script);
1556 Q_UNUSED(text);
1557 Q_UNUSED(from);
1558 Q_UNUSED(len);
1559 Q_UNUSED(attributes);
1560#endif
1561}
1562
1563/*
1564 tibetan syllables are of the form:
1565 head position consonant
1566 first sub-joined consonant
1567 ....intermediate sub-joined consonants (if any)
1568 last sub-joined consonant
1569 sub-joined vowel (a-chung U+0F71)
1570 standard or compound vowel sign (or 'virama' for devanagari transliteration)
1571*/
1572
1573typedef enum {
1580
1581/* this table starts at U+0f40 */
1582static const unsigned char tibetanForm[0x80] = {
1587
1592
1597
1602
1607
1612
1617
1622};
1623
1624#define tibetan_form(c) \
1625 ((c) >= 0x0f40 && (c) < 0x0fc0 ? (TibetanForm)tibetanForm[(c) - 0x0f40] : TibetanOther)
1626
// Scans the text starting at s[start] and returns the absolute index
// (start + number of units consumed) at which the syllable beginning at
// `start` ends. At least one unit is always consumed. `end` bounds the scan.
// NOTE(review): several lines of this function (the declaration of `state`
// and `newState`, and parts of the switch) are not visible in this view.
// NOTE(review): `*invalid` is unconditionally set to false at `finish`, so
// the `*invalid = true` assignment made earlier is never observable by the
// caller - confirm whether that is intended.
1627static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1628{
1629 const char16_t *uc = s + start;
1630
// `pos` is an offset relative to `start`, not an absolute index.
1631 qsizetype pos = 0;
1633
1634/* qDebug("state[%d]=%d (uc=%4x)", pos, state, uc[pos]);*/
1635 pos++;
1636
// Anything that does not start with a head consonant is a one-unit syllable.
1637 if (state != TibetanHeadConsonant) {
1638 if (state != TibetanOther)
1639 *invalid = true;
1640 goto finish;
1641 }
1642
1643 while (pos < end - start) {
1645 switch (newState) {
1648 if (state != TibetanHeadConsonant &&
1650 goto finish;
1651 state = newState;
1652 break;
1653 case TibetanVowel:
1654 if (state != TibetanHeadConsonant &&
1657 goto finish;
1658 break;
1659 case TibetanOther:
1661 goto finish;
1662 }
1663 pos++;
1664 }
1665
1666finish:
1667 *invalid = false;
1668 return start+pos;
1669}
1670
1671static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1672{
1673 qsizetype end = from + len;
1674 qsizetype i = 0;
1675 Q_UNUSED(script);
1676 attributes += from;
1677 while (i < len) {
1678 bool invalid;
1679 qsizetype boundary = tibetan_nextSyllableBoundary(text, from+i, end, &invalid) - from;
1680
1681 attributes[i].graphemeBoundary = true;
1682
1683 if (boundary > len-1) boundary = len;
1684 i++;
1685 while (i < boundary) {
1686 attributes[i].graphemeBoundary = false;
1687 ++i;
1688 }
1689 assert(i == boundary);
1690 }
1691}
1692
1695 Mymr_CC_CONSONANT = 1, /* Consonant of type 1, that has subscript form */
1696 Mymr_CC_CONSONANT2 = 2, /* Consonant of type 2, that has no subscript form */
1697 Mymr_CC_NGA = 3, /* Consonant NGA */
1698 Mymr_CC_YA = 4, /* Consonant YA */
1699 Mymr_CC_RA = 5, /* Consonant RA */
1700 Mymr_CC_WA = 6, /* Consonant WA */
1701 Mymr_CC_HA = 7, /* Consonant HA */
1702 Mymr_CC_IND_VOWEL = 8, /* Independent vowel */
1703 Mymr_CC_ZERO_WIDTH_NJ_MARK = 9, /* Zero Width non joiner character (0x200C) */
1704 Mymr_CC_VIRAMA = 10, /* Subscript consonant combining character */
1705 Mymr_CC_PRE_VOWEL = 11, /* Dependent vowel, prebase (Vowel e) */
1706 Mymr_CC_BELOW_VOWEL = 12, /* Dependent vowel, prebase (Vowel u, uu) */
1707 Mymr_CC_ABOVE_VOWEL = 13, /* Dependent vowel, prebase (Vowel i, ii, ai) */
1708 Mymr_CC_POST_VOWEL = 14, /* Dependent vowel, prebase (Vowel aa) */
1712 Mymr_CC_ZERO_WIDTH_J_MARK = 18, /* Zero width joiner character */
1713 Mymr_CC_COUNT = 19 /* This is the number of character classes */
1715
1718
1719 Mymr_CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
1720 Mymr_CF_MEDIAL = 0x02000000, /* flag to speed up comparing */
1721 Mymr_CF_IND_VOWEL = 0x04000000, /* flag to speed up comparing */
1722 Mymr_CF_DEP_VOWEL = 0x08000000, /* flag to speed up comparing */
1723 Mymr_CF_DOTTED_CIRCLE = 0x10000000, /* add a dotted circle if a character with this flag is the
1724 first in a syllable */
1725 Mymr_CF_VIRAMA = 0x20000000, /* flag to speed up comparing */
1726
1727 /* position flags */
1729 Mymr_CF_POS_BELOW = 0x00040000,
1730 Mymr_CF_POS_ABOVE = 0x00020000,
1731 Mymr_CF_POS_AFTER = 0x00010000,
1732 Mymr_CF_POS_MASK = 0x000f0000,
1733
1734 Mymr_CF_AFTER_KINZI = 0x00100000
1736
1738
1739/* Characters that get referred to by name */
1741{
1745 Mymr_C_RA = 0x101B,
1746 Mymr_C_YA = 0x101A,
1747 Mymr_C_NGA = 0x1004,
1749 Mymr_C_VIRAMA = 0x1039
1751
1752enum
1753{
1772
1773
1774typedef int MymrCharClass;
1775
1776
1778{
1780 Mymr_c1, Mymr_c1, Mymr_c2, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, /* 1000 - 100F */
1782 Mymr_c1, Mymr_c1, Mymr_ya, Mymr_ra, Mymr_c1, Mymr_wa, Mymr_c1, Mymr_ha, /* 1010 - 101F */
1784 Mymr_xx, Mymr_id, Mymr_id, Mymr_xx, Mymr_dr, Mymr_da, Mymr_da, Mymr_db, /* 1020 - 102F */
1786 Mymr_sp, Mymr_vi, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1030 - 103F */
1788 Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1040 - 104F */
1790 Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1050 - 105F */
1791};
1792
1793static MymrCharClass
1795{
1796 if (ch == Mymr_C_SIGN_ZWJ)
1798
1799 if (ch == Mymr_C_SIGN_ZWNJ)
1801
1802 if (ch < 0x1000 || ch > 0x105f)
1803 return Mymr_CC_RESERVED;
1804
1805 return mymrCharClasses[ch - 0x1000];
1806}
1807
1808static const signed char mymrStateTable[][Mymr_CC_COUNT] =
1809{
1810/* xx c1, c2 ng ya ra wa ha id zwnj vi dl db da dr sa sb sp zwj */
1811 { 1, 4, 4, 2, 4, 4, 4, 4, 24, 1, 27, 17, 18, 19, 20, 21, 1, 1, 4}, /* 0 - ground state */
1812 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sp to the right of the syllable) */
1813 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 17, 18, 19, 20, 21, -1, -1, 4}, /* 2 - NGA */
1814 {-1, 4, 4, 4, 4, 4, 4, 4, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 3 - Virama after NGA */
1815 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 17, 18, 19, 20, 21, 1, 1, -1}, /* 4 - Base consonant */
1816 {-2, 6, -2, -2, 7, 8, 9, 10, -2, 23, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 5 - First virama */
1817 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 25, 17, 18, 19, 20, 21, -1, -1, -1}, /* 6 - c1 after virama */
1818 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 7 - ya after virama */
1819 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 8 - ra after virama */
1820 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 9 - wa after virama */
1821 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 10 - ha after virama */
1822 {-1, -1, -1, -1, 7, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 11 - Virama after NGA+zwj */
1823 {-2, -2, -2, -2, -2, -2, 13, 14, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 12 - Second virama */
1824 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 17, 18, 19, 20, 21, -1, -1, -1}, /* 13 - wa after virama */
1825 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 14 - ha after virama */
1826 {-2, -2, -2, -2, -2, -2, -2, 16, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 15 - Third virama */
1827 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 16 - ha after virama */
1828 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 20, 21, 1, 1, -1}, /* 17 - dl, Dependent vowel e */
1829 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 19, -1, 21, 1, 1, -1}, /* 18 - db, Dependent vowel u,uu */
1830 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1}, /* 19 - da, Dependent vowel i,ii,ai */
1831 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, -1, -1, 1, 1, -1}, /* 20 - dr, Dependent vowel aa */
1832 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 21 - sa, Sign anusvara */
1833 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 22 - atha */
1834 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 23 - zwnj for atha */
1835 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 24 - Independent vowel */
1836 {-2, -2, -2, -2, 26, 26, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 25 - Virama after subscript consonant */
1837 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, 1, -1}, /* 26 - ra/ya after subscript consonant + virama */
1838 {-1, 6, -1, -1, 7, 8, 9, 10, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 27 - Virama after ground state */
1839/* exit state -2 is for invalid order of medials and combination of invalids
1840 with virama where virama should treat as start of next syllable
1841 */
1842};
1843
1844/*#define MYANMAR_DEBUG */
1845#ifdef MYANMAR_DEBUG
1846#define MMDEBUG qDebug
1847#else
1848# define MMDEBUG \
1849 if (0) \
1850 printf
1851#endif
1852
1853/*
1854// Given an input string of characters and a location in which to start looking
1855// calculate, using the state table, which one is the last character of the syllable
1856// that starts in the starting position.
1857*/
// Returns the index just past the syllable that starts at s[start]; the scan
// never goes beyond `end`.
// `*invalid` is first cleared, then set from the Mymr_CF_DOTTED_CIRCLE flag of
// the first character's class.
// NOTE(review): the declaration of `pos` and the state-table transition are on
// lines that are not visible in this view.
1858static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1859{
1860 const char16_t *uc = s + start;
1861 int state = 0;
1863 *invalid = false;
1864
1865 while (pos < end) {
1866 MymrCharClass charClass = getMyanmarCharClass(*uc);
1868 if (pos == start)
1869 *invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE);
1870
1871 MMDEBUG("state[%lld]=%d class=%8x (uc=%4x)", qlonglong(pos - start), state, charClass, *uc);
1872
// A negative state ends the syllable. States below -1 additionally step
// back one position before stopping.
1873 if (state < 0) {
1874 if (state < -1)
1875 --pos;
1876 break;
1877 }
1878 ++uc;
1879 ++pos;
1880 }
1881 return pos;
1882}
1883
1884static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1885{
1886 qsizetype end = from + len;
1887 qsizetype i = 0;
1888 Q_UNUSED(script);
1889 attributes += from;
1890 while (i < len) {
1891 bool invalid;
1892 qsizetype boundary = myanmar_nextSyllableBoundary(text, from+i, end, &invalid) - from;
1893
1894 attributes[i].graphemeBoundary = true;
1895 attributes[i].lineBreak = true;
1896
1897 if (boundary > len-1)
1898 boundary = len;
1899 i++;
1900 while (i < boundary) {
1901 attributes[i].graphemeBoundary = false;
1902 ++i;
1903 }
1904 assert(i == boundary);
1905 }
1906}
1907
1908/*
1909// Vocabulary
1910// Base -> A consonant or an independent vowel in its full (not subscript) form. It is the
1911// center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,
1912// split vowels, signs... but there is only one base in a syllable, it has to be coded as
1913// the first character of the syllable.
1914// split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).
1915// Khmer language has five of them. Khmer split vowels either have one part before the
1916// base and one after the base or they have a part before the base and a part above the base.
1917// The first part of all Khmer split vowels is the same character, identical to
1918// the glyph of Khmer dependent vowel SRA EI
1919// coeng --> modifier used in Khmer to construct coeng (subscript) consonants
1920// Differently than indian languages, the coeng modifies the consonant that follows it,
1921// not the one preceding it Each consonant has two forms, the base form and the subscript form
1922// the base form is the normal one (using the consonants code-point), the subscript form is
1923// displayed when the combination coeng + consonant is encountered.
1924// Consonant of type 1 -> A consonant whose subscript form only occupies space under a base consonant
1925// Consonant of type 2.-> Its subscript form occupies space under and before the base (only one, RO)
1926// Consonant of Type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)
1927// Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds
1928// if it is attached to a consonant of the first series or a consonant of the second series.
1929// Most consonants have an equivalent in the other series, but some of them exist only in
1930// one series (for example SA). If we want to use the consonant SA with a vowel sound that
1931// can only be done with a vowel sound that corresponds to a vowel accompanying a consonant
1932// of the other series, then we need to use a consonant shifter: TRIISAP or MUSIKATOAN
1933// (x17C9 and x17CA). TRIISAP changes a first series consonant to second series sound and
1934// MUSIKATOAN a second series consonant to have a first series vowel sound.
1935// Consonant shifters are both normally superscript marks, but, when they are followed by a
1936// superscript, they change shape and take the form of subscript dependent vowel SRA U.
1937// If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
1938// should be typed before the coeng. Unicode 4.0 breaks the standard and says that it should
1939// be placed after the coeng consonant.
1940// Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base
1941// Each vowel has its own position. Only one vowel per syllable is allowed.
1942// Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are
1943// Allowed in a syllable.
1944//
1945//
1946// order is important here! This order must be the same that is found in each horizontal
1947// line in the statetable for Khmer (see khmerStateTable) .
1948*/
1951 CC_CONSONANT = 1, /* Consonant of type 1 or independent vowel */
1952 CC_CONSONANT2 = 2, /* Consonant of type 2 */
1953 CC_CONSONANT3 = 3, /* Consonant of type 3 */
1954 CC_ZERO_WIDTH_NJ_MARK = 4, /* Zero Width non joiner character (0x200C) */
1956 CC_ROBAT = 6, /* Khmer special diacritic accent -treated differently in state table */
1957 CC_COENG = 7, /* Subscript consonant combining character */
1961 CC_ZERO_WIDTH_J_MARK = 11, /* Zero width joiner character */
1962 CC_COUNT = 12 /* This is the number of character classes */
1964
1965
1967 CF_CLASS_MASK = 0x0000FFFF,
1968
1969 CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
1970 CF_SPLIT_VOWEL = 0x02000000, /* flag for a split vowel -> the first part is added in front of the syllable */
1971 CF_DOTTED_CIRCLE = 0x04000000, /* add a dotted circle if a character with this flag is the first in a syllable */
1972 CF_COENG = 0x08000000, /* flag to speed up comparing */
1973 CF_SHIFTER = 0x10000000, /* flag to speed up comparing */
1974 CF_ABOVE_VOWEL = 0x20000000, /* flag to speed up comparing */
1975
1976 /* position flags */
1977 CF_POS_BEFORE = 0x00080000,
1978 CF_POS_BELOW = 0x00040000,
1979 CF_POS_ABOVE = 0x00020000,
1980 CF_POS_AFTER = 0x00010000,
1981 CF_POS_MASK = 0x000f0000
1983
1985
1986/* Characters that get referred to by name */
1988 C_SIGN_ZWNJ = 0x200C,
1989 C_SIGN_ZWJ = 0x200D,
1990 C_RO = 0x179A,
1991 C_VOWEL_AA = 0x17B6,
1993 C_VOWEL_E = 0x17C1,
1994 C_COENG = 0x17D2
1996
1997
1998/*
1999// simple classes, they are used in the statetable (in this file) to control the length of a syllable
2000// they are also used to know where a character should be placed (location in reference to the base character)
2001// and also to know if a character, when independently displayed, should be displayed with a dotted-circle to
2002// indicate error in syllable construction
2003*/
2004enum {
2018
2019 /* split vowel */
2023
2024
2025/*
2026// Character class: a character class value
2027// ORed with character class flags.
2028*/
2029typedef unsigned long KhmerCharClass;
2030
2031
2032/*
2033// Character class tables
2034// _xx character does not combine into syllable, such as numbers, punctuation marks, non-Khmer signs...
2035// _sa Sign placed above the base
2036// _sp Sign placed after the base
2037// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
2038// _c2 Consonant of type 2 (only RO)
2039// _c3 Consonant of type 3
2040// _rb Khmer sign robat u17CC. combining mark for subscript consonants
2041// _cd Consonant-shifter
2042// _dl Dependent vowel placed before the base (left of the base)
2043// _db Dependent vowel placed below the base
2044// _da Dependent vowel placed above the base
2045// _dr Dependent vowel placed behind the base (right of the base)
2046// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
2047// it to create a subscript consonant or independent vowel
2048// _va Khmer split vowel in which the first part is before the base and the second one above the base
2049// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
2050*/
2052 _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, /* 1780 - 178F */
2053 _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, /* 1790 - 179F */
2054 _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, /* 17A0 - 17AF */
2055 _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, /* 17B0 - 17BF */
2056 _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, /* 17C0 - 17CF */
2057 _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx /* 17D0 - 17DF */
2058};
2059
2060/* this enum must reflect the range of khmerCharClasses */
2063 KhmerLastChar = 0x17df
2065
2066/*
2067// Below we define how a character in the input string is either in the khmerCharClasses table
2068// (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear
2069// within the syllable, but are not in the table) we also get their type back, or an unknown object
2070// in which case we get _xx (CC_RESERVED) back
2071*/
2073{
2074 if (uc == C_SIGN_ZWJ) {
2075 return CC_ZERO_WIDTH_J_MARK;
2076 }
2077
2078 if (uc == C_SIGN_ZWNJ) {
2079 return CC_ZERO_WIDTH_NJ_MARK;
2080 }
2081
2082 if (uc < KhmerFirstChar || uc > KhmerLastChar) {
2083 return CC_RESERVED;
2084 }
2085
2086 return khmerCharClasses[uc - KhmerFirstChar];
2087}
2088
2089
2090/*
2091// The stateTable is used to calculate the end (the length) of a well
2092// formed Khmer Syllable.
2093//
2094// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
2095// CharClassValues. This coincidence of values allows the follow up of the table.
2096//
2097// Each line corresponds to a state, which does not necessarily need to be a type
2098// of component... for example, state 2 is a base, which is always the first character
2099// in the syllable, but the state could be produced by a consonant of any type when
2100// it is the first character that is analysed (in ground state).
2101//
2102// Differentiating 3 types of consonants is necessary in order to
2103// forbid the use of certain combinations, such as having a second
2104// coeng after a coeng RO,
2105// The inexistent possibility of having a type 3 after another type 3 is permitted,
2106// eliminating it would very much complicate the table, and it does not create typing
2107// problems, as the case above.
2108//
2109// The table is quite complex, in order to limit the number of coeng consonants
2110// to 2 (by means of the table).
2111//
2112// There is a peculiarity, as far as Unicode is concerned:
2113// - The consonant-shifter is considered in two possible different
2114// locations, the one considered in Unicode 3.0 and the one considered in
2115// Unicode 4.0. (there is a backwards compatibility problem in this standard).
2116//
2117//
2118// xx independent character, such as a number, punctuation sign or non-khmer char
2119//
2120// c1 Khmer consonant of type 1 or an independent vowel
2121// that is, a letter in which the subscript form is only under the
2122// base, not taking any space to the right or to the left
2123//
2124// c2 Khmer consonant of type 2, the coeng form takes space under
2125// and to the left of the base (only RO is of this type)
2126//
2127// c3 Khmer consonant of type 3. Its subscript form takes space under
2128// and to the right of the base.
2129//
2130// cs Khmer consonant shifter
2131//
2132// rb Khmer robat
2133//
2134// co coeng character (u17D2)
2135//
2136// dv dependent vowel (including split vowels, they are treated in the same way).
2137// even if dv is not defined above, the component that is really tested for is
2138// KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
2139//
2140// zwj Zero Width joiner
2141//
2142// zwnj Zero width non joiner
2143//
2144// sa above sign
2145//
2146// sp post sign
2147//
2148// there are lines with equal content but for an easier understanding
2149// (and maybe change in the future) we did not join them
2150*/
2151static const signed char khmerStateTable[][CC_COUNT] =
2152{
2153 /* xx c1 c2 c3 zwnj cs rb co dv sa sp zwj */
2154 { 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, /* 0 - ground state */
2155 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sign to the right of the syllable) */
2156 {-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, /* 2 - Base consonant */
2157 {-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, /* 3 - First ZWNJ before a register shifter It can only be followed by a shifter or a vowel */
2158 {-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, /* 4 - First register shifter */
2159 {-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, /* 5 - Robat */
2160 {-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, /* 6 - First Coeng */
2161 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 7 - First consonant of type 1 after coeng */
2162 {-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, /* 8 - First consonant of type 2 after coeng */
2163 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 9 - First consonant or type 3 after ceong */
2164 {-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, /* 10 - Second Coeng (no register shifter before) */
2165 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 11 - Second coeng consonant (or ind. vowel) no register shifter before */
2166 {-1, -1, -1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, /* 12 - Second ZWNJ before a register shifter */
2167 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 13 - Second register shifter */
2168 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 14 - ZWJ before vowel */
2169 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 15 - ZWNJ before vowel */
2170 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, /* 16 - dependent vowel */
2171 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, /* 17 - sign above */
2172 {-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, /* 18 - ZWJ after vowel */
2173 {-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 19 - Third coeng */
2174 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 20 - dependent vowel after a Robat */
2175};
2176
2177
2178/* #define KHMER_DEBUG */
2179#ifdef KHMER_DEBUG
2180#define KHDEBUG qDebug
2181#else
2182# define KHDEBUG \
2183 if (0) \
2184 printf
2185#endif
2186
2187/*
2188// Given an input string of characters and a location in which to start looking
2189// calculate, using the state table, which one is the last character of the syllable
2190// that starts in the starting position.
2191*/
// Returns the index just past the syllable that starts at s[start], driven by
// khmerStateTable; the scan never goes beyond `end`.
// `*invalid` is first cleared, then set for the first character when its class
// is non-zero and does not carry the CF_CONSONANT flag.
// NOTE(review): the declaration of `pos` is on a line that is not visible in
// this view.
2192static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2193{
2194 const char16_t *uc = s + start;
2195 int state = 0;
2197 *invalid = false;
2198
2199 while (pos < end) {
2200 KhmerCharClass charClass = getKhmerCharClass(*uc);
2201 if (pos == start) {
2202 *invalid = (charClass > 0) && ! (charClass & CF_CONSONANT);
2203 }
// Only the low bits selected by CF_CLASS_MASK index the state table; the
// remaining bits are flags.
2204 state = khmerStateTable[state][charClass & CF_CLASS_MASK];
2205
2206 KHDEBUG("state[%lld]=%d class=%8lx (uc=%4x)", qlonglong(pos - start), state,
2207 charClass, *uc );
2208
// A negative state means the current character does not belong to this
// syllable: stop without consuming it.
2209 if (state < 0) {
2210 break;
2211 }
2212 ++uc;
2213 ++pos;
2214 }
2215 return pos;
2216}
2217
2218static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2219{
2220 qsizetype end = from + len;
2221 qsizetype i = 0;
2222 Q_UNUSED(script);
2223 attributes += from;
2224 while ( i < len ) {
2225 bool invalid;
2226 qsizetype boundary = khmer_nextSyllableBoundary( text, from+i, end, &invalid ) - from;
2227
2228 attributes[i].graphemeBoundary = true;
2229
2230 if ( boundary > len-1 ) boundary = len;
2231 i++;
2232 while ( i < boundary ) {
2233 attributes[i].graphemeBoundary = false;
2234 ++i;
2235 }
2236 assert( i == boundary );
2237 }
2238}
2239
2240
2242// Script_Unknown,
2243 nullptr,
2244// Script_Inherited,
2245 nullptr,
2246// Script_Common,
2247 nullptr,
2248// Script_Latin,
2249 nullptr,
2250// Script_Greek,
2251 nullptr,
2252// Script_Cyrillic,
2253 nullptr,
2254// Script_Armenian,
2255 nullptr,
2256// Script_Hebrew,
2257 nullptr,
2258// Script_Arabic,
2259 nullptr,
2260// Script_Syriac,
2261 nullptr,
2262// Script_Thaana,
2263 nullptr,
2264// Script_Devanagari,
2266// Script_Bengali,
2268// Script_Gurmukhi,
2270// Script_Gujarati,
2272// Script_Oriya,
2274// Script_Tamil,
2276// Script_Telugu,
2278// Script_Kannada,
2280// Script_Malayalam,
2282// Script_Sinhala,
2284// Script_Thai,
2286// Script_Lao,
2287 nullptr,
2288// Script_Tibetan,
2290// Script_Myanmar,
2292// Script_Georgian,
2293 nullptr,
2294// Script_Hangul,
2295 nullptr,
2296// Script_Ethiopic,
2297 nullptr,
2298// Script_Cherokee,
2299 nullptr,
2300// Script_CanadianAboriginal,
2301 nullptr,
2302// Script_Ogham,
2303 nullptr,
2304// Script_Runic,
2305 nullptr,
2306// Script_Khmer,
2308};
2309
2310static void getCharAttributes(const char16_t *string, qsizetype stringLength,
2311 const QUnicodeTools::ScriptItem *items, qsizetype numItems,
2312 QCharAttributes *attributes)
2313{
2314 if (stringLength == 0)
2315 return;
2316 for (qsizetype i = 0; i < numItems; ++i) {
2317 QChar::Script script = items[i].script;
2318 if (script > QChar::Script_Khmer)
2319 script = QChar::Script_Common;
2320 CharAttributeFunction attributeFunction = charAttributeFunction[script];
2321 if (!attributeFunction)
2322 continue;
2323 qsizetype end = i < numItems - 1 ? items[i + 1].position : stringLength;
2324 attributeFunction(script, string, items[i].position, end - items[i].position, attributes);
2325 }
2326}
2327
2328}
2329
// Computes the character attributes selected by `options` for `string` and
// stores them in `attributes`.
// Unless DontClearAttributes is set, string.size() + 1 entries of `attributes`
// are zeroed first, so the array must provide at least that many entries.
// Does nothing for an empty string.
2330Q_CORE_EXPORT void initCharAttributes(QStringView string,
2331 const ScriptItem *items, qsizetype numItems,
2332 QCharAttributes *attributes, CharAttributeOptions options)
2333{
2334 if (string.size() <= 0)
2335 return;
2336
2337 if (!(options & DontClearAttributes))
2338 ::memset(attributes, 0, (string.size() + 1) * sizeof(QCharAttributes));
2339
// Each pass below runs only when its option flag is set.
2340 if (options & GraphemeBreaks)
2341 getGraphemeBreaks(string.utf16(), string.size(), attributes);
2342 if (options & WordBreaks)
2343 getWordBreaks(string.utf16(), string.size(), attributes);
2344 if (options & SentenceBreaks)
2345 getSentenceBreaks(string.utf16(), string.size(), attributes);
2346 if (options & LineBreaks)
2347 getLineBreaks(string.utf16(), string.size(), attributes, options);
2348 if (options & WhiteSpaces)
2349 getWhiteSpaces(string.utf16(), string.size(), attributes);
2350
// NOTE(review): the line that opens the scope closed at the end of this
// function is not visible in this view. The tailored pass runs last and
// is skipped when no script items are supplied.
2352 if (!items || numItems <= 0)
2353 return;
2354
2355 Tailored::getCharAttributes(string.utf16(), string.size(), items, numItems, attributes);
2356 }
2357}
2358
2359
2360// ----------------------------------------------------------------------------
2361//
2362// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
2363//
2364// ----------------------------------------------------------------------------
2365
// Splits `string` into runs of a single script and appends one ScriptItem
// (start position, script) per run to `scripts`.
// `sor` is the start of the current run; `eor` follows the scan position
// (it is updated to `i` after every iteration).
// NOTE(review): the declaration of `script`, the lookup that yields `prop`
// and the definition of `test` are on lines not visible in this view.
2366Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
2367{
2368 qsizetype sor = 0;
2369 qsizetype eor = 0;
2371
2372 for (qsizetype i = 0; i < string.size(); ++i, eor = i) {
// Combine a valid surrogate pair into a single code point and step over
// the low surrogate.
2373 char32_t ucs4 = string[i].unicode();
2374 if (QChar::isHighSurrogate(ucs4) && i + 1 < string.size()) {
2375 ushort low = string[i + 1].unicode();
2376 if (QChar::isLowSurrogate(low)) {
2377 ucs4 = QChar::surrogateToUcs4(ucs4, low);
2378 ++i;
2379 }
2380 }
2381
2383
2384 QChar::Script nscript = QChar::Script(prop->script);
2385
// Same script as the current run, or a script value up to Script_Common:
// the character stays in the current run.
2386 if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common))
2387 continue;
2388
2389 // inherit preceding Common-s
2390 if (Q_UNLIKELY(script <= QChar::Script_Common)) {
2391 // also covers a case where the base character of Common script followed
2392 // by one or more combining marks of non-Inherited, non-Common script
2393 script = nscript;
2394 continue;
2395 }
2396
2397 // Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
2398 // Thus, a combining mark - whatever its script property value is - should inherit
2399 // the script property value of its base character.
2401 if (Q_UNLIKELY(FLAG(prop->category) & test))
2402 continue;
2403
// Script change: close the run [sor, eor) and start a new one at eor.
2405 Q_ASSERT(sor < eor);
2406 scripts->append(ScriptItem{sor, script});
2407 sor = eor;
2408
2409 script = nscript;
2410 }
2411
// Emit the final run, which extends to the end of the string.
2412 Q_ASSERT(script >= QChar::Script_Common);
2413 Q_ASSERT(eor == string.size());
2414 scripts->append(ScriptItem{sor, script});
2415}
2416
2417} // namespace QUnicodeTools
2418
static constexpr char32_t surrogateToUcs4(char16_t high, char16_t low) noexcept
Converts a UTF16 surrogate pair with the given high and low values to it's UCS-4-encoded code point.
Definition qchar.h:508
Category
This enum maps the Unicode character categories.
Definition qchar.h:104
@ Mark_SpacingCombining
Definition qchar.h:106
@ Symbol_Math
Definition qchar.h:137
@ Mark_NonSpacing
Definition qchar.h:105
@ Mark_Enclosing
Definition qchar.h:107
@ Other_NotAssigned
Definition qchar.h:121
constexpr bool isLowSurrogate() const noexcept
Returns true if the QChar is the low part of a UTF16 surrogate (for example if its code point is in r...
Definition qchar.h:480
Script
Definition qchar.h:144
@ Script_Tamil
Definition qchar.h:162
@ Script_Thai
Definition qchar.h:167
@ Script_Kannada
Definition qchar.h:164
@ Script_Common
Definition qchar.h:147
@ Script_Malayalam
Definition qchar.h:165
@ Script_Bengali
Definition qchar.h:158
@ Script_Khmer
Definition qchar.h:178
@ Script_Sinhala
Definition qchar.h:166
constexpr bool isSpace() const noexcept
Returns true if the character is a separator character (Separator_* categories or certain code points...
Definition qchar.h:466
constexpr bool isHighSurrogate() const noexcept
Returns true if the QChar is the high part of a UTF16 surrogate (for example if its code point is in ...
Definition qchar.h:479
\inmodule QtCore \reentrant
Definition qlibrary.h:17
\inmodule QtCore
Definition qstringview.h:76
void append(const T &t)
const QLoggingCategory & category()
[1]
QString text
else opt state
[0]
void newState(QList< State > &states, const char *token, const char *lexem, bool pre)
short next
Definition keywords.cpp:445
Combined button and popup list for selecting options.
Q_DECL_CONST_FUNCTION Q_CORE_EXPORT const Properties *QT_FASTCALL properties(char32_t ucs4) noexcept
@ GraphemeBreak_Extended_Pictographic
static bool shouldBreakBetweenClasses(QUnicodeTables::GraphemeBreakClass first, QUnicodeTables::GraphemeBreakClass second)
static const GBTableEntryType breakTable[QUnicodeTables::NumGraphemeBreakClasses]
static const GBTableEntryType Extend_SpacingMark_ZWJ
static const GBTableEntryType HardBreak
Class toClass(QUnicodeTables::LineBreakClass lbc, QChar::Category category)
static const uchar actionTable[CLCP+1][CLCP+1]
static const uchar breakTable[QUnicodeTables::LineBreak_ZWJ][QUnicodeTables::LineBreak_ZWJ]
static const uchar breakTable[BAfter+1][QUnicodeTables::NumSentenceBreakClasses]
static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
const CharAttributeFunction charAttributeFunction[]
static const MymrCharClass mymrCharClasses[]
static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static const signed char mymrStateTable[][Mymr_CC_COUNT]
static MymrCharClass getMyanmarCharClass(ushort ch)
static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static const unsigned char tibetanForm[0x80]
static const KhmerCharClass khmerCharClasses[]
static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static const signed char khmerStateTable[][CC_COUNT]
static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static const unsigned char indicForms[0xe00-0x900]
static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
void(*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes) CharAttributeFunction
static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static void getCharAttributes(const char16_t *string, qsizetype stringLength, const QUnicodeTools::ScriptItem *items, qsizetype numItems, QCharAttributes *attributes)
static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static KhmerCharClass getKhmerCharClass(ushort uc)
static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses]
static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
Q_CORE_EXPORT void initCharAttributes(QStringView string, const ScriptItem *items, qsizetype numItems, QCharAttributes *attributes, CharAttributeOptions options)
static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
#define assert
#define FLAG(x)
Definition qchar.cpp:23
#define Q_FALLTHROUGH()
#define Q_UNLIKELY(x)
#define Q_LIKELY(x)
DBusConnection const char DBusError DBusBusType DBusError return DBusConnection DBusHandleMessageFunction void DBusFreeFunction return DBusConnection return DBusConnection return const char DBusError return DBusConnection DBusMessage dbus_uint32_t return DBusConnection dbus_bool_t DBusConnection DBusAddWatchFunction DBusRemoveWatchFunction DBusWatchToggledFunction void DBusFreeFunction return DBusConnection DBusDispatchStatusFunction void DBusFreeFunction DBusTimeout return DBusTimeout return DBusWatch return DBusWatch unsigned int return DBusError const DBusError return const DBusMessage return DBusMessage return DBusMessage return DBusMessage return DBusMessage return DBusMessage return DBusMessageIter int const void return DBusMessageIter DBusMessageIter return DBusMessageIter void DBusMessageIter void int return DBusMessage DBusMessageIter return DBusMessageIter return DBusMessageIter DBusMessageIter const char const char const char const char return DBusMessage return DBusMessage const char return DBusMessage dbus_bool_t return DBusMessage dbus_uint32_t return DBusMessage void
#define Q_DECLARE_MIXED_ENUM_OPERATORS(Ret, Flags, Enum)
Definition qflags.h:241
#define Q_GLOBAL_STATIC(TYPE, NAME,...)
#define NS(x)
Definition qmetatype.cpp:65
GLenum GLuint GLintptr GLsizeiptr size
[1]
GLuint GLuint end
GLdouble GLdouble GLdouble GLdouble top
GLuint start
GLint first
GLenum GLsizei len
GLuint64EXT * result
[6]
GLdouble s
[6]
Definition qopenglext.h:235
static qreal position(const QQuickItem *item, QQuickAnchors::Anchor anchorLine)
#define Q_ASSERT(cond)
Definition qrandom.cpp:47
static QString lineBreak(QString s)
Definition main.cpp:652
#define Q_AUTOTEST_EXPORT
#define Q_UNUSED(x)
unsigned char uchar
Definition qtypes.h:27
unsigned short quint16
Definition qtypes.h:43
ptrdiff_t qsizetype
Definition qtypes.h:70
unsigned int uint
Definition qtypes.h:29
unsigned short ushort
Definition qtypes.h:28
qint64 qlonglong
Definition qtypes.h:58
#define KHDEBUG
#define IDEBUG
constexpr int qt_initcharattributes_default_algorithm_only
#define tibetan_form(c)
#define MMDEBUG
QList< QTreeWidgetItem * > items