1
/****************************************************************************
3
** Copyright (C) 2016 The Qt Company Ltd.
4
** Contact: https://www.qt.io/licensing/
6
** This file is part of the QtCore module of the Qt Toolkit.
8
** $QT_BEGIN_LICENSE:LGPL$
9
** Commercial License Usage
10
** Licensees holding valid commercial Qt licenses may use this file in
11
** accordance with the commercial license agreement provided with the
12
** Software or, alternatively, in accordance with the terms contained in
13
** a written agreement between you and The Qt Company. For licensing terms
14
** and conditions see https://www.qt.io/terms-conditions. For further
15
** information use the contact form at https://www.qt.io/contact-us.
17
** GNU Lesser General Public License Usage
18
** Alternatively, this file may be used under the terms of the GNU Lesser
19
** General Public License version 3 as published by the Free Software
20
** Foundation and appearing in the file LICENSE.LGPL3 included in the
21
** packaging of this file. Please review the following information to
22
** ensure the GNU Lesser General Public License version 3 requirements
23
** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25
** GNU General Public License Usage
26
** Alternatively, this file may be used under the terms of the GNU
27
** General Public License version 2.0 or (at your option) the GNU General
28
** Public license version 3 or any later version approved by the KDE Free
29
** Qt Foundation. The licenses are as published by the Free Software
30
** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31
** included in the packaging of this file. Please review the following
32
** information to ensure the GNU General Public License requirements will
33
** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34
** https://www.gnu.org/licenses/gpl-3.0.html.
38
****************************************************************************/
40
#include "qunicodetools_p.h"
42
#include "qunicodetables_p.h"
43
#include "qvarlengtharray.h"
45
#include "qharfbuzz_p.h"
47
// Single-bit mask with bit number x set. Used further down to test a
// character's category against a small set of categories in one AND operation.
#define FLAG(x) (1 << (x))
51
// Test hook. When non-zero, initCharAttributes() does not call
// HB_GetTailoredCharAttributes(), and (in QT_BUILD_INTERNAL builds)
// getWordBreaks() remaps FULL STOP and COLON to MidNumLet / MidLetter.
Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = 0;
53
namespace QUnicodeTools {
55
// -----------------------------------------------------------------------------------------------------
57
// The text boundaries determination algorithm.
58
// See http://www.unicode.org/reports/tr29/tr29-27.html
60
// -----------------------------------------------------------------------------------------------------
64
// Grapheme cluster boundary table.
// getGraphemeBreaks() reads it as breakTable[lcls][cls]: the row is selected by
// the class carried in lcls, the column by the class of the character being
// examined. A true entry makes getGraphemeBreaks() set graphemeBoundary for
// that character.
static const uchar breakTable[QUnicodeTables::GraphemeBreak_LVT + 1][QUnicodeTables::GraphemeBreak_LVT + 1] = {
65
// Other CR LF Control Extend RI Prepend S-Mark L V T LV LVT
66
{ true , true , true , true , false, true , true , false, true , true , true , true , true }, // Other
67
{ true , true , false, true , true , true , true , true , true , true , true , true , true }, // CR
68
{ true , true , true , true , true , true , true , true , true , true , true , true , true }, // LF
69
{ true , true , true , true , true , true , true , true , true , true , true , true , true }, // Control
70
{ true , true , true , true , false, true , true , false, true , true , true , true , true }, // Extend
71
{ true , true , true , true , false, false, true , false, true , true , true , true , true }, // RegionalIndicator
72
{ false, true , true , true , false, false, false, false, false, false, false, false, false }, // Prepend
73
{ true , true , true , true , false, true , true , false, true , true , true , true , true }, // SpacingMark
74
{ true , true , true , true , false, true , true , false, false, false, true , false, false }, // L
75
{ true , true , true , true , false, true , true , false, true , false, false, true , true }, // V
76
{ true , true , true , true , false, true , true , false, true , true , false, true , true }, // T
77
{ true , true , true , true , false, true , true , false, true , false, false, true , true }, // LV
78
{ true , true , true , true , false, true , true , false, true , true , false, true , true }, // LVT
83
// Marks grapheme cluster boundaries for the UTF-16 text string[0, len).
// The result is written to attributes[...].graphemeBoundary. attributes[len]
// is written as well, so the array must hold at least len + 1 entries.
static void getGraphemeBreaks(const ushort *string, quint32 len, QCharAttributes *attributes)
85
QUnicodeTables::GraphemeBreakClass lcls = QUnicodeTables::GraphemeBreak_LF; // to meet GB1
86
for (quint32 i = 0; i != len; ++i) {
88
uint ucs4 = string[i];
89
// a high surrogate followed by a low surrogate is decoded into one code point
if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
90
ushort low = string[i + 1];
91
if (QChar::isLowSurrogate(low)) {
92
ucs4 = QChar::surrogateToUcs4(ucs4, low);
97
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
98
QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
100
// table lookup: row = lcls, column = class of the current code point
// NOTE(review): pos is presumably the index of the code point's first code unit - confirm
if (Q_LIKELY(GB::breakTable[lcls][cls]))
101
attributes[pos].graphemeBoundary = true;
106
attributes[len].graphemeBoundary = true; // GB2
119
// Word boundary action table.
// getWordBreaks() reads it as breakTable[cls][ncls]: the row is selected by the
// class carried in cls, the column by the class of the character being examined.
// Each entry is one of the actions Break, NoBreak, Lookup or LookupW;
// getWordBreaks() sets wordBreak only when the final action is Break.
static const uchar breakTable[QUnicodeTables::WordBreak_ExtendNumLet + 1][QUnicodeTables::WordBreak_ExtendNumLet + 1] = {
120
// Other CR LF Newline Extend RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtendNumLet
121
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Other
122
{ Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
123
{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
124
{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
125
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
126
{ Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
127
{ Break , Break , Break , Break , NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // Katakana
128
{ Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak }, // HebrewLetter
129
{ Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak }, // ALetter
130
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
131
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
132
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
133
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
134
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
135
{ Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak }, // Numeric
136
{ Break , Break , Break , Break , NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak }, // ExtendNumLet
141
// Marks word boundaries for the UTF-16 text string[0, len).
// Sets wordBreak, wordStart and wordEnd in attributes. attributes[len] is
// written as well, so the array must hold at least len + 1 entries.
static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *attributes)
144
// kind of word currently open; WordTypeNone means no word is open
WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
145
} currentWordType = WordTypeNone;
147
// cls selects the row of WB::breakTable for the next lookup
QUnicodeTables::WordBreakClass cls = QUnicodeTables::WordBreak_LF; // to meet WB1
148
for (quint32 i = 0; i != len; ++i) {
150
uint ucs4 = string[i];
151
// a high surrogate followed by a low surrogate is decoded into one code point
if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
152
ushort low = string[i + 1];
153
if (QChar::isLowSurrogate(low)) {
154
ucs4 = QChar::surrogateToUcs4(ucs4, low);
159
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
160
QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
161
#ifdef QT_BUILD_INTERNAL
162
if (qt_initcharattributes_default_algorithm_only) {
163
// as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
164
// which caused "hi.there" to be treated as if it were just a single word;
165
// we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
166
// and this code is needed to pass the coverage tests; remove once the issue is fixed.
167
if (ucs4 == 0x002E) // FULL STOP
168
ncls = QUnicodeTables::WordBreak_MidNumLet;
169
else if (ucs4 == 0x003A) // COLON
170
ncls = QUnicodeTables::WordBreak_MidLetter;
174
// initial action for the pair (cls, ncls); it may be refined below
uchar action = WB::breakTable[cls][ncls];
179
if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend)) {
180
// WB4: X(Extend|Format)* -> X
186
// look ahead past the current character, decoding surrogate pairs the same way as above
for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) {
187
ucs4 = string[lookahead];
188
if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
189
ushort low = string[lookahead + 1];
190
if (QChar::isLowSurrogate(low)) {
191
ucs4 = QChar::surrogateToUcs4(ucs4, low);
196
prop = QUnicodeTables::properties(ucs4);
197
QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
199
if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend)) {
200
// WB4: X(Extend|Format)* -> X
204
// no break if the looked-ahead class equals cls, or, for LookupW,
// if it is a Hebrew or other alphabetic letter
if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
205
|| tcls == QUnicodeTables::WordBreak_ALetter)))) {
208
action = WB::NoBreak;
212
if (action != WB::NoBreak) {
214
if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_SingleQuote && cls == QUnicodeTables::WordBreak_HebrewLetter))
215
action = WB::NoBreak; // WB7a
221
// a break closes the word that is open (if any); the class of the
// current character then decides whether a new word starts here
// NOTE(review): pos is presumably the index of the code point's first code unit - confirm
if (action == WB::Break) {
222
attributes[pos].wordBreak = true;
223
if (currentWordType != WordTypeNone)
224
attributes[pos].wordEnd = true;
226
case QUnicodeTables::WordBreak_Katakana:
227
currentWordType = WordTypeHiraganaKatakana;
228
attributes[pos].wordStart = true;
230
case QUnicodeTables::WordBreak_HebrewLetter:
231
case QUnicodeTables::WordBreak_ALetter:
232
case QUnicodeTables::WordBreak_Numeric:
233
currentWordType = WordTypeAlphaNumeric;
234
attributes[pos].wordStart = true;
237
currentWordType = WordTypeNone;
243
// end of text: close a word that is still open
if (currentWordType != WordTypeNone)
244
attributes[len].wordEnd = true;
245
attributes[len].wordBreak = true; // WB2
268
// Sentence boundary state transition table.
// getSentenceBreaks() reads it as breakTable[state][ncls]: the row is the
// current state, the column the sentence break class of the character being
// examined, and the entry is the resulting state. The entries Lookup and Break
// are handled specially by getSentenceBreaks().
static const uchar breakTable[BAfter + 1][QUnicodeTables::SentenceBreak_Close + 1] = {
269
// Other CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
270
{ Initial, BAfterC, BAfter , BAfter , Initial, Initial, Lower , Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial
271
{ Initial, BAfterC, BAfter , BAfter , Lower , Initial, Initial, Initial, Initial, Initial, LUATerm, Initial, STerm , Initial }, // Lower
272
{ Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, LUATerm, STerm , STerm , Initial }, // Upper
274
{ Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
275
{ Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
276
{ Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
277
{ Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
279
{ Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
280
{ Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
281
{ Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
282
{ Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
283
{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
288
// Marks sentence boundaries for the UTF-16 text string[0, len).
// Drives a state machine over SB::breakTable and writes the result to
// attributes[...].sentenceBoundary. attributes[len] is written as well, so the
// array must hold at least len + 1 entries.
static void getSentenceBreaks(const ushort *string, quint32 len, QCharAttributes *attributes)
290
uchar state = SB::BAfter; // to meet SB1
291
for (quint32 i = 0; i != len; ++i) {
293
uint ucs4 = string[i];
294
// a high surrogate followed by a low surrogate is decoded into one code point
if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
295
ushort low = string[i + 1];
296
if (QChar::isLowSurrogate(low)) {
297
ucs4 = QChar::surrogateToUcs4(ucs4, low);
302
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
303
QUnicodeTables::SentenceBreakClass ncls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
305
// state must be a valid row index of SB::breakTable before the transition
Q_ASSERT(state <= SB::BAfter);
306
state = SB::breakTable[state][ncls];
307
if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
309
// the table cannot decide from one character: scan the following characters,
// decoding surrogate pairs the same way as above
for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) {
310
ucs4 = string[lookahead];
311
if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
312
ushort low = string[lookahead + 1];
313
if (QChar::isLowSurrogate(low)) {
314
ucs4 = QChar::surrogateToUcs4(ucs4, low);
319
prop = QUnicodeTables::properties(ucs4);
320
QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
322
case QUnicodeTables::SentenceBreak_Other:
323
case QUnicodeTables::SentenceBreak_Extend:
324
case QUnicodeTables::SentenceBreak_Sp:
325
case QUnicodeTables::SentenceBreak_Numeric:
326
case QUnicodeTables::SentenceBreak_SContinue:
327
case QUnicodeTables::SentenceBreak_Close:
329
case QUnicodeTables::SentenceBreak_Lower:
339
// a boundary was found: record it and restart the state machine from
// Initial with the current character's class
// NOTE(review): pos is presumably the index of the code point's first code unit - confirm
if (Q_UNLIKELY(state == SB::Break)) {
340
attributes[pos].sentenceBoundary = true;
341
state = SB::breakTable[SB::Initial][ncls];
345
attributes[len].sentenceBoundary = true; // SB2
349
// -----------------------------------------------------------------------------------------------------
351
// The line breaking algorithm.
352
// See http://www.unicode.org/reports/tr14/tr14-35.html
354
// -----------------------------------------------------------------------------------------------------
358
namespace NS { // Number Sequence
360
// LB25 recommends to not break lines inside numbers of the form
361
// described by the following regular expression:
362
// (PR|PO)?(OP|HY)?NU(NU|SY|IS)*(CL|CP)?(PR|PO)?
380
// Action table for tracking number sequences (LB25).
// getLineBreaks() reads it as actionTable[nelast][necur]: the row is the
// number-sequence class stored in nelast, the column the class of the
// character being examined (as produced by toClass()). Each entry is one of
// None, Start, Continue or Break.
static const uchar actionTable[CLCP + 1][CLCP + 1] = {
381
// XX PRPO OPHY NU SYIS CLCP
382
{ None , Start , Start , Start , None , None }, // XX
383
{ None , Start , Continue, Continue, None , None }, // PRPO
384
{ None , Start , Start , Continue, None , None }, // OPHY
385
{ Break , Break , Break , Continue, Continue, Continue }, // NU
386
{ Break , Break , Break , Continue, Continue, Continue }, // SYIS
387
{ Break , Continue, Break , Break , Break , Break }, // CLCP
390
// Maps a line break class, together with the character's general category,
// to a number-sequence Class. The category is only consulted for AL.
// NOTE(review): the grouping of the case labels suggests PR/PO, OP/HY, NU,
// SY/IS and CL/CP map to the like-named Class values - confirm against the
// return statements.
inline Class toClass(QUnicodeTables::LineBreakClass lbc, QChar::Category category)
393
case QUnicodeTables::LineBreak_AL:// case QUnicodeTables::LineBreak_AI:
394
// resolve AI math symbols in numerical context to IS
395
if (category == QChar::Symbol_Math)
398
case QUnicodeTables::LineBreak_PR: case QUnicodeTables::LineBreak_PO:
400
case QUnicodeTables::LineBreak_OP: case QUnicodeTables::LineBreak_HY:
402
case QUnicodeTables::LineBreak_NU:
404
case QUnicodeTables::LineBreak_SY: case QUnicodeTables::LineBreak_IS:
406
case QUnicodeTables::LineBreak_CL: case QUnicodeTables::LineBreak_CP:
416
/* In order to support the tailored implementation of LB25 properly
417
the following changes were made in the pair table to allow breaks
418
where the numeric expression doesn't match the template (i.e. [^NU](IS|SY)NU):
419
(CL)(PO) from IB to DB
420
(CP)(PO) from IB to DB
421
(CL)(PR) from IB to DB
422
(CP)(PR) from IB to DB
423
(PO)(OP) from IB to DB
424
(PR)(OP) from IB to DB
425
(IS)(NU) from IB to DB
426
(SY)(NU) from IB to DB
429
/* In order to implement LB21a properly a special rule HH has been introduced and
430
the following changes were made in the pair table to disallow breaks after Hebrew + Hyphen:
431
(HL)(HY|BA) from IB to CI
432
(HY|BA)(!CB) from DB to HH
436
// Actions stored in the line break pair table, each with a two-letter alias
// used inside the table. How getLineBreaks() treats each one:
// PB: handled by its own case in getLineBreaks(); no break is set for it in the visible code
ProhibitedBreak, PB = ProhibitedBreak,
437
// DB: a break opportunity is always set
DirectBreak, DB = DirectBreak,
438
// IB: a break opportunity is set only when the preceding class is SP
IndirectBreak, IB = IndirectBreak,
439
// CI: like IB after SP; otherwise the class state is left unchanged
CombiningIndirectBreak, CI = CombiningIndirectBreak,
440
// CP: no break; unless the preceding class is SP the class state is left unchanged
CombiningProhibitedBreak, CP = CombiningProhibitedBreak,
441
// HH: a break opportunity is set unless the preceding class is HL
ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen
444
// Line break pair table.
// getLineBreaks() reads it as breakTable[cls][ncls]: the row is the class
// carried in cls, the column the class of the character being examined.
// Classes at or beyond LineBreak_SA are looked up as AL by the caller.
// Each entry is one of the actions PB, DB, IB, CI, CP or HH.
static const uchar breakTable[QUnicodeTables::LineBreak_CB + 1][QUnicodeTables::LineBreak_CB + 1] = {
445
/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB */
446
/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB },
447
/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
448
/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
449
/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB },
450
/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB },
451
/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
452
/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
453
/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
454
/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
455
/* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB },
456
/* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
457
/* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
458
/* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
459
/* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
460
/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
461
/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
462
/* HY */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB },
463
/* BA */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB },
464
/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB },
465
/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
466
/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
467
/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
468
/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB },
469
/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB },
470
/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB },
471
/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB },
472
/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB },
473
/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB },
474
/* RI */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB },
475
/* CB */ { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }
478
// The following line break classes are not treated by the pair table
479
// and must be resolved outside:
480
// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX
484
// Marks line break opportunities for the UTF-16 text string[0, len).
// Sets lineBreak and mandatoryBreak in attributes. attributes[len] is written
// as well, so the array must hold at least len + 1 entries. Break
// opportunities inside a number sequence (LB25) are cleared again once the
// sequence ends.
static void getLineBreaks(const ushort *string, quint32 len, QCharAttributes *attributes)
487
// number-sequence class used as the row index of LB::NS::actionTable
LB::NS::Class nelast = LB::NS::XX;
489
QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
490
QUnicodeTables::LineBreakClass cls = lcls;
491
for (quint32 i = 0; i != len; ++i) {
493
uint ucs4 = string[i];
494
// a high surrogate followed by a low surrogate is decoded into one code point
if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
495
ushort low = string[i + 1];
496
if (QChar::isLowSurrogate(low)) {
497
ucs4 = QChar::surrogateToUcs4(ucs4, low);
502
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
503
QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass;
505
if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
506
// LB1: resolve SA to AL, except those of category Mn or Mc, which are resolved to CM
507
static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
508
if (FLAG(prop->category) & test)
509
ncls = QUnicodeTables::LineBreak_CM;
511
if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM)) {
512
// LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL
513
if (lcls == QUnicodeTables::LineBreak_ZW || lcls >= QUnicodeTables::LineBreak_SP)
514
ncls = QUnicodeTables::LineBreak_AL;
517
if (Q_LIKELY(ncls != QUnicodeTables::LineBreak_CM)) {
518
// LB25: do not break lines inside numbers
519
LB::NS::Class necur = LB::NS::toClass(ncls, (QChar::Category)prop->category);
520
switch (LB::NS::actionTable[nelast][necur]) {
522
// do not change breaks before and after the expression
523
// NOTE(review): nestart is presumably the position where the number sequence started - confirm
for (quint32 j = nestart + 1; j < pos; ++j)
524
attributes[j].lineBreak = false;
527
nelast = LB::NS::XX; // reset state
538
if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
539
// LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
540
if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
541
attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
545
if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
546
if (ncls > QUnicodeTables::LineBreak_SP)
547
goto next; // LB6: x(BK|CR|LF|NL)
548
goto next_no_cls_update; // LB7: xSP
551
// for South East Asian chars that require a complex analysis, the Unicode
552
// standard recommends treating them as AL; a tailoring that does dictionary analysis can override this
553
if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
554
cls = QUnicodeTables::LineBreak_AL;
556
// pair table lookup; a class at or beyond SA in the column position is looked up as AL
switch (LB::breakTable[cls][ncls < QUnicodeTables::LineBreak_SA ? ncls : QUnicodeTables::LineBreak_AL]) {
557
case LB::DirectBreak:
558
attributes[pos].lineBreak = true;
560
case LB::IndirectBreak:
561
// break only when the preceding class is SP
if (lcls == QUnicodeTables::LineBreak_SP)
562
attributes[pos].lineBreak = true;
564
case LB::CombiningIndirectBreak:
565
if (lcls != QUnicodeTables::LineBreak_SP)
566
goto next_no_cls_update;
567
attributes[pos].lineBreak = true;
569
case LB::CombiningProhibitedBreak:
570
if (lcls != QUnicodeTables::LineBreak_SP)
571
goto next_no_cls_update;
573
case LB::ProhibitedBreakAfterHebrewPlusHyphen:
574
// break is allowed unless the preceding class is a Hebrew letter
if (lcls != QUnicodeTables::LineBreak_HL)
575
attributes[pos].lineBreak = true;
577
case LB::ProhibitedBreak:
589
// end of text: if a number sequence is still being tracked, close it as if an XX class followed
if (Q_UNLIKELY(LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break)) {
590
// LB25: do not break lines inside numbers
591
for (quint32 j = nestart + 1; j < len; ++j)
592
attributes[j].lineBreak = false;
595
attributes[0].lineBreak = attributes[0].mandatoryBreak = false; // LB2
596
attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
600
// Sets attributes[i].whiteSpace for every position i in string[0, len) whose
// code point satisfies QChar::isSpace(). Only positions that test true are
// written; all other entries are left untouched.
static void getWhiteSpaces(const ushort *string, quint32 len, QCharAttributes *attributes)
602
for (quint32 i = 0; i != len; ++i) {
603
uint ucs4 = string[i];
604
// a high surrogate followed by a low surrogate is decoded into one code point
if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
605
ushort low = string[i + 1];
606
if (QChar::isLowSurrogate(low)) {
607
ucs4 = QChar::surrogateToUcs4(ucs4, low);
612
if (Q_UNLIKELY(QChar::isSpace(ucs4)))
613
attributes[i].whiteSpace = true;
618
// Computes the character attributes requested in options for the UTF-16 text
// string[0, length) and stores them in attributes, which must hold
// length + 1 entries (that many are cleared below).
// Unless options contains DontClearAttributes, the array is zeroed first.
// Unless qt_initcharattributes_default_algorithm_only is set, the script items
// are merged into runs of equal script and passed to
// HB_GetTailoredCharAttributes() after the default algorithms have run.
Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length,
619
const ScriptItem *items, int numItems,
620
QCharAttributes *attributes, CharAttributeOptions options)
625
if (!(options & DontClearAttributes))
626
::memset(attributes, 0, (length + 1) * sizeof(QCharAttributes));
628
// run each default algorithm that was requested
if (options & GraphemeBreaks)
629
getGraphemeBreaks(string, length, attributes);
630
if (options & WordBreaks)
631
getWordBreaks(string, length, attributes);
632
if (options & SentenceBreaks)
633
getSentenceBreaks(string, length, attributes);
634
if (options & LineBreaks)
635
getLineBreaks(string, length, attributes);
636
if (options & WhiteSpaces)
637
getWhiteSpaces(string, length, attributes);
639
if (!qt_initcharattributes_default_algorithm_only) {
640
// guard against a missing or empty script item list
if (!items || numItems <= 0)
643
QVarLengthArray<HB_ScriptItem, 64> scriptItems;
644
scriptItems.reserve(numItems);
646
// NOTE(review): start is presumably the index of the first item of the current run - confirm
HB_Script startScript = script_to_hbscript(items[start].script);
647
// a run that begins with Inherited is treated as Common
if (Q_UNLIKELY(startScript == HB_Script_Inherited))
648
startScript = HB_Script_Common;
649
for (int i = start + 1; i < numItems; ++i) {
650
HB_Script script = script_to_hbscript(items[i].script);
651
// same script as the run, or Inherited
if (Q_LIKELY(script == startScript || script == HB_Script_Inherited))
653
Q_ASSERT(items[i].position > items[start].position);
655
// the script changed: emit the run [items[start].position, items[i].position)
item.pos = items[start].position;
656
item.length = items[i].position - items[start].position;
657
item.script = startScript;
658
item.bidiLevel = 0; // unused
659
scriptItems.append(item);
661
startScript = script;
663
// emit the trailing run, which extends to the end of the text
if (items[start].position + 1 < length) {
665
item.pos = items[start].position;
666
item.length = length - items[start].position;
667
item.script = startScript;
668
item.bidiLevel = 0; // unused
669
scriptItems.append(item);
671
// the reinterpret_cast below requires both attribute structs to have the same size
Q_STATIC_ASSERT(sizeof(QCharAttributes) == sizeof(HB_CharAttributes));
672
HB_GetTailoredCharAttributes(string, length,
673
scriptItems.constData(), scriptItems.size(),
674
reinterpret_cast<HB_CharAttributes *>(attributes));
679
// ----------------------------------------------------------------------------
681
// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
683
// ----------------------------------------------------------------------------
685
// Fills scripts with a script value for the UTF-16 text string[0, length).
// The text is split into runs; each run is written with a single memset of
// the run's script over scripts[sor, eor).
// NOTE(review): sor and eor are presumably the start and end offsets of the
// current run - confirm against their declarations.
Q_CORE_EXPORT void initScripts(const ushort *string, int length, uchar *scripts)
689
// script of the run being built
uchar script = QChar::Script_Common;
691
for (int i = 0; i < length; ++i, eor = i) {
692
uint ucs4 = string[i];
693
// a high surrogate followed by a low surrogate is decoded into one code point
if (QChar::isHighSurrogate(ucs4) && i + 1 < length) {
694
ushort low = string[i + 1];
695
if (QChar::isLowSurrogate(low)) {
696
ucs4 = QChar::surrogateToUcs4(ucs4, low);
701
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
703
uchar nscript = prop->script;
705
// same script as the run, or a script value not above Script_Common
if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common))
708
// inherit preceding Common-s
709
if (Q_UNLIKELY(script <= QChar::Script_Common)) {
710
// also covers a case where the base character of Common script followed
711
// by one or more combining marks of non-Inherited, non-Common script
716
// Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
717
// Thus, a combining mark — whatever its script property value is — should inherit
718
// the script property value of its base character.
719
static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing));
720
if (Q_UNLIKELY(FLAG(prop->category) & test))
723
Q_ASSERT(script > QChar::Script_Common);
725
// write out the run that just ended
::memset(scripts + sor, script, (eor - sor) * sizeof(uchar));
731
Q_ASSERT(script >= QChar::Script_Common);
732
Q_ASSERT(eor == length);
733
// write out the final run
::memset(scripts + sor, script, (eor - sor) * sizeof(uchar));
736
} // namespace QUnicodeTools