2
* Licensed to the Apache Software Foundation (ASF) under one or more
3
* contributor license agreements. See the NOTICE file distributed with
4
* this work for additional information regarding copyright ownership.
5
* The ASF licenses this file to You under the Apache License, Version 2.0
6
* (the "License"); you may not use this file except in compliance with
7
* the License. You may obtain a copy of the License at
9
* http://www.apache.org/licenses/LICENSE-2.0
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
19
* $Id: RangeToken.cpp 678879 2008-07-22 20:05:05Z amassari $
22
// ---------------------------------------------------------------------------
24
// ---------------------------------------------------------------------------
31
#include <xercesc/util/regx/RangeToken.hpp>
32
#include <xercesc/util/regx/TokenFactory.hpp>
33
#include <xercesc/util/IllegalArgumentException.hpp>
34
#include <xercesc/util/XMLUniDefs.hpp>
36
#if XERCES_USE_TRANSCODER_ICU
37
#include <unicode/uchar.h>
39
#if (U_ICU_VERSION_MAJOR_NUM > 2) || (U_ICU_VERSION_MAJOR_NUM == 2 && U_ICU_VERSION_MINOR_NUM >=4)
40
#include <unicode/uset.h>
41
#include <xercesc/util/XMLString.hpp>
42
#include <xercesc/util/Janitor.hpp>
46
XERCES_CPP_NAMESPACE_BEGIN
48
// ---------------------------------------------------------------------------
49
// Static member data initialization
50
// ---------------------------------------------------------------------------
51
const int RangeToken::MAPSIZE = 256;
52
const unsigned int RangeToken::INITIALSIZE = 16;
54
// ---------------------------------------------------------------------------
55
// RangeToken: Constructors and Destructors
56
// ---------------------------------------------------------------------------
57
RangeToken::RangeToken(const Token::tokType tkType,
58
MemoryManager* const manager)
59
: Token(tkType, manager)
64
, fMaxCount(INITIALSIZE)
68
, fMemoryManager(manager)
73
RangeToken::~RangeToken() {
75
fMemoryManager->deallocate(fMap);//delete [] fMap;
76
fMemoryManager->deallocate(fRanges);//delete[] fRanges;
80
// This is a struct that defines a mapping for
81
// case-insensitive matching. The first character
82
// is the character we try to match in the range.
83
// The second is the character we add to the range,
84
// because it maps to the first when we're folding
86
struct ExceptionCharsStruct
90
XMLInt32 matchingChar;
94
// This is an array of character mappings that we will
95
// add to ranges for case-insensitive matching.
96
static const ExceptionCharsStruct s_exceptions[] =
146
// ---------------------------------------------------------------------------
147
// RangeToken: Getter methods
148
// ---------------------------------------------------------------------------
149
RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) {
151
if (fCaseIToken == 0 && tokFactory && fRanges) {
153
bool isNRange = (getTokenType() == T_NRANGE) ? true : false;
154
RangeToken* lwrToken = tokFactory->createRange(isNRange);
155
unsigned int exceptIndex = 0;
157
#if XERCES_USE_TRANSCODER_ICU && ((U_ICU_VERSION_MAJOR_NUM > 2) || (U_ICU_VERSION_MAJOR_NUM == 2 && U_ICU_VERSION_MINOR_NUM >=4))
158
UChar* rangeStr=(UChar*)fMemoryManager->allocate(40*fElemCount*sizeof(UChar));
159
ArrayJanitor<UChar> janRange(rangeStr, fMemoryManager);
161
rangeStr[c++] = chOpenSquare;
162
for (unsigned int i = 0; i < fElemCount - 1; i += 2) {
166
rangeStr[c++] = chBackSlash;
167
rangeStr[c++] = chLatin_U;
168
XMLString::binToText(fRanges[i], buffer, 10, 16, fMemoryManager);
169
len = XMLString::stringLen(buffer);
170
for(j=0;j<(8-len);j++)
171
rangeStr[c++] = chDigit_0;
174
rangeStr[c++] = *p++;
175
if(fRanges[i+1]!=fRanges[i])
177
rangeStr[c++] = chDash;
178
rangeStr[c++] = chBackSlash;
179
rangeStr[c++] = chLatin_U;
180
XMLString::binToText(fRanges[i+1], buffer, 10, 16, fMemoryManager);
181
len = XMLString::stringLen(buffer);
182
for(j=0;j<(8-len);j++)
183
rangeStr[c++] = chDigit_0;
186
rangeStr[c++] = *p++;
189
rangeStr[c++] = chCloseSquare;
190
rangeStr[c++] = chNull;
191
UErrorCode ec=U_ZERO_ERROR;
192
USet* range=uset_openPatternOptions(rangeStr, -1, USET_CASE_INSENSITIVE, &ec);
196
uint32_t cbCount=uset_serialize(range, NULL, 0, &ec);
197
uint16_t* buffer=(uint16_t*)fMemoryManager->allocate(cbCount*sizeof(uint16_t));
198
ArrayJanitor<uint16_t> janSet(buffer, fMemoryManager);
200
uset_serialize(range, buffer, cbCount, &ec);
201
USerializedSet serializedSet;
202
uset_getSerializedSet(&serializedSet, buffer, cbCount);
203
int32_t nSets=uset_getSerializedRangeCount(&serializedSet);
204
for(int32_t i=0; i<nSets; i++)
207
uset_getSerializedRange(&serializedSet, i, &start, &end);
208
lwrToken->addRange(start, end);
210
// does this release the memory allocated by the set?
211
uset_setSerializedToOne(&serializedSet, 32);
215
for (unsigned int i = 0; i < fElemCount - 1; i += 2) {
216
for (XMLInt32 ch = fRanges[i]; ch <= fRanges[i + 1]; ++ch) {
217
#if XERCES_USE_TRANSCODER_ICU
218
const XMLInt32 upperCh = u_toupper(ch);
222
lwrToken->addRange(upperCh, upperCh);
225
const XMLInt32 lowerCh = u_tolower(ch);
229
lwrToken->addRange(lowerCh, lowerCh);
232
const XMLInt32 titleCh = u_totitle(ch);
234
if (titleCh != ch && titleCh != upperCh)
236
lwrToken->addRange(titleCh, titleCh);
239
if (ch >= chLatin_A && ch <= chLatin_Z)
241
ch += chLatin_a - chLatin_A;
243
lwrToken->addRange(ch, ch);
245
else if (ch >= chLatin_a && ch <= chLatin_z)
247
ch -= chLatin_a - chLatin_A;
249
lwrToken->addRange(ch, ch);
253
const unsigned int exceptionsSize =
254
sizeof(s_exceptions) / sizeof(s_exceptions[0]);
256
// Add any exception chars. These are characters where the the
257
// case mapping is not symmetric. (Unicode case mappings are not isomorphic...)
258
while (exceptIndex < exceptionsSize)
260
if (s_exceptions[exceptIndex].baseChar < ch)
264
else if (s_exceptions[exceptIndex].baseChar == ch)
266
const XMLInt32 matchingChar =
267
s_exceptions[exceptIndex].matchingChar;
283
lwrToken->mergeRanges(this);
285
lwrToken->compactRanges();
286
lwrToken->createMap();
288
fCaseIToken = lwrToken;
294
// ---------------------------------------------------------------------------
295
// RangeToken: Setter methods
296
// ---------------------------------------------------------------------------
297
void RangeToken::setRangeValues(XMLInt32* const rangeValues, const unsigned int count)
302
fMemoryManager->deallocate(fMap);//delete [] fMap;
307
fMemoryManager->deallocate(fRanges);//delete [] fRanges;
311
fElemCount = fMaxCount = count;
312
fRanges = rangeValues;
315
// ---------------------------------------------------------------------------
316
// RangeToken: Range manipulation methods
317
// ---------------------------------------------------------------------------
318
void RangeToken::addRange(const XMLInt32 start, const XMLInt32 end) {
337
fRanges = (XMLInt32*) fMemoryManager->allocate
339
fMaxCount * sizeof(XMLInt32)
340
);//new XMLInt32[fMaxCount];
348
if (fRanges[fElemCount-1] + 1 == val1) {
350
fRanges[fElemCount-1] = val2;
354
if (fElemCount + 2 >= fMaxCount) {
358
if(fSorted && fRanges[fElemCount-1] >= val1)
360
for (int i = 0; i < (int)fElemCount; i +=2)
362
// check if this range is already part of this one
363
if (fRanges[i] <= val1 && fRanges[i+1] >= val2)
365
// or if the new one extends the old one
366
else if(fRanges[i]==val1 && fRanges[i+1] < val2)
371
else if (fRanges[i] > val1 ||
372
(fRanges[i]==val1 && fRanges[i+1] > val2))
374
for(int j=fElemCount-1;j>=i;j--)
375
fRanges[j+2]=fRanges[j];
385
if (fRanges[fElemCount-1] >= val1)
388
fRanges[fElemCount++] = val1;
389
fRanges[fElemCount++] = val2;
398
void RangeToken::sortRanges() {
400
if (fSorted || fRanges == 0)
403
for (int i = fElemCount - 4; i >= 0; i -= 2) {
405
for (int j = 0; j <= i; j +=2) {
407
if (fRanges[j] > fRanges[j + 2]
408
|| (fRanges[j]==fRanges[j+2] && fRanges[j+1] > fRanges[j+3])) {
410
XMLInt32 tmpVal = fRanges[j+2];
411
fRanges[j+2] = fRanges[j];
413
tmpVal = fRanges[j+3];
414
fRanges[j+3] = fRanges[j+1];
415
fRanges[j+1] = tmpVal;
423
void RangeToken::compactRanges() {
425
if (fCompacted || fRanges == 0 || fElemCount <= 2)
428
unsigned int base = 0;
429
unsigned int target = 0;
431
while (target < fElemCount) {
433
if (base != target) {
435
fRanges[base] = fRanges[target++];
436
fRanges[base+1] = fRanges[target++];
441
XMLInt32 baseEnd = fRanges[base + 1];
443
while (target < fElemCount) {
445
XMLInt32 startRange = fRanges[target];
447
if (baseEnd + 1 < startRange)
450
XMLInt32 endRange = fRanges[target + 1];
452
if (baseEnd + 1 == startRange || baseEnd < endRange) {
455
fRanges[base+1] = baseEnd;
458
else if (baseEnd >= endRange) {
462
ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_CompactRangesError, fMemoryManager);
473
void RangeToken::mergeRanges(const Token *const tok) {
476
if (tok->getTokenType() != this->getTokenType())
477
ThrowXMLwithMemMgr(IllegalArgumentException, XMLExcepts::Regex_MergeRangesTypeMismatch, fMemoryManager);
479
RangeToken* rangeTok = (RangeToken *) tok;
481
if (rangeTok->fRanges == 0)
486
rangeTok->sortRanges();
490
fMaxCount = rangeTok->fMaxCount;
491
fRanges = (XMLInt32*) fMemoryManager->allocate
493
fMaxCount * sizeof(XMLInt32)
494
);//new XMLInt32[fMaxCount];
495
for (unsigned int index = 0; index < rangeTok->fElemCount; index++) {
496
fRanges[index] = rangeTok->fRanges[index];
499
fElemCount = rangeTok->fElemCount;
504
unsigned int newMaxCount = (fElemCount + rangeTok->fElemCount >= fMaxCount)
505
? fMaxCount + rangeTok->fMaxCount : fMaxCount;
506
XMLInt32* result = (XMLInt32*) fMemoryManager->allocate
508
newMaxCount * sizeof(XMLInt32)
509
);//new XMLInt32[newMaxCount];
511
for (unsigned int i=0, j=0, k=0; i < fElemCount || j < rangeTok->fElemCount;) {
513
if (i >= fElemCount) {
515
for (int count = 0; count < 2; count++) {
516
result[k++] = rangeTok->fRanges[j++];
519
else if (j >= rangeTok->fElemCount) {
521
for (int count = 0; count < 2; count++) {
522
result[k++] = fRanges[i++];
525
else if (rangeTok->fRanges[j] < fRanges[i]
526
|| (rangeTok->fRanges[j] == fRanges[i]
527
&& rangeTok->fRanges[j+1] < fRanges[i+1])) {
529
for (int count = 0; count < 2; count++) {
530
result[k++] = rangeTok->fRanges[j++];
535
for (int count = 0; count < 2; count++) {
537
result[k++] = fRanges[i++];
542
fMemoryManager->deallocate(fRanges);//delete [] fRanges;
543
fElemCount += rangeTok->fElemCount;
545
fMaxCount = newMaxCount;
548
void RangeToken::subtractRanges(RangeToken* const tok) {
550
if (fRanges == 0 || tok->fRanges == 0)
553
if (tok->getTokenType() == T_NRANGE) {
555
intersectRanges(tok);
563
tok->compactRanges();
565
unsigned int newMax = (fElemCount + tok->fElemCount >= fMaxCount)
566
? fMaxCount + tok->fMaxCount : fMaxCount;
567
XMLInt32* result = (XMLInt32*) fMemoryManager->allocate
569
newMax * sizeof(XMLInt32)
570
);//new XMLInt32[newMax];
571
unsigned int newElemCount = 0;
572
unsigned int srcCount = 0;
573
unsigned int subCount = 0;
575
while (srcCount < fElemCount && subCount < tok->fElemCount) {
577
XMLInt32 srcBegin = fRanges[srcCount];
578
XMLInt32 srcEnd = fRanges[srcCount + 1];
579
XMLInt32 subBegin = tok->fRanges[subCount];
580
XMLInt32 subEnd = tok->fRanges[subCount + 1];
582
if (srcEnd < subBegin) { // no overlap
584
result[newElemCount++] = fRanges[srcCount++];
585
result[newElemCount++] = fRanges[srcCount++];
587
else if (srcEnd >= subBegin && srcBegin <= subEnd) {
589
if (subBegin <= srcBegin && srcEnd <= subEnd) {
592
else if (subBegin <= srcBegin) {
594
fRanges[srcCount] = subEnd + 1;
597
else if (srcEnd <= subEnd) {
599
result[newElemCount++] = srcBegin;
600
result[newElemCount++] = subBegin - 1;
605
result[newElemCount++] = srcBegin;
606
result[newElemCount++] = subBegin - 1;
607
fRanges[srcCount] = subEnd + 1;
611
else if (subEnd < srcBegin) {
615
fMemoryManager->deallocate(result);//delete [] result;
616
ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_SubtractRangesError, fMemoryManager);
620
while (srcCount < fElemCount) {
622
result[newElemCount++] = fRanges[srcCount++];
623
result[newElemCount++] = fRanges[srcCount++];
626
fMemoryManager->deallocate(fRanges);//delete [] fRanges;
628
fElemCount = newElemCount;
633
* Ignore whether 'tok' is NRANGE or not.
635
void RangeToken::intersectRanges(RangeToken* const tok) {
637
if (fRanges == 0 || tok->fRanges == 0)
644
tok->compactRanges();
646
unsigned int newMax = (fElemCount + tok->fElemCount >= fMaxCount)
647
? fMaxCount + tok->fMaxCount : fMaxCount;
648
XMLInt32* result = (XMLInt32*) fMemoryManager->allocate
650
newMax * sizeof(XMLInt32)
651
);//new XMLInt32[newMax];
652
unsigned int newElemCount = 0;
653
unsigned int srcCount = 0;
654
unsigned int tokCount = 0;
656
while (srcCount < fElemCount && tokCount < tok->fElemCount) {
658
XMLInt32 srcBegin = fRanges[srcCount];
659
XMLInt32 srcEnd = fRanges[srcCount + 1];
660
XMLInt32 tokBegin = tok->fRanges[tokCount];
661
XMLInt32 tokEnd = tok->fRanges[tokCount + 1];
663
if (srcEnd < tokBegin) {
666
else if (srcEnd >= tokBegin && srcBegin <= tokEnd) {
668
if (tokBegin <= srcBegin && srcEnd <= tokEnd) {
670
result[newElemCount++] = srcBegin;
671
result[newElemCount++] = srcEnd;
674
else if (tokBegin <= srcBegin) {
676
result[newElemCount++] = srcBegin;
677
result[newElemCount++] = tokEnd;
680
if (tokCount < tok->fElemCount)
681
fRanges[srcCount] = tokEnd + 1;
685
else if (srcEnd <= tokEnd) {
687
result[newElemCount++] = tokBegin;
688
result[newElemCount++] = srcEnd;
693
result[newElemCount++] = tokBegin;
694
result[newElemCount++] = tokEnd;
697
if (tokCount < tok->fElemCount)
698
fRanges[srcCount] = tokEnd + 1;
703
else if (tokEnd < srcBegin) {
706
if (tokCount >= tok->fElemCount)
711
fMemoryManager->deallocate(result);//delete [] result;
712
ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_IntersectRangesError, fMemoryManager);
716
fMemoryManager->deallocate(fRanges);//delete [] fRanges;
718
fElemCount = newElemCount;
723
* for RANGE: Creates complement.
724
* for NRANGE: Creates the same meaning RANGE.
726
RangeToken* RangeToken::complementRanges(RangeToken* const tok,
727
TokenFactory* const tokFactory,
728
MemoryManager* const manager) {
730
if (tok->getTokenType() != T_RANGE && tok->getTokenType() != T_NRANGE)
731
ThrowXMLwithMemMgr(IllegalArgumentException, XMLExcepts::Regex_ComplementRangesInvalidArg, manager);
734
tok->compactRanges();
736
XMLInt32 lastElem = tok->fRanges[tok->fElemCount - 1];
737
RangeToken* rangeTok = tokFactory->createRange();
739
if (tok->fRanges[0] > 0) {
740
rangeTok->addRange(0, tok->fRanges[0] - 1);
743
for (unsigned int i= 1; i< tok->fElemCount - 2; i += 2) {
744
rangeTok->addRange(tok->fRanges[i] + 1, tok->fRanges[i+1] - 1);
747
if (lastElem != UTF16_MAX) {
748
rangeTok->addRange(lastElem + 1, UTF16_MAX);
751
rangeTok->fCompacted = true;
757
// ---------------------------------------------------------------------------
758
// RangeToken: Match methods
759
// ---------------------------------------------------------------------------
760
bool RangeToken::match(const XMLInt32 ch) {
766
if (getTokenType() == T_RANGE) {
769
return ((fMap[ch/32] & (1<<(ch&0x1f))) != 0);
773
for (unsigned int i= fNonMapIndex; i< fElemCount; i +=2) {
775
if (fRanges[i] <= ch && ch <= fRanges[i+1])
782
return ((fMap[ch/32] & (1<<(ch&0x1f))) == 0);
786
for (unsigned int i= fNonMapIndex; i< fElemCount; i += 2) {
788
if (fRanges[i] <= ch && ch <= fRanges[i+1])
796
// ---------------------------------------------------------------------------
797
// RangeToken: Private helpers methods
798
// ---------------------------------------------------------------------------
799
void RangeToken::expand(const unsigned int length) {
801
unsigned int newMax = fElemCount + length;
803
// Avoid too many reallocations by expanding by a percentage
804
unsigned int minNewMax = (unsigned int)((double)fElemCount * 1.25);
805
if (newMax < minNewMax)
808
XMLInt32* newList = (XMLInt32*) fMemoryManager->allocate
810
newMax * sizeof(XMLInt32)
811
);//new XMLInt32[newMax];
812
for (unsigned int index = 0; index < fElemCount; index++)
813
newList[index] = fRanges[index];
815
fMemoryManager->deallocate(fRanges);//delete [] fRanges;
820
void RangeToken::doCreateMap() {
824
int asize = MAPSIZE/32;
825
fMap = (int*) fMemoryManager->allocate(asize * sizeof(int));//new int[asize];
826
fNonMapIndex = fElemCount;
828
for (int i = 0; i < asize; i++) {
832
for (unsigned int j= 0; j < fElemCount; j += 2) {
834
XMLInt32 begin = fRanges[j];
835
XMLInt32 end = fRanges[j+1];
837
if (begin < MAPSIZE) {
839
for (int k = begin; k <= end && k < MAPSIZE; k++) {
840
fMap[k/32] |= 1<<(k&0x1F);
849
if (end >= MAPSIZE) {
857
XERCES_CPP_NAMESPACE_END
860
* End of file RangeToken.cpp