@@ -28,7 +28,7 @@ U_NAMESPACE_BEGIN
28
28
29
29
BMPSet::BMPSet (const int32_t *parentList, int32_t parentListLength) :
30
30
list(parentList), listLength(parentListLength) {
31
- uprv_memset (asciiBytes , 0 , sizeof (asciiBytes ));
31
+ uprv_memset (latin1Contains , 0 , sizeof (latin1Contains ));
32
32
uprv_memset (table7FF, 0 , sizeof (table7FF));
33
33
uprv_memset (bmpBlockBits, 0 , sizeof (bmpBlockBits));
34
34
@@ -45,14 +45,16 @@ BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
45
45
list4kStarts[i]=findCodePoint (i<<12 , list4kStarts[i-1 ], listLength-1 );
46
46
}
47
47
list4kStarts[0x11 ]=listLength-1 ;
48
+ containsFFFD=containsSlow (0xfffd , list4kStarts[0xf ], list4kStarts[0x10 ]);
48
49
49
50
initBits ();
50
51
overrideIllegal ();
51
52
}
52
53
53
54
BMPSet::BMPSet (const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
55
+ containsFFFD(otherBMPSet.containsFFFD),
54
56
list(newParentList), listLength(newParentListLength) {
55
- uprv_memcpy (asciiBytes , otherBMPSet.asciiBytes , sizeof (asciiBytes ));
57
+ uprv_memcpy (latin1Contains , otherBMPSet.latin1Contains , sizeof (latin1Contains ));
56
58
uprv_memcpy (table7FF, otherBMPSet.table7FF , sizeof (table7FF));
57
59
uprv_memcpy (bmpBlockBits, otherBMPSet.bmpBlockBits , sizeof (bmpBlockBits));
58
60
uprv_memcpy (list4kStarts, otherBMPSet.list4kStarts , sizeof (list4kStarts));
@@ -120,21 +122,38 @@ void BMPSet::initBits() {
120
122
UChar32 start, limit;
121
123
int32_t listIndex=0 ;
122
124
123
- // Set asciiBytes [].
125
+ // Set latin1Contains [].
124
126
do {
125
127
start=list[listIndex++];
126
128
if (listIndex<listLength) {
127
129
limit=list[listIndex++];
128
130
} else {
129
131
limit=0x110000 ;
130
132
}
131
- if (start>=0x80 ) {
133
+ if (start>=0x100 ) {
132
134
break ;
133
135
}
134
136
do {
135
- asciiBytes[start++]=1 ;
136
- } while (start<limit && start<0x80 );
137
- } while (limit<=0x80 );
137
+ latin1Contains[start++]=1 ;
138
+ } while (start<limit && start<0x100 );
139
+ } while (limit<=0x100 );
140
+
141
+ // Find the first range overlapping with (or after) 80..FF again,
142
+ // to include them in table7FF as well.
143
+ for (listIndex=0 ;;) {
144
+ start=list[listIndex++];
145
+ if (listIndex<listLength) {
146
+ limit=list[listIndex++];
147
+ } else {
148
+ limit=0x110000 ;
149
+ }
150
+ if (limit>0x80 ) {
151
+ if (start<0x80 ) {
152
+ start=0x80 ;
153
+ }
154
+ break ;
155
+ }
156
+ }
138
157
139
158
// Set table7FF[].
140
159
while (start<0x800 ) {
@@ -204,19 +223,14 @@ void BMPSet::initBits() {
204
223
* for faster validity checking at runtime.
205
224
* No need to set 0 values where they were reset to 0 in the constructor
206
225
* and not modified by initBits().
207
- * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
226
+ * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
208
227
* Need to set 0 values for surrogates D800..DFFF.
209
228
*/
210
229
void BMPSet::overrideIllegal () {
211
230
uint32_t bits, mask;
212
231
int32_t i;
213
232
214
- if (containsSlow (0xfffd , list4kStarts[0xf ], list4kStarts[0x10 ])) {
215
- // contains(FFFD)==TRUE
216
- for (i=0x80 ; i<0xc0 ; ++i) {
217
- asciiBytes[i]=1 ;
218
- }
219
-
233
+ if (containsFFFD) {
220
234
bits=3 ; // Lead bytes 0xC0 and 0xC1.
221
235
for (i=0 ; i<64 ; ++i) {
222
236
table7FF[i]|=bits;
@@ -233,7 +247,6 @@ void BMPSet::overrideIllegal() {
233
247
bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
234
248
}
235
249
} else {
236
- // contains(FFFD)==FALSE
237
250
mask=~(0x10001 <<0xd ); // Lead byte 0xED.
238
251
for (i=32 ; i<64 ; ++i) { // Second half of 4k block.
239
252
bmpBlockBits[i]&=mask;
@@ -277,8 +290,8 @@ int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
277
290
278
291
UBool
279
292
BMPSet::contains (UChar32 c) const {
280
- if ((uint32_t )c<=0x7f ) {
281
- return (UBool)asciiBytes [c];
293
+ if ((uint32_t )c<=0xff ) {
294
+ return (UBool)latin1Contains [c];
282
295
} else if ((uint32_t )c<=0x7ff ) {
283
296
return (UBool)((table7FF[c&0x3f ]&((uint32_t )1 <<(c>>6 )))!=0 );
284
297
} else if ((uint32_t )c<0xd800 || (c>=0xe000 && c<=0xffff )) {
@@ -314,8 +327,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
314
327
// span
315
328
do {
316
329
c=*s;
317
- if (c<=0x7f ) {
318
- if (!asciiBytes [c]) {
330
+ if (c<=0xff ) {
331
+ if (!latin1Contains [c]) {
319
332
break ;
320
333
}
321
334
} else if (c<=0x7ff ) {
@@ -354,8 +367,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
354
367
// span not
355
368
do {
356
369
c=*s;
357
- if (c<=0x7f ) {
358
- if (asciiBytes [c]) {
370
+ if (c<=0xff ) {
371
+ if (latin1Contains [c]) {
359
372
break ;
360
373
}
361
374
} else if (c<=0x7ff ) {
@@ -403,8 +416,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
403
416
// span
404
417
for (;;) {
405
418
c=*(--limit);
406
- if (c<=0x7f ) {
407
- if (!asciiBytes [c]) {
419
+ if (c<=0xff ) {
420
+ if (!latin1Contains [c]) {
408
421
break ;
409
422
}
410
423
} else if (c<=0x7ff ) {
@@ -446,8 +459,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
446
459
// span not
447
460
for (;;) {
448
461
c=*(--limit);
449
- if (c<=0x7f ) {
450
- if (asciiBytes [c]) {
462
+ if (c<=0xff ) {
463
+ if (latin1Contains [c]) {
451
464
break ;
452
465
}
453
466
} else if (c<=0x7ff ) {
@@ -497,22 +510,22 @@ const uint8_t *
497
510
BMPSet::spanUTF8 (const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
498
511
const uint8_t *limit=s+length;
499
512
uint8_t b=*s;
500
- if (( int8_t )b>= 0 ) {
513
+ if (U8_IS_SINGLE (b) ) {
501
514
// Initial all-ASCII span.
502
515
if (spanCondition) {
503
516
do {
504
- if (!asciiBytes [b] || ++s==limit) {
517
+ if (!latin1Contains [b] || ++s==limit) {
505
518
return s;
506
519
}
507
520
b=*s;
508
- } while (( int8_t )b>= 0 );
521
+ } while (U8_IS_SINGLE (b) );
509
522
} else {
510
523
do {
511
- if (asciiBytes [b] || ++s==limit) {
524
+ if (latin1Contains [b] || ++s==limit) {
512
525
return s;
513
526
}
514
527
b=*s;
515
- } while (( int8_t )b>= 0 );
528
+ } while (U8_IS_SINGLE (b) );
516
529
}
517
530
length=(int32_t )(limit-s);
518
531
}
@@ -540,20 +553,20 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
540
553
// single trail byte, check for preceding 3- or 4-byte lead byte
541
554
if (length>=2 && (b=*(limit-2 ))>=0xe0 ) {
542
555
limit-=2 ;
543
- if (asciiBytes[ 0x80 ] !=spanCondition) {
556
+ if (containsFFFD !=spanCondition) {
544
557
limit0=limit;
545
558
}
546
559
} else if (b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3 ))>=0xf0 ) {
547
560
// 4-byte lead byte with only two trail bytes
548
561
limit-=3 ;
549
- if (asciiBytes[ 0x80 ] !=spanCondition) {
562
+ if (containsFFFD !=spanCondition) {
550
563
limit0=limit;
551
564
}
552
565
}
553
566
} else {
554
567
// lead byte with no trail bytes
555
568
--limit;
556
- if (asciiBytes[ 0x80 ] !=spanCondition) {
569
+ if (containsFFFD !=spanCondition) {
557
570
limit0=limit;
558
571
}
559
572
}
@@ -563,26 +576,26 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
563
576
564
577
while (s<limit) {
565
578
b=*s;
566
- if (b< 0xc0 ) {
567
- // ASCII; or trail bytes with the result of contains(FFFD).
579
+ if (U8_IS_SINGLE (b) ) {
580
+ // ASCII
568
581
if (spanCondition) {
569
582
do {
570
- if (!asciiBytes [b]) {
583
+ if (!latin1Contains [b]) {
571
584
return s;
572
585
} else if (++s==limit) {
573
586
return limit0;
574
587
}
575
588
b=*s;
576
- } while (b< 0xc0 );
589
+ } while (U8_IS_SINGLE (b) );
577
590
} else {
578
591
do {
579
- if (asciiBytes [b]) {
592
+ if (latin1Contains [b]) {
580
593
return s;
581
594
} else if (++s==limit) {
582
595
return limit0;
583
596
}
584
597
b=*s;
585
- } while (b< 0xc0 );
598
+ } while (U8_IS_SINGLE (b) );
586
599
}
587
600
}
588
601
++s; // Advance past the lead byte.
@@ -619,16 +632,17 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
619
632
UChar32 c=((UChar32)(b-0xf0 )<<18 )|((UChar32)t1<<12 )|(t2<<6 )|t3;
620
633
if ( ( (0x10000 <=c && c<=0x10ffff ) ?
621
634
containsSlow (c, list4kStarts[0x10 ], list4kStarts[0x11 ]) :
622
- asciiBytes[ 0x80 ]
635
+ containsFFFD
623
636
) != spanCondition
624
637
) {
625
638
return s-1 ;
626
639
}
627
640
s+=3 ;
628
641
continue ;
629
642
}
630
- } else /* 0xc0<=b<0xe0 */ {
643
+ } else {
631
644
if ( /* handle U+0000..U+07FF inline */
645
+ b>=0xc0 &&
632
646
(t1=(uint8_t )(*s-0x80 )) <= 0x3f
633
647
) {
634
648
if ((USetSpanCondition)((table7FF[t1]&((uint32_t )1 <<(b&0x1f )))!=0 ) != spanCondition) {
@@ -642,7 +656,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
642
656
// Give an illegal sequence the same value as the result of contains(FFFD).
643
657
// Handle each byte of an illegal sequence separately to simplify the code;
644
658
// no need to optimize error handling.
645
- if (asciiBytes[ 0x80 ] !=spanCondition) {
659
+ if (containsFFFD !=spanCondition) {
646
660
return s-1 ;
647
661
}
648
662
}
@@ -667,26 +681,26 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon
667
681
668
682
do {
669
683
b=s[--length];
670
- if (( int8_t )b>= 0 ) {
684
+ if (U8_IS_SINGLE (b) ) {
671
685
// ASCII sub-span
672
686
if (spanCondition) {
673
687
do {
674
- if (!asciiBytes [b]) {
688
+ if (!latin1Contains [b]) {
675
689
return length+1 ;
676
690
} else if (length==0 ) {
677
691
return 0 ;
678
692
}
679
693
b=s[--length];
680
- } while (( int8_t )b>= 0 );
694
+ } while (U8_IS_SINGLE (b) );
681
695
} else {
682
696
do {
683
- if (asciiBytes [b]) {
697
+ if (latin1Contains [b]) {
684
698
return length+1 ;
685
699
} else if (length==0 ) {
686
700
return 0 ;
687
701
}
688
702
b=s[--length];
689
- } while (( int8_t )b>= 0 );
703
+ } while (U8_IS_SINGLE (b) );
690
704
}
691
705
}
692
706
0 commit comments