Skip to content

Commit 75405a1

Browse files
srl295 authored and evanlucas committed
deps: ICU 60 bump
- Update to released ICU 60.1, including:
  - CLDR 32 (many new languages and data improvements)
  - Unicode 10 (8,518 new characters, including four new scripts, 7,494 new Han characters, and 56 new emoji characters)
  - UTF-8 malformed bytes now handled according to W3C/WHATWG spec

Fixes: #15540
PR-URL: #16876
Reviewed-By: James M Snell <[email protected]>
Reviewed-By: Michael Dawson <[email protected]>
1 parent 1ee6df9 commit 75405a1

File tree

254 files changed

+23876
-10365
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

254 files changed

+23876
-10365
lines changed

LICENSE

+1-1
Original file line number | Diff line number | Diff line change
@@ -230,7 +230,7 @@ The externally maintained libraries used by Node.js are:
230230
# ---------COPYING.libtabe ---- BEGIN--------------------
231231
#
232232
# /*
233-
# * Copyrighy (c) 1999 TaBE Project.
233+
# * Copyright (c) 1999 TaBE Project.
234234
# * Copyright (c) 1999 Pai-Hsiang Hsiao.
235235
# * All rights reserved.
236236
# *

configure

+2-2
Original file line number | Diff line number | Diff line change
@@ -1092,8 +1092,8 @@ def glob_to_var(dir_base, dir_sub, patch_dir):
10921092
def configure_intl(o):
10931093
icus = [
10941094
{
1095-
'url': 'https://ssl.icu-project.org/files/icu4c/59.1/icu4c-59_1-src.zip',
1096-
'md5': '29a41f9bb576b06c7eef0487a84a7674',
1095+
'url': 'https://ssl.icu-project.org/files/icu4c/60.1/icu4c-60_1-src.zip',
1096+
'md5': 'e6cb990ac2a3161d31a3def8435f80cb',
10971097
},
10981098
]
10991099
def icu_download(path):

deps/icu-small/LICENSE

+1-1
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,7 @@ property of their respective owners.
131131
# ---------COPYING.libtabe ---- BEGIN--------------------
132132
#
133133
# /*
134-
# * Copyrighy (c) 1999 TaBE Project.
134+
# * Copyright (c) 1999 TaBE Project.
135135
# * Copyright (c) 1999 Pai-Hsiang Hsiao.
136136
# * All rights reserved.
137137
# *

deps/icu-small/README-SMALL-ICU.txt

+2-2
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
Small ICU sources - auto generated by shrink-icu-src.py
22

33
This directory contains the ICU subset used by --with-intl=small-icu (the default)
4-
It is a strict subset of ICU 59 source files with the following exception(s):
5-
* deps/icu-small/source/data/in/icudt59l.dat : Reduced-size data file
4+
It is a strict subset of ICU 60 source files with the following exception(s):
5+
* deps/icu-small/source/data/in/icudt60l.dat : Reduced-size data file
66

77

88
To rebuild this directory, see ../../tools/icu/README.md

deps/icu-small/source/common/bmpset.cpp

+61-47
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ U_NAMESPACE_BEGIN
2828

2929
BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
3030
list(parentList), listLength(parentListLength) {
31-
uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
31+
uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
3232
uprv_memset(table7FF, 0, sizeof(table7FF));
3333
uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
3434

@@ -45,14 +45,16 @@ BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
4545
list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
4646
}
4747
list4kStarts[0x11]=listLength-1;
48+
containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
4849

4950
initBits();
5051
overrideIllegal();
5152
}
5253

5354
BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
55+
containsFFFD(otherBMPSet.containsFFFD),
5456
list(newParentList), listLength(newParentListLength) {
55-
uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
57+
uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
5658
uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
5759
uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
5860
uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
@@ -120,21 +122,38 @@ void BMPSet::initBits() {
120122
UChar32 start, limit;
121123
int32_t listIndex=0;
122124

123-
// Set asciiBytes[].
125+
// Set latin1Contains[].
124126
do {
125127
start=list[listIndex++];
126128
if(listIndex<listLength) {
127129
limit=list[listIndex++];
128130
} else {
129131
limit=0x110000;
130132
}
131-
if(start>=0x80) {
133+
if(start>=0x100) {
132134
break;
133135
}
134136
do {
135-
asciiBytes[start++]=1;
136-
} while(start<limit && start<0x80);
137-
} while(limit<=0x80);
137+
latin1Contains[start++]=1;
138+
} while(start<limit && start<0x100);
139+
} while(limit<=0x100);
140+
141+
// Find the first range overlapping with (or after) 80..FF again,
142+
// to include them in table7FF as well.
143+
for(listIndex=0;;) {
144+
start=list[listIndex++];
145+
if(listIndex<listLength) {
146+
limit=list[listIndex++];
147+
} else {
148+
limit=0x110000;
149+
}
150+
if(limit>0x80) {
151+
if(start<0x80) {
152+
start=0x80;
153+
}
154+
break;
155+
}
156+
}
138157

139158
// Set table7FF[].
140159
while(start<0x800) {
@@ -204,19 +223,14 @@ void BMPSet::initBits() {
204223
* for faster validity checking at runtime.
205224
* No need to set 0 values where they were reset to 0 in the constructor
206225
* and not modified by initBits().
207-
* (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
226+
* (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
208227
* Need to set 0 values for surrogates D800..DFFF.
209228
*/
210229
void BMPSet::overrideIllegal() {
211230
uint32_t bits, mask;
212231
int32_t i;
213232

214-
if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
215-
// contains(FFFD)==TRUE
216-
for(i=0x80; i<0xc0; ++i) {
217-
asciiBytes[i]=1;
218-
}
219-
233+
if(containsFFFD) {
220234
bits=3; // Lead bytes 0xC0 and 0xC1.
221235
for(i=0; i<64; ++i) {
222236
table7FF[i]|=bits;
@@ -233,7 +247,6 @@ void BMPSet::overrideIllegal() {
233247
bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
234248
}
235249
} else {
236-
// contains(FFFD)==FALSE
237250
mask=~(0x10001<<0xd); // Lead byte 0xED.
238251
for(i=32; i<64; ++i) { // Second half of 4k block.
239252
bmpBlockBits[i]&=mask;
@@ -277,8 +290,8 @@ int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
277290

278291
UBool
279292
BMPSet::contains(UChar32 c) const {
280-
if((uint32_t)c<=0x7f) {
281-
return (UBool)asciiBytes[c];
293+
if((uint32_t)c<=0xff) {
294+
return (UBool)latin1Contains[c];
282295
} else if((uint32_t)c<=0x7ff) {
283296
return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
284297
} else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
@@ -314,8 +327,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
314327
// span
315328
do {
316329
c=*s;
317-
if(c<=0x7f) {
318-
if(!asciiBytes[c]) {
330+
if(c<=0xff) {
331+
if(!latin1Contains[c]) {
319332
break;
320333
}
321334
} else if(c<=0x7ff) {
@@ -354,8 +367,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
354367
// span not
355368
do {
356369
c=*s;
357-
if(c<=0x7f) {
358-
if(asciiBytes[c]) {
370+
if(c<=0xff) {
371+
if(latin1Contains[c]) {
359372
break;
360373
}
361374
} else if(c<=0x7ff) {
@@ -403,8 +416,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
403416
// span
404417
for(;;) {
405418
c=*(--limit);
406-
if(c<=0x7f) {
407-
if(!asciiBytes[c]) {
419+
if(c<=0xff) {
420+
if(!latin1Contains[c]) {
408421
break;
409422
}
410423
} else if(c<=0x7ff) {
@@ -446,8 +459,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
446459
// span not
447460
for(;;) {
448461
c=*(--limit);
449-
if(c<=0x7f) {
450-
if(asciiBytes[c]) {
462+
if(c<=0xff) {
463+
if(latin1Contains[c]) {
451464
break;
452465
}
453466
} else if(c<=0x7ff) {
@@ -497,22 +510,22 @@ const uint8_t *
497510
BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
498511
const uint8_t *limit=s+length;
499512
uint8_t b=*s;
500-
if((int8_t)b>=0) {
513+
if(U8_IS_SINGLE(b)) {
501514
// Initial all-ASCII span.
502515
if(spanCondition) {
503516
do {
504-
if(!asciiBytes[b] || ++s==limit) {
517+
if(!latin1Contains[b] || ++s==limit) {
505518
return s;
506519
}
507520
b=*s;
508-
} while((int8_t)b>=0);
521+
} while(U8_IS_SINGLE(b));
509522
} else {
510523
do {
511-
if(asciiBytes[b] || ++s==limit) {
524+
if(latin1Contains[b] || ++s==limit) {
512525
return s;
513526
}
514527
b=*s;
515-
} while((int8_t)b>=0);
528+
} while(U8_IS_SINGLE(b));
516529
}
517530
length=(int32_t)(limit-s);
518531
}
@@ -540,20 +553,20 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
540553
// single trail byte, check for preceding 3- or 4-byte lead byte
541554
if(length>=2 && (b=*(limit-2))>=0xe0) {
542555
limit-=2;
543-
if(asciiBytes[0x80]!=spanCondition) {
556+
if(containsFFFD!=spanCondition) {
544557
limit0=limit;
545558
}
546559
} else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
547560
// 4-byte lead byte with only two trail bytes
548561
limit-=3;
549-
if(asciiBytes[0x80]!=spanCondition) {
562+
if(containsFFFD!=spanCondition) {
550563
limit0=limit;
551564
}
552565
}
553566
} else {
554567
// lead byte with no trail bytes
555568
--limit;
556-
if(asciiBytes[0x80]!=spanCondition) {
569+
if(containsFFFD!=spanCondition) {
557570
limit0=limit;
558571
}
559572
}
@@ -563,26 +576,26 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
563576

564577
while(s<limit) {
565578
b=*s;
566-
if(b<0xc0) {
567-
// ASCII; or trail bytes with the result of contains(FFFD).
579+
if(U8_IS_SINGLE(b)) {
580+
// ASCII
568581
if(spanCondition) {
569582
do {
570-
if(!asciiBytes[b]) {
583+
if(!latin1Contains[b]) {
571584
return s;
572585
} else if(++s==limit) {
573586
return limit0;
574587
}
575588
b=*s;
576-
} while(b<0xc0);
589+
} while(U8_IS_SINGLE(b));
577590
} else {
578591
do {
579-
if(asciiBytes[b]) {
592+
if(latin1Contains[b]) {
580593
return s;
581594
} else if(++s==limit) {
582595
return limit0;
583596
}
584597
b=*s;
585-
} while(b<0xc0);
598+
} while(U8_IS_SINGLE(b));
586599
}
587600
}
588601
++s; // Advance past the lead byte.
@@ -619,16 +632,17 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
619632
UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
620633
if( ( (0x10000<=c && c<=0x10ffff) ?
621634
containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
622-
asciiBytes[0x80]
635+
containsFFFD
623636
) != spanCondition
624637
) {
625638
return s-1;
626639
}
627640
s+=3;
628641
continue;
629642
}
630-
} else /* 0xc0<=b<0xe0 */ {
643+
} else {
631644
if( /* handle U+0000..U+07FF inline */
645+
b>=0xc0 &&
632646
(t1=(uint8_t)(*s-0x80)) <= 0x3f
633647
) {
634648
if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
@@ -642,7 +656,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
642656
// Give an illegal sequence the same value as the result of contains(FFFD).
643657
// Handle each byte of an illegal sequence separately to simplify the code;
644658
// no need to optimize error handling.
645-
if(asciiBytes[0x80]!=spanCondition) {
659+
if(containsFFFD!=spanCondition) {
646660
return s-1;
647661
}
648662
}
@@ -667,26 +681,26 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon
667681

668682
do {
669683
b=s[--length];
670-
if((int8_t)b>=0) {
684+
if(U8_IS_SINGLE(b)) {
671685
// ASCII sub-span
672686
if(spanCondition) {
673687
do {
674-
if(!asciiBytes[b]) {
688+
if(!latin1Contains[b]) {
675689
return length+1;
676690
} else if(length==0) {
677691
return 0;
678692
}
679693
b=s[--length];
680-
} while((int8_t)b>=0);
694+
} while(U8_IS_SINGLE(b));
681695
} else {
682696
do {
683-
if(asciiBytes[b]) {
697+
if(latin1Contains[b]) {
684698
return length+1;
685699
} else if(length==0) {
686700
return 0;
687701
}
688702
b=s[--length];
689-
} while((int8_t)b>=0);
703+
} while(U8_IS_SINGLE(b));
690704
}
691705
}
692706

deps/icu-small/source/common/bmpset.h

+8-7
Original file line number | Diff line number | Diff line change
@@ -28,11 +28,12 @@ U_NAMESPACE_BEGIN
2828
* Helper class for frozen UnicodeSets, implements contains() and span()
2929
* optimized for BMP code points. Structured to be UTF-8-friendly.
3030
*
31-
* ASCII: Look up bytes.
31+
* Latin-1: Look up bytes.
3232
* 2-byte characters: Bits organized vertically.
3333
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
3434
* with mixed for illegal ranges.
35-
* Supplementary characters: Call contains() on the parent set.
35+
* Supplementary characters: Binary search over
36+
* the supplementary part of the parent set's inversion list.
3637
*/
3738
class BMPSet : public UMemory {
3839
public:
@@ -96,12 +97,12 @@ class BMPSet : public UMemory {
9697
inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;
9798

9899
/*
99-
* One byte per ASCII character, or trail byte in lead position.
100-
* 0 or 1 for ASCII characters.
101-
* The value for trail bytes is the result of contains(FFFD)
102-
* for faster validity checking at runtime.
100+
* One byte 0 or 1 per Latin-1 character.
103101
*/
104-
UBool asciiBytes[0xc0];
102+
UBool latin1Contains[0x100];
103+
104+
/* TRUE if contains(U+FFFD). */
105+
UBool containsFFFD;
105106

106107
/*
107108
* One bit per code point from U+0000..U+07FF.

0 commit comments

Comments
 (0)