Skip to content

Commit 02e5141

Browse files
committed
fix infinite loop in replace with AI collations
Signed-off-by: Tanzeel Khan <[email protected]>
1 parent e02ee26 commit 02e5141

File tree

3 files changed

+124
-4
lines changed

3 files changed

+124
-4
lines changed

contrib/babelfishpg_tsql/src/collation.c

+49-4
Original file line numberDiff line numberDiff line change
@@ -1475,12 +1475,13 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
14751475
int32_t src_len_utf8 = VARSIZE_ANY_EXHDR(src_text);
14761476
int32_t substr_len_utf8 = VARSIZE_ANY_EXHDR(substr_text);
14771477
int32_t src_ulen, substr_ulen;
1478-
int32_t u8_pos = -1;
1478+
int32_t u8_pos = -1, pos_prev_loop = -1;
14791479
UErrorCode status = U_ZERO_ERROR;
14801480
UStringSearch *usearch;
14811481
UChar *src_uchar, *substr_uchar;
14821482
coll_info_t coll_info_of_inputcollid = tsql_lookup_collation_table_internal(collid);
14831483
bool is_CS_AI = false;
1484+
bool is_substr_starts_with_surrogate;
14841485

14851486
if (OidIsValid(coll_info_of_inputcollid.oid) &&
14861487
coll_info_of_inputcollid.collateflags == 0x000e /* CS_AI */ )
@@ -1491,6 +1492,8 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
14911492
src_ulen = icu_to_uchar(&src_uchar, VARDATA_ANY(src_text), src_len_utf8);
14921493
substr_ulen = icu_to_uchar(&substr_uchar, VARDATA_ANY(substr_text), substr_len_utf8);
14931494

1495+
is_substr_starts_with_surrogate = U16_IS_SURROGATE(substr_uchar[0]);
1496+
14941497
usearch = usearch_openFromCollator(substr_uchar,
14951498
substr_ulen,
14961499
src_uchar,
@@ -1507,7 +1510,7 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
15071510
errmsg("failed to perform ICU search: %s",
15081511
u_errorName(status))));
15091512

1510-
for (int u16_pos = usearch_first(usearch, &status);
1513+
for (int32_t u16_pos = usearch_first(usearch, &status);
15111514
u16_pos != USEARCH_DONE;
15121515
u16_pos = usearch_next(usearch, &status))
15131516
{
@@ -1517,6 +1520,28 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
15171520
errmsg("failed to perform ICU search: %s",
15181521
u_errorName(status))));
15191522

1523+
/* ICU bug: when the pattern starts with a surrogate pair, usearch_next stops moving forward, which causes an infinite loop */
1524+
if (u16_pos == pos_prev_loop)
1525+
{
1526+
/* If the UTF-16 code unit is in the range D800 - DBFF, it is a high (lead) surrogate, i.e. the first unit of a surrogate pair */
1527+
int32_t next_char_idx = u16_pos + (src_uchar[u16_pos] & 0xF800 == 0xD800 ? 2 : 1);
1528+
1529+
if (is_substr_starts_with_surrogate && next_char_idx < src_ulen)
1530+
{
1531+
usearch_setOffset(usearch, next_char_idx, &status);
1532+
1533+
if (U_FAILURE(status))
1534+
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR),
1535+
errmsg("failed to set offset in ICU search: %s", u_errorName(status))));
1536+
1537+
continue;
1538+
}
1539+
else
1540+
break;
1541+
}
1542+
1543+
pos_prev_loop = u16_pos;
1544+
15201545
/* for CS_AI collations usearch can give false positives so we double check the results here */
15211546
if (!(is_CS_AI && icu_compare_utf8_coll(mylocale->info.icu.ucol, &src_uchar[usearch_getMatchedStart(usearch)], usearch_getMatchedLength(usearch), substr_uchar, substr_ulen, false) != 0))
15221547
{
@@ -1564,7 +1589,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
15641589
int32_t src_len = VARSIZE_ANY_EXHDR(src_text);
15651590
int32_t from_str_len = VARSIZE_ANY_EXHDR(from_text);
15661591
int32_t to_str_len = VARSIZE_ANY_EXHDR(to_text);
1567-
int32_t previous_pos;
1592+
int32_t previous_pos, pos_prev_loop = -1;
15681593
int32_t src_ulen, from_ulen; /* in utf-16 units */
15691594
UErrorCode status = U_ZERO_ERROR;
15701595
UStringSearch *usearch;
@@ -1573,6 +1598,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
15731598
StringInfoData resbuf;
15741599
coll_info_t coll_info_of_inputcollid = tsql_lookup_collation_table_internal(collid);
15751600
bool is_CS_AI = false;
1601+
bool is_substr_starts_with_surrogate;
15761602

15771603
if (OidIsValid(coll_info_of_inputcollid.oid) &&
15781604
coll_info_of_inputcollid.collateflags == 0x000e /* CS_AI */ )
@@ -1583,6 +1609,8 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
15831609
src_ulen = icu_to_uchar(&src_uchar, VARDATA_ANY(src_text), src_len);
15841610
from_ulen = icu_to_uchar(&from_uchar, VARDATA_ANY(from_text), from_str_len);
15851611

1612+
is_substr_starts_with_surrogate = U16_IS_SURROGATE(from_uchar[0]);
1613+
15861614
usearch = usearch_openFromCollator(from_uchar, /* needle */
15871615
from_ulen,
15881616
src_uchar, /* haystack */
@@ -1596,7 +1624,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
15961624
initStringInfo(&resbuf);
15971625
previous_pos = 0;
15981626

1599-
for (int pos = usearch_first(usearch, &status);
1627+
for (int32_t pos = usearch_first(usearch, &status);
16001628
pos != USEARCH_DONE;
16011629
pos = usearch_next(usearch, &status))
16021630
{
@@ -1609,6 +1637,23 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
16091637
errmsg("failed to perform ICU search: %s",
16101638
u_errorName(status))));
16111639

1640+
/* ICU bug: when the pattern starts with a surrogate pair, usearch_next stops moving forward, which causes an infinite loop */
1641+
if (pos == pos_prev_loop)
1642+
{
1643+
/* If the UTF-16 code unit is in the range D800 - DBFF, it is a high (lead) surrogate, i.e. the first unit of a surrogate pair */
1644+
int32_t next_char_idx = pos + (src_uchar[pos] & 0xF800 == 0xD800 ? 2 : 1);
1645+
1646+
if (is_substr_starts_with_surrogate && next_char_idx < src_ulen)
1647+
{
1648+
usearch_setOffset(usearch, next_char_idx, &status);
1649+
continue;
1650+
}
1651+
else
1652+
break;
1653+
}
1654+
1655+
pos_prev_loop = pos;
1656+
16121657
/* for CS_AI collations usearch can give false positives so we double check the results here */
16131658
if (is_CS_AI && icu_compare_utf8_coll(mylocale->info.icu.ucol, &src_uchar[usearch_getMatchedStart(usearch)], usearch_getMatchedLength(usearch), from_uchar, from_ulen, false) != 0)
16141659
continue;

test/JDBC/expected/charindex_and_replace_CIAI_collations.out

+60
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,66 @@ AAAAAABBBBBBBEEEEEEAAAAA
642642
DROP TABLE BABEL_4850_T
643643
GO
644644

645+
/* Substring to find starts with surrogate pair BABEL-5169 */
646+
SELECT CHARINDEX(N'🙂dEf', N'abc🙂def🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CS_AI)
647+
SELECT CHARINDEX(N'🙂D', N'abc🙂d🙂d🙂D' COLLATE Latin1_General_CS_AI)
648+
SELECT CHARINDEX(N'🙂dEf', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CI_AI)
649+
SELECT CHARINDEX(N'🙂', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CS_AI)
650+
SELECT CHARINDEX(N'🙂', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CI_AI)
651+
GO
652+
~~START~~
653+
int
654+
14
655+
~~END~~
656+
657+
~~START~~
658+
int
659+
8
660+
~~END~~
661+
662+
~~START~~
663+
int
664+
4
665+
~~END~~
666+
667+
~~START~~
668+
int
669+
4
670+
~~END~~
671+
672+
~~START~~
673+
int
674+
4
675+
~~END~~
676+
677+
678+
/* Substring to find starts with surrogate pair BABEL-5169 */
679+
SELECT REPLACE(N'abc🙂defghi🙂🙂', N'🙂def', N'jhi🙂' COLLATE Latin1_General_CI_AI)
680+
SELECT REPLACE(N'abc🙂🙂🙂🙂🙂defghi🙂🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
681+
SELECT REPLACE(N'abc🙂🙂🙂🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
682+
SELECT REPLACE(N'🙂abc🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
683+
GO
684+
~~START~~
685+
nvarchar
686+
abcjhi🙂ghi🙂🙂
687+
~~END~~
688+
689+
~~START~~
690+
nvarchar
691+
abc<----><----><----><----><---->defghi<----><---->
692+
~~END~~
693+
694+
~~START~~
695+
nvarchar
696+
abc<----><----><----><---->
697+
~~END~~
698+
699+
~~START~~
700+
nvarchar
701+
<---->abc<---->
702+
~~END~~
703+
704+
645705
-- psql
646706
CREATE COLLATION case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
647707
CREATE COLLATION ignore_accents (provider = icu, locale = 'nd-u-kc-true-ks-level1', deterministic = false);

test/JDBC/input/charindex_and_replace_CIAI_collations.mix

+15
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,21 @@ GO
204204
DROP TABLE BABEL_4850_T
205205
GO
206206

207+
/* Substring to find starts with surrogate pair BABEL-5169 */
208+
SELECT CHARINDEX(N'🙂dEf', N'abc🙂def🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CS_AI)
209+
SELECT CHARINDEX(N'🙂D', N'abc🙂d🙂d🙂D' COLLATE Latin1_General_CS_AI)
210+
SELECT CHARINDEX(N'🙂dEf', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CI_AI)
211+
SELECT CHARINDEX(N'🙂', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CS_AI)
212+
SELECT CHARINDEX(N'🙂', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CI_AI)
213+
GO
214+
215+
/* Substring to find starts with surrogate pair BABEL-5169 */
216+
SELECT REPLACE(N'abc🙂defghi🙂🙂', N'🙂def', N'jhi🙂' COLLATE Latin1_General_CI_AI)
217+
SELECT REPLACE(N'abc🙂🙂🙂🙂🙂defghi🙂🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
218+
SELECT REPLACE(N'abc🙂🙂🙂🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
219+
SELECT REPLACE(N'🙂abc🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
220+
GO
221+
207222
-- psql
208223
CREATE COLLATION case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
209224
CREATE COLLATION ignore_accents (provider = icu, locale = 'nd-u-kc-true-ks-level1', deterministic = false);

0 commit comments

Comments
 (0)