Skip to content

Commit 0fd96ad

Browse files
committed
fix infinite loop in replace with AI collations (babelfish-for-postgresql#2849)
ICU usearch_next() goes into an infinite loop when the pattern to search for starts with a surrogate pair. To get around this, we check whether the output of usearch_next() is stuck and not moving forward, and if so we set the offset for the next search ourselves. The next offset is simply the character after the current character in the source string. Example: SRC STRING = 'abc🙂defghi🙂🙂', PATTERN TO FIND = '🙂def'. usearch_next() gets stuck on "🙂" (idx = 3) and repeatedly returns this index. We intervene and set the offset to "d" (idx = 4), so that usearch_next() only starts looking from this character. Task: BABEL-5167 Signed-off-by: Tanzeel Khan <[email protected]>
1 parent eecb8f2 commit 0fd96ad

File tree

3 files changed

+136
-4
lines changed

3 files changed

+136
-4
lines changed

contrib/babelfishpg_tsql/src/collation.c

+61-4
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,15 @@
4545
#define MAX_BYTES_PER_CHAR 4
4646
#define MAX_INPUT_LENGTH_TO_REMOVE_ACCENTS 250 * 1024 * 1024
4747

48+
/*
49+
* Check if a UChar is a lead surrogate. If the UChar is in
50+
* the range D800 - DBFF then it is a lead surrogate.
51+
*/
52+
/*
 * Mask 0xFC00 keeps the top six bits, so only lead surrogates (D800 - DBFF)
 * match; trail surrogates (DC00 - DFFF) do not. The argument is parenthesized
 * so that expressions such as (a | b) are evaluated as a whole.
 */
#define UCHAR_IS_SURROGATE(c) ((((c)) & 0xFC00) == 0xD800)
53+
54+
/* Find length of given Uchar */
55+
/* Yields 1 unless UCHAR_IS_SURROGATE(c) holds, in which case it yields 2. */
#define UCHAR_LENGTH(c) (!UCHAR_IS_SURROGATE(c) ? 1 : 2)
56+
4857
Oid server_collation_oid = InvalidOid;
4958
collation_callbacks *collation_callbacks_ptr = NULL;
5059
extern bool babelfish_dump_restore;
@@ -1475,12 +1484,13 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
14751484
int32_t src_len_utf8 = VARSIZE_ANY_EXHDR(src_text);
14761485
int32_t substr_len_utf8 = VARSIZE_ANY_EXHDR(substr_text);
14771486
int32_t src_ulen, substr_ulen;
1478-
int32_t u8_pos = -1;
1487+
int32_t u8_pos = -1, pos_prev_loop = -1;
14791488
UErrorCode status = U_ZERO_ERROR;
14801489
UStringSearch *usearch;
14811490
UChar *src_uchar, *substr_uchar;
14821491
coll_info_t coll_info_of_inputcollid = tsql_lookup_collation_table_internal(collid);
14831492
bool is_CS_AI = false;
1493+
bool is_substr_starts_with_surrogate;
14841494

14851495
if (OidIsValid(coll_info_of_inputcollid.oid) &&
14861496
coll_info_of_inputcollid.collateflags == 0x000e /* CS_AI */ )
@@ -1491,6 +1501,8 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
14911501
src_ulen = icu_to_uchar(&src_uchar, VARDATA_ANY(src_text), src_len_utf8);
14921502
substr_ulen = icu_to_uchar(&substr_uchar, VARDATA_ANY(substr_text), substr_len_utf8);
14931503

1504+
is_substr_starts_with_surrogate = UCHAR_IS_SURROGATE(substr_uchar[0]);
1505+
14941506
usearch = usearch_openFromCollator(substr_uchar,
14951507
substr_ulen,
14961508
src_uchar,
@@ -1507,7 +1519,7 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
15071519
errmsg("failed to perform ICU search: %s",
15081520
u_errorName(status))));
15091521

1510-
for (int u16_pos = usearch_first(usearch, &status);
1522+
for (int32_t u16_pos = usearch_first(usearch, &status);
15111523
u16_pos != USEARCH_DONE;
15121524
u16_pos = usearch_next(usearch, &status))
15131525
{
@@ -1517,6 +1529,27 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
15171529
errmsg("failed to perform ICU search: %s",
15181530
u_errorName(status))));
15191531

1532+
/* ICU bug: when the pattern starts with a surrogate pair, usearch_next() stops moving forward and keeps returning the same position, causing an infinite loop */
1533+
if (u16_pos == pos_prev_loop)
1534+
{
1535+
int32_t next_char_idx = u16_pos + UCHAR_LENGTH(src_uchar[u16_pos]);
1536+
1537+
if (is_substr_starts_with_surrogate && next_char_idx < src_ulen)
1538+
{
1539+
usearch_setOffset(usearch, next_char_idx, &status);
1540+
1541+
if (U_FAILURE(status))
1542+
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR),
1543+
errmsg("failed to set offset in ICU search: %s", u_errorName(status))));
1544+
1545+
continue;
1546+
}
1547+
else
1548+
break;
1549+
}
1550+
1551+
pos_prev_loop = u16_pos;
1552+
15201553
/* for CS_AI collations usearch can give false positives so we double check the results here */
15211554
if (!(is_CS_AI && icu_compare_utf8_coll(mylocale->info.icu.ucol, &src_uchar[usearch_getMatchedStart(usearch)], usearch_getMatchedLength(usearch), substr_uchar, substr_ulen, false) != 0))
15221555
{
@@ -1564,7 +1597,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
15641597
int32_t src_len = VARSIZE_ANY_EXHDR(src_text);
15651598
int32_t from_str_len = VARSIZE_ANY_EXHDR(from_text);
15661599
int32_t to_str_len = VARSIZE_ANY_EXHDR(to_text);
1567-
int32_t previous_pos;
1600+
int32_t previous_pos, pos_prev_loop = -1;
15681601
int32_t src_ulen, from_ulen; /* in utf-16 units */
15691602
UErrorCode status = U_ZERO_ERROR;
15701603
UStringSearch *usearch;
@@ -1573,6 +1606,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
15731606
StringInfoData resbuf;
15741607
coll_info_t coll_info_of_inputcollid = tsql_lookup_collation_table_internal(collid);
15751608
bool is_CS_AI = false;
1609+
bool is_substr_starts_with_surrogate;
15761610

15771611
if (OidIsValid(coll_info_of_inputcollid.oid) &&
15781612
coll_info_of_inputcollid.collateflags == 0x000e /* CS_AI */ )
@@ -1583,6 +1617,8 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
15831617
src_ulen = icu_to_uchar(&src_uchar, VARDATA_ANY(src_text), src_len);
15841618
from_ulen = icu_to_uchar(&from_uchar, VARDATA_ANY(from_text), from_str_len);
15851619

1620+
is_substr_starts_with_surrogate = UCHAR_IS_SURROGATE(from_uchar[0]);
1621+
15861622
usearch = usearch_openFromCollator(from_uchar, /* needle */
15871623
from_ulen,
15881624
src_uchar, /* haystack */
@@ -1596,7 +1632,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
15961632
initStringInfo(&resbuf);
15971633
previous_pos = 0;
15981634

1599-
for (int pos = usearch_first(usearch, &status);
1635+
for (int32_t pos = usearch_first(usearch, &status);
16001636
pos != USEARCH_DONE;
16011637
pos = usearch_next(usearch, &status))
16021638
{
@@ -1609,6 +1645,27 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
16091645
errmsg("failed to perform ICU search: %s",
16101646
u_errorName(status))));
16111647

1648+
/* ICU bug: when the pattern starts with a surrogate pair, usearch_next() stops moving forward and keeps returning the same position, causing an infinite loop */
1649+
if (pos == pos_prev_loop)
1650+
{
1651+
int32_t next_char_idx = pos + UCHAR_LENGTH(src_uchar[pos]);
1652+
1653+
if (is_substr_starts_with_surrogate && next_char_idx < src_ulen)
1654+
{
1655+
usearch_setOffset(usearch, next_char_idx, &status);
1656+
1657+
if (U_FAILURE(status))
1658+
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR),
1659+
errmsg("failed to set offset in ICU search: %s", u_errorName(status))));
1660+
1661+
continue;
1662+
}
1663+
else
1664+
break;
1665+
}
1666+
1667+
pos_prev_loop = pos;
1668+
16121669
/* for CS_AI collations usearch can give false positives so we double check the results here */
16131670
if (is_CS_AI && icu_compare_utf8_coll(mylocale->info.icu.ucol, &src_uchar[usearch_getMatchedStart(usearch)], usearch_getMatchedLength(usearch), from_uchar, from_ulen, false) != 0)
16141671
continue;

test/JDBC/expected/charindex_and_replace_CIAI_collations.out

+60
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,66 @@ AAAAAABBBBBBBEEEEEEAAAAA
642642
DROP TABLE BABEL_4850_T
643643
GO
644644

645+
/* Substring to find starts with surrogate pair BABEL-5169 */
646+
SELECT CHARINDEX(N'🙂dEf', N'abc🙂def🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CS_AI)
647+
SELECT CHARINDEX(N'🙂D', N'abc🙂d🙂d🙂D' COLLATE Latin1_General_CS_AI)
648+
SELECT CHARINDEX(N'🙂dEf', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CI_AI)
649+
SELECT CHARINDEX(N'🙂', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CS_AI)
650+
SELECT CHARINDEX(N'🙂', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CI_AI)
651+
GO
652+
~~START~~
653+
int
654+
14
655+
~~END~~
656+
657+
~~START~~
658+
int
659+
8
660+
~~END~~
661+
662+
~~START~~
663+
int
664+
4
665+
~~END~~
666+
667+
~~START~~
668+
int
669+
4
670+
~~END~~
671+
672+
~~START~~
673+
int
674+
4
675+
~~END~~
676+
677+
678+
/* Substring to find starts with surrogate pair BABEL-5169 */
679+
SELECT REPLACE(N'abc🙂defghi🙂🙂', N'🙂def', N'jhi🙂' COLLATE Latin1_General_CI_AI)
680+
SELECT REPLACE(N'abc🙂🙂🙂🙂🙂defghi🙂🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
681+
SELECT REPLACE(N'abc🙂🙂🙂🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
682+
SELECT REPLACE(N'🙂abc🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
683+
GO
684+
~~START~~
685+
nvarchar
686+
abcjhi🙂ghi🙂🙂
687+
~~END~~
688+
689+
~~START~~
690+
nvarchar
691+
abc<----><----><----><----><---->defghi<----><---->
692+
~~END~~
693+
694+
~~START~~
695+
nvarchar
696+
abc<----><----><----><---->
697+
~~END~~
698+
699+
~~START~~
700+
nvarchar
701+
<---->abc<---->
702+
~~END~~
703+
704+
645705
-- psql
646706
CREATE COLLATION case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
647707
CREATE COLLATION ignore_accents (provider = icu, locale = 'nd-u-kc-true-ks-level1', deterministic = false);

test/JDBC/input/charindex_and_replace_CIAI_collations.mix

+15
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,21 @@ GO
204204
DROP TABLE BABEL_4850_T
205205
GO
206206

207+
/* Substring to find starts with surrogate pair BABEL-5169 */
208+
SELECT CHARINDEX(N'🙂dEf', N'abc🙂def🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CS_AI)
209+
SELECT CHARINDEX(N'🙂D', N'abc🙂d🙂d🙂D' COLLATE Latin1_General_CS_AI)
210+
SELECT CHARINDEX(N'🙂dEf', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CI_AI)
211+
SELECT CHARINDEX(N'🙂', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CS_AI)
212+
SELECT CHARINDEX(N'🙂', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CI_AI)
213+
GO
214+
215+
/* Substring to find starts with surrogate pair BABEL-5169 */
216+
SELECT REPLACE(N'abc🙂defghi🙂🙂', N'🙂def', N'jhi🙂' COLLATE Latin1_General_CI_AI)
217+
SELECT REPLACE(N'abc🙂🙂🙂🙂🙂defghi🙂🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
218+
SELECT REPLACE(N'abc🙂🙂🙂🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
219+
SELECT REPLACE(N'🙂abc🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
220+
GO
221+
207222
-- psql
208223
CREATE COLLATION case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
209224
CREATE COLLATION ignore_accents (provider = icu, locale = 'nd-u-kc-true-ks-level1', deterministic = false);

0 commit comments

Comments
 (0)