Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix infinite loop in replace with AI collations #2849

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 61 additions & 4 deletions contrib/babelfishpg_tsql/src/collation.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,15 @@
#define MAX_BYTES_PER_CHAR 4
#define MAX_INPUT_LENGTH_TO_REMOVE_ACCENTS 250 * 1024 * 1024

/*
 * UCHAR_IS_SURROGATE(c)
 *   True when the UTF-16 code unit c is a lead (high) surrogate, i.e. it
 *   lies in the range 0xD800 - 0xDBFF and therefore opens a surrogate pair.
 *   Trail (low) surrogates, 0xDC00 - 0xDFFF, deliberately do NOT match:
 *   they continue a pair rather than start one.
 *
 *   The argument is parenthesized so that expression arguments (for example
 *   "a | b") are evaluated as a whole before the mask is applied.
 */
#define UCHAR_IS_SURROGATE(c) ((((c)) & 0xFFFFFC00) == 0xD800)

/*
 * UCHAR_LENGTH(c)
 *   Number of UTF-16 code units occupied by the character that starts at
 *   code unit c: 2 when c opens a surrogate pair, otherwise 1.
 */
#define UCHAR_LENGTH(c) (UCHAR_IS_SURROGATE(c) ? 2 : 1)

Oid server_collation_oid = InvalidOid;
collation_callbacks *collation_callbacks_ptr = NULL;
extern bool babelfish_dump_restore;
Expand Down Expand Up @@ -1475,12 +1484,13 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
int32_t src_len_utf8 = VARSIZE_ANY_EXHDR(src_text);
int32_t substr_len_utf8 = VARSIZE_ANY_EXHDR(substr_text);
int32_t src_ulen, substr_ulen;
int32_t u8_pos = -1;
int32_t u8_pos = -1, pos_prev_loop = -1;
UErrorCode status = U_ZERO_ERROR;
UStringSearch *usearch;
UChar *src_uchar, *substr_uchar;
coll_info_t coll_info_of_inputcollid = tsql_lookup_collation_table_internal(collid);
bool is_CS_AI = false;
bool is_substr_starts_with_surrogate;

if (OidIsValid(coll_info_of_inputcollid.oid) &&
coll_info_of_inputcollid.collateflags == 0x000e /* CS_AI */ )
Expand All @@ -1491,6 +1501,8 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
src_ulen = icu_to_uchar(&src_uchar, VARDATA_ANY(src_text), src_len_utf8);
substr_ulen = icu_to_uchar(&substr_uchar, VARDATA_ANY(substr_text), substr_len_utf8);

is_substr_starts_with_surrogate = UCHAR_IS_SURROGATE(substr_uchar[0]);

usearch = usearch_openFromCollator(substr_uchar,
substr_ulen,
src_uchar,
Expand All @@ -1507,7 +1519,7 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
errmsg("failed to perform ICU search: %s",
u_errorName(status))));

for (int u16_pos = usearch_first(usearch, &status);
for (int32_t u16_pos = usearch_first(usearch, &status);
u16_pos != USEARCH_DONE;
u16_pos = usearch_next(usearch, &status))
{
Expand All @@ -1517,6 +1529,27 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
errmsg("failed to perform ICU search: %s",
u_errorName(status))));

/* ICU bug: when the pattern starts with a surrogate pair, usearch_next stops moving forward, which would cause an infinite loop */
if (u16_pos == pos_prev_loop)
{
int32_t next_char_idx = u16_pos + UCHAR_LENGTH(src_uchar[u16_pos]);

if (is_substr_starts_with_surrogate && next_char_idx < src_ulen)
{
usearch_setOffset(usearch, next_char_idx, &status);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need error checking?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done


if (U_FAILURE(status))
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR),
errmsg("failed to set offset in ICU search: %s", u_errorName(status))));

continue;
}
else
break;
}

pos_prev_loop = u16_pos;

/* for CS_AI collations usearch can give false positives so we double check the results here */
if (!(is_CS_AI && icu_compare_utf8_coll(mylocale->info.icu.ucol, &src_uchar[usearch_getMatchedStart(usearch)], usearch_getMatchedLength(usearch), substr_uchar, substr_ulen, false) != 0))
{
Expand Down Expand Up @@ -1564,7 +1597,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
int32_t src_len = VARSIZE_ANY_EXHDR(src_text);
int32_t from_str_len = VARSIZE_ANY_EXHDR(from_text);
int32_t to_str_len = VARSIZE_ANY_EXHDR(to_text);
int32_t previous_pos;
int32_t previous_pos, pos_prev_loop = -1;
int32_t src_ulen, from_ulen; /* in utf-16 units */
UErrorCode status = U_ZERO_ERROR;
UStringSearch *usearch;
Expand All @@ -1573,6 +1606,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
StringInfoData resbuf;
coll_info_t coll_info_of_inputcollid = tsql_lookup_collation_table_internal(collid);
bool is_CS_AI = false;
bool is_substr_starts_with_surrogate;

if (OidIsValid(coll_info_of_inputcollid.oid) &&
coll_info_of_inputcollid.collateflags == 0x000e /* CS_AI */ )
Expand All @@ -1583,6 +1617,8 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
src_ulen = icu_to_uchar(&src_uchar, VARDATA_ANY(src_text), src_len);
from_ulen = icu_to_uchar(&from_uchar, VARDATA_ANY(from_text), from_str_len);

is_substr_starts_with_surrogate = UCHAR_IS_SURROGATE(from_uchar[0]);

usearch = usearch_openFromCollator(from_uchar, /* needle */
from_ulen,
src_uchar, /* haystack */
Expand All @@ -1596,7 +1632,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
initStringInfo(&resbuf);
previous_pos = 0;

for (int pos = usearch_first(usearch, &status);
for (int32_t pos = usearch_first(usearch, &status);
pos != USEARCH_DONE;
pos = usearch_next(usearch, &status))
{
Expand All @@ -1609,6 +1645,27 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
errmsg("failed to perform ICU search: %s",
u_errorName(status))));

/* ICU bug: when the pattern starts with a surrogate pair, usearch_next stops moving forward, which would cause an infinite loop */
if (pos == pos_prev_loop)
{
int32_t next_char_idx = pos + UCHAR_LENGTH(src_uchar[pos]);

if (is_substr_starts_with_surrogate && next_char_idx < src_ulen)
{
usearch_setOffset(usearch, next_char_idx, &status);

if (U_FAILURE(status))
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR),
errmsg("failed to set offset in ICU search: %s", u_errorName(status))));

continue;
}
else
break;
}

pos_prev_loop = pos;

/* for CS_AI collations usearch can give false positives so we double check the results here */
if (is_CS_AI && icu_compare_utf8_coll(mylocale->info.icu.ucol, &src_uchar[usearch_getMatchedStart(usearch)], usearch_getMatchedLength(usearch), from_uchar, from_ulen, false) != 0)
continue;
Expand Down
60 changes: 60 additions & 0 deletions test/JDBC/expected/charindex_and_replace_CIAI_collations.out
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,66 @@ AAAAAABBBBBBBEEEEEEAAAAA
DROP TABLE BABEL_4850_T
GO

/* Substring to find starts with surrogate pair BABEL-5169 */
SELECT CHARINDEX(N'🙂dEf', N'abc🙂def🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CS_AI)
SELECT CHARINDEX(N'🙂D', N'abc🙂d🙂d🙂D' COLLATE Latin1_General_CS_AI)
SELECT CHARINDEX(N'🙂dEf', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CI_AI)
SELECT CHARINDEX(N'🙂', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CS_AI)
SELECT CHARINDEX(N'🙂', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CI_AI)
GO
~~START~~
int
14
~~END~~

~~START~~
int
8
~~END~~

~~START~~
int
4
~~END~~

~~START~~
int
4
~~END~~

~~START~~
int
4
~~END~~


/* Substring to find starts with surrogate pair BABEL-5169 */
SELECT REPLACE(N'abc🙂defghi🙂🙂', N'🙂def', N'jhi🙂' COLLATE Latin1_General_CI_AI)
SELECT REPLACE(N'abc🙂🙂🙂🙂🙂defghi🙂🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
SELECT REPLACE(N'abc🙂🙂🙂🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
SELECT REPLACE(N'🙂abc🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
GO
~~START~~
nvarchar
abcjhi🙂ghi🙂🙂
~~END~~

~~START~~
nvarchar
abc<----><----><----><----><---->defghi<----><---->
~~END~~

~~START~~
nvarchar
abc<----><----><----><---->
~~END~~

~~START~~
nvarchar
<---->abc<---->
~~END~~


-- psql
CREATE COLLATION case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
CREATE COLLATION ignore_accents (provider = icu, locale = 'nd-u-kc-true-ks-level1', deterministic = false);
Expand Down
15 changes: 15 additions & 0 deletions test/JDBC/input/charindex_and_replace_CIAI_collations.mix
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,21 @@ GO
DROP TABLE BABEL_4850_T
GO

/* Substring to find starts with surrogate pair BABEL-5169 */
SELECT CHARINDEX(N'🙂dEf', N'abc🙂def🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CS_AI)
SELECT CHARINDEX(N'🙂D', N'abc🙂d🙂d🙂D' COLLATE Latin1_General_CS_AI)
SELECT CHARINDEX(N'🙂dEf', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CI_AI)
SELECT CHARINDEX(N'🙂', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CS_AI)
SELECT CHARINDEX(N'🙂', N'abc🙂defgh🙂dEfi🙂🙂' COLLATE Latin1_General_CI_AI)
GO

/* Substring to find starts with surrogate pair BABEL-5169 */
SELECT REPLACE(N'abc🙂defghi🙂🙂', N'🙂def', N'jhi🙂' COLLATE Latin1_General_CI_AI)
SELECT REPLACE(N'abc🙂🙂🙂🙂🙂defghi🙂🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
SELECT REPLACE(N'abc🙂🙂🙂🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
SELECT REPLACE(N'🙂abc🙂', N'🙂', N'<---->' COLLATE Latin1_General_CI_AI)
GO

-- psql
CREATE COLLATION case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
CREATE COLLATION ignore_accents (provider = icu, locale = 'nd-u-kc-true-ks-level1', deterministic = false);
Expand Down
Loading