@@ -1475,12 +1475,13 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
1475
1475
int32_t src_len_utf8 = VARSIZE_ANY_EXHDR (src_text );
1476
1476
int32_t substr_len_utf8 = VARSIZE_ANY_EXHDR (substr_text );
1477
1477
int32_t src_ulen , substr_ulen ;
1478
- int32_t u8_pos = -1 ;
1478
+ int32_t u8_pos = -1 , pos_prev_loop = -1 ;
1479
1479
UErrorCode status = U_ZERO_ERROR ;
1480
1480
UStringSearch * usearch ;
1481
1481
UChar * src_uchar , * substr_uchar ;
1482
1482
coll_info_t coll_info_of_inputcollid = tsql_lookup_collation_table_internal (collid );
1483
1483
bool is_CS_AI = false;
1484
+ bool is_substr_starts_with_surrogate ;
1484
1485
1485
1486
if (OidIsValid (coll_info_of_inputcollid .oid ) &&
1486
1487
coll_info_of_inputcollid .collateflags == 0x000e /* CS_AI */ )
@@ -1491,6 +1492,8 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
1491
1492
src_ulen = icu_to_uchar (& src_uchar , VARDATA_ANY (src_text ), src_len_utf8 );
1492
1493
substr_ulen = icu_to_uchar (& substr_uchar , VARDATA_ANY (substr_text ), substr_len_utf8 );
1493
1494
1495
+ is_substr_starts_with_surrogate = U16_IS_SURROGATE (substr_uchar [0 ]);
1496
+
1494
1497
usearch = usearch_openFromCollator (substr_uchar ,
1495
1498
substr_ulen ,
1496
1499
src_uchar ,
@@ -1507,7 +1510,7 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
1507
1510
errmsg ("failed to perform ICU search: %s" ,
1508
1511
u_errorName (status ))));
1509
1512
1510
- for (int u16_pos = usearch_first (usearch , & status );
1513
+ for (int32_t u16_pos = usearch_first (usearch , & status );
1511
1514
u16_pos != USEARCH_DONE ;
1512
1515
u16_pos = usearch_next (usearch , & status ))
1513
1516
{
@@ -1517,6 +1520,28 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
1517
1520
errmsg ("failed to perform ICU search: %s" ,
1518
1521
u_errorName (status ))));
1519
1522
1523
+ /* ICU bug: when the pattern starts with a surrogate pair, usearch_next can stop moving forward and keep returning the same position, which would cause an infinite loop */
1524
+ if (u16_pos == pos_prev_loop )
1525
+ {
1526
+ /* A UTF-16 code unit in the range D800 - DBFF is a lead surrogate (first half of a two-unit surrogate pair), so step over both units; otherwise step one unit. The mask test is parenthesized because == binds tighter than &. */
1527
+ int32_t next_char_idx = u16_pos + ((src_uchar [u16_pos ] & 0xFC00 ) == 0xD800 ? 2 : 1 );
1528
+
1529
+ if (is_substr_starts_with_surrogate && next_char_idx < src_ulen )
1530
+ {
1531
+ usearch_setOffset (usearch , next_char_idx , & status );
1532
+
1533
+ if (U_FAILURE (status ))
1534
+ ereport (ERROR , (errcode (ERRCODE_INTERNAL_ERROR ),
1535
+ errmsg ("failed to set offset in ICU search: %s" , u_errorName (status ))));
1536
+
1537
+ continue ;
1538
+ }
1539
+ else
1540
+ break ;
1541
+ }
1542
+
1543
+ pos_prev_loop = u16_pos ;
1544
+
1520
1545
/* for CS_AI collations usearch can give false positives so we double check the results here */
1521
1546
if (!(is_CS_AI && icu_compare_utf8_coll (mylocale -> info .icu .ucol , & src_uchar [usearch_getMatchedStart (usearch )], usearch_getMatchedLength (usearch ), substr_uchar , substr_ulen , false) != 0 ))
1522
1547
{
@@ -1564,7 +1589,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
1564
1589
int32_t src_len = VARSIZE_ANY_EXHDR (src_text );
1565
1590
int32_t from_str_len = VARSIZE_ANY_EXHDR (from_text );
1566
1591
int32_t to_str_len = VARSIZE_ANY_EXHDR (to_text );
1567
- int32_t previous_pos ;
1592
+ int32_t previous_pos , pos_prev_loop = -1 ;
1568
1593
int32_t src_ulen , from_ulen ; /* in utf-16 units */
1569
1594
UErrorCode status = U_ZERO_ERROR ;
1570
1595
UStringSearch * usearch ;
@@ -1573,6 +1598,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
1573
1598
StringInfoData resbuf ;
1574
1599
coll_info_t coll_info_of_inputcollid = tsql_lookup_collation_table_internal (collid );
1575
1600
bool is_CS_AI = false;
1601
+ bool is_substr_starts_with_surrogate ;
1576
1602
1577
1603
if (OidIsValid (coll_info_of_inputcollid .oid ) &&
1578
1604
coll_info_of_inputcollid .collateflags == 0x000e /* CS_AI */ )
@@ -1583,6 +1609,8 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
1583
1609
src_ulen = icu_to_uchar (& src_uchar , VARDATA_ANY (src_text ), src_len );
1584
1610
from_ulen = icu_to_uchar (& from_uchar , VARDATA_ANY (from_text ), from_str_len );
1585
1611
1612
+ is_substr_starts_with_surrogate = U16_IS_SURROGATE (from_uchar [0 ]);
1613
+
1586
1614
usearch = usearch_openFromCollator (from_uchar , /* needle */
1587
1615
from_ulen ,
1588
1616
src_uchar , /* haystack */
@@ -1596,7 +1624,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
1596
1624
initStringInfo (& resbuf );
1597
1625
previous_pos = 0 ;
1598
1626
1599
- for (int pos = usearch_first (usearch , & status );
1627
+ for (int32_t pos = usearch_first (usearch , & status );
1600
1628
pos != USEARCH_DONE ;
1601
1629
pos = usearch_next (usearch , & status ))
1602
1630
{
@@ -1609,6 +1637,23 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
1609
1637
errmsg ("failed to perform ICU search: %s" ,
1610
1638
u_errorName (status ))));
1611
1639
1640
+ /* ICU bug: when the pattern starts with a surrogate pair, usearch_next can stop moving forward and keep returning the same position, which would cause an infinite loop */
1641
+ if (pos == pos_prev_loop )
1642
+ {
1643
+ /* A UTF-16 code unit in the range D800 - DBFF is a lead surrogate (first half of a two-unit surrogate pair), so step over both units; otherwise step one unit. The mask test is parenthesized because == binds tighter than &. */
1644
+ int32_t next_char_idx = pos + ((src_uchar [pos ] & 0xFC00 ) == 0xD800 ? 2 : 1 );
1645
+
1646
+ if (is_substr_starts_with_surrogate && next_char_idx < src_ulen )
1647
+ {
1648
+ usearch_setOffset (usearch , next_char_idx , & status );
1649
+ continue ;
1650
+ }
1651
+ else
1652
+ break ;
1653
+ }
1654
+
1655
+ pos_prev_loop = pos ;
1656
+
1612
1657
/* for CS_AI collations usearch can give false positives so we double check the results here */
1613
1658
if (is_CS_AI && icu_compare_utf8_coll (mylocale -> info .icu .ucol , & src_uchar [usearch_getMatchedStart (usearch )], usearch_getMatchedLength (usearch ), from_uchar , from_ulen , false) != 0 )
1614
1659
continue ;
0 commit comments