45
45
#define MAX_BYTES_PER_CHAR 4 /* NOTE(review): presumably the widest single encoded character in bytes (4 would fit UTF-8) -- confirm at use sites */
46
46
/*
 * Size limit named for the accent-removal path: 250 * 1024 * 1024 = 262144000
 * (units are not visible here -- presumably bytes; confirm at use sites).
 * The expansion is parenthesized so the macro behaves as a single value when
 * it appears next to operators such as / or % in a larger expression.
 */
#define MAX_INPUT_LENGTH_TO_REMOVE_ACCENTS (250 * 1024 * 1024)
47
47
48
+ /*
49
+ * True when the UTF-16 code unit c is any surrogate, lead or trail: the mask 0xF800
50
+ * matches all of D800 - DFFF, not only leads (D800 - DBFF). c is expanded unparenthesized.
51
+ */
52
+ #define UCHAR_IS_SURROGATE (c ) ((c & 0xF800) == 0xD800)
53
+
54
+ /* Length, in UTF-16 code units, assigned to code unit c: 2 when UCHAR_IS_SURROGATE(c) holds, otherwise 1 */
55
+ #define UCHAR_LENGTH (c ) (UCHAR_IS_SURROGATE(c) ? 2 : 1)
56
+
48
57
Oid server_collation_oid = InvalidOid ; /* file-scope OID, InvalidOid until something assigns it (assignment not visible in this excerpt) */
49
58
collation_callbacks * collation_callbacks_ptr = NULL ; /* file-scope callbacks pointer, NULL until set (setter not visible in this excerpt) */
50
59
extern bool babelfish_dump_restore ; /* declared here only; the definition lives in another translation unit */
@@ -1475,12 +1484,13 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
1475
1484
int32_t src_len_utf8 = VARSIZE_ANY_EXHDR (src_text );
1476
1485
int32_t substr_len_utf8 = VARSIZE_ANY_EXHDR (substr_text );
1477
1486
int32_t src_ulen , substr_ulen ;
1478
- int32_t u8_pos = -1 ;
1487
+ int32_t u8_pos = -1 , pos_prev_loop = -1 ;
1479
1488
UErrorCode status = U_ZERO_ERROR ;
1480
1489
UStringSearch * usearch ;
1481
1490
UChar * src_uchar , * substr_uchar ;
1482
1491
coll_info_t coll_info_of_inputcollid = tsql_lookup_collation_table_internal (collid );
1483
1492
bool is_CS_AI = false;
1493
+ bool is_substr_starts_with_surrogate ;
1484
1494
1485
1495
if (OidIsValid (coll_info_of_inputcollid .oid ) &&
1486
1496
coll_info_of_inputcollid .collateflags == 0x000e /* CS_AI */ )
@@ -1491,6 +1501,8 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
1491
1501
src_ulen = icu_to_uchar (& src_uchar , VARDATA_ANY (src_text ), src_len_utf8 );
1492
1502
substr_ulen = icu_to_uchar (& substr_uchar , VARDATA_ANY (substr_text ), substr_len_utf8 );
1493
1503
1504
+ is_substr_starts_with_surrogate = UCHAR_IS_SURROGATE (substr_uchar [0 ]);
1505
+
1494
1506
usearch = usearch_openFromCollator (substr_uchar ,
1495
1507
substr_ulen ,
1496
1508
src_uchar ,
@@ -1507,7 +1519,7 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
1507
1519
errmsg ("failed to perform ICU search: %s" ,
1508
1520
u_errorName (status ))));
1509
1521
1510
- for (int u16_pos = usearch_first (usearch , & status );
1522
+ for (int32_t u16_pos = usearch_first (usearch , & status );
1511
1523
u16_pos != USEARCH_DONE ;
1512
1524
u16_pos = usearch_next (usearch , & status ))
1513
1525
{
@@ -1517,6 +1529,27 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
1517
1529
errmsg ("failed to perform ICU search: %s" ,
1518
1530
u_errorName (status ))));
1519
1531
1532
+ /* ICU bug: when the pattern starts with a surrogate pair, usearch_next stops advancing and keeps returning the same position, which would cause an infinite loop */
1533
+ if (u16_pos == pos_prev_loop )
1534
+ {
1535
+ int32_t next_char_idx = u16_pos + UCHAR_LENGTH (src_uchar [u16_pos ]);
1536
+
1537
+ if (is_substr_starts_with_surrogate && next_char_idx < src_ulen )
1538
+ {
1539
+ usearch_setOffset (usearch , next_char_idx , & status );
1540
+
1541
+ if (U_FAILURE (status ))
1542
+ ereport (ERROR , (errcode (ERRCODE_INTERNAL_ERROR ),
1543
+ errmsg ("failed to set offset in ICU search: %s" , u_errorName (status ))));
1544
+
1545
+ continue ;
1546
+ }
1547
+ else
1548
+ break ;
1549
+ }
1550
+
1551
+ pos_prev_loop = u16_pos ;
1552
+
1520
1553
/* for CS_AI collations usearch can give false positives so we double check the results here */
1521
1554
if (!(is_CS_AI && icu_compare_utf8_coll (mylocale -> info .icu .ucol , & src_uchar [usearch_getMatchedStart (usearch )], usearch_getMatchedLength (usearch ), substr_uchar , substr_ulen , false) != 0 ))
1522
1555
{
@@ -1564,7 +1597,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
1564
1597
int32_t src_len = VARSIZE_ANY_EXHDR (src_text );
1565
1598
int32_t from_str_len = VARSIZE_ANY_EXHDR (from_text );
1566
1599
int32_t to_str_len = VARSIZE_ANY_EXHDR (to_text );
1567
- int32_t previous_pos ;
1600
+ int32_t previous_pos , pos_prev_loop = -1 ;
1568
1601
int32_t src_ulen , from_ulen ; /* in utf-16 units */
1569
1602
UErrorCode status = U_ZERO_ERROR ;
1570
1603
UStringSearch * usearch ;
@@ -1573,6 +1606,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
1573
1606
StringInfoData resbuf ;
1574
1607
coll_info_t coll_info_of_inputcollid = tsql_lookup_collation_table_internal (collid );
1575
1608
bool is_CS_AI = false;
1609
+ bool is_substr_starts_with_surrogate ;
1576
1610
1577
1611
if (OidIsValid (coll_info_of_inputcollid .oid ) &&
1578
1612
coll_info_of_inputcollid .collateflags == 0x000e /* CS_AI */ )
@@ -1583,6 +1617,8 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
1583
1617
src_ulen = icu_to_uchar (& src_uchar , VARDATA_ANY (src_text ), src_len );
1584
1618
from_ulen = icu_to_uchar (& from_uchar , VARDATA_ANY (from_text ), from_str_len );
1585
1619
1620
+ is_substr_starts_with_surrogate = UCHAR_IS_SURROGATE (from_uchar [0 ]);
1621
+
1586
1622
usearch = usearch_openFromCollator (from_uchar , /* needle */
1587
1623
from_ulen ,
1588
1624
src_uchar , /* haystack */
@@ -1596,7 +1632,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
1596
1632
initStringInfo (& resbuf );
1597
1633
previous_pos = 0 ;
1598
1634
1599
- for (int pos = usearch_first (usearch , & status );
1635
+ for (int32_t pos = usearch_first (usearch , & status );
1600
1636
pos != USEARCH_DONE ;
1601
1637
pos = usearch_next (usearch , & status ))
1602
1638
{
@@ -1609,6 +1645,27 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
1609
1645
errmsg ("failed to perform ICU search: %s" ,
1610
1646
u_errorName (status ))));
1611
1647
1648
+ /* ICU bug: when the pattern starts with a surrogate pair, usearch_next stops advancing and keeps returning the same position, which would cause an infinite loop */
1649
+ if (pos == pos_prev_loop )
1650
+ {
1651
+ int32_t next_char_idx = pos + UCHAR_LENGTH (src_uchar [pos ]);
1652
+
1653
+ if (is_substr_starts_with_surrogate && next_char_idx < src_ulen )
1654
+ {
1655
+ usearch_setOffset (usearch , next_char_idx , & status );
1656
+
1657
+ if (U_FAILURE (status ))
1658
+ ereport (ERROR , (errcode (ERRCODE_INTERNAL_ERROR ),
1659
+ errmsg ("failed to set offset in ICU search: %s" , u_errorName (status ))));
1660
+
1661
+ continue ;
1662
+ }
1663
+ else
1664
+ break ;
1665
+ }
1666
+
1667
+ pos_prev_loop = pos ;
1668
+
1612
1669
/* for CS_AI collations usearch can give false positives so we double check the results here */
1613
1670
if (is_CS_AI && icu_compare_utf8_coll (mylocale -> info .icu .ucol , & src_uchar [usearch_getMatchedStart (usearch )], usearch_getMatchedLength (usearch ), from_uchar , from_ulen , false) != 0 )
1614
1671
continue ;
0 commit comments