diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 302f7224de4a7a..599cd0f738af76 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -94,6 +94,9 @@ repetition to an inner repetition, parentheses may be used. For example, the expression ``(?:a{6})*`` matches any multiple of six ``'a'`` characters. +Special Characters +^^^^^^^^^^^^^^^^^^ + The special characters are: .. index:: single: . (dot); in regular expressions @@ -818,6 +821,40 @@ Flags Corresponds to the inline flag ``(?x)``. +String Indexing Arguments +^^^^^^^^^^^^^^^^^^^^^^^^^ + +The following functions and related ``Pattern`` methods support optional string +index ``pos`` & ``endpos`` parameters: + + * ``re.match()`` & ``Pattern.match()`` + * ``re.fullmatch()`` & ``Pattern.fullmatch()`` + * ``re.search()`` & ``Pattern.search()`` + * ``re.findall()`` & ``Pattern.findall()`` + * ``re.finditer()`` & ``Pattern.finditer()`` + +The optional parameter *pos* gives an index in the string where the search is +to start; it defaults to ``0``. This is not completely equivalent to slicing +the string; the ``'^'`` pattern character matches at the real beginning of the +string and at positions just after a newline, but not necessarily at the index +where the search is to start. + +The optional parameter *endpos* limits how far the string will be searched; it +will be as if the string is *endpos* characters long, so only the characters +from *pos* to ``endpos - 1`` will be searched for a match. If *endpos* is less +than *pos*, no match will be found; otherwise, if *rx* is a compiled regular +expression object, ``rx.search(string, 0, 50)`` is equivalent to +``rx.search(string[:50], 0)``. :: + + >>> pattern = re.compile("d") + >>> pattern.search("dog") # Match at index 0 + <re.Match object; span=(0, 1), match='d'> + >>> pattern.search("dog", 1) # No match; search doesn't include the "d" + +.. versionchanged:: 3.13 + Top-level module functions now support ``pos`` and ``endpos``. 
+ + Functions ^^^^^^^^^ @@ -853,15 +890,18 @@ Functions about compiling regular expressions. -.. function:: search(pattern, string, flags=0) +.. function:: search(pattern, string, flags=0, pos=0, endpos=sys.maxsize) Scan through *string* looking for the first location where the regular expression *pattern* produces a match, and return a corresponding :class:`~re.Match`. Return ``None`` if no position in the string matches the pattern; note that this is different from finding a zero-length match at some point in the string. + .. versionchanged:: 3.13 + Now supports ``pos`` and ``endpos``. + -.. function:: match(pattern, string, flags=0) +.. function:: match(pattern, string, flags=0, pos=0, endpos=sys.maxsize) If zero or more characters at the beginning of *string* match the regular expression *pattern*, return a corresponding :class:`~re.Match`. Return @@ -874,8 +914,11 @@ Functions If you want to locate a match anywhere in *string*, use :func:`search` instead (see also :ref:`search-vs-match`). + .. versionchanged:: 3.13 + Now supports ``pos`` and ``endpos``. -.. function:: fullmatch(pattern, string, flags=0) + +.. function:: fullmatch(pattern, string, flags=0, pos=0, endpos=sys.maxsize) If the whole *string* matches the regular expression *pattern*, return a corresponding :class:`~re.Match`. Return ``None`` if the string does not match @@ -883,6 +926,9 @@ Functions .. versionadded:: 3.4 + .. versionchanged:: 3.13 + Now supports ``pos`` and ``endpos``. + .. function:: split(pattern, string, maxsplit=0, flags=0) @@ -933,7 +979,7 @@ Functions :ref:`keyword-only parameters <keyword-only_parameter>`. -.. function:: findall(pattern, string, flags=0) +.. function:: findall(pattern, string, flags=0, pos=0, endpos=sys.maxsize) Return all non-overlapping matches of *pattern* in *string*, as a list of strings or tuples. The *string* is scanned left-to-right, and matches @@ -954,8 +1000,11 @@ Functions .. versionchanged:: 3.7 Non-empty matches can now start just after a previous empty match. + .. 
versionchanged:: 3.13 + Now supports ``pos`` and ``endpos``. + -.. function:: finditer(pattern, string, flags=0) +.. function:: finditer(pattern, string, flags=0, pos=0, endpos=sys.maxsize) Return an :term:`iterator` yielding :class:`~re.Match` objects over all non-overlapping matches for the RE *pattern* in *string*. The *string* @@ -965,6 +1014,9 @@ Functions .. versionchanged:: 3.7 Non-empty matches can now start just after a previous empty match. + .. versionchanged:: 3.13 + Now supports ``pos`` and ``endpos``. + .. function:: sub(pattern, repl, string, count=0, flags=0) @@ -1141,7 +1193,7 @@ Regular Expression Objects :py:class:`re.Pattern` supports ``[]`` to indicate a Unicode (str) or bytes pattern. See :ref:`types-genericalias`. -.. method:: Pattern.search(string[, pos[, endpos]]) +.. method:: Pattern.search(string, pos=0, endpos=sys.maxsize) Scan through *string* looking for the first location where this regular expression produces a match, and return a corresponding :class:`~re.Match`. @@ -1167,7 +1219,7 @@ Regular Expression Objects >>> pattern.search("dog", 1) # No match; search doesn't include the "d" -.. method:: Pattern.match(string[, pos[, endpos]]) +.. method:: Pattern.match(string, pos=0, endpos=sys.maxsize) If zero or more characters at the *beginning* of *string* match this regular expression, return a corresponding :class:`~re.Match`. Return ``None`` if the @@ -1186,7 +1238,7 @@ Regular Expression Objects :meth:`~Pattern.search` instead (see also :ref:`search-vs-match`). -.. method:: Pattern.fullmatch(string[, pos[, endpos]]) +.. method:: Pattern.fullmatch(string, pos=0, endpos=sys.maxsize) If the whole *string* matches this regular expression, return a corresponding :class:`~re.Match`. Return ``None`` if the string does not match the pattern; @@ -1209,14 +1261,14 @@ Regular Expression Objects Identical to the :func:`split` function, using the compiled pattern. -.. method:: Pattern.findall(string[, pos[, endpos]]) +.. 
method:: Pattern.findall(string, pos=0, endpos=sys.maxsize) Similar to the :func:`findall` function, using the compiled pattern, but also accepts optional *pos* and *endpos* parameters that limit the search region like for :meth:`search`. -.. method:: Pattern.finditer(string[, pos[, endpos]]) +.. method:: Pattern.finditer(string, pos=0, endpos=sys.maxsize) Similar to the :func:`finditer` function, using the compiled pattern, but also accepts optional *pos* and *endpos* parameters that limit the search diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py index 7e8abbf6ffe155..30b6173cde0e0c 100644 --- a/Lib/re/__init__.py +++ b/Lib/re/__init__.py @@ -117,12 +117,32 @@ U UNICODE For compatibility only. Ignored for string patterns (it is the default), and forbidden for bytes patterns. +The following functions support optional pos/endpos arguments: + match + fullmatch + search + findall + finditer + +The optional parameter pos gives an index in the string where the search is +to start; it defaults to 0. This is not completely equivalent to slicing the +string; the '^' pattern character matches at the real beginning of the string +and at positions just after a newline, but not necessarily at the index where +the search is to start. + +The optional parameter endpos limits how far the string will be searched; +it will be as if the string is endpos characters long, so only the characters +from pos to endpos - 1 will be searched for a match. If endpos is less than +pos, no match will be found. Otherwise, if rx is a compiled regular expression +object, rx.search(string, 0, 50) is equivalent to rx.search(string[:50], 0). + This module also defines exception 'PatternError', aliased to 'error' for backward compatibility. """ import enum +import sys from . 
import _compiler, _parser import functools import _sre @@ -161,20 +181,20 @@ class RegexFlag: # -------------------------------------------------------------------- # public interface -def match(pattern, string, flags=0): +def match(pattern, string, flags=0, pos=0, endpos=sys.maxsize): """Try to apply the pattern at the start of the string, returning a Match object, or None if no match was found.""" - return _compile(pattern, flags).match(string) + return _compile(pattern, flags).match(string, pos=pos, endpos=endpos) -def fullmatch(pattern, string, flags=0): +def fullmatch(pattern, string, flags=0, pos=0, endpos=sys.maxsize): """Try to apply the pattern to all of the string, returning a Match object, or None if no match was found.""" - return _compile(pattern, flags).fullmatch(string) + return _compile(pattern, flags).fullmatch(string, pos=pos, endpos=endpos) -def search(pattern, string, flags=0): +def search(pattern, string, flags=0, pos=0, endpos=sys.maxsize): """Scan through string looking for a match to the pattern, returning a Match object, or None if no match was found.""" - return _compile(pattern, flags).search(string) + return _compile(pattern, flags).search(string, pos=pos, endpos=endpos) class _ZeroSentinel(int): pass @@ -267,7 +287,7 @@ def split(pattern, string, *args, maxsplit=_zero_sentinel, flags=_zero_sentinel) return _compile(pattern, flags).split(string, maxsplit) split.__text_signature__ = '(pattern, string, maxsplit=0, flags=0)' -def findall(pattern, string, flags=0): +def findall(pattern, string, flags=0, pos=0, endpos=sys.maxsize): """Return a list of all non-overlapping matches in the string. If one or more capturing groups are present in the pattern, return @@ -275,14 +295,14 @@ def findall(pattern, string, flags=0): has more than one group. 
Empty matches are included in the result.""" - return _compile(pattern, flags).findall(string) + return _compile(pattern, flags).findall(string, pos=pos, endpos=endpos) -def finditer(pattern, string, flags=0): +def finditer(pattern, string, flags=0, pos=0, endpos=sys.maxsize): """Return an iterator over all non-overlapping matches in the string. For each match, the iterator returns a Match object. Empty matches are included in the result.""" - return _compile(pattern, flags).finditer(string) + return _compile(pattern, flags).finditer(string, pos=pos, endpos=endpos) def compile(pattern, flags=0): "Compile a regular expression pattern, returning a Pattern object." diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 993a7d6e264a1f..c986c45c2e98f2 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1981,6 +1981,61 @@ def test_keyword_parameters(self): pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(), (7, 9)) + def test_module_function_pos_endpos(self): + # pos/endpos - positional arguments + self.assertEqual( + re.match(r'(ab)', 'abracadabra', 0, 7).span(), (7, 9)) + self.assertEqual( + re.match(r'(ab)', 'abracadabra', 0, 7, 10).span(), (7, 9)) + self.assertEqual( + re.fullmatch(r'(abra)', 'abracadabra', 1, 7).span(), (7, 11)) + self.assertEqual( + re.fullmatch(r'(ab)', 'abracadabra', 0, 7, 9).span(), (7, 9)) + self.assertEqual( + re.search(r'(ab)', 'abracadabra', 0, 3).span(), (7, 9)) + self.assertEqual( + re.search(r'(ab)', 'abracadabra', 0, 3, 10).span(), (7, 9)) + self.assertEqual( + re.findall(r'(ab)', 'abracadabracadabra', 0, 3), ['ab', 'ab']) + self.assertEqual( + re.findall(r'(ab)', 'abracadabracadabra', 0, 3, 16), ['ab', 'ab']) + iter = re.finditer(r":+", "a:b::c:::d", 0, 3) + self.assertEqual([item.group(0) for item in iter], ["::", ":::"]) + iter = re.finditer(r":+", "a:b::c:::d", 0, 3, 10) + self.assertEqual([item.group(0) for item in iter], ["::", ":::"]) + + # pos/endpos - keyword arguments + self.assertEqual( + 
re.match(r'(ab)', 'abracadabra', pos=7).span(), (7, 9)) + self.assertEqual( + re.match(r'(ab)', 'abracadabra', endpos=9).span(), (0, 2)) + self.assertEqual( + re.match(r'(ab)', 'abracadabra', pos=7, endpos=9).span(), (7, 9)) + self.assertEqual( + re.fullmatch(r'(abra)', 'abracadabra', pos=7).span(), (7, 11)) + self.assertEqual( + re.fullmatch(r'(ab)', 'abracadabra', endpos=2).span(), (0, 2)) + self.assertEqual( + re.fullmatch(r'(ab)', 'abracadabra', pos=7, endpos=9).span(), (7, 9)) + self.assertEqual( + re.search(r'(ab)', 'abracadabra', pos=3).span(), (7, 9)) + self.assertEqual( + re.search(r'(ab)', 'abracadabra', endpos=9).span(), (0, 2)) + self.assertEqual( + re.search(r'(ab)', 'abracadabra', pos=3, endpos=9).span(), (7, 9)) + self.assertEqual( + re.findall(r':+', 'a:b::c:::d', pos=3), ['::', ':::']) + self.assertEqual( + re.findall(r':+', 'a:b::c:::d', endpos=6), [':', '::']) + self.assertEqual( + re.findall(r':+', 'a:b::c:::d', pos=3, endpos=10), ['::', ':::']) + iter = re.finditer(r':+', 'a:b::c:::d', pos=3) + self.assertEqual([item.group(0) for item in iter], ['::', ':::']) + iter = re.finditer(r':+', 'a:b::c:::d', endpos=6) + self.assertEqual([item.group(0) for item in iter], [':', '::']) + iter = re.finditer(r':+', 'a:b::c:::d', pos=3, endpos=10) + self.assertEqual([item.group(0) for item in iter], ['::', ':::']) + def test_bug_20998(self): # Issue #20998: Fullmatch of repeated single character pattern # with ignore case. 
diff --git a/Misc/ACKS b/Misc/ACKS index 6b98be32905391..a4e21e055da007 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -1695,6 +1695,7 @@ Reilly Tucker Siemens Paul Sijben SilentGhost Tim Silk +Adam Silkey Michael Simcich Ionel Simionescu Kirill Simonov diff --git a/Misc/NEWS.d/next/Library/2023-12-20-01-17-58.gh-issue-113304.fE5U8G.rst b/Misc/NEWS.d/next/Library/2023-12-20-01-17-58.gh-issue-113304.fE5U8G.rst new file mode 100644 index 00000000000000..840f9cb8cb0243 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-12-20-01-17-58.gh-issue-113304.fE5U8G.rst @@ -0,0 +1 @@ +Add ``pos``/``endpos`` parameters to top-level re module functions.