From 021c2fa669defa59f0fbd91cc4481cbe49f0f004 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Mon, 10 Apr 2023 14:08:17 -0600 Subject: [PATCH 1/9] Simplify instruction fetching --- Sources/_StringProcessing/Engine/Metrics.swift | 2 +- Sources/_StringProcessing/Engine/Processor.swift | 6 +++++- Sources/_StringProcessing/Utility/Protocols.swift | 10 ---------- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Metrics.swift b/Sources/_StringProcessing/Engine/Metrics.swift index 753c3c3d1..49a669a0b 100644 --- a/Sources/_StringProcessing/Engine/Metrics.swift +++ b/Sources/_StringProcessing/Engine/Metrics.swift @@ -21,7 +21,7 @@ extension Processor { } mutating func measure() { - let (opcode, _) = fetch().destructure + let (opcode, _) = fetch() if metrics.instructionCounts.keys.contains(opcode) { metrics.instructionCounts[opcode]! += 1 } else { diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 98aee7803..2d34ab3ea 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -160,6 +160,10 @@ extension Processor { } extension Processor { + func fetch() -> (Instruction.OpCode, Instruction.Payload) { + instructions[controller.pc].destructure + } + var slice: Input.SubSequence { // TODO: Should we whole-scale switch to slices, or // does that depend on options for some anchors? @@ -449,7 +453,7 @@ extension Processor { } #endif - let (opcode, payload) = fetch().destructure + let (opcode, payload) = fetch() switch opcode { case .invalid: fatalError("Invalid program") diff --git a/Sources/_StringProcessing/Utility/Protocols.swift b/Sources/_StringProcessing/Utility/Protocols.swift index 7542a17dd..24ffbcf70 100644 --- a/Sources/_StringProcessing/Utility/Protocols.swift +++ b/Sources/_StringProcessing/Utility/Protocols.swift @@ -44,13 +44,3 @@ protocol ProcessorProtocol { var registers: Registers { get } } -extension ProcessorProtocol { - func fetch() -> Instruction { - instructions[currentPC] - } - - var callStack: Array { [] } -// var savePoints: Array { [] } - var registers: Array { [] } - -} From ccf39927b8e8fc9f65ff98cb8688b859ac346c88 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Sat, 1 Apr 2023 10:33:24 -0600 Subject: [PATCH 2/9] Refactor metrics out, and void their storage in release builds --- .../_StringProcessing/Engine/Metrics.swift | 65 ++++++++++++++++++- .../_StringProcessing/Engine/Processor.swift | 35 ++++------ .../_StringProcessing/Engine/Tracing.swift | 5 ++ Sources/_StringProcessing/Executor.swift | 4 +- .../_StringProcessing/Utility/Traced.swift | 2 +- 5 files changed, 82 insertions(+), 29 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Metrics.swift b/Sources/_StringProcessing/Engine/Metrics.swift index 49a669a0b..372a7e1b4 100644 --- a/Sources/_StringProcessing/Engine/Metrics.swift +++ b/Sources/_StringProcessing/Engine/Metrics.swift @@ -1,13 +1,71 @@ extension Processor { +#if PROCESSOR_MEASUREMENTS_ENABLED struct ProcessorMetrics { var instructionCounts: [Instruction.OpCode: Int] = [:] var backtracks: Int = 0 var resets: Int = 0 + var cycleCount: Int = 0 + + var isTracingEnabled: Bool = false + var shouldMeasureMetrics: Bool = false + + init(isTracingEnabled: Bool, shouldMeasureMetrics: Bool) { + self.isTracingEnabled = isTracingEnabled + self.shouldMeasureMetrics = shouldMeasureMetrics + } } - +#else + struct ProcessorMetrics { + var isTracingEnabled: Bool { false } + var shouldMeasureMetrics: Bool { false } + var cycleCount: Int { 0 } + + init(isTracingEnabled: Bool, shouldMeasureMetrics: Bool) { } + } +#endif +} + +extension Processor { + + mutating func startCycleMetrics() { +#if PROCESSOR_MEASUREMENTS_ENABLED + if metrics.cycleCount == 0 { + trace() + measureMetrics() + } +#endif + } + + mutating func endCycleMetrics() { +#if PROCESSOR_MEASUREMENTS_ENABLED + metrics.cycleCount += 1 + trace() + measureMetrics() + _checkInvariants() +#endif + } +} + +extension Processor.ProcessorMetrics { + + mutating func addReset() { +#if PROCESSOR_MEASUREMENTS_ENABLED + self.resets += 1 +#endif + } + + mutating func addBacktrack() { +#if PROCESSOR_MEASUREMENTS_ENABLED + self.backtracks += 1 +#endif + } +} + +extension Processor { +#if PROCESSOR_MEASUREMENTS_ENABLED func printMetrics() { print("===") - print("Total cycle count: \(cycleCount)") + print("Total cycle count: \(metrics.cycleCount)") print("Backtracks: \(metrics.backtracks)") print("Resets: \(metrics.resets)") print("Instructions:") @@ -30,8 +88,9 @@ extension Processor { } mutating func measureMetrics() { - if shouldMeasureMetrics { + if metrics.shouldMeasureMetrics { measure() } } +#endif } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 2d34ab3ea..6586d36ad 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -86,11 +86,7 @@ struct Processor { var failureReason: Error? = nil - // MARK: Metrics, debugging, etc. - var cycleCount = 0 - var isTracingEnabled: Bool - let shouldMeasureMetrics: Bool - var metrics: ProcessorMetrics = ProcessorMetrics() + var metrics: ProcessorMetrics } extension Processor { @@ -116,8 +112,11 @@ extension Processor { self.subjectBounds = subjectBounds self.searchBounds = searchBounds self.matchMode = matchMode - self.isTracingEnabled = isTracingEnabled - self.shouldMeasureMetrics = shouldMeasureMetrics + + self.metrics = ProcessorMetrics( + isTracingEnabled: isTracingEnabled, + shouldMeasureMetrics: shouldMeasureMetrics) + self.currentPosition = searchBounds.lowerBound // Initialize registers with end of search bounds @@ -144,8 +143,8 @@ extension Processor { self.state = .inProgress self.failureReason = nil - - if shouldMeasureMetrics { metrics.resets += 1 } + + metrics.addReset() _checkInvariants() } @@ -400,8 +399,8 @@ extension Processor { storedCaptures = capEnds registers.ints = intRegisters registers.positions = posRegisters - - if shouldMeasureMetrics { metrics.backtracks += 1 } + + metrics.addBacktrack() } mutating func abort(_ e: Error? = nil) { @@ -440,18 +439,8 @@ extension Processor { _checkInvariants() assert(state == .inProgress) -#if PROCESSOR_MEASUREMENTS_ENABLED - if cycleCount == 0 { - trace() - measureMetrics() - } - defer { - cycleCount += 1 - trace() - measureMetrics() - _checkInvariants() - } -#endif + startCycleMetrics() + defer { endCycleMetrics() } let (opcode, payload) = fetch() switch opcode { diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index 725319b00..b0ce67555 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ b/Sources/_StringProcessing/Engine/Tracing.swift @@ -9,7 +9,12 @@ // //===----------------------------------------------------------------------===// + +// TODO: Remove this protocol (and/or reuse it for something like a FastProcessor) extension Processor: TracedProcessor { + var cycleCount: Int { metrics.cycleCount } + var isTracingEnabled: Bool { metrics.isTracingEnabled } + var isFailState: Bool { state == .fail } var isAcceptState: Bool { state == .accept } diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 253858d1f..0453fcd80 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -31,7 +31,7 @@ struct Executor { subjectBounds: subjectBounds, searchBounds: searchBounds) #if PROCESSOR_MEASUREMENTS_ENABLED - defer { if cpu.shouldMeasureMetrics { cpu.printMetrics() } } + defer { if cpu.metrics.shouldMeasureMetrics { cpu.printMetrics() } } #endif var low = searchBounds.lowerBound let high = searchBounds.upperBound @@ -60,7 +60,7 @@ struct Executor { var cpu = engine.makeProcessor( input: input, bounds: subjectBounds, matchMode: mode) #if PROCESSOR_MEASUREMENTS_ENABLED - defer { if cpu.shouldMeasureMetrics { cpu.printMetrics() } } + defer { if cpu.metrics.shouldMeasureMetrics { cpu.printMetrics() } } #endif return try _match(input, from: subjectBounds.lowerBound, using: &cpu) } diff --git a/Sources/_StringProcessing/Utility/Traced.swift b/Sources/_StringProcessing/Utility/Traced.swift index 112a601b1..198564fe1 100644 --- a/Sources/_StringProcessing/Utility/Traced.swift +++ b/Sources/_StringProcessing/Utility/Traced.swift @@ -13,7 +13,7 @@ // TODO: Place shared formatting and trace infrastructure here protocol Traced { - var isTracingEnabled: Bool { get set } + var isTracingEnabled: Bool { get } } protocol TracedProcessor: ProcessorProtocol, Traced { From d5ffed3c0d2a1cc512ded370d30b7e9b8c7ac78d Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 12 Apr 2023 15:36:30 -0600 Subject: [PATCH 3/9] put scalar matching on String, consistent few percent --- .../_StringProcessing/Engine/MEQuantify.swift | 7 ++- .../_StringProcessing/Engine/Processor.swift | 61 ++++++++++++++----- 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 0e2f3923a..0a58f7ed2 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -5,8 +5,11 @@ extension Processor { case .bitset: next = _doMatchBitset(registers[payload.bitset]) case .asciiChar: - next = _doMatchScalar( - UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true) + next = input.matchScalar( + UnicodeScalar.init(_value: UInt32(payload.asciiChar)), + at: currentPosition, + limitedBy: end, + boundaryCheck: true) case .builtin: // We only emit .quantify if it consumes a single character next = input._matchBuiltinCC( diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 6586d36ad..20148f478 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -272,21 +272,10 @@ extension Processor { currentPosition < end ? input.unicodeScalars[currentPosition] : nil } - func _doMatchScalar(_ s: Unicode.Scalar, _ boundaryCheck: Bool) -> Input.Index? { - if s == loadScalar(), - let idx = input.unicodeScalars.index( - currentPosition, - offsetBy: 1, - limitedBy: end), - (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) { - return idx - } else { - return nil - } - } - mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool { - guard let next = _doMatchScalar(s, boundaryCheck) else { + guard let next = input.matchScalar( + s, at: currentPosition, limitedBy: end, boundaryCheck: boundaryCheck + ) else { signalFailure() return false } @@ -695,4 +684,48 @@ extension Processor { controller.step() } } + + func sleep() { + var i = 0 + for c in input { + if i > 20 { break } + i += 1 + if c == "C" { + blackHole(c) + } + } + } +} + +@inline(never) +func blackHole(_ t: T) { _ = t } + +extension String { + + // func consumeScalar(_ n: Distance) -> Bool { + + // } + + func matchScalar( + _ scalar: Unicode.Scalar, + at pos: Index, + limitedBy end: String.Index, + boundaryCheck: Bool + ) -> Index? { + assert(end <= endIndex) + + guard pos < end, unicodeScalars[pos] == scalar else { + return nil + } + + let idx = unicodeScalars.index(after: pos) + guard idx <= end else { return nil } + + if boundaryCheck && !isOnGraphemeClusterBoundary(idx) { + return nil + } + + return idx + } + } From ba8b57e9f6f23b64cf4d3f809d73ab4e1ea3671e Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 12 Apr 2023 16:03:50 -0600 Subject: [PATCH 4/9] sink match onto string --- .../_StringProcessing/Engine/Processor.swift | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 20148f478..06725a466 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -232,11 +232,13 @@ extension Processor { // Match against the current input element. Returns whether // it succeeded vs signaling an error. mutating func match(_ e: Element) -> Bool { - guard let cur = load(), cur == e else { + guard let next = input.match( + e, at: currentPosition, limitedBy: end + ) else { signalFailure() return false } - _uncheckedForcedConsumeOne() + currentPosition = next return true } @@ -256,6 +258,7 @@ extension Processor { isScalarMode: Bool ) -> Bool { if isScalarMode { + // TODO: sink to specialized method on string, needs benchmark for s in seq.unicodeScalars { guard matchScalar(s, boundaryCheck: false) else { return false } } @@ -702,6 +705,23 @@ func blackHole(_ t: T) { _ = t } extension String { + func match( + _ char: Character, + at pos: Index, + limitedBy end: String.Index + ) -> Index? { + // TODO: This can be greatly sped up with string internals + // TODO: This is also very much quick-check-able + assert(end <= endIndex) + + guard pos < end, self[pos] == char else { return nil } + + let idx = index(after: pos) + guard idx <= end else { return nil } + + return idx + } + // func consumeScalar(_ n: Distance) -> Bool { // } From 670ff9b4d97a06f6883045304639739b5cc8e8e0 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 12 Apr 2023 16:18:29 -0600 Subject: [PATCH 5/9] sink match bitset --- .../_StringProcessing/Engine/MEQuantify.swift | 3 +- .../_StringProcessing/Engine/Processor.swift | 35 ++++++++++++++----- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 0a58f7ed2..09ab71168 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -3,7 +3,8 @@ extension Processor { var next: Input.Index? switch payload.type { case .bitset: - next = _doMatchBitset(registers[payload.bitset]) + next = input.matchBitset( + registers[payload.bitset], at: currentPosition, limitedBy: end) case .asciiChar: next = input.matchScalar( UnicodeScalar.init(_value: UInt32(payload.asciiChar)), diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 06725a466..97dde4e36 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -243,6 +243,7 @@ extension Processor { } mutating func matchCaseInsensitive(_ e: Element) -> Bool { + // TODO: need benchmark coverage guard let cur = load(), cur.lowercased() == e.lowercased() else { signalFailure() return false @@ -290,6 +291,7 @@ extension Processor { _ s: Unicode.Scalar, boundaryCheck: Bool ) -> Bool { + // TODO: needs benchmark coverage guard let curScalar = loadScalar(), s.properties.lowercaseMapping == curScalar.properties.lowercaseMapping, let idx = input.unicodeScalars.index( @@ -305,21 +307,15 @@ extension Processor { return true } - func _doMatchBitset(_ bitset: DSLTree.CustomCharacterClass.AsciiBitset) -> Input.Index? { - if let cur = load(), bitset.matches(char: cur) { - return input.index(after: currentPosition) - } else { - return nil - } - } - // If we have a bitset we know that the CharacterClass only matches against // ascii characters, so check if the current input element is ascii then // check if it is set in the bitset mutating func matchBitset( _ bitset: DSLTree.CustomCharacterClass.AsciiBitset ) -> Bool { - guard let next = _doMatchBitset(bitset) else { + guard let next = input.matchBitset( + bitset, at: currentPosition, limitedBy: end + ) else { signalFailure() return false } @@ -748,4 +744,25 @@ extension String { return idx } + func matchBitset( + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, + at pos: Index, + limitedBy end: Index + ) -> Index? { + // TODO: extremely quick-check-able + // TODO: can be sped up with string internals + + assert(end <= endIndex) + + guard pos < end, bitset.matches(char: self[pos]) else { + return nil + } + + let idx = index(after: pos) + guard idx <= end else { return nil } + + return idx + } + + } From 1a4cde7a5004c9eb09e70781d70c5e2a1d41b8ba Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 12 Apr 2023 16:35:01 -0600 Subject: [PATCH 6/9] sink matchSeq --- .../_StringProcessing/Engine/Processor.swift | 38 +++++++++++++++---- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 97dde4e36..e8b521693 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -180,6 +180,7 @@ extension Processor { // Returns whether the advance succeeded. On failure, our // save point was restored mutating func consume(_ n: Distance) -> Bool { + // TODO: need benchmark coverage guard let idx = input.index( currentPosition, offsetBy: n.rawValue, limitedBy: end ) else { @@ -192,6 +193,7 @@ extension Processor { // Advances in unicode scalar view mutating func consumeScalar(_ n: Distance) -> Bool { + // TODO: need benchmark coverage guard let idx = input.unicodeScalars.index( currentPosition, offsetBy: n.rawValue, limitedBy: end ) else { @@ -223,11 +225,6 @@ extension Processor { func load() -> Element? { currentPosition < end ? input[currentPosition] : nil } - func load(count: Int) -> Input.SubSequence? { - let slice = self.slice[currentPosition...].prefix(count) - guard slice.count == count else { return nil } - return slice - } // Match against the current input element. Returns whether // it succeeded vs signaling an error. @@ -259,16 +256,21 @@ extension Processor { isScalarMode: Bool ) -> Bool { if isScalarMode { - // TODO: sink to specialized method on string, needs benchmark + // TODO: needs benchmark coverage for s in seq.unicodeScalars { guard matchScalar(s, boundaryCheck: false) else { return false } } return true } - for e in seq { - guard match(e) else { return false } + guard let next = input.matchSeq( + seq, at: currentPosition, limitedBy: end + ) else { + signalFailure() + return false } + + currentPosition = next return true } @@ -327,6 +329,7 @@ extension Processor { mutating func matchBitsetScalar( _ bitset: DSLTree.CustomCharacterClass.AsciiBitset ) -> Bool { + // TODO: needs benchmark coverage guard let curScalar = loadScalar(), bitset.matches(scalar: curScalar), let idx = input.unicodeScalars.index(currentPosition, offsetBy: 1, limitedBy: end) else { @@ -718,6 +721,25 @@ extension String { return idx } + func matchSeq( + _ seq: Substring, + at pos: Index, + limitedBy end: Index + ) -> Index? { + // TODO: This can be greatly sped up with string internals + // TODO: This is also very much quick-check-able + assert(end <= endIndex) + + var cur = pos + for e in seq { + guard cur < end, self[cur] == e else { return nil } + self.formIndex(after: &cur) + } + + guard cur <= end else { return nil } + return cur + } + // func consumeScalar(_ n: Distance) -> Bool { // } From 89fa6c70aa2148debbfc0125e16c1f4e4b5bde6c Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 12 Apr 2023 16:46:02 -0600 Subject: [PATCH 7/9] cleanup --- .../_StringProcessing/Engine/MEBuiltins.swift | 4 ++++ .../_StringProcessing/Engine/MEQuantify.swift | 1 + .../_StringProcessing/Engine/Processor.swift | 18 ------------------ 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 4ec7a71dd..b50d1c213 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -30,6 +30,7 @@ extension Processor { } func isAtStartOfLine(_ payload: AssertionPayload) -> Bool { + // TODO: needs benchmark coverage if currentPosition == subjectBounds.lowerBound { return true } switch payload.semanticLevel { case .graphemeCluster: @@ -40,6 +41,7 @@ extension Processor { } func isAtEndOfLine(_ payload: AssertionPayload) -> Bool { + // TODO: needs benchmark coverage if currentPosition == subjectBounds.upperBound { return true } switch payload.semanticLevel { case .graphemeCluster: @@ -50,6 +52,8 @@ extension Processor { } mutating func builtinAssert(by payload: AssertionPayload) throws -> Bool { + // TODO: needs benchmark coverage + // Future work: Optimize layout and dispatch switch payload.kind { case .startOfSubject: return currentPosition == subjectBounds.lowerBound diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 09ab71168..81b80b00e 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -20,6 +20,7 @@ extension Processor { isStrictASCII: payload.builtinIsStrict, isScalarSemantics: false) case .any: + // TODO: call out to existing code with quick check let matched = currentPosition != input.endIndex && (!input[currentPosition].isNewline || payload.anyMatchesNewline) next = matched ? input.index(after: currentPosition) : nil diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index e8b521693..3ff767207 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -686,22 +686,8 @@ extension Processor { controller.step() } } - - func sleep() { - var i = 0 - for c in input { - if i > 20 { break } - i += 1 - if c == "C" { - blackHole(c) - } - } - } } -@inline(never) -func blackHole(_ t: T) { _ = t } - extension String { func match( @@ -740,10 +726,6 @@ extension String { return cur } - // func consumeScalar(_ n: Distance) -> Bool { - - // } - func matchScalar( _ scalar: Unicode.Scalar, at pos: Index, From 7335fa77c524d6377dd04c32e1f31362354fdd3b Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 12 Apr 2023 17:08:52 -0600 Subject: [PATCH 8/9] cleanup --- Sources/_StringProcessing/Unicode/WordBreaking.swift | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Sources/_StringProcessing/Unicode/WordBreaking.swift b/Sources/_StringProcessing/Unicode/WordBreaking.swift index 50da079f6..f1f9573c1 100644 --- a/Sources/_StringProcessing/Unicode/WordBreaking.swift +++ b/Sources/_StringProcessing/Unicode/WordBreaking.swift @@ -12,6 +12,7 @@ @_spi(_Unicode) import Swift +// TODO: Sink onto String extension Processor { func atSimpleBoundary( _ usesAsciiWord: Bool, @@ -20,9 +21,11 @@ extension Processor { func matchesWord(at i: Input.Index) -> Bool { switch semanticLevel { case .graphemeCluster: + // TODO: needs benchmark coverage let c = input[i] return c.isWordCharacter && (c.isASCII || !usesAsciiWord) case .unicodeScalar: + // TODO: needs benchmark coverage let c = input.unicodeScalars[i] return (c.properties.isAlphabetic || c == "_") && (c.isASCII || !usesAsciiWord) } @@ -51,6 +54,7 @@ extension String { using cache: inout Set?, _ maxIndex: inout String.Index? ) -> Bool { + // TODO: needs benchmark coverage guard i != startIndex, i != endIndex else { return true } From 8694dead049c0ca40c420add40181e428778bcb8 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 13 Apr 2023 06:30:46 -0600 Subject: [PATCH 9/9] Bug fix in newline hot path, and apply hot path to quantified dot --- .../_StringProcessing/Engine/MEBuiltins.swift | 16 +++++++------ .../_StringProcessing/Engine/MEQuantify.swift | 24 ++++++++++++------- Tests/RegexTests/MatchTests.swift | 5 ++++ 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index b50d1c213..d8c8c347b 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -63,10 +63,10 @@ extension Processor { switch payload.semanticLevel { case .graphemeCluster: return input.index(after: currentPosition) == subjectBounds.upperBound - && input[currentPosition].isNewline + && input[currentPosition].isNewline case .unicodeScalar: return input.unicodeScalars.index(after: currentPosition) == subjectBounds.upperBound - && input.unicodeScalars[currentPosition].isNewline + && input.unicodeScalars[currentPosition].isNewline } case .endOfSubject: return currentPosition == subjectBounds.upperBound @@ -121,6 +121,7 @@ extension Processor { // MARK: Matching `.` extension String { + // TODO: Should the below have a `limitedBy` parameter? func _matchAnyNonNewline( at currentPosition: String.Index, @@ -155,11 +156,11 @@ extension String { return .unknown } switch asciiValue { - case ._lineFeed, ._carriageReturn: - return .definite(nil) - default: - assert(!isCRLF) - return .definite(next) + case (._lineFeed)...(._carriageReturn): + return .definite(nil) + default: + assert(!isCRLF) + return .definite(next) } } @@ -183,6 +184,7 @@ extension String { // MARK: - Built-in character class matching extension String { + // TODO: Should the below have a `limitedBy` parameter? // Mentioned in ProgrammersManual.md, update docs if redesigned func _matchBuiltinCC( diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 81b80b00e..873627567 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -1,31 +1,37 @@ extension Processor { func _doQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? { - var next: Input.Index? + // FIXME: is the below updated for scalar semantics? switch payload.type { case .bitset: - next = input.matchBitset( + return input.matchBitset( registers[payload.bitset], at: currentPosition, limitedBy: end) case .asciiChar: - next = input.matchScalar( + return input.matchScalar( UnicodeScalar.init(_value: UInt32(payload.asciiChar)), at: currentPosition, limitedBy: end, boundaryCheck: true) case .builtin: + // FIXME: bounds check? endIndex or end? + // We only emit .quantify if it consumes a single character - next = input._matchBuiltinCC( + return input._matchBuiltinCC( payload.builtin, at: currentPosition, isInverted: payload.builtinIsInverted, isStrictASCII: payload.builtinIsStrict, isScalarSemantics: false) case .any: - // TODO: call out to existing code with quick check - let matched = currentPosition != input.endIndex - && (!input[currentPosition].isNewline || payload.anyMatchesNewline) - next = matched ? input.index(after: currentPosition) : nil + // FIXME: endIndex or end? + guard currentPosition < input.endIndex else { return nil } + + if payload.anyMatchesNewline { + return input.index(after: currentPosition) + } + + return input._matchAnyNonNewline( + at: currentPosition, isScalarSemantics: false) } - return next } /// Generic quantify instruction interpreter diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index e8e41a114..a6c9babbe 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1891,6 +1891,11 @@ extension RegexTests { func testSingleLineMode() { firstMatchTest(#".+"#, input: "a\nb", match: "a") firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb") + + // We recognize LF, line tab, FF, and CR as newlines by default + firstMatchTest(#"."#, input: "\u{A}\u{B}\u{C}\u{D}\nb", match: "b") + firstMatchTest(#".+"#, input: "\u{A}\u{B}\u{C}\u{D}\nbb", match: "bb") + } func testMatchNewlines() {