Skip to content

Commit 2ec3187

Browse files
authored
Merge pull request #169 from hamishknight/back-to-the-future
2 parents 6bed883 + 9d5529e commit 2ec3187

File tree

3 files changed

+43
-59
lines changed

3 files changed

+43
-59
lines changed

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 14 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ extension Source {
279279
/// | 'x' HexDigit{0...2}
280280
/// | 'U' HexDigit{8}
281281
/// | 'o{' OctalDigit{1...} '}'
282-
/// | OctalDigit{1...3}
282+
/// | '0' OctalDigit{0...3}
283283
///
284284
mutating func expectUnicodeScalar(
285285
escapedCharacter base: Character
@@ -313,13 +313,14 @@ extension Source {
313313
let str = try src.lexUntil(eating: "}").value
314314
return try Source.validateUnicodeScalar(str, .octal)
315315

316-
case let c where c.isOctalDigit:
317-
// We can read *up to* 2 more octal digits per PCRE.
318-
// FIXME: ICU can read up to 3 octal digits if the leading digit is 0,
319-
// we should have a parser mode to switch.
320-
let nextDigits = src.tryEatPrefix(maxLength: 2, \.isOctalDigit)
321-
let str = String(c) + (nextDigits?.string ?? "")
322-
return try Source.validateUnicodeScalar(str, .octal)
316+
case "0":
317+
// We can read *up to* 3 more octal digits.
318+
// FIXME: PCRE can only read up to 2 octal digits, if we get a strict
319+
// PCRE mode, we should limit it here.
320+
guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else {
321+
return Unicode.Scalar(0)
322+
}
323+
return try Source.validateUnicodeScalar(digits.string, .octal)
323324

324325
default:
325326
fatalError("Unexpected scalar start")
@@ -1341,26 +1342,10 @@ extension Source {
13411342
return nil
13421343
}
13431344

1344-
// Lexing \n is tricky, as it's ambiguous with octal sequences. In PCRE
1345-
// it is treated as a backreference if its first digit is not 0 (as that
1346-
// is always octal) and one of the following holds:
1347-
//
1348-
// - It's 0 < n < 10 (as octal would be pointless here)
1349-
// - Its first digit is 8 or 9 (as not valid octal)
1350-
// - There have been as many prior groups as the reference.
1351-
//
1352-
// Oniguruma follows the same rules except the second one. e.g \81 and
1353-
// \91 are instead treated as literal 81 and 91 respectively.
1354-
// TODO: If we want a strict Oniguruma mode, we'll need to add a check
1355-
// here.
1345+
// Backslash followed by a non-0 digit character is a backreference.
13561346
if firstChar != "0", let numAndLoc = try src.lexNumber() {
1357-
let num = numAndLoc.value
1358-
let ref = AST.Reference(.absolute(num), innerLoc: numAndLoc.location)
1359-
if num < 10 || firstChar == "8" || firstChar == "9" ||
1360-
context.isPriorGroupRef(ref.kind) {
1361-
return .backreference(ref)
1362-
}
1363-
return nil
1347+
return .backreference(.init(
1348+
.absolute(numAndLoc.value), innerLoc: numAndLoc.location))
13641349
}
13651350
return nil
13661351
}
@@ -1497,10 +1482,8 @@ extension Source {
14971482
}
14981483

14991484
switch char {
1500-
// Hexadecimal and octal unicode scalars. This must be done after
1501-
// backreference lexing due to the ambiguity with \nnn.
1502-
case let c where c.isOctalDigit: fallthrough
1503-
case "u", "x", "U", "o":
1485+
// Hexadecimal and octal unicode scalars.
1486+
case "u", "x", "U", "o", "0":
15041487
return try .scalar(
15051488
src.expectUnicodeScalar(escapedCharacter: char).value)
15061489
default:

Tests/RegexTests/MatchTests.swift

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ extension RegexTests {
267267
firstMatchTest(#"\070"#, input: "1238xyz", match: "8")
268268
firstMatchTest(#"\07A"#, input: "123\u{7}Axyz", match: "\u{7}A")
269269
firstMatchTest(#"\08"#, input: "123\08xyz", match: "\08")
270-
firstMatchTest(#"\0707"#, input: "12387xyz", match: "87")
270+
firstMatchTest(#"\0707"#, input: "12387\u{1C7}xyz", match: "\u{1C7}")
271271

272272
// code point sequence
273273
firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true)
@@ -1018,9 +1018,6 @@ extension RegexTests {
10181018
firstMatchTest(
10191019
#"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\10"#,
10201020
input: "aaaaaaaaabbc", match: "aaaaaaaaabb")
1021-
firstMatchTest(
1022-
#"(.)\10"#,
1023-
input: "a\u{8}b", match: "a\u{8}")
10241021

10251022
firstMatchTest(
10261023
#"(.)\g001"#,

Tests/RegexTests/ParseTests.swift

Lines changed: 28 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -329,21 +329,23 @@ extension RegexTests {
329329
parseTest(#"\070"#, scalar("\u{38}"))
330330
parseTest(#"\07A"#, concat(scalar("\u{7}"), "A"))
331331
parseTest(#"\08"#, concat(scalar("\u{0}"), "8"))
332-
parseTest(#"\0707"#, concat(scalar("\u{38}"), "7"))
332+
parseTest(#"\0707"#, scalar("\u{1C7}"))
333333

334334
parseTest(#"[\0]"#, charClass(scalar_m("\u{0}")))
335335
parseTest(#"[\01]"#, charClass(scalar_m("\u{1}")))
336336
parseTest(#"[\070]"#, charClass(scalar_m("\u{38}")))
337337

338338
parseTest(#"[\07A]"#, charClass(scalar_m("\u{7}"), "A"))
339339
parseTest(#"[\08]"#, charClass(scalar_m("\u{0}"), "8"))
340-
parseTest(#"[\0707]"#, charClass(scalar_m("\u{38}"), "7"))
340+
parseTest(#"[\0707]"#, charClass(scalar_m("\u{1C7}")))
341341

342-
parseTest(#"[\1]"#, charClass(scalar_m("\u{1}")))
343-
parseTest(#"[\123]"#, charClass(scalar_m("\u{53}")))
344-
parseTest(#"[\101]"#, charClass(scalar_m("\u{41}")))
345-
parseTest(#"[\7777]"#, charClass(scalar_m("\u{1FF}"), "7"))
346-
parseTest(#"[\181]"#, charClass(scalar_m("\u{1}"), "8", "1"))
342+
// TODO: These are treated as octal sequences by PCRE, we should warn and
343+
// suggest user prefix with 0.
344+
parseTest(#"[\1]"#, charClass("1"))
345+
parseTest(#"[\123]"#, charClass("1", "2", "3"))
346+
parseTest(#"[\101]"#, charClass("1", "0", "1"))
347+
parseTest(#"[\7777]"#, charClass("7", "7", "7", "7"))
348+
parseTest(#"[\181]"#, charClass("1", "8", "1"))
347349

348350
// We take *up to* the first two valid digits for \x. No valid digits is 0.
349351
parseTest(#"\x"#, scalar("\u{0}"))
@@ -492,6 +494,10 @@ extension RegexTests {
492494
#"a\Q \Q \\.\Eb"#,
493495
concat("a", quote(#" \Q \\."#), "b"))
494496

497+
// These follow the PCRE behavior.
498+
parseTest(#"\Q\\E"#, quote("\\"))
499+
parseTest(#"\E"#, "E")
500+
495501
parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"),
496502
syntax: .experimental)
497503
parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")),
@@ -797,11 +803,9 @@ extension RegexTests {
797803
)
798804
}
799805

800-
// TODO: Some of these behaviors are unintuitive, we should likely warn on
801-
// some of them.
802-
parseTest(#"\10"#, scalar("\u{8}"))
803-
parseTest(#"\18"#, concat(scalar("\u{1}"), "8"))
804-
parseTest(#"\7777"#, concat(scalar("\u{1FF}"), "7"))
806+
parseTest(#"\10"#, backreference(.absolute(10)))
807+
parseTest(#"\18"#, backreference(.absolute(18)))
808+
parseTest(#"\7777"#, backreference(.absolute(7777)))
805809
parseTest(#"\91"#, backreference(.absolute(91)))
806810

807811
parseTest(
@@ -813,21 +817,22 @@ extension RegexTests {
813817
parseTest(
814818
#"()()()()()()()()()\10()"#,
815819
concat(Array(repeating: capture(empty()), count: 9)
816-
+ [scalar("\u{8}"), capture(empty())]),
820+
+ [backreference(.absolute(10)), capture(empty())]),
817821
captures: .tuple(Array(repeating: .atom(), count: 10))
818822
)
819-
parseTest(#"()()\10"#,
820-
concat(capture(empty()), capture(empty()), scalar("\u{8}")),
821-
captures: .tuple(.atom(), .atom()))
823+
parseTest(#"()()\10"#, concat(
824+
capture(empty()), capture(empty()), backreference(.absolute(10))),
825+
captures: .tuple(.atom(), .atom())
826+
)
822827

823828
// A capture of three empty captures.
824829
let fourCaptures = capture(
825830
concat(capture(empty()), capture(empty()), capture(empty()))
826831
)
827832
parseTest(
828833
// There are 9 capture groups in total here.
829-
#"((()()())(()()()))\10"#,
830-
concat(capture(concat(fourCaptures, fourCaptures)), scalar("\u{8}")),
834+
#"((()()())(()()()))\10"#, concat(capture(concat(
835+
fourCaptures, fourCaptures)), backreference(.absolute(10))),
831836
captures: .tuple(Array(repeating: .atom(), count: 9))
832837
)
833838
parseTest(
@@ -852,7 +857,7 @@ extension RegexTests {
852857
concat(Array(repeating: capture(empty()), count: 40) + [scalar(" ")]),
853858
captures: .tuple(Array(repeating: .atom(), count: 40))
854859
)
855-
parseTest(#"\40"#, scalar(" "))
860+
parseTest(#"\40"#, backreference(.absolute(40)))
856861
parseTest(
857862
String(repeating: "()", count: 40) + #"\40"#,
858863
concat(Array(repeating: capture(empty()), count: 40)
@@ -862,7 +867,7 @@ extension RegexTests {
862867

863868
parseTest(#"\7"#, backreference(.absolute(7)))
864869

865-
parseTest(#"\11"#, scalar("\u{9}"))
870+
parseTest(#"\11"#, backreference(.absolute(11)))
866871
parseTest(
867872
String(repeating: "()", count: 11) + #"\11"#,
868873
concat(Array(repeating: capture(empty()), count: 11)
@@ -876,12 +881,11 @@ extension RegexTests {
876881
captures: .tuple(Array(repeating: .atom(), count: 11))
877882
)
878883

879-
parseTest(#"\0113"#, concat(scalar("\u{9}"), "3"))
880-
parseTest(#"\113"#, scalar("\u{4B}"))
881-
parseTest(#"\377"#, scalar("\u{FF}"))
884+
parseTest(#"\0113"#, scalar("\u{4B}"))
885+
parseTest(#"\113"#, backreference(.absolute(113)))
886+
parseTest(#"\377"#, backreference(.absolute(377)))
882887
parseTest(#"\81"#, backreference(.absolute(81)))
883888

884-
885889
parseTest(#"\g1"#, backreference(.absolute(1)))
886890
parseTest(#"\g001"#, backreference(.absolute(1)))
887891
parseTest(#"\g52"#, backreference(.absolute(52)))

0 commit comments

Comments
 (0)