Skip to content

Commit ab105b5

Browse files
Fix regex description emitted by generator for switched branches (#115136)
* Fix regex description emitted by generator for switched branches

  The emitter is mutating the regex tree, under the assumption that no one subsequently looks at the tree, but the XML comment generator does. Make sure the mutations are undone.

* Update src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

---------

Co-authored-by: Dan Moseley <danmose@microsoft.com>
1 parent 058b1ef commit ab105b5

File tree

2 files changed

+232
-22
lines changed

2 files changed

+232
-22
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1817,8 +1817,6 @@ void EmitSwitchedBranches()
18171817
writer.Indent++;
18181818

18191819
// Emit the code for the branch, without the first character that was already matched in the switch.
1820-
RegexNode? remainder = null;
1821-
HandleChild:
18221820
switch (child.Kind)
18231821
{
18241822
case RegexNodeKind.One:
@@ -1830,35 +1828,39 @@ void EmitSwitchedBranches()
18301828
case RegexNodeKind.Multi:
18311829
// First character was handled by the switch. Emit matching code for the remainder of the multi string.
18321830
sliceStaticPos++;
1833-
EmitNode(child.Str!.Length == 2 ?
1834-
new RegexNode(RegexNodeKind.One, child.Options, child.Str![1]) :
1835-
new RegexNode(RegexNodeKind.Multi, child.Options, child.Str!.Substring(1)));
1831+
EmitNode(SliceOffMultiFirstChar(child));
18361832
writer.WriteLine();
18371833
break;
18381834

18391835
case RegexNodeKind.Concatenate when child.Child(0) == startingLiteralNode && (startingLiteralNode.Kind is RegexNodeKind.One or RegexNodeKind.Set or RegexNodeKind.Multi):
18401836
// This is a concatenation where its first node is the starting literal we found and that starting literal
18411837
// is one of the nodes above that we know how to handle completely. This is a common
18421838
// enough case that we want to special-case it to avoid duplicating the processing for that character
1843-
// unnecessarily. So, we'll shave off that first node from the concatenation and then handle the remainder.
1844-
// Note that it's critical startingLiteralNode is something we can fully handle above: if it's not,
1845-
// we'll end up losing some of the pattern due to overwriting `remainder`.
1846-
remainder = child;
1847-
child = child.Child(0);
1848-
remainder.ReplaceChild(0, new RegexNode(RegexNodeKind.Empty, remainder.Options));
1849-
goto HandleChild; // reprocess just the first node that was saved; the remainder will then be processed below
1839+
// unnecessarily. First slice off the first character that was already handled. If that child is a multi, temporarily
1840+
// replace it with a node that doesn't have the already-matched first character; otherwise, replace it with an empty node
1841+
// that'll be ignored when rendered. Then emit the new tree, and subsequently restore the original child.
1842+
sliceStaticPos++;
1843+
RegexNode originalFirst = child.Child(0);
1844+
child.ReplaceChild(0,
1845+
child.Child(0).Kind is RegexNodeKind.Multi ?
1846+
SliceOffMultiFirstChar(child.Child(0)) : // multi, so slice off the first character
1847+
new RegexNode(RegexNodeKind.Empty, child.Options)); // single, so removing it yields empty
1848+
try
1849+
{
18501850

1851-
default:
1852-
Debug.Assert(remainder is null);
1853-
remainder = child;
1851+
EmitNode(child);
1852+
}
1853+
finally
1854+
{
1855+
child.ReplaceChild(0, originalFirst);
1856+
}
1857+
writer.WriteLine();
18541858
break;
1855-
}
18561859

1857-
if (remainder is not null)
1858-
{
1859-
// Emit a full match for whatever part of the child we haven't yet handled.
1860-
EmitNode(remainder);
1861-
writer.WriteLine();
1860+
default:
1861+
EmitNode(child);
1862+
writer.WriteLine();
1863+
break;
18621864
}
18631865

18641866
// This is only ever used for atomic alternations, so we can simply reset the doneLabel
@@ -1882,6 +1884,14 @@ void EmitSwitchedBranches()
18821884
}
18831885
}
18841886

1887+
// Builds a new node that represents the given Multi node's string with its first character removed.
// The input node is only read (Kind, Options, Str); it is not modified, so callers can keep using the original.
//   multi:   the node to slice; asserted (debug builds only) to be of kind Multi.
//   returns: a One node holding the second character when the string has exactly two characters,
//            otherwise a Multi node holding the substring that starts at index 1. Options are copied from the input.
// NOTE(review): assumes multi.Str has at least two characters; a one-character string would yield a Multi
// with an empty string -- confirm that Multi nodes can never be that short.
static RegexNode SliceOffMultiFirstChar(RegexNode multi)
1888+
{
1889+
Debug.Assert(multi.Kind is RegexNodeKind.Multi, $"Expected a Multi node, got {multi.Kind}");
1890+
// With two characters, dropping the first leaves a single character, which is represented as a One node rather than a Multi.
return multi.Str!.Length == 2 ?
1891+
new(RegexNodeKind.One, multi.Options, multi.Str[1]) :
1892+
new(RegexNodeKind.Multi, multi.Options, multi.Str.Substring(1));
1893+
}
1894+
18851895
void EmitAllBranches()
18861896
{
18871897
// Label to jump to when any branch completes successfully.

src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorOutputTests.cs

Lines changed: 201 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -754,7 +754,7 @@ void UncaptureUntil(int capturePosition)
754754
}
755755
756756
/// <summary>Helper methods used by generated <see cref="Regex"/>-derived implementations.</summary>
757-
[GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
757+
[GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "%VERSION%")]
758758
file static class Utilities
759759
{
760760
/// <summary>Supports searching for the string "href".</summary>
@@ -930,6 +930,206 @@ file static class Utilities
930930
}
931931
"""
932932
};
933+
934+
yield return new object[]
935+
{
936+
"""
937+
using System.Text.RegularExpressions;
938+
partial class C
939+
{
940+
[GeneratedRegex(@"abcd*e|f")]
941+
public static partial Regex Valid();
942+
}
943+
""",
944+
945+
"""
946+
// <auto-generated/>
947+
#nullable enable
948+
#pragma warning disable CS0162 // Unreachable code
949+
#pragma warning disable CS0164 // Unreferenced label
950+
#pragma warning disable CS0219 // Variable assigned but never used
951+
952+
partial class C
953+
{
954+
/// <remarks>
955+
/// Pattern:<br/>
956+
/// <code>abcd*e|f</code><br/>
957+
/// Explanation:<br/>
958+
/// <code>
959+
/// ○ Match with 2 alternative expressions, atomically.<br/>
960+
/// ○ Match a sequence of expressions.<br/>
961+
/// ○ Match the string "abc".<br/>
962+
/// ○ Match 'd' atomically any number of times.<br/>
963+
/// ○ Match 'e'.<br/>
964+
/// ○ Match 'f'.<br/>
965+
/// </code>
966+
/// </remarks>
967+
[global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "%VERSION%")]
968+
public static partial global::System.Text.RegularExpressions.Regex Valid() => global::System.Text.RegularExpressions.Generated.Valid_0.Instance;
969+
}
970+
971+
namespace System.Text.RegularExpressions.Generated
972+
{
973+
using System;
974+
using System.Buffers;
975+
using System.CodeDom.Compiler;
976+
using System.Collections;
977+
using System.ComponentModel;
978+
using System.Globalization;
979+
using System.Runtime.CompilerServices;
980+
using System.Text.RegularExpressions;
981+
using System.Threading;
982+
983+
/// <summary>Custom <see cref="Regex"/>-derived type for the Valid method.</summary>
984+
[GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "%VERSION%")]
985+
[SkipLocalsInit]
986+
file sealed class Valid_0 : Regex
987+
{
988+
/// <summary>Cached, thread-safe singleton instance.</summary>
989+
internal static readonly Valid_0 Instance = new();
990+
991+
/// <summary>Initializes the instance.</summary>
992+
private Valid_0()
993+
{
994+
base.pattern = "abcd*e|f";
995+
base.roptions = RegexOptions.None;
996+
ValidateMatchTimeout(Utilities.s_defaultTimeout);
997+
base.internalMatchTimeout = Utilities.s_defaultTimeout;
998+
base.factory = new RunnerFactory();
999+
base.capsize = 1;
1000+
}
1001+
1002+
/// <summary>Provides a factory for creating <see cref="RegexRunner"/> instances to be used by methods on <see cref="Regex"/>.</summary>
1003+
private sealed class RunnerFactory : RegexRunnerFactory
1004+
{
1005+
/// <summary>Creates an instance of a <see cref="RegexRunner"/> used by methods on <see cref="Regex"/>.</summary>
1006+
protected override RegexRunner CreateInstance() => new Runner();
1007+
1008+
/// <summary>Provides the runner that contains the custom logic implementing the specified regular expression.</summary>
1009+
private sealed class Runner : RegexRunner
1010+
{
1011+
/// <summary>Scan the <paramref name="inputSpan"/> starting from base.runtextstart for the next match.</summary>
1012+
/// <param name="inputSpan">The text being scanned by the regular expression.</param>
1013+
protected override void Scan(ReadOnlySpan<char> inputSpan)
1014+
{
1015+
// Search until we can't find a valid starting position, we find a match, or we reach the end of the input.
1016+
while (TryFindNextPossibleStartingPosition(inputSpan) &&
1017+
!TryMatchAtCurrentPosition(inputSpan) &&
1018+
base.runtextpos != inputSpan.Length)
1019+
{
1020+
base.runtextpos++;
1021+
if (Utilities.s_hasTimeout)
1022+
{
1023+
base.CheckTimeout();
1024+
}
1025+
}
1026+
}
1027+
1028+
/// <summary>Search <paramref name="inputSpan"/> starting from base.runtextpos for the next location a match could possibly start.</summary>
1029+
/// <param name="inputSpan">The text being scanned by the regular expression.</param>
1030+
/// <returns>true if a possible match was found; false if no more matches are possible.</returns>
1031+
private bool TryFindNextPossibleStartingPosition(ReadOnlySpan<char> inputSpan)
1032+
{
1033+
int pos = base.runtextpos;
1034+
1035+
// Empty matches aren't possible.
1036+
if ((uint)pos < (uint)inputSpan.Length)
1037+
{
1038+
// The pattern begins with a character in the set [af].
1039+
// Find the next occurrence. If it can't be found, there's no match.
1040+
int i = inputSpan.Slice(pos).IndexOfAny('a', 'f');
1041+
if (i >= 0)
1042+
{
1043+
base.runtextpos = pos + i;
1044+
return true;
1045+
}
1046+
}
1047+
1048+
// No match found.
1049+
base.runtextpos = inputSpan.Length;
1050+
return false;
1051+
}
1052+
1053+
/// <summary>Determine whether <paramref name="inputSpan"/> at base.runtextpos is a match for the regular expression.</summary>
1054+
/// <param name="inputSpan">The text being scanned by the regular expression.</param>
1055+
/// <returns>true if the regular expression matches at the current position; otherwise, false.</returns>
1056+
private bool TryMatchAtCurrentPosition(ReadOnlySpan<char> inputSpan)
1057+
{
1058+
int pos = base.runtextpos;
1059+
int matchStart = pos;
1060+
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
1061+
1062+
// Match with 2 alternative expressions, atomically.
1063+
{
1064+
if (slice.IsEmpty)
1065+
{
1066+
return false; // The input didn't match.
1067+
}
1068+
1069+
switch (slice[0])
1070+
{
1071+
case 'a':
1072+
// Match the string "bc".
1073+
if (!slice.Slice(1).StartsWith("bc"))
1074+
{
1075+
return false; // The input didn't match.
1076+
}
1077+
1078+
// Match 'd' atomically any number of times.
1079+
{
1080+
int iteration = slice.Slice(3).IndexOfAnyExcept('d');
1081+
if (iteration < 0)
1082+
{
1083+
iteration = slice.Length - 3;
1084+
}
1085+
1086+
slice = slice.Slice(iteration);
1087+
pos += iteration;
1088+
}
1089+
1090+
// Match 'e'.
1091+
if ((uint)slice.Length < 4 || slice[3] != 'e')
1092+
{
1093+
return false; // The input didn't match.
1094+
}
1095+
1096+
pos += 4;
1097+
slice = inputSpan.Slice(pos);
1098+
break;
1099+
1100+
case 'f':
1101+
pos++;
1102+
slice = inputSpan.Slice(pos);
1103+
break;
1104+
1105+
default:
1106+
return false; // The input didn't match.
1107+
}
1108+
}
1109+
1110+
// The input matched.
1111+
base.runtextpos = pos;
1112+
base.Capture(0, matchStart, pos);
1113+
return true;
1114+
}
1115+
}
1116+
}
1117+
1118+
}
1119+
1120+
/// <summary>Helper methods used by generated <see cref="Regex"/>-derived implementations.</summary>
1121+
[GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "%VERSION%")]
1122+
file static class Utilities
1123+
{
1124+
/// <summary>Default timeout value set in <see cref="AppContext"/>, or <see cref="Regex.InfiniteMatchTimeout"/> if none was set.</summary>
1125+
internal static readonly TimeSpan s_defaultTimeout = AppContext.GetData("REGEX_DEFAULT_MATCH_TIMEOUT") is TimeSpan timeout ? timeout : Regex.InfiniteMatchTimeout;
1126+
1127+
/// <summary>Whether <see cref="s_defaultTimeout"/> is non-infinite.</summary>
1128+
internal static readonly bool s_hasTimeout = s_defaultTimeout != Regex.InfiniteMatchTimeout;
1129+
}
1130+
}
1131+
"""
1132+
};
9331133
}
9341134
}
9351135
}

0 commit comments

Comments
 (0)