Skip to content

Commit aa124b1

Browse files
committed
Merge pull request #158 from defuz/improve-name-mapping
Storing mapping from names to group indices into Regex
2 parents f34213c + 0144613 commit aa124b1

File tree

4 files changed

+149
-67
lines changed

4 files changed

+149
-67
lines changed

regex_macros/src/lib.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,19 @@ impl<'a> NfaGen<'a> {
109109
None => cx.expr_none(self.sp),
110110
}
111111
);
112+
let named_groups = {
113+
let mut named_groups = ::std::collections::BTreeMap::new();
114+
for (i, name) in self.names.iter().enumerate() {
115+
if let Some(ref name) = *name {
116+
named_groups.insert(name.to_owned(), i);
117+
}
118+
}
119+
self.vec_expr(named_groups.iter(),
120+
&mut |cx, (name, group_idx)|
121+
quote_expr!(cx, ($name, $group_idx))
122+
)
123+
};
124+
112125
let prefix_anchor = self.prog.anchored_begin;
113126

114127
let step_insts = self.step_insts();
@@ -123,6 +136,8 @@ impl<'a> NfaGen<'a> {
123136
// unused code generated by regex!. See #14185 for an example.
124137
#[allow(dead_code)]
125138
static CAP_NAMES: &'static [Option<&'static str>] = &$cap_names;
139+
#[allow(dead_code)]
140+
static NAMED_GROUPS: &'static [(&'static str, usize)] = &$named_groups;
126141

127142
#[allow(dead_code)]
128143
fn exec<'t>(
@@ -308,6 +323,7 @@ fn exec<'t>(
308323
::regex::Regex::Native(::regex::internal::ExNative {
309324
original: $regex,
310325
names: &CAP_NAMES,
326+
groups: &NAMED_GROUPS,
311327
prog: exec,
312328
})
313329
})

src/exec.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
1010

11+
use std::collections::HashMap;
12+
use std::sync::Arc;
13+
1114
use backtrack::{self, Backtrack};
1215
use dfa::{self, Dfa, DfaResult};
1316
use input::{ByteInput, CharInput};
@@ -375,6 +378,12 @@ impl Exec {
375378
&self.prog.cap_names
376379
}
377380

381+
/// Returns a reference to the named-group mapping (from group name to
382+
/// group position).
///
/// The `Arc` wrapper itself is returned by reference, so a caller that
/// needs its own handle can clone the `Arc` rather than the map.
383+
pub fn named_groups(&self) -> &Arc<HashMap<String, usize>> {
384+
&self.prog.named_groups
385+
}
386+
378387
/// Return a fresh allocation for storing all possible captures in the
379388
/// underlying regular expression.
380389
pub fn alloc_captures(&self) -> Vec<Option<usize>> {

src/program.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
1010

11+
use std::collections::HashMap;
12+
use std::sync::Arc;
13+
1114
use syntax;
1215

1316
use backtrack::BacktrackCache;
@@ -39,6 +42,9 @@ pub struct Program {
3942
/// The sequence of capture group names. There is an entry for each capture
4043
/// group index and a name exists only if the capture group is named.
4144
pub cap_names: Vec<Option<String>>,
45+
/// The map of named capture groups. The keys are group names and
46+
/// the values are group indices.
47+
pub named_groups: Arc<HashMap<String, usize>>,
4248
/// If the regular expression requires a literal prefix in order to have a
4349
/// match, that prefix is stored here as a DFA.
4450
pub prefixes: Literals,
@@ -115,10 +121,17 @@ impl ProgramBuilder {
115121
insts.anchored_begin(),
116122
insts.anchored_end(),
117123
);
124+
let mut named_groups = HashMap::new();
125+
for (i, name) in cap_names.iter().enumerate() {
126+
if let Some(ref name) = *name {
127+
named_groups.insert(name.to_owned(), i);
128+
}
129+
}
118130
Ok(Program {
119131
original: self.re,
120132
insts: insts,
121133
cap_names: cap_names,
134+
named_groups: Arc::new(named_groups),
122135
prefixes: prefixes,
123136
anchored_begin: anchored_begin,
124137
anchored_end: anchored_end,

src/re.rs

Lines changed: 111 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@
1010

1111
use std::borrow::Cow;
1212
use std::collections::HashMap;
13-
use std::collections::hash_map::Iter;
1413
use std::fmt;
1514
use std::ops::Index;
1615
#[cfg(feature = "pattern")]
1716
use std::str::pattern::{Pattern, Searcher, SearchStep};
1817
use std::str::FromStr;
18+
use std::sync::Arc;
1919

2020
use exec::{Exec, ExecBuilder};
2121
use syntax;
@@ -186,6 +186,8 @@ pub struct ExNative {
186186
#[doc(hidden)]
187187
pub names: &'static &'static [Option<&'static str>],
188188
#[doc(hidden)]
189+
pub groups: &'static &'static [(&'static str, usize)],
190+
#[doc(hidden)]
189191
pub prog: fn(&mut CaptureIdxs, &str, usize) -> bool,
190192
}
191193

@@ -395,9 +397,13 @@ impl Regex {
395397
/// The `0`th capture group is always unnamed, so it must always be
396398
/// accessed with `at(0)` or `[0]`.
397399
pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
398-
let mut caps = self.alloc_captures();
399-
if exec(self, &mut caps, text, 0) {
400-
Some(Captures::new(self, text, caps))
400+
let mut locs = self.alloc_captures();
401+
if exec(self, &mut locs, text, 0) {
402+
Some(Captures {
403+
text: text,
404+
locs: locs,
405+
named_groups: NamedGroups::from_regex(self)
406+
})
401407
} else {
402408
None
403409
}
@@ -804,6 +810,71 @@ impl<'r, 't> Iterator for RegexSplitsN<'r, 't> {
804810
}
805811
}
806812

813+
enum NamedGroups {
814+
Empty,
815+
Native(&'static [(&'static str, usize)]),
816+
Dynamic(Arc<HashMap<String, usize>>),
817+
}
818+
819+
impl NamedGroups {
820+
fn from_regex(regex: &Regex) -> NamedGroups {
821+
match *regex {
822+
Regex::Native(ExNative { ref groups, .. }) =>
823+
NamedGroups::Native(groups),
824+
Regex::Dynamic(ref exec) => {
825+
let groups = exec.named_groups();
826+
if groups.is_empty() {
827+
NamedGroups::Empty
828+
} else {
829+
NamedGroups::Dynamic(groups.clone())
830+
}
831+
}
832+
}
833+
}
834+
835+
fn pos(&self, name: &str) -> Option<usize> {
836+
match *self {
837+
NamedGroups::Empty => None,
838+
NamedGroups::Native(groups) => {
839+
groups.binary_search_by(|&(n, _)| n.cmp(name))
840+
.ok().map(|i| groups[i].1)
841+
},
842+
NamedGroups::Dynamic(ref groups) => {
843+
groups.get(name).map(|i| *i)
844+
},
845+
}
846+
}
847+
848+
fn iter<'n>(&'n self) -> NamedGroupsIter<'n> {
849+
match *self {
850+
NamedGroups::Empty => NamedGroupsIter::Empty,
851+
NamedGroups::Native(g) => NamedGroupsIter::Native(g.iter()),
852+
NamedGroups::Dynamic(ref g) => NamedGroupsIter::Dynamic(g.iter()),
853+
}
854+
}
855+
}
856+
/// Iterator over `(name, index)` pairs of named capture groups.
///
/// `'n` is the lifetime of the borrow the pairs are read from.
enum NamedGroupsIter<'n> {
    /// There are no pairs to yield.
    Empty,
    /// Walks a static slice of `(name, index)` pairs in slice order.
    Native(::std::slice::Iter<'static, (&'static str, usize)>),
    /// Walks a name-to-index hash map in the map's own iteration order.
    Dynamic(::std::collections::hash_map::Iter<'n, String, usize>),
}

impl<'n> Iterator for NamedGroupsIter<'n> {
    type Item = (&'n str, usize);

    /// Yields the next `(name, index)` pair, or `None` once exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        match *self {
            NamedGroupsIter::Empty => None,
            NamedGroupsIter::Native(ref mut it) => it.next().cloned(),
            NamedGroupsIter::Dynamic(ref mut it) => {
                it.next().map(|(name, &idx)| (&name[..], idx))
            }
        }
    }

    /// Forwards the exact bounds of the wrapped iterator so that consumers
    /// such as `collect` can size their buffers up front; the default
    /// implementation would only report `(0, None)`.
    fn size_hint(&self) -> (usize, Option<usize>) {
        match *self {
            NamedGroupsIter::Empty => (0, Some(0)),
            NamedGroupsIter::Native(ref it) => it.size_hint(),
            NamedGroupsIter::Dynamic(ref it) => it.size_hint(),
        }
    }
}
877+
807878
/// Captures represents a group of captured strings for a single match.
808879
///
809880
/// The 0th capture always corresponds to the entire match. Each subsequent
@@ -818,34 +889,10 @@ impl<'r, 't> Iterator for RegexSplitsN<'r, 't> {
818889
pub struct Captures<'t> {
819890
text: &'t str,
820891
locs: Vec<Option<usize>>,
821-
named: Option<HashMap<String, usize>>,
892+
named_groups: NamedGroups,
822893
}
823894

824895
impl<'t> Captures<'t> {
825-
fn new(
826-
re: &Regex,
827-
search: &'t str,
828-
locs: Vec<Option<usize>>,
829-
) -> Captures<'t> {
830-
let named =
831-
if re.captures_len() == 0 {
832-
None
833-
} else {
834-
let mut named = HashMap::new();
835-
for (i, name) in re.capture_names().enumerate() {
836-
if let Some(name) = name {
837-
named.insert(name.to_owned(), i);
838-
}
839-
}
840-
Some(named)
841-
};
842-
Captures {
843-
text: search,
844-
locs: locs,
845-
named: named,
846-
}
847-
}
848-
849896
/// Returns the start and end positions of the Nth capture group.
850897
/// Returns `None` if `i` is not a valid capture group or if the capture
851898
/// group did not match anything.
@@ -874,15 +921,7 @@ impl<'t> Captures<'t> {
874921
/// `name` isn't a valid capture group or didn't match anything, then
875922
/// `None` is returned.
876923
pub fn name(&self, name: &str) -> Option<&'t str> {
877-
match self.named {
878-
None => None,
879-
Some(ref h) => {
880-
match h.get(name) {
881-
None => None,
882-
Some(i) => self.at(*i),
883-
}
884-
}
885-
}
924+
// Resolve the name to a group index first, then hand the index to `at`;
// a name with no index short-circuits to `None`.
self.named_groups.pos(name).and_then(|i| self.at(i))
886925
}
887926

888927
/// Creates an iterator of all the capture groups in order of appearance
@@ -895,7 +934,7 @@ impl<'t> Captures<'t> {
895934
/// appearance in the regular expression. Positions are byte indices
896935
/// in terms of the original string matched.
897936
pub fn iter_pos(&'t self) -> SubCapturesPos<'t> {
898-
SubCapturesPos { idx: 0, caps: self, }
937+
// The iterator borrows only the location slots, not the whole `Captures`.
SubCapturesPos { idx: 0, locs: &self.locs }
899938
}
900939

901940
/// Creates an iterator of all named groups as a tuple with the group
@@ -904,7 +943,7 @@ impl<'t> Captures<'t> {
904943
pub fn iter_named(&'t self) -> SubCapturesNamed<'t> {
905944
SubCapturesNamed {
906945
// Kept so the iterator can turn each group index into matched text.
caps: self,
907-
inner: self.named.as_ref().map(|n| n.iter()),
946+
// Source of the (name, index) pairs the iterator walks.
names: self.named_groups.iter()
908947
}
909948
}
910949

@@ -978,16 +1017,16 @@ impl<'t> Index<&'t str> for Captures<'t> {
9781017
/// An iterator over capture groups for a particular match of a regular
9791018
/// expression.
9801019
///
981-
/// `'t` is the lifetime of the matched text.
982-
pub struct SubCaptures<'t> {
1020+
/// `'c` is the lifetime of the captures.
1021+
pub struct SubCaptures<'c> {
9831022
idx: usize,
984-
caps: &'t Captures<'t>,
1023+
caps: &'c Captures<'c>,
9851024
}
9861025

987-
impl<'t> Iterator for SubCaptures<'t> {
988-
type Item = Option<&'t str>;
1026+
impl<'c> Iterator for SubCaptures<'c> {
1027+
type Item = Option<&'c str>;
9891028

990-
fn next(&mut self) -> Option<Option<&'t str>> {
1029+
fn next(&mut self) -> Option<Option<&'c str>> {
9911030
if self.idx < self.caps.len() {
9921031
self.idx += 1;
9931032
Some(self.caps.at(self.idx - 1))
@@ -1002,42 +1041,43 @@ impl<'t> Iterator for SubCaptures<'t> {
10021041
///
10031042
/// Positions are byte indices in terms of the original string matched.
10041043
///
1005-
/// `'t` is the lifetime of the matched text.
1006-
pub struct SubCapturesPos<'t> {
1044+
/// `'c` is the lifetime of the captures.
1045+
pub struct SubCapturesPos<'c> {
10071046
idx: usize,
1008-
caps: &'t Captures<'t>,
1047+
// Capture location slots; `next` reads them two at a time as
// (start, end) pairs.
locs: &'c [Option<usize>]
10091048
}
10101049

1011-
impl<'t> Iterator for SubCapturesPos<'t> {
1050+
impl<'c> Iterator for SubCapturesPos<'c> {
10121051
type Item = Option<(usize, usize)>;
10131052

10141053
fn next(&mut self) -> Option<Option<(usize, usize)>> {
1015-
if self.idx < self.caps.len() {
1016-
self.idx += 1;
1017-
Some(self.caps.pos(self.idx - 1))
1018-
} else {
1019-
None
1054+
// Every slot has been consumed.
if self.idx >= self.locs.len() {
1055+
return None
10201056
}
1057+
// Slots `idx` and `idx + 1` are read together as one (start, end) pair;
// a pair of `None`s becomes a `None` item.
let r = match (self.locs[self.idx], self.locs[self.idx + 1]) {
1058+
(Some(s), Some(e)) => Some((s, e)),
1059+
(None, None) => None,
1060+
// A half-filled pair is treated as impossible and panics.
_ => unreachable!()
1061+
};
1062+
// Step over both slots of the pair just read.
self.idx += 2;
1063+
Some(r)
10211064
}
10221065
}
10231066

10241067
/// An Iterator over named capture groups as a tuple with the group
10251068
/// name and the value.
10261069
///
1027-
/// `'t` is the lifetime of the matched text.
1028-
pub struct SubCapturesNamed<'t>{
1029-
caps: &'t Captures<'t>,
1030-
inner: Option<Iter<'t, String, usize>>,
1070+
/// `'c` is the lifetime of the captures.
1071+
pub struct SubCapturesNamed<'c> {
1072+
// Used by `next` to look up the matched text for a group index.
caps: &'c Captures<'c>,
1073+
// Yields the (name, index) pairs that `next` walks.
names: NamedGroupsIter<'c>,
10311074
}
10321075

1033-
impl<'t> Iterator for SubCapturesNamed<'t> {
1034-
type Item = (&'t str, Option<&'t str>);
1076+
impl<'c> Iterator for SubCapturesNamed<'c> {
1077+
type Item = (&'c str, Option<&'c str>);
10351078

1036-
fn next(&mut self) -> Option<(&'t str, Option<&'t str>)> {
1037-
match self.inner.as_mut().map_or(None, |it| it.next()) {
1038-
Some((name, pos)) => Some((name, self.caps.at(*pos))),
1039-
None => None
1040-
}
1079+
fn next(&mut self) -> Option<(&'c str, Option<&'c str>)> {
1080+
// Pair each group name with whatever `at` returns for its index.
self.names.next().map(|(name, pos)| (name, self.caps.at(pos)))
10411081
}
10421082
}
10431083

@@ -1081,7 +1121,11 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> {
10811121
}
10821122
self.last_end = e;
10831123
self.last_match = Some(self.last_end);
1084-
Some(Captures::new(self.re, self.search, caps))
1124+
Some(Captures {
1125+
text: self.search,
1126+
locs: caps,
1127+
named_groups: NamedGroups::from_regex(self.re),
1128+
})
10851129
}
10861130
}
10871131

0 commit comments

Comments
 (0)