Skip to content

Commit f171713

Browse files
WGH-zimmerle
authored and committed
Refactor Regex classes further
This commit changes the Regex interface rather dramatically. Most importantly, the RegexMatch class now contains a list of matched groups, with group(0) being the entire match, group(1) the first capturing group, and so on. Secondly, searchAll now returns a list of RegexMatch objects instead of a reversed, flattened list of groups from all matches.
1 parent c314954 commit f171713

File tree

12 files changed

+158
-150
lines changed

12 files changed

+158
-150
lines changed

src/modsecurity.cc

Lines changed: 21 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -228,9 +228,9 @@ int ModSecurity::processContentOffset(const char *content, size_t len,
228228
const unsigned char *buf;
229229
size_t jsonSize;
230230

231-
std::list<regex::RegexMatch> vars = variables.searchAll(matchString);
232-
std::list<regex::RegexMatch> ops = operators.searchAll(matchString);
233-
std::list<regex::RegexMatch> trans = transformations.searchAll(matchString);
231+
auto vars = variables.searchAll(matchString);
232+
auto ops = operators.searchAll(matchString);
233+
auto trans = transformations.searchAll(matchString);
234234

235235
g = yajl_gen_alloc(NULL);
236236
if (g == NULL) {
@@ -255,14 +255,12 @@ int ModSecurity::processContentOffset(const char *content, size_t len,
255255
strlen("highlight"));
256256

257257
yajl_gen_array_open(g);
258-
while (vars.size() > 0) {
258+
259+
for (const auto &m : vars) {
259260
std::string value;
260261
yajl_gen_map_open(g);
261-
vars.pop_back();
262-
const std::string &startingAt = vars.back().str();
263-
vars.pop_back();
264-
const std::string &size = vars.back().str();
265-
vars.pop_back();
262+
const std::string &startingAt = m.group(1).string;
263+
const std::string &size = m.group(2).string;
266264
yajl_gen_string(g,
267265
reinterpret_cast<const unsigned char*>("startingAt"),
268266
strlen("startingAt"));
@@ -298,32 +296,34 @@ int ModSecurity::processContentOffset(const char *content, size_t len,
298296
yajl_gen_map_open(g);
299297
yajl_gen_string(g, reinterpret_cast<const unsigned char*>("value"),
300298
strlen("value"));
301-
yajl_gen_string(g, reinterpret_cast<const unsigned char*>(varValue.c_str()),
299+
yajl_gen_string(g, reinterpret_cast<const unsigned char*>(varValue.data()),
302300
varValue.size());
303301
yajl_gen_map_close(g);
304302

305-
while (trans.size() > 0) {
303+
for (const auto &m : trans) {
306304
modsecurity::actions::transformations::Transformation *t;
307305
std::string varValueRes;
308306
yajl_gen_map_open(g);
309307
yajl_gen_string(g,
310308
reinterpret_cast<const unsigned char*>("transformation"),
311309
strlen("transformation"));
312310

311+
const std::string &transformation = m.group(0).string;
312+
313313
yajl_gen_string(g,
314-
reinterpret_cast<const unsigned char*>(trans.back().str().c_str()),
315-
trans.back().str().size());
314+
reinterpret_cast<const unsigned char*>(transformation.data()),
315+
transformation.size());
316316

317317
t = modsecurity::actions::transformations::Transformation::instantiate(
318-
trans.back().str().c_str());
318+
transformation.c_str());
319319
varValueRes = t->evaluate(varValue, NULL);
320320
varValue.assign(varValueRes);
321-
trans.pop_back();
321+
322322

323323
yajl_gen_string(g, reinterpret_cast<const unsigned char*>("value"),
324324
strlen("value"));
325325
yajl_gen_string(g, reinterpret_cast<const unsigned char*>(
326-
varValue.c_str()),
326+
varValue.data()),
327327
varValue.size());
328328
yajl_gen_map_close(g);
329329

@@ -337,26 +337,23 @@ int ModSecurity::processContentOffset(const char *content, size_t len,
337337

338338
yajl_gen_map_open(g);
339339

340-
while (ops.size() > 0) {
340+
for (const auto &m : ops) {
341341
std::string value;
342342
yajl_gen_string(g, reinterpret_cast<const unsigned char*>("highlight"),
343343
strlen("highlight"));
344344
yajl_gen_map_open(g);
345-
ops.pop_back();
346-
std::string startingAt = ops.back().str();
347-
ops.pop_back();
348-
std::string size = ops.back().str();
349-
ops.pop_back();
345+
const std::string &startingAt = m.group(1).string;
346+
const std::string &size = m.group(2).string;
350347
yajl_gen_string(g,
351348
reinterpret_cast<const unsigned char*>("startingAt"),
352349
strlen("startingAt"));
353350
yajl_gen_string(g,
354-
reinterpret_cast<const unsigned char*>(startingAt.c_str()),
351+
reinterpret_cast<const unsigned char*>(startingAt.data()),
355352
startingAt.size());
356353
yajl_gen_string(g, reinterpret_cast<const unsigned char*>("size"),
357354
strlen("size"));
358355
yajl_gen_string(g,
359-
reinterpret_cast<const unsigned char*>(size.c_str()),
356+
reinterpret_cast<const unsigned char*>(size.data()),
360357
size.size());
361358
yajl_gen_map_close(g);
362359

src/operators/rx.cc

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ bool Rx::init(const std::string &arg, std::string *error) {
3838

3939
bool Rx::evaluate(Transaction *transaction, Rule *rule,
4040
const std::string& input, std::shared_ptr<RuleMessage> ruleMessage) {
41-
std::list<RegexMatch> matches;
4241
Regex *re;
4342

4443
if (m_param.empty() && !m_string->m_containsMacro) {
@@ -52,33 +51,30 @@ bool Rx::evaluate(Transaction *transaction, Rule *rule,
5251
re = m_re;
5352
}
5453

55-
matches = re->searchAll(input);
56-
if (rule && rule->m_containsCaptureAction && transaction) {
57-
int i = 0;
58-
matches.reverse();
59-
for (const RegexMatch& a : matches) {
54+
regex::RegexMatch m;
55+
bool matched = re->search(input, &m, 9);
56+
57+
if (matched && rule && rule->m_containsCaptureAction && transaction) {
58+
for (int i = 0; i < m.num_groups(); i++) {
59+
auto key = std::to_string(i);
60+
const std::string &value = m.group(i).string;
6061
transaction->m_collections.m_tx_collection->storeOrUpdateFirst(
61-
std::to_string(i), a.str());
62+
key, value);
6263
ms_dbg_a(transaction, 7, "Added regex subexpression TX." +
63-
std::to_string(i) + ": " + a.str());
64-
transaction->m_matched.push_back(a.str());
65-
i++;
64+
key + ": " + value);
65+
transaction->m_matched.push_back(value);
6666
}
6767
}
68-
69-
for (const auto & i : matches) {
70-
logOffset(ruleMessage, i.offset(), i.str().size());
68+
for (int i = 0; i < m.num_groups(); i++) {
69+
const regex::MatchGroup &g = m.group(i);
70+
logOffset(ruleMessage, g.offset, g.string.size());
7171
}
7272

7373
if (m_string->m_containsMacro) {
7474
delete re;
7575
}
7676

77-
if (matches.size() > 0) {
78-
return true;
79-
}
80-
81-
return false;
77+
return matched;
8278
}
8379

8480

src/operators/verify_cpf.cc

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,6 @@ bool VerifyCPF::verify(const char *cpfnumber, int len) {
119119

120120
bool VerifyCPF::evaluate(Transaction *t, Rule *rule,
121121
const std::string& input, std::shared_ptr<RuleMessage> ruleMessage) {
122-
std::list<RegexMatch> matches;
123122
bool is_cpf = false;
124123
int i;
125124

@@ -128,18 +127,17 @@ bool VerifyCPF::evaluate(Transaction *t, Rule *rule,
128127
}
129128

130129
for (i = 0; i < input.size() - 1 && is_cpf == false; i++) {
131-
matches = m_re->searchAll(input.substr(i, input.size()));
132-
for (const auto & i : matches) {
133-
is_cpf = verify(i.str().c_str(), i.str().size());
130+
auto matches = m_re->searchAll(input.substr(i, input.size()));
131+
for (const auto &m : matches) {
132+
const regex::MatchGroup &g = m.group(0);
133+
is_cpf = verify(g.string.data(), g.string.size());
134134
if (is_cpf) {
135-
logOffset(ruleMessage, i.offset(), i.str().size());
135+
logOffset(ruleMessage, g.offset, g.string.size());
136136
if (rule && t && rule->m_containsCaptureAction) {
137137
t->m_collections.m_tx_collection->storeOrUpdateFirst(
138-
"0", i.str());
139-
ms_dbg_a(t, 7, "Added VerifyCPF match TX.0: " + \
140-
i.str());
138+
"0", g.string);
139+
ms_dbg_a(t, 7, "Added VerifyCPF match TX.0: " + g.string);
141140
}
142-
143141
goto out;
144142
}
145143
}

src/operators/verify_ssn.cc

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,6 @@ bool VerifySSN::verify(const char *ssnumber, int len) {
110110

111111
bool VerifySSN::evaluate(Transaction *t, Rule *rule,
112112
const std::string& input, std::shared_ptr<RuleMessage> ruleMessage) {
113-
std::list<RegexMatch> matches;
114113
bool is_ssn = false;
115114
int i;
116115

@@ -119,18 +118,17 @@ bool VerifySSN::evaluate(Transaction *t, Rule *rule,
119118
}
120119

121120
for (i = 0; i < input.size() - 1 && is_ssn == false; i++) {
122-
matches = m_re->searchAll(input.substr(i, input.size()));
123-
for (const auto & i : matches) {
124-
is_ssn = verify(i.str().c_str(), i.str().size());
121+
auto matches = m_re->searchAll(input.substr(i, input.size()));
122+
for (const auto &m : matches) {
123+
const regex::MatchGroup &g = m.group(0);
124+
is_ssn = verify(g.string.data(), g.string.size());
125125
if (is_ssn) {
126-
logOffset(ruleMessage, i.offset(), i.str().size());
126+
logOffset(ruleMessage, g.offset, g.string.size());
127127
if (rule && t && rule->m_containsCaptureAction) {
128128
t->m_collections.m_tx_collection->storeOrUpdateFirst(
129-
"0", i.str());
130-
ms_dbg_a(t, 7, "Added VerifySSN match TX.0: " + \
131-
i.str());
129+
"0", g.string);
130+
ms_dbg_a(t, 7, "Added VerifySSN match TX.0: " + g.string);
132131
}
133-
134132
goto out;
135133
}
136134
}

src/regex/backend/backend.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,8 @@ class Backend {
3030

3131
virtual bool ok() const = 0;
3232

33-
virtual std::list<RegexMatch> searchAll(const std::string& s) const = 0;
34-
virtual int search(const std::string &s, RegexMatch *m) const = 0;
35-
virtual int search(const std::string &s) const = 0;
33+
virtual std::vector<RegexMatch> searchAll(const std::string& s, bool overlapping = false) const = 0;
34+
virtual bool search(const std::string &s, RegexMatch *m = nullptr, ssize_t max_groups = -1) const = 0;
3635

3736
virtual const std::string& getPattern() const = 0;
3837
};

src/regex/backend/pcre.cc

Lines changed: 62 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
#include <iostream>
1818
#include <fstream>
1919
#include <string>
20-
#include <list>
2120

2221

2322
#include "src/regex/backend/pcre.h"
@@ -46,6 +45,8 @@ Pcre::Pcre(const std::string& pattern_)
4645
&errptr, &erroffset, NULL);
4746

4847
m_pce = pcre_study(m_pc, pcre_study_opt, &errptr);
48+
49+
pcre_fullinfo(m_pc, m_pce, PCRE_INFO_CAPTURECOUNT, &m_capture_count);
4950
}
5051

5152

@@ -64,60 +65,76 @@ Pcre::~Pcre() {
6465
}
6566
}
6667

68+
static bool do_match(
69+
pcre *pc,
70+
pcre_extra *pce,
71+
int pcre_capture_count,
72+
const char *s,
73+
size_t n,
74+
RegexMatch *m,
75+
ssize_t max_groups,
76+
size_t offset)
77+
{
78+
if (m == nullptr) {
79+
max_groups = 0;
80+
}
6781

68-
std::list<RegexMatch> Pcre::searchAll(const std::string& s) const {
69-
const char *subject = s.c_str();
70-
const std::string tmpString = std::string(s.c_str(), s.size());
71-
int ovector[OVECCOUNT];
72-
int rc, i, offset = 0;
73-
std::list<RegexMatch> retList;
74-
75-
do {
76-
rc = pcre_exec(m_pc, m_pce, subject,
77-
s.size(), offset, 0, ovector, OVECCOUNT);
78-
79-
for (i = 0; i < rc; i++) {
80-
size_t start = ovector[2*i];
81-
size_t end = ovector[2*i+1];
82-
size_t len = end - start;
83-
if (end > s.size()) {
84-
rc = 0;
85-
break;
86-
}
87-
std::string match = std::string(tmpString, start, len);
88-
offset = start + len;
89-
retList.push_front(RegexMatch(match, start));
90-
}
82+
// "+1" is required for full match (aka group 0)
83+
int ovecsize = (pcre_capture_count+1) * 3;
84+
int ovector[ovecsize];
85+
int ret = pcre_exec(pc, pce, s, n, offset, 0, ovector, ovecsize);
9186

92-
offset = ovector[1]; // end
93-
if (offset == ovector[0]) { // start == end (size == 0)
94-
offset++;
87+
if (ret > 0) {
88+
if (max_groups < 0) {
89+
max_groups = ret;
9590
}
96-
} while (rc > 0);
97-
98-
return retList;
99-
}
100-
10191

102-
int Pcre::search(const std::string& s, RegexMatch *match) const {
103-
int ovector[OVECCOUNT];
104-
int ret = pcre_exec(m_pc, m_pce, s.c_str(),
105-
s.size(), 0, 0, ovector, OVECCOUNT) > 0;
92+
if (max_groups > 0) {
93+
size_t ngroups = std::min<size_t>(max_groups, ret);
94+
RegexMatch::MatchGroupContainer groups;
95+
groups.reserve(ngroups);
96+
for (size_t i = 0; i < ngroups; i++) {
97+
size_t start = ovector[2*i];
98+
size_t end = ovector[2*i+1];
99+
std::string group(s + start, end - start);
106100

107-
if (ret > 0) {
108-
*match = RegexMatch(
109-
std::string(s, ovector[ret-1], ovector[ret] - ovector[ret-1]),
110-
0);
101+
groups.push_back(MatchGroup{start, std::move(group)});
102+
}
103+
*m = RegexMatch(std::move(groups));
104+
}
105+
return true;
111106
}
107+
return false;
112108

113-
return ret;
114109
}
115110

111+
std::vector<RegexMatch> Pcre::searchAll(const std::string& s, bool overlapping) const {
112+
std::vector<RegexMatch> res;
113+
size_t offset = 0;
114+
115+
while (1) {
116+
RegexMatch m;
117+
bool match = do_match(m_pc, m_pce, m_capture_count, s.data(), s.size(), &m, -1, offset);
118+
if (!match) break;
119+
120+
if (overlapping) {
121+
// start just after the beginning of the last match
122+
offset = m.group(0).offset + 1;
123+
} else {
124+
// start just at the end of the last match
125+
offset = m.group(0).offset + m.group(0).string.size();
126+
if (offset == m.group(0).offset) {
127+
// empty match - advance by one to not match empty string repeatedly
128+
offset++;
129+
}
130+
}
131+
res.push_back(std::move(m));
132+
}
133+
return res;
134+
}
116135

117-
int Pcre::search(const std::string& s) const {
118-
int ovector[OVECCOUNT];
119-
return pcre_exec(m_pc, m_pce, s.c_str(),
120-
s.size(), 0, 0, ovector, OVECCOUNT) > 0;
136+
bool Pcre::search(const std::string &s, RegexMatch *m, ssize_t max_groups) const {
137+
return do_match(m_pc, m_pce, m_capture_count, s.data(), s.size(), m, max_groups, 0);
121138
}
122139

123140
#endif

0 commit comments

Comments
 (0)