Handle foldcase when computing hints. Mea culpa.
Change-Id: Ie4756d468db022fcce84f5389e446bd4df910922
Reviewed-on: https://code-review.googlesource.com/c/re2/+/38810
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/prog.cc b/re2/prog.cc
index 5c3c1a7..98bbc4a 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -468,8 +468,11 @@
foldlo = 'a';
if (foldhi > 'z')
foldhi = 'z';
- if (foldlo <= foldhi)
- builder.Mark(foldlo + 'A' - 'a', foldhi + 'A' - 'a');
+ if (foldlo <= foldhi) {
+ foldlo += 'A' - 'a';
+ foldhi += 'A' - 'a';
+ builder.Mark(foldlo, foldhi);
+ }
}
// If this Inst is not the last Inst in its list AND the next Inst is
// also a ByteRange AND the Insts have the same out, defer the merge.
@@ -821,11 +824,15 @@
}
}
+// For each ByteRange instruction in [begin, end), computes a hint to execution
+// engines: the delta to the next instruction (in flat) worth exploring iff the
+// current instruction matched.
+//
+// Implements a coloring algorithm related to ByteMapBuilder, but in this case,
+// colors are instructions and recoloring ranges precisely identifies conflicts
+// between instructions. Iterating backwards over [begin, end) is guaranteed to
+// identify the nearest conflict (if any) with only linear complexity.
void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) {
- // Hello, Bitmap256, my old friend...
- // This time around, "colors" are instructions. By iterating backwards over
- // [begin, end) and recoloring ranges, the nearest conflict (if any) can be
- // identified precisely - and this is achieved with only linear complexity.
Bitmap256 splits;
int colors[256];
@@ -846,34 +853,53 @@
}
dirty = true;
- // first ratchets backwards from end to the nearest conflict (if any).
+ // We recolor the [lo-hi] range with id. Note that first ratchets backwards
+ // from end to the nearest conflict (if any) during recoloring.
int first = end;
+ auto Recolor = [&](int lo, int hi) {
+ // Like ByteMapBuilder, we split at lo-1 and at hi.
+ --lo;
+
+ if (0 <= lo && !splits.Test(lo)) {
+ splits.Set(lo);
+ int next = splits.FindNextSetBit(lo+1);
+ colors[lo] = colors[next];
+ }
+ if (!splits.Test(hi)) {
+ splits.Set(hi);
+ int next = splits.FindNextSetBit(hi+1);
+ colors[hi] = colors[next];
+ }
+
+ int c = lo+1;
+ while (c < 256) {
+ int next = splits.FindNextSetBit(c);
+ // Ratchet backwards...
+ first = std::min(first, colors[next]);
+ // Recolor with id - because it's the new nearest conflict!
+ colors[next] = id;
+ if (next == hi)
+ break;
+ c = next+1;
+ }
+ };
Inst* ip = &(*flat)[id];
- int lo = ip->lo()-1;
+ int lo = ip->lo();
int hi = ip->hi();
-
- if (0 <= lo && !splits.Test(lo)) {
- splits.Set(lo);
- int next = splits.FindNextSetBit(lo+1);
- colors[lo] = colors[next];
- }
- if (!splits.Test(hi)) {
- splits.Set(hi);
- int next = splits.FindNextSetBit(hi+1);
- colors[hi] = colors[next];
- }
-
- int c = lo+1;
- while (c < 256) {
- int next = splits.FindNextSetBit(c);
- // Ratchet backwards...
- first = std::min(first, colors[next]);
- // Recolor with id - because it's the new nearest conflict!
- colors[next] = id;
- if (next == hi)
- break;
- c = next+1;
+ Recolor(lo, hi);
+ if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
+ int foldlo = lo;
+ int foldhi = hi;
+ if (foldlo < 'a')
+ foldlo = 'a';
+ if (foldhi > 'z')
+ foldhi = 'z';
+ if (foldlo <= foldhi) {
+ foldlo += 'A' - 'a';
+ foldhi += 'A' - 'a';
+ Recolor(foldlo, foldhi);
+ }
}
if (first == end) {
diff --git a/re2/testing/search_test.cc b/re2/testing/search_test.cc
index 8adef6c..16d8e50 100644
--- a/re2/testing/search_test.cc
+++ b/re2/testing/search_test.cc
@@ -307,6 +307,7 @@
// Former bugs.
{ "a\\C*|ba\\C", "baba" },
+ { "\\w*I\\w*", "Inc." },
};
TEST(Regexp, SearchTests) {