Squash *+, *?, +*, +?, ?* and ?+.

Change-Id: Ied7dd7fb2e718cdf397b4dc4167ab622a9dbdc01
Reviewed-on: https://code-review.googlesource.com/11132
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/parse.cc b/re2/parse.cc
index 6d24016..0729b75 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -486,6 +486,17 @@
   if (op == stacktop_->op() && fl == stacktop_->parse_flags())
     return true;
 
+  // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
+  // op is a repeat, we just have to check that stacktop_->op() is too,
+  // then adjust stacktop_.
+  if ((stacktop_->op() == kRegexpStar ||
+       stacktop_->op() == kRegexpPlus ||
+       stacktop_->op() == kRegexpQuest) &&
+      fl == stacktop_->parse_flags()) {
+    stacktop_->op_ = kRegexpStar;
+    return true;
+  }
+
   Regexp* re = new Regexp(op, fl);
   re->AllocSub(1);
   re->down_ = stacktop_->down_;
diff --git a/re2/regexp.cc b/re2/regexp.cc
index 950c985..8c5fe59 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -190,31 +190,41 @@
   return re;
 }
 
-Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
-  if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
+Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
+  // Squash **, ++ and ??.
+  if (op == sub->op() && flags == sub->parse_flags())
     return sub;
-  Regexp* re = new Regexp(kRegexpPlus, flags);
+
+  // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
+  // op is Star/Plus/Quest, we just have to check that sub->op() is too,
+  // then rewrite sub.
+  if ((sub->op() == kRegexpStar ||
+       sub->op() == kRegexpPlus ||
+       sub->op() == kRegexpQuest) &&
+      flags == sub->parse_flags()) {
+    Regexp* re = new Regexp(kRegexpStar, flags);
+    re->AllocSub(1);
+    re->sub()[0] = sub->sub()[0]->Incref();
+    sub->Decref();  // We didn't consume the reference after all.
+    return re;
+  }
+
+  Regexp* re = new Regexp(op, flags);
   re->AllocSub(1);
   re->sub()[0] = sub;
   return re;
 }
 
+Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
+  return StarPlusOrQuest(kRegexpPlus, sub, flags);
+}
+
 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
-  if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
-    return sub;
-  Regexp* re = new Regexp(kRegexpStar, flags);
-  re->AllocSub(1);
-  re->sub()[0] = sub;
-  return re;
+  return StarPlusOrQuest(kRegexpStar, sub, flags);
 }
 
 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
-  if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
-    return sub;
-  Regexp* re = new Regexp(kRegexpQuest, flags);
-  re->AllocSub(1);
-  re->sub()[0] = sub;
-  return re;
+  return StarPlusOrQuest(kRegexpQuest, sub, flags);
 }
 
 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
diff --git a/re2/regexp.h b/re2/regexp.h
index a11fb3a..9b5d98d 100644
--- a/re2/regexp.h
+++ b/re2/regexp.h
@@ -460,6 +460,10 @@
   // Computes whether Regexp is already simple.
   bool ComputeSimple();
 
+  // Constructor that generates a Star, Plus or Quest,
+  // squashing the pair if sub is also a Star, Plus or Quest.
+  static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);
+
   // Constructor that generates a concatenation or alternation,
   // enforcing the limit on the number of subexpressions for
   // a particular Regexp.
diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc
index 9a13e18..ec80fc4 100644
--- a/re2/testing/parse_test.cc
+++ b/re2/testing/parse_test.cc
@@ -115,10 +115,16 @@
   { "ab|cd", "alt{str{ab}str{cd}}" },
   { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
 
-  // Test squashing of **, ++ and ??.
+  // Test squashing of **, ++, ?? et cetera.
   { "(?:(?:a)*)*", "star{lit{a}}" },
   { "(?:(?:a)+)+", "plus{lit{a}}" },
   { "(?:(?:a)?)?", "que{lit{a}}" },
+  { "(?:(?:a)*)+", "star{lit{a}}" },
+  { "(?:(?:a)*)?", "star{lit{a}}" },
+  { "(?:(?:a)+)*", "star{lit{a}}" },
+  { "(?:(?:a)+)?", "star{lit{a}}" },
+  { "(?:(?:a)?)*", "star{lit{a}}" },
+  { "(?:(?:a)?)+", "star{lit{a}}" },
 
   // Test flattening.
   { "(?:a)", "lit{a}" },
diff --git a/re2/testing/simplify_test.cc b/re2/testing/simplify_test.cc
index 33f8a8c..6bdb2af 100644
--- a/re2/testing/simplify_test.cc
+++ b/re2/testing/simplify_test.cc
@@ -231,6 +231,17 @@
   // capture. The new capture must have the cap of the old capture.
   // Failure to copy it results in cap=0 -> ToString() logs a fatal error.
   { "(a*aab)", "(aa+b)" },
+
+  // Test squashing of **, ++, ?? et cetera.
+  { "(?:(?:a){0,}){0,}", "a*" },
+  { "(?:(?:a){1,}){1,}", "a+" },
+  { "(?:(?:a){0,1}){0,1}", "a?" },
+  { "(?:(?:a){0,}){1,}", "a*" },
+  { "(?:(?:a){0,}){0,1}", "a*" },
+  { "(?:(?:a){1,}){0,}", "a*" },
+  { "(?:(?:a){1,}){0,1}", "a*" },
+  { "(?:(?:a){0,1}){0,}", "a*" },
+  { "(?:(?:a){0,1}){1,}", "a*" },
 };
 
 TEST(TestSimplify, SimpleRegexps) {