From 964b93a09deb7e166041c584ace83b2989f41671 Mon Sep 17 00:00:00 2001
From: Jonathan Marler <johnnymarler@gmail.com>
Date: Sat, 6 Mar 2021 07:19:02 -0700
Subject: [PATCH 01/30] fix mishandling of ^ inside an expression

---
 Makefile      |  1 +
 re.c          |  1 +
 tests/test1.c | 20 ++++++++++++++++++--
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 857d2ee..05fe7ec 100644
--- a/Makefile
+++ b/Makefile
@@ -101,6 +101,7 @@ test: all
 	@$(PYTHON) ./scripts/regex_test_neg.py [012345-9]             $(NRAND_TESTS)
 	@$(PYTHON) ./scripts/regex_test_neg.py [0-56789]              $(NRAND_TESTS)
 	@$(PYTHON) ./scripts/regex_test_neg.py .*123faerdig           $(NRAND_TESTS)
+	@$(PYTHON) ./scripts/regex_test_neg.py a^                     $(NRAND_TESTS)
 	@echo
 	@echo
 	@./tests/test2
diff --git a/re.c b/re.c
index 20d1474..34831af 100644
--- a/re.c
+++ b/re.c
@@ -393,6 +393,7 @@ static int matchone(regex_t p, char c)
     case NOT_ALPHA:      return !matchalphanum(c);
     case WHITESPACE:     return  matchwhitespace(c);
     case NOT_WHITESPACE: return !matchwhitespace(c);
+    case BEGIN:          return 0;
     default:             return  (p.u.ch == c);
   }
 }
diff --git a/tests/test1.c b/tests/test1.c
index 5fdfe74..666d18b 100644
--- a/tests/test1.c
+++ b/tests/test1.c
@@ -101,11 +101,12 @@ int main()
     int should_fail;
     int length;
     int correctlen;
-    size_t ntests = sizeof(test_vector) / sizeof(*test_vector);
+    size_t nvector_tests = sizeof(test_vector) / sizeof(*test_vector);
+    size_t ntests = nvector_tests + 1;
     size_t nfailed = 0;
     size_t i;
 
-    for (i = 0; i < ntests; ++i)
+    for (i = 0; i < nvector_tests; ++i)
     {
         pattern = test_vector[i][1];
         text = test_vector[i][2];
@@ -141,6 +142,21 @@ int main()
         }
     }
 
+    // regression test for unhandled BEGIN in the middle of an expression
+    // we need to test text strings with all possible values for the second
+    // byte because re.c was matching it against an uninitalized value, so
+    // it could be anything
+    pattern = "a^";
+    for (i = 0; i < 255; i++) {
+      char text_buf[] = { 'a', i, '\0' };
+      int m = re_match(pattern, text_buf, &length);
+      if (m != -1) {
+        fprintf(stderr, "[%lu/%lu]: pattern '%s' matched '%s' unexpectedly", ntests, ntests, pattern, text_buf);
+        nfailed += 1;
+        break;
+      }
+    }
+
     // printf("\n");
     printf("%lu/%lu tests succeeded.\n", ntests - nfailed, ntests);
     printf("\n");

From 7fb7a51bcc24f94f1c5c37e6f33e410cc32977fd Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Fri, 10 Jun 2022 13:40:28 +0200
Subject: [PATCH 02/30] add cbmc verify and fix a --conversion-check

add another formal verifier (much easier to use),
and fix an invalid signed conversion
---
 Makefile               |  4 ++++
 formal_verification.md |  5 +++++
 re.c                   | 30 +++++++++++++++++++++++++++++-
 3 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 857d2ee..ece0acb 100644
--- a/Makefile
+++ b/Makefile
@@ -107,3 +107,7 @@ test: all
 	@echo
 	@echo
 
+CBMC := cbmc
+
+verify:
+	$(CBMC) -DCPROVER --64 --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace $(CBMC_ARGS) re.c
diff --git a/formal_verification.md b/formal_verification.md
index 46fc9ee..a36bb45 100644
--- a/formal_verification.md
+++ b/formal_verification.md
@@ -140,3 +140,8 @@ sys     9m34.654s
 klee@780432c1aaae0:~$ 
 ```
 
+----
+
+For the formal verifier CBMC just call make verify.
+This verifier is much faster and better than klee.
+https://www.cprover.org/cbmc/
diff --git a/re.c b/re.c
index 20d1474..896a417 100644
--- a/re.c
+++ b/re.c
@@ -230,7 +230,8 @@ re_t re_compile(const char* pattern)
       default:
       {
         re_compiled[j].type = CHAR;
-        re_compiled[j].u.ch = c;
+        // cbmc: arithmetic overflow on signed to unsigned type conversion in (unsigned char)c
+        re_compiled[j].u.ch = (unsigned char)c;
       } break;
     }
     /* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */
@@ -526,3 +527,30 @@ static int matchpattern(regex_t* pattern, const char* text, int* matchlength)
 }
 
 #endif
+
+#ifdef CPROVER
+#define N 24
+
+/* Formal verification with cbmc: */
+/* cbmc -DCPROVER --64 --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace re.c
+ */
+int main(int argc, char* argv[])
+{
+  /* test input - ten chars used as a regex-pattern input */
+  char arr[N];
+
+  /* make input symbolic, to search all paths through the code */
+  /* i.e. the input is checked for all possible ten-char combinations */
+  for (int i=0; i<sizeof(arr)-1; i++) {
+      //arr[i] = nondet_char();
+      assume(arr[i] > -127 && arr[i] < 128);
+  }
+  /* assume proper NULL termination */
+  assume(arr[sizeof(arr) - 1] == 0);
+
+  /* verify abscence of run-time errors - go! */
+  re_compile(arr);
+
+  return 0;
+}
+#endif

From 69afafec0a9db141a31706239e12e736faec7469 Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Fri, 10 Jun 2022 13:52:11 +0200
Subject: [PATCH 03/30] extend CBMC checks to all APIs

compare GH #76
---
 Makefile |  2 +-
 re.c     | 11 ++++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index ece0acb..369f419 100644
--- a/Makefile
+++ b/Makefile
@@ -110,4 +110,4 @@ test: all
 CBMC := cbmc
 
 verify:
-	$(CBMC) -DCPROVER --64 --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace $(CBMC_ARGS) re.c
+	$(CBMC) -DCPROVER --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace $(CBMC_ARGS) re.c
diff --git a/re.c b/re.c
index 896a417..6b4cff6 100644
--- a/re.c
+++ b/re.c
@@ -536,8 +536,10 @@ static int matchpattern(regex_t* pattern, const char* text, int* matchlength)
  */
 int main(int argc, char* argv[])
 {
+  int length;
   /* test input - ten chars used as a regex-pattern input */
   char arr[N];
+  regex_t pattern[N];
 
   /* make input symbolic, to search all paths through the code */
   /* i.e. the input is checked for all possible ten-char combinations */
@@ -547,10 +549,17 @@ int main(int argc, char* argv[])
   }
   /* assume proper NULL termination */
   assume(arr[sizeof(arr) - 1] == 0);
-
   /* verify abscence of run-time errors - go! */
   re_compile(arr);
 
+  for (int i=0; i<N; i++) {
+      pattern[i].type = nondet_uchar();
+      pattern[i].u.ch = nondet_int();
+  }
+  re_print(&pattern);
+
+  re_match(&pattern, arr, &length);
+
   return 0;
 }
 #endif

From e4486516ec68a2c8b6e3844400a27c2c30ea43a0 Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Fri, 10 Jun 2022 13:58:30 +0200
Subject: [PATCH 04/30] fix GH #76 out-of-bounds

with invalid types in re_print
---
 re.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/re.c b/re.c
index 6b4cff6..a562ce4 100644
--- a/re.c
+++ b/re.c
@@ -251,7 +251,7 @@ re_t re_compile(const char* pattern)
 
 void re_print(regex_t* pattern)
 {
-  const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE", "BRANCH" };
+  const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE" /*, "BRANCH" */ };
 
   int i;
   int j;
@@ -263,7 +263,11 @@ void re_print(regex_t* pattern)
       break;
     }
 
-    printf("type: %s", types[pattern[i].type]);
+    if (pattern[i].type <= NOT_WHITESPACE)
+      printf("type: %s", types[pattern[i].type]);
+    else
+      printf("invalid type: %d", pattern[i].type);
+
     if (pattern[i].type == CHAR_CLASS || pattern[i].type == INV_CHAR_CLASS)
     {
       printf(" [");

From bd55c35edf45d42a99395446db86e7c84482862c Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Fri, 10 Jun 2022 14:26:35 +0200
Subject: [PATCH 05/30] refactor cbmc proofs a bit

separate functions.
check assume vs nondet_uchar() (both are the same).
use a smaller MAX_REGEXP_OBJECTS for cbmc (much faster then)

improve the no buffer-out-of-bounds access on invalid patterns check.
---
 Makefile |  3 ++-
 re.c     | 70 +++++++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/Makefile b/Makefile
index 369f419..2a204b3 100644
--- a/Makefile
+++ b/Makefile
@@ -109,5 +109,6 @@ test: all
 
 CBMC := cbmc
 
+# unwindset: loop max MAX_REGEXP_OBJECTS patterns
 verify:
-	$(CBMC) -DCPROVER --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace $(CBMC_ARGS) re.c
+	$(CBMC) -DCPROVER --unwindset 8 --unwind 16 --depth 16 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check $(CBMC_ARGS) re.c
diff --git a/re.c b/re.c
index a562ce4..05aa97c 100644
--- a/re.c
+++ b/re.c
@@ -35,8 +35,12 @@
 
 /* Definitions: */
 
-#define MAX_REGEXP_OBJECTS      30    /* Max number of regex symbols in expression. */
 #define MAX_CHAR_CLASS_LEN      40    /* Max length of character-class buffer in.   */
+#ifndef CPROVER
+#define MAX_REGEXP_OBJECTS      30    /* Max number of regex symbols in expression. */
+#else
+#define MAX_REGEXP_OBJECTS      8    /* faster formal proofs */
+#endif
 
 
 enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ };
@@ -226,6 +230,9 @@ re_t re_compile(const char* pattern)
         re_compiled[j].u.ccl = &ccl_buf[buf_begin];
       } break;
 
+      case '\0': // EOL
+        return 0;
+
       /* Other characters: */
       default:
       {
@@ -234,12 +241,6 @@ re_t re_compile(const char* pattern)
         re_compiled[j].u.ch = (unsigned char)c;
       } break;
     }
-    /* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */
-    if (pattern[i] == 0)
-    {
-      return 0;
-    }
-
     i += 1;
     j += 1;
   }
@@ -251,11 +252,14 @@ re_t re_compile(const char* pattern)
 
 void re_print(regex_t* pattern)
 {
-  const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE" /*, "BRANCH" */ };
+  const char *const types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE" /*, "BRANCH" */ };
 
-  int i;
-  int j;
+  unsigned char i;
+  unsigned char j;
   char c;
+
+  if (!pattern)
+    return;
   for (i = 0; i < MAX_REGEXP_OBJECTS; ++i)
   {
     if (pattern[i].type == UNUSED)
@@ -538,32 +542,60 @@ static int matchpattern(regex_t* pattern, const char* text, int* matchlength)
 /* Formal verification with cbmc: */
 /* cbmc -DCPROVER --64 --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace re.c
  */
-int main(int argc, char* argv[])
+
+void verify_re_compile()
 {
-  int length;
   /* test input - ten chars used as a regex-pattern input */
   char arr[N];
-  regex_t pattern[N];
-
   /* make input symbolic, to search all paths through the code */
   /* i.e. the input is checked for all possible ten-char combinations */
   for (int i=0; i<sizeof(arr)-1; i++) {
-      //arr[i] = nondet_char();
-      assume(arr[i] > -127 && arr[i] < 128);
+    //arr[i] = nondet_char();
+    assume(arr[i] > -127 && arr[i] < 128);
   }
   /* assume proper NULL termination */
   assume(arr[sizeof(arr) - 1] == 0);
   /* verify abscence of run-time errors - go! */
   re_compile(arr);
+}
 
-  for (int i=0; i<N; i++) {
-      pattern[i].type = nondet_uchar();
-      pattern[i].u.ch = nondet_int();
+void verify_re_print()
+{
+  regex_t pattern[MAX_REGEXP_OBJECTS];
+  for (unsigned char i=0; i<MAX_REGEXP_OBJECTS; i++) {
+    //pattern[i].type = nondet_uchar();
+    assume(pattern[i].type >= 0 && pattern[i].type <= 255);
+    pattern[i].u.ccl = nondet_long();
   }
   re_print(&pattern);
+}
+
+void verify_re_match()
+{
+  int length;
+  regex_t pattern[MAX_REGEXP_OBJECTS];
+  char arr[N];
+
+  for (unsigned char i=0; i<MAX_REGEXP_OBJECTS; i++) {
+    //pattern[i].type = nondet_uchar();
+    //pattern[i].u.ch = nondet_int();
+    assume(pattern[i].type >= 0 && pattern[i].type <= 255);
+    assume(pattern[i].u.ccl >= 0 && pattern[i].u.ccl <= ~1);
+  }
+  for (int i=0; i<sizeof(arr)-1; i++) {
+    assume(arr[i] > -127 && arr[i] < 128);
+  }
+  /* assume proper NULL termination */
+  assume(arr[sizeof(arr) - 1] == 0);
 
   re_match(&pattern, arr, &length);
+}
 
+int main(int argc, char* argv[])
+{
+  verify_re_compile();
+  verify_re_printh();
+  verify_re_match();
   return 0;
 }
 #endif

From 9d25c223eedf3ce1056fcf6e31703dd5077673d7 Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Fri, 10 Jun 2022 14:51:24 +0200
Subject: [PATCH 06/30] support "\\\\" pattern, and disallow "..\\"

i.e. a pattern ending in a lone "\\" is now rejected as invalid.
---
 re.c          | 11 +++--------
 tests/test1.c |  3 +++
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/re.c b/re.c
index 05aa97c..4696505 100644
--- a/re.c
+++ b/re.c
@@ -156,7 +156,7 @@ re_t re_compile(const char* pattern)
             case 's': {    re_compiled[j].type = WHITESPACE;       } break;
             case 'S': {    re_compiled[j].type = NOT_WHITESPACE;   } break;
 
-            /* Escaped character, e.g. '.' or '$' */
+              /* Escaped character, e.g. '.', '$' or '\\' */
             default:
             {
               re_compiled[j].type = CHAR;
@@ -164,14 +164,9 @@ re_t re_compile(const char* pattern)
             } break;
           }
         }
-        /* '\\' as last char in pattern -> invalid regular expression. */
-/*
+        /* '\\' as last char without previous \\ -> invalid regular expression. */
         else
-        {
-          re_compiled[j].type = CHAR;
-          re_compiled[j].ch = pattern[i];
-        }
-*/
+          return 0;
       } break;
 
       /* Character class: */
diff --git a/tests/test1.c b/tests/test1.c
index 5fdfe74..af43c99 100644
--- a/tests/test1.c
+++ b/tests/test1.c
@@ -89,6 +89,9 @@ char* test_vector[][4] =
   { NOK, "X?Y",                        "Z",               (char*) 0      },
   { OK, "[a-z]+\nbreak",              "blahblah\nbreak",  (char*) 14     },
   { OK, "[a-z\\s]+\nbreak",           "bla bla \nbreak",  (char*) 14     },
+  { NOK, "a\\",                       "a\\",              (char*) 0      },
+  { NOK, "\\",                        "\\",               (char*) 0      },
+  { OK,  "\\\\",                      "\\",               (char*) 1      },
 };
 
 

From 7bd15de3604148bf4dd2c4e41851fdc83c86dfb3 Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Fri, 10 Jun 2022 14:54:19 +0200
Subject: [PATCH 07/30] Clarify python2 is needed

---
 Makefile              | 2 +-
 scripts/regex_test.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 2a204b3..4d84611 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ PYTHON != if (python --version 2>&1 | grep -q 'Python 2\..*'); then \
           elif command -v python2 >/dev/null 2>&1; then             \
             echo 'python2';                                         \
           else                                                      \
-            echo 'Error: no compatible python version found.' >&2;  \
+            echo 'Error: no compatible python 2 version found.' >&2;  \
             exit 1;                                                 \
           fi
 
diff --git a/scripts/regex_test.py b/scripts/regex_test.py
index 4fa98de..08b4c5e 100755
--- a/scripts/regex_test.py
+++ b/scripts/regex_test.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 """
-  This program generates random text that matches a given regex-pattern.
+  This python2 program generates random text that matches a given regex-pattern.
   The pattern is given via sys.argv and the generated text is passed to
   the binary 'tests/test_rand' to check if the generated text also matches
   the regex-pattern in the C implementation.

From 0388df31ef50e5df4681da44f3828b5c112aa4e0 Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Fri, 10 Jun 2022 15:37:47 +0200
Subject: [PATCH 08/30] re-enable INV_CHAR_CLASS

and use the enum type internally
---
 README.md     | 7 +------
 re.c          | 8 +++++---
 tests/test1.c | 4 +---
 3 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 0a2be86..fabddab 100644
--- a/README.md
+++ b/README.md
@@ -51,8 +51,6 @@ int  re_match(const char* pattern, const char* text, int* matchlength);
 ### Supported regex-operators
 The following features / regex-operators are supported by this library.
 
-NOTE: inverted character classes are buggy - see the test harness for concrete examples.
-
 
   -  `.`         Dot, matches any character
   -  `^`         Start anchor, matches beginning of string
@@ -104,10 +102,10 @@ if (match_idx != -1)
 For more usage examples I encourage you to look at the code in the `tests`-folder.
 
 ### TODO
-- Fix the implementation of inverted character classes.
 - Fix implementation of branches (`|`), and see if that can lead us closer to groups as well, e.g. `(a|b)+`.
 - Add `example.c` that demonstrates usage.
 - Add `tests/test_perf.c` for performance and time measurements.
+- Add optional multibyte support (e.g. UTF-8)
 - Testing: Improve pattern rejection testing.
 
 ### FAQ
@@ -118,6 +116,3 @@ For more usage examples I encourage you to look at the code in the `tests`-folde
 ### License
 All material in this repository is in the public domain.
 
-
-
- 
diff --git a/re.c b/re.c
index 4696505..d4413e6 100644
--- a/re.c
+++ b/re.c
@@ -15,7 +15,7 @@
  *   '+'        Plus, match one or more (greedy)
  *   '?'        Question, match zero or one (non-greedy)
  *   '[abc]'    Character class, match if one of {'a', 'b', 'c'}
- *   '[^abc]'   Inverted class, match if NOT one of {'a', 'b', 'c'} -- NOTE: feature is currently broken!
+ *   '[^abc]'   Inverted class, match if NOT one of {'a', 'b', 'c'}
  *   '[a-zA-Z]' Character ranges, the character set of the ranges { a-z | A-Z }
  *   '\s'       Whitespace, \t \f \r \n \v and spaces
  *   '\S'       Non-whitespace
@@ -43,11 +43,11 @@
 #endif
 
 
-enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ };
+enum regex_type_e { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ };
 
 typedef struct regex_t
 {
-  unsigned char  type;   /* CHAR, STAR, etc.                      */
+  enum regex_type_e type;   /* CHAR, STAR, etc.                      */
   union
   {
     unsigned char  ch;   /*      the character itself             */
@@ -270,6 +270,8 @@ void re_print(regex_t* pattern)
     if (pattern[i].type == CHAR_CLASS || pattern[i].type == INV_CHAR_CLASS)
     {
       printf(" [");
+      if (pattern[i].type == INV_CHAR_CLASS)
+        printf("^");
       for (j = 0; j < MAX_CHAR_CLASS_LEN; ++j)
       {
         c = pattern[i].u.ccl[j];
diff --git a/tests/test1.c b/tests/test1.c
index af43c99..7005494 100644
--- a/tests/test1.c
+++ b/tests/test1.c
@@ -75,15 +75,13 @@ char* test_vector[][4] =
   { OK,  "[Hh]ello [Ww]orld\\s*[!]?", "Hello world!   ",  (char*) 11     },
   { OK,  "[Hh]ello [Ww]orld\\s*[!]?", "Hello world  !",   (char*) 13     },
   { OK,  "[Hh]ello [Ww]orld\\s*[!]?", "hello World    !", (char*) 15     },
-  { NOK, "\\d\\d?:\\d\\d?:\\d\\d?",   "a:0",              (char*) 0      }, /* Failing test case reported in https://github.com/kokke/tiny-regex-c/issues/12 */
-/*
+  { NOK, "\\d\\d?:\\d\\d?:\\d\\d?",   "a:0",              (char*) 0      },
   { OK,  "[^\\w][^-1-4]",     ")T",          (char*) 2      },
   { OK,  "[^\\w][^-1-4]",     ")^",          (char*) 2      },
   { OK,  "[^\\w][^-1-4]",     "*)",          (char*) 2      },
   { OK,  "[^\\w][^-1-4]",     "!.",          (char*) 2      },
   { OK,  "[^\\w][^-1-4]",     " x",          (char*) 2      },
   { OK,  "[^\\w][^-1-4]",     "$b",          (char*) 2      },
-*/
   { OK,  ".?bar",                      "real_bar",        (char*) 4      },
   { NOK, ".?bar",                      "real_foo",        (char*) 0      },
   { NOK, "X?Y",                        "Z",               (char*) 0      },

From f334c5b3ff61acb6fbeebf83c11502bb9989b64b Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Fri, 10 Jun 2022 15:38:31 +0200
Subject: [PATCH 09/30] prepare multi-byte support

Also fix isalpha crashes on bad libc's (cast to unsigned char). Fixes GH #70.
Multi-byte here means e.g. UTF-8.
---
 re.c          | 7 ++++---
 tests/test1.c | 5 +++++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/re.c b/re.c
index d4413e6..fae8aa0 100644
--- a/re.c
+++ b/re.c
@@ -296,15 +296,15 @@ void re_print(regex_t* pattern)
 /* Private functions: */
 static int matchdigit(char c)
 {
-  return isdigit(c);
+  return isdigit((unsigned char)c);
 }
 static int matchalpha(char c)
 {
-  return isalpha(c);
+  return isalpha((unsigned char)c);
 }
 static int matchwhitespace(char c)
 {
-  return isspace(c);
+  return isspace((unsigned char)c);
 }
 static int matchalphanum(char c)
 {
@@ -407,6 +407,7 @@ static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchle
 {
   int prelen = *matchlength;
   const char* prepoint = text;
+  // TODO check if multibyte, and use mbtowc() then
   while ((text[0] != '\0') && matchone(p, *text))
   {
     text++;
diff --git a/tests/test1.c b/tests/test1.c
index 7005494..b98be12 100644
--- a/tests/test1.c
+++ b/tests/test1.c
@@ -4,6 +4,7 @@
 
 #include <stdio.h>
 #include <string.h>
+//#include <locale.h>
 #include "re.h"
 
 
@@ -90,6 +91,8 @@ char* test_vector[][4] =
   { NOK, "a\\",                       "a\\",              (char*) 0      },
   { NOK, "\\",                        "\\",               (char*) 0      },
   { OK,  "\\\\",                      "\\",               (char*) 1      },
+  // no multibyte support yet
+  //{ OK,  "\\w+",                      "Çüéâ",             (char*) 4      },
 };
 
 
@@ -106,6 +109,8 @@ int main()
     size_t nfailed = 0;
     size_t i;
 
+    //setlocale(LC_CTYPE, "en_US.UTF-8");
+
     for (i = 0; i < ntests; ++i)
     {
         pattern = test_vector[i][1];

From 148e229fb68a7875668653df6572a80f8ca8b988 Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Mon, 20 Jun 2022 08:44:45 +0200
Subject: [PATCH 10/30] TODOs and new tests

---
 README.md     | 13 +++++++++----
 tests/test1.c |  2 ++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index fabddab..d74f46a 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ The following features / regex-operators are supported by this library.
   -  `[abc]`     Character class, match if one of {'a', 'b', 'c'}
   -  `[^abc]`   Inverted class, match if NOT one of {'a', 'b', 'c'}
   -  `[a-zA-Z]` Character ranges, the character set of the ranges { a-z | A-Z }
-  -  `\s`       Whitespace, \t \f \r \n \v and spaces
+  -  `\s`       Whitespace, '\t' '\f' '\r' '\n' '\v' and spaces
   -  `\S`       Non-whitespace
   -  `\w`       Alphanumeric, [a-zA-Z0-9_]
   -  `\W`       Non-alphanumeric
@@ -88,7 +88,7 @@ int match_length;
 /* Standard null-terminated C-string to search: */
 const char* string_to_search = "ahem.. 'hello world !' ..";
 
-/* Compile a simple regular expression using character classes, meta-char and greedy + non-greedy quantifiers: */
+/* Compile a simple regular expression using character classes, meta-char and greedy quantifiers: */
 re_t pattern = re_compile("[Hh]ello [Ww]orld\\s*[!]?");
 
 /* Check if the regex matches the text: */
@@ -102,10 +102,15 @@ if (match_idx != -1)
 For more usage examples I encourage you to look at the code in the `tests`-folder.
 
 ### TODO
-- Fix implementation of branches (`|`), and see if that can lead us closer to groups as well, e.g. `(a|b)+`.
+- Fix implementation of branches (`|`) (see the branch), and add groups as well, e.g. `(a|b)+`.
+- `re_match_capture()` with groups.
 - Add `example.c` that demonstrates usage.
 - Add `tests/test_perf.c` for performance and time measurements.
-- Add optional multibyte support (e.g. UTF-8)
+- Add optional multibyte support (e.g. UTF-8). On non-wchar systems roll our own.
+- Word boundary: \b \B
+- non-greedy, lazy quantifiers (??, +?, *?, {n,m}?)
+- case-insensitive option or API. `re_matchi()`
+- '.' may not match '\r' nor '\n', unless a single-line option is given.
 - Testing: Improve pattern rejection testing.
 
 ### FAQ
diff --git a/tests/test1.c b/tests/test1.c
index b98be12..228b2e1 100644
--- a/tests/test1.c
+++ b/tests/test1.c
@@ -37,6 +37,8 @@ char* test_vector[][4] =
   { OK,  "[abc]",                     "1c2",              (char*) 1      },
   { NOK, "[abc]",                     "1C2",              (char*) 0      },
   { OK,  "[1-5]+",                    "0123456789",       (char*) 5      },
+  { OK,  "[1-5-]+",                   "123-",             (char*) 4      },
+  { OK,  "[1-5-]+[-1-2]-[-]", 	      "13132231--353444-511--",    (char *) 22  },
   { OK,  "[.2]",                      "1C2",              (char*) 1      },
   { OK,  "a*$",                       "Xaa",              (char*) 2      },
   { OK,  "a*$",                       "Xaa",              (char*) 2      },

From 89f513f4e8fb74a673bf9dface055faee2d4ba2a Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Mon, 20 Jun 2022 08:56:38 +0200
Subject: [PATCH 11/30] fix ranges with ending -

Fixes GH #79 and the exreg failures with [1-5-]+[-1-2]-[-]
---
 re.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/re.c b/re.c
index fae8aa0..e81aa67 100644
--- a/re.c
+++ b/re.c
@@ -373,7 +373,9 @@ static int matchcharclass(char c, const char* str)
     {
       if (c == '-')
       {
-        return ((str[-1] == '\0') || (str[1] == '\0'));
+        if ((str[-1] == '\0') || (str[1] == '\0'))
+            return 1;
+        // else continue
       }
       else
       {

From e9f6a87d2a33a1f819659e937d480e155ca7e5dc Mon Sep 17 00:00:00 2001
From: Federico Perini <federico.perini@gmail.com>
Date: Sun, 18 Dec 2022 09:34:51 +0100
Subject: [PATCH 12/30] fix end anchor match length

---
 re.c                    |  4 ++++
 tests/test_end_anchor.c | 20 ++++++++++++++++++++
 2 files changed, 24 insertions(+)
 create mode 100644 tests/test_end_anchor.c

diff --git a/re.c b/re.c
index 20d1474..06bab58 100644
--- a/re.c
+++ b/re.c
@@ -99,6 +99,10 @@ int re_matchp(re_t pattern, const char* text, int* matchlength)
 
           return idx;
         }
+
+        //  Reset match length for the next starting point
+	*matchlength = 0;
+
       }
       while (*text++ != '\0');
     }
diff --git a/tests/test_end_anchor.c b/tests/test_end_anchor.c
new file mode 100644
index 0000000..005fc7b
--- /dev/null
+++ b/tests/test_end_anchor.c
@@ -0,0 +1,20 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "../re.h"
+
+int main() {
+ 
+   const char *text = "table football";
+   const char *pattern = "l$";   
+   int index,len;
+
+   index = re_match(pattern, text, &len);
+
+   if (index==13 && len==1) {
+      return 0; 
+   } else {
+      printf("ERROR! index=%d len=%d \n",index,len);
+      return -1;
+   }
+
+}	

From 359a38c6ea729661a2109b5b234ff149c414dcba Mon Sep 17 00:00:00 2001
From: Federico Perini <federico.perini@gmail.com>
Date: Sun, 18 Dec 2022 09:37:29 +0100
Subject: [PATCH 13/30] add end anchor test to makefile

---
 Makefile                | 18 ++++++++++++------
 tests/test_end_anchor.c |  2 +-
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/Makefile b/Makefile
index 857d2ee..c640f46 100644
--- a/Makefile
+++ b/Makefile
@@ -17,14 +17,15 @@ PYTHON != if (python --version 2>&1 | grep -q 'Python 2\..*'); then \
 CFLAGS := -O3 -Wall -Wextra -std=c99 -I.
 
 all:
-	@$(CC) $(CFLAGS) re.c tests/test1.c         -o tests/test1
-	@$(CC) $(CFLAGS) re.c tests/test2.c         -o tests/test2
-	@$(CC) $(CFLAGS) re.c tests/test_rand.c     -o tests/test_rand
-	@$(CC) $(CFLAGS) re.c tests/test_rand_neg.c -o tests/test_rand_neg
-	@$(CC) $(CFLAGS) re.c tests/test_compile.c  -o tests/test_compile
+	@$(CC) $(CFLAGS) re.c tests/test1.c           -o tests/test1
+	@$(CC) $(CFLAGS) re.c tests/test2.c           -o tests/test2
+	@$(CC) $(CFLAGS) re.c tests/test_rand.c       -o tests/test_rand
+	@$(CC) $(CFLAGS) re.c tests/test_rand_neg.c   -o tests/test_rand_neg
+	@$(CC) $(CFLAGS) re.c tests/test_compile.c    -o tests/test_compile
+	@$(CC) $(CFLAGS) re.c tests/test_end_anchor.c -o tests/test_end_anchor
 
 clean:
-	@rm -f tests/test1 tests/test2 tests/test_rand tests/test_compile
+	@rm -f tests/test1 tests/test2 tests/test_rand tests/test_compile tests/test_end_anchor
 	@#@$(foreach test_bin,$(TEST_BINS), rm -f $(test_bin) ; )
 	@rm -f a.out
 	@rm -f *.o
@@ -106,4 +107,9 @@ test: all
 	@./tests/test2
 	@echo
 	@echo
+	@echo
+	@echo
+	@./tests/test_end_anchor
+	@echo
+	@echo
 
diff --git a/tests/test_end_anchor.c b/tests/test_end_anchor.c
index 005fc7b..1809f7c 100644
--- a/tests/test_end_anchor.c
+++ b/tests/test_end_anchor.c
@@ -1,6 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
-#include "../re.h"
+#include "re.h"
 
 int main() {
  

From 45dfbe4880afe3d091e152698f9913bdf7423282 Mon Sep 17 00:00:00 2001
From: kokke <spam@rowdy.dk>
Date: Fri, 11 Jun 2021 20:13:36 +0200
Subject: [PATCH 14/30] Update formal_verification.md

---
 formal_verification.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/formal_verification.md b/formal_verification.md
index 4227e41..46fc9ee 100644
--- a/formal_verification.md
+++ b/formal_verification.md
@@ -4,7 +4,7 @@ Here is a crude demo of formal verification of tiny-regex. This is a hefty plagi
 
 I am using the [KLEE Symbolic Execution Engine](https://klee.github.io/) and their Docker image here on a Debian-based host.
 
-What this does, is mechanically try and prove the abscence of all run-time errors, memory corruption bugs and other problems by symbolic execution. We mark the inputs as being symbolic, so that the tool knows to use that as the "search space". That means KLEE checks all possible inputs of the form we give it.
+What this does, is mechanically try to prove the abscence of all run-time errors, memory corruption bugs and other problems by symbolic execution. We mark the inputs as being symbolic, so that the tool knows to use that as the "search space". That means KLEE checks all possible inputs of the form we give it.
 
 Steps:
 

From fd17b66552a02515d956a02f611235738ee75219 Mon Sep 17 00:00:00 2001
From: Jonathan Marler <johnnymarler@gmail.com>
Date: Sat, 6 Mar 2021 05:28:33 -0700
Subject: [PATCH 15/30] use flat memory layout

---
 re.c | 186 ++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 102 insertions(+), 84 deletions(-)

diff --git a/re.c b/re.c
index 34831af..7f9ef11 100644
--- a/re.c
+++ b/re.c
@@ -35,30 +35,31 @@
 
 /* Definitions: */
 
-#define MAX_REGEXP_OBJECTS      30    /* Max number of regex symbols in expression. */
-#define MAX_CHAR_CLASS_LEN      40    /* Max length of character-class buffer in.   */
+#define MAX_REGEXP_LEN      70    /* Max number of bytes for a regex. */
 
 
 enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ };
 
 typedef struct regex_t
 {
-  unsigned char  type;   /* CHAR, STAR, etc.                      */
-  union
-  {
-    unsigned char  ch;   /*      the character itself             */
-    unsigned char* ccl;  /*  OR  a pointer to characters in class */
-  } u;
+  unsigned char type;    /* CHAR, STAR, etc.                      */
+  unsigned char data_len;
+  unsigned char data[0];
 } regex_t;
 
+static re_t getnext(regex_t* pattern)
+{
+  return (re_t)(((unsigned char*)pattern) + 2 + pattern->data_len);
+}
+
 
 
 /* Private function declarations: */
 static int matchpattern(regex_t* pattern, const char* text, int* matchlength);
 static int matchcharclass(char c, const char* str);
-static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchlength);
-static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchlength);
-static int matchone(regex_t p, char c);
+static int matchstar(regex_t *p, regex_t* pattern, const char* text, int* matchlength);
+static int matchplus(regex_t *p, regex_t* pattern, const char* text, int* matchlength);
+static int matchone(regex_t* p, char c);
 static int matchdigit(char c);
 static int matchalpha(char c);
 static int matchwhitespace(char c);
@@ -80,9 +81,9 @@ int re_matchp(re_t pattern, const char* text, int* matchlength)
   *matchlength = 0;
   if (pattern != 0)
   {
-    if (pattern[0].type == BEGIN)
+    if (pattern->type == BEGIN)
     {
-      return ((matchpattern(&pattern[1], text, matchlength)) ? 0 : -1);
+      return ((matchpattern(getnext(pattern), text, matchlength)) ? 0 : -1);
     }
     else
     {
@@ -106,33 +107,37 @@ int re_matchp(re_t pattern, const char* text, int* matchlength)
   return -1;
 }
 
+static int min(int a, int b)
+{
+  return (a <= b) ? a : b;
+}
+
 re_t re_compile(const char* pattern)
 {
-  /* The sizes of the two static arrays below substantiates the static RAM usage of this module.
-     MAX_REGEXP_OBJECTS is the max number of symbols in the expression.
-     MAX_CHAR_CLASS_LEN determines the size of buffer for chars in all char-classes in the expression. */
-  static regex_t re_compiled[MAX_REGEXP_OBJECTS];
-  static unsigned char ccl_buf[MAX_CHAR_CLASS_LEN];
-  int ccl_bufidx = 1;
+  /* The size of this static array substantiates the static RAM usage of this module.
+     MAX_REGEXP_LEN is the max number number of bytes in the expression. */
+  static unsigned char re_data[MAX_REGEXP_LEN];
 
   char c;     /* current char in pattern   */
   int i = 0;  /* index into pattern        */
-  int j = 0;  /* index into re_compiled    */
+  int j = 0;  /* index into re_data    */
 
-  while (pattern[i] != '\0' && (j+1 < MAX_REGEXP_OBJECTS))
+  while (pattern[i] != '\0' && (j+3 < MAX_REGEXP_LEN))
   {
     c = pattern[i];
+    regex_t *re_compiled = (regex_t*)(re_data+j);
+    re_compiled->data_len = 0;
 
     switch (c)
     {
       /* Meta-characters: */
-      case '^': {    re_compiled[j].type = BEGIN;           } break;
-      case '$': {    re_compiled[j].type = END;             } break;
-      case '.': {    re_compiled[j].type = DOT;             } break;
-      case '*': {    re_compiled[j].type = STAR;            } break;
-      case '+': {    re_compiled[j].type = PLUS;            } break;
-      case '?': {    re_compiled[j].type = QUESTIONMARK;    } break;
-/*    case '|': {    re_compiled[j].type = BRANCH;          } break; <-- not working properly */
+      case '^': {    re_compiled->type = BEGIN;           } break;
+      case '$': {    re_compiled->type = END;             } break;
+      case '.': {    re_compiled->type = DOT;             } break;
+      case '*': {    re_compiled->type = STAR;            } break;
+      case '+': {    re_compiled->type = PLUS;            } break;
+      case '?': {    re_compiled->type = QUESTIONMARK;    } break;
+/*    case '|': {    re_compiled->type = BRANCH;          } break; <-- not working properly */
 
       /* Escaped character-classes (\s \w ...): */
       case '\\':
@@ -145,18 +150,19 @@ re_t re_compile(const char* pattern)
           switch (pattern[i])
           {
             /* Meta-character: */
-            case 'd': {    re_compiled[j].type = DIGIT;            } break;
-            case 'D': {    re_compiled[j].type = NOT_DIGIT;        } break;
-            case 'w': {    re_compiled[j].type = ALPHA;            } break;
-            case 'W': {    re_compiled[j].type = NOT_ALPHA;        } break;
-            case 's': {    re_compiled[j].type = WHITESPACE;       } break;
-            case 'S': {    re_compiled[j].type = NOT_WHITESPACE;   } break;
+            case 'd': {    re_compiled->type = DIGIT;            } break;
+            case 'D': {    re_compiled->type = NOT_DIGIT;        } break;
+            case 'w': {    re_compiled->type = ALPHA;            } break;
+            case 'W': {    re_compiled->type = NOT_ALPHA;        } break;
+            case 's': {    re_compiled->type = WHITESPACE;       } break;
+            case 'S': {    re_compiled->type = NOT_WHITESPACE;   } break;
 
             /* Escaped character, e.g. '.' or '$' */
             default:
             {
-              re_compiled[j].type = CHAR;
-              re_compiled[j].u.ch = pattern[i];
+              re_compiled->type = CHAR;
+              re_compiled->data_len = 1;
+              re_compiled->data[0] = pattern[i];
             } break;
           }
         }
@@ -164,8 +170,9 @@ re_t re_compile(const char* pattern)
 /*
         else
         {
-          re_compiled[j].type = CHAR;
-          re_compiled[j].ch = pattern[i];
+          re_compiled->type = CHAR;
+          re_compiled->data_len = 1;
+          re_compiled->data[0] = pattern[i];
         }
 */
       } break;
@@ -173,13 +180,12 @@ re_t re_compile(const char* pattern)
       /* Character class: */
       case '[':
       {
-        /* Remember where the char-buffer starts. */
-        int buf_begin = ccl_bufidx;
+        int char_limit = min(0xff, MAX_REGEXP_LEN - j - 4); // 4 for this object and UNUSED at the minimum
 
         /* Look-ahead to determine if negated */
         if (pattern[i+1] == '^')
         {
-          re_compiled[j].type = INV_CHAR_CLASS;
+          re_compiled->type = INV_CHAR_CLASS;
           i += 1; /* Increment i to avoid including '^' in the char-buffer */
           if (pattern[i+1] == 0) /* incomplete pattern, missing non-zero char after '^' */
           {
@@ -188,7 +194,7 @@ re_t re_compile(const char* pattern)
         }
         else
         {
-          re_compiled[j].type = CHAR_CLASS;
+          re_compiled->type = CHAR_CLASS;
         }
 
         /* Copy characters inside [..] to buffer */
@@ -197,7 +203,7 @@ re_t re_compile(const char* pattern)
         {
           if (pattern[i] == '\\')
           {
-            if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1)
+            if (re_compiled->data_len >= char_limit)
             {
               //fputs("exceeded internal buffer!\n", stderr);
               return 0;
@@ -206,31 +212,32 @@ re_t re_compile(const char* pattern)
             {
               return 0;
             }
-            ccl_buf[ccl_bufidx++] = pattern[i++];
+            re_compiled->data[re_compiled->data_len++] = pattern[i++];
           }
-          else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN)
+          // TODO: I think this "else if" is a bug, should just be "if"
+          else if (re_compiled->data_len >= char_limit)
           {
               //fputs("exceeded internal buffer!\n", stderr);
               return 0;
           }
-          ccl_buf[ccl_bufidx++] = pattern[i];
+          re_compiled->data[re_compiled->data_len++] = pattern[i];
         }
-        if (ccl_bufidx >= MAX_CHAR_CLASS_LEN)
+        if (re_compiled->data_len >= char_limit)
         {
             /* Catches cases such as [00000000000000000000000000000000000000][ */
             //fputs("exceeded internal buffer!\n", stderr);
             return 0;
         }
         /* Null-terminate string end */
-        ccl_buf[ccl_bufidx++] = 0;
-        re_compiled[j].u.ccl = &ccl_buf[buf_begin];
+        re_compiled->data[re_compiled->data_len++] = 0;
       } break;
 
       /* Other characters: */
       default:
       {
-        re_compiled[j].type = CHAR;
-        re_compiled[j].u.ch = c;
+        re_compiled->type = CHAR;
+        re_compiled->data_len = 1;
+        re_compiled->data[0] = c;
       } break;
     }
     /* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */
@@ -240,35 +247,39 @@ re_t re_compile(const char* pattern)
     }
 
     i += 1;
-    j += 1;
+    j += 2 + re_compiled->data_len;
+  }
+  if (j + 1 >= MAX_REGEXP_LEN) {
+      //fputs("exceeded internal buffer!\n", stderr);
+       return 0;
   }
   /* 'UNUSED' is a sentinel used to indicate end-of-pattern */
-  re_compiled[j].type = UNUSED;
+  re_data[j] = UNUSED;
+  re_data[j+1] = 0;
 
-  return (re_t) re_compiled;
+  return (re_t) re_data;
 }
 
 void re_print(regex_t* pattern)
 {
   const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE", "BRANCH" };
 
-  int i;
   int j;
   char c;
-  for (i = 0; i < MAX_REGEXP_OBJECTS; ++i)
+  for (;; pattern = getnext(pattern))
   {
-    if (pattern[i].type == UNUSED)
+    if (pattern->type == UNUSED)
     {
       break;
     }
 
-    printf("type: %s", types[pattern[i].type]);
-    if (pattern[i].type == CHAR_CLASS || pattern[i].type == INV_CHAR_CLASS)
+    printf("type: %s", types[pattern->type]);
+    if (pattern->type == CHAR_CLASS || pattern->type == INV_CHAR_CLASS)
     {
       printf(" [");
-      for (j = 0; j < MAX_CHAR_CLASS_LEN; ++j)
+      for (j = 0; j < pattern->data_len; ++j)
       {
-        c = pattern[i].u.ccl[j];
+        c = pattern->data[j];
         if ((c == '\0') || (c == ']'))
         {
           break;
@@ -277,9 +288,9 @@ void re_print(regex_t* pattern)
       }
       printf("]");
     }
-    else if (pattern[i].type == CHAR)
+    else if (pattern->type == CHAR)
     {
-      printf(" '%c'", pattern[i].u.ch);
+      printf(" '%c'", pattern->data[0]);
     }
     printf("\n");
   }
@@ -380,13 +391,13 @@ static int matchcharclass(char c, const char* str)
   return 0;
 }
 
-static int matchone(regex_t p, char c)
+static int matchone(regex_t* p, char c)
 {
-  switch (p.type)
+  switch (p->type)
   {
     case DOT:            return matchdot(c);
-    case CHAR_CLASS:     return  matchcharclass(c, (const char*)p.u.ccl);
-    case INV_CHAR_CLASS: return !matchcharclass(c, (const char*)p.u.ccl);
+    case CHAR_CLASS:     return  matchcharclass(c, (const char*)p->data);
+    case INV_CHAR_CLASS: return !matchcharclass(c, (const char*)p->data);
     case DIGIT:          return  matchdigit(c);
     case NOT_DIGIT:      return !matchdigit(c);
     case ALPHA:          return  matchalphanum(c);
@@ -394,11 +405,11 @@ static int matchone(regex_t p, char c)
     case WHITESPACE:     return  matchwhitespace(c);
     case NOT_WHITESPACE: return !matchwhitespace(c);
     case BEGIN:          return 0;
-    default:             return  (p.u.ch == c);
+    default:             return  (p->data[0] == c);
   }
 }
 
-static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchlength)
+static int matchstar(regex_t* p, regex_t* pattern, const char* text, int* matchlength)
 {
   int prelen = *matchlength;
   const char* prepoint = text;
@@ -418,7 +429,7 @@ static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchle
   return 0;
 }
 
-static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchlength)
+static int matchplus(regex_t* p, regex_t* pattern, const char* text, int* matchlength)
 {
   const char* prepoint = text;
   while ((text[0] != '\0') && matchone(p, *text))
@@ -436,10 +447,8 @@ static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchle
   return 0;
 }
 
-static int matchquestion(regex_t p, regex_t* pattern, const char* text, int* matchlength)
+static int matchquestion(regex_t *p, regex_t* pattern, const char* text, int* matchlength)
 {
-  if (p.type == UNUSED)
-    return 1;
   if (matchpattern(pattern, text, matchlength))
       return 1;
   if (*text && matchone(p, *text++))
@@ -494,33 +503,42 @@ static int matchpattern(regex_t* pattern, const char* text, int *matchlength)
 static int matchpattern(regex_t* pattern, const char* text, int* matchlength)
 {
   int pre = *matchlength;
-  do
+  while (1)
   {
-    if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK))
+    if (pattern->type == UNUSED)
     {
-      return matchquestion(pattern[0], &pattern[2], text, matchlength);
+      return 1;
     }
-    else if (pattern[1].type == STAR)
+    regex_t* next_pattern = getnext(pattern);
+    if (next_pattern->type == QUESTIONMARK)
     {
-      return matchstar(pattern[0], &pattern[2], text, matchlength);
+      return matchquestion(pattern, getnext(next_pattern), text, matchlength);
     }
-    else if (pattern[1].type == PLUS)
+    else if (next_pattern->type == STAR)
     {
-      return matchplus(pattern[0], &pattern[2], text, matchlength);
+      return matchstar(pattern, getnext(next_pattern), text, matchlength);
     }
-    else if ((pattern[0].type == END) && pattern[1].type == UNUSED)
+    else if (next_pattern->type == PLUS)
+    {
+      return matchplus(pattern, getnext(next_pattern), text, matchlength);
+    }
+    else if ((pattern->type == END) && next_pattern->type == UNUSED)
     {
       return (text[0] == '\0');
     }
 /*  Branching is not working properly
-    else if (pattern[1].type == BRANCH)
+    else if (pattern->type == BRANCH)
     {
-      return (matchpattern(pattern, text) || matchpattern(&pattern[2], text));
+      return (matchpattern(pattern, text) || matchpattern(getnext(next_pattern), text));
     }
 */
   (*matchlength)++;
+    if (text[0] == '\0')
+      break;
+    if (!matchone(pattern, *text++))
+      break;
+    pattern = next_pattern;
   }
-  while ((text[0] != '\0') && matchone(*pattern++, *text++));
 
   *matchlength = pre;
   return 0;

From 87502b5c9a5f06b185cbf9e42ceba9197ac94e92 Mon Sep 17 00:00:00 2001
From: Jonathan Marler <johnnymarler@gmail.com>
Date: Sun, 7 Mar 2021 05:14:47 -0700
Subject: [PATCH 16/30] simplify matchplus and matchstar

`matchplus` can be simplified by only modifying `matchlength` once the complete match is successful.  This means it doesn't have to rewind `matchlength` as it iterates through each possible `matchpattern`.  It also means `matchlength` is left unmodified when no match is found.  Because of that, `matchstar` can leverage `matchplus`, which reduces it to a single line of code: `return matchplus(...) || matchpattern(...)`.
---
 re.c | 26 +++++---------------------
 1 file changed, 5 insertions(+), 21 deletions(-)

diff --git a/re.c b/re.c
index 7f9ef11..eb456f1 100644
--- a/re.c
+++ b/re.c
@@ -411,22 +411,7 @@ static int matchone(regex_t* p, char c)
 
 static int matchstar(regex_t* p, regex_t* pattern, const char* text, int* matchlength)
 {
-  int prelen = *matchlength;
-  const char* prepoint = text;
-  while ((text[0] != '\0') && matchone(p, *text))
-  {
-    text++;
-    (*matchlength)++;
-  }
-  while (text >= prepoint)
-  {
-    if (matchpattern(pattern, text--, matchlength))
-      return 1;
-    (*matchlength)--;
-  }
-
-  *matchlength = prelen;
-  return 0;
+  return matchplus(p, pattern, text, matchlength) || matchpattern(pattern, text, matchlength);
 }
 
 static int matchplus(regex_t* p, regex_t* pattern, const char* text, int* matchlength)
@@ -435,15 +420,14 @@ static int matchplus(regex_t* p, regex_t* pattern, const char* text, int* matchl
   while ((text[0] != '\0') && matchone(p, *text))
   {
     text++;
-    (*matchlength)++;
   }
-  while (text > prepoint)
+  for (; text > prepoint; text--)
   {
-    if (matchpattern(pattern, text--, matchlength))
+    if (matchpattern(pattern, text, matchlength)) {
+      *matchlength += text - prepoint;
       return 1;
-    (*matchlength)--;
+    }
   }
-
   return 0;
 }
 

From b5d1b7e79f4d91833d96cb2060d59cdbde8d3a50 Mon Sep 17 00:00:00 2001
From: Jonathan Marler <johnnymarler@gmail.com>
Date: Sat, 6 Mar 2021 07:19:02 -0700
Subject: [PATCH 17/30] fix mishandling of ^ inside an expression

---
 Makefile      |  1 +
 re.c          |  1 +
 tests/test1.c | 22 +++++++++++++++++++++-
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 4d84611..fbf754c 100644
--- a/Makefile
+++ b/Makefile
@@ -101,6 +101,7 @@ test: all
 	@$(PYTHON) ./scripts/regex_test_neg.py [012345-9]             $(NRAND_TESTS)
 	@$(PYTHON) ./scripts/regex_test_neg.py [0-56789]              $(NRAND_TESTS)
 	@$(PYTHON) ./scripts/regex_test_neg.py .*123faerdig           $(NRAND_TESTS)
+	@$(PYTHON) ./scripts/regex_test_neg.py a^                     $(NRAND_TESTS)
 	@echo
 	@echo
 	@./tests/test2
diff --git a/re.c b/re.c
index e81aa67..03b79ae 100644
--- a/re.c
+++ b/re.c
@@ -401,6 +401,7 @@ static int matchone(regex_t p, char c)
     case NOT_ALPHA:      return !matchalphanum(c);
     case WHITESPACE:     return  matchwhitespace(c);
     case NOT_WHITESPACE: return !matchwhitespace(c);
+    case BEGIN:          return 0;
     default:             return  (p.u.ch == c);
   }
 }
diff --git a/tests/test1.c b/tests/test1.c
index 228b2e1..4e3e199 100644
--- a/tests/test1.c
+++ b/tests/test1.c
@@ -107,13 +107,18 @@ int main()
     int should_fail;
     int length;
     int correctlen;
-    size_t ntests = sizeof(test_vector) / sizeof(*test_vector);
+    size_t nvector_tests = sizeof(test_vector) / sizeof(*test_vector);
+    size_t ntests = nvector_tests + 1;
     size_t nfailed = 0;
     size_t i;
 
+<<<<<<< HEAD
     //setlocale(LC_CTYPE, "en_US.UTF-8");
 
     for (i = 0; i < ntests; ++i)
+=======
+    for (i = 0; i < nvector_tests; ++i)
+>>>>>>> 964b93a (fix mishandling of ^ inside an expression)
     {
         pattern = test_vector[i][1];
         text = test_vector[i][2];
@@ -149,6 +154,21 @@ int main()
         }
     }
 
+    // regression test for unhandled BEGIN in the middle of an expression
+    // we need to test text strings with all possible values for the second
+    // byte because re.c was matching it against an uninitalized value, so
+    // it could be anything
+    pattern = "a^";
+    for (i = 0; i < 255; i++) {
+      char text_buf[] = { 'a', i, '\0' };
+      int m = re_match(pattern, text_buf, &length);
+      if (m != -1) {
+        fprintf(stderr, "[%lu/%lu]: pattern '%s' matched '%s' unexpectedly", ntests, ntests, pattern, text_buf);
+        nfailed += 1;
+        break;
+      }
+    }
+
     // printf("\n");
     printf("%lu/%lu tests succeeded.\n", ntests - nfailed, ntests);
     printf("\n");

From ad71eb153b951b9898af69f0fe1b57f0971bc361 Mon Sep 17 00:00:00 2001
From: Jonathan Marler <johnnymarler@gmail.com>
Date: Sat, 6 Mar 2021 05:28:33 -0700
Subject: [PATCH 18/30] use flat memory layout

---
 re.c | 194 ++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 127 insertions(+), 67 deletions(-)

diff --git a/re.c b/re.c
index 03b79ae..28fb4ac 100644
--- a/re.c
+++ b/re.c
@@ -35,34 +35,49 @@
 
 /* Definitions: */
 
+<<<<<<< HEAD
 #define MAX_CHAR_CLASS_LEN      40    /* Max length of character-class buffer in.   */
 #ifndef CPROVER
 #define MAX_REGEXP_OBJECTS      30    /* Max number of regex symbols in expression. */
 #else
 #define MAX_REGEXP_OBJECTS      8    /* faster formal proofs */
 #endif
+=======
+#define MAX_REGEXP_LEN      70    /* Max number of bytes for a regex. */
+>>>>>>> fd17b66 (use flat memory layout)
 
 
 enum regex_type_e { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ };
 
 typedef struct regex_t
 {
+<<<<<<< HEAD
   enum regex_type_e type;   /* CHAR, STAR, etc.                      */
   union
   {
     unsigned char  ch;   /*      the character itself             */
     unsigned char* ccl;  /*  OR  a pointer to characters in class */
   } u;
+=======
+  unsigned char type;    /* CHAR, STAR, etc.                      */
+  unsigned char data_len;
+  unsigned char data[0];
+>>>>>>> fd17b66 (use flat memory layout)
 } regex_t;
 
+static re_t getnext(regex_t* pattern)
+{
+  return (re_t)(((unsigned char*)pattern) + 2 + pattern->data_len);
+}
+
 
 
 /* Private function declarations: */
 static int matchpattern(regex_t* pattern, const char* text, int* matchlength);
 static int matchcharclass(char c, const char* str);
-static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchlength);
-static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchlength);
-static int matchone(regex_t p, char c);
+static int matchstar(regex_t *p, regex_t* pattern, const char* text, int* matchlength);
+static int matchplus(regex_t *p, regex_t* pattern, const char* text, int* matchlength);
+static int matchone(regex_t* p, char c);
 static int matchdigit(char c);
 static int matchalpha(char c);
 static int matchwhitespace(char c);
@@ -84,9 +99,9 @@ int re_matchp(re_t pattern, const char* text, int* matchlength)
   *matchlength = 0;
   if (pattern != 0)
   {
-    if (pattern[0].type == BEGIN)
+    if (pattern->type == BEGIN)
     {
-      return ((matchpattern(&pattern[1], text, matchlength)) ? 0 : -1);
+      return ((matchpattern(getnext(pattern), text, matchlength)) ? 0 : -1);
     }
     else
     {
@@ -110,33 +125,37 @@ int re_matchp(re_t pattern, const char* text, int* matchlength)
   return -1;
 }
 
+static int min(int a, int b)
+{
+  return (a <= b) ? a : b;
+}
+
 re_t re_compile(const char* pattern)
 {
-  /* The sizes of the two static arrays below substantiates the static RAM usage of this module.
-     MAX_REGEXP_OBJECTS is the max number of symbols in the expression.
-     MAX_CHAR_CLASS_LEN determines the size of buffer for chars in all char-classes in the expression. */
-  static regex_t re_compiled[MAX_REGEXP_OBJECTS];
-  static unsigned char ccl_buf[MAX_CHAR_CLASS_LEN];
-  int ccl_bufidx = 1;
+  /* The size of this static array substantiates the static RAM usage of this module.
+     MAX_REGEXP_LEN is the max number number of bytes in the expression. */
+  static unsigned char re_data[MAX_REGEXP_LEN];
 
   char c;     /* current char in pattern   */
   int i = 0;  /* index into pattern        */
-  int j = 0;  /* index into re_compiled    */
+  int j = 0;  /* index into re_data    */
 
-  while (pattern[i] != '\0' && (j+1 < MAX_REGEXP_OBJECTS))
+  while (pattern[i] != '\0' && (j+3 < MAX_REGEXP_LEN))
   {
     c = pattern[i];
+    regex_t *re_compiled = (regex_t*)(re_data+j);
+    re_compiled->data_len = 0;
 
     switch (c)
     {
       /* Meta-characters: */
-      case '^': {    re_compiled[j].type = BEGIN;           } break;
-      case '$': {    re_compiled[j].type = END;             } break;
-      case '.': {    re_compiled[j].type = DOT;             } break;
-      case '*': {    re_compiled[j].type = STAR;            } break;
-      case '+': {    re_compiled[j].type = PLUS;            } break;
-      case '?': {    re_compiled[j].type = QUESTIONMARK;    } break;
-/*    case '|': {    re_compiled[j].type = BRANCH;          } break; <-- not working properly */
+      case '^': {    re_compiled->type = BEGIN;           } break;
+      case '$': {    re_compiled->type = END;             } break;
+      case '.': {    re_compiled->type = DOT;             } break;
+      case '*': {    re_compiled->type = STAR;            } break;
+      case '+': {    re_compiled->type = PLUS;            } break;
+      case '?': {    re_compiled->type = QUESTIONMARK;    } break;
+/*    case '|': {    re_compiled->type = BRANCH;          } break; <-- not working properly */
 
       /* Escaped character-classes (\s \w ...): */
       case '\\':
@@ -149,36 +168,45 @@ re_t re_compile(const char* pattern)
           switch (pattern[i])
           {
             /* Meta-character: */
-            case 'd': {    re_compiled[j].type = DIGIT;            } break;
-            case 'D': {    re_compiled[j].type = NOT_DIGIT;        } break;
-            case 'w': {    re_compiled[j].type = ALPHA;            } break;
-            case 'W': {    re_compiled[j].type = NOT_ALPHA;        } break;
-            case 's': {    re_compiled[j].type = WHITESPACE;       } break;
-            case 'S': {    re_compiled[j].type = NOT_WHITESPACE;   } break;
+            case 'd': {    re_compiled->type = DIGIT;            } break;
+            case 'D': {    re_compiled->type = NOT_DIGIT;        } break;
+            case 'w': {    re_compiled->type = ALPHA;            } break;
+            case 'W': {    re_compiled->type = NOT_ALPHA;        } break;
+            case 's': {    re_compiled->type = WHITESPACE;       } break;
+            case 'S': {    re_compiled->type = NOT_WHITESPACE;   } break;
 
               /* Escaped character, e.g. '.', '$' or '\\' */
             default:
             {
-              re_compiled[j].type = CHAR;
-              re_compiled[j].u.ch = pattern[i];
+              re_compiled->type = CHAR;
+              re_compiled->data_len = 1;
+              re_compiled->data[0] = pattern[i];
             } break;
           }
         }
         /* '\\' as last char without previous \\ -> invalid regular expression. */
         else
+<<<<<<< HEAD
           return 0;
+=======
+        {
+          re_compiled->type = CHAR;
+          re_compiled->data_len = 1;
+          re_compiled->data[0] = pattern[i];
+        }
+*/
+>>>>>>> fd17b66 (use flat memory layout)
       } break;
 
       /* Character class: */
       case '[':
       {
-        /* Remember where the char-buffer starts. */
-        int buf_begin = ccl_bufidx;
+        int char_limit = min(0xff, MAX_REGEXP_LEN - j - 4); // 4 for this object and UNUSED at the minimum
 
         /* Look-ahead to determine if negated */
         if (pattern[i+1] == '^')
         {
-          re_compiled[j].type = INV_CHAR_CLASS;
+          re_compiled->type = INV_CHAR_CLASS;
           i += 1; /* Increment i to avoid including '^' in the char-buffer */
           if (pattern[i+1] == 0) /* incomplete pattern, missing non-zero char after '^' */
           {
@@ -187,7 +215,7 @@ re_t re_compile(const char* pattern)
         }
         else
         {
-          re_compiled[j].type = CHAR_CLASS;
+          re_compiled->type = CHAR_CLASS;
         }
 
         /* Copy characters inside [..] to buffer */
@@ -196,7 +224,7 @@ re_t re_compile(const char* pattern)
         {
           if (pattern[i] == '\\')
           {
-            if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1)
+            if (re_compiled->data_len >= char_limit)
             {
               //fputs("exceeded internal buffer!\n", stderr);
               return 0;
@@ -205,24 +233,24 @@ re_t re_compile(const char* pattern)
             {
               return 0;
             }
-            ccl_buf[ccl_bufidx++] = pattern[i++];
+            re_compiled->data[re_compiled->data_len++] = pattern[i++];
           }
-          else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN)
+          // TODO: I think this "else if" is a bug, should just be "if"
+          else if (re_compiled->data_len >= char_limit)
           {
               //fputs("exceeded internal buffer!\n", stderr);
               return 0;
           }
-          ccl_buf[ccl_bufidx++] = pattern[i];
+          re_compiled->data[re_compiled->data_len++] = pattern[i];
         }
-        if (ccl_bufidx >= MAX_CHAR_CLASS_LEN)
+        if (re_compiled->data_len >= char_limit)
         {
             /* Catches cases such as [00000000000000000000000000000000000000][ */
             //fputs("exceeded internal buffer!\n", stderr);
             return 0;
         }
         /* Null-terminate string end */
-        ccl_buf[ccl_bufidx++] = 0;
-        re_compiled[j].u.ccl = &ccl_buf[buf_begin];
+        re_compiled->data[re_compiled->data_len++] = 0;
       } break;
 
       case '\0': // EOL
@@ -231,24 +259,36 @@ re_t re_compile(const char* pattern)
       /* Other characters: */
       default:
       {
+<<<<<<< HEAD
         re_compiled[j].type = CHAR;
         // cbmc: arithmetic overflow on signed to unsigned type conversion in (unsigned char)c
         re_compiled[j].u.ch = (unsigned char)c;
+=======
+        re_compiled->type = CHAR;
+        re_compiled->data_len = 1;
+        re_compiled->data[0] = c;
+>>>>>>> fd17b66 (use flat memory layout)
       } break;
     }
     i += 1;
-    j += 1;
+    j += 2 + re_compiled->data_len;
+  }
+  if (j + 1 >= MAX_REGEXP_LEN) {
+      //fputs("exceeded internal buffer!\n", stderr);
+       return 0;
   }
   /* 'UNUSED' is a sentinel used to indicate end-of-pattern */
-  re_compiled[j].type = UNUSED;
+  re_data[j] = UNUSED;
+  re_data[j+1] = 0;
 
-  return (re_t) re_compiled;
+  return (re_t) re_data;
 }
 
 void re_print(regex_t* pattern)
 {
   const char *const types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE" /*, "BRANCH" */ };
 
+<<<<<<< HEAD
   unsigned char i;
   unsigned char j;
   char c;
@@ -256,12 +296,18 @@ void re_print(regex_t* pattern)
   if (!pattern)
     return;
   for (i = 0; i < MAX_REGEXP_OBJECTS; ++i)
+=======
+  int j;
+  char c;
+  for (;; pattern = getnext(pattern))
+>>>>>>> fd17b66 (use flat memory layout)
   {
-    if (pattern[i].type == UNUSED)
+    if (pattern->type == UNUSED)
     {
       break;
     }
 
+<<<<<<< HEAD
     if (pattern[i].type <= NOT_WHITESPACE)
       printf("type: %s", types[pattern[i].type]);
     else
@@ -273,8 +319,15 @@ void re_print(regex_t* pattern)
       if (pattern[i].type == INV_CHAR_CLASS)
         printf("^");
       for (j = 0; j < MAX_CHAR_CLASS_LEN; ++j)
+=======
+    printf("type: %s", types[pattern->type]);
+    if (pattern->type == CHAR_CLASS || pattern->type == INV_CHAR_CLASS)
+    {
+      printf(" [");
+      for (j = 0; j < pattern->data_len; ++j)
+>>>>>>> fd17b66 (use flat memory layout)
       {
-        c = pattern[i].u.ccl[j];
+        c = pattern->data[j];
         if ((c == '\0') || (c == ']'))
         {
           break;
@@ -283,9 +336,9 @@ void re_print(regex_t* pattern)
       }
       printf("]");
     }
-    else if (pattern[i].type == CHAR)
+    else if (pattern->type == CHAR)
     {
-      printf(" '%c'", pattern[i].u.ch);
+      printf(" '%c'", pattern->data[0]);
     }
     printf("\n");
   }
@@ -388,13 +441,13 @@ static int matchcharclass(char c, const char* str)
   return 0;
 }
 
-static int matchone(regex_t p, char c)
+static int matchone(regex_t* p, char c)
 {
-  switch (p.type)
+  switch (p->type)
   {
     case DOT:            return matchdot(c);
-    case CHAR_CLASS:     return  matchcharclass(c, (const char*)p.u.ccl);
-    case INV_CHAR_CLASS: return !matchcharclass(c, (const char*)p.u.ccl);
+    case CHAR_CLASS:     return  matchcharclass(c, (const char*)p->data);
+    case INV_CHAR_CLASS: return !matchcharclass(c, (const char*)p->data);
     case DIGIT:          return  matchdigit(c);
     case NOT_DIGIT:      return !matchdigit(c);
     case ALPHA:          return  matchalphanum(c);
@@ -402,11 +455,11 @@ static int matchone(regex_t p, char c)
     case WHITESPACE:     return  matchwhitespace(c);
     case NOT_WHITESPACE: return !matchwhitespace(c);
     case BEGIN:          return 0;
-    default:             return  (p.u.ch == c);
+    default:             return  (p->data[0] == c);
   }
 }
 
-static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchlength)
+static int matchstar(regex_t* p, regex_t* pattern, const char* text, int* matchlength)
 {
   int prelen = *matchlength;
   const char* prepoint = text;
@@ -427,7 +480,7 @@ static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchle
   return 0;
 }
 
-static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchlength)
+static int matchplus(regex_t* p, regex_t* pattern, const char* text, int* matchlength)
 {
   const char* prepoint = text;
   while ((text[0] != '\0') && matchone(p, *text))
@@ -445,10 +498,8 @@ static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchle
   return 0;
 }
 
-static int matchquestion(regex_t p, regex_t* pattern, const char* text, int* matchlength)
+static int matchquestion(regex_t *p, regex_t* pattern, const char* text, int* matchlength)
 {
-  if (p.type == UNUSED)
-    return 1;
   if (matchpattern(pattern, text, matchlength))
       return 1;
   if (*text && matchone(p, *text++))
@@ -503,33 +554,42 @@ static int matchpattern(regex_t* pattern, const char* text, int *matchlength)
 static int matchpattern(regex_t* pattern, const char* text, int* matchlength)
 {
   int pre = *matchlength;
-  do
+  while (1)
   {
-    if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK))
+    if (pattern->type == UNUSED)
     {
-      return matchquestion(pattern[0], &pattern[2], text, matchlength);
+      return 1;
     }
-    else if (pattern[1].type == STAR)
+    regex_t* next_pattern = getnext(pattern);
+    if (next_pattern->type == QUESTIONMARK)
     {
-      return matchstar(pattern[0], &pattern[2], text, matchlength);
+      return matchquestion(pattern, getnext(next_pattern), text, matchlength);
     }
-    else if (pattern[1].type == PLUS)
+    else if (next_pattern->type == STAR)
     {
-      return matchplus(pattern[0], &pattern[2], text, matchlength);
+      return matchstar(pattern, getnext(next_pattern), text, matchlength);
     }
-    else if ((pattern[0].type == END) && pattern[1].type == UNUSED)
+    else if (next_pattern->type == PLUS)
+    {
+      return matchplus(pattern, getnext(next_pattern), text, matchlength);
+    }
+    else if ((pattern->type == END) && next_pattern->type == UNUSED)
     {
       return (text[0] == '\0');
     }
 /*  Branching is not working properly
-    else if (pattern[1].type == BRANCH)
+    else if (pattern->type == BRANCH)
     {
-      return (matchpattern(pattern, text) || matchpattern(&pattern[2], text));
+      return (matchpattern(pattern, text) || matchpattern(getnext(next_pattern), text));
     }
 */
   (*matchlength)++;
+    if (text[0] == '\0')
+      break;
+    if (!matchone(pattern, *text++))
+      break;
+    pattern = next_pattern;
   }
-  while ((text[0] != '\0') && matchone(*pattern++, *text++));
 
   *matchlength = pre;
   return 0;

From 46af3b8bc94007f4be3797b2adef2b46753041c4 Mon Sep 17 00:00:00 2001
From: Jonathan Marler <johnnymarler@gmail.com>
Date: Sun, 7 Mar 2021 05:14:47 -0700
Subject: [PATCH 19/30] simplify matchplus and matchstar

`matchplus` can be simplified by only modifying `matchlength` once the complete match is successful.  This means it doesn't have to rewind `matchlength` as it iterates through each possible `matchpattern`.  This also means it keeps `matchlength` unmodified if it doesn't return a match.  Because of this last part, `matchstar` can leverage `matchplus`, which reduces it to a single line of code: `return matchplus(...) || matchpattern(...)`.
---
 re.c | 189 ++++-------------------------------------------------------
 1 file changed, 13 insertions(+), 176 deletions(-)

diff --git a/re.c b/re.c
index 28fb4ac..b5e4db4 100644
--- a/re.c
+++ b/re.c
@@ -35,34 +35,16 @@
 
 /* Definitions: */
 
-<<<<<<< HEAD
-#define MAX_CHAR_CLASS_LEN      40    /* Max length of character-class buffer in.   */
-#ifndef CPROVER
-#define MAX_REGEXP_OBJECTS      30    /* Max number of regex symbols in expression. */
-#else
-#define MAX_REGEXP_OBJECTS      8    /* faster formal proofs */
-#endif
-=======
 #define MAX_REGEXP_LEN      70    /* Max number of bytes for a regex. */
->>>>>>> fd17b66 (use flat memory layout)
 
 
 enum regex_type_e { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ };
 
 typedef struct regex_t
 {
-<<<<<<< HEAD
-  enum regex_type_e type;   /* CHAR, STAR, etc.                      */
-  union
-  {
-    unsigned char  ch;   /*      the character itself             */
-    unsigned char* ccl;  /*  OR  a pointer to characters in class */
-  } u;
-=======
   unsigned char type;    /* CHAR, STAR, etc.                      */
   unsigned char data_len;
   unsigned char data[0];
->>>>>>> fd17b66 (use flat memory layout)
 } regex_t;
 
 static re_t getnext(regex_t* pattern)
@@ -186,16 +168,12 @@ re_t re_compile(const char* pattern)
         }
         /* '\\' as last char without previous \\ -> invalid regular expression. */
         else
-<<<<<<< HEAD
-          return 0;
-=======
         {
           re_compiled->type = CHAR;
           re_compiled->data_len = 1;
           re_compiled->data[0] = pattern[i];
         }
 */
->>>>>>> fd17b66 (use flat memory layout)
       } break;
 
       /* Character class: */
@@ -259,15 +237,9 @@ re_t re_compile(const char* pattern)
       /* Other characters: */
       default:
       {
-<<<<<<< HEAD
-        re_compiled[j].type = CHAR;
-        // cbmc: arithmetic overflow on signed to unsigned type conversion in (unsigned char)c
-        re_compiled[j].u.ch = (unsigned char)c;
-=======
         re_compiled->type = CHAR;
         re_compiled->data_len = 1;
-        re_compiled->data[0] = c;
->>>>>>> fd17b66 (use flat memory layout)
+        re_compiled->data[0] = (unsigned char)c;
       } break;
     }
     i += 1;
@@ -288,44 +260,29 @@ void re_print(regex_t* pattern)
 {
   const char *const types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE" /*, "BRANCH" */ };
 
-<<<<<<< HEAD
-  unsigned char i;
-  unsigned char j;
+  int j;
   char c;
 
   if (!pattern)
     return;
-  for (i = 0; i < MAX_REGEXP_OBJECTS; ++i)
-=======
-  int j;
-  char c;
   for (;; pattern = getnext(pattern))
->>>>>>> fd17b66 (use flat memory layout)
   {
     if (pattern->type == UNUSED)
     {
       break;
     }
 
-<<<<<<< HEAD
-    if (pattern[i].type <= NOT_WHITESPACE)
-      printf("type: %s", types[pattern[i].type]);
+    if (pattern->type <= NOT_WHITESPACE)
+      printf("type: %s", types[pattern->type]);
     else
-      printf("invalid type: %d", pattern[i].type);
+      printf("invalid type: %d", pattern->type);
 
-    if (pattern[i].type == CHAR_CLASS || pattern[i].type == INV_CHAR_CLASS)
-    {
-      printf(" [");
-      if (pattern[i].type == INV_CHAR_CLASS)
-        printf("^");
-      for (j = 0; j < MAX_CHAR_CLASS_LEN; ++j)
-=======
-    printf("type: %s", types[pattern->type]);
     if (pattern->type == CHAR_CLASS || pattern->type == INV_CHAR_CLASS)
     {
       printf(" [");
+      if (pattern->type == INV_CHAR_CLASS)
+        printf("^");
       for (j = 0; j < pattern->data_len; ++j)
->>>>>>> fd17b66 (use flat memory layout)
       {
         c = pattern->data[j];
         if ((c == '\0') || (c == ']'))
@@ -459,25 +416,9 @@ static int matchone(regex_t* p, char c)
   }
 }
 
-static int matchstar(regex_t* p, regex_t* pattern, const char* text, int* matchlength)
+static inline int matchstar(regex_t* p, regex_t* pattern, const char* text, int* matchlength)
 {
-  int prelen = *matchlength;
-  const char* prepoint = text;
-  // TODO check if multibyte, and use mbtowc() then
-  while ((text[0] != '\0') && matchone(p, *text))
-  {
-    text++;
-    (*matchlength)++;
-  }
-  while (text >= prepoint)
-  {
-    if (matchpattern(pattern, text--, matchlength))
-      return 1;
-    (*matchlength)--;
-  }
-
-  *matchlength = prelen;
-  return 0;
+  return matchplus(p, pattern, text, matchlength) || matchpattern(pattern, text, matchlength);
 }
 
 static int matchplus(regex_t* p, regex_t* pattern, const char* text, int* matchlength)
@@ -486,15 +427,14 @@ static int matchplus(regex_t* p, regex_t* pattern, const char* text, int* matchl
   while ((text[0] != '\0') && matchone(p, *text))
   {
     text++;
-    (*matchlength)++;
   }
-  while (text > prepoint)
+  for (; text > prepoint; text--)
   {
-    if (matchpattern(pattern, text--, matchlength))
+    if (matchpattern(pattern, text, matchlength)) {
+      *matchlength += text - prepoint;
       return 1;
-    (*matchlength)--;
+    }
   }
-
   return 0;
 }
 
@@ -513,43 +453,6 @@ static int matchquestion(regex_t *p, regex_t* pattern, const char* text, int* ma
   return 0;
 }
 
-
-#if 0
-
-/* Recursive matching */
-static int matchpattern(regex_t* pattern, const char* text, int *matchlength)
-{
-  int pre = *matchlength;
-  if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK))
-  {
-    return matchquestion(pattern[1], &pattern[2], text, matchlength);
-  }
-  else if (pattern[1].type == STAR)
-  {
-    return matchstar(pattern[0], &pattern[2], text, matchlength);
-  }
-  else if (pattern[1].type == PLUS)
-  {
-    return matchplus(pattern[0], &pattern[2], text, matchlength);
-  }
-  else if ((pattern[0].type == END) && pattern[1].type == UNUSED)
-  {
-    return text[0] == '\0';
-  }
-  else if ((text[0] != '\0') && matchone(pattern[0], text[0]))
-  {
-    (*matchlength)++;
-    return matchpattern(&pattern[1], text+1);
-  }
-  else
-  {
-    *matchlength = pre;
-    return 0;
-  }
-}
-
-#else
-
 /* Iterative matching */
 static int matchpattern(regex_t* pattern, const char* text, int* matchlength)
 {
@@ -594,69 +497,3 @@ static int matchpattern(regex_t* pattern, const char* text, int* matchlength)
   *matchlength = pre;
   return 0;
 }
-
-#endif
-
-#ifdef CPROVER
-#define N 24
-
-/* Formal verification with cbmc: */
-/* cbmc -DCPROVER --64 --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace re.c
- */
-
-void verify_re_compile()
-{
-  /* test input - ten chars used as a regex-pattern input */
-  char arr[N];
-  /* make input symbolic, to search all paths through the code */
-  /* i.e. the input is checked for all possible ten-char combinations */
-  for (int i=0; i<sizeof(arr)-1; i++) {
-    //arr[i] = nondet_char();
-    assume(arr[i] > -127 && arr[i] < 128);
-  }
-  /* assume proper NULL termination */
-  assume(arr[sizeof(arr) - 1] == 0);
-  /* verify abscence of run-time errors - go! */
-  re_compile(arr);
-}
-
-void verify_re_print()
-{
-  regex_t pattern[MAX_REGEXP_OBJECTS];
-  for (unsigned char i=0; i<MAX_REGEXP_OBJECTS; i++) {
-    //pattern[i].type = nondet_uchar();
-    assume(pattern[i].type >= 0 && pattern[i].type <= 255);
-    pattern[i].u.ccl = nondet_long();
-  }
-  re_print(&pattern);
-}
-
-void verify_re_match()
-{
-  int length;
-  regex_t pattern[MAX_REGEXP_OBJECTS];
-  char arr[N];
-
-  for (unsigned char i=0; i<MAX_REGEXP_OBJECTS; i++) {
-    //pattern[i].type = nondet_uchar();
-    //pattern[i].u.ch = nondet_int();
-    assume(pattern[i].type >= 0 && pattern[i].type <= 255);
-    assume(pattern[i].u.ccl >= 0 && pattern[i].u.ccl <= ~1);
-  }
-  for (int i=0; i<sizeof(arr)-1; i++) {
-    assume(arr[i] > -127 && arr[i] < 128);
-  }
-  /* assume proper NULL termination */
-  assume(arr[sizeof(arr) - 1] == 0);
-
-  re_match(&pattern, arr, &length);
-}
-
-int main(int argc, char* argv[])
-{
-  verify_re_compile();
-  verify_re_printh();
-  verify_re_match();
-  return 0;
-}
-#endif

From 623e3fd77e0df48b8cad77dd3af187045c293120 Mon Sep 17 00:00:00 2001
From: Shahar Sivan <59827819+shahar99s@users.noreply.github.com>
Date: Sat, 25 May 2024 18:44:24 +0300
Subject: [PATCH 20/30] silence gcc tests warnings

---
 tests/test1.c | 13 ++++++++-----
 tests/test2.c |  2 +-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/tests/test1.c b/tests/test1.c
index d6087ec..292d0c1 100644
--- a/tests/test1.c
+++ b/tests/test1.c
@@ -11,7 +11,6 @@
 #define OK    ((char*) 1)
 #define NOK   ((char*) 0)
 
-
 char* test_vector[][4] =
 {
   { OK,  "\\d",                       "5",                (char*) 1      },
@@ -107,17 +106,21 @@ int main()
     int should_fail;
     int length;
     int correctlen;
-    size_t nvector_tests = sizeof(test_vector) / sizeof(*test_vector);
-    size_t ntests = nvector_tests + 1;
-    size_t nfailed = 0;
-    size_t i;
+    unsigned long nvector_tests = sizeof(test_vector) / sizeof(*test_vector);
+    unsigned long ntests = nvector_tests + 1;
+    unsigned long nfailed = 0;
+    unsigned long i;
 
     for (i = 0; i < nvector_tests; ++i)
     {
         pattern = test_vector[i][1];
         text = test_vector[i][2];
         should_fail = (test_vector[i][0] == NOK);
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpointer-to-int-cast"
         correctlen = (int)(test_vector[i][3]);
+#pragma GCC diagnostic pop
 
         int m = re_match(pattern, text, &length);
 
diff --git a/tests/test2.c b/tests/test2.c
index 723e262..2302a72 100644
--- a/tests/test2.c
+++ b/tests/test2.c
@@ -2066,7 +2066,7 @@ int main()
   size_t bufsize = sizeof(buf) - 1;
   int i;
   int dummy = 0;
-  size_t bufsizes[ntests];
+  unsigned long bufsizes[ntests];
   char old;
 
   for (i = ntests-1; i >= 0; --i)

From 973e9909d81f5416a48269fb5f4e2023c33b5fd7 Mon Sep 17 00:00:00 2001
From: Shahar Sivan <59827819+shahar99s@users.noreply.github.com>
Date: Sat, 25 May 2024 19:30:17 +0300
Subject: [PATCH 21/30] use default python in makefile

---
 Makefile | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/Makefile b/Makefile
index fbf754c..51ecf6d 100644
--- a/Makefile
+++ b/Makefile
@@ -4,14 +4,7 @@ CC := gcc
 # Number of random text expressions to generate, for random testing
 NRAND_TESTS := 1000
 
-PYTHON != if (python --version 2>&1 | grep -q 'Python 2\..*'); then \
-            echo 'python';                                          \
-          elif command -v python2 >/dev/null 2>&1; then             \
-            echo 'python2';                                         \
-          else                                                      \
-            echo 'Error: no compatible python 2 version found.' >&2;  \
-            exit 1;                                                 \
-          fi
+PYTHON := python
 
 # Flags to pass to compiler
 CFLAGS := -O3 -Wall -Wextra -std=c99 -I.

From 1072ecfe86d39c3fbe7e75c86def53520b3fdd7d Mon Sep 17 00:00:00 2001
From: Shahar Sivan <59827819+shahar99s@users.noreply.github.com>
Date: Sat, 25 May 2024 19:31:56 +0300
Subject: [PATCH 22/30] cleanup tests + add windows support

---
 scripts/exrex.py          | 311 --------------------------------------
 scripts/regex_test.py     |  87 +++++------
 scripts/regex_test_neg.py | 101 +++++++------
 scripts/utils.py          |  11 ++
 4 files changed, 110 insertions(+), 400 deletions(-)
 delete mode 100755 scripts/exrex.py
 create mode 100644 scripts/utils.py

diff --git a/scripts/exrex.py b/scripts/exrex.py
deleted file mode 100755
index 9357748..0000000
--- a/scripts/exrex.py
+++ /dev/null
@@ -1,311 +0,0 @@
-#!/usr/bin/env python
-
-# This file is part of exrex.
-#
-# exrex is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# exrex is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with exrex. If not, see < http://www.gnu.org/licenses/ >.
-#
-# (C) 2012- by Adam Tauber, <asciimoo@gmail.com>
-
-try:
-    from future_builtins import map, range
-except:
-    pass
-from re import match, sre_parse
-from itertools import product, chain, tee
-from random import choice,randint
-import string
-
-__all__ = ('generate', 'CATEGORIES', 'count', 'parse', 'getone')
-
-CATEGORIES = {'category_space'  : sorted(sre_parse.WHITESPACE)
-             ,'category_digit'  : sorted(sre_parse.DIGITS)
-             ,'category_not_digit'  : [chr(x) for x in range(32, 123) if
-                                       match('\D', chr(x))]
-             ,'category_any'    : [chr(x) for x in range(32, 123)]
-             ,'category_word'   : sorted( frozenset(string.ascii_letters + string.digits + "_") )
-             ,'category_not_word'  : [chr(x) for x in range(32, 123) if
-                                       match('\W', chr(x))]
-             }
-
-def comb(g, i):
-    for c in g:
-        g2,i = tee(i)
-        for c2 in g2:
-            yield c+c2
-
-def mappend(g, c):
-    for cc in g:
-        yield cc+c
-
-def _in(d):
-    ret = []
-    neg = False
-    for i in d:
-        if i[0] == 'range':
-            subs = map(chr, range(i[1][0], i[1][1]+1))
-            if neg:
-                for char in subs:
-                    try:
-                        ret.remove(char)
-                    except:
-                        pass
-            else:
-                ret.extend(subs)
-        elif i[0] == 'literal':
-            if neg:
-                try:
-                    ret.remove(chr(i[1]))
-                except:
-                    pass
-            else:
-                ret.append(chr(i[1]))
-        elif i[0] == 'category':
-            subs = CATEGORIES.get(i[1], [''])
-            if neg:
-                for char in subs:
-                    try:
-                        ret.remove(char)
-                    except:
-                        pass
-            else:
-                ret.extend(subs)
-        elif i[0] == 'negate':
-            ret = list(CATEGORIES['category_any'])
-            neg = True
-    return ret
-
-
-def prods(orig, ran, items):
-    for o in orig:
-        for r in ran:
-            for s in product(items, repeat=r):
-                yield o+''.join(s)
-
-def ggen(g1, f, *args, **kwargs):
-    for a in g1:
-        g2 = f(*args, **kwargs)
-        if isinstance(g2, int):
-            yield g2
-        else:
-            for b in g2:
-                yield a+b
-
-def _gen(d, limit=20, count=False):
-    """docstring for _gen"""
-    ret = ['']
-    strings = 0
-    for i in d:
-        if i[0] == 'in':
-            subs = _in(i[1])
-            if count:
-                strings = (strings or 1) * len(subs)
-            ret = comb(ret, subs)
-        elif i[0] == 'literal':
-            ret = mappend(ret, chr(i[1]))
-        elif i[0] == 'category':
-            subs = CATEGORIES.get(i[1], [''])
-            if count:
-                strings = (strings or 1) * len(subs)
-            ret = comb(ret, subs)
-        elif i[0] == 'any':
-            subs = CATEGORIES['category_any']
-            if count:
-                strings = (strings or 1) * len(subs)
-            ret = comb(ret, subs)
-        elif i[0] == 'max_repeat':
-            chars = filter(None, _gen(list(i[1][2]), limit))
-            if i[1][1]+1 - i[1][0] >= limit:
-                ran = range(i[1][0], i[1][0]+limit)
-            else:
-                ran = range(i[1][0], i[1][1]+1)
-            if count:
-                for i in ran:
-                    strings += pow(len(chars), i)
-            ret = prods(ret, ran, chars)
-        elif i[0] == 'branch':
-            subs = list(chain.from_iterable(_gen(list(x), limit) for x in i[1][1]))
-            if count:
-                strings = (strings or 1) * (len(subs) or 1)
-            ret = comb(ret, subs)
-        elif i[0] == 'subpattern':
-            if count:
-                strings = (strings or 1) * (sum(ggen([0], _gen, i[1][1], limit=limit, count=True)) or 1)
-            ret = ggen(ret, _gen, i[1][1], limit=limit, count=False)
-        # ignore ^ and $
-        elif i[0] == 'at':
-            continue
-        elif i[0] == 'not_literal':
-            subs = list(CATEGORIES['category_any'])
-            subs.remove(chr(i[1]))
-            if count:
-                strings = (strings or 1) * len(subs)
-            ret = comb(ret, subs)
-        elif i[0] == 'assert':
-            print i[1][1]
-            continue
-        else:
-            #print('[!] cannot handle expression ' + repr(i))
-            raise Exception('[!] cannot handle expression ' + repr(i))
-
-    if count:
-        return strings
-
-    return ret
-
-def _randone(d, limit=20):
-    """docstring for _randone"""
-    ret = ''
-    for i in d:
-        if i[0] == 'in':
-            ret += choice(_in(i[1]))
-        elif i[0] == 'literal':
-            ret += chr(i[1])
-        elif i[0] == 'category':
-            ret += choice(CATEGORIES.get(i[1], ['']))
-        elif i[0] == 'any':
-            ret += choice(CATEGORIES['category_any'])
-        elif i[0] == 'max_repeat':
-            chars = filter(None, _gen(list(i[1][2]), limit))
-            if i[1][1]+1 - i[1][0] >= limit:
-                min,max = i[1][0], i[1][0]+limit
-            else:
-                min,max = i[1][0], i[1][1]
-            for _ in range(randint(min, max)):
-                ret += choice(chars)
-        elif i[0] == 'branch':
-            ret += choice(list(chain.from_iterable(_gen(list(x), limit) for x in i[1][1])))
-        elif i[0] == 'subpattern':
-            ret += _randone(i[1][1], limit)
-        elif i[0] == 'at':
-            continue
-        elif i[0] == 'not_literal':
-            c=list(CATEGORIES['category_any'])
-            c.remove(chr(i[1]))
-            ret += choice(c)
-        else:
-            #print('[!] cannot handle expression "%s"' % str(i))
-            raise Exception('[!] cannot handle expression "%s"' % str(i))
-
-    return ret
-
-
-def parse(s):
-    """Regular expression parser
-    :param s: Regular expression
-    :type s: str
-    :rtype: list
-    """
-    r = sre_parse.parse(s)
-    return list(r)
-
-def generate(s, limit=20):
-    """Creates a generator that generates all matching strings to a given regular expression
-    :param s: Regular expression
-    :type s: str
-    :param limit: Range limit
-    :type limit: int
-    :returns: string generator object
-    """
-    return _gen(parse(s), limit)
-
-def count(s, limit=20):
-    """Counts all matching strings to a given regular expression
-    :param s: Regular expression
-    :type s: str
-    :param limit: Range limit
-    :type limit: int
-    :rtype: int
-    :returns: number of matching strings
-    """
-    return _gen(parse(s), limit, count=True)
-
-def getone(regex_string, limit=20):
-    """Returns a random matching string to a given regular expression
-    """
-    return _randone(parse(regex_string), limit)
-
-def argparser():
-    import argparse
-    from sys import stdout
-    argp = argparse.ArgumentParser(description='exrex - regular expression string generator')
-    argp.add_argument('-o', '--output'
-                     ,help      = 'Output file - default is STDOUT'
-                     ,metavar   = 'FILE'
-                     ,default   = stdout
-                     ,type      = argparse.FileType('w')
-                     )
-    argp.add_argument('-l', '--limit'
-                     ,help      = 'Max limit for range size - default is 20'
-                     ,default   = 20
-                     ,action    = 'store'
-                     ,type      = int
-                     ,metavar   = 'N'
-                     )
-    argp.add_argument('-c', '--count'
-                     ,help      = 'Count matching strings'
-                     ,default   = False
-                     ,action    = 'store_true'
-                     )
-    argp.add_argument('-r', '--random'
-                     ,help      = 'Returns a random string that matches to the regex'
-                     ,default   = False
-                     ,action    = 'store_true'
-                     )
-    argp.add_argument('-d', '--delimiter'
-                     ,help      = 'Delimiter - default is \\n'
-                     ,default   = '\n'
-                     )
-    argp.add_argument('-v', '--verbose'
-                     ,action    = 'store_true'
-                     ,help      = 'Verbose mode'
-                     ,default   = False
-                     )
-    argp.add_argument('regex'
-                     ,metavar   = 'REGEX'
-                     ,help      = 'REGEX string'
-                     )
-    return vars(argp.parse_args())
-
-def __main__():
-    from sys import exit, stderr
-    # 'as(d|f)qw(e|r|s)[a-zA-Z]{2,3}'
-    # 'as(QWE|Z([XC]|Y|U)V){2,3}asdf'
-    # '.?'
-    # '.+'
-    # 'asdf.{1,4}qwer{2,5}'
-    # 'a(b)?(c)?(d)?'
-    # 'a[b][c][d]?[e]?
-    args = argparser()
-    if args['verbose']:
-        args['output'].write('%r%s' % (parse(args['regex'], limit=args['limit']), args['delimiter']))
-    if args['count']:
-        args['output'].write('%d%s' % (count(args['regex'], limit=args['limit']), args['delimiter']))
-        exit(0)
-    if args['random']:
-        args['output'].write('%s%s' % (getone(args['regex'], limit=args['limit']), args['delimiter']))
-        exit(0)
-    try:
-        g = generate(args['regex'], args['limit'])
-    except Exception, e:
-        print >> stderr, '[!] Error: ', e
-        exit(1)
-    for s in g:
-        try:
-            args['output'].write(s+args['delimiter'])
-        except:
-            break
-
-if __name__ == '__main__':
-    __main__()
-
diff --git a/scripts/regex_test.py b/scripts/regex_test.py
index 08b4c5e..1da68f7 100755
--- a/scripts/regex_test.py
+++ b/scripts/regex_test.py
@@ -1,77 +1,78 @@
 #!/usr/bin/env python
 
 """
-  This python2 program generates random text that matches a given regex-pattern.
-  The pattern is given via sys.argv and the generated text is passed to
-  the binary 'tests/test_rand' to check if the generated text also matches
-  the regex-pattern in the C implementation.
-  The exit-code of the testing program, is used to determine test success.
+This python2 program generates random text that matches a given regex-pattern.
+The pattern is given via sys.argv and the generated text is passed to
+the binary 'tests/test_rand' to check if the generated text also matches
+the regex-pattern in the C implementation.
+The exit-code of the testing program, is used to determine test success.
 
-  This script is called by the Makefile when doing 'make test'
+This script is called by the Makefile when doing 'make test'
 """
 
-
-import re
 import sys
-import exrex
+import rstr
 from subprocess import call
 
+from utils import get_executable_name
 
-prog = "./tests/test_rand"
+prog = get_executable_name("./tests/test_rand")
 
 if len(sys.argv) < 2:
-  print("")
-  print("usage: %s pattern [nrepeat]" % sys.argv[0])
-  print("  where [nrepeat] is optional")
-  print("")
-  sys.exit(-1)
+    print("")
+    print("usage: %s pattern [nrepeat]" % sys.argv[0])
+    print("  where [nrepeat] is optional")
+    print("")
+    sys.exit(-1)
 
 own_prog = sys.argv[0]
 pattern = sys.argv[1]
 if len(sys.argv) > 2:
-  ntests = int(sys.argv[2])
+    ntests = int(sys.argv[2])
 else:
-  ntests = 10
+    ntests = 10
 nfails = 0
 repeats = ntests
 
 
 try:
-  repeats = int(sys.argv[2])
+    repeats = int(sys.argv[2])
 except:
-  pass
+    pass
 
 r = 50
 while r < 0:
-  try:
-    g = exrex.generate(pattern)
-    break
-  except:
-    pass
+    try:
+        g = rstr.xeger(pattern)
+        break
+    except:
+        pass
 
 
 sys.stdout.write("%-35s" % ("  pattern '%s': " % pattern))
 
 
 while repeats >= 0:
-  try:
-    repeats -= 1
-    example = exrex.getone(pattern)
-    #print("%s %s %s" % (prog, pattern, example))
-    ret = call([prog, "\"%s\"" % pattern, "\"%s\"" % example])
-    if ret != 0:
-      escaped = repr(example) # escapes special chars for better printing
-      print("    FAIL : doesn't match %s as expected [%s]." % (escaped, ", ".join([("0x%02x" % ord(e)) for e in example]) ))
-      nfails += 1
-
-  except:
-    #import traceback
-    #print("EXCEPTION!")
-    #raw_input(traceback.format_exc())
-    ntests -= 1
-    repeats += 1
-    #nfails += 1
+    try:
+        repeats -= 1
+        example = rstr.xeger(pattern)
+        # print("%s %s %s" % (prog, pattern, example))
+        ret = call([prog, '"%s"' % pattern, '"%s"' % example])
+        if ret != 0:
+            escaped = repr(example)  # escapes special chars for better printing
+            print(
+                "    FAIL : doesn't match %s as expected [%s]."
+                % (escaped, ", ".join([("0x%02x" % ord(e)) for e in example]))
+            )
+            nfails += 1
+
+    except:
+        # import traceback
+        # print("EXCEPTION!")
+        # input(traceback.format_exc())
+        ntests -= 1
+        repeats += 1
+        # nfails += 1
 
 sys.stdout.write("%4d/%d tests succeeded \n" % (ntests - nfails, ntests))
-#print("")
-
+# print("")
diff --git a/scripts/regex_test_neg.py b/scripts/regex_test_neg.py
index c3daad6..748333b 100755
--- a/scripts/regex_test_neg.py
+++ b/scripts/regex_test_neg.py
@@ -1,82 +1,91 @@
 #!/usr/bin/env python
 
 """
-  This program generates random text that matches a given regex-pattern.
-  The pattern is given via sys.argv and the generated text is passed to
-  the binary 'tests/test_rand' to check if the generated text also matches
-  the regex-pattern in the C implementation.
-  The exit-code of the testing program, is used to determine test success.
+This program generates random text that does NOT match a given regex-pattern.
+The pattern is given via sys.argv and the generated text is passed to
+the binary 'tests/test_rand_neg' to check that the generated text is also
+rejected by the regex-pattern in the C implementation.
+The exit-code of the testing program, is used to determine test success.
 
-  This script is called by the Makefile when doing 'make test'
+This script is called by the Makefile when doing 'make test'
 """
 
-
 import re
 import sys
 import string
 import random
 from subprocess import call
 
+from utils import get_executable_name
+
 
-prog = "./tests/test_rand_neg"
+prog = get_executable_name("./tests/test_rand_neg")
 
 if len(sys.argv) < 2:
-  print("")
-  print("usage: %s pattern [nrepeat]" % sys.argv[0])
-  print("  where [nrepeat] is optional")
-  print("")
-  sys.exit(-1)
+    print("")
+    print("usage: %s pattern [nrepeat]" % sys.argv[0])
+    print("  where [nrepeat] is optional")
+    print("")
+    sys.exit(-1)
 
 own_prog = sys.argv[0]
 pattern = sys.argv[1]
 if len(sys.argv) > 2:
-  ntests = int(sys.argv[2])
+    ntests = int(sys.argv[2])
 else:
-  ntests = 10
+    ntests = 10
 nfails = 0
 repeats = ntests
 
 
 try:
-  repeats = int(sys.argv[2])
+    repeats = int(sys.argv[2])
 except:
-  pass
+    pass
 
 sys.stdout.write("%-35s" % ("  pattern '%s': " % pattern))
 
 
-
-
 def gen_no_match(pattern, minlen=1, maxlen=50, maxattempts=500):
-  nattempts = 0
-  while True:
-    nattempts += 1
-    ret = "".join([random.choice(string.printable) for i in range(random.Random().randint(minlen, maxlen))])
-    if re.findall(pattern, ret) == []:
-      return ret
-    if nattempts >= maxattempts:
-      raise Exception("Could not generate string that did not match the regex pattern '%s' after %d attempts" % (pattern, nattempts))
-
+    nattempts = 0
+    while True:
+        nattempts += 1
+        ret = "".join(
+            [
+                random.choice(string.printable)
+                for i in range(random.Random().randint(minlen, maxlen))
+            ]
+        )
+        if re.findall(pattern, ret) == []:
+            return ret
+        if nattempts >= maxattempts:
+            raise Exception(
+                "Could not generate string that did not match the regex pattern '%s' after %d attempts"
+                % (pattern, nattempts)
+            )
 
 
 while repeats >= 0:
-  try:
-    repeats -= 1
-    example = gen_no_match(pattern)
-    #print("%s %s %s" % (prog, pattern, example))
-    ret = call([prog, "\"%s\"" % pattern, "\"%s\"" % example])
-    if ret != 0:
-      escaped = repr(example) # escapes special chars for better printing
-      print("    FAIL : matches %s unexpectedly [%s]." % (escaped, ", ".join([("0x%02x" % ord(e)) for e in example]) ))
-      nfails += 1
-
-  except:
-    #import traceback
-    #print("EXCEPTION!")
-    #raw_input(traceback.format_exc())
-    ntests -= 1
-    repeats += 1
-    #nfails += 1
+    try:
+        repeats -= 1
+        example = gen_no_match(pattern)
+        # print("%s %s %s" % (prog, pattern, example))
+        ret = call([prog, '"%s"' % pattern, '"%s"' % example])
+        if ret != 0:
+            escaped = repr(example)  # escapes special chars for better printing
+            print(
+                "    FAIL : matches %s unexpectedly [%s]."
+                % (escaped, ", ".join([("0x%02x" % ord(e)) for e in example]))
+            )
+            nfails += 1
+
+    except:
+        # import traceback
+        # print("EXCEPTION!")
+        # raw_input(traceback.format_exc())
+        ntests -= 1
+        repeats += 1
+        # nfails += 1
 
 sys.stdout.write("%4d/%d tests succeeded \n" % (ntests - nfails, ntests))
-#print("")
+# print("")
diff --git a/scripts/utils.py b/scripts/utils.py
new file mode 100644
index 0000000..f64cbd3
--- /dev/null
+++ b/scripts/utils.py
@@ -0,0 +1,11 @@
+import os
+
+
+def get_executable_name(path: str) -> str:
+    """
+    Adds .exe extension to the path if running on Windows and the path does not already end with .exe
+    """
+    if os.name == "nt":  # Check if the OS is Windows
+        if not path.lower().endswith(".exe"):
+            path += ".exe"
+    return path

From 0ecadd4e7995380dd095256a7dd03d9b1746d232 Mon Sep 17 00:00:00 2001
From: Shahar Sivan <59827819+shahar99s@users.noreply.github.com>
Date: Sat, 25 May 2024 21:17:08 +0300
Subject: [PATCH 23/30] prevent '*' expansion in scripts

---
 scripts/regex_test.py     | 9 +++------
 scripts/regex_test_neg.py | 2 +-
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/scripts/regex_test.py b/scripts/regex_test.py
index 1da68f7..c29e8f5 100755
--- a/scripts/regex_test.py
+++ b/scripts/regex_test.py
@@ -56,14 +56,11 @@
     try:
         repeats -= 1
         example = rstr.xeger(pattern)
-        # print("%s %s %s" % (prog, pattern, example))
-        ret = call([prog, '"%s"' % pattern, '"%s"' % example])
+        # print(f'{prog} "{pattern}" "{example}"')
+        ret = call([prog, f"'{pattern}'", f"'{example}'"], shell=False)
         if ret != 0:
             escaped = repr(example)  # escapes special chars for better printing
-            print(
-                "    FAIL : doesn't match %s as expected [%s]."
-                % (escaped, ", ".join([("0x%02x" % ord(e)) for e in example]))
-            )
+            print(f"    FAIL: {pattern} doesn't match {example}")
             nfails += 1
 
     except:
diff --git a/scripts/regex_test_neg.py b/scripts/regex_test_neg.py
index 748333b..d211f6f 100755
--- a/scripts/regex_test_neg.py
+++ b/scripts/regex_test_neg.py
@@ -70,7 +70,7 @@ def gen_no_match(pattern, minlen=1, maxlen=50, maxattempts=500):
         repeats -= 1
         example = gen_no_match(pattern)
         # print("%s %s %s" % (prog, pattern, example))
-        ret = call([prog, '"%s"' % pattern, '"%s"' % example])
+        ret = call([prog, f"'{pattern}'", f"'{example}'"], shell=False)
         if ret != 0:
             escaped = repr(example)  # escapes special chars for better printing
             print(

From b9fa9cc2b47a63802514f8452f4783caac51bb33 Mon Sep 17 00:00:00 2001
From: Shahar Sivan <59827819+shahar99s@users.noreply.github.com>
Date: Sat, 25 May 2024 21:17:51 +0300
Subject: [PATCH 24/30] support character class starting with '-'

---
 re.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/re.c b/re.c
index b5e4db4..3767a75 100644
--- a/re.c
+++ b/re.c
@@ -360,6 +360,10 @@ static int matchmetachar(char c, const char* str)
 
 static int matchcharclass(char c, const char* str)
 {
+  if (str[0] == '-' && c == '-') {
+      return 1;
+  }
+
   do
   {
     if (matchrange(c, str))

From 6fec7bb4c8c2c2fbb521c2afed827e7f934317ab Mon Sep 17 00:00:00 2001
From: Shahar Sivan <59827819+shahar99s@users.noreply.github.com>
Date: Sun, 26 May 2024 02:51:01 +0300
Subject: [PATCH 25/30] return error for invalid escaping

---
 re.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/re.c b/re.c
index 3767a75..be7e44a 100644
--- a/re.c
+++ b/re.c
@@ -23,6 +23,7 @@
  *   '\W'       Non-alphanumeric
  *   '\d'       Digits, [0-9]
  *   '\D'       Non-digits
+ *   '|'        Branch, matches either the preceding or following pattern
  *
  *
  */
@@ -169,11 +170,8 @@ re_t re_compile(const char* pattern)
         /* '\\' as last char without previous \\ -> invalid regular expression. */
         else
         {
-          re_compiled->type = CHAR;
-          re_compiled->data_len = 1;
-          re_compiled->data[0] = pattern[i];
+          return 0;
         }
-*/
       } break;
 
       /* Character class: */

From d2f6b576cb16878dd8a853d09e9a7ed91eb73257 Mon Sep 17 00:00:00 2001
From: Shahar Sivan <59827819+shahar99s@users.noreply.github.com>
Date: Sun, 26 May 2024 22:42:15 +0300
Subject: [PATCH 26/30] Support compiling regex in Python

---
 scripts/regex_compile.py | 182 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 182 insertions(+)
 create mode 100644 scripts/regex_compile.py

diff --git a/scripts/regex_compile.py b/scripts/regex_compile.py
new file mode 100644
index 0000000..e3fda36
--- /dev/null
+++ b/scripts/regex_compile.py
@@ -0,0 +1,182 @@
+from enum import Enum
+
+MAX_REGEXP_LEN = 1024
+
+class RegexType(Enum):
+    UNUSED = 0
+    DOT = 1
+    BEGIN = 2
+    END = 3
+    QUESTIONMARK = 4
+    STAR = 5
+    PLUS = 6
+    CHAR = 7
+    CHAR_CLASS = 8
+    INV_CHAR_CLASS = 9
+    DIGIT = 10
+    NOT_DIGIT = 11
+    ALPHA = 12
+    NOT_ALPHA = 13
+    WHITESPACE = 14
+    NOT_WHITESPACE = 15
+
+class RegexSegment:
+    def __init__(self, r_type, data_len=0, data=None):
+        self.type = r_type
+        self.data_len = data_len
+        self.data = data if data else []
+
+    def to_bytes(self):
+        return bytes([self.type.value, self.data_len] + self.data)
+
+    # def to_hex(self):
+    #     if self.type in (RegexType.CHAR_CLASS, RegexType.INV_CHAR_CLASS):
+    #         s = f"\\x{'[':02x}"
+    #         e = f"\\x{']':02x}"
+    #         content = ''.join(['\\x{0:02x}'.format(d) for d in self.data])
+    #         return s + f'\\x{"^":02x}' if self.type == RegexType.INV_CHAR_CLASS else '' + content + e
+    #     else:
+    #         return f"\\x{self.data[0]:02x}"
+
+    def __str__(self):
+        if self.type in (RegexType.CHAR_CLASS, RegexType.INV_CHAR_CLASS):
+            content = ''.join(['\\x{0:02x}'.format(d) for d in self.data])
+            return f"type: {self.type.name} [{'^' if self.type == RegexType.INV_CHAR_CLASS else ''}{content}]"
+        elif self.type == RegexType.CHAR:
+            return f"type: {self.type.name} '\\x{self.data[0]:02x}'"
+        else:
+            return f"type: {self.type.name}"
+
+
+def to_buffer(segments):
+    # Create the flat memory buffer
+    buffer = bytearray()
+    for segment in segments:
+        buffer.extend(segment.to_bytes())
+    return buffer
+
+# def to_hex(segments):
+#     pattern = ""
+#     for segment in segments:
+#         pattern += segment.to_hex()
+#     return pattern
+
+def print_buffer(segments):
+    print(to_buffer(segments))
+
+
+class MiniRegexCompiler:
+    MAX_REGEXP_LEN = 70  # Max number of bytes for a regex
+
+    def compile(self, pattern):
+        segments = []
+        i = 0
+
+        while i < len(pattern):
+            c = pattern[i]
+            if c == '.':
+                segments.append(RegexSegment(RegexType.DOT))
+            elif c == '^':
+                segments.append(RegexSegment(RegexType.BEGIN))
+            elif c == '$':
+                segments.append(RegexSegment(RegexType.END))
+            elif c == '*':
+                segments.append(RegexSegment(RegexType.STAR))
+            elif c == '+':
+                segments.append(RegexSegment(RegexType.PLUS))
+            elif c == '?':
+                segments.append(RegexSegment(RegexType.QUESTIONMARK))
+            elif c == '|':
+                raise Exception("Unsupported")
+            elif c == '\\':
+                i += 1
+                if i < len(pattern):
+                    escaped_segment = self.handle_escape(pattern[i])
+                    if escaped_segment:
+                        segments.append(escaped_segment)
+                    else:
+                        return None  # Invalid regex
+                else:
+                    return None  # Invalid regex
+            elif c == '[':
+                char_limit = MAX_REGEXP_LEN - 4 #min(0xff, MAX_REGEXP_LEN - j - 4)
+                i += 1
+                if i < len(pattern) and pattern[i] == '^':
+                    segment = RegexSegment(RegexType.INV_CHAR_CLASS)
+                    i += 1
+                    if i >= len(pattern):
+                        return None
+                else:
+                    segment = RegexSegment(RegexType.CHAR_CLASS)
+
+                while i < len(pattern) and pattern[i] != ']':
+                    if pattern[i] == '\\':
+                        i += 1
+                        if i < len(pattern):
+                            self.add_escaped_char(segment, pattern[i])
+                        else:
+                            return None  # Invalid regex
+                    elif segment.data_len >= char_limit:
+                        return None
+                    else:
+                        segment.data.append(ord(pattern[i]))
+                        segment.data_len += 1
+                    i += 1
+                if segment.data_len >= char_limit:
+                    return None
+
+                # Character class expects 'UNUSED' at the end
+                segment.data.append(RegexType.UNUSED.value)
+                segment.data_len += 1
+                segments.append(segment)
+            elif c == '\0':
+                return None
+            else:
+                segments.append(RegexSegment(RegexType.CHAR, 1, [ord(c)]))
+
+            i += 1
+
+        if len(segments) * 3 > self.MAX_REGEXP_LEN:  # Rough check, as each segment can have different lengths
+            return None  # Exceeded internal buffer
+                
+        return segments
+
+    def handle_escape(self, char):
+        if char == 'd':
+            return RegexSegment(RegexType.DIGIT)
+        elif char == 'D':
+            return RegexSegment(RegexType.NOT_DIGIT)
+        elif char == 'w':
+            return RegexSegment(RegexType.ALPHA)
+        elif char == 'W':
+            return RegexSegment(RegexType.NOT_ALPHA)
+        elif char == 's':
+            return RegexSegment(RegexType.WHITESPACE)
+        elif char == 'S':
+            return RegexSegment(RegexType.NOT_WHITESPACE)
+        elif char in {'.', '^', '$', '*', '+', '?',  '[', ']', '\\'}: # TODO: add '|'
+            return RegexSegment(RegexType.CHAR, 1, [ord(char)])
+        else:
+            return None  # Invalid escape sequence
+
+    def add_escaped_char(self, segment, char):
+        segment.data.append(ord('\\'))  # Add the escape character
+        segment.data_len += 1
+        segment.data.append(ord(char))
+        segment.data_len += 1
+
+def main():
+    """Usage example: compile a sample pattern, then print its buffer and segments."""
+    pattern = "t*31elJ)_?~*DF_ac]*.+.*.[\\.]."
+    segments = MiniRegexCompiler().compile(pattern)
+    # compile() returns None for a rejected pattern; check before serializing it.
+    compiled_pattern = to_buffer(segments) if segments else None
+    if compiled_pattern:
+        print("Compiled pattern:", compiled_pattern)
+        for segment in segments:
+            print(segment)
+    else:
+        print("Invalid regex pattern")
+
+if __name__ == "__main__":
+    main()

From 25152bf6001e8d9ff4e479ed8122515c0f745bc7 Mon Sep 17 00:00:00 2001
From: Shahar Sivan <59827819+shahar99s@users.noreply.github.com>
Date: Sun, 26 May 2024 22:46:39 +0300
Subject: [PATCH 27/30] Add Python regex compilation tests

---
 Makefile                      | 43 ++++++++++++++++++++++++
 scripts/regex_test_compile.py | 46 ++++++++++++++++++++++++++
 scripts/regex_test_neg.py     |  8 ++---
 tests/test_compile.c          | 61 +++++++++++++++++++++++++++++++----
 4 files changed, 148 insertions(+), 10 deletions(-)
 create mode 100644 scripts/regex_test_compile.py

diff --git a/Makefile b/Makefile
index 51ecf6d..b132f85 100644
--- a/Makefile
+++ b/Makefile
@@ -30,6 +30,49 @@ test: all
 	@./tests/test1
 	@echo Testing handling of invalid regex patterns
 	@./tests/test_compile
+	@echo Compiling patterns in both Python and C and verifying the results are the same:
+	@echo
+	@$(PYTHON) ./scripts/regex_test_compile.py \\d+\\w?\\D\\d
+	@$(PYTHON) ./scripts/regex_test_compile.py \\s+[a-zA-Z0-9?]*
+	@$(PYTHON) ./scripts/regex_test_compile.py \\w*\\d?\\w\\?
+	@$(PYTHON) ./scripts/regex_test_compile.py [^\\d]+\\\\?\\s
+	@$(PYTHON) ./scripts/regex_test_compile.py [^\\w][^-1-4]
+	@$(PYTHON) ./scripts/regex_test_compile.py [^\\w]
+	@$(PYTHON) ./scripts/regex_test_compile.py [^1-4]
+	@$(PYTHON) ./scripts/regex_test_compile.py [^-1-4]
+	@$(PYTHON) ./scripts/regex_test_compile.py [^\\d]+\\s?[\\w]*
+	@$(PYTHON) ./scripts/regex_test_compile.py a+b*[ac]*.+.*.[\\.].
+	@$(PYTHON) ./scripts/regex_test_compile.py a?b[ac*]*.?[\\]+[?]?
+	@$(PYTHON) ./scripts/regex_test_compile.py [1-5-]+[-1-2]-[-]
+	@$(PYTHON) ./scripts/regex_test_compile.py [-1-3]-[-]+
+	@$(PYTHON) ./scripts/regex_test_compile.py [1-5]+[-1-2]-[\\-]
+	@$(PYTHON) ./scripts/regex_test_compile.py [-1-2]*
+	@$(PYTHON) ./scripts/regex_test_compile.py \\s?[a-fKL098]+-?
+	@$(PYTHON) ./scripts/regex_test_compile.py [\\-]*
+	@$(PYTHON) ./scripts/regex_test_compile.py [\\\\]+
+	@$(PYTHON) ./scripts/regex_test_compile.py [0-9a-fA-F]+
+	@$(PYTHON) ./scripts/regex_test_compile.py [1379][2468][abcdef]
+	@$(PYTHON) ./scripts/regex_test_compile.py [012345-9]?[0123-789]
+	@$(PYTHON) ./scripts/regex_test_compile.py [012345-9]
+	@$(PYTHON) ./scripts/regex_test_compile.py [0-56789]
+	@$(PYTHON) ./scripts/regex_test_compile.py [abc-zABC-Z]
+	@$(PYTHON) ./scripts/regex_test_compile.py [a\d]?1234
+	@$(PYTHON) ./scripts/regex_test_compile.py .*123faerdig
+	@$(PYTHON) ./scripts/regex_test_compile.py .?\\w+jsj$
+	@$(PYTHON) ./scripts/regex_test_compile.py [?to][+to][?ta][*ta]
+	@$(PYTHON) ./scripts/regex_test_compile.py \\d+
+	@$(PYTHON) ./scripts/regex_test_compile.py [a-z]+
+	@$(PYTHON) ./scripts/regex_test_compile.py \\s+[a-zA-Z0-9?]*
+	@$(PYTHON) ./scripts/regex_test_compile.py \\w
+	@$(PYTHON) ./scripts/regex_test_compile.py \\d
+	@$(PYTHON) ./scripts/regex_test_compile.py [\\d]
+	@$(PYTHON) ./scripts/regex_test_compile.py [^\\d]
+	@$(PYTHON) ./scripts/regex_test_compile.py [^-1-4]
+	@$(PYTHON) ./scripts/regex_test_compile.py \\x01[^\\xff][^
+	@$(PYTHON) ./scripts/regex_test_compile.py \\x01[^\\xff][\
+	@echo
+	@echo
+	@echo
 	@echo Testing patterns against $(NRAND_TESTS) random strings matching the Python implementation and comparing:
 	@echo
 	@$(PYTHON) ./scripts/regex_test.py \\d+\\w?\\D\\d             $(NRAND_TESTS)
diff --git a/scripts/regex_test_compile.py b/scripts/regex_test_compile.py
new file mode 100644
index 0000000..f3f2401
--- /dev/null
+++ b/scripts/regex_test_compile.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+
+"""
+This python program verifies `scripts/regex_compile.py` generates the same compiled regex as re.c.
+The pattern is given via sys.argv and the compiled hex pattern is passed to
+the binary 'tests/test_compile' to check if the compiled hex pattern is
+the same as the one compiled in the C code.
+The exit code of the testing program determines test success.
+
+This script is called by the Makefile when doing 'make test'.
+"""
+
+import binascii
+import subprocess
+import sys
+
+from regex_compile import MiniRegexCompiler, to_buffer
+from utils import get_executable_name
+
+prog = get_executable_name("./tests/test_compile")
+
+if len(sys.argv) < 2:
+    print(f"\nusage: {sys.argv[0]} pattern\n\n")
+    sys.exit(-1)
+
+pattern = sys.argv[1]
+
+print(f"  pattern '{pattern}': ", end='')
+
+compiler = MiniRegexCompiler()
+# compile() returns None for a pattern it rejects; fall back to no segments so
+# the failure report below never touches an undefined name or iterates None.
+segments = compiler.compile(pattern) or []
+compiled_pattern = to_buffer(segments)
+hex_pattern = binascii.hexlify(compiled_pattern).decode("ascii")  # str, like the other argv items
+
+ret = subprocess.call([prog, pattern, hex_pattern], shell=False)
+if ret != 0:
+    print("Compiled pattern:", compiled_pattern)
+    for segment in segments:
+        print(segment)
+    print(f"    FAIL: {pattern}")
+    print(hex_pattern)
+    sys.exit(1)
+else:
+    print("SUCCEED")
diff --git a/scripts/regex_test_neg.py b/scripts/regex_test_neg.py
index d211f6f..b5ec6ad 100755
--- a/scripts/regex_test_neg.py
+++ b/scripts/regex_test_neg.py
@@ -80,12 +80,12 @@ def gen_no_match(pattern, minlen=1, maxlen=50, maxattempts=500):
             nfails += 1
 
     except:
-        # import traceback
-        # print("EXCEPTION!")
-        # raw_input(traceback.format_exc())
+        import traceback
+        print("EXCEPTION!")
+        input(traceback.format_exc())
         ntests -= 1
         repeats += 1
-        # nfails += 1
+        nfails += 1
 
 sys.stdout.write("%4d/%d tests succeeded \n" % (ntests - nfails, ntests))
 # print("")
diff --git a/tests/test_compile.c b/tests/test_compile.c
index 2a7b4d0..21b0d04 100644
--- a/tests/test_compile.c
+++ b/tests/test_compile.c
@@ -8,14 +8,63 @@ This file tests two bug patterns reported by @DavidKorczynski in https://github.
 #include <stdlib.h> /* for NULL */
 #include "re.h"
 
+void hexdump(const unsigned char *data, size_t size) {
+    for (size_t i = 0; i < size; ++i) {
+        printf("\\x%02x", data[i]);
+    }
+    printf("\n");
+}
 
-int main()
-{
-  /* Test 1: inverted set without a closing ']' */
-  assert(re_compile("\\\x01[^\\\xff][^") == NULL);
+int hex_to_int(char c) {
+    if (c >= '0' && c <= '9') {
+        return c - '0';
+    } else if (c >= 'A' && c <= 'F') {
+        return c - 'A' + 10;
+    } else if (c >= 'a' && c <= 'f') {
+        return c - 'a' + 10;
+    } else {
+        return -1;
+    }
+}
 
-  /* Test 2: set with an incomplete escape sequence and without a closing ']' */
-  assert(re_compile("\\\x01[^\\\xff][\\") == NULL);
+// Hex string -> malloc'ed byte array (caller owns it); NULL on odd length, bad digit or OOM.
+unsigned char *hex_to_bytes(const char *hex, size_t *length) {
+    size_t len = strlen(hex);
+    *length = (len % 2 == 0) ? len / 2 : 0;  // defined even when the input is rejected
+    unsigned char *bytes = (len % 2 == 0) ? malloc(*length + 1) : NULL;  // +1: never malloc(0)
+    if (bytes == NULL) {
+        return NULL;  // Odd-length hex string or allocation failure
+    }
+    for (size_t i = 0; i < *length; i++) {
+        int high = hex_to_int(hex[2 * i]);
+        int low = hex_to_int(hex[2 * i + 1]);
+        if (high == -1 || low == -1) {
+            free(bytes);
+            return NULL;  // Invalid hex character
+        }
+        bytes[i] = (unsigned char)((high << 4) | low);
+    }
+    return bytes;
+}
+
+int main(int argc, char** argv)
+{
+  int length;
+  if (argc == 3)
+  {
+    size_t pattern_len;
+    re_t *compiled_pattern = NULL;
+    if(argv[2] != NULL){
+      compiled_pattern = hex_to_bytes(argv[2], &pattern_len);
+    }
+    //hexdump(compiled_pattern, pattern_len);
+    //hexdump(re_compile(argv[1]), pattern_len);
+    assert(0 == memcmp(compiled_pattern, re_compile(argv[1]), pattern_len));
+  }
+  else
+  {
+    printf("\nUsage: %s <PATTERN> <HEX_COMPILED_PATTERN> \n", argv[0]);
+  }
 
   return 0;
 }

From 5beb2b30f66d575bcad80555774328da53583aa2 Mon Sep 17 00:00:00 2001
From: Shahar Sivan <59827819+shahar99s@users.noreply.github.com>
Date: Sun, 26 May 2024 23:14:35 +0300
Subject: [PATCH 28/30] Remove redundant character class null terminator

---
 re.c                              | 15 +++---
 scripts/regex_compile.py          |  3 --
 scripts/regex_test_precompiled.py | 88 +++++++++++++++++++++++++++++++
 3 files changed, 95 insertions(+), 11 deletions(-)
 create mode 100644 scripts/regex_test_precompiled.py

diff --git a/re.c b/re.c
index be7e44a..a8ba4b8 100644
--- a/re.c
+++ b/re.c
@@ -57,7 +57,7 @@ static re_t getnext(regex_t* pattern)
 
 /* Private function declarations: */
 static int matchpattern(regex_t* pattern, const char* text, int* matchlength);
-static int matchcharclass(char c, const char* str);
+static int matchcharclass(char c, unsigned char len, const char* str);
 static int matchstar(regex_t *p, regex_t* pattern, const char* text, int* matchlength);
 static int matchplus(regex_t *p, regex_t* pattern, const char* text, int* matchlength);
 static int matchone(regex_t* p, char c);
@@ -225,8 +225,6 @@ re_t re_compile(const char* pattern)
             //fputs("exceeded internal buffer!\n", stderr);
             return 0;
         }
-        /* Null-terminate string end */
-        re_compiled->data[re_compiled->data_len++] = 0;
       } break;
 
       case '\0': // EOL
@@ -356,12 +354,13 @@ static int matchmetachar(char c, const char* str)
   }
 }
 
-static int matchcharclass(char c, const char* str)
+static int matchcharclass(char c, unsigned char len, const char* str)
 {
   if (str[0] == '-' && c == '-') {
       return 1;
   }
 
+  int i = 0;
   do
   {
     if (matchrange(c, str))
@@ -385,7 +384,7 @@ static int matchcharclass(char c, const char* str)
     {
       if (c == '-')
       {
-        if ((str[-1] == '\0') || (str[1] == '\0'))
+        if ((str[-1] == '\0') || (i == len - 1))
             return 1;
         // else continue
       }
@@ -395,7 +394,7 @@ static int matchcharclass(char c, const char* str)
       }
     }
   }
-  while (*str++ != '\0');
+  while (++i < len && *str++ != '\0');
 
   return 0;
 }
@@ -405,8 +404,8 @@ static int matchone(regex_t* p, char c)
   switch (p->type)
   {
     case DOT:            return matchdot(c);
-    case CHAR_CLASS:     return  matchcharclass(c, (const char*)p->data);
-    case INV_CHAR_CLASS: return !matchcharclass(c, (const char*)p->data);
+    case CHAR_CLASS:     return  matchcharclass(c, p->data_len, (const char*)p->data);
+    case INV_CHAR_CLASS: return !matchcharclass(c, p->data_len, (const char*)p->data);
     case DIGIT:          return  matchdigit(c);
     case NOT_DIGIT:      return !matchdigit(c);
     case ALPHA:          return  matchalphanum(c);
diff --git a/scripts/regex_compile.py b/scripts/regex_compile.py
index e3fda36..0056dc4 100644
--- a/scripts/regex_compile.py
+++ b/scripts/regex_compile.py
@@ -125,9 +125,6 @@ def compile(self, pattern):
                 if segment.data_len >= char_limit:
                     return None
 
-                # Character class expects 'UNUSED' at the end
-                segment.data.append(RegexType.UNUSED.value)
-                segment.data_len += 1
                 segments.append(segment)
             elif c == '\0':
                 return None
diff --git a/scripts/regex_test_precompiled.py b/scripts/regex_test_precompiled.py
new file mode 100644
index 0000000..fcd7752
--- /dev/null
+++ b/scripts/regex_test_precompiled.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+
+"""
+This Python 3 program compiles a given regex-pattern with scripts/regex_compile.py
+(random text matching the pattern is also generated, but only used in messages).
+The pattern is given via sys.argv; it and its hex-encoded compiled form are passed
+to the binary 'tests/test_compile', which compares them with the C implementation.
+The exit-code of the testing program, is used to determine test success.
+
+This script is called by the Makefile when doing 'make test'
+"""
+
+import binascii
+import subprocess
+import sys
+import rstr
+from subprocess import call
+
+from regex_compile import MiniRegexCompiler
+from utils import get_executable_name
+
+prog = get_executable_name("./tests/test_compile")
+
+if len(sys.argv) < 2:
+    print("")
+    print("usage: %s pattern [nrepeat]" % sys.argv[0])
+    print("  where [nrepeat] is optional")
+    print("")
+    sys.exit(-1)
+
+own_prog = sys.argv[0]
+pattern = sys.argv[1]
+if len(sys.argv) > 2:
+    ntests = int(sys.argv[2])
+else:
+    ntests = 10
+nfails = 0
+repeats = ntests
+
+
+try:
+    repeats = int(sys.argv[2])
+except:
+    pass
+
+r = 50
+while r < 0:
+    try:
+        g = rstr.xeger(pattern)
+        break
+    except:
+        pass
+
+
+sys.stdout.write("%-35s" % ("  pattern '%s': " % pattern))
+
+
+while repeats >= 0:
+    try:
+        repeats -= 1
+        example = rstr.xeger(pattern)
+        # print(f'{prog} "{pattern}" "{example}"')
+        # compile() returns the segment list (or None), not a tuple; serialize
+        # it the same way regex_compile.to_buffer does before hex-encoding it.
+        segments = MiniRegexCompiler().compile(pattern)
+        compiled_pattern = b"".join(segment.to_bytes() for segment in segments)
+        hex_pattern = binascii.hexlify(compiled_pattern).decode("ascii")
+
+        ret = subprocess.call([prog, pattern, hex_pattern], shell=False)
+        if ret != 0:
+            print("Compiled pattern:", compiled_pattern)
+            for segment in segments:
+                print(segment)
+            print(f"    FAIL: {pattern} doesn't match {example}")
+            nfails += 1
+            print(hex_pattern)
+            sys.exit(1)  # non-zero status so the caller sees the failure
+
+    except Exception:  # narrower than bare except: lets SystemExit from sys.exit() propagate
+        import traceback
+        print("EXCEPTION!")
+        input(traceback.format_exc())
+        ntests -= 1
+        repeats += 1
+        nfails += 1
+
+sys.stdout.write("%4d/%d tests succeeded \n" % (ntests - nfails, ntests))
+# print("")

From 1a2544d21c5148897c708b98692809a72a5d626d Mon Sep 17 00:00:00 2001
From: Shahar Sivan <59827819+shahar99s@users.noreply.github.com>
Date: Sun, 26 May 2024 23:42:33 +0300
Subject: [PATCH 29/30] Fix character class matching

---
 re.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/re.c b/re.c
index a8ba4b8..c53c816 100644
--- a/re.c
+++ b/re.c
@@ -360,31 +360,30 @@ static int matchcharclass(char c, unsigned char len, const char* str)
       return 1;
   }
 
-  int i = 0;
-  do
+  for(unsigned char i = 0; i < len; i++)
   {
-    if (matchrange(c, str))
+    if (matchrange(c, &str[i]))
     {
       return 1;
     }
-    else if (str[0] == '\\')
+    else if (str[i] == '\\')
     {
       /* Escape-char: increment str-ptr and match on next char */
-      str += 1;
-      if (matchmetachar(c, str))
+      i++;
+      if (matchmetachar(c, &str[i]))
       {
         return 1;
       }
-      else if ((c == str[0]) && !ismetachar(c))
+      else if ((c == str[i]) && !ismetachar(c))
       {
         return 1;
       }
     }
-    else if (c == str[0])
+    else if (c == str[i])
     {
       if (c == '-')
       {
-        if ((str[-1] == '\0') || (i == len - 1))
+        if ((str[i-1] == '\0') || (i == len - 1))
             return 1;
         // else continue
       }
@@ -394,7 +393,6 @@ static int matchcharclass(char c, unsigned char len, const char* str)
       }
     }
   }
-  while (++i < len && *str++ != '\0');
 
   return 0;
 }

From 2967b89f85d2fe354a32cf98378b6ddb81c04543 Mon Sep 17 00:00:00 2001
From: Shahar Sivan <59827819+shahar99s@users.noreply.github.com>
Date: Mon, 27 May 2024 22:52:01 +0300
Subject: [PATCH 30/30] Fix 'test_compile.c' warning

---
 tests/test_compile.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/test_compile.c b/tests/test_compile.c
index 21b0d04..23c4155 100644
--- a/tests/test_compile.c
+++ b/tests/test_compile.c
@@ -5,6 +5,8 @@ This file tests two bug patterns reported by @DavidKorczynski in https://github.
 */
 
 #include <assert.h>
+#include <stdio.h>
+#include <string.h>
 #include <stdlib.h> /* for NULL */
 #include "re.h"
 
@@ -49,13 +51,12 @@ unsigned char *hex_to_bytes(const char *hex, size_t *length) {
 
 int main(int argc, char** argv)
 {
-  int length;
   if (argc == 3)
   {
     size_t pattern_len;
-    re_t *compiled_pattern = NULL;
+    re_t compiled_pattern = NULL;
     if(argv[2] != NULL){
-      compiled_pattern = hex_to_bytes(argv[2], &pattern_len);
+      compiled_pattern = (re_t)hex_to_bytes(argv[2], &pattern_len);
     }
     //hexdump(compiled_pattern, pattern_len);
     //hexdump(re_compile(argv[1]), pattern_len);