1 files changed, 389 insertions, 0 deletions
diff --git a/src/pcresearch.c b/src/pcresearch.c
new file mode 100644
index 0000000..f6e72b0
--- /dev/null
+++ b/src/pcresearch.c
@@ -0,0 +1,389 @@
+/* pcresearch.c - searching subroutines using PCRE for grep.
+   Copyright 2000, 2007, 2009-2016 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+/* Written August 1992 by Mike Haertel. */
+
+#include <config.h>
+#include "search.h"
+
+#if HAVE_LIBPCRE
+# include <pcre.h>
+
+/* Number of ints in the offset vector passed to pcre_exec.  This must
+   be at least 2; everything after that is for performance in pcre_exec.  */
+enum { NSUB = 300 };
+
+/* Compiled internal form of a Perl regular expression; set by Pcompile.  */
+static pcre *cre;
+
+/* Additional information about the pattern, as returned by pcre_study.  */
+static pcre_extra *extra;
+
+# ifndef PCRE_STUDY_JIT_COMPILE
+#  define PCRE_STUDY_JIT_COMPILE 0 /* Not in pcre.h: omit the JIT code.  */
+# endif
+
+# if PCRE_STUDY_JIT_COMPILE
+/* Current maximum size of the JIT stack; jit_exec grows it only if 0 < it.  */
+static int jit_stack_size;
+# endif
+
+/* Run pcre_exec with the compiled pattern (CRE and EXTRA) on SUBJECT,
+   which is SEARCH_BYTES long, starting the search at SEARCH_OFFSET and
+   passing OPTIONS.  Match offsets are stored into SUB, which must have
+   room for NSUB ints.  Return the value of the last pcre_exec call.  */
+static int
+jit_exec (char const *subject, int search_bytes, int search_offset,
+          int options, int *sub)
+{
+  for (;;)
+    {
+      int rc = pcre_exec (cre, extra, subject, search_bytes, search_offset,
+                          options, sub, NSUB);
+
+# if PCRE_STUDY_JIT_COMPILE
+      static pcre_jit_stack *stack;
+      bool growable = 0 < jit_stack_size && jit_stack_size <= INT_MAX / 2;
+      if (rc != PCRE_ERROR_JIT_STACKLIMIT || !growable)
+        return rc;
+
+      /* The JIT stack was too small: double its limit and try again.  */
+      int start_size = jit_stack_size;
+      jit_stack_size = 2 * start_size;
+      if (stack)
+        pcre_jit_stack_free (stack);
+      stack = pcre_jit_stack_alloc (start_size, jit_stack_size);
+      if (!stack)
+        error (EXIT_TROUBLE, 0,
+               _("failed to allocate memory for the PCRE JIT stack"));
+      pcre_assign_jit_stack (extra, NULL, stack);
+# else
+      return rc;
+# endif
+    }
+}
+
+#endif
+
+#if HAVE_LIBPCRE
+/* Results of pcre_exec on the empty string, computed by Pcompile and
+   indexed by ! (flag & PCRE_NOTBOL), i.e., by whether BOL can match.  */
+static int empty_match[2];
+
+static bool multibyte_locale; /* Set by Pcompile if 1 < MB_CUR_MAX.  */
+#endif
+
+/* Compile the SIZE-byte Perl regular expression PATTERN into CRE and
+   EXTRA, adding the -w or -x wrapper and encoding NUL bytes as "\000",
+   and fill in EMPTY_MATCH.  Report failures with status EXIT_TROUBLE.  */
+void
+Pcompile (char const *pattern, size_t size)
+{
+#if !HAVE_LIBPCRE
+  error (EXIT_TROUBLE, 0, "%s",
+         _("support for the -P option is not compiled into "
+           "this --disable-perl-regexp binary"));
+#else
+  int e;
+  char const *ep;
+  static char const wprefix[] = "(?<!\\w)(?:";
+  static char const wsuffix[] = ")(?!\\w)";
+  static char const xprefix[] = "^(?:";
+  static char const xsuffix[] = ")$";
+  int fix_len_max = MAX (sizeof wprefix - 1 + sizeof wsuffix - 1,
+                         sizeof xprefix - 1 + sizeof xsuffix - 1);
+  /* Allow 4 bytes per pattern byte, plus the affixes and a final NUL.  */
+  char *re = xnmalloc (4, size + (fix_len_max + 1 + 4 - 1) / 4);
+  int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
+  char const *patlim = pattern + size;
+  char *n = re;
+  char const *p, *pnul;
+
+  if (1 < MB_CUR_MAX)
+    {
+      if (! using_utf8 ())
+        error (EXIT_TROUBLE, 0,
+               _("-P supports only unibyte and UTF-8 locales"));
+      multibyte_locale = true;
+      flags |= PCRE_UTF8;
+    }
+
+  /* FIXME: Remove these restrictions.  */
+  if (memchr (pattern, '\n', size))
+    error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
+  if (! eolbyte)
+    {
+      bool escaped = false, after_unescaped_left_bracket = false;
+      /* PATTERN is delimited by SIZE, not by a NUL; it may contain NULs.  */
+      for (p = pattern; p < patlim; p++)
+        if (escaped)
+          escaped = after_unescaped_left_bracket = false;
+        else
+          {
+            if (*p == '$' || (*p == '^' && !after_unescaped_left_bracket))
+              error (EXIT_TROUBLE, 0,
+                     _("unescaped ^ or $ not supported with -Pz"));
+            escaped = *p == '\\';
+            after_unescaped_left_bracket = *p == '[';
+          }
+    }
+
+  *n = '\0';
+  if (match_words)
+    strcpy (n, wprefix);
+  if (match_lines)
+    strcpy (n, xprefix);
+  n += strlen (n);
+
+  /* The PCRE interface doesn't allow NUL bytes in the pattern, so
+     replace each NUL byte in the pattern with the four characters
+     "\000", removing a preceding backslash if there are an odd
+     number of backslashes before the NUL.  */
+  for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
+    {
+      memcpy (n, p, pnul - p);
+      n += pnul - p;
+      for (p = pnul; pattern < p && p[-1] == '\\'; p--)
+        continue;
+      n -= (pnul - p) & 1;
+      strcpy (n, "\\000");
+      n += 4;
+    }
+
+  memcpy (n, p, patlim - p);
+  n += patlim - p;
+  *n = '\0';
+  if (match_words)
+    strcpy (n, wsuffix);
+  if (match_lines)
+    strcpy (n, xsuffix);
+
+  cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
+  if (!cre)
+    error (EXIT_TROUBLE, 0, "%s", ep);
+
+  extra = pcre_study (cre, PCRE_STUDY_JIT_COMPILE, &ep);
+  if (ep)
+    error (EXIT_TROUBLE, 0, "%s", ep);
+
+# if PCRE_STUDY_JIT_COMPILE
+  if (pcre_fullinfo (cre, extra, PCRE_INFO_JIT, &e))
+    error (EXIT_TROUBLE, 0, _("internal error (should never happen)"));
+  /* The PCRE documentation says that a 32 KiB stack is the default.  */
+  if (e)
+    jit_stack_size = 32 << 10;
+# endif
+  free (re);
+
+  int sub[NSUB];
+  empty_match[false] = pcre_exec (cre, extra, "", 0, 0,
+                                  PCRE_NOTBOL, sub, NSUB);
+  empty_match[true] = pcre_exec (cre, extra, "", 0, 0, 0, sub, NSUB);
+#endif /* HAVE_LIBPCRE */
+}
+
+/* Search the SIZE bytes at BUF for a match of the pattern compiled by
+   Pcompile.  If START_PTR is nonnull, start there and report the match
+   itself; otherwise report the whole line containing the match.
+   Return the reported text's offset from BUF and store its length in
+   *MATCH_SIZE, or return (size_t) -1 if nothing matches.  In multibyte
+   locales this overwrites BUF[SIZE - 1] with the end-of-line byte.  */
+size_t
+Pexecute (char *buf, size_t size, size_t *match_size, char const *start_ptr)
+{
+#if !HAVE_LIBPCRE
+  /* We can't get here, because Pcompile would have been called earlier.  */
+  error (EXIT_TROUBLE, 0, _("internal error"));
+  return -1;
+#else
+  int sub[NSUB];
+  char const *p = start_ptr ? start_ptr : buf;
+  bool bol = p[-1] == eolbyte;
+  char const *line_start = buf;
+  int e = PCRE_ERROR_NOMATCH;
+  char const *line_end;
+
+  /* The search address to pass to pcre_exec: the start of the buffer,
+     or just past the most-recently discovered encoding error.  */
+  char const *subject = buf;
+
+  /* If the input is unibyte or free of encoding errors, a multiline
+     search is typically more efficient.  Otherwise a single-line search
+     is typically faster, as pcre_exec need not validate the whole buffer.  */
+  bool multiline = true;
+  if (multibyte_locale)
+    {
+      multiline = ! buf_has_encoding_errors (buf, size - 1);
+      buf[size - 1] = eolbyte;
+    }
+
+  for (; p < buf + size; p = line_start = line_end + 1)
+    {
+      bool too_big;
+      if (multiline)
+        {
+          size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1);
+          size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
+          line_end = memrchr (p, eolbyte, scan_size);
+          too_big = ! line_end;
+        }
+      else
+        {
+          line_end = memchr (p, eolbyte, buf + size - p);
+          too_big = INT_MAX < line_end - p;
+        }
+
+      if (too_big)
+        error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
+
+      for (;;)
+        {
+          /* Skip past bytes that are easily determined to be encoding
+             errors, treating them as data that cannot match.  This is
+             faster than having pcre_exec check them.  */
+          while (mbclen_cache[to_uchar (*p)] == (size_t) -1)
+            {
+              p++;
+              subject = p;
+              bol = false;
+            }
+
+          int search_offset = p - subject;
+
+          /* Check for an empty match; this is faster than letting
+             pcre_exec do it.  */
+          if (p == line_end)
+            {
+              sub[0] = sub[1] = search_offset;
+              e = empty_match[bol];
+              break;
+            }
+
+          int options = ((bol ? 0 : PCRE_NOTBOL)
+                         | (multiline ? PCRE_NO_UTF8_CHECK : 0));
+
+          e = jit_exec (subject, line_end - subject, search_offset,
+                        options, sub);
+          if (e != PCRE_ERROR_BADUTF8)
+            {
+              /* E == 0 is also a match; see the check after the loops.  */
+              if (0 <= e && multiline && sub[1] - sub[0] != 0)
+                {
+                  char const *nl = memchr (subject + sub[0], eolbyte,
+                                           sub[1] - sub[0]);
+                  if (nl)
+                    {
+                      /* This match crosses a line boundary; reject it.  */
+                      p = subject + sub[0];
+                      line_end = nl;
+                      continue;
+                    }
+                }
+              break;
+            }
+          int valid_bytes = sub[0];
+
+          if (search_offset <= valid_bytes)
+            {
+              /* Try to match the string before the encoding error.  */
+              if (valid_bytes == 0)
+                {
+                  /* Handle the empty-match case specially, for speed.
+                     This optimization is valid if VALID_BYTES is zero,
+                     which means SEARCH_OFFSET is also zero.  */
+                  sub[1] = 0;
+                  e = empty_match[bol];
+                }
+              else
+                e = jit_exec (subject, valid_bytes, search_offset,
+                              options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, sub);
+
+              if (e != PCRE_ERROR_NOMATCH)
+                break;
+
+              /* Treat the encoding error as data that cannot match.  */
+              p = subject + valid_bytes + 1;
+              bol = false;
+            }
+
+          subject += valid_bytes + 1;
+        }
+
+      if (e != PCRE_ERROR_NOMATCH)
+        break;
+      bol = true;
+    }
+
+  /* Zero means a match whose captures overflowed SUB (SUB[0] and SUB[1]
+     are still set), so only negative values are failures.  */
+  if (e < 0)
+    {
+      switch (e)
+        {
+        case PCRE_ERROR_NOMATCH:
+          break;
+
+        case PCRE_ERROR_NOMEMORY:
+          error (EXIT_TROUBLE, 0, _("memory exhausted"));
+
+# if PCRE_STUDY_JIT_COMPILE
+        case PCRE_ERROR_JIT_STACKLIMIT:
+          error (EXIT_TROUBLE, 0, _("exhausted PCRE JIT stack"));
+# endif
+
+        case PCRE_ERROR_MATCHLIMIT:
+          error (EXIT_TROUBLE, 0, _("exceeded PCRE's backtracking limit"));
+
+        default:
+          /* For now, lump all remaining PCRE failures into this basket.
+             Given sample grep usage that triggers a particular PCRE error,
+             a more detailed diagnostic can be added to the list above.  */
+          error (EXIT_TROUBLE, 0, _("internal PCRE error: %d"), e);
+        }
+
+      return -1;
+    }
+  else
+    {
+      char const *matchbeg = subject + sub[0];
+      char const *matchend = subject + sub[1];
+      char const *beg, *end;
+      if (start_ptr)
+        {
+          beg = matchbeg;
+          end = matchend;
+        }
+      else if (multiline)
+        {
+          char const *prev_nl = memrchr (line_start - 1, eolbyte,
+                                         matchbeg - (line_start - 1));
+          char const *next_nl = memchr (matchend, eolbyte,
+                                        line_end + 1 - matchend);
+          beg = prev_nl + 1;
+          end = next_nl + 1;
+        }
+      else
+        {
+          beg = line_start;
+          end = line_end + 1;
+        }
+      *match_size = end - beg;
+      return beg - buf;
+    }
+#endif
+}