Unicode upper/lower casing is now used when UCP is set, even if UTF is not set.

This is not yet documented, and it is not yet implemented in JIT. git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1224 6239d852-aaf2-0410-a92c-79f79f948069
author: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2020-02-23 16:40:05 +0000
committer: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2020-02-23 16:40:05 +0000
commit: eeeb059e46a07f10023f2313894159161504b664 (patch)
tree: 98719aa173603943d4a2a403724045bf8a40f19c /src
parent: 98c6677bd3ff37d50249b32297abdb6008b42d54 (diff)
download: pcre2-eeeb059e46a07f10023f2313894159161504b664.tar.gz
7 files changed, 219 insertions, 108 deletions
diff --git a/src/pcre2_auto_possess.c b/src/pcre2_auto_possess.c
index 5b95b9b..6c4925f 100644
--- a/src/pcre2_auto_possess.c
+++ b/src/pcre2_auto_possess.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
 
                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2019 University of Cambridge
+          New API code Copyright (c) 2016-2020 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -292,6 +292,7 @@ possessification, and if so, fills a list with its properties.
 Arguments:
   code        points to start of expression
   utf         TRUE if in UTF mode
+  ucp         TRUE if in UCP mode
   fcc         points to the case-flipping table
   list        points to output list
               list[0] will be filled with the opcode
@@ -304,7 +305,7 @@ Returns:      points to the start of the next opcode if *code is accepted
 */
 
 static PCRE2_SPTR
-get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc,
+get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
   uint32_t *list)
 {
 PCRE2_UCHAR c = *code;
@@ -316,7 +317,8 @@ uint32_t chr;
 uint32_t *clist_dest;
 const uint32_t *clist_src;
 #else
-(void)utf;    /* Suppress "unused parameter" compiler warning */
+(void)utf;    /* Suppress "unused parameter" compiler warnings */
+(void)ucp;
 #endif
 
 list[0] = c;
@@ -396,7 +398,7 @@ switch(c)
   list[2] = chr;
 
 #ifdef SUPPORT_UNICODE
-  if (chr < 128 || (chr < 256 && !utf))
+  if (chr < 128 || (chr < 256 && !utf && !ucp))
     list[3] = fcc[chr];
   else
     list[3] = UCD_OTHERCASE(chr);
@@ -503,6 +505,7 @@ which case the base cannot be possessified.
 Arguments:
   code        points to the byte code
   utf         TRUE in UTF mode
+  ucp         TRUE in UCP mode 
   cb          compile data block
   base_list   the data list of the base opcode
   base_end    the end of the base opcode
@@ -512,7 +515,7 @@ Returns:      TRUE if the auto-possessification is possible
 */
 
 static BOOL
-compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb,
+compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
   const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
 {
 PCRE2_UCHAR c;
@@ -651,7 +654,7 @@ for(;;)
 
     while (*next_code == OP_ALT)
       {
-      if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit))
+      if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
         return FALSE;
       code = next_code + 1 + LINK_SIZE;
       next_code += GET(next_code, 1);
@@ -672,7 +675,8 @@ for(;;)
     /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
 
     next_code += 1 + LINK_SIZE;
-    if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit))
+    if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end, 
+         rec_limit))
       return FALSE;
 
     code += PRIV(OP_lengths)[c];
@@ -688,7 +692,7 @@ for(;;)
   /* We now have the next appropriate opcode to compare with the base. Check
   for a supported opcode, and load its properties. */
 
-  code = get_chr_property_list(code, utf, cb->fcc, list);
+  code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
   if (code == NULL) return FALSE;    /* Unsupported */
 
   /* If either opcode is a small character list, set pointers for comparing
@@ -1100,7 +1104,6 @@ leaving the remainder of the pattern unpossessified.
 
 Arguments:
   code        points to start of the byte code
-  utf         TRUE in UTF mode
   cb          compile data block
 
 Returns:      0 for success
@@ -1108,13 +1111,15 @@ Returns:      0 for success
 */
 
 int
-PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb)
+PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
 {
 PCRE2_UCHAR c;
 PCRE2_SPTR end;
 PCRE2_UCHAR *repeat_opcode;
 uint32_t list[8];
 int rec_limit = 1000;  /* Was 10,000 but clang+ASAN uses a lot of stack. */
+BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
+BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;
 
 for (;;)
   {
@@ -1126,10 +1131,11 @@ for (;;)
     {
     c -= get_repeat_base(c) - OP_STAR;
     end = (c <= OP_MINUPTO) ?
-      get_chr_property_list(code, utf, cb->fcc, list) : NULL;
+      get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
 
-    if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit))
+    if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end, 
+        &rec_limit))
       {
       switch(c)
         {
@@ -1181,11 +1187,11 @@ for (;;)
     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
       {
       /* end must not be NULL. */
-      end = get_chr_property_list(code, utf, cb->fcc, list);
+      end = get_chr_property_list(code, utf, ucp, cb->fcc, list);
 
       list[1] = (c & 1) == 0;
 
-      if (compare_opcodes(end, utf, cb, list, end, &rec_limit))
+      if (compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
         {
         switch (c)
           {
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index aa4869f..515f2aa 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -4904,7 +4904,7 @@ range. */
 if ((options & PCRE2_CASELESS) != 0)
   {
 #ifdef SUPPORT_UNICODE
-  if ((options & PCRE2_UTF) != 0)
+  if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
     {
     int rc;
     uint32_t oc, od;
@@ -5319,7 +5319,8 @@ dynamically as we process the pattern. */
 
 #ifdef SUPPORT_UNICODE
 BOOL utf = (options & PCRE2_UTF) != 0;
-#else  /* No UTF support */
+BOOL ucp = (options & PCRE2_UCP) != 0;
+#else  /* No Unicode support */
 BOOL utf = FALSE;
 #endif
 
@@ -5602,7 +5603,7 @@ for (;; pptr++)
         uint32_t d;
 
 #ifdef SUPPORT_UNICODE
-        if (utf && c > 127) d = UCD_OTHERCASE(c); else
+        if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
 #endif
           {
 #if PCRE2_CODE_UNIT_WIDTH != 8
@@ -9632,6 +9633,7 @@ pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
    int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
 {
 BOOL utf;                             /* Set TRUE for UTF mode */
+BOOL ucp;                             /* Set TRUE for UCP mode */
 BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
 BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
 pcre2_real_code *re = NULL;           /* What we will return */
@@ -9919,8 +9921,8 @@ if (utf)
 
 /* Check UCP lockout. */
 
-if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
-    (PCRE2_UCP|PCRE2_NEVER_UCP))
+ucp = (cb.external_options & PCRE2_UCP) != 0;
+if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
   {
   errorcode = ERR75;
   goto HAD_EARLY_ERROR;
@@ -10296,7 +10298,7 @@ function call. */
 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
   {
   PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
-  if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
+  if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
   }
 
 /* Failed to compile, or error while post-processing. */
@@ -10344,21 +10346,25 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
 
     if ((firstcuflags & REQ_CASELESS) != 0)
       {
-      if (firstcu < 128 || (!utf && firstcu < 255))
+      if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
         {
         if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
         }
 
-      /* The first code unit is > 128 in UTF mode, or > 255 otherwise. In
-      8-bit UTF mode, codepoints in the range 128-255 are introductory code
-      points and cannot have another case. In 16-bit and 32-bit modes, we can
-      check wide characters when UTF (and therefore UCP) is supported. */
+      /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
+      In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
+      points and cannot have another case, but if UCP is set they may do. */
 
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-      else if (firstcu <= MAX_UTF_CODE_POINT &&
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+      else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
+        re->flags |= PCRE2_FIRSTCASELESS;
+#else
+      else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
                UCD_OTHERCASE(firstcu) != firstcu)
         re->flags |= PCRE2_FIRSTCASELESS;
 #endif
+#endif  /* SUPPORT_UNICODE */
       }
     }
 
@@ -10407,14 +10413,20 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
 
       if ((reqcuflags & REQ_CASELESS) != 0)
         {
-        if (reqcu < 128 || (!utf && reqcu < 255))
+        if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
           {
           if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
           }
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-        else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
-          re->flags |= PCRE2_LASTCASELESS;
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+      else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
+        re->flags |= PCRE2_LASTCASELESS;
+#else
+      else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
+               UCD_OTHERCASE(reqcu) != reqcu)
+        re->flags |= PCRE2_LASTCASELESS;
 #endif
+#endif  /* SUPPORT_UNICODE */
         }
       }
     }
diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
index 7d8ffe8..b8bdd02 100644
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
 
                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2019 University of Cambridge
+          New API code Copyright (c) 2016-2020 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -548,6 +548,7 @@ PCRE2_SPTR start_code = mb->start_code;
 
 #ifdef SUPPORT_UNICODE
 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
+BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
 #else
 BOOL utf = FALSE;
 #endif
@@ -2190,7 +2191,7 @@ for (;;)
       if (clen == 0) break;
 
 #ifdef SUPPORT_UNICODE
-      if (utf)
+      if (utf_or_ucp)
         {
         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
           {
@@ -2204,7 +2205,7 @@ for (;;)
         }
       else
 #endif  /* SUPPORT_UNICODE */
-      /* Not UTF mode */
+      /* Not UTF or UCP mode */
         {
         if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
           { ADD_NEW(state_offset + 2, 0); }
@@ -2339,7 +2340,7 @@ for (;;)
         {
         uint32_t otherd;
 #ifdef SUPPORT_UNICODE
-        if (utf && d >= 128)
+        if (utf_or_ucp && d >= 128)
           otherd = UCD_OTHERCASE(d);
         else
 #endif  /* SUPPORT_UNICODE */
@@ -2374,7 +2375,7 @@ for (;;)
         if (caseless)
           {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
             otherd = UCD_OTHERCASE(d);
           else
 #endif  /* SUPPORT_UNICODE */
@@ -2417,7 +2418,7 @@ for (;;)
         if (caseless)
           {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
             otherd = UCD_OTHERCASE(d);
           else
 #endif  /* SUPPORT_UNICODE */
@@ -2458,7 +2459,7 @@ for (;;)
         if (caseless)
           {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
             otherd = UCD_OTHERCASE(d);
           else
 #endif  /* SUPPORT_UNICODE */
@@ -2491,7 +2492,7 @@ for (;;)
         if (caseless)
           {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
             otherd = UCD_OTHERCASE(d);
           else
 #endif  /* SUPPORT_UNICODE */
@@ -2531,7 +2532,7 @@ for (;;)
         if (caseless)
           {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
             otherd = UCD_OTHERCASE(d);
           else
 #endif  /* SUPPORT_UNICODE */
@@ -3526,10 +3527,15 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
   if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
     {
     first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-    if (utf && first_cu > 127)
+#ifdef SUPPORT_UNICODE 
+#if PCRE2_CODE_UNIT_WIDTH == 8
+    if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
       first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
-#endif
+#else
+    if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
+      first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
+#endif       
+#endif  /* SUPPORT_UNICODE */
     }
   }
 else
@@ -3545,9 +3551,15 @@ if ((re->flags & PCRE2_LASTSET) != 0)
   if ((re->flags & PCRE2_LASTCASELESS) != 0)
     {
     req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-    if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+    if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0) 
+      req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
+#else
+    if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0)) 
+      req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
 #endif
+#endif  /* SUPPORT_UNICODE */
     }
   }
 
diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h
index ac96d2d..9963d6f 100644
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@@ -1952,7 +1952,7 @@ is available. */
 #define _pcre2_was_newline           PCRE2_SUFFIX(_pcre2_was_newline_)
 #define _pcre2_xclass                PCRE2_SUFFIX(_pcre2_xclass_)
 
-extern int          _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL,
+extern int          _pcre2_auto_possessify(PCRE2_UCHAR *,
                       const compile_block *);
 extern int          _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *,
                       int *, uint32_t, uint32_t, BOOL, compile_block *);
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
index a3fccc1..77c98f5 100644
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
 
                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2015-2019 University of Cambridge
+          New API code Copyright (c) 2015-2020 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -598,12 +598,13 @@ BOOL condition;         /* Used in conditional groups */
 BOOL cur_is_word;       /* Used in "word" tests */
 BOOL prev_is_word;      /* Used in "word" tests */
 
-/* UTF flag */
+/* UTF and UCP flags */
 
 #ifdef SUPPORT_UNICODE
 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
+BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
 #else
-BOOL utf = FALSE;
+BOOL utf = FALSE;  /* Required for convenience even when no Unicode support */
 #endif
 
 /* This is the length of the last part of a backtracking frame that must be
@@ -928,6 +929,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
       }
     else
 #endif
+
     /* Not UTF mode */
       {
       if (mb->end_subject - Feptr < 1)
@@ -987,10 +989,30 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
         if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
         }
       }
+
+    /* If UCP is set without UTF we must do the same as above, but with one
+    character per code unit. */
+
+    else if (ucp)
+      {
+      uint32_t cc = UCHAR21(Feptr);
+      fc = Fecode[1];
+      if (fc < 128)
+        {
+        if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
+        }
+      else
+        {
+        if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
+        }
+      Feptr++;
+      Fecode += 2;
+      }
+
     else
 #endif   /* SUPPORT_UNICODE */
 
-    /* Not UTF mode; use the table for characters < 256. */
+    /* Not UTF or UCP mode; use the table for characters < 256. */
       {
       if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
           != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
@@ -1010,6 +1032,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
       SCHECK_PARTIAL();
       RRETURN(MATCH_NOMATCH);
       }
+
 #ifdef SUPPORT_UNICODE
     if (utf)
       {
@@ -1026,15 +1049,42 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
         if (ch > 127)
           ch = UCD_OTHERCASE(ch);
         else
-          ch = TABLE_GET(ch, mb->fcc, ch);
+          ch = (mb->fcc)[ch];
+        if (ch == fc) RRETURN(MATCH_NOMATCH);
+        }
+      }
+
+    /* UCP without UTF is as above, but with one character per code unit. */
+
+    else if (ucp)
+      {
+      uint32_t ch;
+      fc = UCHAR21INC(Feptr);
+      ch = Fecode[1];
+      Fecode += 2;
+
+      if (ch == fc)
+        {
+        RRETURN(MATCH_NOMATCH);  /* Caseful match */
+        }
+      else if (Fop == OP_NOTI)   /* If caseless */
+        {
+        if (ch > 127)
+          ch = UCD_OTHERCASE(ch);
+        else
+          ch = (mb->fcc)[ch];
         if (ch == fc) RRETURN(MATCH_NOMATCH);
         }
       }
+
     else
 #endif  /* SUPPORT_UNICODE */
+
+    /* Neither UTF nor UCP is set */
+
       {
       uint32_t ch = Fecode[1];
-      fc = *Feptr++;
+      fc = UCHAR21INC(Feptr);
       if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
         RRETURN(MATCH_NOMATCH);
       Fecode += 2;
@@ -1244,7 +1294,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
 #endif  /* SUPPORT_UNICODE */
 
     /* When not in UTF mode, load a single-code-unit character. Then proceed as
-    above. */
+    above, using Unicode casing if either UTF or UCP is set. */
 
     Lc = *Fecode++;
 
@@ -1253,11 +1303,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
     if (Fop >= OP_STARI)
       {
 #if PCRE2_CODE_UNIT_WIDTH == 8
-      /* Lc must be < 128 in UTF-8 mode. */
+#ifdef SUPPORT_UNICODE
+      if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
+      else
+#endif  /* SUPPORT_UNICODE */
+      /* Lc will be < 128 in UTF-8 mode. */
       Loc = mb->fcc[Lc];
 #else /* 16-bit & 32-bit */
 #ifdef SUPPORT_UNICODE
-      if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
+      if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
       else
 #endif  /* SUPPORT_UNICODE */
       Loc = TABLE_GET(Lc, mb->fcc, Lc);
@@ -1490,7 +1544,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
     if (Fop >= OP_NOTSTARI)     /* Caseless */
       {
 #ifdef SUPPORT_UNICODE
-      if (utf && Lc > 127)
+      if ((utf || ucp) && Lc > 127)
         Loc = UCD_OTHERCASE(Lc);
       else
 #endif /* SUPPORT_UNICODE */
@@ -6045,7 +6099,6 @@ BOOL firstline;
 BOOL has_first_cu = FALSE;
 BOOL has_req_cu = FALSE;
 BOOL startline;
-BOOL utf;
 
 #if PCRE2_CODE_UNIT_WIDTH == 8
 BOOL memchr_not_found_first_cu = FALSE;
@@ -6069,13 +6122,19 @@ PCRE2_SPTR match_partial;
 BOOL use_jit;
 #endif
 
+/* This flag is needed even when Unicode is not supported for convenience
+(it is used by the IS_NEWLINE macro). */
+
+BOOL utf = FALSE;
+
 #ifdef SUPPORT_UNICODE
+BOOL ucp = FALSE;
 BOOL allow_invalid;
 uint32_t fragment_options = 0;
 #ifdef SUPPORT_JIT
 BOOL jit_checked_utf = FALSE;
 #endif
-#endif
+#endif  /* SUPPORT_UNICODE */
 
 PCRE2_SIZE frame_size;
 
@@ -6147,12 +6206,13 @@ use_jit = (re->executable_jit != NULL &&
           (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
 #endif
 
-/* Initialize UTF parameters. */
+/* Initialize UTF/UCP parameters. */
 
-utf = (re->overall_options & PCRE2_UTF) != 0;
 #ifdef SUPPORT_UNICODE
+utf = (re->overall_options & PCRE2_UTF) != 0;
 allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
-#endif
+ucp = (re->overall_options & PCRE2_UCP) != 0;
+#endif  /* SUPPORT_UNICODE */
 
 /* Convert the partial matching flags into an integer. */
 
@@ -6589,9 +6649,13 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
   if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
     {
     first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-    if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+    if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
+#else
+    if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
 #endif
+#endif  /* SUPPORT_UNICODE */
     }
   }
 else
@@ -6607,9 +6671,13 @@ if ((re->flags & PCRE2_LASTSET) != 0)
   if ((re->flags & PCRE2_LASTCASELESS) != 0)
     {
     req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-    if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+    if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
+#else
+    if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
 #endif
+#endif  /* SUPPORT_UNICODE */
     }
   }
 
@@ -6756,15 +6824,16 @@ for(;;)
 #endif
           }
 
-        /* If we can't find the required code unit, having reached the true end
-        of the subject, break the bumpalong loop, to force a match failure,
-        except when doing partial matching, when we let the next cycle run at
-        the end of the subject. To see why, consider the pattern /(?<=abc)def/,
-        which partially matches "abc", even though the string does not contain
-        the starting character "d". If we have not reached the true end of the
-        subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
-        we also let the cycle run, because the matching string is legitimately
-        allowed to start with the first code unit of a newline. */
+        /* If we can't find the required first code unit, having reached the
+        true end of the subject, break the bumpalong loop, to force a match
+        failure, except when doing partial matching, when we let the next cycle
+        run at the end of the subject. To see why, consider the pattern
+        /(?<=abc)def/, which partially matches "abc", even though the string
+        does not contain the starting character "d". If we have not reached the
+        true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
+        temporarily modified) we also let the cycle run, because the matching
+        string is legitimately allowed to start with the first code unit of a
+        newline. */
 
         if (mb->partial == 0 && start_match >= mb->end_subject)
           {
diff --git a/src/pcre2_study.c b/src/pcre2_study.c
index 5af01b5..02d1c08 100644
--- a/src/pcre2_study.c
+++ b/src/pcre2_study.c
@@ -772,15 +772,19 @@ Arguments:
   p             points to the first code unit of the character
   caseless      TRUE if caseless
   utf           TRUE for UTF mode
+  ucp           TRUE for UCP mode 
 
 Returns:        pointer after the character
 */
 
 static PCRE2_SPTR
-set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
+set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf, 
+  BOOL ucp)
 {
 uint32_t c = *p++;   /* First code unit */
-(void)utf;           /* Stop compiler warning when UTF not supported */
+
+(void)utf;           /* Stop compiler warnings when UTF not supported */
+(void)ucp;
 
 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
 0xff. */
@@ -810,22 +814,26 @@ if (utf)
 if (caseless)
   {
 #ifdef SUPPORT_UNICODE
-  if (utf)
+  if (utf || ucp)
     {
-#if PCRE2_CODE_UNIT_WIDTH == 8
-    PCRE2_UCHAR buff[6];
     c = UCD_OTHERCASE(c);
-    (void)PRIV(ord2utf)(c, buff);
-    SET_BIT(buff[0]);
+#if PCRE2_CODE_UNIT_WIDTH == 8
+    if (utf)
+      { 
+      PCRE2_UCHAR buff[6];
+      (void)PRIV(ord2utf)(c, buff);
+      SET_BIT(buff[0]);
+      }
+    else SET_BIT(c);    
 #else  /* 16-bit or 32-bit mode */
-    c = UCD_OTHERCASE(c);
     if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
 #endif
     }
+ 
   else
 #endif  /* SUPPORT_UNICODE */
 
-  /* Not UTF */
+  /* Not UTF or UCP */
 
   if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
   }
@@ -931,6 +939,7 @@ Arguments:
   re           points to the compiled regex block
   code         points to an expression
   utf          TRUE if in UTF mode
+  ucp          TRUE if in UCP mode 
   depthptr     pointer to recurse depth
 
 Returns:       SSB_FAIL     => Failed to find any starting code units
@@ -941,7 +950,8 @@ Returns:       SSB_FAIL     => Failed to find any starting code units
 */
 
 static int
-set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, int *depthptr)
+set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
+  int *depthptr)
 {
 uint32_t c;
 int yield = SSB_DONE;
@@ -1111,7 +1121,7 @@ do
       case OP_SCRIPT_RUN:
       case OP_ASSERT:
       case OP_ASSERT_NA:
-      rc = set_start_bits(re, tcode, utf, depthptr);
+      rc = set_start_bits(re, tcode, utf, ucp, depthptr);
       if (rc == SSB_DONE)
         {
         try_next = FALSE;
@@ -1167,7 +1177,7 @@ do
       case OP_BRAZERO:
       case OP_BRAMINZERO:
       case OP_BRAPOSZERO:
-      rc = set_start_bits(re, ++tcode, utf, depthptr);
+      rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
       if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
       tcode += 1 + LINK_SIZE;
@@ -1189,7 +1199,7 @@ do
       case OP_QUERY:
       case OP_MINQUERY:
       case OP_POSQUERY:
-      tcode = set_table_bit(re, tcode + 1, FALSE, utf);
+      tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
       break;
 
       case OP_STARI:
@@ -1198,7 +1208,7 @@ do
       case OP_QUERYI:
       case OP_MINQUERYI:
       case OP_POSQUERYI:
-      tcode = set_table_bit(re, tcode + 1, TRUE, utf);
+      tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
       break;
 
       /* Single-char upto sets the bit and tries the next */
@@ -1206,13 +1216,13 @@ do
       case OP_UPTO:
       case OP_MINUPTO:
       case OP_POSUPTO:
-      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
+      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
       break;
 
       case OP_UPTOI:
       case OP_MINUPTOI:
       case OP_POSUPTOI:
-      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
+      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
       break;
 
       /* At least one single char sets the bit and stops */
@@ -1224,7 +1234,7 @@ do
       case OP_PLUS:
       case OP_MINPLUS:
       case OP_POSPLUS:
-      (void)set_table_bit(re, tcode + 1, FALSE, utf);
+      (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
       try_next = FALSE;
       break;
 
@@ -1235,7 +1245,7 @@ do
       case OP_PLUSI:
       case OP_MINPLUSI:
       case OP_POSPLUSI:
-      (void)set_table_bit(re, tcode + 1, TRUE, utf);
+      (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
       try_next = FALSE;
       break;
 
@@ -1664,6 +1674,7 @@ PRIV(study)(pcre2_real_code *re)
 int count = 0;
 PCRE2_UCHAR *code;
 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
+BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
 
 /* Find start of compiled code */
 
@@ -1677,7 +1688,7 @@ code units. */
 if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
   {
   int depth = 0;
-  int rc = set_start_bits(re, code, utf, &depth);
+  int rc = set_start_bits(re, code, utf, ucp, &depth);
   if (rc == SSB_UNKNOWN) return 1;
 
   /* If a list of starting code units was set up, scan the list to see if only
@@ -1695,7 +1706,7 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
     int b = -1;
     uint8_t *p = re->start_bitmap;
     uint32_t flags = PCRE2_FIRSTMAPSET;
-
+    
     for (i = 0; i < 256; p++, i += 8)
       {
       uint8_t x = *p;
@@ -1725,27 +1736,27 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
           }
 
         /* c contains the code unit value, in the range 0-255. In 8-bit UTF
-        mode, only values < 128 can be used. */
+        mode, only values < 128 can be used. In all the other cases, c is a 
+        character value. */
 
 #if PCRE2_CODE_UNIT_WIDTH == 8
-        if (c > 127) goto DONE;
+        if (utf && c > 127) goto DONE;
 #endif
-        if (a < 0) a = c;   /* First one found */
+        if (a < 0) a = c;   /* First one found, save in a */
         else if (b < 0)     /* Second one found */
           {
           int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
-
+          
 #ifdef SUPPORT_UNICODE
-#if PCRE2_CODE_UNIT_WIDTH == 8
-          if (utf && UCD_CASESET(c) != 0) goto DONE;   /* Multiple case set */
-#else   /* 16-bit or 32-bit */
-          if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
-          if (utf && c > 127) d = UCD_OTHERCASE(c);
-#endif  /* Code width */
+          if (utf || ucp)
+            { 
+            if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
+            if (c > 127) d = UCD_OTHERCASE(c);
+            }
 #endif  /* SUPPORT_UNICODE */
 
-          if (d != a) goto DONE;   /* Not other case of a */
-          b = c;
+          if (d != a) goto DONE;   /* Not the other case of a */
+          b = c;                   /* Save second in b */
           }
         else goto DONE;   /* More than two characters found */
         }
diff --git a/src/pcre2_substitute.c b/src/pcre2_substitute.c
index 222cb32..981a106 100644
--- a/src/pcre2_substitute.c
+++ b/src/pcre2_substitute.c
@@ -236,6 +236,7 @@ BOOL use_existing_match;
 BOOL replacement_only;
 #ifdef SUPPORT_UNICODE
 BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
+BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
 #endif
 PCRE2_UCHAR temp[6];
 PCRE2_SPTR ptr;
@@ -758,7 +759,7 @@ do
           if (forcecase != 0)
             {
 #ifdef SUPPORT_UNICODE
-            if (utf)
+            if (utf || ucp)
               {
               uint32_t type = UCD_CHARTYPE(ch);
               if (PRIV(ucp_gentype)[type] == ucp_L &&
@@ -860,7 +861,7 @@ do
       if (forcecase != 0)
         {
 #ifdef SUPPORT_UNICODE
-        if (utf)
+        if (utf || ucp)
           {
           uint32_t type = UCD_CHARTYPE(ch);
           if (PRIV(ucp_gentype)[type] == ucp_L &&
author	ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>	2020-02-23 16:40:05 +0000
committer	ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>	2020-02-23 16:40:05 +0000
commit	eeeb059e46a07f10023f2313894159161504b664 (patch)
tree	98719aa173603943d4a2a403724045bf8a40f19c /src
parent	98c6677bd3ff37d50249b32297abdb6008b42d54 (diff)
download	pcre2-eeeb059e46a07f10023f2313894159161504b664.tar.gz