Basic "script run" implementation. Not yet complete, and not yet documented.

git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1019 6239d852-aaf2-0410-a92c-79f79f948069
author: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2018-10-02 15:25:58 +0000
committer: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2018-10-02 15:25:58 +0000
commit: 3899f6557728000c2cfd428cddc597e377baddc2 (patch)
tree: aaa05107db98463f3a50a7ca6d3eb9f83a985f3d /src/pcre2_script_run.c
parent: af1cda3afb77f3e43c3c8069bd3b784abbcc2036 (diff)
download: pcre2-3899f6557728000c2cfd428cddc597e377baddc2.tar.gz
1 files changed, 228 insertions, 0 deletions
diff --git a/src/pcre2_script_run.c b/src/pcre2_script_run.c
new file mode 100644
index 0000000..479cb31
--- /dev/null
+++ b/src/pcre2_script_run.c
@@ -0,0 +1,228 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+     Original API code Copyright (c) 1997-2012 University of Cambridge
+          New API code Copyright (c) 2016-2018 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* This module contains the function for checking a script run. */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre2_internal.h"
+
+
+/*************************************************
+*                Check script run                *
+*************************************************/
+
+/* A script run is conceptually a sequence of characters all in the same
+Unicode script. However, it isn't quite that simple. There are special rules
+for scripts that are commonly used together, and also special rules for digits.
+This function implements the appropriate checks, which is possible only when
+PCRE2 is compiled with Unicode support. The function returns TRUE if there is
+no Unicode support; however, it should never be called in that circumstance
+because an error is given by pcre2_compile() if a script run is called for in a
+version of PCRE2 compiled without Unicode support.
+
+Arguments:
+  ptr       point to the first character
+  endptr    point after the last character
+  utf       TRUE if in UTF mode
+
+Returns:    TRUE if this is a valid script run
+*/
+
+#define SCRIPT_UNSET        (-1)   /* No non-Inherited/Common script seen */
+#define SCRIPT_HANPENDING   (-2)   /* Only Han seen so far */
+#define SCRIPT_HANHIRAKATA  (-3)   /* Accept Han, Hiragana, Katakana */
+#define SCRIPT_HANBOPOMOFO  (-4)   /* Accept Han, Bopomofo */
+#define SCRIPT_HANHANGUL    (-5)   /* Accept Han, Hangul */
+
+BOOL
+PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
+{
+#ifdef SUPPORT_UNICODE
+int require_script = SCRIPT_UNSET;   /* SCRIPT_xxx state, or a fixed script */
+uint32_t require_digitset = 0;       /* Digit set in use; 0 if none yet */
+uint32_t c;                          /* Character currently being checked */
+
+#if PCRE2_CODE_UNIT_WIDTH == 32
+(void)utf;    /* Avoid compiler warning */
+#endif
+
+/* Any string containing fewer than 2 characters is a valid script run. */
+
+if (ptr >= endptr) return TRUE;      /* Empty string */
+GETCHARINCTEST(c, ptr);              /* First character; ptr moves past it */
+if (ptr >= endptr) return TRUE;      /* Exactly one character */
+
+/* Scan strings of two or more characters, checking the Unicode characteristics
+of each code point. */
+
+for (;;)
+  {
+  const ucd_record *ucd = GET_UCD(c);   /* Property record for c */
+  uint32_t script = ucd->script;
+
+  /* If the script is Unknown, the string is not a valid script run. Such
+  characters can only form script runs of length one. */
+  
+  if (script == ucp_Unknown) return FALSE; 
+
+  /* A character whose script is Inherited is always accepted, and plays no
+  further part. A character whose script is Common is always accepted, but must
+  still be tested for a digit below. Otherwise, the character must match the
+  script of the first non-Inherited, non-Common character encountered. For most
+  scripts, the test is for the same script. However, the Han Chinese script may
+  be used in conjunction with four other scripts in these combinations:
+
+  . Han with Hiragana and Katakana is allowed (for Japanese).
+
+  . Han with Bopomofo is allowed (for Taiwanese Mandarin).
+
+  . Han with Hangul is allowed (for Korean).
+
+  If the first significant character's script is one of the four, the required
+  script type is immediately known. However, if the first significant
+  character's script is Han, we have to keep checking for a non-Han character.
+  Hence the SCRIPT_HANPENDING state. */
+ 
+  if (script != ucp_Inherited)
+    { 
+    if (script != ucp_Common) switch(require_script)
+      {
+      default:                 /* A specific script is already required */
+      if (script != (unsigned int)require_script) return FALSE;
+      break;
+    
+      case SCRIPT_UNSET:       /* No significant character seen yet */
+      case SCRIPT_HANPENDING:  /* Only Han characters seen so far */
+      switch(script)
+        {
+        case ucp_Han:
+        require_script = SCRIPT_HANPENDING;
+        break;
+    
+        case ucp_Hiragana:
+        case ucp_Katakana:
+        require_script = SCRIPT_HANHIRAKATA;
+        break;
+    
+        case ucp_Bopomofo:
+        require_script = SCRIPT_HANBOPOMOFO;
+        break;
+    
+        case ucp_Hangul:
+        require_script = SCRIPT_HANHANGUL;
+        break;
+    
+        default:               /* A script that cannot be combined with Han */
+        if (require_script == SCRIPT_HANPENDING) return FALSE;
+        require_script = script;   /* Fix the required script */
+        break;
+        }
+      break;
+    
+      case SCRIPT_HANHIRAKATA: /* Han with Hiragana and/or Katakana */
+      if (script != ucp_Han && script != ucp_Hiragana && script != ucp_Katakana)
+        return FALSE;
+      break;
+    
+      case SCRIPT_HANBOPOMOFO: /* Han with Bopomofo */
+      if (script != ucp_Han && script != ucp_Bopomofo) return FALSE;
+      break;
+    
+      case SCRIPT_HANHANGUL:   /* Han with Hangul */
+      if (script != ucp_Han && script != ucp_Hangul) return FALSE;
+      break;
+      }
+    
+    /* The character is in an acceptable script. We must now ensure that all
+    decimal digits in the string come from the same set. Some scripts (e.g.
+    Common, Arabic) have more than one set of decimal digits. This code does
+    not allow mixing sets, even within the same script. The vector called
+    PRIV(ucd_digit_sets)[] contains, in its first element, the number of
+    following elements, and then, in ascending order, the code points of the
+    '9' characters in every set of 10 digits. Each set is identified by the
+    offset in the vector of its '9' character. An initial check of the first
+    value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
+    
+    if (ucd->chartype == ucp_Nd)
+      {
+      uint32_t digitset;
+        
+      if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
+        {
+        int mid;
+        int bot = 1;                         /* c is above the '9' at bot */
+        int top = PRIV(ucd_digit_sets)[0];   /* Offset of the last '9' */
+        for (;;)
+          {
+          if (top <= bot + 1)    /* <= rather than == is paranoia */
+            {
+            digitset = top;      /* Range has closed up: top is the set */
+            break;
+            }
+          mid = (top + bot) / 2;
+          if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
+          }
+        }
+    
+      /* A required value of 0 means "unset". */
+    
+      if (require_digitset == 0) require_digitset = digitset;
+        else if (digitset != require_digitset) return FALSE;
+      }   /* End digit handling */
+    }     /* End checking non-Inherited character */
+
+  /* If we haven't yet got to the end, pick up the next character. */
+
+  if (ptr >= endptr) return TRUE;   /* Every character was acceptable */
+  GETCHARINCTEST(c, ptr);
+  }  /* End checking loop */
+
+#else   /* NOT SUPPORT_UNICODE */
+(void)ptr;
+(void)endptr;
+(void)utf;
+return TRUE;
+#endif  /* SUPPORT_UNICODE */
+}
+
+/* End of pcre2_script_run.c */
author	ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>	2018-10-02 15:25:58 +0000
committer	ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>	2018-10-02 15:25:58 +0000
commit	3899f6557728000c2cfd428cddc597e377baddc2 (patch)
tree	aaa05107db98463f3a50a7ca6d3eb9f83a985f3d /src/pcre2_script_run.c
parent	af1cda3afb77f3e43c3c8069bd3b784abbcc2036 (diff)
download	pcre2-3899f6557728000c2cfd428cddc597e377baddc2.tar.gz