diff options
author | ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> | 2018-10-02 15:25:58 +0000 |
---|---|---|
committer | ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> | 2018-10-02 15:25:58 +0000 |
commit | 3899f6557728000c2cfd428cddc597e377baddc2 (patch) | |
tree | aaa05107db98463f3a50a7ca6d3eb9f83a985f3d /src/pcre2_script_run.c | |
parent | af1cda3afb77f3e43c3c8069bd3b784abbcc2036 (diff) | |
download | pcre2-3899f6557728000c2cfd428cddc597e377baddc2.tar.gz |
Basic "script run" implementation. Not yet complete, and not yet documented.
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1019 6239d852-aaf2-0410-a92c-79f79f948069
Diffstat (limited to 'src/pcre2_script_run.c')
-rw-r--r-- | src/pcre2_script_run.c | 228 |
1 files changed, 228 insertions, 0 deletions
diff --git a/src/pcre2_script_run.c b/src/pcre2_script_run.c new file mode 100644 index 0000000..479cb31 --- /dev/null +++ b/src/pcre2_script_run.c @@ -0,0 +1,228 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2018 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* This module contains the function for checking a script run. */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "pcre2_internal.h" + + +/************************************************* +* Check script run * +*************************************************/ + +/* A script run is conceptually a sequence of characters all in the same +Unicode script. However, it isn't quite that simple. There are special rules +for scripts that are commonly used together, and also special rules for digits. +This function implements the appropriate checks, which is possible only when +PCRE2 is compiled with Unicode support. The function returns TRUE if there is +no Unicode support; however, it should never be called in that circumstance +because an error is given by pcre2_compile() if a script run is called for in a +version of PCRE2 compiled without Unicode support. + +Arguments: + pgr point to the first character + endptr point after the last character + utf TRUE if in UTF mode + +Returns: TRUE if this is a valid script run +*/ + +#define SCRIPT_UNSET (-1) +#define SCRIPT_HANPENDING (-2) +#define SCRIPT_HANHIRAKATA (-3) +#define SCRIPT_HANBOPOMOFO (-4) +#define SCRIPT_HANHANGUL (-5) + +BOOL +PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf) +{ +#ifdef SUPPORT_UNICODE +int require_script = SCRIPT_UNSET; +uint32_t require_digitset = 0; +uint32_t c; + +#if PCRE2_CODE_UNIT_WIDTH == 32 +(void)utf; /* Avoid compiler warning */ +#endif + +/* Any string containing fewer than 2 characters is a valid script run. */ + +if (ptr >= endptr) return TRUE; +GETCHARINCTEST(c, ptr); +if (ptr >= endptr) return TRUE; + +/* Scan strings of two or more characters, checking the Unicode characteristics +of each code point. */ + +for (;;) + { + const ucd_record *ucd = GET_UCD(c); + uint32_t script = ucd->script; + + /* If the script is Unknown, the string is not a valid script run. Such + characters can only form script runs of length one. */ + + if (script == ucp_Unknown) return FALSE; + + /* A character whose script is Inherited is always accepted, and plays no + further part. A character whose script is Common is always accepted, but must + still be tested for a digit below. Otherwise, the character must match the + script of the first non-Inherited, non-Common character encountered. For most + scripts, the test is for the same script. However, the Han Chinese script may + be used in conjunction with four other scripts in these combinations: + + . Han with Hiragana and Katakana is allowed (for Japanese). + + . Han with Bopomofo is allowed (for Taiwanese Mandarin). + + . Han with Hangul is allowed (for Korean). + + If the first significant character's script is one of the four, the required + script type is immediately known. However, if the first significant + character's script is Han, we have to keep checking for a non-Han character. + Hence the SCRIPT_HANPENDING state. */ + + if (script != ucp_Inherited) + { + if (script != ucp_Common) switch(require_script) + { + default: + if (script != (unsigned int)require_script) return FALSE; + break; + + case SCRIPT_UNSET: + case SCRIPT_HANPENDING: + switch(script) + { + case ucp_Han: + require_script = SCRIPT_HANPENDING; + break; + + case ucp_Hiragana: + case ucp_Katakana: + require_script = SCRIPT_HANHIRAKATA; + break; + + case ucp_Bopomofo: + require_script = SCRIPT_HANBOPOMOFO; + break; + + case ucp_Hangul: + require_script = SCRIPT_HANHANGUL; + break; + + default: + if (require_script == SCRIPT_HANPENDING) return FALSE; + require_script = script; + break; + } + break; + + case SCRIPT_HANHIRAKATA: + if (script != ucp_Han && script != ucp_Hiragana && script != ucp_Katakana) + return FALSE; + break; + + case SCRIPT_HANBOPOMOFO: + if (script != ucp_Han && script != ucp_Bopomofo) return FALSE; + break; + + case SCRIPT_HANHANGUL: + if (script != ucp_Han && script != ucp_Hangul) return FALSE; + break; + } + + /* The character is in an acceptable script. We must now ensure that all + decimal digits in the string come from the same set. Some scripts (e.g. + Common, Arabic) have more than one set of decimal digits. This code does + not allow mixing sets, even within the same script. The vector called + PRIV(ucd_digit_sets)[] contains, in its first element, the number of + following elements, and then, in ascending order, the code points of the + '9' characters in every set of 10 digits. Each set is identified by the + offset in the vector of its '9' character. An initial check of the first + value picks up ASCII digits quickly. Otherwise, a binary chop is used. */ + + if (ucd->chartype == ucp_Nd) + { + uint32_t digitset; + + if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else + { + int mid; + int bot = 1; + int top = PRIV(ucd_digit_sets)[0]; + for (;;) + { + if (top <= bot + 1) /* <= rather than == is paranoia */ + { + digitset = top; + break; + } + mid = (top + bot) / 2; + if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid; + } + } + + /* A required value of 0 means "unset". */ + + if (require_digitset == 0) require_digitset = digitset; + else if (digitset != require_digitset) return FALSE; + } /* End digit handling */ + } /* End checking non-Inherited character */ + + /* If we haven't yet got to the end, pick up the next character. */ + + if (ptr >= endptr) return TRUE; + GETCHARINCTEST(c, ptr); + } /* End checking loop */ + +#else /* NOT SUPPORT_UNICODE */ +(void)ptr; +(void)endptr; +(void)utf; +return TRUE; +#endif /* SUPPORT_UNICODE */ +} + +/* End of pcre2_script_run.c */ |