Basic "script run" implementation. Not yet complete, and not yet documented.

git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1019 6239d852-aaf2-0410-a92c-79f79f948069
author: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2018-10-02 15:25:58 +0000
committer: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2018-10-02 15:25:58 +0000
commit: 3899f6557728000c2cfd428cddc597e377baddc2 (patch)
tree: aaa05107db98463f3a50a7ca6d3eb9f83a985f3d /testdata
parent: af1cda3afb77f3e43c3c8069bd3b784abbcc2036 (diff)
download: pcre2-3899f6557728000c2cfd428cddc597e377baddc2.tar.gz
7 files changed, 364 insertions, 0 deletions
diff --git a/testdata/testinput12 b/testdata/testinput12
index 6a996aa..29d1095 100644
--- a/testdata/testinput12
+++ b/testdata/testinput12
@@ -386,5 +386,16 @@
 
 /(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
     123abcáyzabcdef789abcሴqr
+    
+# A few script run tests in non-UTF mode (but they need Unicode support)
+
+/^(*script_run:.{4})/
+    \x{3041}\x{30a1}\x{3007}\x{3007}   Hiragana Katakana Han Han
+    \x{30a1}\x{3041}\x{3007}\x{3007}   Katakana Hiragana Han Han
+    \x{1100}\x{2e80}\x{2e80}\x{1101}   Hangul Han Han Hangul
+ 
+/^(*sr:.*)/utf,allow_surrogate_escapes
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+    \x{d800}\x{dfff}                   Surrogates (Unknown) \=no_utf_check
 
 # End of testinput12
diff --git a/testdata/testinput4 b/testdata/testinput4
index a27b6af..eea087d 100644
--- a/testdata/testinput4
+++ b/testdata/testinput4
@@ -2317,5 +2317,96 @@
 
 /[^\x{100}-\x{ffff}]*[\x80-\xff]/i,utf
     \x{99}\x{99}\x{99}
+    
+# Script run tests
+
+/^(*script_run:.{4})/utf
+    abcd                               Latin x4
+    \x{2e80}\x{2fa1d}\x{3041}\x{30a1}  Han Han Hiragana Katakana
+    \x{3041}\x{30a1}\x{3007}\x{3007}   Hiragana Katakana Han Han
+    \x{30a1}\x{3041}\x{3007}\x{3007}   Katakana Hiragana Han Han
+    \x{1100}\x{2e80}\x{2e80}\x{1101}   Hangul Han Han Hangul
+    \x{2e80}\x{3105}\x{2e80}\x{3105}   Han Bopomofo Han Bopomofo
+    \x{02ea}\x{2e80}\x{2e80}\x{3105}   Bopomofo-Sk Han Han Bopomofo
+    \x{3105}\x{2e80}\x{2e80}\x{3105}   Bopomofo Han Han Bopomofo
+    \x{0300}cd!                        Inherited Latin Latin Common
+    \x{0391}12\x{03a9}                 Greek Common-digits Greek 
+    \x{0400}12\x{fe2f}                 Cyrillic Common-digits Cyrillic
+    \x{0531}12\x{fb17}                 Armenian Common-digits Armenian
+    \x{0591}12\x{fb4f}                 Hebrew Common-digits Hebrew
+    \x{0600}12\x{1eef1}                Arabic Common-digits Arabic
+    \x{0600}\x{0660}\x{0669}\x{1eef1}  Arabic Arabic-digits Arabic
+    \x{0700}12\x{086a}                 Syriac Common-digits Syriac
+    \x{1200}12\x{ab2e}                 Ethiopic Common-digits Ethiopic
+    \x{1680}12\x{169c}                 Ogham Common-digits Ogham
+    \x{3041}12\x{3041}                 Hiragana Common-digits Hiragana
+    \x{0980}\x{09e6}\x{09e7}\x{0993}   Bengali Bengali-digits Bengali
+    !cde                               Common Latin Latin Latin
+    A..B                               Latin Common Common Latin 
+    0abc                               Ascii-digit Latin Latin Latin
+    1\x{0700}\x{0700}\x{0700}          Ascii-digit Syriac x 3
+    \x{1A80}\x{1A80}\x{1a40}\x{1a41}   Tai Tham Hora digits, letters
+\= Expect no match
+    a\x{370}bcd                        Latin Greek Latin Latin
+    \x{1100}\x{02ea}\x{02ea}\x{02ea}   Hangul Bopomofo x3
+    \x{02ea}\x{02ea}\x{02ea}\x{1100}   Bopomofo x3 Hangul
+    \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
+    \x{0391}\x{09e6}\x{09e7}\x{03a9}   Greek Bengali digits Greek 
+    \x{0600}7\x{0669}\x{1eef1}         Arabic ascii-digit Arabic-digit Arabic
+    \x{0600}\x{0669}7\x{1eef1}         Arabic Arabic-digit ascii-digit Arabic
+    A5\x{ff19}B                        Latin Common-ascii/notascii-digits Latin 
+    \x{0300}cd\x{0391}                 Inherited Latin Latin Greek
+    !cd\x{0391}                        Common Latin Latin Greek
+    \x{1A80}\x{1A90}\x{1a40}\x{1a41}   Tai Tham Hora digit, Tham digit, letters
+    A\x{1d7ce}\x{1d7ff}B               Common fancy-common-2-sets-digits Common
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+    
+/^(*sr:.{4}|..)/utf
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+
+/^(*atomic_script_run:.{4}|..)/utf
+\= Expect no match
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+
+/^(*asr:.*)/utf
+\= Expect no match
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+
+/^(?>(*sr:.*))/utf
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+
+/^(*sr:.*)/utf
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+    \x{10fffd}\x{10fffd}\x{10fffd}     Private use (Unknown)
+
+/^(*sr:\x{2e80}*)/utf
+    \x{2e80}\x{2e80}\x{3105}           Han Han Bopomofo
+
+/^(*sr:\x{2e80}*)\x{2e80}/utf
+    \x{2e80}\x{2e80}\x{3105}           Han Han Bopomofo
+    
+/^(*sr:.*)Test/utf
+    Test script run on an empty string
+
+/^(*sr:(.{2})){2}/utf
+    \x{0600}7\x{0669}\x{1eef1}         Arabic ascii-digit Arabic-digit Arabic
+    \x{1A80}\x{1A80}\x{1a40}\x{1a41}   Tai Tham Hora digits, letters
+    \x{1A80}\x{1a40}\x{1A90}\x{1a41}   Tai Tham Hora digit, letter, Tham digit, letter
+\= Expect no match
+    \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
+
+# Test loop breaking for empty string match
+
+/^(*sr:A|)*BCD/utf
+    AABCD
+    ABCD
+    BCD 
+    
+# The use of (*ACCEPT) breaks script run checking 
+
+/^(*sr:.*(*ACCEPT)ZZ)/utf
+    \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
+
+# ------- 
 
 # End of testinput4
diff --git a/testdata/testinput5 b/testdata/testinput5
index 687de32..9730e0b 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -2106,5 +2106,28 @@
 
 /(*: A‎B C)abc/x,utf,mark,alt_verbnames
     abc
+    
+# Script run tests: auto-possessification
+
+/^(*sr:.*)/B,utf 
+    paypаl.com   A classic example of why script run checks are a good thing
+
+/^(*sr:\x{2e80}*)/B,utf
+
+/^(*sr:\x{2e80}*)\x{2e80}/B,utf
+
+# Some script run patterns are broken in Perl 5.28.0. These can be moved into
+# test 4 when a mended version of Perl is released.
+
+/^(*sr:.{4})/utf
+    \x{0980}12\x{0993}     Bengali Common-digits Bengali
+    \x{0780}12\x{07b1}     Thaana Common-digits Thaana
+    \x{0e01}12\x{0e5b}     Thai Common-digits Thai
+    \x{1780}12\x{19ff}     Khmer Common-digits Khmer
+    \x{0904}12\x{0939}     Devanagari Common-digits Devanagari
+    A\x{ff10}\x{ff19}B     Latin Common-notascii-digits Latin 
+    A\x{1d7ce}\x{1d7cf}B   Latin fancy-common-digits Latin
+
+# ------- 
 
 # End of testinput5
diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16
index e29243a..72dde81 100644
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@@ -1480,5 +1480,20 @@ Old 12 12  New 14 16
 Old 12 15  New 16 21
 Old 21 21  New 27 29
  4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
+    
+# A few script run tests in non-UTF mode (but they need Unicode support)
+
+/^(*script_run:.{4})/
+    \x{3041}\x{30a1}\x{3007}\x{3007}   Hiragana Katakana Han Han
+ 0: \x{3041}\x{30a1}\x{3007}\x{3007}
+    \x{30a1}\x{3041}\x{3007}\x{3007}   Katakana Hiragana Han Han
+ 0: \x{30a1}\x{3041}\x{3007}\x{3007}
+    \x{1100}\x{2e80}\x{2e80}\x{1101}   Hangul Han Han Hangul
+ 0: \x{1100}\x{2e80}\x{2e80}\x{1101}
+ 
+/^(*sr:.*)/utf,allow_surrogate_escapes
+Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+    \x{d800}\x{dfff}                   Surrogates (Unknown) \=no_utf_check
 
 # End of testinput12
diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32
index 706e0c0..d2c59e5 100644
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@@ -1477,5 +1477,21 @@ Old 12 12  New 14 16
 Old 12 15  New 16 21
 Old 21 21  New 27 29
  4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
+    
+# A few script run tests in non-UTF mode (but they need Unicode support)
+
+/^(*script_run:.{4})/
+    \x{3041}\x{30a1}\x{3007}\x{3007}   Hiragana Katakana Han Han
+ 0: \x{3041}\x{30a1}\x{3007}\x{3007}
+    \x{30a1}\x{3041}\x{3007}\x{3007}   Katakana Hiragana Han Han
+ 0: \x{30a1}\x{3041}\x{3007}\x{3007}
+    \x{1100}\x{2e80}\x{2e80}\x{1101}   Hangul Han Han Hangul
+ 0: \x{1100}\x{2e80}\x{2e80}\x{1101}
+ 
+/^(*sr:.*)/utf,allow_surrogate_escapes
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+ 0: \x{2e80}\x{3105}\x{2e80}
+    \x{d800}\x{dfff}                   Surrogates (Unknown) \=no_utf_check
+ 0: \x{d800}
 
 # End of testinput12
diff --git a/testdata/testoutput4 b/testdata/testoutput4
index ba3df37..8a53cf7 100644
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@@ -3741,5 +3741,154 @@ No match
 /[^\x{100}-\x{ffff}]*[\x80-\xff]/i,utf
     \x{99}\x{99}\x{99}
  0: \x{99}\x{99}\x{99}
+    
+# Script run tests
+
+/^(*script_run:.{4})/utf
+    abcd                               Latin x4
+ 0: abcd
+    \x{2e80}\x{2fa1d}\x{3041}\x{30a1}  Han Han Hiragana Katakana
+ 0: \x{2e80}\x{2fa1d}\x{3041}\x{30a1}
+    \x{3041}\x{30a1}\x{3007}\x{3007}   Hiragana Katakana Han Han
+ 0: \x{3041}\x{30a1}\x{3007}\x{3007}
+    \x{30a1}\x{3041}\x{3007}\x{3007}   Katakana Hiragana Han Han
+ 0: \x{30a1}\x{3041}\x{3007}\x{3007}
+    \x{1100}\x{2e80}\x{2e80}\x{1101}   Hangul Han Han Hangul
+ 0: \x{1100}\x{2e80}\x{2e80}\x{1101}
+    \x{2e80}\x{3105}\x{2e80}\x{3105}   Han Bopomofo Han Bopomofo
+ 0: \x{2e80}\x{3105}\x{2e80}\x{3105}
+    \x{02ea}\x{2e80}\x{2e80}\x{3105}   Bopomofo-Sk Han Han Bopomofo
+ 0: \x{2ea}\x{2e80}\x{2e80}\x{3105}
+    \x{3105}\x{2e80}\x{2e80}\x{3105}   Bopomofo Han Han Bopomofo
+ 0: \x{3105}\x{2e80}\x{2e80}\x{3105}
+    \x{0300}cd!                        Inherited Latin Latin Common
+ 0: \x{300}cd!
+    \x{0391}12\x{03a9}                 Greek Common-digits Greek 
+ 0: \x{391}12\x{3a9}
+    \x{0400}12\x{fe2f}                 Cyrillic Common-digits Cyrillic
+ 0: \x{400}12\x{fe2f}
+    \x{0531}12\x{fb17}                 Armenian Common-digits Armenian
+ 0: \x{531}12\x{fb17}
+    \x{0591}12\x{fb4f}                 Hebrew Common-digits Hebrew
+ 0: \x{591}12\x{fb4f}
+    \x{0600}12\x{1eef1}                Arabic Common-digits Arabic
+ 0: \x{600}12\x{1eef1}
+    \x{0600}\x{0660}\x{0669}\x{1eef1}  Arabic Arabic-digits Arabic
+ 0: \x{600}\x{660}\x{669}\x{1eef1}
+    \x{0700}12\x{086a}                 Syriac Common-digits Syriac
+ 0: \x{700}12\x{86a}
+    \x{1200}12\x{ab2e}                 Ethiopic Common-digits Ethiopic
+ 0: \x{1200}12\x{ab2e}
+    \x{1680}12\x{169c}                 Ogham Common-digits Ogham
+ 0: \x{1680}12\x{169c}
+    \x{3041}12\x{3041}                 Hiragana Common-digits Hiragana
+ 0: \x{3041}12\x{3041}
+    \x{0980}\x{09e6}\x{09e7}\x{0993}   Bengali Bengali-digits Bengali
+ 0: \x{980}\x{9e6}\x{9e7}\x{993}
+    !cde                               Common Latin Latin Latin
+ 0: !cde
+    A..B                               Latin Common Common Latin 
+ 0: A..B
+    0abc                               Ascii-digit Latin Latin Latin
+ 0: 0abc
+    1\x{0700}\x{0700}\x{0700}          Ascii-digit Syriac x 3
+ 0: 1\x{700}\x{700}\x{700}
+    \x{1A80}\x{1A80}\x{1a40}\x{1a41}   Tai Tham Hora digits, letters
+ 0: \x{1a80}\x{1a80}\x{1a40}\x{1a41}
+\= Expect no match
+    a\x{370}bcd                        Latin Greek Latin Latin
+No match
+    \x{1100}\x{02ea}\x{02ea}\x{02ea}   Hangul Bopomofo x3
+No match
+    \x{02ea}\x{02ea}\x{02ea}\x{1100}   Bopomofo x3 Hangul
+No match
+    \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
+No match
+    \x{0391}\x{09e6}\x{09e7}\x{03a9}   Greek Bengali digits Greek 
+No match
+    \x{0600}7\x{0669}\x{1eef1}         Arabic ascii-digit Arabic-digit Arabic
+No match
+    \x{0600}\x{0669}7\x{1eef1}         Arabic Arabic-digit ascii-digit Arabic
+No match
+    A5\x{ff19}B                        Latin Common-ascii/notascii-digits Latin 
+No match
+    \x{0300}cd\x{0391}                 Inherited Latin Latin Greek
+No match
+    !cd\x{0391}                        Common Latin Latin Greek
+No match
+    \x{1A80}\x{1A90}\x{1a40}\x{1a41}   Tai Tham Hora digit, Tham digit, letters
+No match
+    A\x{1d7ce}\x{1d7ff}B               Common fancy-common-2-sets-digits Common
+No match
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+No match
+    
+/^(*sr:.{4}|..)/utf
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+ 0: \x{2e80}\x{3105}
+
+/^(*atomic_script_run:.{4}|..)/utf
+\= Expect no match
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+No match
+
+/^(*asr:.*)/utf
+\= Expect no match
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+No match
+
+/^(?>(*sr:.*))/utf
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+ 0: \x{2e80}\x{3105}\x{2e80}
+
+/^(*sr:.*)/utf
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+ 0: \x{2e80}\x{3105}\x{2e80}
+    \x{10fffd}\x{10fffd}\x{10fffd}     Private use (Unknown)
+ 0: \x{10fffd}
+
+/^(*sr:\x{2e80}*)/utf
+    \x{2e80}\x{2e80}\x{3105}           Han Han Bopomofo
+ 0: \x{2e80}\x{2e80}
+
+/^(*sr:\x{2e80}*)\x{2e80}/utf
+    \x{2e80}\x{2e80}\x{3105}           Han Han Bopomofo
+ 0: \x{2e80}\x{2e80}
+    
+/^(*sr:.*)Test/utf
+    Test script run on an empty string
+ 0: Test
+
+/^(*sr:(.{2})){2}/utf
+    \x{0600}7\x{0669}\x{1eef1}         Arabic ascii-digit Arabic-digit Arabic
+ 0: \x{600}7\x{669}\x{1eef1}
+ 1: \x{669}\x{1eef1}
+    \x{1A80}\x{1A80}\x{1a40}\x{1a41}   Tai Tham Hora digits, letters
+ 0: \x{1a80}\x{1a80}\x{1a40}\x{1a41}
+ 1: \x{1a40}\x{1a41}
+    \x{1A80}\x{1a40}\x{1A90}\x{1a41}   Tai Tham Hora digit, letter, Tham digit, letter
+ 0: \x{1a80}\x{1a40}\x{1a90}\x{1a41}
+ 1: \x{1a90}\x{1a41}
+\= Expect no match
+    \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
+No match
+
+# Test loop breaking for empty string match
+
+/^(*sr:A|)*BCD/utf
+    AABCD
+ 0: AABCD
+    ABCD
+ 0: ABCD
+    BCD 
+ 0: BCD
+    
+# The use of (*ACCEPT) breaks script run checking 
+
+/^(*sr:.*(*ACCEPT)ZZ)/utf
+    \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
+ 0: \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
+
+# ------- 
 
 # End of testinput4
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index 51caa18..406db22 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -4775,5 +4775,64 @@ Failed: error 137 at offset 2: PCRE2 does not support \F, \L, \l, \N{name}, \U,
     abc
  0: abc
 MK: ABC
+    
+# Script run tests: auto-possessification
+
+/^(*sr:.*)/B,utf 
+------------------------------------------------------------------
+        Bra
+        ^
+        Script run
+        Any*
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+    paypаl.com   A classic example of why script run checks are a good thing
+ 0: payp
+
+/^(*sr:\x{2e80}*)/B,utf
+------------------------------------------------------------------
+        Bra
+        ^
+        Script run
+        \x{2e80}*+
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+
+/^(*sr:\x{2e80}*)\x{2e80}/B,utf
+------------------------------------------------------------------
+        Bra
+        ^
+        Script run
+        \x{2e80}*
+        Ket
+        \x{2e80}
+        Ket
+        End
+------------------------------------------------------------------
+
+# Some script run patterns are broken in Perl 5.28.0. These can be moved into
+# test 4 when a mended version of Perl is released.
+
+/^(*sr:.{4})/utf
+    \x{0980}12\x{0993}     Bengali Common-digits Bengali
+ 0: \x{980}12\x{993}
+    \x{0780}12\x{07b1}     Thaana Common-digits Thaana
+ 0: \x{780}12\x{7b1}
+    \x{0e01}12\x{0e5b}     Thai Common-digits Thai
+ 0: \x{e01}12\x{e5b}
+    \x{1780}12\x{19ff}     Khmer Common-digits Khmer
+ 0: \x{1780}12\x{19ff}
+    \x{0904}12\x{0939}     Devanagari Common-digits Devanagari
+ 0: \x{904}12\x{939}
+    A\x{ff10}\x{ff19}B     Latin Common-notascii-digits Latin 
+ 0: A\x{ff10}\x{ff19}B
+    A\x{1d7ce}\x{1d7cf}B   Latin fancy-common-digits Latin
+ 0: A\x{1d7ce}\x{1d7cf}B
+
+# ------- 
 
 # End of testinput5
author	ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>	2018-10-02 15:25:58 +0000
committer	ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>	2018-10-02 15:25:58 +0000
commit	3899f6557728000c2cfd428cddc597e377baddc2 (patch)
tree	aaa05107db98463f3a50a7ca6d3eb9f83a985f3d /testdata
parent	af1cda3afb77f3e43c3c8069bd3b784abbcc2036 (diff)
download	pcre2-3899f6557728000c2cfd428cddc597e377baddc2.tar.gz