summary | refs | log | tree | commit | diff
path: root/testdata
diff options
context:
space:
mode:
author ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2018-10-02 15:25:58 +0000
committer ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2018-10-02 15:25:58 +0000
commit 3899f6557728000c2cfd428cddc597e377baddc2 (patch)
tree aaa05107db98463f3a50a7ca6d3eb9f83a985f3d /testdata
parent af1cda3afb77f3e43c3c8069bd3b784abbcc2036 (diff)
download pcre2-3899f6557728000c2cfd428cddc597e377baddc2.tar.gz
Basic "script run" implementation. Not yet complete, and not yet documented.
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1019 6239d852-aaf2-0410-a92c-79f79f948069
Diffstat (limited to 'testdata')
-rw-r--r-- testdata/testinput12 | 11
-rw-r--r-- testdata/testinput4 | 91
-rw-r--r-- testdata/testinput5 | 23
-rw-r--r-- testdata/testoutput12-16 | 15
-rw-r--r-- testdata/testoutput12-32 | 16
-rw-r--r-- testdata/testoutput4 | 149
-rw-r--r-- testdata/testoutput5 | 59
7 files changed, 364 insertions, 0 deletions
diff --git a/testdata/testinput12 b/testdata/testinput12
index 6a996aa..29d1095 100644
--- a/testdata/testinput12
+++ b/testdata/testinput12
@@ -386,5 +386,16 @@
/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
123abcáyzabcdef789abcሴqr
+
+# A few script run tests in non-UTF mode (but they need Unicode support)
+
+/^(*script_run:.{4})/
+ \x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
+ \x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
+ \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
+
+/^(*sr:.*)/utf,allow_surrogate_escapes
+ \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
+ \x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check
# End of testinput12
diff --git a/testdata/testinput4 b/testdata/testinput4
index a27b6af..eea087d 100644
--- a/testdata/testinput4
+++ b/testdata/testinput4
@@ -2317,5 +2317,96 @@
/[^\x{100}-\x{ffff}]*[\x80-\xff]/i,utf
\x{99}\x{99}\x{99}
+
+# Script run tests
+
+/^(*script_run:.{4})/utf
+ abcd Latin x4
+ \x{2e80}\x{2fa1d}\x{3041}\x{30a1} Han Han Hiragana Katakana
+ \x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
+ \x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
+ \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
+ \x{2e80}\x{3105}\x{2e80}\x{3105} Han Bopomofo Han Bopomofo
+ \x{02ea}\x{2e80}\x{2e80}\x{3105} Bopomofo-Sk Han Han Bopomofo
+ \x{3105}\x{2e80}\x{2e80}\x{3105} Bopomofo Han Han Bopomofo
+ \x{0300}cd! Inherited Latin Latin Common
+ \x{0391}12\x{03a9} Greek Common-digits Greek
+ \x{0400}12\x{fe2f} Cyrillic Common-digits Cyrillic
+ \x{0531}12\x{fb17} Armenian Common-digits Armenian
+ \x{0591}12\x{fb4f} Hebrew Common-digits Hebrew
+ \x{0600}12\x{1eef1} Arabic Common-digits Arabic
+ \x{0600}\x{0660}\x{0669}\x{1eef1} Arabic Arabic-digits Arabic
+ \x{0700}12\x{086a} Syriac Common-digits Syriac
+ \x{1200}12\x{ab2e} Ethiopic Common-digits Ethiopic
+ \x{1680}12\x{169c} Ogham Common-digits Ogham
+ \x{3041}12\x{3041} Hiragana Common-digits Hiragana
+ \x{0980}\x{09e6}\x{09e7}\x{0993} Bengali Bengali-digits Bengali
+ !cde Common Latin Latin Latin
+ A..B Latin Common Common Latin
+ 0abc Ascii-digit Latin Latin Latin
+ 1\x{0700}\x{0700}\x{0700} Ascii-digit Syriac x 3
+ \x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters
+\= Expect no match
+ a\x{370}bcd Latin Greek Latin Latin
+ \x{1100}\x{02ea}\x{02ea}\x{02ea} Hangul Bopomofo x3
+ \x{02ea}\x{02ea}\x{02ea}\x{1100} Bopomofo x3 Hangul
+ \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
+ \x{0391}\x{09e6}\x{09e7}\x{03a9} Greek Bengali digits Greek
+ \x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic
+ \x{0600}\x{0669}7\x{1eef1} Arabic Arabic-digit ascii-digit Arabic
+ A5\x{ff19}B Latin Common-ascii/notascii-digits Latin
+ \x{0300}cd\x{0391} Inherited Latin Latin Greek
+ !cd\x{0391} Common Latin Latin Greek
+ \x{1A80}\x{1A90}\x{1a40}\x{1a41} Tai Tham Hora digit, Tham digit, letters
+ A\x{1d7ce}\x{1d7ff}B Common fancy-common-2-sets-digits Common
+ \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
+
+/^(*sr:.{4}|..)/utf
+ \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
+
+/^(*atomic_script_run:.{4}|..)/utf
+\= Expect no match
+ \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
+
+/^(*asr:.*)/utf
+\= Expect no match
+ \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
+
+/^(?>(*sr:.*))/utf
+ \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
+
+/^(*sr:.*)/utf
+ \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
+ \x{10fffd}\x{10fffd}\x{10fffd} Private use (Unknown)
+
+/^(*sr:\x{2e80}*)/utf
+ \x{2e80}\x{2e80}\x{3105} Han Han Bopomofo
+
+/^(*sr:\x{2e80}*)\x{2e80}/utf
+ \x{2e80}\x{2e80}\x{3105} Han Han Bopomofo
+
+/^(*sr:.*)Test/utf
+ Test script run on an empty string
+
+/^(*sr:(.{2})){2}/utf
+ \x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic
+ \x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters
+ \x{1A80}\x{1a40}\x{1A90}\x{1a41} Tai Tham Hora digit, letter, Tham digit, letter
+\= Expect no match
+ \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
+
+# Test loop breaking for empty string match
+
+/^(*sr:A|)*BCD/utf
+ AABCD
+ ABCD
+ BCD
+
+# The use of (*ACCEPT) breaks script run checking
+
+/^(*sr:.*(*ACCEPT)ZZ)/utf
+ \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
+
+# -------
# End of testinput4
diff --git a/testdata/testinput5 b/testdata/testinput5
index 687de32..9730e0b 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -2106,5 +2106,28 @@
/(*: A‎B
C)abc/x,utf,mark,alt_verbnames
abc
+
+# Script run tests: auto-possessification
+
+/^(*sr:.*)/B,utf
+ paypаl.com A classic example of why script run checks are a good thing
+
+/^(*sr:\x{2e80}*)/B,utf
+
+/^(*sr:\x{2e80}*)\x{2e80}/B,utf
+
+# Some script run patterns are broken in Perl 5.28.0. These can be moved into
+# test 4 when a mended version of Perl is released.
+
+/^(*sr:.{4})/utf
+ \x{0980}12\x{0993} Bengali Common-digits Bengali
+ \x{0780}12\x{07b1} Thaana Common-digits Thaana
+ \x{0e01}12\x{0e5b} Thai Common-digits Thai
+ \x{1780}12\x{19ff} Khmer Common-digits Khmer
+ \x{0904}12\x{0939} Devanagari Common-digits Devanagari
+ A\x{ff10}\x{ff19}B Latin Common-notascii-digits Latin
+ A\x{1d7ce}\x{1d7cf}B Latin fancy-common-digits Latin
+
+# -------
# End of testinput5
diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16
index e29243a..72dde81 100644
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@@ -1480,5 +1480,20 @@ Old 12 12 New 14 16
Old 12 15 New 16 21
Old 21 21 New 27 29
4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
+
+# A few script run tests in non-UTF mode (but they need Unicode support)
+
+/^(*script_run:.{4})/
+ \x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
+ 0: \x{3041}\x{30a1}\x{3007}\x{3007}
+ \x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
+ 0: \x{30a1}\x{3041}\x{3007}\x{3007}
+ \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
+ 0: \x{1100}\x{2e80}\x{2e80}\x{1101}
+
+/^(*sr:.*)/utf,allow_surrogate_escapes
+Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
+ \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
+ \x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check
# End of testinput12
diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32
index 706e0c0..d2c59e5 100644
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@@ -1477,5 +1477,21 @@ Old 12 12 New 14 16
Old 12 15 New 16 21
Old 21 21 New 27 29
4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
+
+# A few script run tests in non-UTF mode (but they need Unicode support)
+
+/^(*script_run:.{4})/
+ \x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
+ 0: \x{3041}\x{30a1}\x{3007}\x{3007}
+ \x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
+ 0: \x{30a1}\x{3041}\x{3007}\x{3007}
+ \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
+ 0: \x{1100}\x{2e80}\x{2e80}\x{1101}
+
+/^(*sr:.*)/utf,allow_surrogate_escapes
+ \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
+ 0: \x{2e80}\x{3105}\x{2e80}
+ \x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check
+ 0: \x{d800}
# End of testinput12
diff --git a/testdata/testoutput4 b/testdata/testoutput4
index ba3df37..8a53cf7 100644
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@@ -3741,5 +3741,154 @@ No match
/[^\x{100}-\x{ffff}]*[\x80-\xff]/i,utf
\x{99}\x{99}\x{99}
0: \x{99}\x{99}\x{99}
+
+# Script run tests
+
+/^(*script_run:.{4})/utf
+ abcd Latin x4
+ 0: abcd
+ \x{2e80}\x{2fa1d}\x{3041}\x{30a1} Han Han Hiragana Katakana
+ 0: \x{2e80}\x{2fa1d}\x{3041}\x{30a1}
+ \x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
+ 0: \x{3041}\x{30a1}\x{3007}\x{3007}
+ \x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
+ 0: \x{30a1}\x{3041}\x{3007}\x{3007}
+ \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
+ 0: \x{1100}\x{2e80}\x{2e80}\x{1101}
+ \x{2e80}\x{3105}\x{2e80}\x{3105} Han Bopomofo Han Bopomofo
+ 0: \x{2e80}\x{3105}\x{2e80}\x{3105}
+ \x{02ea}\x{2e80}\x{2e80}\x{3105} Bopomofo-Sk Han Han Bopomofo
+ 0: \x{2ea}\x{2e80}\x{2e80}\x{3105}
+ \x{3105}\x{2e80}\x{2e80}\x{3105} Bopomofo Han Han Bopomofo
+ 0: \x{3105}\x{2e80}\x{2e80}\x{3105}
+ \x{0300}cd! Inherited Latin Latin Common
+ 0: \x{300}cd!
+ \x{0391}12\x{03a9} Greek Common-digits Greek
+ 0: \x{391}12\x{3a9}
+ \x{0400}12\x{fe2f} Cyrillic Common-digits Cyrillic
+ 0: \x{400}12\x{fe2f}
+ \x{0531}12\x{fb17} Armenian Common-digits Armenian
+ 0: \x{531}12\x{fb17}
+ \x{0591}12\x{fb4f} Hebrew Common-digits Hebrew
+ 0: \x{591}12\x{fb4f}
+ \x{0600}12\x{1eef1} Arabic Common-digits Arabic
+ 0: \x{600}12\x{1eef1}
+ \x{0600}\x{0660}\x{0669}\x{1eef1} Arabic Arabic-digits Arabic
+ 0: \x{600}\x{660}\x{669}\x{1eef1}
+ \x{0700}12\x{086a} Syriac Common-digits Syriac
+ 0: \x{700}12\x{86a}
+ \x{1200}12\x{ab2e} Ethiopic Common-digits Ethiopic
+ 0: \x{1200}12\x{ab2e}
+ \x{1680}12\x{169c} Ogham Common-digits Ogham
+ 0: \x{1680}12\x{169c}
+ \x{3041}12\x{3041} Hiragana Common-digits Hiragana
+ 0: \x{3041}12\x{3041}
+ \x{0980}\x{09e6}\x{09e7}\x{0993} Bengali Bengali-digits Bengali
+ 0: \x{980}\x{9e6}\x{9e7}\x{993}
+ !cde Common Latin Latin Latin
+ 0: !cde
+ A..B Latin Common Common Latin
+ 0: A..B
+ 0abc Ascii-digit Latin Latin Latin
+ 0: 0abc
+ 1\x{0700}\x{0700}\x{0700} Ascii-digit Syriac x 3
+ 0: 1\x{700}\x{700}\x{700}
+ \x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters
+ 0: \x{1a80}\x{1a80}\x{1a40}\x{1a41}
+\= Expect no match
+ a\x{370}bcd Latin Greek Latin Latin
+No match
+ \x{1100}\x{02ea}\x{02ea}\x{02ea} Hangul Bopomofo x3
+No match
+ \x{02ea}\x{02ea}\x{02ea}\x{1100} Bopomofo x3 Hangul
+No match
+ \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
+No match
+ \x{0391}\x{09e6}\x{09e7}\x{03a9} Greek Bengali digits Greek
+No match
+ \x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic
+No match
+ \x{0600}\x{0669}7\x{1eef1} Arabic Arabic-digit ascii-digit Arabic
+No match
+ A5\x{ff19}B Latin Common-ascii/notascii-digits Latin
+No match
+ \x{0300}cd\x{0391} Inherited Latin Latin Greek
+No match
+ !cd\x{0391} Common Latin Latin Greek
+No match
+ \x{1A80}\x{1A90}\x{1a40}\x{1a41} Tai Tham Hora digit, Tham digit, letters
+No match
+ A\x{1d7ce}\x{1d7ff}B Common fancy-common-2-sets-digits Common
+No match
+ \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
+No match
+
+/^(*sr:.{4}|..)/utf
+ \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
+ 0: \x{2e80}\x{3105}
+
+/^(*atomic_script_run:.{4}|..)/utf
+\= Expect no match
+ \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
+No match
+
+/^(*asr:.*)/utf
+\= Expect no match
+ \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
+No match
+
+/^(?>(*sr:.*))/utf
+ \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
+ 0: \x{2e80}\x{3105}\x{2e80}
+
+/^(*sr:.*)/utf
+ \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
+ 0: \x{2e80}\x{3105}\x{2e80}
+ \x{10fffd}\x{10fffd}\x{10fffd} Private use (Unknown)
+ 0: \x{10fffd}
+
+/^(*sr:\x{2e80}*)/utf
+ \x{2e80}\x{2e80}\x{3105} Han Han Bopomofo
+ 0: \x{2e80}\x{2e80}
+
+/^(*sr:\x{2e80}*)\x{2e80}/utf
+ \x{2e80}\x{2e80}\x{3105} Han Han Bopomofo
+ 0: \x{2e80}\x{2e80}
+
+/^(*sr:.*)Test/utf
+ Test script run on an empty string
+ 0: Test
+
+/^(*sr:(.{2})){2}/utf
+ \x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic
+ 0: \x{600}7\x{669}\x{1eef1}
+ 1: \x{669}\x{1eef1}
+ \x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters
+ 0: \x{1a80}\x{1a80}\x{1a40}\x{1a41}
+ 1: \x{1a40}\x{1a41}
+ \x{1A80}\x{1a40}\x{1A90}\x{1a41} Tai Tham Hora digit, letter, Tham digit, letter
+ 0: \x{1a80}\x{1a40}\x{1a90}\x{1a41}
+ 1: \x{1a90}\x{1a41}
+\= Expect no match
+ \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
+No match
+
+# Test loop breaking for empty string match
+
+/^(*sr:A|)*BCD/utf
+ AABCD
+ 0: AABCD
+ ABCD
+ 0: ABCD
+ BCD
+ 0: BCD
+
+# The use of (*ACCEPT) breaks script run checking
+
+/^(*sr:.*(*ACCEPT)ZZ)/utf
+ \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
+ 0: \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul
+
+# -------
# End of testinput4
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index 51caa18..406db22 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -4775,5 +4775,64 @@ Failed: error 137 at offset 2: PCRE2 does not support \F, \L, \l, \N{name}, \U,
abc
0: abc
MK: ABC
+
+# Script run tests: auto-possessification
+
+/^(*sr:.*)/B,utf
+------------------------------------------------------------------
+ Bra
+ ^
+ Script run
+ Any*
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+ paypаl.com A classic example of why script run checks are a good thing
+ 0: payp
+
+/^(*sr:\x{2e80}*)/B,utf
+------------------------------------------------------------------
+ Bra
+ ^
+ Script run
+ \x{2e80}*+
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+
+/^(*sr:\x{2e80}*)\x{2e80}/B,utf
+------------------------------------------------------------------
+ Bra
+ ^
+ Script run
+ \x{2e80}*
+ Ket
+ \x{2e80}
+ Ket
+ End
+------------------------------------------------------------------
+
+# Some script run patterns are broken in Perl 5.28.0. These can be moved into
+# test 4 when a mended version of Perl is released.
+
+/^(*sr:.{4})/utf
+ \x{0980}12\x{0993} Bengali Common-digits Bengali
+ 0: \x{980}12\x{993}
+ \x{0780}12\x{07b1} Thaana Common-digits Thaana
+ 0: \x{780}12\x{7b1}
+ \x{0e01}12\x{0e5b} Thai Common-digits Thai
+ 0: \x{e01}12\x{e5b}
+ \x{1780}12\x{19ff} Khmer Common-digits Khmer
+ 0: \x{1780}12\x{19ff}
+ \x{0904}12\x{0939} Devanagari Common-digits Devanagari
+ 0: \x{904}12\x{939}
+ A\x{ff10}\x{ff19}B Latin Common-notascii-digits Latin
+ 0: A\x{ff10}\x{ff19}B
+ A\x{1d7ce}\x{1d7cf}B Latin fancy-common-digits Latin
+ 0: A\x{1d7ce}\x{1d7cf}B
+
+# -------
# End of testinput5