diff options
author | ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> | 2018-10-02 15:25:58 +0000 |
---|---|---|
committer | ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> | 2018-10-02 15:25:58 +0000 |
commit | 3899f6557728000c2cfd428cddc597e377baddc2 (patch) | |
tree | aaa05107db98463f3a50a7ca6d3eb9f83a985f3d /testdata | |
parent | af1cda3afb77f3e43c3c8069bd3b784abbcc2036 (diff) | |
download | pcre2-3899f6557728000c2cfd428cddc597e377baddc2.tar.gz |
Basic "script run" implementation. Not yet complete, and not yet documented.
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1019 6239d852-aaf2-0410-a92c-79f79f948069
Diffstat (limited to 'testdata')
-rw-r--r-- | testdata/testinput12 | 11 | ||||
-rw-r--r-- | testdata/testinput4 | 91 | ||||
-rw-r--r-- | testdata/testinput5 | 23 | ||||
-rw-r--r-- | testdata/testoutput12-16 | 15 | ||||
-rw-r--r-- | testdata/testoutput12-32 | 16 | ||||
-rw-r--r-- | testdata/testoutput4 | 149 | ||||
-rw-r--r-- | testdata/testoutput5 | 59 |
7 files changed, 364 insertions, 0 deletions
diff --git a/testdata/testinput12 b/testdata/testinput12 index 6a996aa..29d1095 100644 --- a/testdata/testinput12 +++ b/testdata/testinput12 @@ -386,5 +386,16 @@ /(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout 123abcáyzabcdef789abcሴqr + +# A few script run tests in non-UTF mode (but they need Unicode support) + +/^(*script_run:.{4})/ + \x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han + \x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han + \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul + +/^(*sr:.*)/utf,allow_surrogate_escapes + \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana + \x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check # End of testinput12 diff --git a/testdata/testinput4 b/testdata/testinput4 index a27b6af..eea087d 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -2317,5 +2317,96 @@ /[^\x{100}-\x{ffff}]*[\x80-\xff]/i,utf \x{99}\x{99}\x{99} + +# Script run tests + +/^(*script_run:.{4})/utf + abcd Latin x4 + \x{2e80}\x{2fa1d}\x{3041}\x{30a1} Han Han Hiragana Katakana + \x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han + \x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han + \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul + \x{2e80}\x{3105}\x{2e80}\x{3105} Han Bopomofo Han Bopomofo + \x{02ea}\x{2e80}\x{2e80}\x{3105} Bopomofo-Sk Han Han Bopomofo + \x{3105}\x{2e80}\x{2e80}\x{3105} Bopomofo Han Han Bopomofo + \x{0300}cd! Inherited Latin Latin Common + \x{0391}12\x{03a9} Greek Common-digits Greek + \x{0400}12\x{fe2f} Cyrillic Common-digits Cyrillic + \x{0531}12\x{fb17} Armenian Common-digits Armenian + \x{0591}12\x{fb4f} Hebrew Common-digits Hebrew + \x{0600}12\x{1eef1} Arabic Common-digits Arabic + \x{0600}\x{0660}\x{0669}\x{1eef1} Arabic Arabic-digits Arabic + \x{0700}12\x{086a} Syriac Common-digits Syriac + \x{1200}12\x{ab2e} Ethiopic Common-digits Ethiopic + \x{1680}12\x{169c} Ogham Common-digits Ogham + \x{3041}12\x{3041} Hiragana Common-digits Hiragana + \x{0980}\x{09e6}\x{09e7}\x{0993} Bengali Bengali-digits Bengali + !cde Common Latin Latin Latin + A..B Latin Common Common Latin + 0abc Ascii-digit Latin Latin Latin + 1\x{0700}\x{0700}\x{0700} Ascii-digit Syriac x 3 + \x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters +\= Expect no match + a\x{370}bcd Latin Greek Latin Latin + \x{1100}\x{02ea}\x{02ea}\x{02ea} Hangul Bopomofo x3 + \x{02ea}\x{02ea}\x{02ea}\x{1100} Bopomofo x3 Hangul + \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul + \x{0391}\x{09e6}\x{09e7}\x{03a9} Greek Bengali digits Greek + \x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic + \x{0600}\x{0669}7\x{1eef1} Arabic Arabic-digit ascii-digit Arabic + A5\x{ff19}B Latin Common-ascii/notascii-digits Latin + \x{0300}cd\x{0391} Inherited Latin Latin Greek + !cd\x{0391} Common Latin Latin Greek + \x{1A80}\x{1A90}\x{1a40}\x{1a41} Tai Tham Hora digit, Tham digit, letters + A\x{1d7ce}\x{1d7ff}B Common fancy-common-2-sets-digits Common + \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana + +/^(*sr:.{4}|..)/utf + \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana + +/^(*atomic_script_run:.{4}|..)/utf +\= Expect no match + \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana + +/^(*asr:.*)/utf +\= Expect no match + \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana + +/^(?>(*sr:.*))/utf + \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana + +/^(*sr:.*)/utf + \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana + \x{10fffd}\x{10fffd}\x{10fffd} Private use (Unknown) + +/^(*sr:\x{2e80}*)/utf + \x{2e80}\x{2e80}\x{3105} Han Han Bopomofo + +/^(*sr:\x{2e80}*)\x{2e80}/utf + \x{2e80}\x{2e80}\x{3105} Han Han Bopomofo + +/^(*sr:.*)Test/utf + Test script run on an empty string + +/^(*sr:(.{2})){2}/utf + \x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic + \x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters + \x{1A80}\x{1a40}\x{1A90}\x{1a41} Tai Tham Hora digit, letter, Tham digit, letter +\= Expect no match + \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul + +# Test loop breaking for empty string match + +/^(*sr:A|)*BCD/utf + AABCD + ABCD + BCD + +# The use of (*ACCEPT) breaks script run checking + +/^(*sr:.*(*ACCEPT)ZZ)/utf + \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul + +# ------- # End of testinput4 diff --git a/testdata/testinput5 b/testdata/testinput5 index 687de32..9730e0b 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -2106,5 +2106,28 @@ /(*: AB
C)abc/x,utf,mark,alt_verbnames abc + +# Script run tests: auto-possessification + +/^(*sr:.*)/B,utf + paypаl.com A classic example of why script run checks are a good thing + +/^(*sr:\x{2e80}*)/B,utf + +/^(*sr:\x{2e80}*)\x{2e80}/B,utf + +# Some script run patterns are broken in Perl 5.28.0. These can be moved into +# test 4 when a mended version of Perl is released. + +/^(*sr:.{4})/utf + \x{0980}12\x{0993} Bengali Common-digits Bengali + \x{0780}12\x{07b1} Thaana Common-digits Thaana + \x{0e01}12\x{0e5b} Thai Common-digits Thai + \x{1780}12\x{19ff} Khmer Common-digits Khmer + \x{0904}12\x{0939} Devanagari Common-digits Devanagari + A\x{ff10}\x{ff19}B Latin Common-notascii-digits Latin + A\x{1d7ce}\x{1d7cf}B Latin fancy-common-digits Latin + +# ------- # End of testinput5 diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16 index e29243a..72dde81 100644 --- a/testdata/testoutput12-16 +++ b/testdata/testoutput12-16 @@ -1480,5 +1480,20 @@ Old 12 12 New 14 16 Old 12 15 New 16 21 Old 21 21 New 27 29 4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr + +# A few script run tests in non-UTF mode (but they need Unicode support) + +/^(*script_run:.{4})/ + \x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han + 0: \x{3041}\x{30a1}\x{3007}\x{3007} + \x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han + 0: \x{30a1}\x{3041}\x{3007}\x{3007} + \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul + 0: \x{1100}\x{2e80}\x{2e80}\x{1101} + +/^(*sr:.*)/utf,allow_surrogate_escapes +Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode + \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana + \x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check # End of testinput12 diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32 index 706e0c0..d2c59e5 100644 --- a/testdata/testoutput12-32 +++ b/testdata/testoutput12-32 @@ -1477,5 +1477,21 @@ Old 12 12 New 14 16 Old 12 15 New 16 21 Old 21 21 New 27 29 4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr + +# A few script run tests in non-UTF mode (but they need Unicode support) + +/^(*script_run:.{4})/ + \x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han + 0: \x{3041}\x{30a1}\x{3007}\x{3007} + \x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han + 0: \x{30a1}\x{3041}\x{3007}\x{3007} + \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul + 0: \x{1100}\x{2e80}\x{2e80}\x{1101} + +/^(*sr:.*)/utf,allow_surrogate_escapes + \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana + 0: \x{2e80}\x{3105}\x{2e80} + \x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check + 0: \x{d800} # End of testinput12 diff --git a/testdata/testoutput4 b/testdata/testoutput4 index ba3df37..8a53cf7 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -3741,5 +3741,154 @@ No match /[^\x{100}-\x{ffff}]*[\x80-\xff]/i,utf \x{99}\x{99}\x{99} 0: \x{99}\x{99}\x{99} + +# Script run tests + +/^(*script_run:.{4})/utf + abcd Latin x4 + 0: abcd + \x{2e80}\x{2fa1d}\x{3041}\x{30a1} Han Han Hiragana Katakana + 0: \x{2e80}\x{2fa1d}\x{3041}\x{30a1} + \x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han + 0: \x{3041}\x{30a1}\x{3007}\x{3007} + \x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han + 0: \x{30a1}\x{3041}\x{3007}\x{3007} + \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul + 0: \x{1100}\x{2e80}\x{2e80}\x{1101} + \x{2e80}\x{3105}\x{2e80}\x{3105} Han Bopomofo Han Bopomofo + 0: \x{2e80}\x{3105}\x{2e80}\x{3105} + \x{02ea}\x{2e80}\x{2e80}\x{3105} Bopomofo-Sk Han Han Bopomofo + 0: \x{2ea}\x{2e80}\x{2e80}\x{3105} + \x{3105}\x{2e80}\x{2e80}\x{3105} Bopomofo Han Han Bopomofo + 0: \x{3105}\x{2e80}\x{2e80}\x{3105} + \x{0300}cd! Inherited Latin Latin Common + 0: \x{300}cd! + \x{0391}12\x{03a9} Greek Common-digits Greek + 0: \x{391}12\x{3a9} + \x{0400}12\x{fe2f} Cyrillic Common-digits Cyrillic + 0: \x{400}12\x{fe2f} + \x{0531}12\x{fb17} Armenian Common-digits Armenian + 0: \x{531}12\x{fb17} + \x{0591}12\x{fb4f} Hebrew Common-digits Hebrew + 0: \x{591}12\x{fb4f} + \x{0600}12\x{1eef1} Arabic Common-digits Arabic + 0: \x{600}12\x{1eef1} + \x{0600}\x{0660}\x{0669}\x{1eef1} Arabic Arabic-digits Arabic + 0: \x{600}\x{660}\x{669}\x{1eef1} + \x{0700}12\x{086a} Syriac Common-digits Syriac + 0: \x{700}12\x{86a} + \x{1200}12\x{ab2e} Ethiopic Common-digits Ethiopic + 0: \x{1200}12\x{ab2e} + \x{1680}12\x{169c} Ogham Common-digits Ogham + 0: \x{1680}12\x{169c} + \x{3041}12\x{3041} Hiragana Common-digits Hiragana + 0: \x{3041}12\x{3041} + \x{0980}\x{09e6}\x{09e7}\x{0993} Bengali Bengali-digits Bengali + 0: \x{980}\x{9e6}\x{9e7}\x{993} + !cde Common Latin Latin Latin + 0: !cde + A..B Latin Common Common Latin + 0: A..B + 0abc Ascii-digit Latin Latin Latin + 0: 0abc + 1\x{0700}\x{0700}\x{0700} Ascii-digit Syriac x 3 + 0: 1\x{700}\x{700}\x{700} + \x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters + 0: \x{1a80}\x{1a80}\x{1a40}\x{1a41} +\= Expect no match + a\x{370}bcd Latin Greek Latin Latin +No match + \x{1100}\x{02ea}\x{02ea}\x{02ea} Hangul Bopomofo x3 +No match + \x{02ea}\x{02ea}\x{02ea}\x{1100} Bopomofo x3 Hangul +No match + \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul +No match + \x{0391}\x{09e6}\x{09e7}\x{03a9} Greek Bengali digits Greek +No match + \x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic +No match + \x{0600}\x{0669}7\x{1eef1} Arabic Arabic-digit ascii-digit Arabic +No match + A5\x{ff19}B Latin Common-ascii/notascii-digits Latin +No match + \x{0300}cd\x{0391} Inherited Latin Latin Greek +No match + !cd\x{0391} Common Latin Latin Greek +No match + \x{1A80}\x{1A90}\x{1a40}\x{1a41} Tai Tham Hora digit, Tham digit, letters +No match + A\x{1d7ce}\x{1d7ff}B Common fancy-common-2-sets-digits Common +No match + \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana +No match + +/^(*sr:.{4}|..)/utf + \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana + 0: \x{2e80}\x{3105} + +/^(*atomic_script_run:.{4}|..)/utf +\= Expect no match + \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana +No match + +/^(*asr:.*)/utf +\= Expect no match + \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana +No match + +/^(?>(*sr:.*))/utf + \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana + 0: \x{2e80}\x{3105}\x{2e80} + +/^(*sr:.*)/utf + \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana + 0: \x{2e80}\x{3105}\x{2e80} + \x{10fffd}\x{10fffd}\x{10fffd} Private use (Unknown) + 0: \x{10fffd} + +/^(*sr:\x{2e80}*)/utf + \x{2e80}\x{2e80}\x{3105} Han Han Bopomofo + 0: \x{2e80}\x{2e80} + +/^(*sr:\x{2e80}*)\x{2e80}/utf + \x{2e80}\x{2e80}\x{3105} Han Han Bopomofo + 0: \x{2e80}\x{2e80} + +/^(*sr:.*)Test/utf + Test script run on an empty string + 0: Test + +/^(*sr:(.{2})){2}/utf + \x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic + 0: \x{600}7\x{669}\x{1eef1} + 1: \x{669}\x{1eef1} + \x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters + 0: \x{1a80}\x{1a80}\x{1a40}\x{1a41} + 1: \x{1a40}\x{1a41} + \x{1A80}\x{1a40}\x{1A90}\x{1a41} Tai Tham Hora digit, letter, Tham digit, letter + 0: \x{1a80}\x{1a40}\x{1a90}\x{1a41} + 1: \x{1a90}\x{1a41} +\= Expect no match + \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul +No match + +# Test loop breaking for empty string match + +/^(*sr:A|)*BCD/utf + AABCD + 0: AABCD + ABCD + 0: ABCD + BCD + 0: BCD + +# The use of (*ACCEPT) breaks script run checking + +/^(*sr:.*(*ACCEPT)ZZ)/utf + \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul + 0: \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul + +# ------- # End of testinput4 diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 51caa18..406db22 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -4775,5 +4775,64 @@ Failed: error 137 at offset 2: PCRE2 does not support \F, \L, \l, \N{name}, \U, abc 0: abc MK: ABC + +# Script run tests: auto-possessification + +/^(*sr:.*)/B,utf +------------------------------------------------------------------ + Bra + ^ + Script run + Any* + Ket + Ket + End +------------------------------------------------------------------ + paypаl.com A classic example of why script run checks are a good thing + 0: payp + +/^(*sr:\x{2e80}*)/B,utf +------------------------------------------------------------------ + Bra + ^ + Script run + \x{2e80}*+ + Ket + Ket + End +------------------------------------------------------------------ + +/^(*sr:\x{2e80}*)\x{2e80}/B,utf +------------------------------------------------------------------ + Bra + ^ + Script run + \x{2e80}* + Ket + \x{2e80} + Ket + End +------------------------------------------------------------------ + +# Some script run patterns are broken in Perl 5.28.0. These can be moved into +# test 4 when a mended version of Perl is released. + +/^(*sr:.{4})/utf + \x{0980}12\x{0993} Bengali Common-digits Bengali + 0: \x{980}12\x{993} + \x{0780}12\x{07b1} Thaana Common-digits Thaana + 0: \x{780}12\x{7b1} + \x{0e01}12\x{0e5b} Thai Common-digits Thai + 0: \x{e01}12\x{e5b} + \x{1780}12\x{19ff} Khmer Common-digits Khmer + 0: \x{1780}12\x{19ff} + \x{0904}12\x{0939} Devanagari Common-digits Devanagari + 0: \x{904}12\x{939} + A\x{ff10}\x{ff19}B Latin Common-notascii-digits Latin + 0: A\x{ff10}\x{ff19}B + A\x{1d7ce}\x{1d7cf}B Latin fancy-common-digits Latin + 0: A\x{1d7ce}\x{1d7cf}B + +# ------- # End of testinput5 |