diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-03-19 19:29:17 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2011-03-20 12:16:13 -0600 |
commit | 90826b5cd27738a30509f332296d8b985731d3fc (patch) | |
tree | 0dfdc33edd5165afb20ffa9e6a9712d56d222e31 | |
parent | e286af2d135c6b1b03be2bd322f22f89e1b1aa5d (diff) | |
download | perl-90826b5cd27738a30509f332296d8b985731d3fc.tar.gz |
regcharclass: Add tricky fold characters.
The set of tricky fold characters needs to be expanded to include the other
characters whose folds are the same as those of the original set. This isn't
because the new ones have a length issue; it's because they get left out of
comparisons due to the special regnodes generated for the tricky ones.
-rw-r--r-- | regcharclass.h | 82 | ||||
-rwxr-xr-x | regen/regcharclass.pl | 5 |
2 files changed, 81 insertions, 6 deletions
diff --git a/regcharclass.h b/regcharclass.h index ea5cb99733..47d4b41925 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -361,9 +361,12 @@ /* TRICKYFOLD: Problematic fold case letters. - 0x00DF # LATIN1 SMALL LETTER SHARP S + 0x00DF # LATIN SMALL LETTER SHARP S 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + 0x1E9E # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF + 0x1FD3 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390 + 0x1FE3 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0 */ /*** GENERATED CODE ***/ #define is_TRICKYFOLD(s,is_utf8) \ @@ -372,12 +375,32 @@ ( ( 0x9F == ((U8*)s)[1] ) ? 2 : 0 ) \ : ( 0xCE == ((U8*)s)[0] ) ? \ ( ( 0x90 == ((U8*)s)[1] || 0xB0 == ((U8*)s)[1] ) ? 2 : 0 ) \ + : ( 0xE1 == ((U8*)s)[0] ) ? \ + ( ( 0xBA == ((U8*)s)[1] ) ? \ + ( ( 0x9E == ((U8*)s)[2] ) ? 3 : 0 ) \ + : ( 0xBF == ((U8*)s)[1] ) ? \ + ( ( 0x93 == ((U8*)s)[2] || 0xA3 == ((U8*)s)[2] ) ? 3 : 0 ) \ + : 0 ) \ : 0 ) \ : ( 0xDF == ((U8*)s)[0] ) ) /*** GENERATED CODE ***/ #define is_TRICKYFOLD_safe(s,e,is_utf8) \ -( ((e)-(s) > 1) ? \ +( ((e)-(s) > 2) ? \ + ( ( is_utf8 ) ? \ + ( ( 0xC3 == ((U8*)s)[0] ) ? \ + ( ( 0x9F == ((U8*)s)[1] ) ? 2 : 0 ) \ + : ( 0xCE == ((U8*)s)[0] ) ? \ + ( ( 0x90 == ((U8*)s)[1] || 0xB0 == ((U8*)s)[1] ) ? 2 : 0 ) \ + : ( 0xE1 == ((U8*)s)[0] ) ? \ + ( ( 0xBA == ((U8*)s)[1] ) ? \ + ( ( 0x9E == ((U8*)s)[2] ) ? 3 : 0 ) \ + : ( 0xBF == ((U8*)s)[1] ) ? \ + ( ( 0x93 == ((U8*)s)[2] || 0xA3 == ((U8*)s)[2] ) ? 3 : 0 ) \ + : 0 ) \ + : 0 ) \ + : ( 0xDF == ((U8*)s)[0] ) ) \ +: ((e)-(s) > 1) ? \ ( ( is_utf8 ) ? \ ( ( 0xC3 == ((U8*)s)[0] ) ? \ ( ( 0x9F == ((U8*)s)[1] ) ? 
2 : 0 ) \ @@ -395,7 +418,10 @@ #define is_TRICKYFOLD_cp(cp) \ ( 0xDF == cp || ( 0xDF < cp && \ ( 0x390 == cp || ( 0x390 < cp && \ -0x3B0 == cp ) ) ) ) +( 0x3B0 == cp || ( 0x3B0 < cp && \ +( 0x1E9E == cp || ( 0x1E9E < cp && \ +( 0x1FD3 == cp || ( 0x1FD3 < cp && \ +0x1FE3 == cp ) ) ) ) ) ) ) ) ) ) /*** GENERATED CODE ***/ #define what_TRICKYFOLD(s,is_utf8) \ @@ -405,12 +431,35 @@ : ( 0xCE == ((U8*)s)[0] ) ? \ ( ( 0x90 == ((U8*)s)[1] ) ? 0x390 \ : ( 0xB0 == ((U8*)s)[1] ) ? 0x3B0 : 0 ) \ + : ( 0xE1 == ((U8*)s)[0] ) ? \ + ( ( 0xBA == ((U8*)s)[1] ) ? \ + ( ( 0x9E == ((U8*)s)[2] ) ? 0x1E9E : 0 ) \ + : ( 0xBF == ((U8*)s)[1] ) ? \ + ( ( 0x93 == ((U8*)s)[2] ) ? 0x1FD3 \ + : ( 0xA3 == ((U8*)s)[2] ) ? 0x1FE3 : 0 ) \ + : 0 ) \ : 0 ) \ : ( 0xDF == ((U8*)s)[0] ) ? 0xDF : 0 ) /*** GENERATED CODE ***/ #define what_TRICKYFOLD_safe(s,e,is_utf8) \ -( ((e)-(s) > 1) ? \ +( ((e)-(s) > 2) ? \ + ( ( is_utf8 ) ? \ + ( ( 0xC3 == ((U8*)s)[0] ) ? \ + ( ( 0x9F == ((U8*)s)[1] ) ? 0xDF : 0 ) \ + : ( 0xCE == ((U8*)s)[0] ) ? \ + ( ( 0x90 == ((U8*)s)[1] ) ? 0x390 \ + : ( 0xB0 == ((U8*)s)[1] ) ? 0x3B0 : 0 ) \ + : ( 0xE1 == ((U8*)s)[0] ) ? \ + ( ( 0xBA == ((U8*)s)[1] ) ? \ + ( ( 0x9E == ((U8*)s)[2] ) ? 0x1E9E : 0 ) \ + : ( 0xBF == ((U8*)s)[1] ) ? \ + ( ( 0x93 == ((U8*)s)[2] ) ? 0x1FD3 \ + : ( 0xA3 == ((U8*)s)[2] ) ? 0x1FE3 : 0 ) \ + : 0 ) \ + : 0 ) \ + : ( 0xDF == ((U8*)s)[0] ) ? 0xDF : 0 ) \ +: ((e)-(s) > 1) ? \ ( ( is_utf8 ) ? \ ( ( 0xC3 == ((U8*)s)[0] ) ? \ ( ( 0x9F == ((U8*)s)[1] ) ? 0xDF : 0 ) \ @@ -431,12 +480,35 @@ : ( 0xCE == ((U8*)s)[0] ) ? \ ( ( 0x90 == ((U8*)s)[1] ) ? len=2, 0x390 \ : ( 0xB0 == ((U8*)s)[1] ) ? len=2, 0x3B0 : 0 ) \ + : ( 0xE1 == ((U8*)s)[0] ) ? \ + ( ( 0xBA == ((U8*)s)[1] ) ? \ + ( ( 0x9E == ((U8*)s)[2] ) ? len=3, 0x1E9E : 0 ) \ + : ( 0xBF == ((U8*)s)[1] ) ? \ + ( ( 0x93 == ((U8*)s)[2] ) ? len=3, 0x1FD3 \ + : ( 0xA3 == ((U8*)s)[2] ) ? len=3, 0x1FE3 : 0 ) \ + : 0 ) \ : 0 ) \ : ( 0xDF == ((U8*)s)[0] ) ? 
len=1, 0xDF : 0 ) /*** GENERATED CODE ***/ #define what_len_TRICKYFOLD_safe(s,e,is_utf8,len) \ -( ((e)-(s) > 1) ? \ +( ((e)-(s) > 2) ? \ + ( ( is_utf8 ) ? \ + ( ( 0xC3 == ((U8*)s)[0] ) ? \ + ( ( 0x9F == ((U8*)s)[1] ) ? len=2, 0xDF : 0 ) \ + : ( 0xCE == ((U8*)s)[0] ) ? \ + ( ( 0x90 == ((U8*)s)[1] ) ? len=2, 0x390 \ + : ( 0xB0 == ((U8*)s)[1] ) ? len=2, 0x3B0 : 0 ) \ + : ( 0xE1 == ((U8*)s)[0] ) ? \ + ( ( 0xBA == ((U8*)s)[1] ) ? \ + ( ( 0x9E == ((U8*)s)[2] ) ? len=3, 0x1E9E : 0 ) \ + : ( 0xBF == ((U8*)s)[1] ) ? \ + ( ( 0x93 == ((U8*)s)[2] ) ? len=3, 0x1FD3 \ + : ( 0xA3 == ((U8*)s)[2] ) ? len=3, 0x1FE3 : 0 ) \ + : 0 ) \ + : 0 ) \ + : ( 0xDF == ((U8*)s)[0] ) ? len=1, 0xDF : 0 ) \ +: ((e)-(s) > 1) ? \ ( ( is_utf8 ) ? \ ( ( 0xC3 == ((U8*)s)[0] ) ? \ ( ( 0x9F == ((U8*)s)[1] ) ? len=2, 0xDF : 0 ) \ diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index c3ea8a62b0..2e89b2da28 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -731,6 +731,9 @@ VERTWS: Vertical Whitespace: \v \V TRICKYFOLD: Problematic fold case letters. => generic cp generic-cp generic-both :fast safe -0x00DF # LATIN1 SMALL LETTER SHARP S +0x00DF # LATIN SMALL LETTER SHARP S 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS +0x1E9E # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF +0x1FD3 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390 +0x1FE3 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0 |