diff options
-rw-r--r-- | regcharclass.h | 46 | ||||
-rwxr-xr-x | regen/regcharclass.pl | 34 |
2 files changed, 48 insertions, 32 deletions
diff --git a/regcharclass.h b/regcharclass.h index 3b3a823b1a..df0a654b4a 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -1395,7 +1395,7 @@ ( ( inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(((const U8*)s)[3]), 0xA0, 0xAF) ) ? 4 : 0 )\ : ( ( 0x73 == ((const U8*)s)[2] ) && ( inRANGE_helper_(U8, ((const U8*)s)[3], 0x72, 0x73) ) ) ? 4 : 0 )\ : 0 ) \ - : ( NATIVE_UTF8_TO_I8(((const U8*)s)[0]) == 0xF3 || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[0]) & 0xFD ) == 0xF5 ) ) ?\ + : ( ((const U8*)s)[0] == 0xDF || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[0]) & 0xFD ) == 0xF5 ) ) ?\ ( ( ( ( 0x73 == ((const U8*)s)[1] ) && ( 0x73 == ((const U8*)s)[2] ) ) && ( inRANGE_helper_(U8, ((const U8*)s)[3], 0x72, 0x73) ) ) ? 4 : 0 )\ : ( 0xED == ((const U8*)s)[0] ) ? \ ( ( ( ( ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xF9 ) == 0xA9 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xF1 ) == 0xB1 ) ) && ( 0x73 == ((const U8*)s)[2] ) ) && ( 0x73 == ((const U8*)s)[3] ) ) && ( inRANGE_helper_(U8, ((const U8*)s)[4], 0x72, 0x73) ) ) ? 5 : 0 )\ @@ -1420,7 +1420,7 @@ ( ( inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(((const U8*)s)[3]), 0xA0, 0xAF) ) ? 4 : 0 )\ : ( ( 0x73 == ((const U8*)s)[2] ) && ( inRANGE_helper_(U8, ((const U8*)s)[3], 0x72, 0x73) ) ) ? 4 : 0 )\ : 0 ) \ -: ( ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[0]) == 0xF3 || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[0]) & 0xFD ) == 0xF5 ) ) && ( 0x73 == ((const U8*)s)[1] ) ) && ( 0x73 == ((const U8*)s)[2] ) ) && ( inRANGE_helper_(U8, ((const U8*)s)[3], 0x72, 0x73) ) ) ? 4 : 0 ) +: ( ( ( ( ((const U8*)s)[0] == 0xDF || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[0]) & 0xFD ) == 0xF5 ) ) && ( 0x73 == ((const U8*)s)[1] ) ) && ( 0x73 == ((const U8*)s)[2] ) ) && ( inRANGE_helper_(U8, ((const U8*)s)[3], 0x72, 0x73) ) ) ? 4 : 0 ) /* LARGER_NON_CHARS: # 5 bytes @@ -1476,7 +1476,7 @@ : ( 0xBE == ((const U8*)s)[0] ) ? \ ( ( ( 0x41 == ((const U8*)s)[1] ) && ( inRANGE_helper_(U8, ((const U8*)s)[2], 0x52, 0x55) ) ) ? 3 : 0 )\ : ( 0xCA == ((const U8*)s)[0] ) ? \ - ( ( 0xA0 == NATIVE_UTF8_TO_I8(((const U8*)s)[1]) || inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(((const U8*)s)[1]), 0xAD, 0xBF) ) ?\ + ( ( 0x41 == ((const U8*)s)[1] || inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(((const U8*)s)[1]), 0xAD, 0xBF) ) ?\ 3 \ : ( 0x42 == ((const U8*)s)[1] ) ? \ ( ( inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(((const U8*)s)[2]), 0xA0, 0xBE) ) ? 3 : 0 )\ @@ -1598,13 +1598,13 @@ ( ( 0x46 == ((const U8*)s)[2] ) ? \ ( ( ( ( 0xB8 == ((const U8*)s)[3] ) && ( 0x53 == ((const U8*)s)[4] ) ) && ( 0x43 == ((const U8*)s)[5] ) ) ? 6 : 0 )\ : ( 0x63 == ((const U8*)s)[2] ) ? \ - ( ( ( ( 0xB8 == ((const U8*)s)[3] ) && ( 0x52 == ((const U8*)s)[4] ) ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[5]) & 0xF7 ) == 0xA5 ) || NATIVE_UTF8_TO_I8(((const U8*)s)[5]) == 0xAB || NATIVE_UTF8_TO_I8(((const U8*)s)[5]) == 0xB6 ) ) ? 6 : 0 )\ + ( ( ( ( 0xB8 == ((const U8*)s)[3] ) && ( 0x52 == ((const U8*)s)[4] ) ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[5]) & 0xF7 ) == 0xA5 ) || ((const U8*)s)[5] == 0x52 || ((const U8*)s)[5] == 0x65 ) ) ? 6 : 0 )\ : ( ( ( ( 0x72 == ((const U8*)s)[2] ) && ( 0xB8 == ((const U8*)s)[3] ) ) && ( 0x52 == ((const U8*)s)[4] ) ) && ( 0x65 == ((const U8*)s)[5] ) ) ? 6 : 0 )\ : 0 ) \ : ( 0xBF == ((const U8*)s)[0] ) ? \ ( ( inRANGE_helper_(U8, ((const U8*)s)[1], 0x67, 0x68) ) ? \ ( ( ( ( inRANGE_helper_(U8, ((const U8*)s)[2], 0x41, 0x48) ) && ( 0xB4 == ((const U8*)s)[3] ) ) && ( 0x68 == ((const U8*)s)[4] ) ) ? 5 : 0 )\ - : ( ( ( ( 0x6A == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF8 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFB ) == 0xB0 ) || NATIVE_UTF8_TO_I8(((const U8*)s)[2]) == 0xBC ) ) && ( 0xB4 == ((const U8*)s)[3] ) ) && ( 0x68 == ((const U8*)s)[4] ) ) ? 5 : 0 )\ + : ( ( ( ( 0x6A == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF8 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFB ) == 0xB0 ) || ((const U8*)s)[2] == 0x70 ) ) && ( 0xB4 == ((const U8*)s)[3] ) ) && ( 0x68 == ((const U8*)s)[4] ) ) ? 5 : 0 )\ : 0 ) @@ -1661,7 +1661,7 @@ : ( 0xBF == ((const U8*)s)[0] ) ? \ ( ( inRANGE_helper_(U8, ((const U8*)s)[1], 0x67, 0x68) ) ? \ ( ( ( ( inRANGE_helper_(U8, ((const U8*)s)[2], 0x41, 0x48) ) && ( 0xB4 == ((const U8*)s)[3] ) ) && ( 0x68 == ((const U8*)s)[4] ) ) ? 5 : 0 )\ - : ( ( ( ( 0x6A == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF8 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFB ) == 0xB0 ) || NATIVE_UTF8_TO_I8(((const U8*)s)[2]) == 0xBC ) ) && ( 0xB4 == ((const U8*)s)[3] ) ) && ( 0x68 == ((const U8*)s)[4] ) ) ? 5 : 0 )\ + : ( ( ( ( 0x6A == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF8 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFB ) == 0xB0 ) || ((const U8*)s)[2] == 0x70 ) ) && ( 0xB4 == ((const U8*)s)[3] ) ) && ( 0x68 == ((const U8*)s)[4] ) ) ? 5 : 0 )\ : 0 ) @@ -2284,7 +2284,7 @@ : ( 0xBF == ((const U8*)s)[0] ) ? \ ( ( inRANGE_helper_(U8, ((const U8*)s)[1], 0x67, 0x68) ) ? \ ( ( inRANGE_helper_(U8, ((const U8*)s)[2], 0x41, 0x48) ) ? 3 : 0 )\ - : ( ( 0x6A == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF8 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFB ) == 0xB0 ) || NATIVE_UTF8_TO_I8(((const U8*)s)[2]) == 0xBC ) ) ? 3 : 0 )\ + : ( ( 0x6A == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF8 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFB ) == 0xB0 ) || ((const U8*)s)[2] == 0x70 ) ) ? 3 : 0 )\ : 0 ) \ : ((e)-(s) > 2) ? \ ( ( ( ( ((const U8*)s)[0] & 0xAF ) == 'a' ) || ( ( ((const U8*)s)[0] & 0xBE ) == 'h' ) || ( ( ((const U8*)s)[0] & 0xBE ) == 's' ) || ( ( ((const U8*)s)[0] & 0xBF ) == 'w' ) || ( ( ((const U8*)s)[0] & 0xBF ) == 'y' ) ) ? 1\ @@ -2295,7 +2295,7 @@ : ( 0xAB == ((const U8*)s)[0] ) ? \ ( ( 0x70 == ((const U8*)s)[1] ) ? 2 : 0 ) \ : ( 0xB4 == ((const U8*)s)[0] ) ? \ - ( ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xFD ) == 0xAC ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xF7 ) == 0xB1 ) || NATIVE_UTF8_TO_I8(((const U8*)s)[1]) == 0xB7 ) ? 2 : 0 )\ + ( ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xFD ) == 0xAC ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xF7 ) == 0xB1 ) || ((const U8*)s)[1] == 0x66 ) ? 2 : 0 )\ : ( 0xB5 == ((const U8*)s)[0] ) ? \ ( ( ( ( ((const U8*)s)[1] & 0xFB ) == 0x42 ) || ((const U8*)s)[1] == 0x4A || ((const U8*)s)[1] == 0x55 ) ? 2 : 0 )\ : ( 0xB8 == ((const U8*)s)[0] ) ? \ @@ -2303,7 +2303,7 @@ : ( 0xBF == ((const U8*)s)[0] ) ? \ ( ( inRANGE_helper_(U8, ((const U8*)s)[1], 0x67, 0x68) ) ? \ ( ( inRANGE_helper_(U8, ((const U8*)s)[2], 0x41, 0x48) ) ? 3 : 0 )\ - : ( ( 0x6A == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF8 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFB ) == 0xB0 ) || NATIVE_UTF8_TO_I8(((const U8*)s)[2]) == 0xBC ) ) ? 3 : 0 )\ + : ( ( 0x6A == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF8 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFB ) == 0xB0 ) || ((const U8*)s)[2] == 0x70 ) ) ? 3 : 0 )\ : 0 ) \ : ((e)-(s) > 1) ? \ ( ( ( ( ((const U8*)s)[0] & 0xAF ) == 'a' ) || ( ( ((const U8*)s)[0] & 0xBE ) == 'h' ) || ( ( ((const U8*)s)[0] & 0xBE ) == 's' ) || ( ( ((const U8*)s)[0] & 0xBF ) == 'w' ) || ( ( ((const U8*)s)[0] & 0xBF ) == 'y' ) ) ? 1\ @@ -2314,7 +2314,7 @@ : ( 0xAB == ((const U8*)s)[0] ) ? \ ( ( 0x70 == ((const U8*)s)[1] ) ? 2 : 0 ) \ : ( 0xB4 == ((const U8*)s)[0] ) ? \ - ( ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xFD ) == 0xAC ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xF7 ) == 0xB1 ) || NATIVE_UTF8_TO_I8(((const U8*)s)[1]) == 0xB7 ) ? 2 : 0 )\ + ( ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xFD ) == 0xAC ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xF7 ) == 0xB1 ) || ((const U8*)s)[1] == 0x66 ) ? 2 : 0 )\ : ( ( 0xB5 == ((const U8*)s)[0] ) && ( ( ( ((const U8*)s)[1] & 0xFB ) == 0x42 ) || ((const U8*)s)[1] == 0x4A || ((const U8*)s)[1] == 0x55 ) ) ? 2 : 0 )\ : ((e)-(s) > 0) ? \ ( ( ( ((const U8*)s)[0] & 0xAF ) == 'a' ) || ( ( ((const U8*)s)[0] & 0x9F ) == 'f' ) || ( ( ((const U8*)s)[0] & 0xBE ) == 'h' ) || ( ( ((const U8*)s)[0] & 0xBE ) == 's' ) || ( ( ((const U8*)s)[0] & 0xBF ) == 'y' ) )\ @@ -2362,7 +2362,7 @@ ( ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF0 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFA ) == 0xB2 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF7 ) == 0xB4 ) ) ? 3 : 0 )\ : ( 0x72 == ((const U8*)s)[1] ) ? \ ( ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xEA ) == 0xA2 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF7 ) == 0xA4 ) ) ? 3 : 0 )\ - : ( ( 0x73 == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xEA ) == 0xA2 ) || NATIVE_UTF8_TO_I8(((const U8*)s)[2]) == 0xA4 || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF7 ) == 0xB4 ) ) ) ? 3 : 0 )\ + : ( ( 0x73 == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xEA ) == 0xA2 ) || ((const U8*)s)[2] == 0x45 || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF7 ) == 0xB4 ) ) ) ? 3 : 0 )\ : ( ( ( ( 0xDD == ((const U8*)s)[0] ) && ( 0x72 == ((const U8*)s)[1] ) ) && ( 0x67 == ((const U8*)s)[2] ) ) && ( inRANGE_helper_(U8, ((const U8*)s)[3], 0x41, 0x47) || inRANGE_helper_(U8, ((const U8*)s)[3], 0x62, 0x66) ) ) ? 4 : 0 ) /* @@ -2620,7 +2620,7 @@ ( ( inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(((const U8*)s)[3]), 0xA0, 0xAF) ) ? 4 : 0 )\ : ( ( 0x72 == ((const U8*)s)[2] ) && ( inRANGE_helper_(U8, ((const U8*)s)[3], 0x71, 0x72) ) ) ? 4 : 0 )\ : 0 ) \ - : ( NATIVE_UTF8_TO_I8(((const U8*)s)[0]) == 0xF3 || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[0]) & 0xFD ) == 0xF5 ) ) ?\ + : ( ((const U8*)s)[0] == 0xDF || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[0]) & 0xFD ) == 0xF5 ) ) ?\ ( ( ( ( 0x72 == ((const U8*)s)[1] ) && ( 0x72 == ((const U8*)s)[2] ) ) && ( inRANGE_helper_(U8, ((const U8*)s)[3], 0x71, 0x72) ) ) ? 4 : 0 )\ : ( 0xED == ((const U8*)s)[0] ) ? \ ( ( ( ( ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xF9 ) == 0xA9 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xF1 ) == 0xB1 ) ) && ( 0x72 == ((const U8*)s)[2] ) ) && ( 0x72 == ((const U8*)s)[3] ) ) && ( inRANGE_helper_(U8, ((const U8*)s)[4], 0x71, 0x72) ) ) ? 5 : 0 )\ @@ -2645,7 +2645,7 @@ ( ( inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(((const U8*)s)[3]), 0xA0, 0xAF) ) ? 4 : 0 )\ : ( ( 0x72 == ((const U8*)s)[2] ) && ( inRANGE_helper_(U8, ((const U8*)s)[3], 0x71, 0x72) ) ) ? 4 : 0 )\ : 0 ) \ -: ( ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[0]) == 0xF3 || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[0]) & 0xFD ) == 0xF5 ) ) && ( 0x72 == ((const U8*)s)[1] ) ) && ( 0x72 == ((const U8*)s)[2] ) ) && ( inRANGE_helper_(U8, ((const U8*)s)[3], 0x71, 0x72) ) ) ? 4 : 0 ) +: ( ( ( ( ((const U8*)s)[0] == 0xDF || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[0]) & 0xFD ) == 0xF5 ) ) && ( 0x72 == ((const U8*)s)[1] ) ) && ( 0x72 == ((const U8*)s)[2] ) ) && ( inRANGE_helper_(U8, ((const U8*)s)[3], 0x71, 0x72) ) ) ? 4 : 0 ) /* LARGER_NON_CHARS: # 5 bytes @@ -2701,7 +2701,7 @@ : ( 0xBE == ((const U8*)s)[0] ) ? \ ( ( ( 0x41 == ((const U8*)s)[1] ) && ( inRANGE_helper_(U8, ((const U8*)s)[2], 0x52, 0x55) ) ) ? 3 : 0 )\ : ( 0xCA == ((const U8*)s)[0] ) ? \ - ( ( 0xA0 == NATIVE_UTF8_TO_I8(((const U8*)s)[1]) || inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(((const U8*)s)[1]), 0xAD, 0xBF) ) ?\ + ( ( 0x41 == ((const U8*)s)[1] || inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(((const U8*)s)[1]), 0xAD, 0xBF) ) ?\ 3 \ : ( 0x42 == ((const U8*)s)[1] ) ? \ ( ( inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(((const U8*)s)[2]), 0xA0, 0xBE) ) ? 3 : 0 )\ @@ -2823,13 +2823,13 @@ ( ( 0x46 == ((const U8*)s)[2] ) ? \ ( ( ( ( 0xB7 == ((const U8*)s)[3] ) && ( 0x53 == ((const U8*)s)[4] ) ) && ( 0x43 == ((const U8*)s)[5] ) ) ? 6 : 0 )\ : ( 0x62 == ((const U8*)s)[2] ) ? \ - ( ( ( ( 0xB7 == ((const U8*)s)[3] ) && ( 0x52 == ((const U8*)s)[4] ) ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[5]) & 0xF7 ) == 0xA5 ) || NATIVE_UTF8_TO_I8(((const U8*)s)[5]) == 0xAB || NATIVE_UTF8_TO_I8(((const U8*)s)[5]) == 0xB6 ) ) ? 6 : 0 )\ + ( ( ( ( 0xB7 == ((const U8*)s)[3] ) && ( 0x52 == ((const U8*)s)[4] ) ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[5]) & 0xF7 ) == 0xA5 ) || ((const U8*)s)[5] == 0x52 || ((const U8*)s)[5] == 0x64 ) ) ? 6 : 0 )\ : ( ( ( ( 0x71 == ((const U8*)s)[2] ) && ( 0xB7 == ((const U8*)s)[3] ) ) && ( 0x52 == ((const U8*)s)[4] ) ) && ( 0x64 == ((const U8*)s)[5] ) ) ? 6 : 0 )\ : 0 ) \ : ( 0xBF == ((const U8*)s)[0] ) ? \ ( ( inRANGE_helper_(U8, ((const U8*)s)[1], 0x66, 0x67) ) ? \ ( ( ( ( inRANGE_helper_(U8, ((const U8*)s)[2], 0x41, 0x48) ) && ( 0xB3 == ((const U8*)s)[3] ) ) && ( 0x67 == ((const U8*)s)[4] ) ) ? 5 : 0 )\ - : ( ( ( ( 0x69 == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF8 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFB ) == 0xB0 ) || NATIVE_UTF8_TO_I8(((const U8*)s)[2]) == 0xBC ) ) && ( 0xB3 == ((const U8*)s)[3] ) ) && ( 0x67 == ((const U8*)s)[4] ) ) ? 5 : 0 )\ + : ( ( ( ( 0x69 == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF8 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFB ) == 0xB0 ) || ((const U8*)s)[2] == 0x6A ) ) && ( 0xB3 == ((const U8*)s)[3] ) ) && ( 0x67 == ((const U8*)s)[4] ) ) ? 5 : 0 )\ : 0 ) @@ -2886,7 +2886,7 @@ : ( 0xBF == ((const U8*)s)[0] ) ? \ ( ( inRANGE_helper_(U8, ((const U8*)s)[1], 0x66, 0x67) ) ? \ ( ( ( ( inRANGE_helper_(U8, ((const U8*)s)[2], 0x41, 0x48) ) && ( 0xB3 == ((const U8*)s)[3] ) ) && ( 0x67 == ((const U8*)s)[4] ) ) ? 5 : 0 )\ - : ( ( ( ( 0x69 == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF8 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFB ) == 0xB0 ) || NATIVE_UTF8_TO_I8(((const U8*)s)[2]) == 0xBC ) ) && ( 0xB3 == ((const U8*)s)[3] ) ) && ( 0x67 == ((const U8*)s)[4] ) ) ? 5 : 0 )\ + : ( ( ( ( 0x69 == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF8 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFB ) == 0xB0 ) || ((const U8*)s)[2] == 0x6A ) ) && ( 0xB3 == ((const U8*)s)[3] ) ) && ( 0x67 == ((const U8*)s)[4] ) ) ? 5 : 0 )\ : 0 ) @@ -3509,7 +3509,7 @@ : ( 0xBF == ((const U8*)s)[0] ) ? \ ( ( inRANGE_helper_(U8, ((const U8*)s)[1], 0x66, 0x67) ) ? \ ( ( inRANGE_helper_(U8, ((const U8*)s)[2], 0x41, 0x48) ) ? 3 : 0 )\ - : ( ( 0x69 == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF8 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFB ) == 0xB0 ) || NATIVE_UTF8_TO_I8(((const U8*)s)[2]) == 0xBC ) ) ? 3 : 0 )\ + : ( ( 0x69 == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF8 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFB ) == 0xB0 ) || ((const U8*)s)[2] == 0x6A ) ) ? 3 : 0 )\ : 0 ) \ : ((e)-(s) > 2) ? \ ( ( ( ( ((const U8*)s)[0] & 0xAF ) == 'a' ) || ( ( ((const U8*)s)[0] & 0xBE ) == 'h' ) || ( ( ((const U8*)s)[0] & 0xBE ) == 's' ) || ( ( ((const U8*)s)[0] & 0xBF ) == 'w' ) || ( ( ((const U8*)s)[0] & 0xBF ) == 'y' ) ) ? 1\ @@ -3520,7 +3520,7 @@ : ( 0xAA == ((const U8*)s)[0] ) ? \ ( ( 0x6A == ((const U8*)s)[1] ) ? 2 : 0 ) \ : ( 0xB3 == ((const U8*)s)[0] ) ? \ - ( ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xFD ) == 0xAC ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xF7 ) == 0xB1 ) || NATIVE_UTF8_TO_I8(((const U8*)s)[1]) == 0xB7 ) ? 2 : 0 )\ + ( ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xFD ) == 0xAC ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xF7 ) == 0xB1 ) || ((const U8*)s)[1] == 0x65 ) ? 2 : 0 )\ : ( 0xB4 == ((const U8*)s)[0] ) ? \ ( ( ( ( ((const U8*)s)[1] & 0xFB ) == 0x42 ) || ((const U8*)s)[1] == 0x4A || ((const U8*)s)[1] == 0x55 ) ? 2 : 0 )\ : ( 0xB7 == ((const U8*)s)[0] ) ? \ @@ -3528,7 +3528,7 @@ : ( 0xBF == ((const U8*)s)[0] ) ? \ ( ( inRANGE_helper_(U8, ((const U8*)s)[1], 0x66, 0x67) ) ? \ ( ( inRANGE_helper_(U8, ((const U8*)s)[2], 0x41, 0x48) ) ? 3 : 0 )\ - : ( ( 0x69 == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF8 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFB ) == 0xB0 ) || NATIVE_UTF8_TO_I8(((const U8*)s)[2]) == 0xBC ) ) ? 3 : 0 )\ + : ( ( 0x69 == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF8 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFB ) == 0xB0 ) || ((const U8*)s)[2] == 0x6A ) ) ? 3 : 0 )\ : 0 ) \ : ((e)-(s) > 1) ? \ ( ( ( ( ((const U8*)s)[0] & 0xAF ) == 'a' ) || ( ( ((const U8*)s)[0] & 0xBE ) == 'h' ) || ( ( ((const U8*)s)[0] & 0xBE ) == 's' ) || ( ( ((const U8*)s)[0] & 0xBF ) == 'w' ) || ( ( ((const U8*)s)[0] & 0xBF ) == 'y' ) ) ? 1\ @@ -3539,7 +3539,7 @@ : ( 0xAA == ((const U8*)s)[0] ) ? \ ( ( 0x6A == ((const U8*)s)[1] ) ? 2 : 0 ) \ : ( 0xB3 == ((const U8*)s)[0] ) ? \ - ( ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xFD ) == 0xAC ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xF7 ) == 0xB1 ) || NATIVE_UTF8_TO_I8(((const U8*)s)[1]) == 0xB7 ) ? 2 : 0 )\ + ( ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xFD ) == 0xAC ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[1]) & 0xF7 ) == 0xB1 ) || ((const U8*)s)[1] == 0x65 ) ? 2 : 0 )\ : ( ( 0xB4 == ((const U8*)s)[0] ) && ( ( ( ((const U8*)s)[1] & 0xFB ) == 0x42 ) || ((const U8*)s)[1] == 0x4A || ((const U8*)s)[1] == 0x55 ) ) ? 2 : 0 )\ : ((e)-(s) > 0) ? \ ( ( ( ((const U8*)s)[0] & 0xAF ) == 'a' ) || ( ( ((const U8*)s)[0] & 0x9F ) == 'f' ) || ( ( ((const U8*)s)[0] & 0xBE ) == 'h' ) || ( ( ((const U8*)s)[0] & 0xBE ) == 's' ) || ( ( ((const U8*)s)[0] & 0xBF ) == 'y' ) )\ @@ -3587,7 +3587,7 @@ ( ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF0 ) == 0xA0 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xFA ) == 0xB2 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF7 ) == 0xB4 ) ) ? 3 : 0 )\ : ( 0x71 == ((const U8*)s)[1] ) ? \ ( ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xEA ) == 0xA2 ) || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF7 ) == 0xA4 ) ) ? 3 : 0 )\ - : ( ( 0x72 == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xEA ) == 0xA2 ) || NATIVE_UTF8_TO_I8(((const U8*)s)[2]) == 0xA4 || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF7 ) == 0xB4 ) ) ) ? 3 : 0 )\ + : ( ( 0x72 == ((const U8*)s)[1] ) && ( ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xEA ) == 0xA2 ) || ((const U8*)s)[2] == 0x45 || ( ( NATIVE_UTF8_TO_I8(((const U8*)s)[2]) & 0xF7 ) == 0xB4 ) ) ) ? 3 : 0 )\ : ( ( ( ( 0xDD == ((const U8*)s)[0] ) && ( 0x71 == ((const U8*)s)[1] ) ) && ( 0x66 == ((const U8*)s)[2] ) ) && ( inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(((const U8*)s)[3]), 0xA0, 0xA6) || inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(((const U8*)s)[3]), 0xB3, 0xB7) ) ) ? 4 : 0 ) /* @@ -3753,6 +3753,6 @@ * 696e706fddd3ce8cd48c7ea91caf4c9edf5c296432d320aa7b78631f69aa9eac lib/unicore/mktables * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl - * 1568369f6c2d5411cca894445193c91273b4f0666027c5980a268f353b7df148 regen/regcharclass.pl + * ca1cae2ae68045dcfa7761a0b8d27399269f3dc395da5735ec4efbf4077c4dd0 regen/regcharclass.pl * b2f896452d2b30da3e04800f478c60c1fd0b03d6b668689b020f1e3cf1f1cdd9 regen/regcharclass_multi_char_folds.pl * ex: set ro: */ diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index e555945320..4f8a5ee042 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -366,6 +366,7 @@ my %a2n; my %n2a; # Inversion of a2n, for each character set my %I8_2_utf; my %utf_2_I8; # Inversion of I8_2_utf, for each EBCDIC character set +my @identity = (0..255); sub new { my $class= shift; @@ -1100,6 +1101,18 @@ sub _cond_as_str { # Should we avoid using mnemonics for code points? my $always_hex = 0; + # The second pass is all about using a transformation to see if it + # creates contiguous blocks that lead to fewer ranges or masking. But + # single element ranges don't have any benefit, and so the transform + # is just extra work for them. '$range_test' includes the transform + # for multi-element ranges, and '$original' maps a byte back to what + # it was without being transformed. Thus we use '$range_test' and the + # transormed bytes on multi-element ranges, and plain '$test' and + # '$original' on single ones. In the first pass these are effectively + # no-ops. + my $range_test = $test; + my $original = \@identity; + if ($i) { # 2nd pass # The second pass is only for non-ascii character sets, to see if # a transform to Unicode/ASCII saves anything. @@ -1122,11 +1135,13 @@ sub _cond_as_str { my $lookup; if ($opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi) { $lookup = $utf_2_I8{$charset}; - $test = "NATIVE_UTF8_TO_I8($test)"; + $original = $I8_2_utf{$charset}; + $range_test = "NATIVE_UTF8_TO_I8($test)"; } else { $lookup = $n2a{$charset}; - $test = "NATIVE_TO_LATIN1($test)"; + $original = $a2n{$charset}; + $range_test = "NATIVE_TO_LATIN1($test)"; } # Translate the native conditions (bytes) into the Unicode ones @@ -1221,16 +1236,17 @@ sub _cond_as_str { # development cycle. for (my $i = $loop_start; $i < $loop_end; $i++) { if (! ref $ranges[$i]) { # Trivial case: no range - $ranges[$i] = $self->val_fmt($ranges[$i], $always_hex) - . " == $test"; + $ranges[$i] = + $self->val_fmt($original->[$ranges[$i]], $always_hex) + . " == $test"; } elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) { $ranges[$i] = # Trivial case: single element range - $self->val_fmt($ranges[$i]->[0], $always_hex) - . " == $test"; + $self->val_fmt($original->[$ranges[$i]->[0]], $always_hex) + . " == $test"; } else { - $ranges[$i] = "inRANGE_helper_(U8, $test, " + $ranges[$i] = "inRANGE_helper_(U8, $range_test, " . $self->val_fmt($ranges[$i]->[0], $always_hex) .", " . $self->val_fmt($ranges[$i]->[1], $always_hex) . ")"; } @@ -1249,13 +1265,13 @@ sub _cond_as_str { my @masked; foreach my $mask_ref (@masks) { if (defined $mask_ref->[1]) { - push @masked, "( ( $test & " + push @masked, "( ( $range_test & " . $self->val_fmt($mask_ref->[1], $always_hex) . " ) == " . $self->val_fmt($mask_ref->[0], $always_hex) . " )"; } else { # An undefined mask means to use the value as-is push @masked, "$test == " - . $self->val_fmt($mask_ref->[0], $always_hex); + . $self->val_fmt($original->[$mask_ref->[0]], $always_hex); } } |