summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- embed.fnc |  2
-rw-r--r-- embed.h   |  2
-rw-r--r-- handy.h   | 63
-rw-r--r-- proto.h   |  9
-rw-r--r-- utf8.c    | 56
5 files changed, 96 insertions, 36 deletions
diff --git a/embed.fnc b/embed.fnc
index a3ab8a2d5e..5af5c97109 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -658,6 +658,8 @@ Anpd |STRLEN |is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end
Anpd |bool |is_utf8_string |NN const U8 *s|STRLEN len
Anpdmb |bool |is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **ep
Anpd |bool |is_utf8_string_loclen|NN const U8 *s|STRLEN len|NULLOK const U8 **ep|NULLOK STRLEN *el
+AMpR |bool |_is_uni_FOO|const U8 classnum|const UV c
+AMpR |bool |_is_utf8_FOO|const U8 classnum|NN const U8 *p
AMpR |bool |is_utf8_alnum |NN const U8 *p
AMpR |bool |is_utf8_alnumc |NN const U8 *p
ADMpR |bool |is_utf8_idfirst|NN const U8 *p
diff --git a/embed.h b/embed.h
index 20450e976e..c1ca676374 100644
--- a/embed.h
+++ b/embed.h
@@ -27,7 +27,9 @@
/* Hide global symbols */
#define Gv_AMupdate(a,b) Perl_Gv_AMupdate(aTHX_ a,b)
+#define _is_uni_FOO(a,b) Perl__is_uni_FOO(aTHX_ a,b)
#define _is_uni_perl_idstart(a) Perl__is_uni_perl_idstart(aTHX_ a)
+#define _is_utf8_FOO(a,b) Perl__is_utf8_FOO(aTHX_ a,b)
#define _is_utf8_perl_idstart(a) Perl__is_utf8_perl_idstart(aTHX_ a)
#define _to_uni_fold_flags(a,b,c,d) Perl__to_uni_fold_flags(aTHX_ a,b,c,d)
#define _to_utf8_fold_flags(a,b,c,d,e) Perl__to_utf8_fold_flags(aTHX_ a,b,c,d,e)
diff --git a/handy.h b/handy.h
index 077952c42f..f4e978caeb 100644
--- a/handy.h
+++ b/handy.h
@@ -802,6 +802,26 @@ typedef enum {
#define POSIX_SWASH_COUNT _FIRST_NON_SWASH_CC
+#if defined(PERL_IN_UTF8_C)
+# if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
+ || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
+ || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8
+ #error Need to adjust order of swash_property_names[]
+# endif
+
+static const char* const swash_property_names[] = {
+ "XPosixWord",
+ "XPosixDigit",
+ "XPosixAlpha",
+ "XPosixLower",
+ "XPosixUpper",
+ "XPosixPunct",
+ "XPosixPrint",
+ "XPosixAlnum",
+ "XPosixGraph"
+};
+#endif
+
#define PL_utf8_alnum PL_utf8_swash_ptrs[_CC_WORDCHAR]
#define PL_utf8_alnumc PL_utf8_swash_ptrs[_CC_ALPHANUMERIC]
#define PL_utf8_alpha PL_utf8_swash_ptrs[_CC_ALPHA]
@@ -1107,26 +1127,29 @@ EXTCONST U32 PL_charclass[];
#define _generic_uni(classnum, function, c) ((c) < 256 \
? _generic_isCC(c, classnum) \
: function(c))
+#define _generic_uni_swash(classnum, c) ((c) < 256 \
+ ? _generic_isCC(c, classnum) \
+ : _is_uni_FOO(classnum, c))
#define isALNUM_uni(c) isWORDCHAR_uni(c)
-#define isALPHA_uni(c) _generic_uni(_CC_ALPHA, is_uni_alpha, c)
-#define isALPHANUMERIC_uni(c) _generic_uni(_CC_ALPHANUMERIC, is_uni_alnumc, c)
+#define isALPHA_uni(c) _generic_uni_swash(_CC_ALPHA, c)
+#define isALPHANUMERIC_uni(c) _generic_uni_swash(_CC_ALPHANUMERIC, c)
#define isASCII_uni(c) isASCII(c)
#define isBLANK_uni(c) _generic_uni(_CC_BLANK, is_HORIZWS_cp_high, c)
#define isCNTRL_uni(c) isCNTRL_L1(c) /* All controls are in Latin1 */
-#define isDIGIT_uni(c) _generic_uni(_CC_DIGIT, is_uni_digit, c)
-#define isGRAPH_uni(c) _generic_uni(_CC_GRAPH, is_uni_graph, c)
+#define isDIGIT_uni(c) _generic_uni_swash(_CC_DIGIT, c)
+#define isGRAPH_uni(c) _generic_uni_swash(_CC_GRAPH, c)
#define isIDFIRST_uni(c) _generic_uni(_CC_IDFIRST, _is_uni_perl_idstart, c)
-#define isLOWER_uni(c) _generic_uni(_CC_LOWER, is_uni_lower, c)
-#define isPRINT_uni(c) _generic_uni(_CC_PRINT, is_uni_print, c)
+#define isLOWER_uni(c) _generic_uni_swash(_CC_LOWER, c)
+#define isPRINT_uni(c) _generic_uni_swash(_CC_PRINT, c)
/* Posix and regular space are identical above Latin1 */
#define isPSXSPC_uni(c) _generic_uni(_CC_PSXSPC, is_XPERLSPACE_cp_high, c)
-#define isPUNCT_uni(c) _generic_uni(_CC_PUNCT, is_uni_punct, c)
+#define isPUNCT_uni(c) _generic_uni_swash(_CC_PUNCT, c)
#define isSPACE_uni(c) _generic_uni(_CC_SPACE, is_XPERLSPACE_cp_high, c)
-#define isUPPER_uni(c) _generic_uni(_CC_UPPER, is_uni_upper, c)
+#define isUPPER_uni(c) _generic_uni_swash(_CC_UPPER, c)
#define isVERTWS_uni(c) _generic_uni(_CC_VERTSPACE, is_VERTWS_cp_high, c)
-#define isWORDCHAR_uni(c) _generic_uni(_CC_WORDCHAR, is_uni_alnum, c)
+#define isWORDCHAR_uni(c) _generic_uni_swash(_CC_WORDCHAR, c)
#define isXDIGIT_uni(c) _generic_uni(_CC_XDIGIT, is_XDIGIT_cp_high, c)
#define toFOLD_uni(c,s,l) to_uni_fold(c,s,l)
@@ -1180,6 +1203,11 @@ EXTCONST U32 PL_charclass[];
#define _generic_utf8(classnum, function, p) \
_generic_utf8_utf8(classnum, p, function(p))
+/* Like the above, but calls _is_utf8_FOO(classnum, p) instead of a per-class function */
+#define _generic_swash_utf8(classnum, p) \
+ _generic_utf8_utf8(classnum, p, _is_utf8_FOO(classnum, p))
+
+
/* Like the above, but should be used only when it is known that there are no
* characters in the range 128-255 which the class is TRUE for. Hence it can
* skip the tests for this range */
@@ -1199,9 +1227,8 @@ EXTCONST U32 PL_charclass[];
* "if-else-if-else ..." */
#define isALNUM_utf8(p) isWORDCHAR_utf8(p) /* back compat */
-#define isALPHA_utf8(p) _generic_utf8(_CC_ALPHA, is_utf8_alpha, p)
-#define isALPHANUMERIC_utf8(p) _generic_utf8(_CC_ALPHANUMERIC, \
- is_utf8_alnumc, p)
+#define isALPHA_utf8(p) _generic_swash_utf8(_CC_ALPHA, p)
+#define isALPHANUMERIC_utf8(p) _generic_swash_utf8(_CC_ALPHANUMERIC, p)
#define isASCII_utf8(p) isASCII(*p) /* Because ASCII is invariant under
utf8, the non-utf8 macro works
*/
@@ -1209,7 +1236,7 @@ EXTCONST U32 PL_charclass[];
#define isCNTRL_utf8(p) _generic_utf8_utf8(_CC_CNTRL, p, 0)
#define isDIGIT_utf8(p) _generic_utf8_no_upper_latin1(_CC_DIGIT, \
is_utf8_digit, p)
-#define isGRAPH_utf8(p) _generic_utf8(_CC_GRAPH, is_utf8_graph, p)
+#define isGRAPH_utf8(p) _generic_swash_utf8(_CC_GRAPH, p)
#define isIDCONT_utf8(p) _generic_utf8(_CC_WORDCHAR, is_utf8_xidcont, p)
/* To prevent S_scan_word in toke.c from hanging, we have to make sure that
@@ -1221,17 +1248,17 @@ EXTCONST U32 PL_charclass[];
#define isIDFIRST_utf8(p) _generic_utf8(_CC_IDFIRST, \
_is_utf8_perl_idstart, p)
-#define isLOWER_utf8(p) _generic_utf8(_CC_LOWER, is_utf8_lower, p)
-#define isPRINT_utf8(p) _generic_utf8(_CC_PRINT, is_utf8_print, p)
+#define isLOWER_utf8(p) _generic_swash_utf8(_CC_LOWER, p)
+#define isPRINT_utf8(p) _generic_swash_utf8(_CC_PRINT, p)
/* Posix and regular space are identical above Latin1 */
#define isPSXSPC_utf8(p) _generic_utf8(_CC_PSXSPC, is_XPERLSPACE_high, p)
-#define isPUNCT_utf8(p) _generic_utf8(_CC_PUNCT, is_utf8_punct, p)
+#define isPUNCT_utf8(p) _generic_swash_utf8(_CC_PUNCT, p)
#define isSPACE_utf8(p) _generic_utf8(_CC_SPACE, is_XPERLSPACE_high, p)
-#define isUPPER_utf8(p) _generic_utf8(_CC_UPPER, is_utf8_upper, p)
+#define isUPPER_utf8(p) _generic_swash_utf8(_CC_UPPER, p)
#define isVERTWS_utf8(p) _generic_utf8(_CC_VERTSPACE, is_VERTWS_high, p)
-#define isWORDCHAR_utf8(p) _generic_utf8(_CC_WORDCHAR, is_utf8_alnum, p)
+#define isWORDCHAR_utf8(p) _generic_swash_utf8(_CC_WORDCHAR, p)
#define isXDIGIT_utf8(p) _generic_utf8_no_upper_latin1(_CC_XDIGIT, \
is_XDIGIT_high, p)
diff --git a/proto.h b/proto.h
index e22d7c90fc..d47e5de925 100644
--- a/proto.h
+++ b/proto.h
@@ -32,9 +32,18 @@ PERL_CALLCONV void Perl_Slab_Free(pTHX_ void *op)
#define PERL_ARGS_ASSERT_SLAB_FREE \
assert(op)
+PERL_CALLCONV bool Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
+ __attribute__warn_unused_result__;
+
PERL_CALLCONV bool Perl__is_uni_perl_idstart(pTHX_ UV c)
__attribute__warn_unused_result__;
+PERL_CALLCONV bool Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
+ __attribute__warn_unused_result__
+ __attribute__nonnull__(pTHX_2);
+#define PERL_ARGS_ASSERT__IS_UTF8_FOO \
+ assert(p)
+
PERL_CALLCONV bool Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1);
diff --git a/utf8.c b/utf8.c
index 2a5aff1aea..2fb39c4b6d 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1479,6 +1479,14 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
return utf16_to_utf8(p, d, bytelen, newlen);
}
+bool
+Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
+{
+ U8 tmpbuf[UTF8_MAXBYTES+1];
+ uvchr_to_utf8(tmpbuf, c);
+ return _is_utf8_FOO(classnum, tmpbuf);
+}
+
/* for now these are all defined (inefficiently) in terms of the utf8 versions.
* Note that the macros in handy.h that call these short-circuit calling them
* for Latin-1 range inputs */
@@ -1488,7 +1496,7 @@ Perl_is_uni_alnum(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
- return is_utf8_alnum(tmpbuf);
+ return _is_utf8_FOO(_CC_WORDCHAR, tmpbuf);
}
bool
@@ -1496,7 +1504,7 @@ Perl_is_uni_alnumc(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
- return is_utf8_alnumc(tmpbuf);
+ return _is_utf8_FOO(_CC_ALPHANUMERIC, tmpbuf);
}
bool /* Internal function so we can deprecate the external one, and call
@@ -1532,7 +1540,7 @@ Perl_is_uni_alpha(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
- return is_utf8_alpha(tmpbuf);
+ return _is_utf8_FOO(_CC_ALPHA, tmpbuf);
}
bool
@@ -1558,7 +1566,7 @@ Perl_is_uni_digit(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
- return is_utf8_digit(tmpbuf);
+ return _is_utf8_FOO(_CC_DIGIT, tmpbuf);
}
bool
@@ -1566,7 +1574,7 @@ Perl_is_uni_upper(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
- return is_utf8_upper(tmpbuf);
+ return _is_utf8_FOO(_CC_UPPER, tmpbuf);
}
bool
@@ -1574,7 +1582,7 @@ Perl_is_uni_lower(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
- return is_utf8_lower(tmpbuf);
+ return _is_utf8_FOO(_CC_LOWER, tmpbuf);
}
bool
@@ -1588,7 +1596,7 @@ Perl_is_uni_graph(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
- return is_utf8_graph(tmpbuf);
+ return _is_utf8_FOO(_CC_GRAPH, tmpbuf);
}
bool
@@ -1596,7 +1604,7 @@ Perl_is_uni_print(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
- return is_utf8_print(tmpbuf);
+ return _is_utf8_FOO(_CC_PRINT, tmpbuf);
}
bool
@@ -1604,7 +1612,7 @@ Perl_is_uni_punct(pTHX_ UV c)
{
U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
- return is_utf8_punct(tmpbuf);
+ return _is_utf8_FOO(_CC_PUNCT, tmpbuf);
}
bool
@@ -1841,7 +1849,7 @@ Perl_is_uni_alnum_lc(pTHX_ UV c)
if (c < 256) {
return isALNUM_LC(UNI_TO_NATIVE(c));
}
- return is_uni_alnum(c);
+ return _is_uni_FOO(_CC_WORDCHAR, c);
}
bool
@@ -1850,7 +1858,7 @@ Perl_is_uni_alnumc_lc(pTHX_ UV c)
if (c < 256) {
return isALPHANUMERIC_LC(UNI_TO_NATIVE(c));
}
- return is_uni_alnumc(c);
+ return _is_uni_FOO(_CC_ALPHANUMERIC, c);
}
bool
@@ -1868,7 +1876,7 @@ Perl_is_uni_alpha_lc(pTHX_ UV c)
if (c < 256) {
return isALPHA_LC(UNI_TO_NATIVE(c));
}
- return is_uni_alpha(c);
+ return _is_uni_FOO(_CC_ALPHA, c);
}
bool
@@ -1904,7 +1912,7 @@ Perl_is_uni_digit_lc(pTHX_ UV c)
if (c < 256) {
return isDIGIT_LC(UNI_TO_NATIVE(c));
}
- return is_uni_digit(c);
+ return _is_uni_FOO(_CC_DIGIT, c);
}
bool
@@ -1913,7 +1921,7 @@ Perl_is_uni_upper_lc(pTHX_ UV c)
if (c < 256) {
return isUPPER_LC(UNI_TO_NATIVE(c));
}
- return is_uni_upper(c);
+ return _is_uni_FOO(_CC_UPPER, c);
}
bool
@@ -1922,7 +1930,7 @@ Perl_is_uni_lower_lc(pTHX_ UV c)
if (c < 256) {
return isLOWER_LC(UNI_TO_NATIVE(c));
}
- return is_uni_lower(c);
+ return _is_uni_FOO(_CC_LOWER, c);
}
bool
@@ -1940,7 +1948,7 @@ Perl_is_uni_graph_lc(pTHX_ UV c)
if (c < 256) {
return isGRAPH_LC(UNI_TO_NATIVE(c));
}
- return is_uni_graph(c);
+ return _is_uni_FOO(_CC_GRAPH, c);
}
bool
@@ -1949,7 +1957,7 @@ Perl_is_uni_print_lc(pTHX_ UV c)
if (c < 256) {
return isPRINT_LC(UNI_TO_NATIVE(c));
}
- return is_uni_print(c);
+ return _is_uni_FOO(_CC_PRINT, c);
}
bool
@@ -1958,7 +1966,7 @@ Perl_is_uni_punct_lc(pTHX_ UV c)
if (c < 256) {
return isPUNCT_LC(UNI_TO_NATIVE(c));
}
- return is_uni_punct(c);
+ return _is_uni_FOO(_CC_PUNCT, c);
}
bool
@@ -2034,6 +2042,18 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
}
bool
+Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
+{
+ dVAR;
+
+ PERL_ARGS_ASSERT__IS_UTF8_FOO;
+
+ assert(classnum < _FIRST_NON_SWASH_CC);
+
+ return is_utf8_common(p, &PL_utf8_swash_ptrs[classnum], swash_property_names[classnum]);
+}
+
+bool
Perl_is_utf8_alnum(pTHX_ const U8 *p)
{
dVAR;