summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
authorNicholas Clark <nick@ccl4.org>2008-02-12 13:15:20 +0000
committerNicholas Clark <nick@ccl4.org>2008-02-12 13:15:20 +0000
commit7918f24d20384771923d344a382e1d16d9552018 (patch)
tree627e24f3c520f70ddfd3fc9779420bd72fd00c55 /utf8.c
parent9f10164a6c9d93684fedbbc188fb9dfe004c22c4 (diff)
downloadperl-7918f24d20384771923d344a382e1d16d9552018.tar.gz
assert() that every NN argument is not NULL. Otherwise we have the
ability to create landmines that will explode under someone in the future when they upgrade their compiler to one with better optimisation. We've already done this at least twice. (Yes, some of the assertions are after code that would already have SEGVd because it already deferences a pointer, but they are put in to make it easier to automate checking that each and every case is covered.) Add a tool, checkARGS_ASSERT.pl, to check that every case is covered. p4raw-id: //depot/perl@33291
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c127
1 files changed, 125 insertions, 2 deletions
diff --git a/utf8.c b/utf8.c
index 440ee760e2..3b5d5d4a9c 100644
--- a/utf8.c
+++ b/utf8.c
@@ -71,6 +71,8 @@ is the recommended Unicode-aware way of saying
U8 *
Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
{
+ PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
+
if (ckWARN(WARN_UTF8)) {
if (UNICODE_IS_SURROGATE(uv) &&
!(flags & UNICODE_ALLOW_SURROGATE))
@@ -199,6 +201,8 @@ S_is_utf8_char_slow(const U8 *s, const STRLEN len)
STRLEN slen;
UV uv, ouv;
+ PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
+
if (UTF8_IS_INVARIANT(u))
return 1;
@@ -245,6 +249,8 @@ STRLEN
Perl_is_utf8_char(pTHX_ const U8 *s)
{
const STRLEN len = UTF8SKIP(s);
+
+ PERL_ARGS_ASSERT_IS_UTF8_CHAR;
PERL_UNUSED_CONTEXT;
#ifdef IS_UTF8_CHAR
if (IS_UTF8_CHAR_FAST(len))
@@ -272,6 +278,7 @@ Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
const U8* const send = s + (len ? len : strlen((const char *)s));
const U8* x = s;
+ PERL_ARGS_ASSERT_IS_UTF8_STRING;
PERL_UNUSED_CONTEXT;
while (x < send) {
@@ -337,6 +344,8 @@ Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN
const U8* x = s;
STRLEN c;
STRLEN outlen = 0;
+
+ PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
PERL_UNUSED_CONTEXT;
while (x < send) {
@@ -410,6 +419,8 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
STRLEN expectlen = 0;
U32 warning = 0;
+ PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
+
/* This list is a superset of the UTF8_ALLOW_XXX. */
#define UTF8_WARN_EMPTY 1
@@ -617,6 +628,8 @@ returned and retlen is set, if possible, to -1.
UV
Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
{
+ PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
+
return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
@@ -640,6 +653,8 @@ returned and retlen is set, if possible, to -1.
UV
Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
{
+ PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
+
/* Call the low level routine asking for checks */
return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
@@ -662,6 +677,8 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
STRLEN len = 0;
U8 t = 0;
+ PERL_ARGS_ASSERT_UTF8_LENGTH;
+
/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
* the bitops (especially ~) can create illegal UTF-8.
* In other words: in Perl UTF-8 is not just for Unicode. */
@@ -703,6 +720,8 @@ same UTF-8 buffer.
IV
Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
{
+ PERL_ARGS_ASSERT_UTF8_DISTANCE;
+
return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
}
@@ -722,6 +741,8 @@ on the first byte of character or just after the last byte of a character.
U8 *
Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
{
+ PERL_ARGS_ASSERT_UTF8_HOP;
+
PERL_UNUSED_CONTEXT;
/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
* the bitops (especially ~) can create illegal UTF-8.
@@ -761,6 +782,8 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
U8 * const send = s + *len;
U8 *d;
+ PERL_ARGS_ASSERT_UTF8_TO_BYTES;
+
/* ensure valid UTF-8 and chars < 256 before updating string */
while (s < send) {
U8 c = *s++;
@@ -805,6 +828,8 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
const U8 *send;
I32 count = 0;
+ PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
+
PERL_UNUSED_CONTEXT;
if (!*is_utf8)
return (U8 *)start;
@@ -858,6 +883,8 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
const U8 * const send = s + (*len);
U8 *d;
U8 *dst;
+
+ PERL_ARGS_ASSERT_BYTES_TO_UTF8;
PERL_UNUSED_CONTEXT;
Newx(d, (*len) * 2 + 1, U8);
@@ -889,6 +916,8 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
U8* pend;
U8* dstart = d;
+ PERL_ARGS_ASSERT_UTF16_TO_UTF8;
+
if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
d[0] = 0;
*newlen = 1;
@@ -948,6 +977,9 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
{
U8* s = (U8*)p;
U8* const send = s + bytelen;
+
+ PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
+
while (s < send) {
const U8 tmp = s[0];
s[0] = s[1];
@@ -1074,6 +1106,8 @@ Perl_is_uni_xdigit(pTHX_ UV c)
UV
Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
{
+ PERL_ARGS_ASSERT_TO_UNI_UPPER;
+
uvchr_to_utf8(p, c);
return to_utf8_upper(p, p, lenp);
}
@@ -1081,6 +1115,8 @@ Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
UV
Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
{
+ PERL_ARGS_ASSERT_TO_UNI_TITLE;
+
uvchr_to_utf8(p, c);
return to_utf8_title(p, p, lenp);
}
@@ -1088,6 +1124,8 @@ Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
UV
Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
{
+ PERL_ARGS_ASSERT_TO_UNI_LOWER;
+
uvchr_to_utf8(p, c);
return to_utf8_lower(p, p, lenp);
}
@@ -1095,6 +1133,8 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
UV
Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
{
+ PERL_ARGS_ASSERT_TO_UNI_FOLD;
+
uvchr_to_utf8(p, c);
return to_utf8_fold(p, p, lenp);
}
@@ -1220,6 +1260,9 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
const char *const swashname)
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_COMMON;
+
if (!is_utf8_char(p))
return FALSE;
if (!*swash)
@@ -1231,6 +1274,9 @@ bool
Perl_is_utf8_alnum(pTHX_ const U8 *p)
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
+
/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
* descendant of isalnum(3), in other words, it doesn't
* contain the '_'. --jhi */
@@ -1241,6 +1287,9 @@ bool
Perl_is_utf8_alnumc(pTHX_ const U8 *p)
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
+
return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
}
@@ -1248,6 +1297,9 @@ bool
Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
+
if (*p == '_')
return TRUE;
/* is_utf8_idstart would be more logical. */
@@ -1258,6 +1310,9 @@ bool
Perl_is_utf8_idcont(pTHX_ const U8 *p)
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
+
if (*p == '_')
return TRUE;
return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
@@ -1267,6 +1322,9 @@ bool
Perl_is_utf8_alpha(pTHX_ const U8 *p)
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
+
return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
}
@@ -1274,6 +1332,9 @@ bool
Perl_is_utf8_ascii(pTHX_ const U8 *p)
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_ASCII;
+
return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
}
@@ -1281,6 +1342,9 @@ bool
Perl_is_utf8_space(pTHX_ const U8 *p)
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_SPACE;
+
return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
}
@@ -1288,6 +1352,9 @@ bool
Perl_is_utf8_digit(pTHX_ const U8 *p)
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
+
return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
}
@@ -1295,6 +1362,9 @@ bool
Perl_is_utf8_upper(pTHX_ const U8 *p)
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_UPPER;
+
return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
}
@@ -1302,6 +1372,9 @@ bool
Perl_is_utf8_lower(pTHX_ const U8 *p)
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_LOWER;
+
return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
}
@@ -1309,6 +1382,9 @@ bool
Perl_is_utf8_cntrl(pTHX_ const U8 *p)
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
+
return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
}
@@ -1316,6 +1392,9 @@ bool
Perl_is_utf8_graph(pTHX_ const U8 *p)
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
+
return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
}
@@ -1323,6 +1402,9 @@ bool
Perl_is_utf8_print(pTHX_ const U8 *p)
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_PRINT;
+
return is_utf8_common(p, &PL_utf8_print, "IsPrint");
}
@@ -1330,6 +1412,9 @@ bool
Perl_is_utf8_punct(pTHX_ const U8 *p)
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
+
return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
}
@@ -1337,6 +1422,9 @@ bool
Perl_is_utf8_xdigit(pTHX_ const U8 *p)
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
+
return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
}
@@ -1344,6 +1432,9 @@ bool
Perl_is_utf8_mark(pTHX_ const U8 *p)
{
dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_MARK;
+
return is_utf8_common(p, &PL_utf8_mark, "IsM");
}
@@ -1379,12 +1470,14 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
dVAR;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
STRLEN len = 0;
-
const UV uv0 = utf8_to_uvchr(p, NULL);
/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
* are necessary in EBCDIC, they are redundant no-ops
* in ASCII-ish platforms, and hopefully optimized away. */
const UV uv1 = NATIVE_TO_UNI(uv0);
+
+ PERL_ARGS_ASSERT_TO_UTF8_CASE;
+
uvuni_to_utf8(tmpbuf, uv1);
if (!*swashp) /* load on-demand */
@@ -1477,6 +1570,9 @@ UV
Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
{
dVAR;
+
+ PERL_ARGS_ASSERT_TO_UTF8_UPPER;
+
return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
}
@@ -1498,6 +1594,9 @@ UV
Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
{
dVAR;
+
+ PERL_ARGS_ASSERT_TO_UTF8_TITLE;
+
return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
}
@@ -1519,6 +1618,9 @@ UV
Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
{
dVAR;
+
+ PERL_ARGS_ASSERT_TO_UTF8_LOWER;
+
return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
}
@@ -1541,6 +1643,9 @@ UV
Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
{
dVAR;
+
+ PERL_ARGS_ASSERT_TO_UTF8_FOLD;
+
return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
&PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
}
@@ -1562,6 +1667,8 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
SV* errsv_save;
+ PERL_ARGS_ASSERT_SWASH_INIT;
+
PUSHSTACKi(PERLSI_MAGIC);
ENTER;
SAVEI32(PL_hints);
@@ -1644,6 +1751,8 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
U8 tmputf8[2];
const UV c = NATIVE_TO_ASCII(*ptr);
+ PERL_ARGS_ASSERT_SWASH_FETCH;
+
if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
@@ -1754,7 +1863,6 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
SV *swatch;
U8 *l, *lend, *x, *xend, *s;
STRLEN lcur, xcur, scur;
-
HV* const hv = (HV*)SvRV(swash);
SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
@@ -1768,6 +1876,8 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
const UV none = SvUV(*nonesvp);
const UV end = start + span;
+ PERL_ARGS_ASSERT_SWASH_GET;
+
if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
(UV)bits);
@@ -2079,12 +2189,16 @@ is the recommended wide native character-aware way of saying
U8 *
Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
{
+ PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
+
return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
}
U8 *
Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
{
+ PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
+
return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
}
@@ -2109,6 +2223,9 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
U32 flags)
{
const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
+
+ PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
+
return UNI_TO_NATIVE(uv);
}
@@ -2135,6 +2252,8 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f
int truncated = 0;
const char *s, *e;
+ PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
+
sv_setpvn(dsv, "", 0);
SvUTF8_off(dsv);
for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
@@ -2204,6 +2323,8 @@ The pointer to the PV of the dsv is returned.
char *
Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
{
+ PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
+
return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
SvCUR(ssv), pvlim, flags);
}
@@ -2251,6 +2372,8 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const
U8 natbuf[1+1];
STRLEN foldlen1, foldlen2;
bool match;
+
+ PERL_ARGS_ASSERT_IBCMP_UTF8;
if (pe1)
e1 = *(U8**)pe1;