diff options
Diffstat (limited to 'ext/Unicode/Normalize/Normalize.xs')
-rw-r--r-- | ext/Unicode/Normalize/Normalize.xs | 160 |
1 files changed, 149 insertions, 11 deletions
diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs index 987a839162..04d02563a0 100644 --- a/ext/Unicode/Normalize/Normalize.xs +++ b/ext/Unicode/Normalize/Normalize.xs @@ -20,6 +20,12 @@ #define utf8n_to_uvuni utf8_to_uv #endif /* utf8n_to_uvuni */ +/* if utf8n_to_uvuni() sets retlen to 0 when flags = 0 */ +#define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character" + +/* utf8_hop() hops back before start. Maybe broken UTF-8 */ +#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start" + /* At present, char > 0x10ffff are unaffected without complaint, right? */ #define VALID_UTF_MAX (0x10ffff) #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) @@ -187,9 +193,11 @@ decompose(arg, compat = &PL_sv_no) s = (U8*)SvPV(src,srclen); e = s + srclen; - for (p = s; p < e;) { + for (p = s; p < e; p += retlen) { uv = utf8n_to_uvuni(p, e - p, &retlen, 0); - p += retlen; + if (!retlen) + croak(ErrRetlenIsZero); + if (Hangul_IsS(uv)) sv_cat_decompHangul(dst, uv); else { @@ -197,7 +205,7 @@ decompose(arg, compat = &PL_sv_no) if (r) sv_catpv(dst, (char *)r); else - sv_catpvn(dst, (char *)p - retlen, retlen); + sv_catpvn(dst, (char *)p, retlen); } } RETVAL = dst; @@ -242,9 +250,13 @@ reorder(arg) STRLEN cc_len, cc_iter, cc_pos; uv = utf8n_to_uvuni(p, e - p, &retlen, 0); - curCC = getCombinClass(uv); + if (!retlen) + croak(ErrRetlenIsZero); p += retlen; + + + curCC = getCombinClass(uv); if (! (curCC && p < e)) continue; else @@ -257,10 +269,14 @@ reorder(arg) while (p < e) { uv = utf8n_to_uvuni(p, e - p, &retlen, 0); + if (!retlen) + croak(ErrRetlenIsZero); + p += retlen; + curCC = getCombinClass(uv); if (!curCC) break; - p += retlen; + cc_pos++; if (stk_cc_max <= cc_pos) { /* extend if need */ stk_cc_max = cc_pos + 1; @@ -294,6 +310,8 @@ SV* compose(arg) SV * arg PROTOTYPE: $ + ALIAS: + composeContiguous = 1 PREINIT: SV *src, *dst, *tmp; U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC; @@ -324,6 +342,8 @@ compose(arg) for (p = s; p < e;) { if (beginning) { uvS = utf8n_to_uvuni(p, e - p, &retlen, 0); + if (!retlen) + croak(ErrRetlenIsZero); p += retlen; if (getCombinClass(uvS)) { /* no Starter found yet */ @@ -340,7 +360,10 @@ compose(arg) /* to the next Starter */ while (p < e) { uv = utf8n_to_uvuni(p, e - p, &retlen, 0); + if (!retlen) + croak(ErrRetlenIsZero); p += retlen; + curCC = getCombinClass(uv); if (preCC && preCC == curCC) { @@ -349,7 +372,8 @@ compose(arg) } else { uvComp = composite_uv(uvS, uv); - if (uvComp && ! isExclusion(uvComp) && preCC <= curCC) { + if (uvComp && ! isExclusion(uvComp) && + (ix ? (t == tmp_start) : (preCC <= curCC))) { STRLEN leftcur, rightcur, dstcur; leftcur = UNISKIP(uvComp); rightcur = UNISKIP(uvS) + UNISKIP(uv); @@ -385,7 +409,6 @@ compose(arg) RETVAL - void checkNFD(arg) SV * arg @@ -397,7 +420,7 @@ checkNFD(arg) SV *src; STRLEN srclen, retlen; U8 *s, *e, *p, curCC, preCC; - PPCODE: + CODE: if (SvUTF8(arg)) { src = arg; } else { @@ -411,6 +434,9 @@ checkNFD(arg) preCC = 0; for (p = s; p < e; p += retlen) { uv = utf8n_to_uvuni(p, e - p, &retlen, 0); + if (!retlen) + croak(ErrRetlenIsZero); + curCC = getCombinClass(uv); if (preCC > curCC && curCC != 0) /* canonical ordering violated */ XSRETURN_NO; @@ -434,7 +460,7 @@ checkNFC(arg) STRLEN srclen, retlen; U8 *s, *e, *p, curCC, preCC; bool isMAYBE; - PPCODE: + CODE: if (SvUTF8(arg)) { src = arg; } else { @@ -449,6 +475,9 @@ checkNFC(arg) isMAYBE = FALSE; for (p = s; p < e; p += retlen) { uv = utf8n_to_uvuni(p, e - p, &retlen, 0); + if (!retlen) + croak(ErrRetlenIsZero); + curCC = getCombinClass(uv); if (preCC > curCC && curCC != 0) /* canonical ordering violated */ @@ -479,6 +508,78 @@ checkNFC(arg) +void +checkFCD(arg) + SV * arg + PROTOTYPE: $ + ALIAS: + checkFCC = 1 + PREINIT: + UV uv, uvLead, uvTrail; + SV *src; + STRLEN srclen, retlen, canlen, canret; + U8 *s, *e, *p, curCC, preCC; + U8 *sCan, *pCan, *eCan; + bool isMAYBE; + CODE: + if (SvUTF8(arg)) { + src = arg; + } else { + src = sv_mortalcopy(arg); + sv_utf8_upgrade(src); + } + + s = (U8*)SvPV(src,srclen); + e = s + srclen; + + preCC = 0; + isMAYBE = FALSE; + for (p = s; p < e; p += retlen) { + uv = utf8n_to_uvuni(p, e - p, &retlen, 0); + if (!retlen) + croak(ErrRetlenIsZero); + + sCan = (U8*) dec_canonical(uv); + + if (sCan) { + canlen = (STRLEN)strlen((char *) sCan); + uvLead = utf8n_to_uvuni(sCan, canlen, &canret, 0); + } + else { + uvLead = uv; + } + + curCC = getCombinClass(uvLead); + + if (curCC != 0 && curCC < preCC) /* canonical ordering violated */ + XSRETURN_NO; + + if (ix) { + if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) + XSRETURN_NO; + else if (isComp2nd(uv)) + isMAYBE = TRUE; + } + + if (sCan) { + eCan = sCan + canlen; + pCan = utf8_hop(eCan, -1); + if (pCan < sCan) + croak(ErrHopBeforeStart); + uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, 0); + preCC = getCombinClass(uvTrail); + } + else { + preCC = curCC; + } + } + if (isMAYBE) + XSRETURN_UNDEF; + else + XSRETURN_YES; + + + U8 getCombinClass(uv) UV uv @@ -515,7 +616,7 @@ isNFD_NO(uv) PROTOTYPE: $ ALIAS: isNFKD_NO = 1 - PPCODE: + CODE: if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) XSRETURN_YES; /* NFD_NO or NFKD_NO */ else @@ -530,7 +631,7 @@ isComp_Ex(uv) ALIAS: isNFC_NO = 0 isNFKC_NO = 1 - PPCODE: + CODE: if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) XSRETURN_YES; /* NFC_NO or NFKC_NO */ else if (ix) { @@ -587,3 +688,40 @@ getCanon(uv) OUTPUT: RETVAL + +void +splitOnLastStarter(arg) + SV * arg + PREINIT: + UV uv; + SV *src, *svp; + STRLEN srclen, retlen; + U8 *s, *e, *p; + PPCODE: + if (SvUTF8(arg)) { + src = arg; + } else { + src = sv_mortalcopy(arg); + sv_utf8_upgrade(src); + } + + s = (U8*)SvPV(src,srclen); + e = s + srclen; + + for (p = e; s < p; ) { + p = utf8_hop(p, -1); + if (p < s) + croak(ErrHopBeforeStart); + uv = utf8n_to_uvuni(p, e - p, &retlen, 0); + if (getCombinClass(uv) == 0) /* Last Starter found */ + break; + } + + svp = sv_2mortal(newSVpvn((char*)s, p - s)); + SvUTF8_on(svp); + XPUSHs(svp); + + svp = sv_2mortal(newSVpvn((char*)p, e - p)); + SvUTF8_on(svp); + XPUSHs(svp); + |