summary | refs | log | tree | commit | diff
path: root/ext/Unicode/Normalize/Normalize.xs
diff options
context:
space:
mode:
Diffstat (limited to 'ext/Unicode/Normalize/Normalize.xs')
-rw-r--r-- ext/Unicode/Normalize/Normalize.xs 290
1 files changed, 240 insertions, 50 deletions
diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs
index 7a11095c24..3cb221fd75 100644
--- a/ext/Unicode/Normalize/Normalize.xs
+++ b/ext/Unicode/Normalize/Normalize.xs
@@ -62,16 +62,19 @@ int compare_cc(const void *a, const void *b)
{
int ret_cc;
ret_cc = (*(UNF_cc*)a).cc - (*(UNF_cc*)b).cc;
- if(ret_cc) return ret_cc;
+ if (ret_cc)
+ return ret_cc;
return (*(UNF_cc*)a).pos - (*(UNF_cc*)b).pos;
}
U8* dec_canonical (UV uv)
{
U8 ***plane, **row;
- if(OVER_UTF_MAX(uv)) return NULL;
+ if (OVER_UTF_MAX(uv))
+ return NULL;
plane = (U8***)UNF_canon[uv >> 16];
- if(! plane) return NULL;
+ if (! plane)
+ return NULL;
row = plane[(uv >> 8) & 0xff];
return row ? row[uv & 0xff] : NULL;
}
@@ -79,9 +82,11 @@ U8* dec_canonical (UV uv)
U8* dec_compat (UV uv)
{
U8 ***plane, **row;
- if(OVER_UTF_MAX(uv)) return NULL;
+ if (OVER_UTF_MAX(uv))
+ return NULL;
plane = (U8***)UNF_compat[uv >> 16];
- if(! plane) return NULL;
+ if (! plane)
+ return NULL;
row = plane[(uv >> 8) & 0xff];
return row ? row[uv & 0xff] : NULL;
}
@@ -90,25 +95,30 @@ UV composite_uv (UV uv, UV uv2)
{
UNF_complist ***plane, **row, *cell, *i;
- if(! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) return 0;
+ if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
+ return 0;
- if(Hangul_IsL(uv) && Hangul_IsV(uv2)) {
+ if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
uv -= Hangul_LBase; /* lindex */
uv2 -= Hangul_VBase; /* vindex */
return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
}
- if(Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
+ if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
uv2 -= Hangul_TBase; /* tindex */
return(uv + uv2);
}
plane = UNF_compos[uv >> 16];
- if(! plane) return 0;
+ if (! plane)
+ return 0;
row = plane[(uv >> 8) & 0xff];
- if(! row) return 0;
+ if (! row)
+ return 0;
cell = row[uv & 0xff];
- if(! cell) return 0;
- for(i = cell; i->nextchar; i++) {
- if(uv2 == i->nextchar) return i->composite;
+ if (! cell)
+ return 0;
+ for (i = cell; i->nextchar; i++) {
+ if (uv2 == i->nextchar)
+ return i->composite;
}
return 0;
}
@@ -116,9 +126,11 @@ UV composite_uv (UV uv, UV uv2)
U8 getCombinClass (UV uv)
{
U8 **plane, *row;
- if(OVER_UTF_MAX(uv)) return 0;
+ if (OVER_UTF_MAX(uv))
+ return 0;
plane = (U8**)UNF_combin[uv >> 16];
- if(! plane) return 0;
+ if (! plane)
+ return 0;
row = plane[(uv >> 8) & 0xff];
return row ? row[uv & 0xff] : 0;
}
@@ -128,7 +140,8 @@ void sv_cat_decompHangul (SV* sv, UV uv)
UV sindex, lindex, vindex, tindex;
U8 *t, tmp[3 * UTF8_MAXLEN + 1];
- if(! Hangul_IsS(uv)) return;
+ if (! Hangul_IsS(uv))
+ return;
sindex = uv - Hangul_SBase;
lindex = sindex / Hangul_NCount;
@@ -138,7 +151,8 @@ void sv_cat_decompHangul (SV* sv, UV uv)
t = tmp;
t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
- if (tindex) t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
+ if (tindex)
+ t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
*t = '\0';
sv_catpvn(sv, (char *)tmp, strlen((char *)tmp));
}
@@ -157,7 +171,7 @@ decompose(arg, compat = &PL_sv_no)
U8 *s, *e, *p, *r;
bool iscompat;
CODE:
- if(SvUTF8(arg)) {
+ if (SvUTF8(arg)) {
src = arg;
} else {
src = sv_mortalcopy(arg);
@@ -171,14 +185,17 @@ decompose(arg, compat = &PL_sv_no)
s = (U8*)SvPV(src,srclen);
e = s + srclen;
- for(p = s; p < e;){
+ for (p = s; p < e;) {
uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
p += retlen;
- if(Hangul_IsS(uv)) sv_cat_decompHangul(dst, uv);
+ if (Hangul_IsS(uv))
+ sv_cat_decompHangul(dst, uv);
else {
r = iscompat ? dec_compat(uv) : dec_canonical(uv);
- if(r) sv_catpv(dst, (char *)r);
- else sv_catpvn(dst, (char *)p - retlen, retlen);
+ if (r)
+ sv_catpv(dst, (char *)r);
+ else
+ sv_catpvn(dst, (char *)p - retlen, retlen);
}
}
RETVAL = dst;
@@ -192,22 +209,33 @@ reorder(arg)
SV * arg
PROTOTYPE: $
PREINIT:
- SV *src;
- STRLEN srclen, retlen, stk_cc_max;
- U8 *s, *e, *p, curCC;
+ SV *src, *dst;
+ STRLEN srclen, dstlen, retlen, stk_cc_max;
+ U8 *s, *e, *p, *d, curCC;
UV uv;
UNF_cc * stk_cc;
CODE:
- src = newSVsv(arg);
- if(! SvUTF8(arg)) sv_utf8_upgrade(src);
+ if (SvUTF8(arg)) {
+ src = arg;
+ } else {
+ src = sv_mortalcopy(arg);
+ sv_utf8_upgrade(src);
+ }
+
+ s = (U8*)SvPV(src, srclen);
+
+ dstlen = srclen + 1;
+ dst = newSV(dstlen);
+ sv_setpvn(dst,(const char*)s,srclen);
+ SvUTF8_on(dst);
stk_cc_max = 10; /* enough as an initial value? */
New(0, stk_cc, stk_cc_max, UNF_cc);
- s = (U8*)SvPV(src,srclen);
- e = s + srclen;
+ d = (U8*)SvPV(dst,dstlen);
+ e = d + dstlen;
- for(p = s; p < e;){
+ for (p = d; p < e;) {
U8 *cc_in;
STRLEN cc_len, cc_iter, cc_pos;
@@ -215,20 +243,24 @@ reorder(arg)
curCC = getCombinClass(uv);
p += retlen;
- if(! (curCC && p < e)) continue; else cc_in = p - retlen;
+ if (! (curCC && p < e))
+ continue;
+ else
+ cc_in = p - retlen;
cc_pos = 0;
stk_cc[cc_pos].cc = curCC;
stk_cc[cc_pos].uv = uv;
stk_cc[cc_pos].pos = cc_pos;
- while(p < e) {
+ while (p < e) {
uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
curCC = getCombinClass(uv);
- if(!curCC) break;
+ if (!curCC)
+ break;
p += retlen;
cc_pos++;
- if(stk_cc_max <= cc_pos) { /* extend if need */
+ if (stk_cc_max <= cc_pos) { /* extend if need */
stk_cc_max = cc_pos + 1;
Renew(stk_cc, stk_cc_max, UNF_cc);
}
@@ -238,18 +270,19 @@ reorder(arg)
}
/* only one c.c. in cc_len from cc_in, no need of reordering */
- if(!cc_pos) continue;
+ if (!cc_pos)
+ continue;
qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
cc_len = p - cc_in;
p = cc_in;
- for(cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
+ for (cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
p = uvuni_to_utf8(p, stk_cc[cc_iter].uv);
}
}
Safefree(stk_cc);
- RETVAL = src;
+ RETVAL = dst;
OUTPUT:
RETVAL
@@ -266,7 +299,7 @@ compose(arg)
STRLEN srclen, dstlen, tmplen, retlen;
bool beginning = TRUE;
CODE:
- if(SvUTF8(arg)) {
+ if (SvUTF8(arg)) {
src = arg;
} else {
src = sv_mortalcopy(arg);
@@ -286,12 +319,12 @@ compose(arg)
(void)SvPOK_only(tmp);
SvUTF8_on(tmp);
- for(p = s; p < e;){
- if(beginning) {
+ for (p = s; p < e;) {
+ if (beginning) {
uvS = utf8n_to_uvuni(p, e - p, &retlen, 0);
p += retlen;
- if (getCombinClass(uvS)){ /* no Starter found yet */
+ if (getCombinClass(uvS)) { /* no Starter found yet */
d = uvuni_to_utf8(d, uvS);
continue;
}
@@ -303,20 +336,18 @@ compose(arg)
preCC = 0;
/* to the next Starter */
- while(p < e) {
+ while (p < e) {
uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
p += retlen;
curCC = getCombinClass(uv);
- if(preCC && preCC == curCC) {
+ if (preCC && preCC == curCC) {
preCC = curCC;
t = uvuni_to_utf8(t, uv);
} else {
uvComp = composite_uv(uvS, uv);
- /* S + C + S => S-S + C would be also blocked. */
- if( uvComp && ! isExclusion(uvComp) && preCC <= curCC)
- {
+ if (uvComp && ! isExclusion(uvComp) && preCC <= curCC) {
STRLEN leftcur, rightcur, dstcur;
leftcur = UNISKIP(uvComp);
rightcur = UNISKIP(uvS) + UNISKIP(uv);
@@ -326,7 +357,6 @@ compose(arg)
dstlen += leftcur - rightcur;
d = (U8*)SvGROW(dst,dstlen) + dstcur;
}
-
/* preCC not changed to curCC */
uvS = uvComp;
} else if (! curCC && p < e) { /* blocked */
@@ -341,7 +371,8 @@ compose(arg)
tmplen = t - tmp_start;
if (tmplen) { /* uncomposed combining char */
t = (U8*)SvPVX(tmp);
- while(tmplen--) *d++ = *t++;
+ while (tmplen--)
+ *d++ = *t++;
}
uvS = uv;
} /* for */
@@ -352,13 +383,169 @@ compose(arg)
+void
+checkNFD(arg)
+ SV * arg
+ PROTOTYPE: $
+ ALIAS:
+ checkNFKD = 1
+ PREINIT:
+ UV uv;
+ SV *src;
+ STRLEN srclen, retlen;
+ U8 *s, *e, *p, curCC, preCC;
+ PPCODE:
+ if (SvUTF8(arg)) {
+ src = arg;
+ } else {
+ src = sv_mortalcopy(arg);
+ sv_utf8_upgrade(src);
+ }
+
+ s = (U8*)SvPV(src,srclen);
+ e = s + srclen;
+
+ preCC = 0;
+ for (p = s; p < e; p += retlen) {
+ uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ curCC = getCombinClass(uv);
+ if (preCC > curCC && curCC != 0) /* canonical ordering violated */
+ XSRETURN_NO;
+ if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
+ XSRETURN_NO;
+ preCC = curCC;
+ }
+ XSRETURN_YES;
+
+
+
+void
+checkNFC(arg)
+ SV * arg
+ PROTOTYPE: $
+ ALIAS:
+ checkNFKC = 1
+ PREINIT:
+ UV uv;
+ SV *src;
+ STRLEN srclen, retlen;
+ U8 *s, *e, *p, curCC, preCC;
+ bool isMAYBE;
+ PPCODE:
+ if (SvUTF8(arg)) {
+ src = arg;
+ } else {
+ src = sv_mortalcopy(arg);
+ sv_utf8_upgrade(src);
+ }
+
+ s = (U8*)SvPV(src,srclen);
+ e = s + srclen;
+
+ preCC = 0;
+ isMAYBE = FALSE;
+ for (p = s; p < e; p += retlen) {
+ uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ curCC = getCombinClass(uv);
+
+ if (preCC > curCC && curCC != 0) /* canonical ordering violated */
+ XSRETURN_NO;
+
+ /* get NFC/NFKC property */
+ if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
+ ; /* YES */
+ else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
+ XSRETURN_NO;
+ else if (isComp2nd(uv))
+ isMAYBE = TRUE;
+ else if (ix) {
+ char *canon, *compat;
+ /*
+ * NFKC_NO when having compatibility mapping;
+ * i.e. dec_compat(uv) defined & different with dec_canonical(uv).
+ */
+ canon = (char *) dec_canonical(uv);
+ compat = (char *) dec_compat(uv);
+ if (compat && (!canon || strNE(canon, compat)))
+ XSRETURN_NO;
+ } /* end of get NFC/NFKC property */
+
+ preCC = curCC;
+ }
+ if (isMAYBE)
+ XSRETURN_UNDEF;
+ else
+ XSRETURN_YES;
+
+
+
U8
getCombinClass(uv)
UV uv
+ PROTOTYPE: $
bool
isExclusion(uv)
UV uv
+ PROTOTYPE: $
+
+bool
+isSingleton(uv)
+ UV uv
+ PROTOTYPE: $
+
+bool
+isNonStDecomp(uv)
+ UV uv
+ PROTOTYPE: $
+
+bool
+isComp2nd(uv)
+ UV uv
+ PROTOTYPE: $
+ ALIAS:
+ isNFC_MAYBE = 1
+ isNFKC_MAYBE = 2
+
+
+
+void
+isNFD_NO(uv)
+ UV uv
+ PROTOTYPE: $
+ ALIAS:
+ isNFKD_NO = 1
+ PPCODE:
+ if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
+ XSRETURN_YES; /* NFD_NO or NFKD_NO */
+ else
+ XSRETURN_NO;
+
+
+
+void
+isComp_Ex(uv)
+ UV uv
+ PROTOTYPE: $
+ ALIAS:
+ isNFC_NO = 0
+ isNFKC_NO = 1
+ PPCODE:
+ if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
+ XSRETURN_YES; /* NFC_NO or NFKC_NO */
+ else if (ix) {
+ char *canon, *compat;
+ canon = (char *) dec_canonical(uv);
+ compat = (char *) dec_compat(uv);
+ if (compat && (!canon || strNE(canon, compat)))
+ XSRETURN_YES; /* NFC_NO or NFKC_NO */
+ else
+ XSRETURN_NO;
+ }
+ else
+ XSRETURN_NO;
+
+
SV*
getComposite(uv, uv2)
@@ -373,6 +560,8 @@ getComposite(uv, uv2)
OUTPUT:
RETVAL
+
+
SV*
getCanon(uv)
UV uv
@@ -382,7 +571,7 @@ getCanon(uv)
PREINIT:
U8 * rstr;
CODE:
- if(Hangul_IsS(uv)) {
+ if (Hangul_IsS(uv)) {
SV * dst;
dst = newSV(1);
(void)SvPOK_only(dst);
@@ -390,7 +579,8 @@ getCanon(uv)
RETVAL = dst;
} else {
rstr = ix ? dec_compat(uv) : dec_canonical(uv);
- if(!rstr) XSRETURN_UNDEF;
+ if (!rstr)
+ XSRETURN_UNDEF;
RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
}
SvUTF8_on(RETVAL);