diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2002-04-02 20:35:13 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2002-04-02 20:35:13 +0000 |
commit | 872c91ae155f6880f8bf2b15c143bda5279a5794 (patch) | |
tree | 663505dbece632b3e75b0158fb36720c2d54829f /regexec.c | |
parent | b929be1d26aeff4c8c3d4bbd655731a23247ecc8 (diff) | |
download | perl-872c91ae155f6880f8bf2b15c143bda5279a5794.tar.gz |
What started as a small nit (the charnames test, nit found
by Hugo), ballooned a bit... the goal is Larry's wish that
illegal Unicode (such as U+FFFF) by default doesn't warn,
since what if somebody WANTS to create illegal Unicode?
Now getting close to this in the regex runtime.
(Also, fix more of my fixation that BOM would be U+FFFE.)
p4raw-id: //depot/perl@15689
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 68 |
1 files changed, 51 insertions, 17 deletions
@@ -999,8 +999,10 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta to_utf8_lower((U8*)m, tmpbuf1, &ulen1); to_utf8_upper((U8*)m, tmpbuf2, &ulen2); - c1 = utf8_to_uvchr(tmpbuf1, 0); - c2 = utf8_to_uvchr(tmpbuf2, 0); + c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC, + 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); + c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC, + 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } else { c1 = *(U8*)m; @@ -1037,7 +1039,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (c1 == c2) { while (s <= e) { - c = utf8_to_uvchr((U8*)s, &len); + c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); if ( c == c1 && (ln == len || ibcmp_utf8(s, (char **)0, 0, do_utf8, @@ -1062,7 +1066,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else { while (s <= e) { - c = utf8_to_uvchr((U8*)s, &len); + c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); /* Handle some of the three Greek sigmas cases. * Note that not all the possible combinations @@ -2390,7 +2396,9 @@ S_regmatch(pTHX_ regnode *prog) if (l >= PL_regeol) sayNO; if (NATIVE_TO_UNI(*(U8*)s) != - utf8_to_uvuni((U8*)l, &ulen)) + utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY)) sayNO; l += ulen; s ++; @@ -2402,7 +2410,9 @@ S_regmatch(pTHX_ regnode *prog) if (l >= PL_regeol) sayNO; if (NATIVE_TO_UNI(*((U8*)l)) != - utf8_to_uvuni((U8*)s, &ulen)) + utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY)) sayNO; s += ulen; l ++; @@ -3545,11 +3555,17 @@ S_regmatch(pTHX_ regnode *prog) to_utf8_lower((U8*)s, tmpbuf1, &ulen1); to_utf8_upper((U8*)s, tmpbuf2, &ulen2); - c1 = utf8_to_uvuni(tmpbuf1, 0); - c2 = utf8_to_uvuni(tmpbuf2, 0); + c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? 
+ 0 : UTF8_ALLOW_ANY); + c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); } else { - c2 = c1 = utf8_to_uvchr(s, NULL); + c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); } } } @@ -3605,16 +3621,24 @@ S_regmatch(pTHX_ regnode *prog) else { STRLEN len; if (c1 == c2) { - /* count initialised to utf8_distance(old, locinput) */ + /* count initialised to + * utf8_distance(old, locinput) */ while (locinput <= e && - utf8_to_uvchr((U8*)locinput, &len) != c1) { + utf8n_to_uvchr((U8*)locinput, + UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY) != c1) { locinput += len; count++; } } else { - /* count initialised to utf8_distance(old, locinput) */ + /* count initialised to + * utf8_distance(old, locinput) */ while (locinput <= e) { - UV c = utf8_to_uvchr((U8*)locinput, &len); + UV c = utf8n_to_uvchr((U8*)locinput, + UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); if (c == c1 || c == c2) break; locinput += len; @@ -3648,7 +3672,10 @@ S_regmatch(pTHX_ regnode *prog) UV c; if (c1 != -1000) { if (do_utf8) - c = utf8_to_uvchr((U8*)PL_reginput, NULL); + c = utf8n_to_uvchr((U8*)PL_reginput, + UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); else c = UCHARAT(PL_reginput); /* If it could work, try it. */ @@ -3695,7 +3722,10 @@ S_regmatch(pTHX_ regnode *prog) while (n >= ln) { if (c1 != -1000) { if (do_utf8) - c = utf8_to_uvchr((U8*)PL_reginput, NULL); + c = utf8n_to_uvchr((U8*)PL_reginput, + UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); else c = UCHARAT(PL_reginput); } @@ -3715,7 +3745,10 @@ S_regmatch(pTHX_ regnode *prog) while (n >= ln) { if (c1 != -1000) { if (do_utf8) - c = utf8_to_uvchr((U8*)PL_reginput, NULL); + c = utf8n_to_uvchr((U8*)PL_reginput, + UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? 
+ 0 : UTF8_ALLOW_ANY); else c = UCHARAT(PL_reginput); } @@ -4297,7 +4330,8 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b STRLEN len = 0; STRLEN plen; - c = do_utf8 ? utf8_to_uvchr(p, &len) : *p; + c = do_utf8 ? utf8n_to_uvchr(p, UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY) : *p; plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c)); if (do_utf8 || (flags & ANYOF_UNICODE)) { |