summary refs log tree commit diff
diff options
context:
space:
mode:
author Jarkko Hietaniemi <jhi@iki.fi> 2000-11-26 19:01:05 +0000
committer Jarkko Hietaniemi <jhi@iki.fi> 2000-11-26 19:01:05 +0000
commit 02eb7b47b8a6793752e5b001af6e62c374b2c440 (patch)
tree 07c212ac1a760e5468e7b769b861a1fe00d96718
parent 21477fb41342ef2f0f21af5ef95caf64eee65dee (diff)
download perl-02eb7b47b8a6793752e5b001af6e62c374b2c440.tar.gz
Make utf8_length() and utf8_distance() (the latter of which
is unused at the moment) less forgiving about bad UTF-8.

p4raw-id: //depot/perl@7869
-rwxr-xr-x embed.pl 2
-rw-r--r-- proto.h 2
-rw-r--r-- utf8.c 28
3 files changed, 21 insertions, 11 deletions
diff --git a/embed.pl b/embed.pl
index 1d35bf6917..6412ef6b9a 100755
--- a/embed.pl
+++ b/embed.pl
@@ -2072,7 +2072,7 @@ p |void |utilize |int aver|I32 floor|OP* version|OP* id|OP* arg
Ap |U8* |utf16_to_utf8 |U8* p|U8 *d|I32 bytelen|I32 *newlen
Ap |U8* |utf16_to_utf8_reversed|U8* p|U8 *d|I32 bytelen|I32 *newlen
Ap |STRLEN |utf8_length |U8* s|U8 *e
-Ap |I32 |utf8_distance |U8 *a|U8 *b
+Ap |IV |utf8_distance |U8 *a|U8 *b
Ap |U8* |utf8_hop |U8 *s|I32 off
ApM |U8* |utf8_to_bytes |U8 *s|STRLEN *len
ApM |U8* |bytes_to_utf8 |U8 *s|STRLEN *len
diff --git a/proto.h b/proto.h
index 2a601956f9..1e34c81cec 100644
--- a/proto.h
+++ b/proto.h
@@ -807,7 +807,7 @@ PERL_CALLCONV void Perl_utilize(pTHX_ int aver, I32 floor, OP* version, OP* id,
PERL_CALLCONV U8* Perl_utf16_to_utf8(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen);
PERL_CALLCONV U8* Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen);
PERL_CALLCONV STRLEN Perl_utf8_length(pTHX_ U8* s, U8 *e);
-PERL_CALLCONV I32 Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
+PERL_CALLCONV IV Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off);
PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
diff --git a/utf8.c b/utf8.c
index fc625dc464..d25b43bbe7 100644
--- a/utf8.c
+++ b/utf8.c
@@ -357,8 +357,8 @@ Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen)
=for apidoc|utf8_length|U8 *s|U8 *e
Return the length of the UTF-8 char encoded string C<s> in characters.
-Stops at string C<e>. If C<e E<lt> s> or if the scan would end up
-past C<e>, return -1.
+Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
+up past C<e>, croaks.
=cut
*/
@@ -369,12 +369,12 @@ Perl_utf8_length(pTHX_ U8* s, U8* e)
STRLEN len = 0;
if (e < s)
- return -1;
+ Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
while (s < e) {
- STRLEN t = UTF8SKIP(s);
+ U8 t = UTF8SKIP(s);
if (e - s < t)
- return -1;
+ Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
s += t;
len++;
}
@@ -385,22 +385,32 @@ Perl_utf8_length(pTHX_ U8* s, U8* e)
/* utf8_distance(a,b) returns the number of UTF8 characters between
the pointers a and b */
-I32
+IV
Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
{
- I32 off = 0;
+ IV off = 0;
+
if (a < b) {
while (a < b) {
- a += UTF8SKIP(a);
+ U8 c = UTF8SKIP(a);
+
+ if (b - a < c)
+ Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
+ a += c;
off--;
}
}
else {
while (b < a) {
- b += UTF8SKIP(b);
+ U8 c = UTF8SKIP(b);
+
+ if (a - b < c)
+ Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
+ b += c;
off++;
}
}
+
return off;
}