diff options
author | Ton Hospel <perl5-porters@ton.iguana.be> | 2005-03-06 18:29:38 +0000 |
---|---|---|
committer | Rafael Garcia-Suarez <rgarciasuarez@gmail.com> | 2005-03-08 17:53:50 +0000 |
commit | f337b084e4f053c4222a0b9a773a9e12c0232e6d (patch) | |
tree | 1292203ca74046d2df21ce05bb8f8289ea14bc8d /pp_pack.c | |
parent | c478aefb95db58c5f937ab7c70bba552d23df9b2 (diff) | |
download | perl-f337b084e4f053c4222a0b9a773a9e12c0232e6d.tar.gz |
Encoding neutral unpack
Message-Id: <d0fi6i$k06$1@post.home.lunix>
p4raw-id: //depot/perl@24010
Diffstat (limited to 'pp_pack.c')
-rw-r--r-- | pp_pack.c | 1542 |
1 files changed, 955 insertions, 587 deletions
@@ -32,8 +32,12 @@ #include "perl.h" #if PERL_VERSION >= 9 -#define PERL_PACK_CAN_BYTEORDER -#define PERL_PACK_CAN_SHRIEKSIGN +# define PERL_PACK_CAN_BYTEORDER +# define PERL_PACK_CAN_SHRIEKSIGN +#endif + +#ifndef CHAR_BIT +# define CHAR_BIT 8 #endif /* @@ -77,32 +81,52 @@ # define OFF32(p) ((char *) (p)) #endif -#define COPY16(s,p) Copy(s, OFF16(p), SIZE16, char) -#define COPY32(s,p) Copy(s, OFF32(p), SIZE32, char) -#define CAT16(sv,p) sv_catpvn(sv, OFF16(p), SIZE16) -#define CAT32(sv,p) sv_catpvn(sv, OFF32(p), SIZE32) +/* Only to be used inside a loop (see the break) */ +#define SHIFT16(utf8, s, strend, p, datumtype) STMT_START { \ + if (utf8) { \ + if (!uni_to_bytes(aTHX_ &(s), strend, OFF16(p), SIZE16, datumtype)) break; \ + } else { \ + Copy(s, OFF16(p), SIZE16, char); \ + (s) += SIZE16; \ + } \ +} STMT_END + +/* Only to be used inside a loop (see the break) */ +#define SHIFT32(utf8, s, strend, p, datumtype) STMT_START { \ + if (utf8) { \ + if (!uni_to_bytes(aTHX_ &(s), strend, OFF32(p), SIZE32, datumtype)) break; \ + } else { \ + Copy(s, OFF32(p), SIZE32, char); \ + (s) += SIZE32; \ + } \ +} STMT_END + +#define PUSH16(utf8, cur, p) PUSH_BYTES(utf8, cur, OFF16(p), SIZE16) +#define PUSH32(utf8, cur, p) PUSH_BYTES(utf8, cur, OFF32(p), SIZE32) /* Only to be used inside a loop (see the break) */ -#define COPYVAR(s,strend,utf8,var,format) \ +#define SHIFT_VAR(utf8, s, strend, var, datumtype) \ STMT_START { \ if (utf8) { \ - if (!next_uni_bytes(aTHX_ &s, strend, \ - (char *) &var, sizeof(var))) break; \ + if (!uni_to_bytes(aTHX_ &s, strend, \ + (char *) &var, sizeof(var), datumtype)) break;\ } else { \ Copy(s, (char *) &var, sizeof(var), char); \ s += sizeof(var); \ } \ - DO_BO_UNPACK(var, format); \ } STMT_END +#define PUSH_VAR(utf8, aptr, var) \ + PUSH_BYTES(utf8, aptr, (char *) &(var), sizeof(var)) + /* Avoid stack overflow due to pathological templates. 100 should be plenty. */ #define MAX_SUB_TEMPLATE_LEVEL 100 /* flags (note that type modifiers can also be used as flags!) */ -#define FLAG_UNPACK_WAS_UTF8 0x40 /* original had FLAG_UNPACK_DO_UTF8 */ -#define FLAG_UNPACK_PARSE_UTF8 0x20 /* Parse as utf8 */ +#define FLAG_WAS_UTF8 0x40 +#define FLAG_PARSE_UTF8 0x20 /* Parse as utf8 */ #define FLAG_UNPACK_ONLY_ONE 0x10 -#define FLAG_UNPACK_DO_UTF8 0x08 /* The underlying string is utf8 */ +#define FLAG_DO_UTF8 0x08 /* The underlying string is utf8 */ #define FLAG_SLASH 0x04 #define FLAG_COMMA 0x02 #define FLAG_PACK 0x01 @@ -151,6 +175,7 @@ S_mul128(pTHX_ SV *sv, U8 m) #define TYPE_IS_SHRIEKING 0x100 #define TYPE_IS_BIG_ENDIAN 0x200 #define TYPE_IS_LITTLE_ENDIAN 0x400 +#define TYPE_IS_PACK 0x800 #define TYPE_ENDIANNESS_MASK (TYPE_IS_BIG_ENDIAN|TYPE_IS_LITTLE_ENDIAN) #define TYPE_MODIFIERS(t) ((t) & ~0xFF) #define TYPE_NO_MODIFIERS(t) ((t) & 0xFF) @@ -177,7 +202,7 @@ S_mul128(pTHX_ SV *sv, U8 m) # define DO_BO_UNPACK_P(var) # define DO_BO_PACK_P(var) -#else +#else /* PERL_PACK_CAN_BYTEORDER */ # define TYPE_ENDIANNESS(t) ((t) & TYPE_ENDIANNESS_MASK) # define TYPE_NO_ENDIANNESS(t) ((t) & ~TYPE_ENDIANNESS_MASK) @@ -281,10 +306,10 @@ S_mul128(pTHX_ SV *sv, U8 m) # define DO_BO_PACK_N(var, type) BO_CANT_DOIT(pack, type) # endif -#endif +#endif /* PERL_PACK_CAN_BYTEORDER */ #define PACK_SIZE_CANNOT_CSUM 0x80 -#define PACK_SIZE_SPARE 0x40 +#define PACK_SIZE_UNPREDICTABLE 0x40 /* Not a fixed size element */ #define PACK_SIZE_MASK 0x3F @@ -326,9 +351,9 @@ unsigned char size_normal[53] = { 0, /* S */ SIZE16, 0, - /* U */ sizeof(char), + /* U */ sizeof(char) | PACK_SIZE_UNPREDICTABLE, /* V */ SIZE32, - /* W */ sizeof(unsigned char), + /* W */ sizeof(unsigned char) | PACK_SIZE_UNPREDICTABLE, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* c */ sizeof(char), /* d */ sizeof(double), @@ -352,7 +377,7 @@ unsigned char size_normal[53] = { /* s */ SIZE16, 0, 0, /* v */ SIZE16, - /* w */ sizeof(char) | PACK_SIZE_CANNOT_CSUM, + /* w */ sizeof(char) | PACK_SIZE_UNPREDICTABLE | PACK_SIZE_CANNOT_CSUM, }; unsigned char size_shrieking[46] = { /* I */ sizeof(unsigned int), @@ -421,7 +446,7 @@ unsigned char size_normal[100] = { /* s */ SIZE16, 0, 0, /* v */ SIZE16, - /* w */ sizeof(char) | PACK_SIZE_CANNOT_CSUM, + /* w */ sizeof(char) | PACK_SIZE_UNPREDICTABLE | PACK_SIZE_CANNOT_CSUM, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* C */ sizeof(unsigned char), @@ -449,9 +474,9 @@ unsigned char size_normal[100] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, /* S */ SIZE16, 0, - /* U */ sizeof(char), + /* U */ sizeof(char) | PACK_SIZE_UNPREDICTABLE, /* V */ SIZE32, - /* W */ sizeof(unsigned char), + /* W */ sizeof(unsigned char) | PACK_SIZE_UNPREDICTABLE, }; unsigned char size_shrieking[93] = { /* i */ sizeof(int), @@ -498,34 +523,35 @@ struct packsize_t packsize[2] = { #endif STATIC U8 -next_uni_byte(pTHX_ char **s, const char *end, I32 datumtype) +uni_to_byte(pTHX_ char **s, const char *end, I32 datumtype) { UV val; STRLEN retlen; - val = - UNI_TO_NATIVE(utf8n_to_uvuni((U8*)*s, end-*s, &retlen, - ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)); + val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen, + ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); /* We try to process malformed UTF-8 as much as possible (preferrably with warnings), but these two mean we make no progress in the string and might enter an infinite loop */ if (retlen == (STRLEN) -1 || retlen == 0) - Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); + Perl_croak(aTHX_ "Malformed UTF-8 string in '%c' format in unpack", + (int) TYPE_NO_MODIFIERS(datumtype)); if (val >= 0x100) { + if (ckWARN(WARN_UNPACK)) Perl_warner(aTHX_ packWARN(WARN_UNPACK), "Character in '%c' format wrapped in unpack", - (int) datumtype); + (int) TYPE_NO_MODIFIERS(datumtype)); val &= 0xff; } *s += retlen; return val; } -#define NEXT_BYTE(utf8, s, strend, datumtype) ((utf8) ? \ - next_uni_byte(aTHX_ &(s), (strend), (datumtype)) : \ +#define SHIFT_BYTE(utf8, s, strend, datumtype) ((utf8) ? \ + uni_to_byte(aTHX_ &(s), (strend), (datumtype)) : \ *(U8 *)(s)++) STATIC bool -next_uni_bytes(pTHX_ char **s, char *end, char *buf, int buf_len) +uni_to_bytes(pTHX_ char **s, char *end, char *buf, int buf_len, I32 datumtype) { UV val; STRLEN retlen; @@ -535,7 +561,7 @@ next_uni_bytes(pTHX_ char **s, char *end, char *buf, int buf_len) UTF8_CHECK_ONLY : (UTF8_CHECK_ONLY | UTF8_ALLOW_ANY); for (;buf_len > 0; buf_len--) { if (from >= end) return FALSE; - val = UNI_TO_NATIVE(utf8n_to_uvuni((U8*)from, end-from, &retlen, flags)); + val = utf8n_to_uvchr((U8 *) from, end-from, &retlen, flags); if (retlen == (STRLEN) -1 || retlen == 0) { from += UTF8SKIP(from); bad |= 1; @@ -554,13 +580,16 @@ next_uni_bytes(pTHX_ char **s, char *end, char *buf, int buf_len) flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY; for (ptr = *s; ptr < from; ptr += UTF8SKIP(ptr)) { if (ptr >= end) break; - utf8n_to_uvuni((U8*)ptr, end-ptr, &retlen, flags); + utf8n_to_uvuni((U8 *) ptr, end-ptr, &retlen, flags); } if (from > end) from = end; } if ((bad & 2) && ckWARN(WARN_UNPACK)) - Perl_warner(aTHX_ packWARN(WARN_UNPACK), - "Character(s) wrapped in unpack"); + Perl_warner(aTHX_ packWARN(datumtype & TYPE_IS_PACK ? + WARN_PACK : WARN_UNPACK), + "Character(s) in '%c' format wrapped in %s", + (int) TYPE_NO_MODIFIERS(datumtype), + datumtype & TYPE_IS_PACK ? "pack" : "unpack"); } *s = from; return TRUE; @@ -571,43 +600,118 @@ next_uni_uu(pTHX_ char **s, const char *end, I32 *out) { UV val; STRLEN retlen; - char *from = *s; - val = UNI_TO_NATIVE(utf8n_to_uvuni((U8*)*s, end-*s, &retlen, UTF8_CHECK_ONLY)); + val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen, UTF8_CHECK_ONLY); if (val >= 0x100 || !ISUUCHAR(val) || retlen == (STRLEN) -1 || retlen == 0) { *out = 0; return FALSE; } *out = PL_uudmap[val] & 077; - *s = from; + *s += retlen; return TRUE; } +STATIC void +bytes_to_uni(pTHX_ U8 *start, STRLEN len, char **dest) { + U8 buffer[UTF8_MAXLEN]; + U8 *end = start + len; + char *d = *dest; + while (start < end) { + int length = + uvuni_to_utf8_flags(buffer, NATIVE_TO_UNI(*start), 0) - buffer; + switch(length) { + case 1: + *d++ = buffer[0]; + break; + case 2: + *d++ = buffer[0]; + *d++ = buffer[1]; + break; + default: + Perl_croak(aTHX_ "Perl bug: value %d UTF-8 expands to %d bytes", + *start, length); + } + start++; + } + *dest = d; +} + +#define PUSH_BYTES(utf8, cur, buf, len) \ +STMT_START { \ + if (utf8) bytes_to_uni(aTHX_ buf, len, &(cur)); \ + else { \ + Copy(buf, cur, len, char); \ + (cur) += (len); \ + } \ +} STMT_END + +#define GROWING(utf8, cat, start, cur, in_len) \ +STMT_START { \ + STRLEN glen = (in_len); \ + if (utf8) glen *= 2; \ + if ((cur) + glen >= (start) + SvLEN(cat)) { \ + (start) = sv_exp_grow(aTHX_ cat, glen); \ + (cur) = (start) + SvCUR(cat); \ + } \ +} STMT_END + +#define PUSH_GROWING_BYTES(utf8, cat, start, cur, buf, in_len) \ +STMT_START { \ + STRLEN glen = (in_len); \ + STRLEN gl = glen; \ + if (utf8) gl *= 2; \ + if ((cur) + gl >= (start) + SvLEN(cat)) { \ + *cur = '\0'; \ + SvCUR(cat) = (cur) - (start); \ + (start) = sv_exp_grow(aTHX_ cat, gl); \ + (cur) = (start) + SvCUR(cat); \ + } \ + PUSH_BYTES(utf8, cur, buf, glen); \ +} STMT_END + +#define PUSH_BYTE(utf8, s, byte) \ +STMT_START { \ + if (utf8) { \ + U8 au8 = (byte); \ + bytes_to_uni(aTHX_ &au8, 1, &(s)); \ + } else *(U8 *)(s)++ = (byte); \ +} STMT_END + +/* Only to be used inside a loop (see the break) */ +#define NEXT_UNI_VAL(val, cur, str, end, utf8_flags) \ +STMT_START { \ + STRLEN retlen; \ + if (str >= end) break; \ + val = utf8n_to_uvchr((U8 *) str, end-str, &retlen, utf8_flags); \ + if (retlen == (STRLEN) -1 || retlen == 0) { \ + *cur = '\0'; \ + Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); \ + } \ + str += retlen; \ +} STMT_END + /* Returns the sizeof() struct described by pat */ STATIC I32 -S_measure_struct(pTHX_ register tempsym_t* symptr) +S_measure_struct(pTHX_ tempsym_t* symptr) { - register I32 len = 0; - register I32 total = 0; - int star; - - register int size; + I32 total = 0; while (next_symbol(symptr)) { - int which = (symptr->code & TYPE_IS_SHRIEKING) - ? PACK_SIZE_SHRIEKING : PACK_SIZE_NORMAL; - int offset - = TYPE_NO_MODIFIERS(symptr->code) - packsize[which].first; - - switch( symptr->howlen ){ - case e_no_len: - case e_number: - len = symptr->length; - break; + I32 len; + int star, size; + int which = (symptr->code & TYPE_IS_SHRIEKING) ? + PACK_SIZE_SHRIEKING : PACK_SIZE_NORMAL; + int offset = TYPE_NO_MODIFIERS(symptr->code) - packsize[which].first; + + switch (symptr->howlen) { case e_star: Perl_croak(aTHX_ "Within []-length '*' not allowed in %s", symptr->flags & FLAG_PACK ? "pack" : "unpack" ); break; + default: + /* e_no_len and e_number */ + len = symptr->length; + break; } if ((offset >= 0) && (offset < packsize[which].size)) @@ -746,10 +850,11 @@ S_get_num(pTHX_ register char *patptr, I32 *lenptr ) * locates next template code and count */ STATIC bool -S_next_symbol(pTHX_ register tempsym_t* symptr ) +S_next_symbol(pTHX_ tempsym_t* symptr ) { - register char* patptr = symptr->patptr; - register char* patend = symptr->patend; + char* patptr = symptr->patptr; + char* patend = symptr->patend; + const char *allowed = ""; symptr->flags &= ~FLAG_SLASH; @@ -797,7 +902,6 @@ S_next_symbol(pTHX_ register tempsym_t* symptr ) /* look for modifiers */ while (patptr < patend) { - const char *allowed; I32 modifier = 0; switch (*patptr) { case '!': @@ -813,7 +917,7 @@ S_next_symbol(pTHX_ register tempsym_t* symptr ) modifier = TYPE_IS_LITTLE_ENDIAN; allowed = ENDIANNESS_ALLOWED_TYPES; break; -#endif +#endif /* PERL_PACK_CAN_BYTEORDER */ default: break; } @@ -965,23 +1069,23 @@ and ocnt are not used. This call should not be used, use unpackstring instead. =cut */ I32 -Perl_unpack_str(pTHX_ char *pat, register char *patend, register char *s, char *strbeg, char *strend, char **new_s, I32 ocnt, U32 flags) +Perl_unpack_str(pTHX_ char *pat, char *patend, char *s, char *strbeg, char *strend, char **new_s, I32 ocnt, U32 flags) { tempsym_t sym = { 0 }; - if (flags & FLAG_UNPACK_DO_UTF8) flags |= FLAG_UNPACK_WAS_UTF8; + if (flags & FLAG_DO_UTF8) flags |= FLAG_WAS_UTF8; else if (need_utf8(pat, patend)) { /* We probably should try to avoid this in case a scalar context call wouldn't get to the "U0" */ STRLEN len = strend - s; - s = (char*)bytes_to_utf8((U8*)s, &len); + s = (char *) bytes_to_utf8(s, &len); SAVEFREEPV(s); strend = s + len; - flags |= FLAG_UNPACK_DO_UTF8; + flags |= FLAG_DO_UTF8; } - if (first_symbol(pat, patend) != 'U' && (flags & FLAG_UNPACK_DO_UTF8)) - flags |= FLAG_UNPACK_PARSE_UTF8; + if (first_symbol(pat, patend) != 'U' && (flags & FLAG_DO_UTF8)) + flags |= FLAG_PARSE_UTF8; sym.patptr = pat; sym.patend = patend; @@ -1000,23 +1104,23 @@ Issue C<PUTBACK> before and C<SPAGAIN> after the call to this function. =cut */ I32 -Perl_unpackstring(pTHX_ char *pat, register char *patend, register char *s, char *strend, U32 flags) +Perl_unpackstring(pTHX_ char *pat, char *patend, char *s, char *strend, U32 flags) { tempsym_t sym = { 0 }; - if (flags & FLAG_UNPACK_DO_UTF8) flags |= FLAG_UNPACK_WAS_UTF8; + if (flags & FLAG_DO_UTF8) flags |= FLAG_WAS_UTF8; else if (need_utf8(pat, patend)) { /* We probably should try to avoid this in case a scalar context call wouldn't get to the "U0" */ STRLEN len = strend - s; - s = (char*)bytes_to_utf8((U8*)s, &len); + s = (char *) bytes_to_utf8(s, &len); SAVEFREEPV(s); strend = s + len; - flags |= FLAG_UNPACK_DO_UTF8; + flags |= FLAG_DO_UTF8; } - if (first_symbol(pat, patend) != 'U' && (flags & FLAG_UNPACK_DO_UTF8)) - flags |= FLAG_UNPACK_PARSE_UTF8; + if (first_symbol(pat, patend) != 'U' && (flags & FLAG_DO_UTF8)) + flags |= FLAG_PARSE_UTF8; sym.patptr = pat; sym.patend = patend; @@ -1030,8 +1134,6 @@ I32 S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char **new_s ) { dSP; - I32 datumtype, ai32; - I32 len = 0; SV *sv; I32 start_sp_offset = SP - PL_stack_base; howlen_t howlen; @@ -1039,15 +1141,16 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char I32 checksum = 0; UV cuv = 0; NV cdouble = 0.0; - const int bits_in_uv = 8 * sizeof(cuv); + const int bits_in_uv = CHAR_BIT * sizeof(cuv); char* strrelbeg = s; bool beyond = FALSE; bool explicit_length; bool unpack_only_one = (symptr->flags & FLAG_UNPACK_ONLY_ONE) != 0; - bool utf8 = (symptr->flags & FLAG_UNPACK_PARSE_UTF8) ? 1 : 0; + bool utf8 = (symptr->flags & FLAG_PARSE_UTF8) ? 1 : 0; while (next_symbol(symptr)) { - datumtype = symptr->code; + I32 len, ai32; + I32 datumtype = symptr->code; /* do first one only unless in list context / is implemented by unpacking the count, then popping it from the stack, so must check that we're not in the middle of a / */ @@ -1056,28 +1159,29 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char && (datumtype != '/') ) /* XXX can this be omitted */ break; - switch( howlen = symptr->howlen ){ - case e_no_len: - case e_number: - len = symptr->length; - break; + switch (howlen = symptr->howlen) { case e_star: len = strend - strbeg; /* long enough */ break; + default: + /* e_no_len and e_number */ + len = symptr->length; + break; } explicit_length = TRUE; redo_switch: beyond = s >= strend; { - int which = (symptr->code & TYPE_IS_SHRIEKING) - ? PACK_SIZE_SHRIEKING : PACK_SIZE_NORMAL; + struct packsize_t *pack_props = + &packsize[(symptr->code & TYPE_IS_SHRIEKING) ? + PACK_SIZE_SHRIEKING : PACK_SIZE_NORMAL]; const int rawtype = TYPE_NO_MODIFIERS(datumtype); - int offset = rawtype - packsize[which].first; + int offset = rawtype - pack_props->first; - if (offset >= 0 && offset < packsize[which].size) { + if (offset >= 0 && offset < pack_props->size) { /* Data about this template letter */ - unsigned char data = packsize[which].array[offset]; + unsigned char data = pack_props->array[offset]; if (data) { /* data nonzero means we can process this letter. */ @@ -1116,8 +1220,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char PUTBACK; while (len--) { symptr->patptr = savsym.grpbeg; - if (utf8) symptr->flags |= FLAG_UNPACK_PARSE_UTF8; - else symptr->flags &= ~FLAG_UNPACK_PARSE_UTF8; + if (utf8) symptr->flags |= FLAG_PARSE_UTF8; + else symptr->flags &= ~FLAG_PARSE_UTF8; unpack_rec(symptr, s, strbeg, strend, &s); if (s == strend && savsym.howlen == e_star) break; /* No way to continue */ @@ -1150,22 +1254,28 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char len = 1; if (utf8) { char *hop, *last; - I32 l; - for (l=len, hop = strbeg; hop < s; l++, hop += UTF8SKIP(hop)) - if (l == len) { + I32 l = len; + hop = last = strbeg; + while (hop < s) { + hop += UTF8SKIP(hop); + if (--l == 0) { last = hop; - l = 0; + l = len; + } } + if (last > s) + Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); s = last; break; - } else len = (s - strbeg) % len; + } + len = (s - strbeg) % len; /* FALL THROUGH */ case 'X': if (utf8) { while (len > 0) { if (s <= strbeg) Perl_croak(aTHX_ "'X' outside of string in unpack"); - while (UTF8_IS_CONTINUATION(*--s)) { + while (--s, UTF8_IS_CONTINUATION(*s)) { if (s <= strbeg) Perl_croak(aTHX_ "'X' outside of string in unpack"); } @@ -1180,14 +1290,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'x' | TYPE_IS_SHRIEKING: if (!len) /* Avoid division by 0 */ len = 1; - if (utf8) { - char *hop = strbeg; - I32 l = 0; - for (hop = strbeg; hop < s; hop += UTF8SKIP(hop)) l++; - if (s != hop) - Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); - ai32 = l % len; - } else ai32 = (s - strbeg) % len; + if (utf8) ai32 = utf8_length(strbeg, s) % len; + else ai32 = (s - strbeg) % len; if (ai32 == 0) break; len -= ai32; /* FALL THROUGH */ @@ -1203,7 +1307,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char if (len > strend - s) Perl_croak(aTHX_ "'x' outside of string in unpack"); s += len; - }; + } break; case '/': Perl_croak(aTHX_ "'/' must follow a numeric type in unpack"); @@ -1234,8 +1338,9 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char if (datumtype == 'Z') { /* 'Z' strips stuff after first null */ - char *ptr; - for (ptr = s; ptr < strend; ptr++) if (*ptr == 0) break; + char *ptr, *end; + end = s + len; + for (ptr = s; ptr < end; ptr++) if (*ptr == 0) break; sv = newSVpvn(s, ptr-s); if (howlen == e_star) /* exact for 'Z*' */ len = ptr-s + (ptr != strend ? 1 : 0); @@ -1251,7 +1356,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char if (utf8) { SvUTF8_on(sv); /* Undo any upgrade done due to need_utf8() */ - if (!(symptr->flags & FLAG_UNPACK_WAS_UTF8)) + if (!(symptr->flags & FLAG_WAS_UTF8)) sv_utf8_downgrade(sv, 0); } XPUSHs(sv_2mortal(sv)); @@ -1277,32 +1382,30 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char if (bits & 128) PL_bitcount[bits]++; } } - if (utf8) { + if (utf8) while (len >= 8 && s < strend) { - cuv += PL_bitcount[next_uni_byte(aTHX_ &s, strend, datumtype)]; + cuv += PL_bitcount[uni_to_byte(aTHX_ &s, strend, datumtype)]; len -= 8; } - } else { + else while (len >= 8) { cuv += PL_bitcount[*(U8 *)s++]; len -= 8; } - } if (len && s < strend) { U8 bits; - bits = NEXT_BYTE(utf8, s, strend, datumtype); - if (datumtype == 'b') { + bits = SHIFT_BYTE(utf8, s, strend, datumtype); + if (datumtype == 'b') while (len-- > 0) { if (bits & 1) cuv++; bits >>= 1; } - } else { + else while (len-- > 0) { if (bits & 0x80) cuv++; bits <<= 1; } } - } break; } @@ -1310,24 +1413,24 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char SvPOK_on(sv); str = SvPVX(sv); if (datumtype == 'b') { - U8 bits; + U8 bits = 0; ai32 = len; for (len = 0; len < ai32; len++) { if (len & 7) bits >>= 1; else if (utf8) { if (s >= strend) break; - bits = next_uni_byte(aTHX_ &s, strend, datumtype); + bits = uni_to_byte(aTHX_ &s, strend, datumtype); } else bits = *(U8 *) s++; *str++ = bits & 1 ? '1' : '0'; } } else { - U8 bits; + U8 bits = 0; ai32 = len; for (len = 0; len < ai32; len++) { if (len & 7) bits <<= 1; else if (utf8) { if (s >= strend) break; - bits = next_uni_byte(aTHX_ &s, strend, datumtype); + bits = uni_to_byte(aTHX_ &s, strend, datumtype); } else bits = *(U8 *) s++; *str++ = bits & 0x80 ? '1' : '0'; } @@ -1347,24 +1450,24 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char SvPOK_on(sv); str = SvPVX(sv); if (datumtype == 'h') { - U8 bits; + U8 bits = 0; ai32 = len; for (len = 0; len < ai32; len++) { if (len & 1) bits >>= 4; else if (utf8) { if (s >= strend) break; - bits = next_uni_byte(aTHX_ &s, strend, datumtype); + bits = uni_to_byte(aTHX_ &s, strend, datumtype); } else bits = * (U8 *) s++; *str++ = PL_hexdigit[bits & 15]; } } else { - U8 bits; + U8 bits = 0; ai32 = len; for (len = 0; len < ai32; len++) { if (len & 1) bits <<= 4; else if (utf8) { if (s >= strend) break; - bits = next_uni_byte(aTHX_ &s, strend, datumtype); + bits = uni_to_byte(aTHX_ &s, strend, datumtype); } else bits = *(U8 *) s++; *str++ = PL_hexdigit[(bits >> 4) & 15]; } @@ -1376,7 +1479,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char } case 'c': while (len-- > 0) { - int aint = NEXT_BYTE(utf8, s, strend, datumtype); + int aint = SHIFT_BYTE(utf8, s, strend, datumtype); if (aint >= 128) /* fake up signed chars */ aint -= 256; if (!checksum) @@ -1393,18 +1496,17 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char if (len == 0) { if (explicit_length && datumtype == 'C') /* Switch to "character" mode */ - utf8 = (symptr->flags & FLAG_UNPACK_DO_UTF8) ? 1 : 0; + utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0; break; } if (datumtype == 'C' ? - (symptr->flags & FLAG_UNPACK_DO_UTF8) && - !(symptr->flags & FLAG_UNPACK_WAS_UTF8) : utf8) { + (symptr->flags & FLAG_DO_UTF8) && + !(symptr->flags & FLAG_WAS_UTF8) : utf8) { while (len-- > 0 && s < strend) { UV val; STRLEN retlen; - val = - UNI_TO_NATIVE(utf8n_to_uvuni((U8*)s, strend-s, &retlen, - ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)); + val = utf8n_to_uvchr((U8 *) s, strend-s, &retlen, + ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); if (retlen == (STRLEN) -1 || retlen == 0) Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); s += retlen; @@ -1429,7 +1531,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char if (len == 0) { if (explicit_length) { /* Switch to "bytes in UTF-8" mode */ - if (symptr->flags & FLAG_UNPACK_DO_UTF8) utf8 = 0; + if (symptr->flags & FLAG_DO_UTF8) utf8 = 0; else /* Should be impossible due to the need_utf8() test */ Perl_croak(aTHX_ "U0 mode on a byte string"); @@ -1452,11 +1554,11 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char ptr = s; /* Bug: warns about bad utf8 even if we are short on bytes and will break out of the loop */ - if (!next_uni_bytes(aTHX_ &ptr, strend, (char*)result, 1)) + if (!uni_to_bytes(aTHX_ &ptr, strend, result, 1, 'U')) break; len = UTF8SKIP(result); - if (!next_uni_bytes(aTHX_ &ptr, strend, (char*)&result[1], len-1)) - break; + if (!uni_to_bytes(aTHX_ &ptr, strend, + &result[1], len-1, 'U')) break; auv = utf8n_to_uvuni(result, len, &retlen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV); s = ptr; } else { @@ -1477,7 +1579,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char #if SHORTSIZE != SIZE16 while (len-- > 0) { short ashort; - COPYVAR(s, strend, utf8, ashort, s); + SHIFT_VAR(utf8, s, strend, ashort, datumtype); + DO_BO_UNPACK(ashort, s); if (!checksum) PUSHs(sv_2mortal(newSViv((IV)ashort))); else if (checksum > bits_in_uv) @@ -1496,13 +1599,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char #if U16SIZE > SIZE16 ai16 = 0; #endif - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, - OFF16(&ai16), SIZE16)) break; - } else { - COPY16(s, &ai16); - s += SIZE16; - } + SHIFT16(utf8, s, strend, &ai16, datumtype); DO_BO_UNPACK(ai16, 16); #if U16SIZE > SIZE16 if (ai16 > 32767) @@ -1520,7 +1617,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char #if SHORTSIZE != SIZE16 while (len-- > 0) { unsigned short aushort; - COPYVAR(s, strend, utf8, aushort, s); + SHIFT_VAR(utf8, s, strend, aushort, datumtype); + DO_BO_UNPACK(aushort, s); if (!checksum) PUSHs(sv_2mortal(newSVuv((UV) aushort))); else if (checksum > bits_in_uv) @@ -1540,13 +1638,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char #if U16SIZE > SIZE16 au16 = 0; #endif - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, - OFF16(&au16), SIZE16)) break; - } else { - COPY16(s, &au16); - s += SIZE16; - } + SHIFT16(utf8, s, strend, &au16, datumtype); DO_BO_UNPACK(au16, 16); #ifdef HAS_NTOHS if (datumtype == 'n') @@ -1559,7 +1651,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char if (!checksum) PUSHs(sv_2mortal(newSVuv((UV)au16))); else if (checksum > bits_in_uv) - cdouble += (NV)au16; + cdouble += (NV) au16; else cuv += au16; } @@ -1572,13 +1664,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char # if U16SIZE > SIZE16 ai16 = 0; # endif - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, - (char *) &ai16, sizeof(ai16))) break; - } else { - COPY16(s, &ai16); - s += SIZE16; - } + SHIFT16(utf8, s, strend, &ai16, datumtype); # ifdef HAS_NTOHS if (datumtype == ('n' | TYPE_IS_SHRIEKING)) ai16 = (I16) PerlSock_ntohs((U16) ai16); @@ -1600,7 +1686,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'i' | TYPE_IS_SHRIEKING: while (len-- > 0) { int aint; - COPYVAR(s, strend, utf8, aint, i); + SHIFT_VAR(utf8, s, strend, aint, datumtype); + DO_BO_UNPACK(aint, i); if (!checksum) PUSHs(sv_2mortal(newSViv((IV)aint))); else if (checksum > bits_in_uv) @@ -1613,7 +1700,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'I' | TYPE_IS_SHRIEKING: while (len-- > 0) { unsigned int auint; - COPYVAR(s, strend, utf8, auint, i); + SHIFT_VAR(utf8, s, strend, auint, datumtype); + DO_BO_UNPACK(auint, i); if (!checksum) PUSHs(sv_2mortal(newSVuv((UV)auint))); else if (checksum > bits_in_uv) @@ -1625,12 +1713,13 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'j': while (len-- > 0) { IV aiv; + SHIFT_VAR(utf8, s, strend, aiv, datumtype); #if IVSIZE == INTSIZE - COPYVAR(s, strend, utf8, aiv, i); + DO_BO_UNPACK(aiv, i); #elif IVSIZE == LONGSIZE - COPYVAR(s, strend, utf8, aiv, l); + DO_BO_UNPACK(aiv, l); #elif defined(HAS_QUAD) && IVSIZE == U64SIZE - COPYVAR(s, strend, utf8, aiv, 64); + DO_BO_UNPACK(aiv, 64); #else Perl_croak(aTHX_ "'j' not supported on this platform"); #endif @@ -1645,12 +1734,13 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'J': while (len-- > 0) { UV auv; + SHIFT_VAR(utf8, s, strend, auv, datumtype); #if IVSIZE == INTSIZE - COPYVAR(s, strend, utf8, auv, i); + DO_BO_UNPACK(auv, i); #elif IVSIZE == LONGSIZE - COPYVAR(s, strend, utf8, auv, l); + DO_BO_UNPACK(auv, l); #elif defined(HAS_QUAD) && IVSIZE == U64SIZE - COPYVAR(s, strend, utf8, auv, 64); + DO_BO_UNPACK(auv, 64); #else Perl_croak(aTHX_ "'J' not supported on this platform"); #endif @@ -1666,7 +1756,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char #if LONGSIZE != SIZE32 while (len-- > 0) { long along; - COPYVAR(s, strend, utf8, along, l); + SHIFT_VAR(utf8, s, strend, along, datumtype); + DO_BO_UNPACK(along, l); if (!checksum) PUSHs(sv_2mortal(newSViv((IV)along))); else if (checksum > bits_in_uv) @@ -1684,13 +1775,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char #if U32SIZE > SIZE32 ai32 = 0; #endif - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, - OFF32(&ai32), SIZE32)) break; - } else { - COPY32(s, &ai32); - s += SIZE32; - } + SHIFT32(utf8, s, strend, &ai32, datumtype); DO_BO_UNPACK(ai32, 32); #if U32SIZE > SIZE32 if (ai32 > 2147483647) ai32 -= 4294967296; @@ -1707,7 +1792,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char #if LONGSIZE != SIZE32 while (len-- > 0) { unsigned long aulong; - COPYVAR(s, strend, utf8, aulong, l); + SHIFT_VAR(utf8, s, strend, aulong, datumtype); + DO_BO_UNPACK(aulong, l); if (!checksum) PUSHs(sv_2mortal(newSVuv((UV)aulong))); else if (checksum > bits_in_uv) @@ -1727,13 +1813,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char #if U32SIZE > SIZE32 au32 = 0; #endif - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, - OFF32(&au32), SIZE32)) break; - } else { - COPY32(s, &au32); - s += SIZE32; - } + SHIFT32(utf8, s, strend, &au32, datumtype); DO_BO_UNPACK(au32, 32); #ifdef HAS_NTOHL if (datumtype == 'N') @@ -1759,13 +1839,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char # if U32SIZE > SIZE32 ai32 = 0; # endif - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, - OFF32(&ai32), SIZE32)) break; - } else { - COPY32(s, &ai32); - s += SIZE32; - } + SHIFT32(utf8, s, strend, &ai32, datumtype); # ifdef HAS_NTOHL if (datumtype == ('N' | TYPE_IS_SHRIEKING)) ai32 = (I32)PerlSock_ntohl((U32)ai32); @@ -1786,13 +1860,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'p': while (len-- > 0) { char *aptr; - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, - (char *) &aptr, sizeof(aptr))) break; - } else { - Copy(s, &aptr, 1, char*); - s += sizeof(aptr); - } + SHIFT_VAR(utf8, s, strend, aptr, datumtype); DO_BO_UNPACK_P(aptr); /* newSVpv generates undef if aptr is NULL */ PUSHs(sv_2mortal(newSVpv(aptr, 0))); @@ -1805,7 +1873,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char while (len > 0 && s < strend) { U8 ch; - ch = NEXT_BYTE(utf8, s, strend, 'w'); + ch = SHIFT_BYTE(utf8, s, strend, datumtype); auv = (auv << 7) | (ch & 0x7f); /* UTF8_IS_XXXXX not right here - using constant 0x80 */ if (ch < 0x80) { @@ -1821,7 +1889,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char sv = Perl_newSVpvf(aTHX_ "%.*"UVf, (int)TYPE_DIGITS(UV), auv); while (s < strend) { - ch = NEXT_BYTE(utf8, s, strend, 'w'); + ch = SHIFT_BYTE(utf8, s, strend, datumtype); sv = mul128(sv, (U8)(ch & 0x7f)); if (!(ch & 0x80)) { bytes = 0; @@ -1847,13 +1915,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char EXTEND(SP, 1); if (sizeof(char*) <= strend - s) { char *aptr; - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, (char *) &aptr, - sizeof(aptr))) break; - } else { - Copy(s, &aptr, 1, char*); - s += sizeof(aptr); - } + SHIFT_VAR(utf8, s, strend, aptr, datumtype); DO_BO_UNPACK_P(aptr); /* newSVpvn generates undef if aptr is NULL */ PUSHs(sv_2mortal(newSVpvn(aptr, len))); @@ -1863,7 +1925,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'q': while (len-- > 0) { Quad_t aquad; - COPYVAR(s, strend, utf8, aquad, 64); + SHIFT_VAR(utf8, s, strend, aquad, datumtype); + DO_BO_UNPACK(aquad, 64); if (!checksum) PUSHs(sv_2mortal(aquad >= IV_MIN && aquad <= IV_MAX ? newSViv((IV)aquad) : newSVnv((NV)aquad))); @@ -1876,7 +1939,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'Q': while (len-- > 0) { Uquad_t auquad; - COPYVAR(s, strend, utf8, auquad, 64); + SHIFT_VAR(utf8, s, strend, auquad, datumtype); + DO_BO_UNPACK(auquad, 64); if (!checksum) PUSHs(sv_2mortal(auquad <= UV_MAX ? newSVuv((UV)auquad):newSVnv((NV)auquad))); @@ -1891,13 +1955,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'f': while (len-- > 0) { float afloat; - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, (char *) &afloat, - sizeof(afloat))) break; - } else { - Copy(s, &afloat, 1, float); - s += sizeof(float); - } + SHIFT_VAR(utf8, s, strend, afloat, datumtype); DO_BO_UNPACK_N(afloat, float); if (!checksum) PUSHs(sv_2mortal(newSVnv((NV)afloat))); @@ -1908,13 +1966,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'd': while (len-- > 0) { double adouble; - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, (char *) &adouble, - sizeof(adouble))) break; - } else { - Copy(s, &adouble, 1, double); - s += sizeof(double); - } + SHIFT_VAR(utf8, s, strend, adouble, datumtype); DO_BO_UNPACK_N(adouble, double); if (!checksum) PUSHs(sv_2mortal(newSVnv((NV)adouble))); @@ -1925,13 +1977,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'F': while (len-- > 0) { NV anv; - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, - (char *) &anv, sizeof(anv))) break; - } else { - Copy(s, &anv, 1, NV); - s += NVSIZE; - } + SHIFT_VAR(utf8, s, strend, anv, datumtype); DO_BO_UNPACK_N(anv, NV); if (!checksum) PUSHs(sv_2mortal(newSVnv(anv))); @@ -1943,13 +1989,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'D': while (len-- > 0) { long double aldouble; - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, (char *) &aldouble, - sizeof(aldouble))) break; - } else { - Copy(s, &aldouble, 1, long double); - s += LONG_DOUBLESIZE; - } + SHIFT_VAR(utf8, s, strend, aldouble, datumtype); DO_BO_UNPACK_N(aldouble, long double); if (!checksum) PUSHs(sv_2mortal(newSVnv((NV)aldouble))); @@ -2120,7 +2160,7 @@ PP(pp_unpack) PUTBACK; cnt = unpackstring(pat, patend, s, strend, ((gimme == G_SCALAR) ? FLAG_UNPACK_ONLY_ONE : 0) - | (DO_UTF8(right) ? FLAG_UNPACK_DO_UTF8 : 0)); + | (DO_UTF8(right) ? FLAG_DO_UTF8 : 0)); SPAGAIN; if ( !cnt && gimme == G_SCALAR ) @@ -2128,32 +2168,27 @@ PP(pp_unpack) RETURN; } -STATIC void -S_doencodes(pTHX_ register SV *sv, register char *s, register I32 len) +STATIC U8 * +doencodes(U8 *h, char *s, I32 len) { - char hunk[5]; - - *hunk = PL_uuemap[len]; - sv_catpvn(sv, hunk, 1); - hunk[4] = '\0'; + *h++ = PL_uuemap[len]; while (len > 2) { - hunk[0] = PL_uuemap[(077 & (*s >> 2))]; - hunk[1] = PL_uuemap[(077 & (((*s << 4) & 060) | ((s[1] >> 4) & 017)))]; - hunk[2] = PL_uuemap[(077 & (((s[1] << 2) & 074) | ((s[2] >> 6) & 03)))]; - hunk[3] = PL_uuemap[(077 & (s[2] & 077))]; - sv_catpvn(sv, hunk, 4); + *h++ = PL_uuemap[(077 & (s[0] >> 2))]; + *h++ = PL_uuemap[(077 & (((s[0] << 4) & 060) | ((s[1] >> 4) & 017)))]; + *h++ = PL_uuemap[(077 & (((s[1] << 2) & 074) | ((s[2] >> 6) & 03)))]; + *h++ = PL_uuemap[(077 & (s[2] & 077))]; s += 3; len -= 3; } if (len > 0) { char r = (len > 1 ? s[1] : '\0'); - hunk[0] = PL_uuemap[(077 & (*s >> 2))]; - hunk[1] = PL_uuemap[(077 & (((*s << 4) & 060) | ((r >> 4) & 017)))]; - hunk[2] = PL_uuemap[(077 & ((r << 2) & 074))]; - hunk[3] = PL_uuemap[0]; - sv_catpvn(sv, hunk, 4); + *h++ = PL_uuemap[(077 & (s[0] >> 2))]; + *h++ = PL_uuemap[(077 & (((s[0] << 4) & 060) | ((r >> 4) & 017)))]; + *h++ = PL_uuemap[(077 & ((r << 2) & 074))]; + *h++ = PL_uuemap[0]; } - sv_catpvn(sv, "\n", 1); + *h++ = '\n'; + return h; } STATIC SV * @@ -2266,449 +2301,757 @@ The engine implementing pack() Perl function. void Perl_packlist(pTHX_ SV *cat, char *pat, register char *patend, register SV **beglist, SV **endlist ) { + STRLEN no_len; tempsym_t sym = { 0 }; + sym.patptr = pat; sym.patend = patend; sym.flags = FLAG_PACK; + /* We're going to do changes through SvPVX(cat). Make sure it's valid. + Also make sure any UTF8 flag is loaded */ + SvPV_force(cat, no_len); + if (DO_UTF8(cat)) sym.flags |= FLAG_PARSE_UTF8 | FLAG_DO_UTF8; + (void)pack_rec( cat, &sym, beglist, endlist ); } +/* like sv_utf8_upgrade, but also repoint the group start markers */ +STATIC void +marked_upgrade(pTHX_ SV *sv, tempsym_t *sym_ptr) { + STRLEN len; + tempsym_t *group; + char *from_ptr, *to_start, *to_ptr, **marks, **m, *from_start, *from_end; + + if (SvUTF8(sv)) return; + + from_start = SvPVX(sv); + from_end = from_start + SvCUR(sv); + for (from_ptr = from_start; from_ptr < from_end; from_ptr++) + if (!NATIVE_IS_INVARIANT(*from_ptr)) break; + if (from_ptr == from_end) { + /* Simple case: no character needs to be changed */ + SvUTF8_on(sv); + return; + } + + /* We assume a char translates to at most 2 UTF-8 bytes */ + len = (from_end-from_ptr)*2+(from_ptr-from_start)+1; + New('U', to_start, len, char); + Copy(from_start, to_start, from_ptr-from_start, char); + to_ptr = to_start + (from_ptr-from_start); + + New('U', marks, sym_ptr->level+2, char *); + for (group=sym_ptr; group; group = group->previous) + marks[group->level] = from_start + group->strbeg; + marks[sym_ptr->level+1] = from_end+1; + for (m = marks; *m < from_ptr; m++) + *m = to_start + (*m-from_start); + + for (;from_ptr < from_end; from_ptr++) { + while (*m == from_ptr) *m++ = to_ptr; + to_ptr = uvchr_to_utf8(to_ptr, *(U8 *) from_ptr); + } + *to_ptr = 0; + + while (*m == from_ptr) *m++ = to_ptr; + if (m != marks + sym_ptr->level+1) { + Safefree(marks); + Safefree(to_start); + Perl_croak(aTHX_ "Assertion: marks beyond string end"); + } + for (group=sym_ptr; group; group = group->previous) + group->strbeg = marks[group->level] - to_start; + Safefree(marks); + + if (SvOOK(sv)) { + if (SvIVX(sv)) { + SvLEN(sv) += SvIVX(sv); + from_start -= SvIVX(sv); + SvIV_set(sv, 0); + } + SvFLAGS(sv) &= ~SVf_OOK; + } + if (SvLEN(sv) != 0) + Safefree(from_start); + SvPVX(sv) = to_start; + SvCUR(sv) = to_ptr - to_start; + SvLEN(sv) = len; + SvUTF8_on(sv); +} + +/* Exponential string grower. Makes string extension effectively O(n) + needed says how many extra bytes we need (not counting the final '\0') + Only grows the string if there is an actual lack of space +*/ +STATIC char * +sv_exp_grow(pTHX_ SV *sv, STRLEN needed) { + STRLEN cur = SvCUR(sv); + STRLEN len = SvLEN(sv); + STRLEN extend; + if (len - cur > needed) return SvPVX(sv); + extend = needed > len ? needed : len; + return SvGROW(sv, len+extend+1); +} STATIC SV ** -S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV **endlist ) +S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist ) { - register I32 items; - STRLEN fromlen; - register I32 len = 0; - SV *fromstr; - /*SUPPRESS 442*/ - static char null10[] = {0,0,0,0,0,0,0,0,0,0}; - static char *space10 = " "; - bool found; - - /* These must not be in registers: */ - char achar; - I16 ai16; - U16 au16; - I32 ai32; - U32 au32; -#ifdef HAS_QUAD - Quad_t aquad; - Uquad_t auquad; -#endif -#if SHORTSIZE != SIZE16 - short ashort; - unsigned short aushort; -#endif - int aint; - unsigned int auint; -#if LONGSIZE != SIZE32 - long along; - unsigned long aulong; -#endif - char *aptr; - float afloat; - double adouble; -#if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE) - long double aldouble; -#endif - IV aiv; - UV auv; - NV anv; - - int strrelbeg = SvCUR(cat); tempsym_t lookahead; - - items = endlist - beglist; - found = next_symbol( symptr ); - -#ifndef PACKED_IS_OCTETS - if (symptr->level == 0 && found && symptr->code == 'U' ){ - SvUTF8_on(cat); + I32 items = endlist - beglist; + bool found = next_symbol(symptr); + bool utf8 = (symptr->flags & FLAG_PARSE_UTF8) ? 1 : 0; + + if (symptr->level == 0 && found && symptr->code == 'U') { + marked_upgrade(aTHX_ cat, symptr); + symptr->flags |= FLAG_DO_UTF8; + utf8 = 0; } -#endif + symptr->strbeg = SvCUR(cat); while (found) { + SV *fromstr; + STRLEN fromlen; + I32 len; SV *lengthcode = Nullsv; -#define NEXTFROM ( lengthcode ? lengthcode : items-- > 0 ? *beglist++ : &PL_sv_no) - I32 datumtype = symptr->code; - howlen_t howlen; + howlen_t howlen = symptr->howlen; + char *start = SvPVX(cat); + char *cur = start + SvCUR(cat); - switch( howlen = symptr->howlen ){ - case e_no_len: - case e_number: - len = symptr->length; - break; +#define NEXTFROM (lengthcode ? lengthcode : items-- > 0 ? *beglist++ : &PL_sv_no) + + switch (howlen) { case e_star: - len = strchr("@Xxu", TYPE_NO_MODIFIERS(datumtype)) ? 0 : items; + len = strchr("@Xxu", TYPE_NO_MODIFIERS(datumtype)) ? + 0 : items; + break; + default: + /* e_no_len and e_number */ + len = symptr->length; break; } + if (len) { + struct packsize_t *pack_props = + &packsize[(symptr->code & TYPE_IS_SHRIEKING) ? + PACK_SIZE_SHRIEKING : PACK_SIZE_NORMAL]; + const int rawtype = TYPE_NO_MODIFIERS(datumtype); + int offset = rawtype - pack_props->first; + + if (offset >= 0 && offset < pack_props->size) { + /* Data about this template letter */ + unsigned char data = pack_props->array[offset]; + + if (data && !(data & PACK_SIZE_UNPREDICTABLE)) { + /* We can process this letter. */ + STRLEN size = data & PACK_SIZE_MASK; + GROWING(utf8, cat, start, cur, (STRLEN) len * size); + } + } + + } + /* Look ahead for next symbol. Do we have code/code? */ lookahead = *symptr; found = next_symbol(&lookahead); if ( symptr->flags & FLAG_SLASH ) { - if (found){ + if (!found) Perl_croak(aTHX_ "Code missing after '/' in pack"); if ( 0 == strchr( "aAZ", lookahead.code ) || e_star != lookahead.howlen ) Perl_croak(aTHX_ "'/' must be followed by 'a*', 'A*' or 'Z*' in pack"); - lengthcode = sv_2mortal(newSViv(sv_len(items > 0 - ? *beglist : &PL_sv_no) - + (lookahead.code == 'Z' ? 1 : 0))); - } else { - Perl_croak(aTHX_ "Code missing after '/' in pack"); - } + lengthcode = + sv_2mortal(newSViv((items > 0 ? DO_UTF8(*beglist) ? sv_len_utf8(*beglist) : sv_len(*beglist) : 0) + (lookahead.code == 'Z' ? 1 : 0))); } + /* Code inside the switch must take care to properly update + cat (CUR length and '\0' termination) if it updated *cur and + doesn't simply leave using break */ switch(TYPE_NO_ENDIANNESS(datumtype)) { default: - Perl_croak(aTHX_ "Invalid type '%c' in pack", (int)TYPE_NO_MODIFIERS(datumtype)); + Perl_croak(aTHX_ "Invalid type '%c' in pack", + (int) TYPE_NO_MODIFIERS(datumtype)); case '%': Perl_croak(aTHX_ "'%%' may not be used in pack"); case '@': - len += strrelbeg - SvCUR(cat); - if (len > 0) - goto grow; + if (utf8) { + char *s = start + symptr->strbeg; + while (len > 0 && s < cur) { + s += UTF8SKIP(s); + len--; + } + if (s > cur) + Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); + if (len > 0) { + grow: + GROWING(0, cat, start, cur, len); + Zero(cur, len, char); + cur += len; + } else if (s < cur) cur = s; + else goto no_change; + } else { + len -= cur - (start+symptr->strbeg); + if (len > 0) goto grow; len = -len; - if (len > 0) - goto shrink; + if (len > 0) goto shrink; + else goto no_change; + } break; - case '(': - { + case '(': { tempsym_t savsym = *symptr; U32 group_modifiers = TYPE_MODIFIERS(datumtype & ~symptr->flags); symptr->flags |= group_modifiers; symptr->patend = savsym.grpend; symptr->level++; + symptr->previous = &lookahead; while (len--) { + U32 was_utf8; + if (utf8) symptr->flags |= FLAG_PARSE_UTF8; + else symptr->flags &= ~FLAG_PARSE_UTF8; + was_utf8 = SvUTF8(cat); symptr->patptr = savsym.grpbeg; - beglist = pack_rec(cat, symptr, beglist, endlist ); + beglist = pack_rec(cat, symptr, beglist, endlist); + if (SvUTF8(cat) != was_utf8) + /* This had better be an upgrade while in utf8==0 mode */ + utf8 = 1; + if (savsym.howlen == e_star && beglist == endlist) break; /* No way to continue */ } - symptr->flags &= ~group_modifiers; - lookahead.flags = symptr->flags; - *symptr = savsym; - break; + lookahead.flags = symptr->flags & ~group_modifiers; + goto no_change; } case 'X' | TYPE_IS_SHRIEKING: if (!len) /* Avoid division by 0 */ len = 1; - len = (SvCUR(cat)) % len; + if (utf8) { + char *hop, *last; + I32 l = len; + hop = last = start; + while (hop < cur) { + hop += UTF8SKIP(hop); + if (--l == 0) { + last = hop; + l = len; + } + } + if (last > cur) + Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); + cur = last; + break; + } + len = (cur-start) % len; /* FALL THROUGH */ case 'X': + if (utf8) { + if (len < 1) goto no_change; + while (len > 0) { + if (cur <= start) + Perl_croak(aTHX_ "'X' outside of string in pack"); + while (--cur, UTF8_IS_CONTINUATION(*cur)) { + if (cur <= start) + Perl_croak(aTHX_ "'X' outside of string in pack"); + } + len--; + } + } else { shrink: - if ((I32)SvCUR(cat) < len) + if (cur - start < len) Perl_croak(aTHX_ "'X' outside of string in pack"); - SvCUR(cat) -= len; - *SvEND(cat) = '\0'; + cur -= len; + } + if (cur < start+symptr->strbeg) { + /* Make sure group starts don't point into the void */ + tempsym_t *group; + STRLEN length = cur-start; + for (group = symptr; + group && length < group->strbeg; + group = group->previous) group->strbeg = length; + lookahead.strbeg = length; + } break; - case 'x' | TYPE_IS_SHRIEKING: + case 'x' | TYPE_IS_SHRIEKING: { + I32 ai32; if (!len) /* Avoid division by 0 */ len = 1; - aint = (SvCUR(cat)) % len; - if (aint) /* Other portable ways? */ - len = len - aint; - else - len = 0; + if (utf8) ai32 = utf8_length(start, cur) % len; + else ai32 = (cur - start) % len; + if (ai32 == 0) goto no_change; + len -= ai32; + } /* FALL THROUGH */ - case 'x': - grow: - while (len >= 10) { - sv_catpvn(cat, null10, 10); - len -= 10; - } - sv_catpvn(cat, null10, len); - break; + goto grow; case 'A': case 'Z': - case 'a': + case 'a': { + char *aptr; + fromstr = NEXTFROM; aptr = SvPV(fromstr, fromlen); + if (DO_UTF8(fromstr)) { + char *end, *s; + + if (!utf8 && !SvUTF8(cat)) { + marked_upgrade(aTHX_ cat, symptr); + lookahead.flags |= FLAG_DO_UTF8; + lookahead.strbeg = symptr->strbeg; + utf8 = 1; + start = SvPVX(cat); + cur = start + SvCUR(cat); + } if (howlen == e_star) { + if (utf8) goto string_copy; + len = fromlen+1; + } + s = aptr; + end = aptr + fromlen; + fromlen = datumtype == 'Z' ? len-1 : len; + while ((I32) fromlen > 0 && s < end) { + s += UTF8SKIP(s); + fromlen--; + } + if (s > end) + Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); + if (utf8) { len = fromlen; - if (datumtype == 'Z') - ++len; - } - if ((I32)fromlen >= len) { - sv_catpvn(cat, aptr, len); - if (datumtype == 'Z' && len > 0) - *(SvEND(cat)-1) = '\0'; + if (datumtype == 'Z') len++; + fromlen = s-aptr; + len += fromlen; + + goto string_copy; + } + fromlen = len - fromlen; + if (datumtype == 'Z') fromlen--; + if (howlen == e_star) { + len = fromlen; + if (datumtype == 'Z') len++; } - else { - sv_catpvn(cat, aptr, fromlen); + GROWING(0, cat, start, cur, len); + if (!uni_to_bytes(aTHX_ &aptr, end, cur, fromlen, + datumtype | TYPE_IS_PACK)) + Perl_croak(aTHX_ "Perl bug: predicted utf8 length not available"); + cur += fromlen; len -= fromlen; - if (datumtype == 'A') { - while (len >= 10) { - sv_catpvn(cat, space10, 10); - len -= 10; - } - sv_catpvn(cat, space10, len); + } else if (utf8) { + if (howlen == e_star) { + len = fromlen; + if (datumtype == 'Z') len++; } - else { - while (len >= 10) { - sv_catpvn(cat, null10, 10); - len -= 10; + if (len <= (I32) fromlen) { + fromlen = len; + if (datumtype == 'Z' && fromlen > 0) fromlen--; + } + /* assumes a byte expands to at most 2 bytes on upgrade: + expected_length <= from_len*2 + (len-from_len) */ + GROWING(0, cat, start, cur, fromlen+len); + len -= fromlen; + while (fromlen > 0) { + cur = uvchr_to_utf8(cur, * (U8 *) aptr); + aptr++; + fromlen--; } - sv_catpvn(cat, null10, len); + } else { + string_copy: + if (howlen == e_star) { + len = fromlen; + if (datumtype == 'Z') len++; + } + if (len <= (I32) fromlen) { + fromlen = len; + if (datumtype == 'Z' && fromlen > 0) fromlen--; } + GROWING(0, cat, start, cur, len); + Copy(aptr, cur, fromlen, char); + cur += fromlen; + len -= fromlen; } + memset(cur, datumtype == 'A' ? ' ' : '\0', len); + cur += len; break; + } case 'B': - case 'b': - { - register char *str; - I32 saveitems; + case 'b': { + char *str, *end; + I32 l, field_len; + U8 bits; + bool utf8_source; + U32 utf8_flags; fromstr = NEXTFROM; - saveitems = items; str = SvPV(fromstr, fromlen); - if (howlen == e_star) - len = fromlen; - aint = SvCUR(cat); - SvCUR(cat) += (len+7)/8; - SvGROW(cat, SvCUR(cat) + 1); - aptr = SvPVX(cat) + aint; - if (len > (I32)fromlen) - len = fromlen; - aint = len; - items = 0; - if (datumtype == 'B') { - for (len = 0; len++ < aint;) { - items |= *str++ & 1; - if (len & 7) - items <<= 1; + end = str + fromlen; + if (DO_UTF8(fromstr)) { + utf8_source = TRUE; + utf8_flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY; + } else { + utf8_source = FALSE; + utf8_flags = 0; /* Unused, but keep compilers happy */ + } + if (howlen == e_star) len = fromlen; + field_len = (len+7)/8; + GROWING(utf8, cat, start, cur, field_len); + if (len > (I32)fromlen) len = fromlen; + bits = 0; + l = 0; + if (datumtype == 'B') + while (l++ < len) { + if (utf8_source) { + UV val; + NEXT_UNI_VAL(val, cur, str, end, utf8_flags); + bits |= val & 1; + } else bits |= *str++ & 1; + if (l & 7) bits <<= 1; else { - *aptr++ = items & 0xff; - items = 0; - } + PUSH_BYTE(utf8, cur, bits); + bits = 0; } } + else + /* datumtype == 'b' */ + while (l++ < len) { + if (utf8_source) { + UV val; + NEXT_UNI_VAL(val, cur, str, end, utf8_flags); + if (val & 1) bits |= 0x80; + } else if (*str++ & 1) + bits |= 0x80; + if (l & 7) bits >>= 1; else { - for (len = 0; len++ < aint;) { - if (*str++ & 1) - items |= 128; - if (len & 7) - items >>= 1; - else { - *aptr++ = items & 0xff; - items = 0; - } + PUSH_BYTE(utf8, cur, bits); + bits = 0; } } - if (aint & 7) { + l--; + if (l & 7) { if (datumtype == 'B') - items <<= 7 - (aint & 7); + bits <<= 7 - (l & 7); else - items >>= 7 - (aint & 7); - *aptr++ = items & 0xff; - } - str = SvPVX(cat) + SvCUR(cat); - while (aptr <= str) - *aptr++ = '\0'; - - items = saveitems; + bits >>= 7 - (l & 7); + PUSH_BYTE(utf8, cur, bits); + l += 7; } + /* Determine how many chars are left in the requested field */ + l /= 8; + if (howlen == e_star) field_len = 0; + else field_len -= l; + Zero(cur, field_len, char); + cur += field_len; break; + } case 'H': - case 'h': - { - register char *str; - I32 saveitems; + case 'h': { + char *str, *end; + I32 l, field_len; + U8 bits; + bool utf8_source; + U32 utf8_flags; fromstr = NEXTFROM; - saveitems = items; str = SvPV(fromstr, fromlen); - if (howlen == e_star) - len = fromlen; - aint = SvCUR(cat); - SvCUR(cat) += (len+1)/2; - SvGROW(cat, SvCUR(cat) + 1); - aptr = SvPVX(cat) + aint; - if (len > (I32)fromlen) - len = fromlen; - aint = len; - items = 0; - if (datumtype == 'H') { - for (len = 0; len++ < aint;) { - if (isALPHA(*str)) - items |= ((*str++ & 15) + 9) & 15; + end = str + fromlen; + if (DO_UTF8(fromstr)) { + utf8_source = TRUE; + utf8_flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY; + } else { + utf8_source = FALSE; + utf8_flags = 0; /* Unused, but keep compilers happy */ + } + if (howlen == e_star) len = fromlen; + field_len = (len+1)/2; + GROWING(utf8, cat, start, cur, field_len); + if (!utf8 && len > (I32)fromlen) len = fromlen; + bits = 0; + l = 0; + if (datumtype == 'H') + while (l++ < len) { + if (utf8_source) { + UV val; + NEXT_UNI_VAL(val, cur, str, end, utf8_flags); + if (val < 256 && isALPHA(val)) + bits |= (val + 9) & 0xf; else - items |= *str++ & 15; - if (len & 1) - items <<= 4; + bits |= val & 0xf; + } else if (isALPHA(*str)) + bits |= (*str++ + 9) & 0xf; + else + bits |= *str++ & 0xf; + if (l & 1) bits <<= 4; else { - *aptr++ = items & 0xff; - items = 0; - } + PUSH_BYTE(utf8, cur, bits); + bits = 0; } } - else { - for (len = 0; len++ < aint;) { - if (isALPHA(*str)) - items |= (((*str++ & 15) + 9) & 15) << 4; + else + while (l++ < len) { + if (utf8_source) { + UV val; + NEXT_UNI_VAL(val, cur, str, end, utf8_flags); + if (val < 256 && isALPHA(val)) + bits |= ((val + 9) & 0xf) << 4; else - items |= (*str++ & 15) << 4; - if (len & 1) - items >>= 4; - else { - *aptr++ = items & 0xff; - items = 0; + bits |= (val & 0xf) << 4; + } else if (isALPHA(*str)) + bits |= ((*str++ + 9) & 0xf) << 4; + else + bits |= (*str++ & 0xf) << 4; + if (l & 1) bits >>= 4; + else { + PUSH_BYTE(utf8, cur, bits); + bits = 0; } } + l--; + if (l & 1) { + PUSH_BYTE(utf8, cur, bits); + l++; + } + /* Determine how many chars are left in the requested field */ + l /= 2; + if (howlen == e_star) field_len = 0; + else field_len -= l; + Zero(cur, field_len, char); + cur += field_len; + break; } - if (aint & 1) - *aptr++ = items & 0xff; - str = SvPVX(cat) + SvCUR(cat); - while (aptr <= str) - *aptr++ = '\0'; - - items = saveitems; + case 'c': + while (len-- > 0) { + IV aiv; + fromstr = NEXTFROM; + aiv = SvIV(fromstr); + if ((-128 > aiv || aiv > 127) && + ckWARN(WARN_PACK)) + Perl_warner(aTHX_ packWARN(WARN_PACK), + "Character in 'c' format wrapped in pack"); + PUSH_BYTE(utf8, cur, aiv & 0xff); } break; case 'C': - case 'c': + if (len == 0) { + utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0; + break; + } + GROWING(0, cat, start, cur, len); while (len-- > 0) { + IV aiv; fromstr = NEXTFROM; - switch (TYPE_NO_MODIFIERS(datumtype)) { - case 'C': - aint = SvIV(fromstr); - if ((aint < 0 || aint > 255) && + aiv = SvIV(fromstr); + if ((0 > aiv || aiv > 0xff) && ckWARN(WARN_PACK)) Perl_warner(aTHX_ packWARN(WARN_PACK), "Character in 'C' format wrapped in pack"); - achar = aint & 255; - sv_catpvn(cat, &achar, sizeof(char)); + *cur++ = aiv & 0xff; + } break; - case 'c': - aint = SvIV(fromstr); - if ((aint < -128 || aint > 127) && - ckWARN(WARN_PACK)) + case 'W': { + char *end; + U8 in_bytes = IN_BYTES; + + end = start+SvLEN(cat)-1; + if (utf8) end -= UTF8_MAXLEN-1; + while (len-- > 0) { + UV auv; + fromstr = NEXTFROM; + auv = SvUV(fromstr); + if (in_bytes) auv = auv % 0x100; + if (utf8) { + W_utf8: + if (cur > end) { + *cur = '\0'; + SvCUR(cat) = cur - start; + + GROWING(0, cat, start, cur, len+UTF8_MAXLEN); + end = start+SvLEN(cat)-UTF8_MAXLEN; + } + cur = uvuni_to_utf8_flags(cur, NATIVE_TO_UNI(auv), + ckWARN(WARN_UTF8) ? + 0 : UNICODE_ALLOW_ANY); + } else { + if (auv >= 0x100) { + if (!SvUTF8(cat)) { + *cur = '\0'; + SvCUR(cat) = cur - start; + marked_upgrade(aTHX_ cat, symptr); + lookahead.flags |= FLAG_DO_UTF8; + lookahead.strbeg = symptr->strbeg; + utf8 = 1; + start = SvPVX(cat); + cur = start + SvCUR(cat); + end = start+SvLEN(cat)-UTF8_MAXLEN; + goto W_utf8; + } + if (ckWARN(WARN_PACK)) Perl_warner(aTHX_ packWARN(WARN_PACK), - "Character in 'c' format wrapped in pack" ); - achar = aint & 255; - sv_catpvn(cat, &achar, sizeof(char)); - break; + "Character in 'W' format wrapped in pack"); + auv &= 0xff; + } + if (cur >= end) { + *cur = '\0'; + SvCUR(cat) = cur - start; + GROWING(0, cat, start, cur, len+1); + end = start+SvLEN(cat)-1; + } + *(U8 *) cur++ = auv; } } break; - case 'U': + } + case 'U': { + char *end; + + if (len == 0) { + if (!(symptr->flags & FLAG_DO_UTF8)) { + marked_upgrade(aTHX_ cat, symptr); + lookahead.flags |= FLAG_DO_UTF8; + lookahead.strbeg = symptr->strbeg; + } + utf8 = 0; + goto no_change; + } + + end = start+SvLEN(cat); + if (!utf8) end -= UTF8_MAXLEN; while (len-- > 0) { + UV auv; fromstr = NEXTFROM; - auint = UNI_TO_NATIVE(SvUV(fromstr)); - SvGROW(cat, SvCUR(cat) + UTF8_MAXBYTES + 1); - SvCUR_set(cat, - (char*)uvchr_to_utf8_flags((U8*)SvEND(cat), - auint, + auv = SvUV(fromstr); + if (utf8) { + char buffer[UTF8_MAXLEN], *end; + end = uvuni_to_utf8_flags(buffer, auv, + ckWARN(WARN_UTF8) ? + 0 : UNICODE_ALLOW_ANY); + if (cur >= end-(end-buffer)*2) { + *cur = '\0'; + SvCUR(cat) = cur - start; + GROWING(0, cat, start, cur, len+(end-buffer)*2); + end = start+SvLEN(cat)-UTF8_MAXLEN; + } + bytes_to_uni(aTHX_ buffer, end-buffer, &cur); + } else { + if (cur >= end) { + *cur = '\0'; + SvCUR(cat) = cur - start; + GROWING(0, cat, start, cur, len+UTF8_MAXLEN); + end = start+SvLEN(cat)-UTF8_MAXLEN; + } + cur = uvuni_to_utf8_flags(cur, auv, ckWARN(WARN_UTF8) ? - 0 : UNICODE_ALLOW_ANY) - - SvPVX(cat)); + 0 : UNICODE_ALLOW_ANY); + } } - *SvEND(cat) = '\0'; break; + } /* Float and double added by gnb@melba.bby.oz.au 22/11/89 */ case 'f': while (len-- > 0) { + float afloat; + NV anv; fromstr = NEXTFROM; + anv = SvNV(fromstr); #ifdef __VOS__ -/* VOS does not automatically map a floating-point overflow + /* VOS does not automatically map a floating-point overflow during conversion from double to float into infinity, so we do it by hand. This code should either be generalized for any OS that needs it, or removed if and when VOS implements posix-976 (suggestion to support mapping to infinity). Paul.Green@stratus.com 02-04-02. */ - if (SvNV(fromstr) > FLT_MAX) + if (anv > FLT_MAX) afloat = _float_constants[0]; /* single prec. inf. */ - else if (SvNV(fromstr) < -FLT_MAX) + else if (anv < -FLT_MAX) afloat = _float_constants[0]; /* single prec. inf. */ - else afloat = (float)SvNV(fromstr); -#else + else afloat = (float) anv; +#else /* __VOS__ */ # if defined(VMS) && !defined(__IEEE_FP) -/* IEEE fp overflow shenanigans are unavailable on VAX and optional + /* IEEE fp overflow shenanigans are unavailable on VAX and optional * on Alpha; fake it if we don't have them. */ - if (SvNV(fromstr) > FLT_MAX) + if (anv > FLT_MAX) afloat = FLT_MAX; - else if (SvNV(fromstr) < -FLT_MAX) + else if (anv < -FLT_MAX) afloat = -FLT_MAX; - else afloat = (float)SvNV(fromstr); + else afloat = (float)anv; # else - afloat = (float)SvNV(fromstr); + afloat = (float)anv; # endif -#endif +#endif /* __VOS__ */ DO_BO_PACK_N(afloat, float); - sv_catpvn(cat, (char *)&afloat, sizeof (float)); + PUSH_VAR(utf8, cur, afloat); } break; case 'd': while (len-- > 0) { + double adouble; + NV anv; fromstr = NEXTFROM; + anv = SvNV(fromstr); #ifdef __VOS__ -/* VOS does not automatically map a floating-point overflow + /* VOS does not automatically map a floating-point overflow during conversion from long double to double into infinity, so we do it by hand. This code should either be generalized for any OS that needs it, or removed if and when VOS implements posix-976 (suggestion to support mapping to infinity). Paul.Green@stratus.com 02-04-02. */ - if (SvNV(fromstr) > DBL_MAX) + if (anv > DBL_MAX) adouble = _double_constants[0]; /* double prec. inf. */ - else if (SvNV(fromstr) < -DBL_MAX) + else if (anv < -DBL_MAX) adouble = _double_constants[0]; /* double prec. inf. */ - else adouble = (double)SvNV(fromstr); -#else + else adouble = (double) anv; +#else /* __VOS__ */ # if defined(VMS) && !defined(__IEEE_FP) -/* IEEE fp overflow shenanigans are unavailable on VAX and optional + /* IEEE fp overflow shenanigans are unavailable on VAX and optional * on Alpha; fake it if we don't have them. */ - if (SvNV(fromstr) > DBL_MAX) + if (anv > DBL_MAX) adouble = DBL_MAX; - else if (SvNV(fromstr) < -DBL_MAX) + else if (anv < -DBL_MAX) adouble = -DBL_MAX; - else adouble = (double)SvNV(fromstr); + else adouble = (double)anv; # else - adouble = (double)SvNV(fromstr); + adouble = (double)anv; # endif -#endif +#endif /* __VOS__ */ DO_BO_PACK_N(adouble, double); - sv_catpvn(cat, (char *)&adouble, sizeof (double)); + PUSH_VAR(utf8, cur, adouble); } break; - case 'F': + case 'F': { + NV anv; Zero(&anv, 1, NV); /* can be long double with unused bits */ while (len-- > 0) { fromstr = NEXTFROM; anv = SvNV(fromstr); DO_BO_PACK_N(anv, NV); - sv_catpvn(cat, (char *)&anv, NVSIZE); + PUSH_VAR(utf8, cur, anv); } break; + } #if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE) - case 'D': + case 'D': { + long double aldouble; /* long doubles can have unused bits, which may be nonzero */ Zero(&aldouble, 1, long double); while (len-- > 0) { fromstr = NEXTFROM; aldouble = (long double)SvNV(fromstr); DO_BO_PACK_N(aldouble, long double); - sv_catpvn(cat, (char *)&aldouble, LONG_DOUBLESIZE); + PUSH_VAR(utf8, cur, aldouble); } break; + } #endif #ifdef PERL_PACK_CAN_SHRIEKSIGN case 'n' | TYPE_IS_SHRIEKING: #endif case 'n': while (len-- > 0) { + I16 ai16; fromstr = NEXTFROM; ai16 = (I16)SvIV(fromstr); #ifdef HAS_HTONS ai16 = PerlSock_htons(ai16); #endif - CAT16(cat, &ai16); + PUSH16(utf8, cur, &ai16); } break; #ifdef PERL_PACK_CAN_SHRIEKSIGN @@ -2716,48 +3059,45 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV #endif case 'v': while (len-- > 0) { + I16 ai16; fromstr = NEXTFROM; ai16 = (I16)SvIV(fromstr); #ifdef HAS_HTOVS ai16 = htovs(ai16); #endif - CAT16(cat, &ai16); + PUSH16(utf8, cur, &ai16); } break; case 'S' | TYPE_IS_SHRIEKING: #if SHORTSIZE != SIZE16 - { while (len-- > 0) { + unsigned short aushort; fromstr = NEXTFROM; aushort = SvUV(fromstr); DO_BO_PACK(aushort, s); - sv_catpvn(cat, (char *)&aushort, sizeof(unsigned short)); - } + PUSH_VAR(utf8, cur, aushort); } break; #else /* Fall through! */ #endif case 'S': - { while (len-- > 0) { + U16 au16; fromstr = NEXTFROM; au16 = (U16)SvUV(fromstr); DO_BO_PACK(au16, 16); - CAT16(cat, &au16); - } - + PUSH16(utf8, cur, &au16); } break; case 's' | TYPE_IS_SHRIEKING: #if SHORTSIZE != SIZE16 - { while (len-- > 0) { + short ashort; fromstr = NEXTFROM; ashort = SvIV(fromstr); DO_BO_PACK(ashort, s); - sv_catpvn(cat, (char *)&ashort, sizeof(short)); - } + PUSH_VAR(utf8, cur, ashort); } break; #else @@ -2765,23 +3105,26 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV #endif case 's': while (len-- > 0) { + I16 ai16; fromstr = NEXTFROM; ai16 = (I16)SvIV(fromstr); DO_BO_PACK(ai16, 16); - CAT16(cat, &ai16); + PUSH16(utf8, cur, &ai16); } break; case 'I': case 'I' | TYPE_IS_SHRIEKING: while (len-- > 0) { + unsigned int auint; fromstr = NEXTFROM; auint = SvUV(fromstr); DO_BO_PACK(auint, i); - sv_catpvn(cat, (char*)&auint, sizeof(unsigned int)); + PUSH_VAR(utf8, cur, auint); } break; case 'j': while (len-- > 0) { + IV aiv; fromstr = NEXTFROM; aiv = SvIV(fromstr); #if IVSIZE == INTSIZE @@ -2790,12 +3133,15 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV DO_BO_PACK(aiv, l); #elif defined(HAS_QUAD) && IVSIZE == U64SIZE DO_BO_PACK(aiv, 64); +#else + Perl_croak(aTHX_ "'j' not supported on this platform"); #endif - sv_catpvn(cat, (char*)&aiv, IVSIZE); + PUSH_VAR(utf8, cur, aiv); } break; case 'J': while (len-- > 0) { + UV auv; fromstr = NEXTFROM; auv = SvUV(fromstr); #if UVSIZE == INTSIZE @@ -2804,26 +3150,31 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV DO_BO_PACK(auv, l); #elif defined(HAS_QUAD) && UVSIZE == U64SIZE DO_BO_PACK(auv, 64); +#else + Perl_croak(aTHX_ "'J' not supported on this platform"); #endif - sv_catpvn(cat, (char*)&auv, UVSIZE); + PUSH_VAR(utf8, cur, auv); } break; case 'w': while (len-- > 0) { + NV anv; fromstr = NEXTFROM; anv = SvNV(fromstr); - if (anv < 0) + if (anv < 0) { + *cur = '\0'; + SvCUR(cat) = cur - start; Perl_croak(aTHX_ "Cannot compress negative numbers in pack"); + } /* 0xFFFFFFFFFFFFFFFF may cast to 18446744073709551616.0, which is == UV_MAX_P1. IOK is fine (instead of UV_only), as any negative IVs will have already been got by the croak() above. IOK is untrue for fractions, so we test them against UV_MAX_P1. */ - if (SvIOK(fromstr) || anv < UV_MAX_P1) - { - char buf[(sizeof(UV)*8)/7+1]; + if (SvIOK(fromstr) || anv < UV_MAX_P1) { + char buf[(sizeof(UV)*CHAR_BIT)/7+1]; char *in = buf + sizeof(buf); UV auv = SvUV(fromstr); @@ -2832,29 +3183,10 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV auv >>= 7; } while (auv); buf[sizeof(buf) - 1] &= 0x7f; /* clear continue bit */ - sv_catpvn(cat, in, (buf + sizeof(buf)) - in); - } - else if (SvPOKp(fromstr)) { /* decimal string arithmetics */ - char *from, *result, *in; - SV *norm; - STRLEN len; - bool done; - - /* Copy string and check for compliance */ - from = SvPV(fromstr, len); - if ((norm = is_an_int(from, len)) == NULL) - Perl_croak(aTHX_ "Can only compress unsigned integers in pack"); - - New('w', result, len, char); - in = result + len; - done = FALSE; - while (!done) - *--in = div128(norm, &done) | 0x80; - result[len - 1] &= 0x7F; /* clear continue bit */ - sv_catpvn(cat, in, (result + len) - in); - Safefree(result); - SvREFCNT_dec(norm); /* free norm */ - } + PUSH_GROWING_BYTES(utf8, cat, start, cur, + in, (buf + sizeof(buf)) - in); + } else if (SvPOKp(fromstr)) + goto w_string; else if (SvNOKp(fromstr)) { /* 10**NV_MAX_10_EXP is the largest power of 10 so 10**(NV_MAX_10_EXP+1) is definately unrepresentable @@ -2867,10 +3199,10 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV floating-point value. */ #ifdef NV_MAX_10_EXP -/* char buf[1 + (int)((NV_MAX_10_EXP + 1) * 0.47456)]; -- invalid C */ + /* char buf[1 + (int)((NV_MAX_10_EXP + 1) * 0.47456)]; -- invalid C */ char buf[1 + (int)((NV_MAX_10_EXP + 1) / 2)]; /* valid C */ #else -/* char buf[1 + (int)((308 + 1) * 0.47456)]; -- invalid C */ + /* char buf[1 + (int)((308 + 1) * 0.47456)]; -- invalid C */ char buf[1 + (int)((308 + 1) / 2)]; /* valid C */ #endif char *in = buf + sizeof(buf); @@ -2884,14 +3216,15 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV anv = next; } while (anv > 0); buf[sizeof(buf) - 1] &= 0x7f; /* clear continue bit */ - sv_catpvn(cat, in, (buf + sizeof(buf)) - in); - } - else { + PUSH_GROWING_BYTES(utf8, cat, start, cur, + in, (buf + sizeof(buf)) - in); + } else { char *from, *result, *in; SV *norm; STRLEN len; bool done; + w_string: /* Copy string and check for compliance */ from = SvPV(fromstr, len); if ((norm = is_an_int(from, len)) == NULL) @@ -2900,10 +3233,10 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV New('w', result, len, char); in = result + len; done = FALSE; - while (!done) - *--in = div128(norm, &done) | 0x80; + while (!done) *--in = div128(norm, &done) | 0x80; result[len - 1] &= 0x7F; /* clear continue bit */ - sv_catpvn(cat, in, (result + len) - in); + PUSH_GROWING_BYTES(utf8, cat, start, cur, + in, (result + len) - in); Safefree(result); SvREFCNT_dec(norm); /* free norm */ } @@ -2912,10 +3245,11 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV case 'i': case 'i' | TYPE_IS_SHRIEKING: while (len-- > 0) { + int aint; fromstr = NEXTFROM; aint = SvIV(fromstr); DO_BO_PACK(aint, i); - sv_catpvn(cat, (char*)&aint, sizeof(int)); + PUSH_VAR(utf8, cur, aint); } break; #ifdef PERL_PACK_CAN_SHRIEKSIGN @@ -2923,12 +3257,13 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV #endif case 'N': while (len-- > 0) { + U32 au32; fromstr = NEXTFROM; au32 = SvUV(fromstr); #ifdef HAS_HTONL au32 = PerlSock_htonl(au32); #endif - CAT32(cat, &au32); + PUSH32(utf8, cur, &au32); } break; #ifdef PERL_PACK_CAN_SHRIEKSIGN @@ -2936,47 +3271,45 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV #endif case 'V': while (len-- > 0) { + U32 au32; fromstr = NEXTFROM; au32 = SvUV(fromstr); #ifdef HAS_HTOVL au32 = htovl(au32); #endif - CAT32(cat, &au32); + PUSH32(utf8, cur, &au32); } break; case 'L' | TYPE_IS_SHRIEKING: #if LONGSIZE != SIZE32 - { while (len-- > 0) { + unsigned long aulong; fromstr = NEXTFROM; aulong = SvUV(fromstr); DO_BO_PACK(aulong, l); - sv_catpvn(cat, (char *)&aulong, sizeof(unsigned long)); - } + PUSH_VAR(utf8, cur, aulong); } break; #else /* Fall though! */ #endif case 'L': - { while (len-- > 0) { + U32 au32; fromstr = NEXTFROM; au32 = SvUV(fromstr); DO_BO_PACK(au32, 32); - CAT32(cat, &au32); - } + PUSH32(utf8, cur, &au32); } break; case 'l' | TYPE_IS_SHRIEKING: #if LONGSIZE != SIZE32 - { while (len-- > 0) { + long along; fromstr = NEXTFROM; along = SvIV(fromstr); DO_BO_PACK(along, l); - sv_catpvn(cat, (char *)&along, sizeof(long)); - } + PUSH_VAR(utf8, cur, along); } break; #else @@ -2984,35 +3317,41 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV #endif case 'l': while (len-- > 0) { + I32 ai32; fromstr = NEXTFROM; ai32 = SvIV(fromstr); DO_BO_PACK(ai32, 32); - CAT32(cat, &ai32); + PUSH32(utf8, cur, &ai32); } break; #ifdef HAS_QUAD case 'Q': while (len-- > 0) { + Uquad_t auquad; fromstr = NEXTFROM; - auquad = (Uquad_t)SvUV(fromstr); + auquad = (Uquad_t) SvUV(fromstr); DO_BO_PACK(auquad, 64); - sv_catpvn(cat, (char*)&auquad, sizeof(Uquad_t)); + PUSH_VAR(utf8, cur, auquad); } break; case 'q': while (len-- > 0) { + Quad_t aquad; fromstr = NEXTFROM; aquad = (Quad_t)SvIV(fromstr); DO_BO_PACK(aquad, 64); - sv_catpvn(cat, (char*)&aquad, sizeof(Quad_t)); + PUSH_VAR(utf8, cur, aquad); } break; -#endif +#endif /* HAS_QUAD */ case 'P': len = 1; /* assume SV is correct length */ + GROWING(utf8, cat, start, cur, sizeof(char *)); /* Fall through! */ case 'p': while (len-- > 0) { + char *aptr; + fromstr = NEXTFROM; SvGETMAGIC(fromstr); if (!SvOK(fromstr)) aptr = NULL; @@ -3023,10 +3362,9 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV * of pack() (and all copies of the result) are * gone. */ - if (ckWARN(WARN_PACK) && (SvTEMP(fromstr) - || (SvPADTMP(fromstr) - && !SvREADONLY(fromstr)))) - { + if (ckWARN(WARN_PACK) && + (SvTEMP(fromstr) || (SvPADTMP(fromstr) && + !SvREADONLY(fromstr)))) { Perl_warner(aTHX_ packWARN(WARN_PACK), "Attempt to pack pointer to temporary value"); } @@ -3036,30 +3374,59 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV aptr = SvPV_force_flags(fromstr, n_a, 0); } DO_BO_PACK_P(aptr); - sv_catpvn(cat, (char*)&aptr, sizeof(char*)); + PUSH_VAR(utf8, cur, aptr); } break; - case 'u': + case 'u': { + char *aptr, *aend; + bool from_utf8; + fromstr = NEXTFROM; + if (len <= 2) len = 45; + else len = len / 3 * 3; + if (len >= 64) { + Perl_warner(aTHX_ packWARN(WARN_PACK), + "Field too wide in 'u' format in pack"); + len = 63; + } aptr = SvPV(fromstr, fromlen); - SvGROW(cat, fromlen * 4 / 3); - if (len <= 2) - len = 45; - else - len = len / 3 * 3; + from_utf8 = DO_UTF8(fromstr); + if (from_utf8) { + aend = aptr + fromlen; + fromlen = sv_len_utf8(fromstr); + } else aend = NULL; /* Unused, but keep compilers happy */ + GROWING(utf8, cat, start, cur, (fromlen+2) / 3 * 4 + (fromlen+len-1)/len * 2); while (fromlen > 0) { + U8 *end; I32 todo; + U8 hunk[1+63/3*4+1]; if ((I32)fromlen > len) todo = len; else todo = fromlen; - doencodes(cat, aptr, todo); - fromlen -= todo; + if (from_utf8) { + char buffer[64]; + if (!uni_to_bytes(aTHX_ &aptr, aend, buffer, todo, + 'u' | TYPE_IS_PACK)) { + *cur = '\0'; + SvCUR(cat) = cur - start; + Perl_croak(aTHX_ "Assertion: string is shorter than advertised"); + } + end = doencodes(hunk, buffer, todo); + } else { + end = doencodes(hunk, aptr, todo); aptr += todo; } + PUSH_BYTES(utf8, cur, hunk, end-hunk); + fromlen -= todo; + } break; } + } + *cur = '\0'; + SvCUR(cat) = cur - start; + no_change: *symptr = lookahead; } return beglist; @@ -3077,6 +3444,7 @@ PP(pp_pack) MARK++; sv_setpvn(cat, "", 0); + SvUTF8_off(cat); packlist(cat, pat, patend, MARK, SP + 1); |