diff options
Diffstat (limited to 'string.c')
-rw-r--r-- | string.c | 573 |
1 files changed, 408 insertions, 165 deletions
@@ -43,6 +43,7 @@ VALUE rb_cString; VALUE rb_cSymbol; +#define RUBY_MAX_CHAR_LEN 16 #define STR_TMPLOCK FL_USER7 #define STR_NOEMBED FL_USER1 #define STR_SHARED FL_USER2 /* = ELTS_SHARED */ @@ -111,18 +112,19 @@ VALUE rb_cSymbol; #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str)) -static int +static inline int single_byte_optimizable(VALUE str) { - rb_encoding *enc = STR_ENC_GET(str); - - if (rb_enc_mbmaxlen(enc) == 1) - return 1; + rb_encoding *enc; /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */ if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) return 1; + enc = STR_ENC_GET(str); + if (rb_enc_mbmaxlen(enc) == 1) + return 1; + /* Conservative. Possibly single byte. * "\xa1" in Shift_JIS for example. */ return 0; @@ -1057,6 +1059,17 @@ rb_str_modify(VALUE str) ENC_CODERANGE_CLEAR(str); } +/* As rb_str_modify(), but don't clear coderange */ +static void +str_modify_keep_cr(VALUE str) +{ + if (!str_independent(str)) + str_make_independent(str); + if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN) + /* Force re-scan later */ + ENC_CODERANGE_CLEAR(str); +} + void rb_str_associate(VALUE str, VALUE add) { @@ -1281,20 +1294,35 @@ rb_str_substr(VALUE str, long beg, long len) rb_encoding *enc = STR_ENC_GET(str); VALUE str2; char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str); - int singlebyte; if (len < 0) return Qnil; if (!RSTRING_LEN(str)) { len = 0; } + if (single_byte_optimizable(str)) { + if (beg > RSTRING_LEN(str)) return Qnil; + if (beg < 0) { + beg += RSTRING_LEN(str); + if (beg < 0) return Qnil; + } + if (beg + len > RSTRING_LEN(str)) + len = RSTRING_LEN(str) - beg; + if (len <= 0) { + len = 0; + p = 0; + } + else + p = s + beg; + goto sub; + } if (beg < 0) { if (len > -beg) len = -beg; if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) { beg = -beg; - while (beg-- > len && (e = rb_enc_prev_char(s, e, enc)) != 0); + while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0); p = e; if (!p) return Qnil; - while (len-- > 0 && (p = 
rb_enc_prev_char(s, p, enc)) != 0); + while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0); if (!p) return Qnil; len = e - p; goto sub; @@ -1307,7 +1335,6 @@ rb_str_substr(VALUE str, long beg, long len) else if (beg > 0 && beg > str_strlen(str, enc)) { return Qnil; } - singlebyte = single_byte_optimizable(str); if (len == 0) { p = 0; } @@ -1318,17 +1345,24 @@ rb_str_substr(VALUE str, long beg, long len) len = str_utf8_offset(p, e, len); } #endif - else if ((p = str_nth(s, e, beg, enc, singlebyte)) == e) { - len = 0; - } else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { - if (len * rb_enc_mbmaxlen(enc) > e - p) + int char_sz = rb_enc_mbmaxlen(enc); + + p = s + beg * char_sz; + if (p > e) { + p = e; + len = 0; + } + else if (len * char_sz > e - p) len = e - p; else - len *= rb_enc_mbmaxlen(enc); + len *= char_sz; + } + else if ((p = str_nth(s, e, beg, enc, 0)) == e) { + len = 0; } else { - len = str_offset(p, e, len, enc, singlebyte); + len = str_offset(p, e, len, enc, 0); } sub: if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) { @@ -1623,7 +1657,7 @@ rb_str_buf_cat_ascii(VALUE str, const char *ptr) else { char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc)); while (*ptr) { - int c = (unsigned char)*ptr; + unsigned int c = (unsigned char)*ptr; int len = rb_enc_codelen(c, enc); rb_enc_mbcput(c, buf, enc); rb_enc_cr_str_buf_cat(str, buf, len, @@ -1678,13 +1712,13 @@ rb_str_append(VALUE str, VALUE str2) /* * call-seq: - * str << fixnum => str - * str.concat(fixnum) => str + * str << integer => str + * str.concat(integer) => str * str << obj => str * str.concat(obj) => str * * Append---Concatenates the given object to <i>str</i>. If the object is a - * <code>Fixnum</code>, it is considered as a codepoint, and is converted + * <code>Integer</code>, it is considered as a codepoint, and is converted * to a character before concatenation. 
* * a = "hello " @@ -1695,9 +1729,9 @@ rb_str_append(VALUE str, VALUE str2) VALUE rb_str_concat(VALUE str1, VALUE str2) { - if (FIXNUM_P(str2)) { + if (FIXNUM_P(str2) || TYPE(str2) == T_BIGNUM) { rb_encoding *enc = STR_ENC_GET(str1); - int c = FIX2INT(str2); + unsigned int c = NUM2UINT(str2); int pos = RSTRING_LEN(str1); int len = rb_enc_codelen(c, enc); int cr = ENC_CODERANGE(str1); @@ -1892,6 +1926,7 @@ rb_str_comparable(VALUE str1, VALUE str2) { int idx1, idx2; int rc1, rc2; + int a8; if (RSTRING_LEN(str1) == 0) return Qtrue; if (RSTRING_LEN(str2) == 0) return Qtrue; @@ -1909,6 +1944,8 @@ rb_str_comparable(VALUE str1, VALUE str2) if (rb_enc_asciicompat(rb_enc_from_index(idx1))) return Qtrue; } + a8 = rb_ascii8bit_encindex(); + if (idx1 == a8 || idx2 == a8) return Qtrue; return Qfalse; } @@ -1922,7 +1959,7 @@ rb_str_cmp(VALUE str1, VALUE str2) retval = memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len); if (retval == 0) { if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) { - if (!rb_enc_compatible(str1, str2)) { + if (!rb_str_comparable(str1, str2)) { if (ENCODING_GET(str1) - ENCODING_GET(str2) > 0) return 1; return -1; @@ -2065,19 +2102,33 @@ rb_str_casecmp(VALUE str1, VALUE str2) p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1); p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2); - while (p1 < p1end && p2 < p2end) { - unsigned int c1 = rb_enc_codepoint(p1, p1end, enc); - unsigned int c2 = rb_enc_codepoint(p2, p2end, enc); - - if (c1 != c2) { - c1 = rb_enc_toupper(c1, enc); - c2 = rb_enc_toupper(c2, enc); - if (c1 > c2) return INT2FIX(1); - if (c1 < c2) return INT2FIX(-1); + if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) { + while (p1 < p1end && p2 < p2end) { + if (*p1 != *p2) { + unsigned int c1 = rb_enc_toupper(*p1 & 0xff, enc); + unsigned int c2 = rb_enc_toupper(*p2 & 0xff, enc); + if (c1 > c2) return INT2FIX(1); + if (c1 < c2) return INT2FIX(-1); + } + p1++; + p2++; + } + } + else { + while (p1 < p1end && p2 < p2end) { + unsigned int c1 = 
rb_enc_codepoint(p1, p1end, enc); + unsigned int c2 = rb_enc_codepoint(p2, p2end, enc); + + if (c1 != c2) { + c1 = rb_enc_toupper(c1, enc); + c2 = rb_enc_toupper(c2, enc); + if (c1 > c2) return INT2FIX(1); + if (c1 < c2) return INT2FIX(-1); + } + len = rb_enc_codelen(c1, enc); + p1 += len; + p2 += len; } - len = rb_enc_codelen(c1, enc); - p1 += len; - p2 += len; } if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0); if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1); @@ -2088,7 +2139,7 @@ static long rb_str_index(VALUE str, VALUE sub, long offset) { long pos; - char *s, *sptr; + char *s, *sptr, *e; long len, slen; rb_encoding *enc; @@ -2104,6 +2155,7 @@ rb_str_index(VALUE str, VALUE sub, long offset) } if (len - offset < slen) return -1; s = RSTRING_PTR(str); + e = s + RSTRING_LEN(str); if (offset) { offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str)); s += offset; @@ -2117,7 +2169,7 @@ rb_str_index(VALUE str, VALUE sub, long offset) char *t; pos = rb_memsearch(sptr, slen, s, len, enc); if (pos < 0) return pos; - t = rb_enc_right_char_head(s, s+pos, enc); + t = rb_enc_right_char_head(s, s+pos, e, enc); if (t == s + pos) break; if ((len -= t - s) <= 0) return -1; offset += t - s; @@ -2571,7 +2623,7 @@ rb_str_succ(VALUE orig) sbeg = RSTRING_PTR(str); s = e = sbeg + RSTRING_LEN(str); - while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) { + while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) { if (ISALPHA(*last_alnum) ? ISDIGIT(*s) : ISDIGIT(*last_alnum) ? 
ISALPHA(*s) : 0) { @@ -2596,7 +2648,7 @@ rb_str_succ(VALUE orig) } if (c == -1) { /* str contains no alnum */ s = e; - while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) { + while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { enum neighbor_char neighbor; if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; neighbor = enc_succ_char(s, l, enc); @@ -2894,11 +2946,11 @@ rb_str_splice(VALUE str, long beg, long len, VALUE val) char *p, *e; rb_encoding *enc; int singlebyte = single_byte_optimizable(str); + int cr; if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len); StringValue(val); - rb_str_modify(str); enc = rb_enc_check(str, val); slen = str_strlen(str, enc); @@ -2915,6 +2967,7 @@ rb_str_splice(VALUE str, long beg, long len, VALUE val) if (slen < len || slen < beg + len) { len = slen - beg; } + str_modify_keep_cr(str); p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte); if (!p) p = RSTRING_END(str); e = str_nth(p, RSTRING_END(str), len, enc, singlebyte); @@ -2924,6 +2977,9 @@ rb_str_splice(VALUE str, long beg, long len, VALUE val) len = e - p; /* physical length */ rb_str_splice_0(str, beg, len, val); rb_enc_associate(str, enc); + cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val)); + if (cr != ENC_CODERANGE_BROKEN) + ENC_CODERANGE_SET(str, cr); } void @@ -3114,7 +3170,7 @@ rb_str_slice_bang(int argc, VALUE *argv, VALUE str) for (i=0; i<argc; i++) { buf[i] = argv[i]; } - rb_str_modify(str); + str_modify_keep_cr(str); buf[i] = rb_str_new(0,0); result = rb_str_aref_m(argc, buf, str); if (!NIL_P(result)) { @@ -3524,7 +3580,10 @@ rb_str_clear(VALUE str) STR_SET_EMBED(str); STR_SET_EMBED_LEN(str, 0); RSTRING_PTR(str)[0] = 0; - ENC_CODERANGE_CLEAR(str); + if (rb_enc_asciicompat(STR_ENC_GET(str))) + ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); + else + ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); return str; } @@ -3656,22 +3715,15 @@ rb_str_reverse_bang(VALUE str) if (RSTRING_LEN(str) > 1) { if (single_byte_optimizable(str)) 
{ char *s, *e, c; - int cr = ENC_CODERANGE(str); - int single = 1; - rb_str_modify(str); + str_modify_keep_cr(str); s = RSTRING_PTR(str); e = RSTRING_END(str) - 1; while (s < e) { c = *s; - if (*s & 0x80) single = 0; *s++ = *e; *e-- = c; } - if (cr == ENC_CODERANGE_UNKNOWN && single) { - cr = ENC_CODERANGE_7BIT; - } - ENC_CODERANGE_SET(str, cr); } else { rb_str_shared_replace(str, rb_str_reverse(str)); @@ -3787,9 +3839,9 @@ rb_str_to_s(VALUE str) } static void -str_cat_char(VALUE str, int c, rb_encoding *enc) +str_cat_char(VALUE str, unsigned int c, rb_encoding *enc) { - char s[16]; + char s[RUBY_MAX_CHAR_LEN]; int n = rb_enc_codelen(c, enc); rb_enc_mbcput(c, s, enc); @@ -3797,7 +3849,7 @@ str_cat_char(VALUE str, int c, rb_encoding *enc) } static void -prefix_escape(VALUE str, int c, rb_encoding *enc) +prefix_escape(VALUE str, unsigned int c, rb_encoding *enc) { str_cat_char(str, '\\', enc); str_cat_char(str, c, enc); @@ -4033,23 +4085,46 @@ rb_str_upcase_bang(VALUE str) rb_encoding *enc; char *s, *send; int modify = 0; - int cr = ENC_CODERANGE(str); - rb_str_modify(str); + str_modify_keep_cr(str); enc = STR_ENC_GET(str); s = RSTRING_PTR(str); send = RSTRING_END(str); - while (s < send) { - unsigned int c = rb_enc_codepoint(s, send, enc); + if (single_byte_optimizable(str)) { + while (s < send) { + unsigned int c = *(unsigned char*)s; - if (rb_enc_islower(c, enc)) { - /* assuming toupper returns codepoint with same size */ - rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); - modify = 1; + if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { + *s = 'A' + (c - 'a'); + modify = 1; + } + s++; + } + } + else { + int ascompat = rb_enc_asciicompat(enc); + + while (s < send) { + unsigned int c; + + if (ascompat && (c = *(unsigned char*)s) < 0x80) { + if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { + *s = 'A' + (c - 'a'); + modify = 1; + } + s++; + } + else { + c = rb_enc_codepoint(s, send, enc); + if (rb_enc_islower(c, enc)) { + /* assuming toupper returns codepoint 
with same size */ + rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); + modify = 1; + } + s += rb_enc_codelen(c, enc); + } } - s += rb_enc_codelen(c, enc); } - ENC_CODERANGE_SET(str, cr); if (modify) return str; return Qnil; } @@ -4091,23 +4166,46 @@ rb_str_downcase_bang(VALUE str) rb_encoding *enc; char *s, *send; int modify = 0; - int cr = ENC_CODERANGE(str); - rb_str_modify(str); + str_modify_keep_cr(str); enc = STR_ENC_GET(str); s = RSTRING_PTR(str); send = RSTRING_END(str); - while (s < send) { - unsigned int c = rb_enc_codepoint(s, send, enc); + if (single_byte_optimizable(str)) { + while (s < send) { + unsigned int c = *(unsigned char*)s; - if (rb_enc_isupper(c, enc)) { - /* assuming toupper returns codepoint with same size */ - rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); - modify = 1; + if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { + *s = 'a' + (c - 'A'); + modify = 1; + } + s++; + } + } + else { + int ascompat = rb_enc_asciicompat(enc); + + while (s < send) { + unsigned int c; + + if (ascompat && (c = *(unsigned char*)s) < 0x80) { + if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { + *s = 'a' + (c - 'A'); + modify = 1; + } + s++; + } + else { + c = rb_enc_codepoint(s, send, enc); + if (rb_enc_isupper(c, enc)) { + /* assuming toupper returns codepoint with same size */ + rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); + modify = 1; + } + s += rb_enc_codelen(c, enc); + } } - s += rb_enc_codelen(c, enc); } - ENC_CODERANGE_SET(str, cr); if (modify) return str; return Qnil; } @@ -4155,9 +4253,8 @@ rb_str_capitalize_bang(VALUE str) char *s, *send; int modify = 0; unsigned int c; - int cr = ENC_CODERANGE(str); - rb_str_modify(str); + str_modify_keep_cr(str); enc = STR_ENC_GET(str); if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; s = RSTRING_PTR(str); send = RSTRING_END(str); @@ -4177,7 +4274,6 @@ rb_str_capitalize_bang(VALUE str) s += rb_enc_codelen(c, enc); } - ENC_CODERANGE_SET(str, cr); if (modify) return str; return Qnil; } @@ -4220,9 
+4316,8 @@ rb_str_swapcase_bang(VALUE str) rb_encoding *enc; char *s, *send; int modify = 0; - int cr = ENC_CODERANGE(str); - rb_str_modify(str); + str_modify_keep_cr(str); enc = STR_ENC_GET(str); s = RSTRING_PTR(str); send = RSTRING_END(str); while (s < send) { @@ -4234,14 +4329,13 @@ rb_str_swapcase_bang(VALUE str) modify = 1; } else if (rb_enc_islower(c, enc)) { - /* assuming toupper returns codepoint with same size */ + /* assuming tolower returns codepoint with same size */ rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); modify = 1; } - s += rb_enc_codelen(c, enc); + s += rb_enc_mbclen(s, send, enc); } - ENC_CODERANGE_SET(str, cr); if (modify) return str; return Qnil; } @@ -4323,6 +4417,7 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) char *s, *send; VALUE hash = 0; int singlebyte = single_byte_optimizable(str); + int cr; StringValue(src); StringValue(repl); @@ -4331,6 +4426,7 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) return rb_str_delete_bang(1, &src, str); } + cr = ENC_CODERANGE(str); e1 = rb_enc_check(str, src); e2 = rb_enc_check(str, repl); if (e1 == e2) { @@ -4394,7 +4490,7 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) } } - rb_str_modify(str); + str_modify_keep_cr(str); s = RSTRING_PTR(str); send = RSTRING_END(str); if (sflag) { int clen, tlen, max = RSTRING_LEN(str); @@ -4516,6 +4612,9 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) } if (modify) { + cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(repl)); + if (cr != ENC_CODERANGE_BROKEN) + ENC_CODERANGE_SET(str, cr); rb_enc_associate(str, enc); return str; } @@ -4654,11 +4753,9 @@ rb_str_delete_bang(int argc, VALUE *argv, VALUE str) char *s, *send, *t; VALUE del = 0, nodel = 0; int modify = 0; - int i; - int cr; + int i, ascompat; if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; - cr = ENC_CODERANGE(str); if (argc < 1) { rb_raise(rb_eArgError, "wrong number of arguments"); } @@ -4670,27 +4767,41 @@ rb_str_delete_bang(int argc, VALUE *argv, VALUE str) 
tr_setup_table(s, squeez, i==0, &del, &nodel, enc); } - rb_str_modify(str); + str_modify_keep_cr(str); + ascompat = rb_enc_asciicompat(enc); s = t = RSTRING_PTR(str); - if (!s || RSTRING_LEN(str) == 0) return Qnil; send = RSTRING_END(str); while (s < send) { - unsigned int c = rb_enc_codepoint(s, send, enc); - int clen = rb_enc_codelen(c, enc); + unsigned int c; + int clen; - if (tr_find(c, squeez, del, nodel)) { - modify = 1; + if (ascompat && (c = *(unsigned char*)s) < 0x80) { + if (squeez[c]) { + modify = 1; + } + else { + if (t != s) *t = c; + t++; + } + s++; } else { - if (t != s) rb_enc_mbcput(c, t, enc); - t += clen; + c = rb_enc_codepoint(s, send, enc); + clen = rb_enc_codelen(c, enc); + + if (tr_find(c, squeez, del, nodel)) { + modify = 1; + } + else { + if (t != s) rb_enc_mbcput(c, t, enc); + t += clen; + } + s += clen; } - s += clen; } *t = '\0'; STR_SET_LEN(str, t - RSTRING_PTR(str)); - ENC_CODERANGE_SET(str, cr); if (modify) return str; return Qnil; } @@ -4736,6 +4847,7 @@ rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str) char *s, *send, *t; int save, modify = 0; int i; + int ascompat, singlebyte = single_byte_optimizable(str); if (argc == 0) { enc = STR_ENC_GET(str); @@ -4746,26 +4858,51 @@ rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str) StringValue(s); enc = rb_enc_check(str, s); + if (singlebyte && !single_byte_optimizable(s)) + singlebyte = 0; tr_setup_table(s, squeez, i==0, &del, &nodel, enc); } } - rb_str_modify(str); + str_modify_keep_cr(str); s = t = RSTRING_PTR(str); if (!s || RSTRING_LEN(str) == 0) return Qnil; send = RSTRING_END(str); save = -1; - while (s < send) { - unsigned int c = rb_enc_codepoint(s, send, enc); - int clen = rb_enc_codelen(c, enc); + ascompat = rb_enc_asciicompat(enc); - if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) { - if (t != s) rb_enc_mbcput(c, t, enc); - save = c; - t += clen; + if (singlebyte) { + while (s < send) { + unsigned int c = *(unsigned char*)s++; + if (c != save || (argc > 0 && 
!squeez[c])) { + *t++ = save = c; + } + } + } else { + while (s < send) { + unsigned int c; + int clen; + + if (ascompat && (c = *(unsigned char*)s) < 0x80) { + if (c != save || (argc > 0 && !squeez[c])) { + *t++ = save = c; + } + s++; + } + else { + c = rb_enc_codepoint(s, send, enc); + clen = rb_enc_codelen(c, enc); + + if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) { + if (t != s) rb_enc_mbcput(c, t, enc); + save = c; + t += clen; + } + s += clen; + } } - s += clen; } + *t = '\0'; if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) { STR_SET_LEN(str, t - RSTRING_PTR(str)); @@ -4777,7 +4914,7 @@ rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str) } -/* +/* * call-seq: * str.squeeze([other_str]*) => new_str * @@ -4862,6 +4999,7 @@ rb_str_count(int argc, VALUE *argv, VALUE str) VALUE del = 0, nodel = 0; char *s, *send; int i; + int ascompat; if (argc < 1) { rb_raise(rb_eArgError, "wrong number of arguments"); @@ -4871,22 +5009,36 @@ rb_str_count(int argc, VALUE *argv, VALUE str) StringValue(s); enc = rb_enc_check(str, s); - tr_setup_table(s, table,i==0, &del, &nodel, enc); + tr_setup_table(s, table, i==0, &del, &nodel, enc); } s = RSTRING_PTR(str); if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0); send = RSTRING_END(str); i = 0; + ascompat = rb_enc_asciicompat(enc); + while (s < send) { - unsigned int c = rb_enc_codepoint(s, send, enc); - int clen = rb_enc_codelen(c, enc); + unsigned int c; + int clen; - if (tr_find(c, table, del, nodel)) { - i++; + if (ascompat && (c = *(unsigned char*)s) < 0x80) { + clen = 1; + if (table[c]) { + i++; + } + s++; + } + else { + c = rb_enc_codepoint(s, send, enc); + clen = rb_enc_codelen(c, enc); + if (tr_find(c, table, del, nodel)) { + i++; + } + s += clen; } - s += clen; } + return INT2NUM(i); } @@ -4939,7 +5091,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) rb_encoding *enc; VALUE spat; VALUE limit; - int awk_split = Qfalse; + enum {awk, string, regexp} split_type; long beg, end, i = 0; int lim = 0; VALUE 
result, tmp; @@ -4961,37 +5113,41 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) spat = rb_fs; goto fs_set; } - awk_split = Qtrue; + split_type = awk; } else { fs_set: if (TYPE(spat) == T_STRING) { rb_encoding *enc2 = STR_ENC_GET(spat); - if (rb_enc_mbminlen(enc2) == 1) { + split_type = string; + if (RSTRING_LEN(spat) == 0) { + /* Special case - split into chars */ + spat = rb_reg_regcomp(spat); + split_type = regexp; + } + else if (rb_enc_mbminlen(enc2) == 1) { if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){ - awk_split = Qtrue; + split_type = awk; } } else { int l; if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' && RSTRING_LEN(spat) == l) { - awk_split = Qtrue; + split_type = awk; } } - if (!awk_split) { - spat = rb_reg_regcomp(rb_reg_quote(spat)); - } } else { spat = get_pat(spat, 1); + split_type = regexp; } } result = rb_ary_new(); beg = 0; - if (awk_split) { + if (split_type == awk) { char *ptr = RSTRING_PTR(str); char *eptr = RSTRING_END(str); char *bptr = ptr; @@ -5025,6 +5181,33 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) } } } + else if (split_type == string) { + char *ptr = RSTRING_PTR(str); + char *eptr = RSTRING_END(str); + char *sptr = RSTRING_PTR(spat); + int slen = RSTRING_LEN(spat); + + if (is_broken_string(str)) { + rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str))); + } + if (is_broken_string(spat)) { + rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat))); + } + enc = rb_enc_check(str, spat); + while (ptr < eptr && + (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) { + /* Check we are at the start of a char */ + char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc); + if (t != ptr + end) { + ptr = t; + continue; + } + rb_ary_push(result, rb_str_substr(str, ptr - RSTRING_PTR(str), end)); + ptr += end + slen; + if (!NIL_P(limit) && lim <= ++i) break; + } + beg = ptr - RSTRING_PTR(str); + } else { long start = beg; 
long idx; @@ -5178,7 +5361,7 @@ rb_str_each_line(int argc, VALUE *argv, VALUE str) p = memchr(p, '\n', pend - p); if (!p) break; - p0 = rb_enc_left_char_head(s, p, enc); + p0 = rb_enc_left_char_head(s, p, pend, enc); if (!rb_enc_is_newline(p0, pend, enc)) { p++; continue; @@ -5326,6 +5509,59 @@ rb_str_each_char(VALUE str) return str; } +/* + * Document-method: codepoints + * call-seq: + * str.codepoints => anEnumerator + * str.codepoints {|integer| block } => str + * + * Returns an enumerator that gives the <code>Integer</code> ordinal + * of each character in the string, also known as a <i>codepoint</i> + * when applied to Unicode strings. If a block is given, it iterates + * over each character in the string. + * + * "foo\u0635".codepoints.to_a #=> [102, 111, 111, 1589] + */ + +/* + * Document-method: each_codepoint + * call-seq: + * str.each_codepoint {|integer| block } => str + * + * Passes the <code>Integer</code> ordinal of each character in <i>str</i>, + * also known as a <i>codepoint</i> when applied to Unicode strings to the + * given block. 
+ * + * "hello\u0639".each_codepoint {|c| print c, ' ' } + * + * <em>produces:</em> + * + * 104 101 108 108 111 1593 + */ + +static VALUE +rb_str_each_codepoint(VALUE str) +{ + int len, n; + unsigned int c; + const char *ptr, *end; + rb_encoding *enc; + + if (single_byte_optimizable(str)) return rb_str_each_byte(str); + RETURN_ENUMERATOR(str, 0, 0); + ptr = RSTRING_PTR(str); + len = RSTRING_LEN(str); + end = RSTRING_END(str); + enc = STR_ENC_GET(str); + while (ptr < end) { + c = rb_enc_codepoint(ptr, end, enc); + n = rb_enc_codelen(c, enc); + rb_yield(UINT2NUM(c)); + ptr += n; + } + return str; +} + static long chopped_length(VALUE str) { @@ -5335,10 +5571,10 @@ chopped_length(VALUE str) beg = RSTRING_PTR(str); end = beg + RSTRING_LEN(str); if (beg > end) return 0; - p = rb_enc_prev_char(beg, end, enc); + p = rb_enc_prev_char(beg, end, end, enc); if (!p) return 0; if (p > beg && rb_enc_codepoint(p, end, enc) == '\n') { - p2 = rb_enc_prev_char(beg, p, enc); + p2 = rb_enc_prev_char(beg, p, end, enc); if (p2 && rb_enc_codepoint(p2, end, enc) == '\r') p = p2; } return p - beg; @@ -5358,7 +5594,7 @@ rb_str_chop_bang(VALUE str) { if (RSTRING_LEN(str) > 0) { long len; - rb_str_modify(str); + str_modify_keep_cr(str); len = chopped_length(str); STR_SET_LEN(str, len); RSTRING_PTR(str)[len] = '\0'; @@ -5420,16 +5656,16 @@ rb_str_chomp_bang(int argc, VALUE *argv, VALUE str) rs = rb_rs; if (rs == rb_default_rs) { smart_chomp: - rb_str_modify(str); + str_modify_keep_cr(str); enc = rb_enc_get(str); if (rb_enc_mbminlen(enc) > 1) { - pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), enc); + pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc); if (rb_enc_is_newline(pp, e, enc)) { e = pp; } pp = e - rb_enc_mbminlen(enc); if (pp >= p) { - pp = rb_enc_left_char_head(p, pp, enc); + pp = rb_enc_left_char_head(p, pp, e, enc); if (rb_enc_ascget(pp, e, 0, enc) == '\r') { e = pp; } @@ -5472,7 +5708,7 @@ rb_str_chomp_bang(int argc, VALUE *argv, VALUE str) len--; } if (len < 
RSTRING_LEN(str)) { - rb_str_modify(str); + str_modify_keep_cr(str); STR_SET_LEN(str, len); RSTRING_PTR(str)[len] = '\0'; return str; @@ -5492,9 +5728,9 @@ rb_str_chomp_bang(int argc, VALUE *argv, VALUE str) if (p[len-1] == newline && (rslen <= 1 || memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) { - if (rb_enc_left_char_head(p, pp, enc) != pp) + if (rb_enc_left_char_head(p, pp, e, enc) != pp) return Qnil; - rb_str_modify(str); + str_modify_keep_cr(str); STR_SET_LEN(str, RSTRING_LEN(str) - rslen); RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; return str; @@ -5548,7 +5784,7 @@ rb_str_lstrip_bang(VALUE str) rb_encoding *enc; char *s, *t, *e; - rb_str_modify(str); + str_modify_keep_cr(str); enc = STR_ENC_GET(str); s = RSTRING_PTR(str); if (!s || RSTRING_LEN(str) == 0) return Qnil; @@ -5562,7 +5798,6 @@ rb_str_lstrip_bang(VALUE str) } if (s > RSTRING_PTR(str)) { - rb_str_modify(str); STR_SET_LEN(str, t-s); memmove(RSTRING_PTR(str), s, RSTRING_LEN(str)); RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; @@ -5609,30 +5844,33 @@ rb_str_rstrip_bang(VALUE str) { rb_encoding *enc; char *s, *t, *e; - int space_seen = Qfalse; - rb_str_modify(str); enc = STR_ENC_GET(str); s = RSTRING_PTR(str); if (!s || RSTRING_LEN(str) == 0) return Qnil; t = e = RSTRING_END(str); - while (s < e) { - unsigned int cc = rb_enc_codepoint(s, e, enc); - if (!cc || rb_enc_isspace(cc, enc)) { - if (!space_seen) t = s; - space_seen = Qtrue; - } - else { - space_seen = Qfalse; + if (single_byte_optimizable(str)) { + /* remove trailing '\0's */ + while (s < t && t[-1] == '\0') t--; + + /* remove trailing spaces */ + while (s < t && rb_enc_isspace(*(t-1), enc)) t--; + } + else { + char *tp; + + while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) { + if (!rb_enc_isspace(rb_enc_codepoint(tp, e, enc), enc)) break; + t = tp; } - s += rb_enc_codelen(cc, enc); } - if (!space_seen) t = s; if (t < e) { - rb_str_modify(str); - STR_SET_LEN(str, t-RSTRING_PTR(str)); - RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; + int len = 
t-RSTRING_PTR(str); + + str_modify_keep_cr(str); + STR_SET_LEN(str, len); + RSTRING_PTR(str)[len] = '\0'; return str; } return Qnil; @@ -5864,30 +6102,27 @@ rb_str_crypt(VALUE str, VALUE salt) { extern char *crypt(const char *, const char *); VALUE result; - const char *s; + const char *s, *saltp; #ifdef BROKEN_CRYPT - VALUE salt_8bit_clean; - rb_encoding *enc; + char salt_8bit_clean[3]; #endif StringValue(salt); if (RSTRING_LEN(salt) < 2) rb_raise(rb_eArgError, "salt too short (need >=2 bytes)"); - if (RSTRING_PTR(str)) s = RSTRING_PTR(str); - else s = ""; + s = RSTRING_PTR(str); + if (!s) s = ""; + saltp = RSTRING_PTR(salt); #ifdef BROKEN_CRYPT - salt_8bit_clean = rb_str_dup(salt); - enc = rb_ascii8bit_encoding(); - str_modifiable(salt_8bit_clean); - rb_enc_associate(salt_8bit_clean, enc); - salt_8bit_clean = rb_str_tr(salt_8bit_clean, - rb_enc_str_new("\x80-\xFF", 3, enc), - rb_usascii_str_new("\x00-\x7F", 3)); - result = rb_str_new2(crypt(s, RSTRING_PTR(salt_8bit_clean))); -#else - result = rb_str_new2(crypt(s, RSTRING_PTR(salt))); + if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) { + salt_8bit_clean[0] = saltp[0] & 0x7f; + salt_8bit_clean[1] = saltp[1] & 0x7f; + salt_8bit_clean[2] = '\0'; + saltp = salt_8bit_clean; + } #endif + result = rb_str_new2(crypt(s, saltp)); OBJ_INFECT(result, str); OBJ_INFECT(result, salt); return result; @@ -6029,7 +6264,7 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag) const char *f = " "; long n, llen, rlen; volatile VALUE pad; - int singlebyte = 1; + int singlebyte = 1, cr; rb_scan_args(argc, argv, "11", &w, &pad); enc = STR_ENC_GET(str); @@ -6050,6 +6285,7 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag) n = width - len; llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? 
n : n/2); rlen = n - llen; + cr = ENC_CODERANGE(str); res = rb_str_new5(str, 0, RSTRING_LEN(str)+n*flen/fclen+2); p = RSTRING_PTR(res); while (llen) { @@ -6095,6 +6331,10 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag) OBJ_INFECT(res, str); if (!NIL_P(pad)) OBJ_INFECT(res, pad); rb_enc_associate(res, enc); + if (argc == 2) + cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad)); + if (cr != ENC_CODERANGE_BROKEN) + ENC_CODERANGE_SET(res, cr); return res; } @@ -6283,7 +6523,7 @@ static VALUE rb_str_end_with(int argc, VALUE *argv, VALUE str) { int i; - char *p, *s; + char *p, *s, *e; rb_encoding *enc; for (i=0; i<argc; i++) { @@ -6292,8 +6532,9 @@ rb_str_end_with(int argc, VALUE *argv, VALUE str) enc = rb_enc_check(str, tmp); if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue; p = RSTRING_PTR(str); - s = p + RSTRING_LEN(str) - RSTRING_LEN(tmp); - if (rb_enc_left_char_head(p, s, enc) != s) + e = p + RSTRING_LEN(str); + s = e - RSTRING_LEN(tmp); + if (rb_enc_left_char_head(p, s, e, enc) != s) continue; if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) return Qtrue; @@ -6695,6 +6936,7 @@ Init_String(void) rb_define_method(rb_cString, "lines", rb_str_each_line, -1); rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0); rb_define_method(rb_cString, "chars", rb_str_each_char, 0); + rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0); rb_define_method(rb_cString, "reverse", rb_str_reverse, 0); rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0); rb_define_method(rb_cString, "concat", rb_str_concat, 1); @@ -6744,6 +6986,7 @@ Init_String(void) rb_define_method(rb_cString, "each_line", rb_str_each_line, -1); rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0); rb_define_method(rb_cString, "each_char", rb_str_each_char, 0); + rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0); rb_define_method(rb_cString, "sum", rb_str_sum, -1); |