summary | refs | log | tree | commit | diff
path: root/string.c
diff options
context:
space:
mode:
Diffstat (limited to 'string.c')
-rw-r--r-- string.c 573
1 files changed, 408 insertions, 165 deletions
diff --git a/string.c b/string.c
index 24d33ef508..a56dbf66a3 100644
--- a/string.c
+++ b/string.c
@@ -43,6 +43,7 @@
VALUE rb_cString;
VALUE rb_cSymbol;
+#define RUBY_MAX_CHAR_LEN 16
#define STR_TMPLOCK FL_USER7
#define STR_NOEMBED FL_USER1
#define STR_SHARED FL_USER2 /* = ELTS_SHARED */
@@ -111,18 +112,19 @@ VALUE rb_cSymbol;
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
-static int
+static inline int
single_byte_optimizable(VALUE str)
{
- rb_encoding *enc = STR_ENC_GET(str);
-
- if (rb_enc_mbmaxlen(enc) == 1)
- return 1;
+ rb_encoding *enc;
/* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
return 1;
+ enc = STR_ENC_GET(str);
+ if (rb_enc_mbmaxlen(enc) == 1)
+ return 1;
+
/* Conservative. Possibly single byte.
* "\xa1" in Shift_JIS for example. */
return 0;
@@ -1057,6 +1059,17 @@ rb_str_modify(VALUE str)
ENC_CODERANGE_CLEAR(str);
}
+/* As rb_str_modify(), but keep the coderange unless it is BROKEN */
+static void
+str_modify_keep_cr(VALUE str)
+{
+ if (!str_independent(str))
+ str_make_independent(str);
+ if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
+ /* Force re-scan later */
+ ENC_CODERANGE_CLEAR(str);
+}
+
void
rb_str_associate(VALUE str, VALUE add)
{
@@ -1281,20 +1294,35 @@ rb_str_substr(VALUE str, long beg, long len)
rb_encoding *enc = STR_ENC_GET(str);
VALUE str2;
char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
- int singlebyte;
if (len < 0) return Qnil;
if (!RSTRING_LEN(str)) {
len = 0;
}
+ if (single_byte_optimizable(str)) {
+ if (beg > RSTRING_LEN(str)) return Qnil;
+ if (beg < 0) {
+ beg += RSTRING_LEN(str);
+ if (beg < 0) return Qnil;
+ }
+ if (beg + len > RSTRING_LEN(str))
+ len = RSTRING_LEN(str) - beg;
+ if (len <= 0) {
+ len = 0;
+ p = 0;
+ }
+ else
+ p = s + beg;
+ goto sub;
+ }
if (beg < 0) {
if (len > -beg) len = -beg;
if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
beg = -beg;
- while (beg-- > len && (e = rb_enc_prev_char(s, e, enc)) != 0);
+ while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
p = e;
if (!p) return Qnil;
- while (len-- > 0 && (p = rb_enc_prev_char(s, p, enc)) != 0);
+ while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
if (!p) return Qnil;
len = e - p;
goto sub;
@@ -1307,7 +1335,6 @@ rb_str_substr(VALUE str, long beg, long len)
else if (beg > 0 && beg > str_strlen(str, enc)) {
return Qnil;
}
- singlebyte = single_byte_optimizable(str);
if (len == 0) {
p = 0;
}
@@ -1318,17 +1345,24 @@ rb_str_substr(VALUE str, long beg, long len)
len = str_utf8_offset(p, e, len);
}
#endif
- else if ((p = str_nth(s, e, beg, enc, singlebyte)) == e) {
- len = 0;
- }
else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
- if (len * rb_enc_mbmaxlen(enc) > e - p)
+ int char_sz = rb_enc_mbmaxlen(enc);
+
+ p = s + beg * char_sz;
+ if (p > e) {
+ p = e;
+ len = 0;
+ }
+ else if (len * char_sz > e - p)
len = e - p;
else
- len *= rb_enc_mbmaxlen(enc);
+ len *= char_sz;
+ }
+ else if ((p = str_nth(s, e, beg, enc, 0)) == e) {
+ len = 0;
}
else {
- len = str_offset(p, e, len, enc, singlebyte);
+ len = str_offset(p, e, len, enc, 0);
}
sub:
if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
@@ -1623,7 +1657,7 @@ rb_str_buf_cat_ascii(VALUE str, const char *ptr)
else {
char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
while (*ptr) {
- int c = (unsigned char)*ptr;
+ unsigned int c = (unsigned char)*ptr;
int len = rb_enc_codelen(c, enc);
rb_enc_mbcput(c, buf, enc);
rb_enc_cr_str_buf_cat(str, buf, len,
@@ -1678,13 +1712,13 @@ rb_str_append(VALUE str, VALUE str2)
/*
* call-seq:
- * str << fixnum => str
- * str.concat(fixnum) => str
+ * str << integer => str
+ * str.concat(integer) => str
* str << obj => str
* str.concat(obj) => str
*
* Append---Concatenates the given object to <i>str</i>. If the object is a
- * <code>Fixnum</code>, it is considered as a codepoint, and is converted
+ * <code>Integer</code>, it is considered as a codepoint, and is converted
* to a character before concatenation.
*
* a = "hello "
@@ -1695,9 +1729,9 @@ rb_str_append(VALUE str, VALUE str2)
VALUE
rb_str_concat(VALUE str1, VALUE str2)
{
- if (FIXNUM_P(str2)) {
+ if (FIXNUM_P(str2) || TYPE(str2) == T_BIGNUM) {
rb_encoding *enc = STR_ENC_GET(str1);
- int c = FIX2INT(str2);
+ unsigned int c = NUM2UINT(str2);
int pos = RSTRING_LEN(str1);
int len = rb_enc_codelen(c, enc);
int cr = ENC_CODERANGE(str1);
@@ -1892,6 +1926,7 @@ rb_str_comparable(VALUE str1, VALUE str2)
{
int idx1, idx2;
int rc1, rc2;
+ int a8;
if (RSTRING_LEN(str1) == 0) return Qtrue;
if (RSTRING_LEN(str2) == 0) return Qtrue;
@@ -1909,6 +1944,8 @@ rb_str_comparable(VALUE str1, VALUE str2)
if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
return Qtrue;
}
+ a8 = rb_ascii8bit_encindex();
+ if (idx1 == a8 || idx2 == a8) return Qtrue;
return Qfalse;
}
@@ -1922,7 +1959,7 @@ rb_str_cmp(VALUE str1, VALUE str2)
retval = memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len);
if (retval == 0) {
if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) {
- if (!rb_enc_compatible(str1, str2)) {
+ if (!rb_str_comparable(str1, str2)) {
if (ENCODING_GET(str1) - ENCODING_GET(str2) > 0)
return 1;
return -1;
@@ -2065,19 +2102,33 @@ rb_str_casecmp(VALUE str1, VALUE str2)
p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
- while (p1 < p1end && p2 < p2end) {
- unsigned int c1 = rb_enc_codepoint(p1, p1end, enc);
- unsigned int c2 = rb_enc_codepoint(p2, p2end, enc);
-
- if (c1 != c2) {
- c1 = rb_enc_toupper(c1, enc);
- c2 = rb_enc_toupper(c2, enc);
- if (c1 > c2) return INT2FIX(1);
- if (c1 < c2) return INT2FIX(-1);
+ if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
+ while (p1 < p1end && p2 < p2end) {
+ if (*p1 != *p2) {
+ unsigned int c1 = rb_enc_toupper(*p1 & 0xff, enc);
+ unsigned int c2 = rb_enc_toupper(*p2 & 0xff, enc);
+ if (c1 > c2) return INT2FIX(1);
+ if (c1 < c2) return INT2FIX(-1);
+ }
+ p1++;
+ p2++;
+ }
+ }
+ else {
+ while (p1 < p1end && p2 < p2end) {
+ unsigned int c1 = rb_enc_codepoint(p1, p1end, enc);
+ unsigned int c2 = rb_enc_codepoint(p2, p2end, enc);
+
+ if (c1 != c2) {
+ c1 = rb_enc_toupper(c1, enc);
+ c2 = rb_enc_toupper(c2, enc);
+ if (c1 > c2) return INT2FIX(1);
+ if (c1 < c2) return INT2FIX(-1);
+ }
+ len = rb_enc_codelen(c1, enc);
+ p1 += len;
+ p2 += len;
}
- len = rb_enc_codelen(c1, enc);
- p1 += len;
- p2 += len;
}
if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
@@ -2088,7 +2139,7 @@ static long
rb_str_index(VALUE str, VALUE sub, long offset)
{
long pos;
- char *s, *sptr;
+ char *s, *sptr, *e;
long len, slen;
rb_encoding *enc;
@@ -2104,6 +2155,7 @@ rb_str_index(VALUE str, VALUE sub, long offset)
}
if (len - offset < slen) return -1;
s = RSTRING_PTR(str);
+ e = s + RSTRING_LEN(str);
if (offset) {
offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
s += offset;
@@ -2117,7 +2169,7 @@ rb_str_index(VALUE str, VALUE sub, long offset)
char *t;
pos = rb_memsearch(sptr, slen, s, len, enc);
if (pos < 0) return pos;
- t = rb_enc_right_char_head(s, s+pos, enc);
+ t = rb_enc_right_char_head(s, s+pos, e, enc);
if (t == s + pos) break;
if ((len -= t - s) <= 0) return -1;
offset += t - s;
@@ -2571,7 +2623,7 @@ rb_str_succ(VALUE orig)
sbeg = RSTRING_PTR(str);
s = e = sbeg + RSTRING_LEN(str);
- while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) {
+ while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
@@ -2596,7 +2648,7 @@ rb_str_succ(VALUE orig)
}
if (c == -1) { /* str contains no alnum */
s = e;
- while ((s = rb_enc_prev_char(sbeg, s, enc)) != 0) {
+ while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
enum neighbor_char neighbor;
if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
neighbor = enc_succ_char(s, l, enc);
@@ -2894,11 +2946,11 @@ rb_str_splice(VALUE str, long beg, long len, VALUE val)
char *p, *e;
rb_encoding *enc;
int singlebyte = single_byte_optimizable(str);
+ int cr;
if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
StringValue(val);
- rb_str_modify(str);
enc = rb_enc_check(str, val);
slen = str_strlen(str, enc);
@@ -2915,6 +2967,7 @@ rb_str_splice(VALUE str, long beg, long len, VALUE val)
if (slen < len || slen < beg + len) {
len = slen - beg;
}
+ str_modify_keep_cr(str);
p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
if (!p) p = RSTRING_END(str);
e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
@@ -2924,6 +2977,9 @@ rb_str_splice(VALUE str, long beg, long len, VALUE val)
len = e - p; /* physical length */
rb_str_splice_0(str, beg, len, val);
rb_enc_associate(str, enc);
+ cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
+ if (cr != ENC_CODERANGE_BROKEN)
+ ENC_CODERANGE_SET(str, cr);
}
void
@@ -3114,7 +3170,7 @@ rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
for (i=0; i<argc; i++) {
buf[i] = argv[i];
}
- rb_str_modify(str);
+ str_modify_keep_cr(str);
buf[i] = rb_str_new(0,0);
result = rb_str_aref_m(argc, buf, str);
if (!NIL_P(result)) {
@@ -3524,7 +3580,10 @@ rb_str_clear(VALUE str)
STR_SET_EMBED(str);
STR_SET_EMBED_LEN(str, 0);
RSTRING_PTR(str)[0] = 0;
- ENC_CODERANGE_CLEAR(str);
+ if (rb_enc_asciicompat(STR_ENC_GET(str)))
+ ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
+ else
+ ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
return str;
}
@@ -3656,22 +3715,15 @@ rb_str_reverse_bang(VALUE str)
if (RSTRING_LEN(str) > 1) {
if (single_byte_optimizable(str)) {
char *s, *e, c;
- int cr = ENC_CODERANGE(str);
- int single = 1;
- rb_str_modify(str);
+ str_modify_keep_cr(str);
s = RSTRING_PTR(str);
e = RSTRING_END(str) - 1;
while (s < e) {
c = *s;
- if (*s & 0x80) single = 0;
*s++ = *e;
*e-- = c;
}
- if (cr == ENC_CODERANGE_UNKNOWN && single) {
- cr = ENC_CODERANGE_7BIT;
- }
- ENC_CODERANGE_SET(str, cr);
}
else {
rb_str_shared_replace(str, rb_str_reverse(str));
@@ -3787,9 +3839,9 @@ rb_str_to_s(VALUE str)
}
static void
-str_cat_char(VALUE str, int c, rb_encoding *enc)
+str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
- char s[16];
+ char s[RUBY_MAX_CHAR_LEN];
int n = rb_enc_codelen(c, enc);
rb_enc_mbcput(c, s, enc);
@@ -3797,7 +3849,7 @@ str_cat_char(VALUE str, int c, rb_encoding *enc)
}
static void
-prefix_escape(VALUE str, int c, rb_encoding *enc)
+prefix_escape(VALUE str, unsigned int c, rb_encoding *enc)
{
str_cat_char(str, '\\', enc);
str_cat_char(str, c, enc);
@@ -4033,23 +4085,46 @@ rb_str_upcase_bang(VALUE str)
rb_encoding *enc;
char *s, *send;
int modify = 0;
- int cr = ENC_CODERANGE(str);
- rb_str_modify(str);
+ str_modify_keep_cr(str);
enc = STR_ENC_GET(str);
s = RSTRING_PTR(str); send = RSTRING_END(str);
- while (s < send) {
- unsigned int c = rb_enc_codepoint(s, send, enc);
+ if (single_byte_optimizable(str)) {
+ while (s < send) {
+ unsigned int c = *(unsigned char*)s;
- if (rb_enc_islower(c, enc)) {
- /* assuming toupper returns codepoint with same size */
- rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
- modify = 1;
+ if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
+ *s = 'A' + (c - 'a');
+ modify = 1;
+ }
+ s++;
+ }
+ }
+ else {
+ int ascompat = rb_enc_asciicompat(enc);
+
+ while (s < send) {
+ unsigned int c;
+
+ if (ascompat && (c = *(unsigned char*)s) < 0x80) {
+ if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
+ *s = 'A' + (c - 'a');
+ modify = 1;
+ }
+ s++;
+ }
+ else {
+ c = rb_enc_codepoint(s, send, enc);
+ if (rb_enc_islower(c, enc)) {
+ /* assuming toupper returns codepoint with same size */
+ rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
+ modify = 1;
+ }
+ s += rb_enc_codelen(c, enc);
+ }
}
- s += rb_enc_codelen(c, enc);
}
- ENC_CODERANGE_SET(str, cr);
if (modify) return str;
return Qnil;
}
@@ -4091,23 +4166,46 @@ rb_str_downcase_bang(VALUE str)
rb_encoding *enc;
char *s, *send;
int modify = 0;
- int cr = ENC_CODERANGE(str);
- rb_str_modify(str);
+ str_modify_keep_cr(str);
enc = STR_ENC_GET(str);
s = RSTRING_PTR(str); send = RSTRING_END(str);
- while (s < send) {
- unsigned int c = rb_enc_codepoint(s, send, enc);
+ if (single_byte_optimizable(str)) {
+ while (s < send) {
+ unsigned int c = *(unsigned char*)s;
- if (rb_enc_isupper(c, enc)) {
- /* assuming toupper returns codepoint with same size */
- rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
- modify = 1;
+ if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
+ *s = 'a' + (c - 'A');
+ modify = 1;
+ }
+ s++;
+ }
+ }
+ else {
+ int ascompat = rb_enc_asciicompat(enc);
+
+ while (s < send) {
+ unsigned int c;
+
+ if (ascompat && (c = *(unsigned char*)s) < 0x80) {
+ if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
+ *s = 'a' + (c - 'A');
+ modify = 1;
+ }
+ s++;
+ }
+ else {
+ c = rb_enc_codepoint(s, send, enc);
+ if (rb_enc_isupper(c, enc)) {
+ /* assuming tolower returns codepoint with same size */
+ rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
+ modify = 1;
+ }
+ s += rb_enc_codelen(c, enc);
+ }
}
- s += rb_enc_codelen(c, enc);
}
- ENC_CODERANGE_SET(str, cr);
if (modify) return str;
return Qnil;
}
@@ -4155,9 +4253,8 @@ rb_str_capitalize_bang(VALUE str)
char *s, *send;
int modify = 0;
unsigned int c;
- int cr = ENC_CODERANGE(str);
- rb_str_modify(str);
+ str_modify_keep_cr(str);
enc = STR_ENC_GET(str);
if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
s = RSTRING_PTR(str); send = RSTRING_END(str);
@@ -4177,7 +4274,6 @@ rb_str_capitalize_bang(VALUE str)
s += rb_enc_codelen(c, enc);
}
- ENC_CODERANGE_SET(str, cr);
if (modify) return str;
return Qnil;
}
@@ -4220,9 +4316,8 @@ rb_str_swapcase_bang(VALUE str)
rb_encoding *enc;
char *s, *send;
int modify = 0;
- int cr = ENC_CODERANGE(str);
- rb_str_modify(str);
+ str_modify_keep_cr(str);
enc = STR_ENC_GET(str);
s = RSTRING_PTR(str); send = RSTRING_END(str);
while (s < send) {
@@ -4234,14 +4329,13 @@ rb_str_swapcase_bang(VALUE str)
modify = 1;
}
else if (rb_enc_islower(c, enc)) {
- /* assuming toupper returns codepoint with same size */
+ /* assuming tolower returns codepoint with same size */
rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
modify = 1;
}
- s += rb_enc_codelen(c, enc);
+ s += rb_enc_mbclen(s, send, enc);
}
- ENC_CODERANGE_SET(str, cr);
if (modify) return str;
return Qnil;
}
@@ -4323,6 +4417,7 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
char *s, *send;
VALUE hash = 0;
int singlebyte = single_byte_optimizable(str);
+ int cr;
StringValue(src);
StringValue(repl);
@@ -4331,6 +4426,7 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
return rb_str_delete_bang(1, &src, str);
}
+ cr = ENC_CODERANGE(str);
e1 = rb_enc_check(str, src);
e2 = rb_enc_check(str, repl);
if (e1 == e2) {
@@ -4394,7 +4490,7 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
}
}
- rb_str_modify(str);
+ str_modify_keep_cr(str);
s = RSTRING_PTR(str); send = RSTRING_END(str);
if (sflag) {
int clen, tlen, max = RSTRING_LEN(str);
@@ -4516,6 +4612,9 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
}
if (modify) {
+ cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(repl));
+ if (cr != ENC_CODERANGE_BROKEN)
+ ENC_CODERANGE_SET(str, cr);
rb_enc_associate(str, enc);
return str;
}
@@ -4654,11 +4753,9 @@ rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
char *s, *send, *t;
VALUE del = 0, nodel = 0;
int modify = 0;
- int i;
- int cr;
+ int i, ascompat;
if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
- cr = ENC_CODERANGE(str);
if (argc < 1) {
rb_raise(rb_eArgError, "wrong number of arguments");
}
@@ -4670,27 +4767,41 @@ rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
}
- rb_str_modify(str);
+ str_modify_keep_cr(str);
+ ascompat = rb_enc_asciicompat(enc);
s = t = RSTRING_PTR(str);
- if (!s || RSTRING_LEN(str) == 0) return Qnil;
send = RSTRING_END(str);
while (s < send) {
- unsigned int c = rb_enc_codepoint(s, send, enc);
- int clen = rb_enc_codelen(c, enc);
+ unsigned int c;
+ int clen;
- if (tr_find(c, squeez, del, nodel)) {
- modify = 1;
+ if (ascompat && (c = *(unsigned char*)s) < 0x80) {
+ if (squeez[c]) {
+ modify = 1;
+ }
+ else {
+ if (t != s) *t = c;
+ t++;
+ }
+ s++;
}
else {
- if (t != s) rb_enc_mbcput(c, t, enc);
- t += clen;
+ c = rb_enc_codepoint(s, send, enc);
+ clen = rb_enc_codelen(c, enc);
+
+ if (tr_find(c, squeez, del, nodel)) {
+ modify = 1;
+ }
+ else {
+ if (t != s) rb_enc_mbcput(c, t, enc);
+ t += clen;
+ }
+ s += clen;
}
- s += clen;
}
*t = '\0';
STR_SET_LEN(str, t - RSTRING_PTR(str));
- ENC_CODERANGE_SET(str, cr);
if (modify) return str;
return Qnil;
}
@@ -4736,6 +4847,7 @@ rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
char *s, *send, *t;
int save, modify = 0;
int i;
+ int ascompat, singlebyte = single_byte_optimizable(str);
if (argc == 0) {
enc = STR_ENC_GET(str);
@@ -4746,26 +4858,51 @@ rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
StringValue(s);
enc = rb_enc_check(str, s);
+ if (singlebyte && !single_byte_optimizable(s))
+ singlebyte = 0;
tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
}
}
- rb_str_modify(str);
+ str_modify_keep_cr(str);
s = t = RSTRING_PTR(str);
if (!s || RSTRING_LEN(str) == 0) return Qnil;
send = RSTRING_END(str);
save = -1;
- while (s < send) {
- unsigned int c = rb_enc_codepoint(s, send, enc);
- int clen = rb_enc_codelen(c, enc);
+ ascompat = rb_enc_asciicompat(enc);
- if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
- if (t != s) rb_enc_mbcput(c, t, enc);
- save = c;
- t += clen;
+ if (singlebyte) {
+ while (s < send) {
+ unsigned int c = *(unsigned char*)s++;
+ if (c != save || (argc > 0 && !squeez[c])) {
+ *t++ = save = c;
+ }
+ }
+ } else {
+ while (s < send) {
+ unsigned int c;
+ int clen;
+
+ if (ascompat && (c = *(unsigned char*)s) < 0x80) {
+ if (c != save || (argc > 0 && !squeez[c])) {
+ *t++ = save = c;
+ }
+ s++;
+ }
+ else {
+ c = rb_enc_codepoint(s, send, enc);
+ clen = rb_enc_codelen(c, enc);
+
+ if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
+ if (t != s) rb_enc_mbcput(c, t, enc);
+ save = c;
+ t += clen;
+ }
+ s += clen;
+ }
}
- s += clen;
}
+
*t = '\0';
if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
STR_SET_LEN(str, t - RSTRING_PTR(str));
@@ -4777,7 +4914,7 @@ rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
}
-/*
+/*
* call-seq:
* str.squeeze([other_str]*) => new_str
*
@@ -4862,6 +4999,7 @@ rb_str_count(int argc, VALUE *argv, VALUE str)
VALUE del = 0, nodel = 0;
char *s, *send;
int i;
+ int ascompat;
if (argc < 1) {
rb_raise(rb_eArgError, "wrong number of arguments");
@@ -4871,22 +5009,36 @@ rb_str_count(int argc, VALUE *argv, VALUE str)
StringValue(s);
enc = rb_enc_check(str, s);
- tr_setup_table(s, table,i==0, &del, &nodel, enc);
+ tr_setup_table(s, table, i==0, &del, &nodel, enc);
}
s = RSTRING_PTR(str);
if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
send = RSTRING_END(str);
i = 0;
+ ascompat = rb_enc_asciicompat(enc);
+
while (s < send) {
- unsigned int c = rb_enc_codepoint(s, send, enc);
- int clen = rb_enc_codelen(c, enc);
+ unsigned int c;
+ int clen;
- if (tr_find(c, table, del, nodel)) {
- i++;
+ if (ascompat && (c = *(unsigned char*)s) < 0x80) {
+ clen = 1;
+ if (table[c]) {
+ i++;
+ }
+ s++;
+ }
+ else {
+ c = rb_enc_codepoint(s, send, enc);
+ clen = rb_enc_codelen(c, enc);
+ if (tr_find(c, table, del, nodel)) {
+ i++;
+ }
+ s += clen;
}
- s += clen;
}
+
return INT2NUM(i);
}
@@ -4939,7 +5091,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str)
rb_encoding *enc;
VALUE spat;
VALUE limit;
- int awk_split = Qfalse;
+ enum {awk, string, regexp} split_type;
long beg, end, i = 0;
int lim = 0;
VALUE result, tmp;
@@ -4961,37 +5113,41 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str)
spat = rb_fs;
goto fs_set;
}
- awk_split = Qtrue;
+ split_type = awk;
}
else {
fs_set:
if (TYPE(spat) == T_STRING) {
rb_encoding *enc2 = STR_ENC_GET(spat);
- if (rb_enc_mbminlen(enc2) == 1) {
+ split_type = string;
+ if (RSTRING_LEN(spat) == 0) {
+ /* Special case - split into chars */
+ spat = rb_reg_regcomp(spat);
+ split_type = regexp;
+ }
+ else if (rb_enc_mbminlen(enc2) == 1) {
if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
- awk_split = Qtrue;
+ split_type = awk;
}
}
else {
int l;
if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
RSTRING_LEN(spat) == l) {
- awk_split = Qtrue;
+ split_type = awk;
}
}
- if (!awk_split) {
- spat = rb_reg_regcomp(rb_reg_quote(spat));
- }
}
else {
spat = get_pat(spat, 1);
+ split_type = regexp;
}
}
result = rb_ary_new();
beg = 0;
- if (awk_split) {
+ if (split_type == awk) {
char *ptr = RSTRING_PTR(str);
char *eptr = RSTRING_END(str);
char *bptr = ptr;
@@ -5025,6 +5181,33 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str)
}
}
}
+ else if (split_type == string) {
+ char *ptr = RSTRING_PTR(str);
+ char *eptr = RSTRING_END(str);
+ char *sptr = RSTRING_PTR(spat);
+ int slen = RSTRING_LEN(spat);
+
+ if (is_broken_string(str)) {
+ rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
+ }
+ if (is_broken_string(spat)) {
+ rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
+ }
+ enc = rb_enc_check(str, spat);
+ while (ptr < eptr &&
+ (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
+ /* Check we are at the start of a char */
+ char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
+ if (t != ptr + end) {
+ ptr = t;
+ continue;
+ }
+ rb_ary_push(result, rb_str_substr(str, ptr - RSTRING_PTR(str), end));
+ ptr += end + slen;
+ if (!NIL_P(limit) && lim <= ++i) break;
+ }
+ beg = ptr - RSTRING_PTR(str);
+ }
else {
long start = beg;
long idx;
@@ -5178,7 +5361,7 @@ rb_str_each_line(int argc, VALUE *argv, VALUE str)
p = memchr(p, '\n', pend - p);
if (!p) break;
- p0 = rb_enc_left_char_head(s, p, enc);
+ p0 = rb_enc_left_char_head(s, p, pend, enc);
if (!rb_enc_is_newline(p0, pend, enc)) {
p++;
continue;
@@ -5326,6 +5509,59 @@ rb_str_each_char(VALUE str)
return str;
}
+/*
+ * Document-method: codepoints
+ * call-seq:
+ * str.codepoints => anEnumerator
+ * str.codepoints {|integer| block } => str
+ *
+ * Returns an enumerator that gives the <code>Integer</code> ordinal
+ * of each character in the string, also known as a <i>codepoint</i>
+ * when applied to Unicode strings. If a block is given, it is called
+ * with the codepoint of each character in the string.
+ *
+ * "foo\u0635".codepoints.to_a #=> [102, 111, 111, 1589]
+ */
+
+/*
+ * Document-method: each_codepoint
+ * call-seq:
+ * str.each_codepoint {|integer| block } => str
+ *
+ * Passes the <code>Integer</code> ordinal of each character in <i>str</i>,
+ * also known as a <i>codepoint</i> when applied to Unicode strings, to the
+ * given block.
+ *
+ * "hello\u0639".each_codepoint {|c| print c, ' ' }
+ *
+ * <em>produces:</em>
+ *
+ * 104 101 108 108 111 1593
+ */
+
+static VALUE
+rb_str_each_codepoint(VALUE str)
+{
+ int len, n;
+ unsigned int c;
+ const char *ptr, *end;
+ rb_encoding *enc;
+
+ if (single_byte_optimizable(str)) return rb_str_each_byte(str);
+ RETURN_ENUMERATOR(str, 0, 0);
+ ptr = RSTRING_PTR(str);
+ len = RSTRING_LEN(str);
+ end = RSTRING_END(str);
+ enc = STR_ENC_GET(str);
+ while (ptr < end) {
+ c = rb_enc_codepoint(ptr, end, enc);
+ n = rb_enc_codelen(c, enc);
+ rb_yield(UINT2NUM(c));
+ ptr += n;
+ }
+ return str;
+}
+
static long
chopped_length(VALUE str)
{
@@ -5335,10 +5571,10 @@ chopped_length(VALUE str)
beg = RSTRING_PTR(str);
end = beg + RSTRING_LEN(str);
if (beg > end) return 0;
- p = rb_enc_prev_char(beg, end, enc);
+ p = rb_enc_prev_char(beg, end, end, enc);
if (!p) return 0;
if (p > beg && rb_enc_codepoint(p, end, enc) == '\n') {
- p2 = rb_enc_prev_char(beg, p, enc);
+ p2 = rb_enc_prev_char(beg, p, end, enc);
if (p2 && rb_enc_codepoint(p2, end, enc) == '\r') p = p2;
}
return p - beg;
@@ -5358,7 +5594,7 @@ rb_str_chop_bang(VALUE str)
{
if (RSTRING_LEN(str) > 0) {
long len;
- rb_str_modify(str);
+ str_modify_keep_cr(str);
len = chopped_length(str);
STR_SET_LEN(str, len);
RSTRING_PTR(str)[len] = '\0';
@@ -5420,16 +5656,16 @@ rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
rs = rb_rs;
if (rs == rb_default_rs) {
smart_chomp:
- rb_str_modify(str);
+ str_modify_keep_cr(str);
enc = rb_enc_get(str);
if (rb_enc_mbminlen(enc) > 1) {
- pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), enc);
+ pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
if (rb_enc_is_newline(pp, e, enc)) {
e = pp;
}
pp = e - rb_enc_mbminlen(enc);
if (pp >= p) {
- pp = rb_enc_left_char_head(p, pp, enc);
+ pp = rb_enc_left_char_head(p, pp, e, enc);
if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
e = pp;
}
@@ -5472,7 +5708,7 @@ rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
len--;
}
if (len < RSTRING_LEN(str)) {
- rb_str_modify(str);
+ str_modify_keep_cr(str);
STR_SET_LEN(str, len);
RSTRING_PTR(str)[len] = '\0';
return str;
@@ -5492,9 +5728,9 @@ rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
if (p[len-1] == newline &&
(rslen <= 1 ||
memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
- if (rb_enc_left_char_head(p, pp, enc) != pp)
+ if (rb_enc_left_char_head(p, pp, e, enc) != pp)
return Qnil;
- rb_str_modify(str);
+ str_modify_keep_cr(str);
STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
return str;
@@ -5548,7 +5784,7 @@ rb_str_lstrip_bang(VALUE str)
rb_encoding *enc;
char *s, *t, *e;
- rb_str_modify(str);
+ str_modify_keep_cr(str);
enc = STR_ENC_GET(str);
s = RSTRING_PTR(str);
if (!s || RSTRING_LEN(str) == 0) return Qnil;
@@ -5562,7 +5798,6 @@ rb_str_lstrip_bang(VALUE str)
}
if (s > RSTRING_PTR(str)) {
- rb_str_modify(str);
STR_SET_LEN(str, t-s);
memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
@@ -5609,30 +5844,33 @@ rb_str_rstrip_bang(VALUE str)
{
rb_encoding *enc;
char *s, *t, *e;
- int space_seen = Qfalse;
- rb_str_modify(str);
enc = STR_ENC_GET(str);
s = RSTRING_PTR(str);
if (!s || RSTRING_LEN(str) == 0) return Qnil;
t = e = RSTRING_END(str);
- while (s < e) {
- unsigned int cc = rb_enc_codepoint(s, e, enc);
- if (!cc || rb_enc_isspace(cc, enc)) {
- if (!space_seen) t = s;
- space_seen = Qtrue;
- }
- else {
- space_seen = Qfalse;
+ if (single_byte_optimizable(str)) {
+ /* remove trailing '\0's */
+ while (s < t && t[-1] == '\0') t--;
+
+ /* remove trailing spaces */
+ while (s < t && rb_enc_isspace(*(t-1), enc)) t--;
+ }
+ else {
+ char *tp;
+
+ while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
+ if (!rb_enc_isspace(rb_enc_codepoint(tp, e, enc), enc)) break;
+ t = tp;
}
- s += rb_enc_codelen(cc, enc);
}
- if (!space_seen) t = s;
if (t < e) {
- rb_str_modify(str);
- STR_SET_LEN(str, t-RSTRING_PTR(str));
- RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
+ int len = t-RSTRING_PTR(str);
+
+ str_modify_keep_cr(str);
+ STR_SET_LEN(str, len);
+ RSTRING_PTR(str)[len] = '\0';
return str;
}
return Qnil;
@@ -5864,30 +6102,27 @@ rb_str_crypt(VALUE str, VALUE salt)
{
extern char *crypt(const char *, const char *);
VALUE result;
- const char *s;
+ const char *s, *saltp;
#ifdef BROKEN_CRYPT
- VALUE salt_8bit_clean;
- rb_encoding *enc;
+ char salt_8bit_clean[3];
#endif
StringValue(salt);
if (RSTRING_LEN(salt) < 2)
rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
- if (RSTRING_PTR(str)) s = RSTRING_PTR(str);
- else s = "";
+ s = RSTRING_PTR(str);
+ if (!s) s = "";
+ saltp = RSTRING_PTR(salt);
#ifdef BROKEN_CRYPT
- salt_8bit_clean = rb_str_dup(salt);
- enc = rb_ascii8bit_encoding();
- str_modifiable(salt_8bit_clean);
- rb_enc_associate(salt_8bit_clean, enc);
- salt_8bit_clean = rb_str_tr(salt_8bit_clean,
- rb_enc_str_new("\x80-\xFF", 3, enc),
- rb_usascii_str_new("\x00-\x7F", 3));
- result = rb_str_new2(crypt(s, RSTRING_PTR(salt_8bit_clean)));
-#else
- result = rb_str_new2(crypt(s, RSTRING_PTR(salt)));
+ if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
+ salt_8bit_clean[0] = saltp[0] & 0x7f;
+ salt_8bit_clean[1] = saltp[1] & 0x7f;
+ salt_8bit_clean[2] = '\0';
+ saltp = salt_8bit_clean;
+ }
#endif
+ result = rb_str_new2(crypt(s, saltp));
OBJ_INFECT(result, str);
OBJ_INFECT(result, salt);
return result;
@@ -6029,7 +6264,7 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
const char *f = " ";
long n, llen, rlen;
volatile VALUE pad;
- int singlebyte = 1;
+ int singlebyte = 1, cr;
rb_scan_args(argc, argv, "11", &w, &pad);
enc = STR_ENC_GET(str);
@@ -6050,6 +6285,7 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
n = width - len;
llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
rlen = n - llen;
+ cr = ENC_CODERANGE(str);
res = rb_str_new5(str, 0, RSTRING_LEN(str)+n*flen/fclen+2);
p = RSTRING_PTR(res);
while (llen) {
@@ -6095,6 +6331,10 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
OBJ_INFECT(res, str);
if (!NIL_P(pad)) OBJ_INFECT(res, pad);
rb_enc_associate(res, enc);
+ if (argc == 2)
+ cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
+ if (cr != ENC_CODERANGE_BROKEN)
+ ENC_CODERANGE_SET(res, cr);
return res;
}
@@ -6283,7 +6523,7 @@ static VALUE
rb_str_end_with(int argc, VALUE *argv, VALUE str)
{
int i;
- char *p, *s;
+ char *p, *s, *e;
rb_encoding *enc;
for (i=0; i<argc; i++) {
@@ -6292,8 +6532,9 @@ rb_str_end_with(int argc, VALUE *argv, VALUE str)
enc = rb_enc_check(str, tmp);
if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
p = RSTRING_PTR(str);
- s = p + RSTRING_LEN(str) - RSTRING_LEN(tmp);
- if (rb_enc_left_char_head(p, s, enc) != s)
+ e = p + RSTRING_LEN(str);
+ s = e - RSTRING_LEN(tmp);
+ if (rb_enc_left_char_head(p, s, e, enc) != s)
continue;
if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
return Qtrue;
@@ -6695,6 +6936,7 @@ Init_String(void)
rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
+ rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
rb_define_method(rb_cString, "concat", rb_str_concat, 1);
@@ -6744,6 +6986,7 @@ Init_String(void)
rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
+ rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
rb_define_method(rb_cString, "sum", rb_str_sum, -1);