diff options
author | Burdette Lamar <BurdetteLamar@Yahoo.com> | 2022-02-25 13:12:59 -0600 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-02-25 13:12:59 -0600 |
commit | 26ffda2fd217651e73eb71e6da8f89eb17866f9d (patch) | |
tree | 6f41f5b0ce7c206fb9a249d8f739df48330053d3 | |
parent | 189ac52bba8b1355186431acfa335d40991a7406 (diff) | |
download | ruby-26ffda2fd217651e73eb71e6da8f89eb17866f9d.tar.gz |
[DOC] Enhanced RDoc for some encoding methods (#5598)
In String, treats:
#b
#scrub
#scrub!
#unicode_normalize
#unicode_normalize!
#encode
#encode!
Also adds a note to IO.new (suggested by @jeremyevans).
-rw-r--r-- | io.c | 7 | ||||
-rw-r--r-- | string.c | 120 | ||||
-rw-r--r-- | transcode.c | 101 |
3 files changed, 131 insertions, 97 deletions
@@ -8943,6 +8943,13 @@ rb_io_make_open_file(VALUE obj) * fd = IO.sysopen(path) # => 3 * IO.new(fd) # => #<IO:fd 3> * + * The new \IO object does not inherit encoding + * (because the integer file descriptor does not have an encoding): + * + * fd = IO.sysopen('t.rus', 'rb') + * io = IO.new(fd) + * io.external_encoding # => #<Encoding:UTF-8> # Not ASCII-8BIT. + * * Optional argument +mode+ (defaults to 'r') must specify a valid mode * see IO@Modes: * @@ -6670,7 +6670,6 @@ rb_str_escape(VALUE str) * and with special characters escaped: * * s = "foo\tbar\tbaz\n" - * # => "foo\tbar\tbaz\n" * s.inspect * # => "\"foo\\tbar\\tbaz\\n\"" * @@ -10963,9 +10962,22 @@ rb_str_force_encoding(VALUE str, VALUE enc) /* * call-seq: - * str.b -> str + * b -> string + * + * Returns a copy of +self+ that has ASCII-8BIT encoding; + * the contents (bytes) of +self+ are not modified: + * + * s = "\x99" + * s.encoding # => #<Encoding:UTF-8> + * t = s.b # => "\x99" + * t.encoding # => #<Encoding:ASCII-8BIT> + * + * s = "\u4095" + * s.encoding # => #<Encoding:UTF-8> + * s.bytes # => [228, 130, 149] + * t = s.b # => "\xE4\x82\x95" + * t.encoding # => #<Encoding:ASCII-8BIT> * - * Returns a copied string whose encoding is ASCII-8BIT. */ static VALUE @@ -11341,17 +11353,38 @@ enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr) /* * call-seq: - * str.scrub -> new_str - * str.scrub(repl) -> new_str - * str.scrub{|bytes|} -> new_str + * scrub(replacement_string = default_replacement) -> string + * scrub{|bytes| ... } -> string + * + * Returns a copy of self with each invalid byte sequence replaced + * by a replacement string.
+ * + * With no block given and no argument, replaces each invalid sequence + * with the default replacement string + * (<tt>"\uFFFD"</tt> for a Unicode encoding, <tt>'?'</tt> otherwise): + * + * "\uFFFD".bytes # => [239, 191, 189] + * s = "foo\x81\x81bar" + * s.bytes + * # => [102, 111, 111, 129, 129, 98, 97, 114] + * s.scrub.bytes + * # => [102, 111, 111, 239, 191, 189, 239, 191, 189, 98, 97, 114] * - * If the string is invalid byte sequence then replace invalid bytes with given replacement - * character, else returns self. - * If block is given, replace invalid bytes with returned value of the block. + * With no block given and argument +replacement_string+ given, + * replaces each invalid sequence with that string: + * + * "foo\x81\x81bar".scrub('xyzzy') # => "fooxyzzyxyzzybar" + * + * With a block given, replaces each invalid sequence with the value + * of the block: + * + * "foo\x81\x81bar".scrub {|bytes| p bytes; 'XYZZY' } # => "fooXYZZYXYZZYbar" + * + * Output: + * + * "\x81" + * "\x81" * - * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD" - * "abc\u3042\x81".scrub("*") #=> "abc\u3042*" - * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>" */ static VALUE str_scrub(int argc, VALUE *argv, VALUE str) @@ -11363,17 +11396,12 @@ str_scrub(int argc, VALUE *argv, VALUE str) /* * call-seq: - * str.scrub! -> str - * str.scrub!(repl) -> str - * str.scrub!{|bytes|} -> str + * scrub! -> self + * scrub!(replacement_string = default_replacement) -> self + * scrub!{|bytes|} -> self * - * If the string is invalid byte sequence then replace invalid bytes with given replacement - * character, else returns self. - * If block is given, replace invalid bytes with returned value of the block. + * Like String#scrub, except that any replacements are made in +self+. * - * "abc\u3042\x81".scrub! 
#=> "abc\u3042\uFFFD" - * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*" - * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>" */ static VALUE str_scrub_bang(int argc, VALUE *argv, VALUE str) @@ -11405,25 +11433,36 @@ unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id) /* * call-seq: - * str.unicode_normalize(form=:nfc) + * unicode_normalize(form = :nfc) -> string * - * Unicode Normalization---Returns a normalized form of +str+, - * using Unicode normalizations NFC, NFD, NFKC, or NFKD. - * The normalization form used is determined by +form+, which can - * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+. - * The default is +:nfc+. + * Returns a copy of +self+ with + * {Unicode normalization}[https://unicode.org/reports/tr15] applied. * - * If the string is not in a Unicode Encoding, then an Exception is raised. - * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE, - * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE. - * Anything other than UTF-8 is implemented by converting to UTF-8, - * which makes it slower than UTF-8. + * Argument +form+ must be one of the following symbols + * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]): + * + * - +:nfc+: Canonical decomposition, followed by canonical composition. + * - +:nfd+: Canonical decomposition. + * - +:nfkc+: Compatibility decomposition, followed by canonical composition. + * - +:nfkd+: Compatibility decomposition. 
+ * + * +self+ must have encoding UTF-8 or one of the other supported encodings: + * + * UnicodeNormalize::UNICODE_ENCODINGS + * # => + * [#<Encoding:UTF-16BE (autoload)>, + * #<Encoding:UTF-16LE>, + * #<Encoding:UTF-32BE (autoload)>, + * #<Encoding:UTF-32LE (autoload)>, + * #<Encoding:GB18030 (autoload)>, + * #<Encoding:UTF-16BE (autoload)>, + * #<Encoding:UTF-32BE (autoload)>] + * + * Examples: + * + * "a\u0300".unicode_normalize # => "\u00E0" + * "\u00E0".unicode_normalize(:nfd) # => "a\u0300" * - * "a\u0300".unicode_normalize #=> "\u00E0" - * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0" - * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300" - * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd) - * #=> Encoding::CompatibilityError raised */ static VALUE rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str) @@ -11433,10 +11472,11 @@ rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str) /* * call-seq: - * str.unicode_normalize!(form=:nfc) + * unicode_normalize!(form = :nfc) -> self + * + * Like String#unicode_normalize, except that the normalization + * is performed on +self+. * - * Destructive version of String#unicode_normalize, doing Unicode - * normalization in place. */ static VALUE rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str) diff --git a/transcode.c b/transcode.c index 9cc4d00f28..400ad13775 100644 --- a/transcode.c +++ b/transcode.c @@ -2801,16 +2801,11 @@ str_encode_associate(VALUE str, int encidx) /* * call-seq: - * str.encode!(encoding, **options) -> str - * str.encode!(dst_encoding, src_encoding, **options) -> str + * encode!(dst_encoding = Encoding.default_internal, **enc_opts) -> self + * encode!(dst_encoding, src_encoding, **enc_opts) -> self + * + * Like #encode, but applies encoding changes to +self+; returns +self+. * - * The first form transcodes the contents of <i>str</i> from - * str.encoding to +encoding+. - * The second form transcodes the contents of <i>str</i> from - * src_encoding to dst_encoding.
- * The +options+ keyword arguments give details for conversion. See String#encode - * for details. - * Returns the string even if no changes were made. */ static VALUE @@ -2837,58 +2832,50 @@ static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx); /* * call-seq: - * str.encode(encoding, **options) -> str - * str.encode(dst_encoding, src_encoding, **options) -> str - * str.encode(**options) -> str + * encode(dst_encoding = Encoding.default_internal, **enc_opts) -> string + * encode(dst_encoding, src_encoding, **enc_opts) -> string + * + * Returns a copy of +self+ transcoded as determined by +dst_encoding+. + * By default, raises an exception if +self+ + * contains an invalid byte or a character not defined in +dst_encoding+; + * that behavior may be modified by encoding options; see below. + * + * With no arguments: + * + * - Uses the same encoding if <tt>Encoding.default_internal</tt> is +nil+ + * (the default): + * + * Encoding.default_internal # => nil + * s = "Ruby\x99".force_encoding('Windows-1252') + * s.encoding # => #<Encoding:Windows-1252> + * s.bytes # => [82, 117, 98, 121, 153] + * t = s.encode # => "Ruby\x99" + * t.encoding # => #<Encoding:Windows-1252> + * t.bytes # => [82, 117, 98, 121, 153] + * + * - Otherwise, uses the encoding <tt>Encoding.default_internal</tt>: + * + * Encoding.default_internal = 'UTF-8' + * t = s.encode # => "Ruby™" + * t.encoding # => #<Encoding:UTF-8> + * + * With only argument +dst_encoding+ given, uses that encoding: + * + * s = "Ruby\x99".force_encoding('Windows-1252') + * s.encoding # => #<Encoding:Windows-1252> + * t = s.encode('UTF-8') # => "Ruby™" + * t.encoding # => #<Encoding:UTF-8> * - * The first form returns a copy of +str+ transcoded - * to encoding +encoding+. - * The second form returns a copy of +str+ transcoded - * from src_encoding to dst_encoding. - * The last form returns a copy of +str+ transcoded to - * <tt>Encoding.default_internal</tt>.
+ * With arguments +dst_encoding+ and +src_encoding+ given, + * interprets +self+ using +src_encoding+, encodes the new string using +dst_encoding+: * - * By default, the first and second form raise - * Encoding::UndefinedConversionError for characters that are - * undefined in the destination encoding, and - * Encoding::InvalidByteSequenceError for invalid byte sequences - * in the source encoding. The last form by default does not raise - * exceptions but uses replacement strings. + * s = "Ruby\x99" + * t = s.encode('UTF-8', 'Windows-1252') # => "Ruby™" + * t.encoding # => #<Encoding:UTF-8> * - * The +options+ keyword arguments give details for conversion. - * The arguments are: + * Optional keyword arguments +enc_opts+ specify encoding options; + * see {Encoding Options}[rdoc-ref:encoding.rdoc@Encoding+Options]. * - * :invalid :: - * If the value is +:replace+, #encode replaces invalid byte sequences in - * +str+ with the replacement character. The default is to raise the - * Encoding::InvalidByteSequenceError exception - * :undef :: - * If the value is +:replace+, #encode replaces characters which are - * undefined in the destination encoding with the replacement character. - * The default is to raise the Encoding::UndefinedConversionError. - * :replace :: - * Sets the replacement string to the given value. The default replacement - * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise. - * :fallback :: - * Sets the replacement string by the given object for undefined - * character. The object should be a Hash, a Proc, a Method, or an - * object which has [] method. - * Its key is an undefined character encoded in the source encoding - * of current transcoder. Its value can be any encoding until it - * can be converted into the destination encoding of the transcoder. - * :xml :: - * The value must be +:text+ or +:attr+. 
- * If the value is +:text+ #encode replaces undefined characters with their - * (upper-case hexadecimal) numeric character references. '&', '<', and '>' - * are converted to "&amp;", "&lt;", and "&gt;", respectively. - * If the value is +:attr+, #encode also quotes the replacement result - * (using '"'), and replaces '"' with "&quot;". - * :cr_newline :: - * Replaces LF ("\n") with CR ("\r") if value is true. - * :crlf_newline :: - * Replaces LF ("\n") with CRLF ("\r\n") if value is true. - * :universal_newline :: - * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true. */ static VALUE |