summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Thomas E. Enebo <tom.enebo@gmail.com> 2017-09-05 16:30:31 -0500
committer: Thomas E. Enebo <tom.enebo@gmail.com> 2017-09-05 16:30:31 -0500
commit: 2bbb245edaf80179f4673bf4bf75978a10654c64 (patch)
tree: 3bcf8b863240a2dfb2023242fe2a35d71ba4add2
parent: 4039a811248de7b0b9ae9e4a97854f00ba985255 (diff)
download: psych-2bbb245edaf80179f4673bf4bf75978a10654c64.tar.gz
Psych.load with an IO/File whose encoding is not one that YAML allows will just
set the encoding to UTF-8 and hope for the best. This appears to be how libyaml works.

This issue was noticed in yaml/store because it extends pstore, which will create an IO in read_only mode as:

```ruby
RD_ACCESS = {mode: IO::RDONLY | IO::BINARY, encoding: Encoding::ASCII_8BIT}
```

The data in the test case I was debugging was in fact UTF-8 data, and MRI was happy to take this 8-bit IO and pretend it is UTF-8. Form-fitting ftw.
-rw-r--r--  ext/java/PsychParser.java    7
-rw-r--r--  test/psych/test_encoding.rb  12
2 files changed, 19 insertions, 0 deletions
diff --git a/ext/java/PsychParser.java b/ext/java/PsychParser.java
index b3e747e..f5b6faf 100644
--- a/ext/java/PsychParser.java
+++ b/ext/java/PsychParser.java
@@ -33,6 +33,8 @@ import java.nio.charset.Charset;
import java.util.Map;
import org.jcodings.Encoding;
+import org.jcodings.specific.UTF16BEEncoding;
+import org.jcodings.specific.UTF16LEEncoding;
import org.jcodings.specific.UTF8Encoding;
import org.jcodings.unicode.UnicodeEncoding;
import org.jruby.Ruby;
@@ -162,6 +164,11 @@ public class PsychParser extends RubyObject {
if (yaml instanceof RubyIO) {
Encoding enc = ((RubyIO) yaml).getReadEncoding();
charset = enc.getCharset();
+
+ // libyaml treats non-utf encodings as utf-8 and hopes for the best.
+ if (!(enc instanceof UTF8Encoding) && !(enc instanceof UTF16LEEncoding) && !(enc instanceof UTF16BEEncoding)) {
+ charset = UTF8Encoding.INSTANCE.getCharset();
+ }
}
if (charset == null) {
// If we can't get it from the IO or it doesn't have a charset, fall back on UTF-8
diff --git a/test/psych/test_encoding.rb b/test/psych/test_encoding.rb
index a4f9f03..01ebe25 100644
--- a/test/psych/test_encoding.rb
+++ b/test/psych/test_encoding.rb
@@ -106,6 +106,18 @@ module Psych
}
end
+ def test_io_utf8_read_as_binary
+ Tempfile.create(['utf8', 'yml']) {|t|
+ t.binmode
+ t.write '--- こんにちは!'.encode('UTF-8')
+ t.close
+
+ File.open(t.path, 'rb', :encoding => 'ascii-8bit') do |f|
+ assert_equal "こんにちは!", Psych.load(f)
+ end
+ }
+ end
+
def test_emit_alias
@emitter.start_stream Psych::Parser::UTF8
@emitter.start_document [], [], true