// java/src/json/ext/StringEncoder.java

/*
 * This code is copyrighted work by Daniel Luz <dev at mernen dot com>.
 *
 * Distributed under the Ruby license: https://www.ruby-lang.org/en/about/license.txt
 */
package json.ext;

import org.jruby.exceptions.RaiseException;
import org.jruby.runtime.ThreadContext;
import org.jruby.util.ByteList;

/**
 * Transcodes a source ByteList into a double-quoted string literal appended
 * to another ByteList. Characters are read from the source one UTF-8 code
 * point at a time; the source string is fully checked for UTF-8 validity,
 * and a GeneratorError is raised if any problem is found.
 *
 * <p>The quote, the backslash and the newline, carriage return, tab, form
 * feed and backspace characters are written as two-byte backslash escapes.
 * Any other code point below 0x20 is written as a six-byte unicode escape
 * (backslash, 'u', four lowercase hex digits). Code points from 0x80 upwards
 * are escaped the same way only when the encoder is in ASCII-only mode, using
 * two consecutive escapes when the code point needs a surrogate pair.
 */
final class StringEncoder extends ByteListTranscoder {
    /** Lowercase hexadecimal digits, indexed by nibble value. */
    private static final byte[] HEX = {
        '0', '1', '2', '3', '4', '5', '6', '7',
        '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
    };

    /** Bytes taken by one unicode escape: backslash, 'u' and four hex digits. */
    private static final int UNICODE_ESCAPE_LENGTH = 6;

    // Offsets of the three escape templates inside the scratch buffer
    private static final int ESCAPE_UNI1_OFFSET = 0;
    private static final int ESCAPE_UNI2_OFFSET =
            ESCAPE_UNI1_OFFSET + UNICODE_ESCAPE_LENGTH;
    private static final int ESCAPE_CHAR_OFFSET =
            ESCAPE_UNI2_OFFSET + UNICODE_ESCAPE_LENGTH;

    /** Marker returned when a code point has no two-byte escape. */
    private static final char NO_SHORT_ESCAPE = 0;

    /** When set, code points from 0x80 upwards are escaped instead of copied. */
    private final boolean asciiOnly;

    /**
     * Scratch buffer holding pre-built escape templates. Only the variable
     * bytes are overwritten before each append, so escaping allocates
     * nothing and never appends byte-by-byte.
     */
    private final byte[] aux = {
        // first unicode escape
        '\\', 'u', 0, 0, 0, 0,
        // second unicode escape (low half of a surrogate pair)
        '\\', 'u', 0, 0, 0, 0,
        // two-byte escape: backslash plus one character
        '\\', 0
    };

    /** Receives the UTF-16 code unit(s) of the code point being escaped. */
    private final char[] utf16 = new char[2];

    StringEncoder(ThreadContext context, boolean asciiOnly) {
        super(context);
        this.asciiOnly = asciiOnly;
    }

    /**
     * Appends the quoted, escaped form of {@code src} to {@code out}.
     *
     * @param src bytes to read code points from
     * @param out destination the literal is appended to
     */
    void encode(ByteList src, ByteList out) {
        init(src, out);
        append('"');
        while (hasNext()) {
            int codePoint = readUtf8Char();
            handleChar(codePoint);
        }
        // NOTE(review): presumably flushes the trailing run of unescaped
        // bytes up to the current read position; confirm against
        // ByteListTranscoder.quoteStop.
        quoteStop(pos);
        append('"');
    }

    /**
     * Routes one code point to a two-byte escape, a verbatim copy, or a
     * unicode escape.
     */
    private void handleChar(int c) {
        char shortEscape = shortEscapeFor(c);
        if (shortEscape != NO_SHORT_ESCAPE) {
            escapeChar(shortEscape);
            return;
        }

        boolean asciiFrom0x20 = c >= 0x20 && c <= 0x7f;
        boolean nonAsciiKeptRaw = c >= 0x80 && !asciiOnly;
        if (asciiFrom0x20 || nonAsciiKeptRaw) {
            // NOTE(review): presumably marks this character as part of a run
            // copied as-is; confirm against ByteListTranscoder.quoteStart.
            quoteStart();
            return;
        }

        quoteStop(charStart);
        escapeUtf8Char(c);
    }

    /**
     * Returns the character that follows the backslash in the two-byte
     * escape for {@code c}, or {@link #NO_SHORT_ESCAPE} when there is none.
     */
    private static char shortEscapeFor(int c) {
        switch (c) {
        case '"':
        case '\\':
            return (char) c;
        case '\n':
            return 'n';
        case '\r':
            return 'r';
        case '\t':
            return 't';
        case '\f':
            return 'f';
        case '\b':
            return 'b';
        default:
            return NO_SHORT_ESCAPE;
        }
    }

    /** Appends a backslash followed by {@code c}. */
    private void escapeChar(char c) {
        quoteStop(charStart);
        aux[ESCAPE_CHAR_OFFSET + 1] = (byte) c;
        append(aux, ESCAPE_CHAR_OFFSET, 2);
    }

    /**
     * Appends {@code codePoint} as one unicode escape, or as two consecutive
     * escapes when it decomposes into a surrogate pair.
     */
    private void escapeUtf8Char(int codePoint) {
        int unitCount = Character.toChars(codePoint, utf16, 0);
        for (int unit = 0; unit < unitCount; unit++) {
            // skip the two fixed prefix bytes of the unit's template
            int digitsAt = ESCAPE_UNI1_OFFSET + unit * UNICODE_ESCAPE_LENGTH + 2;
            escapeCodeUnit(utf16[unit], digitsAt);
        }
        // the two templates are adjacent, so a pair goes out in one append
        append(aux, ESCAPE_UNI1_OFFSET, UNICODE_ESCAPE_LENGTH * unitCount);
    }

    /**
     * Writes the four hex digits of {@code c}, most significant nibble
     * first, into the scratch buffer starting at {@code auxOffset}.
     */
    private void escapeCodeUnit(char c, int auxOffset) {
        int at = auxOffset;
        for (int shift = 12; shift >= 0; shift -= 4) {
            aux[at++] = HEX[(c >>> shift) & 0xf];
        }
    }

    /** Builds the generator error reported for malformed UTF-8 input. */
    @Override
    protected RaiseException invalidUtf8() {
        final String message = "source sequence is illegal/malformed utf-8";
        return Utils.newException(context, Utils.M_GENERATOR_ERROR, message);
    }
}