summaryrefslogtreecommitdiff
path: root/java/src/json/ext/ByteListTranscoder.java
blob: 6f6ab66c1c2a3ceba2762eb97417465be00b7840 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
/*
 * This code is copyrighted work by Daniel Luz <dev at mernen dot com>.
 *
 * Distributed under the Ruby license: https://www.ruby-lang.org/en/about/license.txt
 */
package json.ext;

import org.jruby.exceptions.RaiseException;
import org.jruby.runtime.ThreadContext;
import org.jruby.util.ByteList;

/**
 * A class specialized in transcoding a certain String format into another,
 * using UTF-8 ByteLists as both input and output.
 *
 * <p>Subclasses drive the process: they call {@link #init(ByteList, ByteList)}
 * (or the ranged overload), repeatedly read code points with
 * {@link #readUtf8Char()} while {@link #hasNext()} is true, and write to the
 * output either byte by byte through {@link #append(int)} or in verbatim runs
 * delimited by {@link #quoteStart()} and {@link #quoteStop(int)}.
 */
abstract class ByteListTranscoder {
    protected final ThreadContext context;

    /** Buffer currently being read. */
    protected ByteList src;
    /** Offset (exclusive) in {@link #src} at which reading stops. */
    protected int srcEnd;
    /** Position where the last read character started */
    protected int charStart;
    /** Position of the next character to read */
    protected int pos;

    /** Buffer receiving the transcoded bytes. */
    private ByteList out;
    /**
     * When a character that can be copied straight into the output is found,
     * its index is stored on this variable, and copying is delayed until
     * the sequence of characters that can be copied ends.
     *
     * <p>The variable stores -1 when not in a plain sequence.
     */
    private int quoteStart = -1;

    protected ByteListTranscoder(ThreadContext context) {
        this.context = context;
    }

    /**
     * Prepares this transcoder to read the whole of {@code src} and write
     * into {@code out}.
     */
    protected void init(ByteList src, ByteList out) {
        this.init(src, 0, src.length(), out);
    }

    /**
     * Prepares this transcoder to read {@code src} from offset {@code start}
     * (inclusive) up to {@code end} (exclusive), writing into {@code out}.
     */
    protected void init(ByteList src, int start, int end, ByteList out) {
        this.src = src;
        this.pos = start;
        this.charStart = start;
        this.srcEnd = end;
        this.out = out;
        // A previous run that was aborted by an exception may have left a
        // pending verbatim run behind; its index refers to the old buffer and
        // must not leak into this one.
        this.quoteStart = -1;
    }

    /**
     * Returns whether there are any characters left to be read.
     */
    protected boolean hasNext() {
        return pos < srcEnd;
    }

    /**
     * Returns the next character in the buffer and advances {@link #pos}.
     *
     * <p>No bounds check against {@link #srcEnd} is made here; callers are
     * expected to have checked {@link #hasNext()} or {@link #ensureMin(int)}.
     */
    private char next() {
        return src.charAt(pos++);
    }

    /**
     * Reads a UTF-8 character from the input and returns its code point,
     * while advancing the input position. {@link #charStart} is set to the
     * offset of the character's first byte.
     *
     * <p>Raises the {@link #invalidUtf8()} exception if an invalid byte or an
     * overlong encoding is found, and the {@link #incompleteUtf8()} exception
     * if the input ends in the middle of a multi-byte sequence.
     */
    protected int readUtf8Char() {
        charStart = pos;
        char head = next();
        if (head <= 0x7f) { // 0b0xxxxxxx (ASCII)
            return head;
        }
        if (head <= 0xbf) { // 0b10xxxxxx
            throw invalidUtf8(); // tail byte with no head
        }
        if (head <= 0xdf) { // 0b110xxxxx
            ensureMin(1);
            int cp = ((head  & 0x1f) << 6)
                     | nextPart();
            // overlong: the value would have fit in a single byte
            if (cp < 0x0080) throw invalidUtf8();
            return cp;
        }
        if (head <= 0xef) { // 0b1110xxxx
            ensureMin(2);
            int cp = ((head & 0x0f) << 12)
                     | (nextPart()  << 6)
                     | nextPart();
            // overlong: the value would have fit in two bytes
            if (cp < 0x0800) throw invalidUtf8();
            // NOTE(review): surrogate code points (U+D800..U+DFFF) are not
            // rejected here although RFC 3629 forbids them; left unchanged so
            // currently accepted input keeps working -- confirm whether
            // callers rely on this.
            return cp;
        }
        if (head <= 0xf7) { // 0b11110xxx
            ensureMin(3);
            int cp = ((head & 0x07) << 18)
                     | (nextPart()  << 12)
                     | (nextPart()  << 6)
                     | nextPart();
            // overlong (the value would have fit in three bytes or fewer),
            // or beyond the last valid code point U+10FFFF
            if (cp < 0x10000 || !Character.isValidCodePoint(cp)) {
                throw invalidUtf8();
            }
            return cp;
        }
        // 0b11111xxx?
        throw invalidUtf8();
    }

    /**
     * Throws the {@link #incompleteUtf8()} exception if the input doesn't
     * have at least this many bytes left.
     */
    protected void ensureMin(int n) {
        if (pos + n > srcEnd) throw incompleteUtf8();
    }

    /**
     * Reads the next byte of a multi-byte UTF-8 character and returns its
     * contents (lower 6 bits).
     *
     * <p>Throws the {@link #invalidUtf8()} exception if the byte is not a
     * valid tail.
     */
    private int nextPart() {
        char c = next();
        // tail bytes must be 0b10xxxxxx
        if ((c & 0xc0) != 0x80) throw invalidUtf8();
        return c & 0x3f;
    }


    /**
     * Marks the most recently read character as the start of a sequence to
     * be copied verbatim, unless such a sequence is already in progress.
     */
    protected void quoteStart() {
        if (quoteStart == -1) quoteStart = charStart;
    }

    /**
     * When in a sequence of characters that can be copied directly,
     * interrupts the sequence and copies it to the output buffer.
     * Does nothing when no sequence is in progress.
     *
     * @param endPos The offset until which the direct character quoting should
     *               occur. You may pass {@link #pos} to quote until the most
     *               recently read character, or {@link #charStart} to quote
     *               until the character before it.
     */
    protected void quoteStop(int endPos) {
        if (quoteStart != -1) {
            out.append(src, quoteStart, endPos - quoteStart);
            quoteStart = -1;
        }
    }

    /**
     * Appends a single byte to the output buffer.
     */
    protected void append(int b) {
        out.append(b);
    }

    /**
     * Appends {@code length} bytes of {@code origin}, starting at offset
     * {@code start}, to the output buffer.
     */
    protected void append(byte[] origin, int start, int length) {
        out.append(origin, start, length);
    }


    /**
     * Creates the exception to be thrown when the input is not valid UTF-8.
     */
    protected abstract RaiseException invalidUtf8();

    /**
     * Creates the exception to be thrown when the input ends in the middle
     * of a multi-byte character. Defaults to {@link #invalidUtf8()}.
     */
    protected RaiseException incompleteUtf8() {
        return invalidUtf8();
    }
}