1
2
3
4
5
6 package ch.twiddlefinger.inet.rewinder.model.parser.conversion;
7
8 import java.io.CharConversionException;
9 import java.io.IOException;
10 import java.io.InputStream;
11 import java.io.Reader;
12
13
/**
 * <p> This class represents a UTF-8 stream reader.</p>
 *
 * <p> This reader supports surrogate <code>char</code> pairs (representing
 *     characters in the range [U+10000 .. U+10FFFF]). It can also be used
 *     to read Unicode code points (up to 31 bits) directly
 *     (ref. {@link #read()}).</p>
 *
 * <p> Each invocation of one of the <code>read()</code> methods may cause one
 *     or more bytes to be read from the underlying byte-input stream.
 *     To enable the efficient conversion of bytes to characters, more bytes may
 *     be read ahead from the underlying stream than are necessary to satisfy
 *     the current read operation.</p>
 *
 * <p> Unlike <code>java.io.InputStreamReader</code> this class does not
 *     allocate new buffers (e.g. <code>java.nio.HeapCharBuffer</code>) each
 *     time a {@link #read} is performed and its execution speed is therefore
 *     greatly improved (twice as fast).</p>
 *
 * <p> Instances of this class can be reused for different input streams
 *     and can be part of a higher level component (e.g. parser) in order
 *     to avoid dynamic buffer allocation when the input source changes.
 *     Also, wrapping in a <code>java.io.BufferedReader</code> is unnecessary
 *     as instances of this class embed their own data buffers.</p>
 *
 * <p> Note: This reader is unsynchronized and does not test if the UTF-8
 *     encoding is well-formed (e.g. UTF-8 sequences longer than
 *     necessary to encode a character).</p>
 *
 * <p><i> This class is <b>public domain</b> (not copyrighted).</i></p>
 *
 * @author  <a href="mailto:jean-marie@dautelle.com">Jean-Marie Dautelle</a>
 * @version 4.6, July 14, 2003
 * @see     Utf8StreamWriter
 */
public final class Utf8StreamReader extends Reader {
    /**
     * Holds the current input stream or <code>null</code> if closed.
     */
    private InputStream _inStream;

    /**
     * Holds the index of the next unread byte in {@link #_bytes}.
     */
    private int _start;

    /**
     * Holds the index one past the last valid byte in {@link #_bytes}
     * (or a non-positive value once the end of the stream has been seen).
     */
    private int _end;

    /**
     * Holds the bytes buffer.
     */
    private final byte[] _bytes;

    /**
     * Holds the code point being assembled from a multi-byte sequence.
     */
    private int _code;

    /**
     * Holds the number of continuation bytes still expected for the
     * sequence being decoded (<code>0</code> when between characters).
     * Kept as a field so that decoding can resume if the underlying stream
     * throws in the middle of a sequence.
     */
    private int _moreBytes;

    /**
     * Holds the low surrogate that did not fit in the destination array of
     * the previous {@link #read(char[], int, int)} call, or <code>0</code>
     * if there is none (a low surrogate is never zero).
     */
    private int _pendingLowSurrogate;

    /**
     * Default constructor (2048 bytes buffer).
     */
    public Utf8StreamReader() {
        this(2048);
    }

    /**
     * Creates a {@link Utf8StreamReader} of specified buffer size.
     *
     * @param  bufferSize the buffer size in bytes.
     * @throws IllegalArgumentException if <code>bufferSize</code> is not
     *         strictly positive (an empty buffer can never hold input).
     */
    public Utf8StreamReader(int bufferSize) {
        if (bufferSize < 1) {
            throw new IllegalArgumentException(
                "Buffer size must be positive: " + bufferSize);
        }
        _bytes = new byte[bufferSize];
    }

    /**
     * Sets the input stream to use for reading until this reader is closed.
     * For example:<pre>
     *     Reader reader = new Utf8StreamReader().setInputStream(inStream);
     * </pre> is equivalent but reads twice as fast as <pre>
     *     Reader reader = new java.io.InputStreamReader(inStream, "UTF-8");
     * </pre>
     *
     * <p> If a different stream was previously set and not closed, any bytes
     *     buffered from it and any partially decoded character are
     *     discarded so that they cannot leak into the new input.</p>
     *
     * @param  inStream the input stream.
     * @return this UTF-8 reader.
     * @see    #close
     */
    public Utf8StreamReader setInputStream(InputStream inStream) {
        if (inStream != _inStream) {
            resetState();
        }
        _inStream = inStream;

        return this;
    }

    /**
     * Indicates if this stream is ready to be read.
     *
     * @return <code>true</code> if the next read() is guaranteed not to block
     *         for input; <code>false</code> otherwise.
     * @throws IOException if an I/O error occurs or this reader is closed.
     */
    public boolean ready() throws IOException {
        if (_inStream == null) {
            throw new IOException("Stream closed");
        }

        return (_pendingLowSurrogate != 0) || (_start < _end)
            || (_inStream.available() != 0);
    }

    /**
     * Closes the stream. Once a stream has been closed, further read(),
     * ready(), mark(), or reset() invocations will throw an IOException.
     * Closing a previously-closed stream, however, has no effect.
     *
     * <p> The reader is marked closed and its buffers are reset even when
     *     closing the underlying stream fails.</p>
     *
     * @throws IOException if an I/O error occurs.
     */
    public void close() throws IOException {
        if (_inStream != null) {
            try {
                _inStream.close();
            } finally {
                _inStream = null;
                resetState();
            }
        }
    }

    /**
     * Reads a single character. This method will block until a character is
     * available, an I/O error occurs, or the end of the stream is reached.
     *
     * <p> If a previous {@link #read(char[], int, int)} could only store the
     *     first half of a surrogate pair, the outstanding low surrogate is
     *     returned first.</p>
     *
     * @return the 31-bits Unicode of the character read, or -1 if the end of
     *         the stream has been reached.
     * @throws CharConversionException if the input is not valid UTF-8 or
     *         ends in the middle of a character.
     * @throws IOException if an I/O error occurs or this reader is closed.
     */
    public int read() throws IOException {
        if (_pendingLowSurrogate != 0) {
            final int low = _pendingLowSurrogate;
            _pendingLowSurrogate = 0;

            return low;
        }

        // Fast path: a buffered ASCII byte. The bounds test comes first so
        // that a fully consumed buffer is never indexed past its end.
        if ((_moreBytes == 0) && (_start < _end)) {
            final byte b = _bytes[_start];

            if (b >= 0) {
                _start++;

                return b;
            }
        }

        return read2();
    }

    /**
     * Decodes the next code point, refilling the buffer as needed.
     *
     * @return the code point, or -1 at the end of the stream.
     * @throws CharConversionException if the input is not valid UTF-8 or
     *         ends in the middle of a character.
     * @throws IOException if an I/O error occurs or this reader is closed.
     */
    private int read2() throws IOException {
        for (;;) {
            if ((_start >= _end) && !fill()) {
                return endOfStream();
            }

            final byte b = _bytes[_start++];

            if (_moreBytes != 0) {
                // Inside a sequence: only 10xxxxxx is acceptable.
                if ((b & 0xc0) != 0x80) {
                    throw invalidEncoding();
                }

                _code = (_code << 6) | (b & 0x3f);

                if (--_moreBytes == 0) {
                    return _code;
                }
            } else if (b >= 0) {
                // 0xxxxxxx
                return b;
            } else if ((b & 0xe0) == 0xc0) {
                // 110xxxxx
                _code = b & 0x1f;
                _moreBytes = 1;
            } else if ((b & 0xf0) == 0xe0) {
                // 1110xxxx
                _code = b & 0x0f;
                _moreBytes = 2;
            } else if ((b & 0xf8) == 0xf0) {
                // 11110xxx
                _code = b & 0x07;
                _moreBytes = 3;
            } else if ((b & 0xfc) == 0xf8) {
                // 111110xx
                _code = b & 0x03;
                _moreBytes = 4;
            } else if ((b & 0xfe) == 0xfc) {
                // 1111110x
                _code = b & 0x01;
                _moreBytes = 5;
            } else {
                // Stray continuation byte, 0xFE or 0xFF.
                throw invalidEncoding();
            }
        }
    }

    /**
     * Reads characters into a portion of an array. This method will block
     * until some input is available, an I/O error occurs, or the end of the
     * stream is reached.
     *
     * <p> Note: Characters between U+10000 and U+10FFFF are represented
     *     by surrogate pairs (two <code>char</code>). When only one slot is
     *     left for such a character, the high surrogate is stored and the
     *     low surrogate is delivered by the next read.</p>
     *
     * @param  cbuf the destination buffer.
     * @param  off the offset at which to start storing characters.
     * @param  len the maximum number of characters to read
     * @return the number of characters read, or -1 if the end of the
     *         stream has been reached
     * @throws IndexOutOfBoundsException if <code>off</code> or
     *         <code>len</code> is negative, or <code>len</code> exceeds
     *         <code>cbuf.length - off</code>.
     * @throws CharConversionException if the input is not valid UTF-8, ends
     *         in the middle of a character, or encodes a value greater than
     *         U+10FFFF.
     * @throws IOException if an I/O error occurs or this reader is closed.
     */
    public int read(char[] cbuf, int off, int len) throws IOException {
        if (_inStream == null) {
            throw new IOException("Stream closed");
        }

        if ((off < 0) || (len < 0) || (len > (cbuf.length - off))) {
            throw new IndexOutOfBoundsException(
                "off=" + off + ", len=" + len + ", cbuf.length=" + cbuf.length);
        }

        if (len == 0) {
            return 0;
        }

        final int limit = off + len;
        int i = off;

        if (_pendingLowSurrogate != 0) {
            cbuf[i++] = (char) _pendingLowSurrogate;
            _pendingLowSurrogate = 0;
        } else if ((_start >= _end) && !fill()) {
            return endOfStream();
        }

        // Drain what is buffered; do not block again once something has
        // been delivered (read2() may still refill to finish a character).
        while ((i < limit) && (_start < _end)) {
            final byte b = _bytes[_start];

            if ((b >= 0) && (_moreBytes == 0)) {
                _start++;
                cbuf[i++] = (char) b;
                continue;
            }

            final int code = read2();

            if (code < 0x10000) {
                cbuf[i++] = (char) code;
            } else if (code <= 0x10ffff) {
                final int low = ((code - 0x10000) & 0x3ff) + 0xdc00;

                cbuf[i++] = (char) (((code - 0x10000) >> 10) + 0xd800);

                if (i < limit) {
                    cbuf[i++] = (char) low;
                } else {
                    _pendingLowSurrogate = low;
                }
            } else {
                throw new CharConversionException(
                    "Cannot convert U+" +
                    Integer.toHexString(code) +
                    " to char (code greater than U+10FFFF)");
            }
        }

        return i - off;
    }

    /**
     * Refills the byte buffer from the underlying stream.
     *
     * @return <code>true</code> if at least one byte is now available;
     *         <code>false</code> if the end of the stream has been reached.
     * @throws IOException if an I/O error occurs or this reader is closed.
     */
    private boolean fill() throws IOException {
        if (_inStream == null) {
            throw new IOException("Stream closed");
        }

        _start = 0;
        _end = _inStream.read(_bytes, 0, _bytes.length);

        return _end > 0;
    }

    /**
     * Reports the end of the stream: -1 between characters, an exception if
     * a multi-byte sequence was left unfinished.
     */
    private int endOfStream() throws CharConversionException {
        if (_moreBytes == 0) {
            return -1;
        }

        // Clear the decoder so that later reads are not left mid-sequence.
        _moreBytes = 0;
        throw new CharConversionException("Unexpected end of stream");
    }

    /**
     * Clears the decoder and builds the exception for a malformed byte.
     */
    private CharConversionException invalidEncoding() {
        _moreBytes = 0;

        return new CharConversionException("Invalid UTF-8 Encoding");
    }

    /**
     * Discards buffered bytes and any partially decoded character.
     */
    private void resetState() {
        _start = 0;
        _end = 0;
        _code = 0;
        _moreBytes = 0;
        _pendingLowSurrogate = 0;
    }
}