View Javadoc

1   /*
2    * J.A.D.E. Java(TM) Addition to Default Environment.
3    * Latest release available at http://jade.dautelle.com/
4    * This class is public domain (not copyrighted).
5    */
6   package ch.twiddlefinger.inet.rewinder.model.parser.conversion;
7   
8   import java.io.CharConversionException;
9   import java.io.IOException;
10  import java.io.InputStream;
11  import java.io.Reader;
12  
13  
/**
 * <p> This class represents a UTF-8 stream reader.</p>
 *
 * <p> This reader supports surrogate <code>char</code> pairs (representing
 *     characters in the range [U+10000 .. U+10FFFF]). It can also be used
 *     to read full Unicode code points (up to 31 bits) directly
 *     (ref. {@link #read()}).</p>
 *
 * <p> Each invocation of one of the <code>read()</code> methods may cause one
 *     or more bytes to be read from the underlying byte-input stream.
 *     To enable the efficient conversion of bytes to characters, more bytes may
 *     be read ahead from the underlying stream than are necessary to satisfy
 *     the current read operation.</p>
 *
 * <p> Unlike <code>java.io.InputStreamReader</code> this class does not
 *     allocate new buffers (e.g. <code>java.nio.HeapCharBuffer</code>) each
 *     time a {@link #read} is performed and its execution speed is therefore
 *     greatly improved (twice as fast).</p>
 *
 * <p> Instances of this class can be reused for different input streams
 *     and can be part of a higher level component (e.g. parser) in order
 *     to avoid dynamic buffer allocation when the input source changes.
 *     Also wrapping using a <code>java.io.BufferedReader</code> is unnecessary
 *     as instances of this class embed their own data buffers.</p>
 *
 * <p> Note: This reader is unsynchronized and does not test if the UTF-8
 *           encoding is well-formed (e.g. UTF-8 sequences longer than
 *           necessary to encode a character).</p>
 *
 *  <p><i> This class is <b>public domain</b> (not copyrighted).</i></p>
 *
 * @author  <a href="mailto:jean-marie@dautelle.com">Jean-Marie Dautelle</a>
 * @version 4.6, July 14, 2003
 * @see     Utf8StreamWriter
 */
public final class Utf8StreamReader extends Reader {
    /**
     * Holds the current input stream or <code>null</code> if closed.
     */
    private InputStream _inStream;

    /**
     * Holds the index of the next unread byte in the buffer.
     */
    private int _start;

    /**
     * Holds the index one past the last valid byte in the buffer
     * (non-positive once the underlying stream reported its end).
     */
    private int _end;

    /**
     * Holds the bytes buffer.
     */
    private final byte[] _bytes;

    /**
     * Holds the bits accumulated so far for the multi-byte sequence
     * currently being decoded.
     */
    private int _code;

    /**
     * Holds the number of continuation bytes still expected for the
     * multi-byte sequence currently being decoded (0 when idle).
     */
    private int _moreBytes;

    /**
     * Holds the low surrogate that did not fit in the destination array of
     * the previous {@link #read(char[], int, int)} call, or 0 if none.
     * (0 is a safe sentinel: low surrogates are in [U+DC00 .. U+DFFF].)
     */
    private char _pendingLowSurrogate;

    /**
     * Default constructor (2048 bytes buffer).
     */
    public Utf8StreamReader() {
        this(2048);
    }

    /**
     * Creates a {@link Utf8StreamReader} of specified buffer size.
     *
     * @param  bufferSize the buffer size in bytes.
     * @throws IllegalArgumentException if <code>bufferSize</code> is not
     *         strictly positive (an empty buffer can never hold any input).
     */
    public Utf8StreamReader(int bufferSize) {
        if (bufferSize <= 0) {
            throw new IllegalArgumentException(
                "Buffer size must be positive: " + bufferSize);
        }
        _bytes = new byte[bufferSize];
    }

    /**
     * Sets the input stream to use for reading until this reader is closed.
     * For example:<pre>
     *     Reader reader = new Utf8StreamReader().setInputStream(inStream);
     * </pre> is equivalent but reads twice as fast as <pre>
     *     Reader reader = new java.io.InputStreamReader(inStream, "UTF-8");
     * </pre>
     *
     * <p> Note: This method does not discard bytes already buffered from a
     *     previous stream; {@link #close} resets the internal state.</p>
     *
     * @param  inStream the input stream.
     * @return this UTF-8 reader.
     * @see    #close
     */
    public Utf8StreamReader setInputStream(InputStream inStream) {
        _inStream = inStream;

        return this;
    }

    /**
     * Indicates if this stream is ready to be read.
     *
     * @return <code>true</code> if the next read() is guaranteed not to block
     *         for input; <code>false</code> otherwise.
     * @throws  IOException if an I/O error occurs.
     */
    public boolean ready() throws IOException {
        if (_inStream == null) {
            throw new IOException("Stream closed");
        }

        return (_pendingLowSurrogate != 0) || ((_end - _start) > 0)
            || (_inStream.available() != 0);
    }

    /**
     * Closes the stream. Once a stream has been closed, further read(),
     * ready(), mark(), or reset() invocations will throw an IOException.
     * Closing a previously-closed stream, however, has no effect.
     *
     * @throws IOException if an I/O error occurs.
     */
    public void close() throws IOException {
        if (_inStream != null) {
            final InputStream closing = _inStream;

            // Resets the state first so that this reader is closed (and
            // reusable) even if closing the underlying stream fails.
            _inStream = null;
            _start = 0;
            _end = 0;
            _code = 0;
            _moreBytes = 0;
            _pendingLowSurrogate = 0;
            closing.close();
        }
    }

    /**
     * Reads a single character.  This method will block until a character is
     * available, an I/O error occurs, or the end of the stream is reached.
     *
     * <p> If a previous {@link #read(char[], int, int)} call could only
     *     store the first half of a surrogate pair, this method returns the
     *     pending low surrogate.</p>
     *
     * @return the 31-bits Unicode of the character read, or -1 if the end of
     *         the stream has been reached.
     * @throws IOException if an I/O error occurs.
     */
    public int read() throws IOException {
        if (_pendingLowSurrogate != 0) {
            final char low = _pendingLowSurrogate;
            _pendingLowSurrogate = 0;

            return low;
        }

        // Fast path for ASCII. The bounds test comes first: _start may be
        // equal to _bytes.length once a full buffer has been consumed.
        if (_start < _end) {
            final byte b = _bytes[_start];

            if (b >= 0) {
                _start++;

                return b;
            }
        }

        return read2();
    }

    // Reads one full character (refilling the buffer as needed), blocks if
    // necessary. Returns -1 at the end of the stream.
    private int read2() throws IOException {
        while (true) {
            if (_start >= _end) { // No more bytes in buffer.
                if (_inStream == null) {
                    throw new IOException("Stream closed");
                }

                _start = 0;
                _end = _inStream.read(_bytes, 0, _bytes.length);

                if (_end <= 0) { // Done.
                    if (_moreBytes == 0) {
                        return -1;
                    }

                    // Incomplete sequence.
                    throw new CharConversionException(
                        "Unexpected end of stream");
                }

                continue;
            }

            final byte b = _bytes[_start++];

            // Decodes UTF-8.
            if (_moreBytes == 0) { // Expects the first byte of a sequence.
                if (b >= 0) {
                    // 0xxxxxxx
                    return b;
                } else if ((b & 0xe0) == 0xc0) {
                    // 110xxxxx
                    _code = b & 0x1f;
                    _moreBytes = 1;
                } else if ((b & 0xf0) == 0xe0) {
                    // 1110xxxx
                    _code = b & 0x0f;
                    _moreBytes = 2;
                } else if ((b & 0xf8) == 0xf0) {
                    // 11110xxx
                    _code = b & 0x07;
                    _moreBytes = 3;
                } else if ((b & 0xfc) == 0xf8) {
                    // 111110xx
                    _code = b & 0x03;
                    _moreBytes = 4;
                } else if ((b & 0xfe) == 0xfc) {
                    // 1111110x
                    _code = b & 0x01;
                    _moreBytes = 5;
                } else { // Stray continuation byte or 0xFE/0xFF.
                    throw new CharConversionException("Invalid UTF-8 Encoding");
                }
            } else if ((b & 0xc0) == 0x80) {
                // 10xxxxxx (continuation byte)
                _code = (_code << 6) | (b & 0x3f); // Adds 6 bits to code.

                if (--_moreBytes == 0) {
                    return _code;
                }
            } else { // Continuation byte expected.
                throw new CharConversionException("Invalid UTF-8 Encoding");
            }
        }
    }

    /**
     * Reads characters into a portion of an array.  This method will block
     * until some input is available, an I/O error occurs, or the end of the
     * stream is reached.
     *
     * <p> Note: Characters between U+10000 and U+10FFFF are represented
     *     by surrogate pairs (two <code>char</code>). When only the high
     *     surrogate fits in the requested range, the low surrogate is kept
     *     and returned first by the next read.</p>
     *
     * @param  cbuf the destination buffer.
     * @param  off the offset at which to start storing characters.
     * @param  len the maximum number of characters to read
     * @return the number of characters read (0 only if <code>len</code> is
     *         zero), or -1 if the end of the stream has been reached
     * @throws IOException if an I/O error occurs.
     * @throws IndexOutOfBoundsException if <code>off</code> or
     *         <code>len</code> is negative, or <code>off + len</code>
     *         exceeds the length of <code>cbuf</code>.
     */
    public int read(char[] cbuf, int off, int len) throws IOException {
        if (_inStream == null) {
            throw new IOException("Stream closed");
        }

        // Written as "len > length - off" to stay safe against int overflow.
        if ((off < 0) || (len < 0) || (len > (cbuf.length - off))) {
            throw new IndexOutOfBoundsException(
                "off=" + off + ", len=" + len + ", cbuf.length=" + cbuf.length);
        }

        if (len == 0) {
            return 0; // Nothing requested: must not block nor consume input.
        }

        final int limit = off + len;
        int i = off;

        if (_pendingLowSurrogate != 0) {
            // Completes the surrogate pair split by the previous call.
            cbuf[i++] = _pendingLowSurrogate;
            _pendingLowSurrogate = 0;
        } else if (_start >= _end) { // Fills buffer.
            _start = 0;
            _end = _inStream.read(_bytes, 0, _bytes.length);

            if (_end <= 0) { // Done.
                return _end;
            }
        }

        // Converts what is buffered; stops when either the destination range
        // is full or the bytes buffer is exhausted (no extra blocking, except
        // to complete a multi-byte sequence spanning two buffer fills).
        while ((i < limit) && (_start < _end)) {
            final byte b = _bytes[_start];

            if (b >= 0) { // Most common case.
                _start++;
                cbuf[i++] = (char) b;
            } else {
                final int code = read2();

                if (code < 0x10000) {
                    cbuf[i++] = (char) code;
                } else if (code <= 0x10ffff) { // Surrogates.
                    final char low =
                        (char) (((code - 0x10000) & 0x3ff) + 0xdc00);

                    cbuf[i++] = (char) (((code - 0x10000) >> 10) + 0xd800);

                    if (i < limit) {
                        cbuf[i++] = low;
                    } else { // No room left, delivered by the next read.
                        _pendingLowSurrogate = low;
                    }
                } else {
                    throw new CharConversionException(
                        "Cannot convert U+" +
                        Integer.toHexString(code) +
                        " to char (code greater than U+10FFFF)");
                }
            }
        }

        return i - off;
    }
}
298     }
299 }