1 /*
2 * Copyright 2004 Sun Microsystems, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 */
17 package org.codehaus.plexus.util.xml;
18
19 import java.io.BufferedInputStream;
20 import java.io.BufferedReader;
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.io.Reader;
26 import java.io.StringReader;
27 import java.net.HttpURLConnection;
28 import java.net.URL;
29 import java.net.URLConnection;
30 import java.nio.file.Files;
31 import java.nio.file.Path;
32 import java.text.MessageFormat;
33 import java.util.Locale;
34 import java.util.regex.Matcher;
35 import java.util.regex.Pattern;
36
37 /**
38 * <p>Character stream that handles (or at least attempts to) all the necessary Voodo to figure out the charset encoding of
39 * the XML document within the stream.</p>
40 *
41 * <p>IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.</p>
42 *
43 * <p>All this has to be done without consuming characters from the stream, if not the XML parser will not recognized the
44 * document as a valid XML. This is not 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers right
45 * now, XmlReader handles it and things work in all parsers).</p>
46 *
47 * <p>The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering
48 * a wide set of constructors.</p>
49 *
50 * <p>By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for an script
51 * (following HTTP MIME and XML specifications). All this is nicely explained by Mark Pilgrim in his blog,
52 * <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining the character encoding of a
53 * feed</a>.</p>
54 *
55 * @author Alejandro Abdelnur
56 * @version revision 1.17 taken on 26/06/2007 from Rome (see
57 * https://rome.dev.java.net/source/browse/rome/src/java/com/sun/syndication/io/XmlReader.java)
58 * @deprecated use XmlStreamReader
59 * @since 1.4.3
60 */
61 @Deprecated
62 public class XmlReader extends Reader {
63 private static final int BUFFER_SIZE = 4096;
64
65 private static final String UTF_8 = "UTF-8";
66
67 private static final String US_ASCII = "US-ASCII";
68
69 private static final String UTF_16BE = "UTF-16BE";
70
71 private static final String UTF_16LE = "UTF-16LE";
72
73 private static final String UTF_16 = "UTF-16";
74
75 private static final String EBCDIC = "CP1047";
76
77 private static String _staticDefaultEncoding = null;
78
79 private Reader _reader;
80
81 private String _encoding;
82
83 private String _defaultEncoding;
84
85 /**
86 * <p>Sets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on
87 * content-type are not adequate.</p>
88 *
89 * <p>If it is set to NULL the content-type based rules are used.</p>
90 *
91 * <p>By default it is NULL.</p>
92 *
93 * @param encoding charset encoding to default to.
94 */
95 public static void setDefaultEncoding(String encoding) {
96 _staticDefaultEncoding = encoding;
97 }
98
99 /**
100 * <p>Returns the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on
101 * content-type are not adequate.</p>
102 *
103 * <p>If it is NULL the content-type based rules are used.</p>
104 *
105 * @return the default encoding to use.
106 */
107 public static String getDefaultEncoding() {
108 return _staticDefaultEncoding;
109 }
110
111 /**
112 * Creates a Reader for a Path.
113 * <p>
114 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to
115 * UTF-8.
116 * <p>
117 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
118 * <p>
119 *
120 * @param path Path to create a Reader from.
121 * @throws IOException thrown if there is a problem reading the file.
122 */
123 public XmlReader(Path path) throws IOException {
124 this(Files.newInputStream(path));
125 }
126
127 /**
128 * Creates a Reader for a File.
129 * <p>
130 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to
131 * UTF-8.
132 * <p>
133 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
134 * <p>
135 *
136 * @param file File to create a Reader from.
137 * @throws IOException thrown if there is a problem reading the file.
138 */
139 public XmlReader(File file) throws IOException {
140 this(file.toPath());
141 }
142
143 /**
144 * Creates a Reader for a raw InputStream.
145 * <p>
146 * It follows the same logic used for files.
147 * <p>
148 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
149 * <p>
150 *
151 * @param is InputStream to create a Reader from.
152 * @throws IOException thrown if there is a problem reading the stream.
153 */
154 public XmlReader(InputStream is) throws IOException {
155 this(is, true);
156 }
157
158 /**
159 * Creates a Reader for a raw InputStream.
160 * <p>
161 * It follows the same logic used for files.
162 * <p>
163 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
164 * following:
165 * <p>
166 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
167 * <p>
168 * Else if the XML prolog had a charset encoding that encoding is used.
169 * <p>
170 * Else if the content type had a charset encoding that encoding is used.
171 * <p>
172 * Else 'UTF-8' is used.
173 * <p>
174 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
175 * <p>
176 *
177 * @param is InputStream to create a Reader from.
178 * @param lenient indicates if the charset encoding detection should be relaxed.
179 * @throws IOException thrown if there is a problem reading the stream.
180 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
181 */
182 public XmlReader(InputStream is, boolean lenient) throws IOException, XmlStreamReaderException {
183 _defaultEncoding = _staticDefaultEncoding;
184 try {
185 doRawStream(is, lenient);
186 } catch (XmlStreamReaderException ex) {
187 if (!lenient) {
188 throw ex;
189 } else {
190 doLenientDetection(null, ex);
191 }
192 }
193 }
194
195 /**
196 * Creates a Reader using the InputStream of a URL.
197 * <p>
198 * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic
199 * used for Files.
200 * <p>
201 * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for
202 * an InputStream with content-type.
203 * <p>
204 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
205 * <p>
206 *
207 * @param url URL to create a Reader from.
208 * @throws IOException thrown if there is a problem reading the stream of the URL.
209 */
210 public XmlReader(URL url) throws IOException {
211 this(url.openConnection());
212 }
213
214 /**
215 * Creates a Reader using the InputStream of a URLConnection.
216 * <p>
217 * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data
218 * it uses the same logic used for files.
219 * <p>
220 * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic
221 * used for an InputStream with content-type.
222 * <p>
223 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
224 * <p>
225 *
226 * @param conn URLConnection to create a Reader from.
227 * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
228 */
229 public XmlReader(URLConnection conn) throws IOException {
230 _defaultEncoding = _staticDefaultEncoding;
231 boolean lenient = true;
232 if (conn instanceof HttpURLConnection) {
233 try {
234 doHttpStream(conn.getInputStream(), conn.getContentType(), lenient);
235 } catch (XmlStreamReaderException ex) {
236 doLenientDetection(conn.getContentType(), ex);
237 }
238 } else if (conn.getContentType() != null) {
239 try {
240 doHttpStream(conn.getInputStream(), conn.getContentType(), lenient);
241 } catch (XmlStreamReaderException ex) {
242 doLenientDetection(conn.getContentType(), ex);
243 }
244 } else {
245 try {
246 doRawStream(conn.getInputStream(), lenient);
247 } catch (XmlStreamReaderException ex) {
248 doLenientDetection(null, ex);
249 }
250 }
251 }
252
253 /**
254 * Creates a Reader using an InputStream an the associated content-type header.
255 * <p>
256 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
257 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
258 * encoding mandated by the content-type MIME type.
259 * <p>
260 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
261 * <p>
262 *
263 * @param is InputStream to create the reader from.
264 * @param httpContentType content-type header to use for the resolution of the charset encoding.
265 * @throws IOException thrown if there is a problem reading the file.
266 */
267 public XmlReader(InputStream is, String httpContentType) throws IOException {
268 this(is, httpContentType, true);
269 }
270
271 /**
272 * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
273 * regarding the encoding detection.
274 * <p>
275 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
276 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
277 * encoding mandated by the content-type MIME type.
278 * <p>
279 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
280 * following:
281 * <p>
282 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
283 * <p>
284 * Else if the XML prolog had a charset encoding that encoding is used.
285 * <p>
286 * Else if the content type had a charset encoding that encoding is used.
287 * <p>
288 * Else 'UTF-8' is used.
289 * <p>
290 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
291 * <p>
292 *
293 * @param is InputStream to create the reader from.
294 * @param httpContentType content-type header to use for the resolution of the charset encoding.
295 * @param lenient indicates if the charset encoding detection should be relaxed.
296 * @param defaultEncoding encoding to use
297 * @throws IOException thrown if there is a problem reading the file.
298 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
299 */
300 public XmlReader(InputStream is, String httpContentType, boolean lenient, String defaultEncoding)
301 throws IOException, XmlStreamReaderException {
302 _defaultEncoding = (defaultEncoding == null) ? _staticDefaultEncoding : defaultEncoding;
303 try {
304 doHttpStream(is, httpContentType, lenient);
305 } catch (XmlStreamReaderException ex) {
306 if (!lenient) {
307 throw ex;
308 } else {
309 doLenientDetection(httpContentType, ex);
310 }
311 }
312 }
313
314 /**
315 * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
316 * regarding the encoding detection.
317 * <p>
318 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
319 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
320 * encoding mandated by the content-type MIME type.
321 * <p>
322 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
323 * following:
324 * <p>
325 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
326 * <p>
327 * Else if the XML prolog had a charset encoding that encoding is used.
328 * <p>
329 * Else if the content type had a charset encoding that encoding is used.
330 * <p>
331 * Else 'UTF-8' is used.
332 * <p>
333 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
334 * <p>
335 *
336 * @param is InputStream to create the reader from.
337 * @param httpContentType content-type header to use for the resolution of the charset encoding.
338 * @param lenient indicates if the charset encoding detection should be relaxed.
339 * @throws IOException thrown if there is a problem reading the file.
340 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
341 */
342 public XmlReader(InputStream is, String httpContentType, boolean lenient)
343 throws IOException, XmlStreamReaderException {
344 this(is, httpContentType, lenient, null);
345 }
346
347 private void doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException {
348 if (httpContentType != null) {
349 if (httpContentType.startsWith("text/html")) {
350 httpContentType = httpContentType.substring("text/html".length());
351 httpContentType = "text/xml" + httpContentType;
352 try {
353 doHttpStream(ex.getInputStream(), httpContentType, true);
354 ex = null;
355 } catch (XmlStreamReaderException ex2) {
356 ex = ex2;
357 }
358 }
359 }
360 if (ex != null) {
361 String encoding = ex.getXmlEncoding();
362 if (encoding == null) {
363 encoding = ex.getContentTypeEncoding();
364 }
365 if (encoding == null) {
366 encoding = (_defaultEncoding == null) ? UTF_8 : _defaultEncoding;
367 }
368 prepareReader(ex.getInputStream(), encoding);
369 }
370 }
371
372 /**
373 * Returns the charset encoding of the XmlReader.
374 * <p>
375 *
376 * @return charset encoding.
377 */
378 public String getEncoding() {
379 return _encoding;
380 }
381
382 @Override
383 public int read(char[] buf, int offset, int len) throws IOException {
384 return _reader.read(buf, offset, len);
385 }
386
387 /**
388 * Closes the XmlReader stream.
389 * <p>
390 *
391 * @throws IOException thrown if there was a problem closing the stream.
392 */
393 @Override
394 public void close() throws IOException {
395 _reader.close();
396 }
397
398 private void doRawStream(InputStream is, boolean lenient) throws IOException {
399 BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE);
400 String bomEnc = getBOMEncoding(pis);
401 String xmlGuessEnc = getXMLGuessEncoding(pis);
402 String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
403 String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
404 prepareReader(pis, encoding);
405 }
406
407 private void doHttpStream(InputStream is, String httpContentType, boolean lenient) throws IOException {
408 BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE);
409 String cTMime = getContentTypeMime(httpContentType);
410 String cTEnc = getContentTypeEncoding(httpContentType);
411 String bomEnc = getBOMEncoding(pis);
412 String xmlGuessEnc = getXMLGuessEncoding(pis);
413 String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
414 String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis, lenient);
415 prepareReader(pis, encoding);
416 }
417
418 private void prepareReader(InputStream is, String encoding) throws IOException {
419 _reader = new InputStreamReader(is, encoding);
420 _encoding = encoding;
421 }
422
423 // InputStream is passed for XmlStreamReaderException creation only
424 private String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is)
425 throws IOException {
426 String encoding;
427 if (bomEnc == null) {
428 if (xmlGuessEnc == null || xmlEnc == null) {
429 encoding = (_defaultEncoding == null) ? UTF_8 : _defaultEncoding;
430 } else if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
431 encoding = xmlGuessEnc;
432 } else {
433 encoding = xmlEnc;
434 }
435 } else if (bomEnc.equals(UTF_8)) {
436 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
437 throw new XmlStreamReaderException(
438 RAW_EX_1.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}), bomEnc, xmlGuessEnc, xmlEnc, is);
439 }
440 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
441 throw new XmlStreamReaderException(
442 RAW_EX_1.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}), bomEnc, xmlGuessEnc, xmlEnc, is);
443 }
444 encoding = UTF_8;
445 } else if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
446 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)
447 || xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
448 throw new XmlStreamReaderException(
449 RAW_EX_1.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}), bomEnc, xmlGuessEnc, xmlEnc, is);
450 }
451 encoding = bomEnc;
452 } else {
453 throw new XmlStreamReaderException(
454 RAW_EX_2.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}), bomEnc, xmlGuessEnc, xmlEnc, is);
455 }
456 return encoding;
457 }
458
459 // InputStream is passed for XmlStreamReaderException creation only
460 private String calculateHttpEncoding(
461 String cTMime,
462 String cTEnc,
463 String bomEnc,
464 String xmlGuessEnc,
465 String xmlEnc,
466 InputStream is,
467 boolean lenient)
468 throws IOException {
469 String encoding;
470 if (lenient & xmlEnc != null) {
471 encoding = xmlEnc;
472 } else {
473 boolean appXml = isAppXml(cTMime);
474 boolean textXml = isTextXml(cTMime);
475 if (appXml || textXml) {
476 if (cTEnc == null) {
477 if (appXml) {
478 encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
479 } else {
480 encoding = (_defaultEncoding == null) ? US_ASCII : _defaultEncoding;
481 }
482 } else if (bomEnc != null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
483 throw new XmlStreamReaderException(
484 HTTP_EX_1.format(new Object[] {cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}),
485 cTMime,
486 cTEnc,
487 bomEnc,
488 xmlGuessEnc,
489 xmlEnc,
490 is);
491 } else if (cTEnc.equals(UTF_16)) {
492 if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
493 encoding = bomEnc;
494 } else {
495 throw new XmlStreamReaderException(
496 HTTP_EX_2.format(new Object[] {cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}),
497 cTMime,
498 cTEnc,
499 bomEnc,
500 xmlGuessEnc,
501 xmlEnc,
502 is);
503 }
504 } else {
505 encoding = cTEnc;
506 }
507 } else {
508 throw new XmlStreamReaderException(
509 HTTP_EX_3.format(new Object[] {cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}),
510 cTMime,
511 cTEnc,
512 bomEnc,
513 xmlGuessEnc,
514 xmlEnc,
515 is);
516 }
517 }
518 return encoding;
519 }
520
521 // returns MIME type or NULL if httpContentType is NULL
522 private static String getContentTypeMime(String httpContentType) {
523 String mime = null;
524 if (httpContentType != null) {
525 int i = httpContentType.indexOf(";");
526 mime = ((i == -1) ? httpContentType : httpContentType.substring(0, i)).trim();
527 }
528 return mime;
529 }
530
531 private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
532
533 // returns charset parameter value, NULL if not present, NULL if httpContentType is NULL
534 private static String getContentTypeEncoding(String httpContentType) {
535 String encoding = null;
536 if (httpContentType != null) {
537 int i = httpContentType.indexOf(";");
538 if (i > -1) {
539 String postMime = httpContentType.substring(i + 1);
540 Matcher m = CHARSET_PATTERN.matcher(postMime);
541 encoding = (m.find()) ? m.group(1) : null;
542 encoding = (encoding != null) ? encoding.toUpperCase(Locale.ENGLISH) : null;
543 }
544 }
545 return encoding;
546 }
547
548 // returns the BOM in the stream, NULL if not present,
549 // if there was BOM the in the stream it is consumed
550 private static String getBOMEncoding(BufferedInputStream is) throws IOException {
551 String encoding = null;
552 int[] bytes = new int[3];
553 is.mark(3);
554 bytes[0] = is.read();
555 bytes[1] = is.read();
556 bytes[2] = is.read();
557
558 if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
559 encoding = UTF_16BE;
560 is.reset();
561 is.read();
562 is.read();
563 } else if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
564 encoding = UTF_16LE;
565 is.reset();
566 is.read();
567 is.read();
568 } else if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
569 encoding = UTF_8;
570 } else {
571 is.reset();
572 }
573 return encoding;
574 }
575
576 // returns the best guess for the encoding by looking the first bytes of the stream, '<?'
577 private static String getXMLGuessEncoding(BufferedInputStream is) throws IOException {
578 String encoding = null;
579 int[] bytes = new int[4];
580 is.mark(4);
581 bytes[0] = is.read();
582 bytes[1] = is.read();
583 bytes[2] = is.read();
584 bytes[3] = is.read();
585 is.reset();
586
587 if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) {
588 encoding = UTF_16BE;
589 } else if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) {
590 encoding = UTF_16LE;
591 } else if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
592 encoding = UTF_8;
593 } else if (bytes[0] == 0x4C && bytes[1] == 0x6F && bytes[2] == 0xA7 && bytes[3] == 0x94) {
594 encoding = EBCDIC;
595 }
596 return encoding;
597 }
598
599 static final Pattern ENCODING_PATTERN =
600 Pattern.compile("<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE);
601
602 // returns the encoding declared in the <?xml encoding=...?>, NULL if none
603 private static String getXmlProlog(BufferedInputStream is, String guessedEnc) throws IOException {
604 String encoding = null;
605 if (guessedEnc != null) {
606 byte[] bytes = new byte[BUFFER_SIZE];
607 is.mark(BUFFER_SIZE);
608 int offset = 0;
609 int max = BUFFER_SIZE;
610 int c = is.read(bytes, offset, max);
611 int firstGT = -1;
612 String xmlProlog = null;
613 while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
614 offset += c;
615 max -= c;
616 c = is.read(bytes, offset, max);
617 xmlProlog = new String(bytes, 0, offset, guessedEnc);
618 firstGT = xmlProlog.indexOf('>');
619 }
620 if (firstGT == -1) {
621 if (c == -1) {
622 throw new IOException("Unexpected end of XML stream");
623 } else {
624 throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
625 }
626 }
627 int bytesRead = offset;
628 if (bytesRead > 0) {
629 is.reset();
630 BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
631 StringBuilder prolog = new StringBuilder();
632 String line = bReader.readLine();
633 while (line != null) {
634 prolog.append(line);
635 line = bReader.readLine();
636 }
637 Matcher m = ENCODING_PATTERN.matcher(prolog);
638 if (m.find()) {
639 encoding = m.group(1).toUpperCase(Locale.ENGLISH);
640 encoding = encoding.substring(1, encoding.length() - 1);
641 }
642 }
643 }
644 return encoding;
645 }
646
647 // indicates if the MIME type belongs to the APPLICATION XML family
648 private static boolean isAppXml(String mime) {
649 return mime != null
650 && (mime.equals("application/xml")
651 || mime.equals("application/xml-dtd")
652 || mime.equals("application/xml-external-parsed-entity")
653 || (mime.startsWith("application/") && mime.endsWith("+xml")));
654 }
655
656 // indicates if the MIME type belongs to the TEXT XML family
657 private static boolean isTextXml(String mime) {
658 return mime != null
659 && (mime.equals("text/xml")
660 || mime.equals("text/xml-external-parsed-entity")
661 || (mime.startsWith("text/") && mime.endsWith("+xml")));
662 }
663
664 private static final MessageFormat RAW_EX_1 =
665 new MessageFormat("Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
666
667 private static final MessageFormat RAW_EX_2 =
668 new MessageFormat("Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
669
670 private static final MessageFormat HTTP_EX_1 = new MessageFormat(
671 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
672
673 private static final MessageFormat HTTP_EX_2 = new MessageFormat(
674 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
675
676 private static final MessageFormat HTTP_EX_3 = new MessageFormat(
677 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
678 }