1 /*
2 * Copyright 2004 Sun Microsystems, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 */
17 package org.codehaus.plexus.util.xml;
18
19 import java.io.BufferedInputStream;
20 import java.io.BufferedReader;
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.io.Reader;
26 import java.io.StringReader;
27 import java.net.HttpURLConnection;
28 import java.net.URL;
29 import java.net.URLConnection;
30 import java.nio.file.Files;
31 import java.text.MessageFormat;
32 import java.util.Locale;
33 import java.util.regex.Matcher;
34 import java.util.regex.Pattern;
35
36 /**
37 * <p>Character stream that handles (or at least attempts to) all the necessary Voodo to figure out the charset encoding of
38 * the XML document within the stream.</p>
39 *
40 * <p>IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.</p>
41 *
42 * <p>All this has to be done without consuming characters from the stream, if not the XML parser will not recognized the
43 * document as a valid XML. This is not 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers right
44 * now, XmlReader handles it and things work in all parsers).</p>
45 *
46 * <p>The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering
47 * a wide set of constructors.</p>
48 *
49 * <p>By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for an script
50 * (following HTTP MIME and XML specifications). All this is nicely explained by Mark Pilgrim in his blog,
51 * <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining the character encoding of a
52 * feed</a>.</p>
53 *
54 * @author Alejandro Abdelnur
55 * @version revision 1.17 taken on 26/06/2007 from Rome (see
56 * https://rome.dev.java.net/source/browse/rome/src/java/com/sun/syndication/io/XmlReader.java)
57 * @deprecated use XmlStreamReader
58 * @since 1.4.3
59 */
60 @Deprecated
61 public class XmlReader extends Reader {
62 private static final int BUFFER_SIZE = 4096;
63
64 private static final String UTF_8 = "UTF-8";
65
66 private static final String US_ASCII = "US-ASCII";
67
68 private static final String UTF_16BE = "UTF-16BE";
69
70 private static final String UTF_16LE = "UTF-16LE";
71
72 private static final String UTF_16 = "UTF-16";
73
74 private static final String EBCDIC = "CP1047";
75
76 private static String _staticDefaultEncoding = null;
77
78 private Reader _reader;
79
80 private String _encoding;
81
82 private String _defaultEncoding;
83
84 /**
85 * <p>Sets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on
86 * content-type are not adequate.</p>
87 *
88 * <p>If it is set to NULL the content-type based rules are used.</p>
89 *
90 * <p>By default it is NULL.</p>
91 *
92 * @param encoding charset encoding to default to.
93 */
94 public static void setDefaultEncoding(String encoding) {
95 _staticDefaultEncoding = encoding;
96 }
97
98 /**
99 * <p>Returns the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on
100 * content-type are not adequate.</p>
101 *
102 * <p>If it is NULL the content-type based rules are used.</p>
103 *
104 * @return the default encoding to use.
105 */
106 public static String getDefaultEncoding() {
107 return _staticDefaultEncoding;
108 }
109
110 /**
111 * Creates a Reader for a File.
112 * <p>
113 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to
114 * UTF-8.
115 * <p>
116 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
117 * <p>
118 *
119 * @param file File to create a Reader from.
120 * @throws IOException thrown if there is a problem reading the file.
121 */
122 public XmlReader(File file) throws IOException {
123 this(Files.newInputStream(file.toPath()));
124 }
125
126 /**
127 * Creates a Reader for a raw InputStream.
128 * <p>
129 * It follows the same logic used for files.
130 * <p>
131 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
132 * <p>
133 *
134 * @param is InputStream to create a Reader from.
135 * @throws IOException thrown if there is a problem reading the stream.
136 */
137 public XmlReader(InputStream is) throws IOException {
138 this(is, true);
139 }
140
141 /**
142 * Creates a Reader for a raw InputStream.
143 * <p>
144 * It follows the same logic used for files.
145 * <p>
146 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
147 * following:
148 * <p>
149 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
150 * <p>
151 * Else if the XML prolog had a charset encoding that encoding is used.
152 * <p>
153 * Else if the content type had a charset encoding that encoding is used.
154 * <p>
155 * Else 'UTF-8' is used.
156 * <p>
157 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
158 * <p>
159 *
160 * @param is InputStream to create a Reader from.
161 * @param lenient indicates if the charset encoding detection should be relaxed.
162 * @throws IOException thrown if there is a problem reading the stream.
163 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
164 */
165 public XmlReader(InputStream is, boolean lenient) throws IOException, XmlStreamReaderException {
166 _defaultEncoding = _staticDefaultEncoding;
167 try {
168 doRawStream(is, lenient);
169 } catch (XmlStreamReaderException ex) {
170 if (!lenient) {
171 throw ex;
172 } else {
173 doLenientDetection(null, ex);
174 }
175 }
176 }
177
178 /**
179 * Creates a Reader using the InputStream of a URL.
180 * <p>
181 * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic
182 * used for Files.
183 * <p>
184 * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for
185 * an InputStream with content-type.
186 * <p>
187 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
188 * <p>
189 *
190 * @param url URL to create a Reader from.
191 * @throws IOException thrown if there is a problem reading the stream of the URL.
192 */
193 public XmlReader(URL url) throws IOException {
194 this(url.openConnection());
195 }
196
197 /**
198 * Creates a Reader using the InputStream of a URLConnection.
199 * <p>
200 * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data
201 * it uses the same logic used for files.
202 * <p>
203 * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic
204 * used for an InputStream with content-type.
205 * <p>
206 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
207 * <p>
208 *
209 * @param conn URLConnection to create a Reader from.
210 * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
211 */
212 public XmlReader(URLConnection conn) throws IOException {
213 _defaultEncoding = _staticDefaultEncoding;
214 boolean lenient = true;
215 if (conn instanceof HttpURLConnection) {
216 try {
217 doHttpStream(conn.getInputStream(), conn.getContentType(), lenient);
218 } catch (XmlStreamReaderException ex) {
219 doLenientDetection(conn.getContentType(), ex);
220 }
221 } else if (conn.getContentType() != null) {
222 try {
223 doHttpStream(conn.getInputStream(), conn.getContentType(), lenient);
224 } catch (XmlStreamReaderException ex) {
225 doLenientDetection(conn.getContentType(), ex);
226 }
227 } else {
228 try {
229 doRawStream(conn.getInputStream(), lenient);
230 } catch (XmlStreamReaderException ex) {
231 doLenientDetection(null, ex);
232 }
233 }
234 }
235
236 /**
237 * Creates a Reader using an InputStream an the associated content-type header.
238 * <p>
239 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
240 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
241 * encoding mandated by the content-type MIME type.
242 * <p>
243 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
244 * <p>
245 *
246 * @param is InputStream to create the reader from.
247 * @param httpContentType content-type header to use for the resolution of the charset encoding.
248 * @throws IOException thrown if there is a problem reading the file.
249 */
250 public XmlReader(InputStream is, String httpContentType) throws IOException {
251 this(is, httpContentType, true);
252 }
253
254 /**
255 * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
256 * regarding the encoding detection.
257 * <p>
258 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
259 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
260 * encoding mandated by the content-type MIME type.
261 * <p>
262 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
263 * following:
264 * <p>
265 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
266 * <p>
267 * Else if the XML prolog had a charset encoding that encoding is used.
268 * <p>
269 * Else if the content type had a charset encoding that encoding is used.
270 * <p>
271 * Else 'UTF-8' is used.
272 * <p>
273 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
274 * <p>
275 *
276 * @param is InputStream to create the reader from.
277 * @param httpContentType content-type header to use for the resolution of the charset encoding.
278 * @param lenient indicates if the charset encoding detection should be relaxed.
279 * @param defaultEncoding encoding to use
280 * @throws IOException thrown if there is a problem reading the file.
281 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
282 */
283 public XmlReader(InputStream is, String httpContentType, boolean lenient, String defaultEncoding)
284 throws IOException, XmlStreamReaderException {
285 _defaultEncoding = (defaultEncoding == null) ? _staticDefaultEncoding : defaultEncoding;
286 try {
287 doHttpStream(is, httpContentType, lenient);
288 } catch (XmlStreamReaderException ex) {
289 if (!lenient) {
290 throw ex;
291 } else {
292 doLenientDetection(httpContentType, ex);
293 }
294 }
295 }
296
297 /**
298 * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
299 * regarding the encoding detection.
300 * <p>
301 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
302 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
303 * encoding mandated by the content-type MIME type.
304 * <p>
305 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
306 * following:
307 * <p>
308 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
309 * <p>
310 * Else if the XML prolog had a charset encoding that encoding is used.
311 * <p>
312 * Else if the content type had a charset encoding that encoding is used.
313 * <p>
314 * Else 'UTF-8' is used.
315 * <p>
316 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
317 * <p>
318 *
319 * @param is InputStream to create the reader from.
320 * @param httpContentType content-type header to use for the resolution of the charset encoding.
321 * @param lenient indicates if the charset encoding detection should be relaxed.
322 * @throws IOException thrown if there is a problem reading the file.
323 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
324 */
325 public XmlReader(InputStream is, String httpContentType, boolean lenient)
326 throws IOException, XmlStreamReaderException {
327 this(is, httpContentType, lenient, null);
328 }
329
330 private void doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException {
331 if (httpContentType != null) {
332 if (httpContentType.startsWith("text/html")) {
333 httpContentType = httpContentType.substring("text/html".length());
334 httpContentType = "text/xml" + httpContentType;
335 try {
336 doHttpStream(ex.getInputStream(), httpContentType, true);
337 ex = null;
338 } catch (XmlStreamReaderException ex2) {
339 ex = ex2;
340 }
341 }
342 }
343 if (ex != null) {
344 String encoding = ex.getXmlEncoding();
345 if (encoding == null) {
346 encoding = ex.getContentTypeEncoding();
347 }
348 if (encoding == null) {
349 encoding = (_defaultEncoding == null) ? UTF_8 : _defaultEncoding;
350 }
351 prepareReader(ex.getInputStream(), encoding);
352 }
353 }
354
355 /**
356 * Returns the charset encoding of the XmlReader.
357 * <p>
358 *
359 * @return charset encoding.
360 */
361 public String getEncoding() {
362 return _encoding;
363 }
364
365 @Override
366 public int read(char[] buf, int offset, int len) throws IOException {
367 return _reader.read(buf, offset, len);
368 }
369
370 /**
371 * Closes the XmlReader stream.
372 * <p>
373 *
374 * @throws IOException thrown if there was a problem closing the stream.
375 */
376 @Override
377 public void close() throws IOException {
378 _reader.close();
379 }
380
381 private void doRawStream(InputStream is, boolean lenient) throws IOException {
382 BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE);
383 String bomEnc = getBOMEncoding(pis);
384 String xmlGuessEnc = getXMLGuessEncoding(pis);
385 String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
386 String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
387 prepareReader(pis, encoding);
388 }
389
390 private void doHttpStream(InputStream is, String httpContentType, boolean lenient) throws IOException {
391 BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE);
392 String cTMime = getContentTypeMime(httpContentType);
393 String cTEnc = getContentTypeEncoding(httpContentType);
394 String bomEnc = getBOMEncoding(pis);
395 String xmlGuessEnc = getXMLGuessEncoding(pis);
396 String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
397 String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis, lenient);
398 prepareReader(pis, encoding);
399 }
400
401 private void prepareReader(InputStream is, String encoding) throws IOException {
402 _reader = new InputStreamReader(is, encoding);
403 _encoding = encoding;
404 }
405
406 // InputStream is passed for XmlStreamReaderException creation only
407 private String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is)
408 throws IOException {
409 String encoding;
410 if (bomEnc == null) {
411 if (xmlGuessEnc == null || xmlEnc == null) {
412 encoding = (_defaultEncoding == null) ? UTF_8 : _defaultEncoding;
413 } else if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
414 encoding = xmlGuessEnc;
415 } else {
416 encoding = xmlEnc;
417 }
418 } else if (bomEnc.equals(UTF_8)) {
419 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
420 throw new XmlStreamReaderException(
421 RAW_EX_1.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}), bomEnc, xmlGuessEnc, xmlEnc, is);
422 }
423 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
424 throw new XmlStreamReaderException(
425 RAW_EX_1.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}), bomEnc, xmlGuessEnc, xmlEnc, is);
426 }
427 encoding = UTF_8;
428 } else if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
429 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
430 throw new IOException(RAW_EX_1.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}));
431 }
432 if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
433 throw new XmlStreamReaderException(
434 RAW_EX_1.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}), bomEnc, xmlGuessEnc, xmlEnc, is);
435 }
436 encoding = bomEnc;
437 } else {
438 throw new XmlStreamReaderException(
439 RAW_EX_2.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}), bomEnc, xmlGuessEnc, xmlEnc, is);
440 }
441 return encoding;
442 }
443
444 // InputStream is passed for XmlStreamReaderException creation only
445 private String calculateHttpEncoding(
446 String cTMime,
447 String cTEnc,
448 String bomEnc,
449 String xmlGuessEnc,
450 String xmlEnc,
451 InputStream is,
452 boolean lenient)
453 throws IOException {
454 String encoding;
455 if (lenient & xmlEnc != null) {
456 encoding = xmlEnc;
457 } else {
458 boolean appXml = isAppXml(cTMime);
459 boolean textXml = isTextXml(cTMime);
460 if (appXml || textXml) {
461 if (cTEnc == null) {
462 if (appXml) {
463 encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
464 } else {
465 encoding = (_defaultEncoding == null) ? US_ASCII : _defaultEncoding;
466 }
467 } else if (bomEnc != null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
468 throw new XmlStreamReaderException(
469 HTTP_EX_1.format(new Object[] {cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}),
470 cTMime,
471 cTEnc,
472 bomEnc,
473 xmlGuessEnc,
474 xmlEnc,
475 is);
476 } else if (cTEnc.equals(UTF_16)) {
477 if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
478 encoding = bomEnc;
479 } else {
480 throw new XmlStreamReaderException(
481 HTTP_EX_2.format(new Object[] {cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}),
482 cTMime,
483 cTEnc,
484 bomEnc,
485 xmlGuessEnc,
486 xmlEnc,
487 is);
488 }
489 } else {
490 encoding = cTEnc;
491 }
492 } else {
493 throw new XmlStreamReaderException(
494 HTTP_EX_3.format(new Object[] {cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}),
495 cTMime,
496 cTEnc,
497 bomEnc,
498 xmlGuessEnc,
499 xmlEnc,
500 is);
501 }
502 }
503 return encoding;
504 }
505
506 // returns MIME type or NULL if httpContentType is NULL
507 private static String getContentTypeMime(String httpContentType) {
508 String mime = null;
509 if (httpContentType != null) {
510 int i = httpContentType.indexOf(";");
511 mime = ((i == -1) ? httpContentType : httpContentType.substring(0, i)).trim();
512 }
513 return mime;
514 }
515
516 private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
517
518 // returns charset parameter value, NULL if not present, NULL if httpContentType is NULL
519 private static String getContentTypeEncoding(String httpContentType) {
520 String encoding = null;
521 if (httpContentType != null) {
522 int i = httpContentType.indexOf(";");
523 if (i > -1) {
524 String postMime = httpContentType.substring(i + 1);
525 Matcher m = CHARSET_PATTERN.matcher(postMime);
526 encoding = (m.find()) ? m.group(1) : null;
527 encoding = (encoding != null) ? encoding.toUpperCase(Locale.ENGLISH) : null;
528 }
529 }
530 return encoding;
531 }
532
533 // returns the BOM in the stream, NULL if not present,
534 // if there was BOM the in the stream it is consumed
535 private static String getBOMEncoding(BufferedInputStream is) throws IOException {
536 String encoding = null;
537 int[] bytes = new int[3];
538 is.mark(3);
539 bytes[0] = is.read();
540 bytes[1] = is.read();
541 bytes[2] = is.read();
542
543 if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
544 encoding = UTF_16BE;
545 is.reset();
546 is.read();
547 is.read();
548 } else if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
549 encoding = UTF_16LE;
550 is.reset();
551 is.read();
552 is.read();
553 } else if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
554 encoding = UTF_8;
555 } else {
556 is.reset();
557 }
558 return encoding;
559 }
560
561 // returns the best guess for the encoding by looking the first bytes of the stream, '<?'
562 private static String getXMLGuessEncoding(BufferedInputStream is) throws IOException {
563 String encoding = null;
564 int[] bytes = new int[4];
565 is.mark(4);
566 bytes[0] = is.read();
567 bytes[1] = is.read();
568 bytes[2] = is.read();
569 bytes[3] = is.read();
570 is.reset();
571
572 if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) {
573 encoding = UTF_16BE;
574 } else if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) {
575 encoding = UTF_16LE;
576 } else if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
577 encoding = UTF_8;
578 } else if (bytes[0] == 0x4C && bytes[1] == 0x6F && bytes[2] == 0xA7 && bytes[3] == 0x94) {
579 encoding = EBCDIC;
580 }
581 return encoding;
582 }
583
584 static final Pattern ENCODING_PATTERN =
585 Pattern.compile("<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE);
586
587 // returns the encoding declared in the <?xml encoding=...?>, NULL if none
588 private static String getXmlProlog(BufferedInputStream is, String guessedEnc) throws IOException {
589 String encoding = null;
590 if (guessedEnc != null) {
591 byte[] bytes = new byte[BUFFER_SIZE];
592 is.mark(BUFFER_SIZE);
593 int offset = 0;
594 int max = BUFFER_SIZE;
595 int c = is.read(bytes, offset, max);
596 int firstGT = -1;
597 String xmlProlog = null;
598 while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
599 offset += c;
600 max -= c;
601 c = is.read(bytes, offset, max);
602 xmlProlog = new String(bytes, 0, offset, guessedEnc);
603 firstGT = xmlProlog.indexOf('>');
604 }
605 if (firstGT == -1) {
606 if (c == -1) {
607 throw new IOException("Unexpected end of XML stream");
608 } else {
609 throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
610 }
611 }
612 int bytesRead = offset;
613 if (bytesRead > 0) {
614 is.reset();
615 BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
616 StringBuilder prolog = new StringBuilder();
617 String line = bReader.readLine();
618 while (line != null) {
619 prolog.append(line);
620 line = bReader.readLine();
621 }
622 Matcher m = ENCODING_PATTERN.matcher(prolog);
623 if (m.find()) {
624 encoding = m.group(1).toUpperCase(Locale.ENGLISH);
625 encoding = encoding.substring(1, encoding.length() - 1);
626 }
627 }
628 }
629 return encoding;
630 }
631
632 // indicates if the MIME type belongs to the APPLICATION XML family
633 private static boolean isAppXml(String mime) {
634 return mime != null
635 && (mime.equals("application/xml")
636 || mime.equals("application/xml-dtd")
637 || mime.equals("application/xml-external-parsed-entity")
638 || (mime.startsWith("application/") && mime.endsWith("+xml")));
639 }
640
641 // indicates if the MIME type belongs to the TEXT XML family
642 private static boolean isTextXml(String mime) {
643 return mime != null
644 && (mime.equals("text/xml")
645 || mime.equals("text/xml-external-parsed-entity")
646 || (mime.startsWith("text/") && mime.endsWith("+xml")));
647 }
648
649 private static final MessageFormat RAW_EX_1 =
650 new MessageFormat("Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
651
652 private static final MessageFormat RAW_EX_2 =
653 new MessageFormat("Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
654
655 private static final MessageFormat HTTP_EX_1 = new MessageFormat(
656 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
657
658 private static final MessageFormat HTTP_EX_2 = new MessageFormat(
659 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
660
661 private static final MessageFormat HTTP_EX_3 = new MessageFormat(
662 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
663 }