View Javadoc
1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package org.codehaus.plexus.util.xml;
18  
19  import java.io.BufferedInputStream;
20  import java.io.BufferedReader;
21  import java.io.File;
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.io.InputStreamReader;
25  import java.io.Reader;
26  import java.io.StringReader;
27  import java.net.HttpURLConnection;
28  import java.net.URL;
29  import java.net.URLConnection;
30  import java.nio.file.Files;
31  import java.text.MessageFormat;
32  import java.util.Locale;
33  import java.util.regex.Matcher;
34  import java.util.regex.Pattern;
35  
36  /**
37   * <p>Character stream that handles (or at least attempts to) all the necessary voodoo to figure out the charset encoding of
38   * the XML document within the stream.</p>
39   *
40   * <p>IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.</p>
41   *
42   * <p>All this has to be done without consuming characters from the stream; otherwise the XML parser will not recognize the
43   * document as valid XML. This is not 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers right
44   * now, XmlReader handles it and things work in all parsers).</p>
45   *
46   * <p>The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering
47   * a wide set of constructors.</p>
48   *
49   * <p>By default the charset encoding detection is lenient; the constructor with the lenient flag can be used for a strict
50   * (following HTTP MIME and XML specifications) detection. All this is nicely explained by Mark Pilgrim in his blog,
51   * <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining the character encoding of a
52   * feed</a>.</p>
53   *
54   * @author Alejandro Abdelnur
55   * @version revision 1.17 taken on 26/06/2007 from Rome (see
56   *          https://rome.dev.java.net/source/browse/rome/src/java/com/sun/syndication/io/XmlReader.java)
57   * @deprecated use XmlStreamReader
58   * @since 1.4.3
59   */
60  @Deprecated
61  public class XmlReader extends Reader {
62      private static final int BUFFER_SIZE = 4096; // size of the BufferedInputStream wrapped around the input and of the prolog sniffing window
63      // Charset names that the detection logic compares against and returns.
64      private static final String UTF_8 = "UTF-8";
65  
66      private static final String US_ASCII = "US-ASCII";
67  
68      private static final String UTF_16BE = "UTF-16BE";
69  
70      private static final String UTF_16LE = "UTF-16LE";
71  
72      private static final String UTF_16 = "UTF-16";
73      // Charset name returned by getXMLGuessEncoding for the 0x4C 0x6F 0xA7 0x94 byte signature.
74      private static final String EBCDIC = "CP1047";
75      // Class-wide fallback encoding (null by default); the constructors copy it into _defaultEncoding.
76      private static String _staticDefaultEncoding = null;
77      // Decoding reader that read() and close() delegate to; assigned by prepareReader.
78      private Reader _reader;
79      // Name of the charset _reader was created with; exposed through getEncoding().
80      private String _encoding;
81      // Per-instance fallback encoding consulted when detection yields nothing; may be null.
82      private String _defaultEncoding;
83  
84      /**
85       * <p>Sets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on
86       * content-type are not adequate.</p>
87       *
88       * <p>If it is set to NULL the content-type based rules are used.</p>
89       *
90       * <p>By default it is NULL.</p>
91       *
92       * @param encoding charset encoding to default to.
93       */
94      public static void setDefaultEncoding(String encoding) {
95          _staticDefaultEncoding = encoding;
96      }
97  
98      /**
99       * <p>Returns the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on
100      * content-type are not adequate.</p>
101      *
102      * <p>If it is NULL the content-type based rules are used.</p>
103      *
104      * @return the default encoding to use.
105      */
106     public static String getDefaultEncoding() {
107         return _staticDefaultEncoding;
108     }
109 
110     /**
111      * Creates a Reader for a File.
112      * <p>
113      * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to
114      * UTF-8.
115      * <p>
116      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
117      * <p>
118      *
119      * @param file File to create a Reader from.
120      * @throws IOException thrown if there is a problem reading the file.
121      */
122     public XmlReader(File file) throws IOException {
123         this(Files.newInputStream(file.toPath()));
124     }
125 
126     /**
127      * Creates a Reader for a raw InputStream.
128      * <p>
129      * It follows the same logic used for files.
130      * <p>
131      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
132      * <p>
133      *
134      * @param is InputStream to create a Reader from.
135      * @throws IOException thrown if there is a problem reading the stream.
136      */
137     public XmlReader(InputStream is) throws IOException {
138         this(is, true);
139     }
140 
141     /**
142      * Creates a Reader for a raw InputStream.
143      * <p>
144      * It follows the same logic used for files.
145      * <p>
146      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
147      * following:
148      * <p>
149      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
150      * <p>
151      * Else if the XML prolog had a charset encoding that encoding is used.
152      * <p>
153      * Else if the content type had a charset encoding that encoding is used.
154      * <p>
155      * Else 'UTF-8' is used.
156      * <p>
157      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
158      * <p>
159      *
160      * @param is InputStream to create a Reader from.
161      * @param lenient indicates if the charset encoding detection should be relaxed.
162      * @throws IOException thrown if there is a problem reading the stream.
163      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
164      */
165     public XmlReader(InputStream is, boolean lenient) throws IOException, XmlStreamReaderException {
166         _defaultEncoding = _staticDefaultEncoding;
167         try {
168             doRawStream(is, lenient);
169         } catch (XmlStreamReaderException ex) {
170             if (!lenient) {
171                 throw ex;
172             } else {
173                 doLenientDetection(null, ex);
174             }
175         }
176     }
177 
178     /**
179      * Creates a Reader using the InputStream of a URL.
180      * <p>
181      * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic
182      * used for Files.
183      * <p>
184      * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for
185      * an InputStream with content-type.
186      * <p>
187      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
188      * <p>
189      *
190      * @param url URL to create a Reader from.
191      * @throws IOException thrown if there is a problem reading the stream of the URL.
192      */
193     public XmlReader(URL url) throws IOException {
194         this(url.openConnection());
195     }
196 
197     /**
198      * Creates a Reader using the InputStream of a URLConnection.
199      * <p>
200      * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data
201      * it uses the same logic used for files.
202      * <p>
203      * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic
204      * used for an InputStream with content-type.
205      * <p>
206      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
207      * <p>
208      *
209      * @param conn URLConnection to create a Reader from.
210      * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
211      */
212     public XmlReader(URLConnection conn) throws IOException {
213         _defaultEncoding = _staticDefaultEncoding;
214         boolean lenient = true;
215         if (conn instanceof HttpURLConnection) {
216             try {
217                 doHttpStream(conn.getInputStream(), conn.getContentType(), lenient);
218             } catch (XmlStreamReaderException ex) {
219                 doLenientDetection(conn.getContentType(), ex);
220             }
221         } else if (conn.getContentType() != null) {
222             try {
223                 doHttpStream(conn.getInputStream(), conn.getContentType(), lenient);
224             } catch (XmlStreamReaderException ex) {
225                 doLenientDetection(conn.getContentType(), ex);
226             }
227         } else {
228             try {
229                 doRawStream(conn.getInputStream(), lenient);
230             } catch (XmlStreamReaderException ex) {
231                 doLenientDetection(null, ex);
232             }
233         }
234     }
235 
236     /**
237      * Creates a Reader using an InputStream an the associated content-type header.
238      * <p>
239      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
240      * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
241      * encoding mandated by the content-type MIME type.
242      * <p>
243      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
244      * <p>
245      *
246      * @param is InputStream to create the reader from.
247      * @param httpContentType content-type header to use for the resolution of the charset encoding.
248      * @throws IOException thrown if there is a problem reading the file.
249      */
250     public XmlReader(InputStream is, String httpContentType) throws IOException {
251         this(is, httpContentType, true);
252     }
253 
254     /**
255      * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
256      * regarding the encoding detection.
257      * <p>
258      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
259      * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
260      * encoding mandated by the content-type MIME type.
261      * <p>
262      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
263      * following:
264      * <p>
265      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
266      * <p>
267      * Else if the XML prolog had a charset encoding that encoding is used.
268      * <p>
269      * Else if the content type had a charset encoding that encoding is used.
270      * <p>
271      * Else 'UTF-8' is used.
272      * <p>
273      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
274      * <p>
275      *
276      * @param is InputStream to create the reader from.
277      * @param httpContentType content-type header to use for the resolution of the charset encoding.
278      * @param lenient indicates if the charset encoding detection should be relaxed.
279      * @param defaultEncoding encoding to use
280      * @throws IOException thrown if there is a problem reading the file.
281      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
282      */
283     public XmlReader(InputStream is, String httpContentType, boolean lenient, String defaultEncoding)
284             throws IOException, XmlStreamReaderException {
285         _defaultEncoding = (defaultEncoding == null) ? _staticDefaultEncoding : defaultEncoding;
286         try {
287             doHttpStream(is, httpContentType, lenient);
288         } catch (XmlStreamReaderException ex) {
289             if (!lenient) {
290                 throw ex;
291             } else {
292                 doLenientDetection(httpContentType, ex);
293             }
294         }
295     }
296 
297     /**
298      * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
299      * regarding the encoding detection.
300      * <p>
301      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
302      * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
303      * encoding mandated by the content-type MIME type.
304      * <p>
305      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
306      * following:
307      * <p>
308      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
309      * <p>
310      * Else if the XML prolog had a charset encoding that encoding is used.
311      * <p>
312      * Else if the content type had a charset encoding that encoding is used.
313      * <p>
314      * Else 'UTF-8' is used.
315      * <p>
316      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
317      * <p>
318      *
319      * @param is InputStream to create the reader from.
320      * @param httpContentType content-type header to use for the resolution of the charset encoding.
321      * @param lenient indicates if the charset encoding detection should be relaxed.
322      * @throws IOException thrown if there is a problem reading the file.
323      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
324      */
325     public XmlReader(InputStream is, String httpContentType, boolean lenient)
326             throws IOException, XmlStreamReaderException {
327         this(is, httpContentType, lenient, null);
328     }
329 
330     private void doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException {
331         if (httpContentType != null) {
332             if (httpContentType.startsWith("text/html")) {
333                 httpContentType = httpContentType.substring("text/html".length());
334                 httpContentType = "text/xml" + httpContentType;
335                 try {
336                     doHttpStream(ex.getInputStream(), httpContentType, true);
337                     ex = null;
338                 } catch (XmlStreamReaderException ex2) {
339                     ex = ex2;
340                 }
341             }
342         }
343         if (ex != null) {
344             String encoding = ex.getXmlEncoding();
345             if (encoding == null) {
346                 encoding = ex.getContentTypeEncoding();
347             }
348             if (encoding == null) {
349                 encoding = (_defaultEncoding == null) ? UTF_8 : _defaultEncoding;
350             }
351             prepareReader(ex.getInputStream(), encoding);
352         }
353     }
354 
355     /**
356      * Returns the charset encoding of the XmlReader.
357      * <p>
358      *
359      * @return charset encoding.
360      */
361     public String getEncoding() {
362         return _encoding;
363     }
364 
365     @Override
366     public int read(char[] buf, int offset, int len) throws IOException {
367         return _reader.read(buf, offset, len);
368     }
369 
370     /**
371      * Closes the XmlReader stream.
372      * <p>
373      *
374      * @throws IOException thrown if there was a problem closing the stream.
375      */
376     @Override
377     public void close() throws IOException {
378         _reader.close();
379     }
380 
381     private void doRawStream(InputStream is, boolean lenient) throws IOException {
382         BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE);
383         String bomEnc = getBOMEncoding(pis);
384         String xmlGuessEnc = getXMLGuessEncoding(pis);
385         String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
386         String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
387         prepareReader(pis, encoding);
388     }
389 
390     private void doHttpStream(InputStream is, String httpContentType, boolean lenient) throws IOException {
391         BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE);
392         String cTMime = getContentTypeMime(httpContentType);
393         String cTEnc = getContentTypeEncoding(httpContentType);
394         String bomEnc = getBOMEncoding(pis);
395         String xmlGuessEnc = getXMLGuessEncoding(pis);
396         String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
397         String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis, lenient);
398         prepareReader(pis, encoding);
399     }
400 
401     private void prepareReader(InputStream is, String encoding) throws IOException {
402         _reader = new InputStreamReader(is, encoding);
403         _encoding = encoding;
404     }
405 
406     // InputStream is passed for XmlStreamReaderException creation only
407     private String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is)
408             throws IOException {
409         String encoding;
410         if (bomEnc == null) {
411             if (xmlGuessEnc == null || xmlEnc == null) {
412                 encoding = (_defaultEncoding == null) ? UTF_8 : _defaultEncoding;
413             } else if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
414                 encoding = xmlGuessEnc;
415             } else {
416                 encoding = xmlEnc;
417             }
418         } else if (bomEnc.equals(UTF_8)) {
419             if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
420                 throw new XmlStreamReaderException(
421                         RAW_EX_1.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}), bomEnc, xmlGuessEnc, xmlEnc, is);
422             }
423             if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
424                 throw new XmlStreamReaderException(
425                         RAW_EX_1.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}), bomEnc, xmlGuessEnc, xmlEnc, is);
426             }
427             encoding = UTF_8;
428         } else if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
429             if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
430                 throw new IOException(RAW_EX_1.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}));
431             }
432             if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
433                 throw new XmlStreamReaderException(
434                         RAW_EX_1.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}), bomEnc, xmlGuessEnc, xmlEnc, is);
435             }
436             encoding = bomEnc;
437         } else {
438             throw new XmlStreamReaderException(
439                     RAW_EX_2.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}), bomEnc, xmlGuessEnc, xmlEnc, is);
440         }
441         return encoding;
442     }
443 
444     // InputStream is passed for XmlStreamReaderException creation only
445     private String calculateHttpEncoding(
446             String cTMime,
447             String cTEnc,
448             String bomEnc,
449             String xmlGuessEnc,
450             String xmlEnc,
451             InputStream is,
452             boolean lenient)
453             throws IOException {
454         String encoding;
455         if (lenient & xmlEnc != null) {
456             encoding = xmlEnc;
457         } else {
458             boolean appXml = isAppXml(cTMime);
459             boolean textXml = isTextXml(cTMime);
460             if (appXml || textXml) {
461                 if (cTEnc == null) {
462                     if (appXml) {
463                         encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
464                     } else {
465                         encoding = (_defaultEncoding == null) ? US_ASCII : _defaultEncoding;
466                     }
467                 } else if (bomEnc != null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
468                     throw new XmlStreamReaderException(
469                             HTTP_EX_1.format(new Object[] {cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}),
470                             cTMime,
471                             cTEnc,
472                             bomEnc,
473                             xmlGuessEnc,
474                             xmlEnc,
475                             is);
476                 } else if (cTEnc.equals(UTF_16)) {
477                     if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
478                         encoding = bomEnc;
479                     } else {
480                         throw new XmlStreamReaderException(
481                                 HTTP_EX_2.format(new Object[] {cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}),
482                                 cTMime,
483                                 cTEnc,
484                                 bomEnc,
485                                 xmlGuessEnc,
486                                 xmlEnc,
487                                 is);
488                     }
489                 } else {
490                     encoding = cTEnc;
491                 }
492             } else {
493                 throw new XmlStreamReaderException(
494                         HTTP_EX_3.format(new Object[] {cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}),
495                         cTMime,
496                         cTEnc,
497                         bomEnc,
498                         xmlGuessEnc,
499                         xmlEnc,
500                         is);
501             }
502         }
503         return encoding;
504     }
505 
506     // returns MIME type or NULL if httpContentType is NULL
507     private static String getContentTypeMime(String httpContentType) {
508         String mime = null;
509         if (httpContentType != null) {
510             int i = httpContentType.indexOf(";");
511             mime = ((i == -1) ? httpContentType : httpContentType.substring(0, i)).trim();
512         }
513         return mime;
514     }
515 
516     private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
517 
518     // returns charset parameter value, NULL if not present, NULL if httpContentType is NULL
519     private static String getContentTypeEncoding(String httpContentType) {
520         String encoding = null;
521         if (httpContentType != null) {
522             int i = httpContentType.indexOf(";");
523             if (i > -1) {
524                 String postMime = httpContentType.substring(i + 1);
525                 Matcher m = CHARSET_PATTERN.matcher(postMime);
526                 encoding = (m.find()) ? m.group(1) : null;
527                 encoding = (encoding != null) ? encoding.toUpperCase(Locale.ENGLISH) : null;
528             }
529         }
530         return encoding;
531     }
532 
533     // returns the BOM in the stream, NULL if not present,
534     // if there was BOM the in the stream it is consumed
535     private static String getBOMEncoding(BufferedInputStream is) throws IOException {
536         String encoding = null;
537         int[] bytes = new int[3];
538         is.mark(3);
539         bytes[0] = is.read();
540         bytes[1] = is.read();
541         bytes[2] = is.read();
542 
543         if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
544             encoding = UTF_16BE;
545             is.reset();
546             is.read();
547             is.read();
548         } else if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
549             encoding = UTF_16LE;
550             is.reset();
551             is.read();
552             is.read();
553         } else if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
554             encoding = UTF_8;
555         } else {
556             is.reset();
557         }
558         return encoding;
559     }
560 
561     // returns the best guess for the encoding by looking the first bytes of the stream, '<?'
562     private static String getXMLGuessEncoding(BufferedInputStream is) throws IOException {
563         String encoding = null;
564         int[] bytes = new int[4];
565         is.mark(4);
566         bytes[0] = is.read();
567         bytes[1] = is.read();
568         bytes[2] = is.read();
569         bytes[3] = is.read();
570         is.reset();
571 
572         if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) {
573             encoding = UTF_16BE;
574         } else if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) {
575             encoding = UTF_16LE;
576         } else if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
577             encoding = UTF_8;
578         } else if (bytes[0] == 0x4C && bytes[1] == 0x6F && bytes[2] == 0xA7 && bytes[3] == 0x94) {
579             encoding = EBCDIC;
580         }
581         return encoding;
582     }
583 
584     static final Pattern ENCODING_PATTERN =
585             Pattern.compile("<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE);
586 
587     // returns the encoding declared in the <?xml encoding=...?>, NULL if none
588     private static String getXmlProlog(BufferedInputStream is, String guessedEnc) throws IOException {
589         String encoding = null;
590         if (guessedEnc != null) {
591             byte[] bytes = new byte[BUFFER_SIZE];
592             is.mark(BUFFER_SIZE);
593             int offset = 0;
594             int max = BUFFER_SIZE;
595             int c = is.read(bytes, offset, max);
596             int firstGT = -1;
597             String xmlProlog = null;
598             while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
599                 offset += c;
600                 max -= c;
601                 c = is.read(bytes, offset, max);
602                 xmlProlog = new String(bytes, 0, offset, guessedEnc);
603                 firstGT = xmlProlog.indexOf('>');
604             }
605             if (firstGT == -1) {
606                 if (c == -1) {
607                     throw new IOException("Unexpected end of XML stream");
608                 } else {
609                     throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
610                 }
611             }
612             int bytesRead = offset;
613             if (bytesRead > 0) {
614                 is.reset();
615                 BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
616                 StringBuilder prolog = new StringBuilder();
617                 String line = bReader.readLine();
618                 while (line != null) {
619                     prolog.append(line);
620                     line = bReader.readLine();
621                 }
622                 Matcher m = ENCODING_PATTERN.matcher(prolog);
623                 if (m.find()) {
624                     encoding = m.group(1).toUpperCase(Locale.ENGLISH);
625                     encoding = encoding.substring(1, encoding.length() - 1);
626                 }
627             }
628         }
629         return encoding;
630     }
631 
632     // indicates if the MIME type belongs to the APPLICATION XML family
633     private static boolean isAppXml(String mime) {
634         return mime != null
635                 && (mime.equals("application/xml")
636                         || mime.equals("application/xml-dtd")
637                         || mime.equals("application/xml-external-parsed-entity")
638                         || (mime.startsWith("application/") && mime.endsWith("+xml")));
639     }
640 
641     // indicates if the MIME type belongs to the TEXT XML family
642     private static boolean isTextXml(String mime) {
643         return mime != null
644                 && (mime.equals("text/xml")
645                         || mime.equals("text/xml-external-parsed-entity")
646                         || (mime.startsWith("text/") && mime.endsWith("+xml")));
647     }
648     // Message templates for the encoding-detection errors raised by calculateRawEncoding / calculateHttpEncoding.
        // NOTE(review): java.text.MessageFormat documents its instances as not synchronized, and these are shared
        // statics - confirm readers are never constructed concurrently, or format through a per-call instance.
649     private static final MessageFormat RAW_EX_1 =
650             new MessageFormat("Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
651     // Raw stream: the BOM value is none of UTF-8, UTF-16BE, UTF-16LE.
652     private static final MessageFormat RAW_EX_2 =
653             new MessageFormat("Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
654     // HTTP stream: content-type charset is UTF-16BE/LE but a BOM is present.
655     private static final MessageFormat HTTP_EX_1 = new MessageFormat(
656             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
657     // HTTP stream: content-type charset is UTF-16 but there is no UTF-16 BOM.
658     private static final MessageFormat HTTP_EX_2 = new MessageFormat(
659             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
660     // HTTP stream: the MIME type is neither an application XML nor a text XML type.
661     private static final MessageFormat HTTP_EX_3 = new MessageFormat(
662             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
663 }