View Javadoc
1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package org.codehaus.plexus.util.xml;
18  
19  import java.io.BufferedInputStream;
20  import java.io.BufferedReader;
21  import java.io.File;
22  import java.io.FileInputStream;
23  import java.io.IOException;
24  import java.io.InputStream;
25  import java.io.InputStreamReader;
26  import java.io.Reader;
27  import java.io.StringReader;
28  import java.net.URL;
29  import java.net.URLConnection;
30  import java.net.HttpURLConnection;
31  import java.util.Locale;
32  import java.util.regex.Pattern;
33  import java.util.regex.Matcher;
34  import java.text.MessageFormat;
35  
36  /**
37   * Character stream that handles (or at least attempts to) all the necessary voodoo to figure out the charset encoding of
38   * the XML document within the stream.
39   * <p>
40   * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
41   * <p>
42   * All this has to be done without consuming characters from the stream, otherwise the XML parser will not recognize the
43   * document as a valid XML. This is not 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers right
44   * now, XmlReader handles it and things work in all parsers).
45   * <p>
46   * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering
47   * a wide set of constructors.
48   * <P>
49   * By default the charset encoding detection is lenient; the constructor with the lenient flag can be used for a strict
50   * detection (following HTTP MIME and XML specifications). All this is nicely explained by Mark Pilgrim in his blog,
51   * <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining the character encoding of a
52   * feed</a>.
53   * <p>
54   *
55   * @author Alejandro Abdelnur
56   * @version revision 1.17 taken on 26/06/2007 from Rome (see
57   *          https://rome.dev.java.net/source/browse/rome/src/java/com/sun/syndication/io/XmlReader.java)
58   * @deprecated use XmlStreamReader
59   * @since 1.4.3
60   */
61  public class XmlReader
62      extends Reader
63  {
64      private static final int BUFFER_SIZE = 4096;
65  
66      private static final String UTF_8 = "UTF-8";
67  
68      private static final String US_ASCII = "US-ASCII";
69  
70      private static final String UTF_16BE = "UTF-16BE";
71  
72      private static final String UTF_16LE = "UTF-16LE";
73  
74      private static final String UTF_16 = "UTF-16";
75  
76      private static final String EBCDIC = "CP1047";
77  
78      private static String _staticDefaultEncoding = null;
79  
80      private Reader _reader;
81  
82      private String _encoding;
83  
84      private String _defaultEncoding;
85  
86      /**
87       * Sets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on
88       * content-type are not adequate.
89       * <p/>
90       * If it is set to NULL the content-type based rules are used.
91       * <p/>
92       * By default it is NULL.
93       * <p/>
94       *
95       * @param encoding charset encoding to default to.
96       */
97      public static void setDefaultEncoding( String encoding )
98      {
99          _staticDefaultEncoding = encoding;
100     }
101 
102     /**
103      * Returns the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on
104      * content-type are not adequate.
105      * <p/>
106      * If it is NULL the content-type based rules are used.
107      * <p/>
108      *
109      * @return the default encoding to use.
110      */
111     public static String getDefaultEncoding()
112     {
113         return _staticDefaultEncoding;
114     }
115 
116     /**
117      * Creates a Reader for a File.
118      * <p>
119      * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to
120      * UTF-8.
121      * <p>
122      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
123      * <p>
124      *
125      * @param file File to create a Reader from.
126      * @throws IOException thrown if there is a problem reading the file.
127      */
128     public XmlReader( File file )
129         throws IOException
130     {
131         this( new FileInputStream( file ) );
132     }
133 
134     /**
135      * Creates a Reader for a raw InputStream.
136      * <p>
137      * It follows the same logic used for files.
138      * <p>
139      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
140      * <p>
141      *
142      * @param is InputStream to create a Reader from.
143      * @throws IOException thrown if there is a problem reading the stream.
144      */
145     public XmlReader( InputStream is )
146         throws IOException
147     {
148         this( is, true );
149     }
150 
151     /**
152      * Creates a Reader for a raw InputStream.
153      * <p>
154      * It follows the same logic used for files.
155      * <p>
156      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
157      * following:
158      * <p>
159      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
160      * <p>
161      * Else if the XML prolog had a charset encoding that encoding is used.
162      * <p>
163      * Else if the content type had a charset encoding that encoding is used.
164      * <p>
165      * Else 'UTF-8' is used.
166      * <p>
167      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
168      * <p>
169      *
170      * @param is InputStream to create a Reader from.
171      * @param lenient indicates if the charset encoding detection should be relaxed.
172      * @throws IOException thrown if there is a problem reading the stream.
173      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
174      */
175     public XmlReader( InputStream is, boolean lenient )
176         throws IOException, XmlStreamReaderException
177     {
178         _defaultEncoding = _staticDefaultEncoding;
179         try
180         {
181             doRawStream( is, lenient );
182         }
183         catch ( XmlStreamReaderException ex )
184         {
185             if ( !lenient )
186             {
187                 throw ex;
188             }
189             else
190             {
191                 doLenientDetection( null, ex );
192             }
193         }
194     }
195 
196     /**
197      * Creates a Reader using the InputStream of a URL.
198      * <p>
199      * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic
200      * used for Files.
201      * <p>
202      * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for
203      * an InputStream with content-type.
204      * <p>
205      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
206      * <p>
207      *
208      * @param url URL to create a Reader from.
209      * @throws IOException thrown if there is a problem reading the stream of the URL.
210      */
211     public XmlReader( URL url )
212         throws IOException
213     {
214         this( url.openConnection() );
215     }
216 
217     /**
218      * Creates a Reader using the InputStream of a URLConnection.
219      * <p>
220      * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data
221      * it uses the same logic used for files.
222      * <p>
223      * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic
224      * used for an InputStream with content-type.
225      * <p>
226      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
227      * <p>
228      *
229      * @param conn URLConnection to create a Reader from.
230      * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
231      */
232     public XmlReader( URLConnection conn )
233         throws IOException
234     {
235         _defaultEncoding = _staticDefaultEncoding;
236         boolean lenient = true;
237         if ( conn instanceof HttpURLConnection )
238         {
239             try
240             {
241                 doHttpStream( conn.getInputStream(), conn.getContentType(), lenient );
242             }
243             catch ( XmlStreamReaderException ex )
244             {
245                 doLenientDetection( conn.getContentType(), ex );
246             }
247         }
248         else if ( conn.getContentType() != null )
249         {
250             try
251             {
252                 doHttpStream( conn.getInputStream(), conn.getContentType(), lenient );
253             }
254             catch ( XmlStreamReaderException ex )
255             {
256                 doLenientDetection( conn.getContentType(), ex );
257             }
258         }
259         else
260         {
261             try
262             {
263                 doRawStream( conn.getInputStream(), lenient );
264             }
265             catch ( XmlStreamReaderException ex )
266             {
267                 doLenientDetection( null, ex );
268             }
269         }
270     }
271 
272     /**
273      * Creates a Reader using an InputStream an the associated content-type header.
274      * <p>
275      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
276      * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
277      * encoding mandated by the content-type MIME type.
278      * <p>
279      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
280      * <p>
281      *
282      * @param is InputStream to create the reader from.
283      * @param httpContentType content-type header to use for the resolution of the charset encoding.
284      * @throws IOException thrown if there is a problem reading the file.
285      */
286     public XmlReader( InputStream is, String httpContentType )
287         throws IOException
288     {
289         this( is, httpContentType, true );
290     }
291 
292     /**
293      * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
294      * regarding the encoding detection.
295      * <p>
296      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
297      * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
298      * encoding mandated by the content-type MIME type.
299      * <p>
300      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
301      * following:
302      * <p>
303      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
304      * <p>
305      * Else if the XML prolog had a charset encoding that encoding is used.
306      * <p>
307      * Else if the content type had a charset encoding that encoding is used.
308      * <p>
309      * Else 'UTF-8' is used.
310      * <p>
311      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
312      * <p>
313      *
314      * @param is InputStream to create the reader from.
315      * @param httpContentType content-type header to use for the resolution of the charset encoding.
316      * @param lenient indicates if the charset encoding detection should be relaxed.
317      * @throws IOException thrown if there is a problem reading the file.
318      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
319      */
320     public XmlReader( InputStream is, String httpContentType, boolean lenient, String defaultEncoding )
321         throws IOException, XmlStreamReaderException
322     {
323         _defaultEncoding = ( defaultEncoding == null ) ? _staticDefaultEncoding : defaultEncoding;
324         try
325         {
326             doHttpStream( is, httpContentType, lenient );
327         }
328         catch ( XmlStreamReaderException ex )
329         {
330             if ( !lenient )
331             {
332                 throw ex;
333             }
334             else
335             {
336                 doLenientDetection( httpContentType, ex );
337             }
338         }
339     }
340 
341     /**
342      * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
343      * regarding the encoding detection.
344      * <p>
345      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
346      * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
347      * encoding mandated by the content-type MIME type.
348      * <p>
349      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
350      * following:
351      * <p>
352      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
353      * <p>
354      * Else if the XML prolog had a charset encoding that encoding is used.
355      * <p>
356      * Else if the content type had a charset encoding that encoding is used.
357      * <p>
358      * Else 'UTF-8' is used.
359      * <p>
360      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
361      * <p>
362      *
363      * @param is InputStream to create the reader from.
364      * @param httpContentType content-type header to use for the resolution of the charset encoding.
365      * @param lenient indicates if the charset encoding detection should be relaxed.
366      * @throws IOException thrown if there is a problem reading the file.
367      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
368      */
369     public XmlReader( InputStream is, String httpContentType, boolean lenient )
370         throws IOException, XmlStreamReaderException
371     {
372         this( is, httpContentType, lenient, null );
373     }
374 
375     private void doLenientDetection( String httpContentType, XmlStreamReaderException ex )
376         throws IOException
377     {
378         if ( httpContentType != null )
379         {
380             if ( httpContentType.startsWith( "text/html" ) )
381             {
382                 httpContentType = httpContentType.substring( "text/html".length() );
383                 httpContentType = "text/xml" + httpContentType;
384                 try
385                 {
386                     doHttpStream( ex.getInputStream(), httpContentType, true );
387                     ex = null;
388                 }
389                 catch ( XmlStreamReaderException ex2 )
390                 {
391                     ex = ex2;
392                 }
393             }
394         }
395         if ( ex != null )
396         {
397             String encoding = ex.getXmlEncoding();
398             if ( encoding == null )
399             {
400                 encoding = ex.getContentTypeEncoding();
401             }
402             if ( encoding == null )
403             {
404                 encoding = ( _defaultEncoding == null ) ? UTF_8 : _defaultEncoding;
405             }
406             prepareReader( ex.getInputStream(), encoding );
407         }
408     }
409 
410     /**
411      * Returns the charset encoding of the XmlReader.
412      * <p>
413      *
414      * @return charset encoding.
415      */
416     public String getEncoding()
417     {
418         return _encoding;
419     }
420 
421     public int read( char[] buf, int offset, int len )
422         throws IOException
423     {
424         return _reader.read( buf, offset, len );
425     }
426 
427     /**
428      * Closes the XmlReader stream.
429      * <p>
430      *
431      * @throws IOException thrown if there was a problem closing the stream.
432      */
433     public void close()
434         throws IOException
435     {
436         _reader.close();
437     }
438 
439     private void doRawStream( InputStream is, boolean lenient )
440         throws IOException
441     {
442         BufferedInputStream pis = new BufferedInputStream( is, BUFFER_SIZE );
443         String bomEnc = getBOMEncoding( pis );
444         String xmlGuessEnc = getXMLGuessEncoding( pis );
445         String xmlEnc = getXmlProlog( pis, xmlGuessEnc );
446         String encoding = calculateRawEncoding( bomEnc, xmlGuessEnc, xmlEnc, pis );
447         prepareReader( pis, encoding );
448     }
449 
450     private void doHttpStream( InputStream is, String httpContentType, boolean lenient )
451         throws IOException
452     {
453         BufferedInputStream pis = new BufferedInputStream( is, BUFFER_SIZE );
454         String cTMime = getContentTypeMime( httpContentType );
455         String cTEnc = getContentTypeEncoding( httpContentType );
456         String bomEnc = getBOMEncoding( pis );
457         String xmlGuessEnc = getXMLGuessEncoding( pis );
458         String xmlEnc = getXmlProlog( pis, xmlGuessEnc );
459         String encoding = calculateHttpEncoding( cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis, lenient );
460         prepareReader( pis, encoding );
461     }
462 
463     private void prepareReader( InputStream is, String encoding )
464         throws IOException
465     {
466         _reader = new InputStreamReader( is, encoding );
467         _encoding = encoding;
468     }
469 
470     // InputStream is passed for XmlStreamReaderException creation only
471     private String calculateRawEncoding( String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is )
472         throws IOException
473     {
474         String encoding;
475         if ( bomEnc == null )
476         {
477             if ( xmlGuessEnc == null || xmlEnc == null )
478             {
479                 encoding = ( _defaultEncoding == null ) ? UTF_8 : _defaultEncoding;
480             }
481             else if ( xmlEnc.equals( UTF_16 ) && ( xmlGuessEnc.equals( UTF_16BE ) || xmlGuessEnc.equals( UTF_16LE ) ) )
482             {
483                 encoding = xmlGuessEnc;
484             }
485             else
486             {
487                 encoding = xmlEnc;
488             }
489         }
490         else if ( bomEnc.equals( UTF_8 ) )
491         {
492             if ( xmlGuessEnc != null && !xmlGuessEnc.equals( UTF_8 ) )
493             {
494                 throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ),
495                                                     bomEnc, xmlGuessEnc, xmlEnc, is );
496             }
497             if ( xmlEnc != null && !xmlEnc.equals( UTF_8 ) )
498             {
499                 throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ),
500                                                     bomEnc, xmlGuessEnc, xmlEnc, is );
501             }
502             encoding = UTF_8;
503         }
504         else if ( bomEnc.equals( UTF_16BE ) || bomEnc.equals( UTF_16LE ) )
505         {
506             if ( xmlGuessEnc != null && !xmlGuessEnc.equals( bomEnc ) )
507             {
508                 throw new IOException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ) );
509             }
510             if ( xmlEnc != null && !xmlEnc.equals( UTF_16 ) && !xmlEnc.equals( bomEnc ) )
511             {
512                 throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ),
513                                                     bomEnc, xmlGuessEnc, xmlEnc, is );
514             }
515             encoding = bomEnc;
516         }
517         else
518         {
519             throw new XmlStreamReaderException( RAW_EX_2.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ), bomEnc,
520                                                 xmlGuessEnc, xmlEnc, is );
521         }
522         return encoding;
523     }
524 
525     // InputStream is passed for XmlStreamReaderException creation only
526     private String calculateHttpEncoding( String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc, String xmlEnc,
527                                           InputStream is, boolean lenient )
528         throws IOException
529     {
530         String encoding;
531         if ( lenient & xmlEnc != null )
532         {
533             encoding = xmlEnc;
534         }
535         else
536         {
537             boolean appXml = isAppXml( cTMime );
538             boolean textXml = isTextXml( cTMime );
539             if ( appXml || textXml )
540             {
541                 if ( cTEnc == null )
542                 {
543                     if ( appXml )
544                     {
545                         encoding = calculateRawEncoding( bomEnc, xmlGuessEnc, xmlEnc, is );
546                     }
547                     else
548                     {
549                         encoding = ( _defaultEncoding == null ) ? US_ASCII : _defaultEncoding;
550                     }
551                 }
552                 else if ( bomEnc != null && ( cTEnc.equals( UTF_16BE ) || cTEnc.equals( UTF_16LE ) ) )
553                 {
554                     throw new XmlStreamReaderException( HTTP_EX_1.format( new Object[] { cTMime, cTEnc, bomEnc,
555                         xmlGuessEnc, xmlEnc } ), cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is );
556                 }
557                 else if ( cTEnc.equals( UTF_16 ) )
558                 {
559                     if ( bomEnc != null && bomEnc.startsWith( UTF_16 ) )
560                     {
561                         encoding = bomEnc;
562                     }
563                     else
564                     {
565                         throw new XmlStreamReaderException( HTTP_EX_2.format( new Object[] { cTMime, cTEnc, bomEnc,
566                             xmlGuessEnc, xmlEnc } ), cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is );
567                     }
568                 }
569                 else
570                 {
571                     encoding = cTEnc;
572                 }
573             }
574             else
575             {
576                 throw new XmlStreamReaderException( HTTP_EX_3.format( new Object[] { cTMime, cTEnc, bomEnc, xmlGuessEnc,
577                     xmlEnc } ), cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is );
578             }
579         }
580         return encoding;
581     }
582 
583     // returns MIME type or NULL if httpContentType is NULL
584     private static String getContentTypeMime( String httpContentType )
585     {
586         String mime = null;
587         if ( httpContentType != null )
588         {
589             int i = httpContentType.indexOf( ";" );
590             mime = ( ( i == -1 ) ? httpContentType : httpContentType.substring( 0, i ) ).trim();
591         }
592         return mime;
593     }
594 
595     private static final Pattern CHARSET_PATTERN = Pattern.compile( "charset=([.[^; ]]*)" );
596 
597     // returns charset parameter value, NULL if not present, NULL if httpContentType is NULL
598     private static String getContentTypeEncoding( String httpContentType )
599     {
600         String encoding = null;
601         if ( httpContentType != null )
602         {
603             int i = httpContentType.indexOf( ";" );
604             if ( i > -1 )
605             {
606                 String postMime = httpContentType.substring( i + 1 );
607                 Matcher m = CHARSET_PATTERN.matcher( postMime );
608                 encoding = ( m.find() ) ? m.group( 1 ) : null;
609                 encoding = ( encoding != null ) ? encoding.toUpperCase( Locale.ENGLISH ) : null;
610             }
611         }
612         return encoding;
613     }
614 
615     // returns the BOM in the stream, NULL if not present,
616     // if there was BOM the in the stream it is consumed
617     private static String getBOMEncoding( BufferedInputStream is )
618         throws IOException
619     {
620         String encoding = null;
621         int[] bytes = new int[3];
622         is.mark( 3 );
623         bytes[0] = is.read();
624         bytes[1] = is.read();
625         bytes[2] = is.read();
626 
627         if ( bytes[0] == 0xFE && bytes[1] == 0xFF )
628         {
629             encoding = UTF_16BE;
630             is.reset();
631             is.read();
632             is.read();
633         }
634         else if ( bytes[0] == 0xFF && bytes[1] == 0xFE )
635         {
636             encoding = UTF_16LE;
637             is.reset();
638             is.read();
639             is.read();
640         }
641         else if ( bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF )
642         {
643             encoding = UTF_8;
644         }
645         else
646         {
647             is.reset();
648         }
649         return encoding;
650     }
651 
652     // returns the best guess for the encoding by looking the first bytes of the stream, '<?'
653     private static String getXMLGuessEncoding( BufferedInputStream is )
654         throws IOException
655     {
656         String encoding = null;
657         int[] bytes = new int[4];
658         is.mark( 4 );
659         bytes[0] = is.read();
660         bytes[1] = is.read();
661         bytes[2] = is.read();
662         bytes[3] = is.read();
663         is.reset();
664 
665         if ( bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F )
666         {
667             encoding = UTF_16BE;
668         }
669         else if ( bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00 )
670         {
671             encoding = UTF_16LE;
672         }
673         else if ( bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D )
674         {
675             encoding = UTF_8;
676         }
677         else if ( bytes[0] == 0x4C && bytes[1] == 0x6F && bytes[2] == 0xA7 && bytes[3] == 0x94 )
678         {
679             encoding = EBCDIC;
680         }
681         return encoding;
682     }
683 
684     static final Pattern ENCODING_PATTERN =
685         Pattern.compile( "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE );
686 
687     // returns the encoding declared in the <?xml encoding=...?>, NULL if none
688     private static String getXmlProlog( BufferedInputStream is, String guessedEnc )
689         throws IOException
690     {
691         String encoding = null;
692         if ( guessedEnc != null )
693         {
694             byte[] bytes = new byte[BUFFER_SIZE];
695             is.mark( BUFFER_SIZE );
696             int offset = 0;
697             int max = BUFFER_SIZE;
698             int c = is.read( bytes, offset, max );
699             int firstGT = -1;
700             String xmlProlog = null;
701             while ( c != -1 && firstGT == -1 && offset < BUFFER_SIZE )
702             {
703                 offset += c;
704                 max -= c;
705                 c = is.read( bytes, offset, max );
706                 xmlProlog = new String( bytes, 0, offset, guessedEnc );
707                 firstGT = xmlProlog.indexOf( '>' );
708             }
709             if ( firstGT == -1 )
710             {
711                 if ( c == -1 )
712                 {
713                     throw new IOException( "Unexpected end of XML stream" );
714                 }
715                 else
716                 {
717                     throw new IOException( "XML prolog or ROOT element not found on first " + offset + " bytes" );
718                 }
719             }
720             int bytesRead = offset;
721             if ( bytesRead > 0 )
722             {
723                 is.reset();
724                 BufferedReader bReader =
725                     new BufferedReader( new StringReader( xmlProlog.substring( 0, firstGT + 1 ) ) );
726                 StringBuilder prolog = new StringBuilder();
727                 String line = bReader.readLine();
728                 while ( line != null )
729                 {
730                     prolog.append( line );
731                     line = bReader.readLine();
732                 }
733                 Matcher m = ENCODING_PATTERN.matcher( prolog );
734                 if ( m.find() )
735                 {
736                     encoding = m.group( 1 ).toUpperCase( Locale.ENGLISH );
737                     encoding = encoding.substring( 1, encoding.length() - 1 );
738                 }
739             }
740         }
741         return encoding;
742     }
743 
744     // indicates if the MIME type belongs to the APPLICATION XML family
745     private static boolean isAppXml( String mime )
746     {
747         return mime != null && ( mime.equals( "application/xml" ) || mime.equals( "application/xml-dtd" )
748             || mime.equals( "application/xml-external-parsed-entity" )
749             || ( mime.startsWith( "application/" ) && mime.endsWith( "+xml" ) ) );
750     }
751 
752     // indicates if the MIME type belongs to the TEXT XML family
753     private static boolean isTextXml( String mime )
754     {
755         return mime != null && ( mime.equals( "text/xml" ) || mime.equals( "text/xml-external-parsed-entity" )
756             || ( mime.startsWith( "text/" ) && mime.endsWith( "+xml" ) ) );
757     }
758 
759     private static final MessageFormat RAW_EX_1 =
760         new MessageFormat( "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch" );
761 
762     private static final MessageFormat RAW_EX_2 =
763         new MessageFormat( "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM" );
764 
765     private static final MessageFormat HTTP_EX_1 =
766         new MessageFormat( "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL" );
767 
768     private static final MessageFormat HTTP_EX_2 =
769         new MessageFormat( "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch" );
770 
771     private static final MessageFormat HTTP_EX_3 =
772         new MessageFormat( "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME" );
773 
774 }