View Javadoc
1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package org.codehaus.plexus.util.xml;
18  
19  import java.io.File;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.net.URL;
23  import java.net.URLConnection;
24  import java.nio.file.Path;
25  
26  /**
27   * Character stream that handles (or at least attempts to) all the necessary Voodo to figure out the charset encoding of
28   * the XML document within the stream.
29   * <p>
30   * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
31   * <p>
32   * All this has to be done without consuming characters from the stream, if not the XML parser will not recognized the
33   * document as a valid XML. This is not 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers right
34   * now, XmlReader handles it and things work in all parsers).
35   * <p>
36   * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering
37   * a wide set of constructors.
38   * <P>
39   * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for an script
40   * (following HTTP MIME and XML specifications). All this is nicely explained by Mark Pilgrim in his blog,
41   * <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining the character encoding of a
42   * feed</a>.
43   * <p>
44   *
45   * @author Alejandro Abdelnur
46   * @version revision 1.17 taken on 26/06/2007 from Rome (see
47   *          https://rome.dev.java.net/source/browse/rome/src/java/com/sun/syndication/io/XmlReader.java)
48   * @since 1.4.4
49   */
50  public class XmlStreamReader extends XmlReader {
51      /**
52       * Creates a Reader for a Path.
53       * <p>
54       * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to
55       * UTF-8.
56       * <p>
57       * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
58       * <p>
59       *
60       * @param path Path to create a Reader from.
61       * @throws IOException thrown if there is a problem reading the file.
62       */
63      public XmlStreamReader(Path path) throws IOException {
64          super(path);
65      }
66  
67      /**
68       * Creates a Reader for a File.
69       * <p>
70       * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to
71       * UTF-8.
72       * <p>
73       * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
74       * <p>
75       *
76       * @param file File to create a Reader from.
77       * @throws IOException thrown if there is a problem reading the file.
78       */
79      public XmlStreamReader(File file) throws IOException {
80          this(file.toPath());
81      }
82  
83      /**
84       * Creates a Reader for a raw InputStream.
85       * <p>
86       * It follows the same logic used for files.
87       * <p>
88       * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
89       * <p>
90       *
91       * @param is InputStream to create a Reader from.
92       * @throws IOException thrown if there is a problem reading the stream.
93       */
94      public XmlStreamReader(InputStream is) throws IOException {
95          super(is);
96      }
97  
98      /**
99       * Creates a Reader for a raw InputStream.
100      * <p>
101      * It follows the same logic used for files.
102      * <p>
103      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
104      * following:
105      * <p>
106      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
107      * <p>
108      * Else if the XML prolog had a charset encoding that encoding is used.
109      * <p>
110      * Else if the content type had a charset encoding that encoding is used.
111      * <p>
112      * Else 'UTF-8' is used.
113      * <p>
114      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
115      * <p>
116      *
117      * @param is InputStream to create a Reader from.
118      * @param lenient indicates if the charset encoding detection should be relaxed.
119      * @throws IOException thrown if there is a problem reading the stream.
120      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
121      */
122     public XmlStreamReader(InputStream is, boolean lenient) throws IOException, XmlStreamReaderException {
123         super(is, lenient);
124     }
125 
126     /**
127      * Creates a Reader using the InputStream of a URL.
128      * <p>
129      * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic
130      * used for Files.
131      * <p>
132      * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for
133      * an InputStream with content-type.
134      * <p>
135      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
136      * <p>
137      *
138      * @param url URL to create a Reader from.
139      * @throws IOException thrown if there is a problem reading the stream of the URL.
140      */
141     public XmlStreamReader(URL url) throws IOException {
142         super(url);
143     }
144 
145     /**
146      * Creates a Reader using the InputStream of a URLConnection.
147      * <p>
148      * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data
149      * it uses the same logic used for files.
150      * <p>
151      * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic
152      * used for an InputStream with content-type.
153      * <p>
154      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
155      * <p>
156      *
157      * @param conn URLConnection to create a Reader from.
158      * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
159      */
160     public XmlStreamReader(URLConnection conn) throws IOException {
161         super(conn);
162     }
163 
164     /**
165      * Creates a Reader using an InputStream an the associated content-type header.
166      * <p>
167      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
168      * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
169      * encoding mandated by the content-type MIME type.
170      * <p>
171      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
172      * <p>
173      *
174      * @param is InputStream to create the reader from.
175      * @param httpContentType content-type header to use for the resolution of the charset encoding.
176      * @throws IOException thrown if there is a problem reading the file.
177      */
178     public XmlStreamReader(InputStream is, String httpContentType) throws IOException {
179         super(is, httpContentType);
180     }
181 
182     /**
183      * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
184      * regarding the encoding detection.
185      * <p>
186      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
187      * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
188      * encoding mandated by the content-type MIME type.
189      * <p>
190      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
191      * following:
192      * <p>
193      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
194      * <p>
195      * Else if the XML prolog had a charset encoding that encoding is used.
196      * <p>
197      * Else if the content type had a charset encoding that encoding is used.
198      * <p>
199      * Else 'UTF-8' is used.
200      * <p>
201      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
202      * <p>
203      *
204      * @param is InputStream to create the reader from.
205      * @param httpContentType content-type header to use for the resolution of the charset encoding.
206      * @param lenient indicates if the charset encoding detection should be relaxed.
207      * @param defaultEncoding encoding to use
208      * @throws IOException thrown if there is a problem reading the file.
209      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
210      */
211     public XmlStreamReader(InputStream is, String httpContentType, boolean lenient, String defaultEncoding)
212             throws IOException, XmlStreamReaderException {
213         super(is, httpContentType, lenient, defaultEncoding);
214     }
215 
216     /**
217      * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
218      * regarding the encoding detection.
219      * <p>
220      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
221      * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
222      * encoding mandated by the content-type MIME type.
223      * <p>
224      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
225      * following:
226      * <p>
227      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
228      * <p>
229      * Else if the XML prolog had a charset encoding that encoding is used.
230      * <p>
231      * Else if the content type had a charset encoding that encoding is used.
232      * <p>
233      * Else 'UTF-8' is used.
234      * <p>
235      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
236      * <p>
237      *
238      * @param is InputStream to create the reader from.
239      * @param httpContentType content-type header to use for the resolution of the charset encoding.
240      * @param lenient indicates if the charset encoding detection should be relaxed.
241      * @throws IOException thrown if there is a problem reading the file.
242      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
243      */
244     public XmlStreamReader(InputStream is, String httpContentType, boolean lenient)
245             throws IOException, XmlStreamReaderException {
246         super(is, httpContentType, lenient);
247     }
248 }