1 /*
2 * Copyright 2004 Sun Microsystems, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 */
17 package org.codehaus.plexus.util.xml;
18
19 import java.io.File;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.net.URL;
23 import java.net.URLConnection;
24 import java.nio.file.Path;
25
26 /**
27 * Character stream that handles (or at least attempts to) all the necessary voodoo to figure out the charset encoding of
28 * the XML document within the stream.
29 * <p>
30 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
31 * <p>
32 * All this has to be done without consuming characters from the stream; otherwise the XML parser will not recognize the
33 * document as valid XML. This is not 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers right
34 * now, XmlReader handles it and things work in all parsers).
35 * <p>
36 * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering
37 * a wide set of constructors.
38 * <P>
39 * By default the charset encoding detection is lenient; the constructor with the lenient flag can be used for a strict
40 * detection (following the HTTP MIME and XML specifications). All this is nicely explained by Mark Pilgrim in his blog,
41 * <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining the character encoding of a
42 * feed</a>.
43 * <p>
44 *
45 * @author Alejandro Abdelnur
46 * @version revision 1.17 taken on 26/06/2007 from Rome (see
47 * https://rome.dev.java.net/source/browse/rome/src/java/com/sun/syndication/io/XmlReader.java)
48 * @since 1.4.4
49 */
50 public class XmlStreamReader extends XmlReader {
51 /**
52 * Creates a Reader for a Path.
53 * <p>
54 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to
55 * UTF-8.
56 * <p>
57 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
58 * <p>
59 *
60 * @param path Path to create a Reader from.
61 * @throws IOException thrown if there is a problem reading the file.
62 */
63 public XmlStreamReader(Path path) throws IOException {
64 super(path);
65 }
66
67 /**
68 * Creates a Reader for a File.
69 * <p>
70 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to
71 * UTF-8.
72 * <p>
73 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
74 * <p>
75 *
76 * @param file File to create a Reader from.
77 * @throws IOException thrown if there is a problem reading the file.
78 */
79 public XmlStreamReader(File file) throws IOException {
80 this(file.toPath());
81 }
82
83 /**
84 * Creates a Reader for a raw InputStream.
85 * <p>
86 * It follows the same logic used for files.
87 * <p>
88 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
89 * <p>
90 *
91 * @param is InputStream to create a Reader from.
92 * @throws IOException thrown if there is a problem reading the stream.
93 */
94 public XmlStreamReader(InputStream is) throws IOException {
95 super(is);
96 }
97
98 /**
99 * Creates a Reader for a raw InputStream.
100 * <p>
101 * It follows the same logic used for files.
102 * <p>
103 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
104 * following:
105 * <p>
106 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
107 * <p>
108 * Else if the XML prolog had a charset encoding that encoding is used.
109 * <p>
110 * Else if the content type had a charset encoding that encoding is used.
111 * <p>
112 * Else 'UTF-8' is used.
113 * <p>
114 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
115 * <p>
116 *
117 * @param is InputStream to create a Reader from.
118 * @param lenient indicates if the charset encoding detection should be relaxed.
119 * @throws IOException thrown if there is a problem reading the stream.
120 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
121 */
122 public XmlStreamReader(InputStream is, boolean lenient) throws IOException, XmlStreamReaderException {
123 super(is, lenient);
124 }
125
126 /**
127 * Creates a Reader using the InputStream of a URL.
128 * <p>
129 * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic
130 * used for Files.
131 * <p>
132 * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for
133 * an InputStream with content-type.
134 * <p>
135 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
136 * <p>
137 *
138 * @param url URL to create a Reader from.
139 * @throws IOException thrown if there is a problem reading the stream of the URL.
140 */
141 public XmlStreamReader(URL url) throws IOException {
142 super(url);
143 }
144
145 /**
146 * Creates a Reader using the InputStream of a URLConnection.
147 * <p>
148 * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data
149 * it uses the same logic used for files.
150 * <p>
151 * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic
152 * used for an InputStream with content-type.
153 * <p>
154 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
155 * <p>
156 *
157 * @param conn URLConnection to create a Reader from.
158 * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
159 */
160 public XmlStreamReader(URLConnection conn) throws IOException {
161 super(conn);
162 }
163
164 /**
165 * Creates a Reader using an InputStream an the associated content-type header.
166 * <p>
167 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
168 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
169 * encoding mandated by the content-type MIME type.
170 * <p>
171 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
172 * <p>
173 *
174 * @param is InputStream to create the reader from.
175 * @param httpContentType content-type header to use for the resolution of the charset encoding.
176 * @throws IOException thrown if there is a problem reading the file.
177 */
178 public XmlStreamReader(InputStream is, String httpContentType) throws IOException {
179 super(is, httpContentType);
180 }
181
182 /**
183 * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
184 * regarding the encoding detection.
185 * <p>
186 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
187 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
188 * encoding mandated by the content-type MIME type.
189 * <p>
190 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
191 * following:
192 * <p>
193 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
194 * <p>
195 * Else if the XML prolog had a charset encoding that encoding is used.
196 * <p>
197 * Else if the content type had a charset encoding that encoding is used.
198 * <p>
199 * Else 'UTF-8' is used.
200 * <p>
201 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
202 * <p>
203 *
204 * @param is InputStream to create the reader from.
205 * @param httpContentType content-type header to use for the resolution of the charset encoding.
206 * @param lenient indicates if the charset encoding detection should be relaxed.
207 * @param defaultEncoding encoding to use
208 * @throws IOException thrown if there is a problem reading the file.
209 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
210 */
211 public XmlStreamReader(InputStream is, String httpContentType, boolean lenient, String defaultEncoding)
212 throws IOException, XmlStreamReaderException {
213 super(is, httpContentType, lenient, defaultEncoding);
214 }
215
216 /**
217 * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
218 * regarding the encoding detection.
219 * <p>
220 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
221 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
222 * encoding mandated by the content-type MIME type.
223 * <p>
224 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
225 * following:
226 * <p>
227 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
228 * <p>
229 * Else if the XML prolog had a charset encoding that encoding is used.
230 * <p>
231 * Else if the content type had a charset encoding that encoding is used.
232 * <p>
233 * Else 'UTF-8' is used.
234 * <p>
235 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
236 * <p>
237 *
238 * @param is InputStream to create the reader from.
239 * @param httpContentType content-type header to use for the resolution of the charset encoding.
240 * @param lenient indicates if the charset encoding detection should be relaxed.
241 * @throws IOException thrown if there is a problem reading the file.
242 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
243 */
244 public XmlStreamReader(InputStream is, String httpContentType, boolean lenient)
245 throws IOException, XmlStreamReaderException {
246 super(is, httpContentType, lenient);
247 }
248 }