/*
 * Copyright 2004 Sun Microsystems, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package org.codehaus.plexus.util.xml;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Path;

/**
 * Character stream that takes care of (or at least attempts to) all the voodoo needed to work out the charset
 * encoding of the XML document carried by the stream.
 * <p>
 * IMPORTANT: this class is unrelated to org.xml.sax.XMLReader. This one IS a character stream.
 * <p>
 * The detection has to happen without consuming characters from the stream, otherwise the XML parser will not
 * recognize the document as valid XML. This is not 100% true, but it is close enough (the UTF-8 BOM is not handled
 * by all parsers right now; XmlReader handles it so that things work in all parsers).
 * <p>
 * Charset encoding detection is offered for XML documents in files, raw streams and HTTP streams through a wide set
 * of constructors, each of which simply forwards to the matching {@link XmlReader} constructor.
 * <p>
 * By default the charset encoding detection is lenient; the constructors taking a lenient flag can be used for a
 * strict detection (following the HTTP MIME and XML specifications). All of this is nicely explained by Mark Pilgrim
 * in his blog, <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining the character
 * encoding of a feed</a>.
 *
 * @author Alejandro Abdelnur
 * @version revision 1.17 taken on 26/06/2007 from Rome (see
 *          https://rome.dev.java.net/source/browse/rome/src/java/com/sun/syndication/io/XmlReader.java)
 * @since 1.4.4
 */
public class XmlStreamReader extends XmlReader {
    /**
     * Creates a Reader for a Path.
     * <p>
     * The UTF-8 BOM is looked for first; if there is none the XML prolog charset is sniffed; if that is missing as
     * well, UTF-8 is used.
     * <p>
     * The charset encoding detection is lenient; see the constructors with the lenient parameter for details.
     *
     * @param path Path to create a Reader from.
     * @throws IOException thrown if there is a problem reading the file.
     */
    public XmlStreamReader(Path path) throws IOException {
        super(path);
    }

    /**
     * Creates a Reader for a File.
     * <p>
     * The file is converted with {@link File#toPath()} and then handled exactly like a Path: the UTF-8 BOM is looked
     * for first; if there is none the XML prolog charset is sniffed; if that is missing as well, UTF-8 is used.
     * <p>
     * The charset encoding detection is lenient; see the constructors with the lenient parameter for details.
     *
     * @param file File to create a Reader from.
     * @throws IOException thrown if there is a problem reading the file.
     */
    public XmlStreamReader(File file) throws IOException {
        super(file.toPath());
    }

    /**
     * Creates a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * <p>
     * The charset encoding detection is lenient; see the constructors with the lenient parameter for details.
     *
     * @param is InputStream to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream.
     */
    public XmlStreamReader(InputStream is) throws IOException {
        super(is);
    }

    /**
     * Creates a Reader for a raw InputStream, with selectable leniency.
     * <p>
     * It follows the same logic used for files.
     * <p>
     * If lenient detection is requested and the detection above fails as per the specifications, the following is
     * attempted:
     * <ul>
     * <li>If the content type was 'text/html' it is replaced with 'text/xml' and the detection is tried again.</li>
     * <li>Else if the XML prolog had a charset encoding, that encoding is used.</li>
     * <li>Else if the content type had a charset encoding, that encoding is used.</li>
     * <li>Else 'UTF-8' is used.</li>
     * </ul>
     * If lenient detection is requested an XmlStreamReaderException is never thrown.
     *
     * @param is InputStream to create a Reader from.
     * @param lenient indicates if the charset encoding detection should be relaxed.
     * @throws IOException thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the
     *             specs.
     */
    public XmlStreamReader(InputStream is, boolean lenient) throws IOException, XmlStreamReaderException {
        super(is, lenient);
    }

    /**
     * Creates a Reader using the InputStream of a URL.
     * <p>
     * If the URL is not of type HTTP and there is no 'content-type' header in the fetched data, the same logic as
     * for files is used.
     * <p>
     * If the URL is an HTTP URL or there is a 'content-type' header in the fetched data, the same logic as for an
     * InputStream with content-type is used.
     * <p>
     * The charset encoding detection is lenient; see the constructors with the lenient parameter for details.
     *
     * @param url URL to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream of the URL.
     */
    public XmlStreamReader(URL url) throws IOException {
        super(url);
    }

    /**
     * Creates a Reader using the InputStream of a URLConnection.
     * <p>
     * If the URLConnection is not of type HttpURLConnection and there is no 'content-type' header in the fetched
     * data, the same logic as for files is used.
     * <p>
     * If the URLConnection is an HTTP one or there is a 'content-type' header in the fetched data, the same logic as
     * for an InputStream with content-type is used.
     * <p>
     * The charset encoding detection is lenient; see the constructors with the lenient parameter for details.
     *
     * @param conn URLConnection to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
     */
    public XmlStreamReader(URLConnection conn) throws IOException {
        super(conn);
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type header.
     * <p>
     * First the stream is checked for a BOM. If there is no BOM the content-type encoding is checked. If there is no
     * content-type encoding the XML prolog encoding is checked. If there is no XML prolog encoding the default
     * encoding mandated by the content-type MIME type is used.
     * <p>
     * The charset encoding detection is lenient; see the constructors with the lenient parameter for details.
     *
     * @param is InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @throws IOException thrown if there is a problem reading the stream.
     */
    public XmlStreamReader(InputStream is, String httpContentType) throws IOException {
        super(is, httpContentType);
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type header, with selectable leniency and an
     * explicit default encoding.
     * <p>
     * First the stream is checked for a BOM. If there is no BOM the content-type encoding is checked. If there is no
     * content-type encoding the XML prolog encoding is checked. If there is no XML prolog encoding the default
     * encoding mandated by the content-type MIME type is used.
     * <p>
     * If lenient detection is requested and the detection above fails as per the specifications, the following is
     * attempted:
     * <ul>
     * <li>If the content type was 'text/html' it is replaced with 'text/xml' and the detection is tried again.</li>
     * <li>Else if the XML prolog had a charset encoding, that encoding is used.</li>
     * <li>Else if the content type had a charset encoding, that encoding is used.</li>
     * <li>Else 'UTF-8' is used.</li>
     * </ul>
     * If lenient detection is requested an XmlStreamReaderException is never thrown.
     *
     * @param is InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @param lenient indicates if the charset encoding detection should be relaxed.
     * @param defaultEncoding encoding to use; it is handed unchanged to {@link XmlReader} (NOTE(review): presumably
     *            the fallback when nothing can be detected - confirm against XmlReader).
     * @throws IOException thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the
     *             specs.
     */
    public XmlStreamReader(InputStream is, String httpContentType, boolean lenient, String defaultEncoding)
            throws IOException, XmlStreamReaderException {
        super(is, httpContentType, lenient, defaultEncoding);
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type header, with selectable leniency.
     * <p>
     * First the stream is checked for a BOM. If there is no BOM the content-type encoding is checked. If there is no
     * content-type encoding the XML prolog encoding is checked. If there is no XML prolog encoding the default
     * encoding mandated by the content-type MIME type is used.
     * <p>
     * If lenient detection is requested and the detection above fails as per the specifications, the following is
     * attempted:
     * <ul>
     * <li>If the content type was 'text/html' it is replaced with 'text/xml' and the detection is tried again.</li>
     * <li>Else if the XML prolog had a charset encoding, that encoding is used.</li>
     * <li>Else if the content type had a charset encoding, that encoding is used.</li>
     * <li>Else 'UTF-8' is used.</li>
     * </ul>
     * If lenient detection is requested an XmlStreamReaderException is never thrown.
     *
     * @param is InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @param lenient indicates if the charset encoding detection should be relaxed.
     * @throws IOException thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the
     *             specs.
     */
    public XmlStreamReader(InputStream is, String httpContentType, boolean lenient)
            throws IOException, XmlStreamReaderException {
        super(is, httpContentType, lenient);
    }
}