1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.codehaus.plexus.util.xml;
18
19 import java.io.BufferedInputStream;
20 import java.io.BufferedReader;
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.io.Reader;
26 import java.io.StringReader;
27 import java.net.HttpURLConnection;
28 import java.net.URL;
29 import java.net.URLConnection;
30 import java.nio.file.Files;
31 import java.nio.file.Path;
32 import java.text.MessageFormat;
33 import java.util.Locale;
34 import java.util.regex.Matcher;
35 import java.util.regex.Pattern;
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61 @Deprecated
62 public class XmlReader extends Reader {
63 private static final int BUFFER_SIZE = 4096;
64
65 private static final String UTF_8 = "UTF-8";
66
67 private static final String US_ASCII = "US-ASCII";
68
69 private static final String UTF_16BE = "UTF-16BE";
70
71 private static final String UTF_16LE = "UTF-16LE";
72
73 private static final String UTF_16 = "UTF-16";
74
75 private static final String EBCDIC = "CP1047";
76
77 private static String _staticDefaultEncoding = null;
78
79 private Reader _reader;
80
81 private String _encoding;
82
83 private String _defaultEncoding;
84
85
86
87
88
89
90
91
92
93
94
95 public static void setDefaultEncoding(String encoding) {
96 _staticDefaultEncoding = encoding;
97 }
98
99
100
101
102
103
104
105
106
107 public static String getDefaultEncoding() {
108 return _staticDefaultEncoding;
109 }
110
111
112
113
114
115
116
117
118
119
120
121
122
123 public XmlReader(Path path) throws IOException {
124 this(Files.newInputStream(path));
125 }
126
127
128
129
130
131
132
133
134
135
136
137
138
139 public XmlReader(File file) throws IOException {
140 this(file.toPath());
141 }
142
143
144
145
146
147
148
149
150
151
152
153
154 public XmlReader(InputStream is) throws IOException {
155 this(is, true);
156 }
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182 public XmlReader(InputStream is, boolean lenient) throws IOException, XmlStreamReaderException {
183 _defaultEncoding = _staticDefaultEncoding;
184 try {
185 doRawStream(is, lenient);
186 } catch (XmlStreamReaderException ex) {
187 if (!lenient) {
188 throw ex;
189 } else {
190 doLenientDetection(null, ex);
191 }
192 }
193 }
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210 public XmlReader(URL url) throws IOException {
211 this(url.openConnection());
212 }
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229 public XmlReader(URLConnection conn) throws IOException {
230 _defaultEncoding = _staticDefaultEncoding;
231 boolean lenient = true;
232 if (conn instanceof HttpURLConnection) {
233 try {
234 doHttpStream(conn.getInputStream(), conn.getContentType(), lenient);
235 } catch (XmlStreamReaderException ex) {
236 doLenientDetection(conn.getContentType(), ex);
237 }
238 } else if (conn.getContentType() != null) {
239 try {
240 doHttpStream(conn.getInputStream(), conn.getContentType(), lenient);
241 } catch (XmlStreamReaderException ex) {
242 doLenientDetection(conn.getContentType(), ex);
243 }
244 } else {
245 try {
246 doRawStream(conn.getInputStream(), lenient);
247 } catch (XmlStreamReaderException ex) {
248 doLenientDetection(null, ex);
249 }
250 }
251 }
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267 public XmlReader(InputStream is, String httpContentType) throws IOException {
268 this(is, httpContentType, true);
269 }
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300 public XmlReader(InputStream is, String httpContentType, boolean lenient, String defaultEncoding)
301 throws IOException, XmlStreamReaderException {
302 _defaultEncoding = (defaultEncoding == null) ? _staticDefaultEncoding : defaultEncoding;
303 try {
304 doHttpStream(is, httpContentType, lenient);
305 } catch (XmlStreamReaderException ex) {
306 if (!lenient) {
307 throw ex;
308 } else {
309 doLenientDetection(httpContentType, ex);
310 }
311 }
312 }
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342 public XmlReader(InputStream is, String httpContentType, boolean lenient)
343 throws IOException, XmlStreamReaderException {
344 this(is, httpContentType, lenient, null);
345 }
346
347 private void doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException {
348 if (httpContentType != null) {
349 if (httpContentType.startsWith("text/html")) {
350 httpContentType = httpContentType.substring("text/html".length());
351 httpContentType = "text/xml" + httpContentType;
352 try {
353 doHttpStream(ex.getInputStream(), httpContentType, true);
354 ex = null;
355 } catch (XmlStreamReaderException ex2) {
356 ex = ex2;
357 }
358 }
359 }
360 if (ex != null) {
361 String encoding = ex.getXmlEncoding();
362 if (encoding == null) {
363 encoding = ex.getContentTypeEncoding();
364 }
365 if (encoding == null) {
366 encoding = (_defaultEncoding == null) ? UTF_8 : _defaultEncoding;
367 }
368 prepareReader(ex.getInputStream(), encoding);
369 }
370 }
371
372
373
374
375
376
377
378 public String getEncoding() {
379 return _encoding;
380 }
381
382 @Override
383 public int read(char[] buf, int offset, int len) throws IOException {
384 return _reader.read(buf, offset, len);
385 }
386
387
388
389
390
391
392
393 @Override
394 public void close() throws IOException {
395 _reader.close();
396 }
397
398 private void doRawStream(InputStream is, boolean lenient) throws IOException {
399 BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE);
400 String bomEnc = getBOMEncoding(pis);
401 String xmlGuessEnc = getXMLGuessEncoding(pis);
402 String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
403 String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
404 prepareReader(pis, encoding);
405 }
406
407 private void doHttpStream(InputStream is, String httpContentType, boolean lenient) throws IOException {
408 BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE);
409 String cTMime = getContentTypeMime(httpContentType);
410 String cTEnc = getContentTypeEncoding(httpContentType);
411 String bomEnc = getBOMEncoding(pis);
412 String xmlGuessEnc = getXMLGuessEncoding(pis);
413 String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
414 String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis, lenient);
415 prepareReader(pis, encoding);
416 }
417
418 private void prepareReader(InputStream is, String encoding) throws IOException {
419 _reader = new InputStreamReader(is, encoding);
420 _encoding = encoding;
421 }
422
423
424 private String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is)
425 throws IOException {
426 String encoding;
427 if (bomEnc == null) {
428 if (xmlGuessEnc == null || xmlEnc == null) {
429 encoding = (_defaultEncoding == null) ? UTF_8 : _defaultEncoding;
430 } else if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
431 encoding = xmlGuessEnc;
432 } else {
433 encoding = xmlEnc;
434 }
435 } else if (bomEnc.equals(UTF_8)) {
436 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
437 throw new XmlStreamReaderException(
438 RAW_EX_1.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}), bomEnc, xmlGuessEnc, xmlEnc, is);
439 }
440 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
441 throw new XmlStreamReaderException(
442 RAW_EX_1.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}), bomEnc, xmlGuessEnc, xmlEnc, is);
443 }
444 encoding = UTF_8;
445 } else if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
446 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)
447 || xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
448 throw new XmlStreamReaderException(
449 RAW_EX_1.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}), bomEnc, xmlGuessEnc, xmlEnc, is);
450 }
451 encoding = bomEnc;
452 } else {
453 throw new XmlStreamReaderException(
454 RAW_EX_2.format(new Object[] {bomEnc, xmlGuessEnc, xmlEnc}), bomEnc, xmlGuessEnc, xmlEnc, is);
455 }
456 return encoding;
457 }
458
459
460 private String calculateHttpEncoding(
461 String cTMime,
462 String cTEnc,
463 String bomEnc,
464 String xmlGuessEnc,
465 String xmlEnc,
466 InputStream is,
467 boolean lenient)
468 throws IOException {
469 String encoding;
470 if (lenient & xmlEnc != null) {
471 encoding = xmlEnc;
472 } else {
473 boolean appXml = isAppXml(cTMime);
474 boolean textXml = isTextXml(cTMime);
475 if (appXml || textXml) {
476 if (cTEnc == null) {
477 if (appXml) {
478 encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
479 } else {
480 encoding = (_defaultEncoding == null) ? US_ASCII : _defaultEncoding;
481 }
482 } else if (bomEnc != null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
483 throw new XmlStreamReaderException(
484 HTTP_EX_1.format(new Object[] {cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}),
485 cTMime,
486 cTEnc,
487 bomEnc,
488 xmlGuessEnc,
489 xmlEnc,
490 is);
491 } else if (cTEnc.equals(UTF_16)) {
492 if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
493 encoding = bomEnc;
494 } else {
495 throw new XmlStreamReaderException(
496 HTTP_EX_2.format(new Object[] {cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}),
497 cTMime,
498 cTEnc,
499 bomEnc,
500 xmlGuessEnc,
501 xmlEnc,
502 is);
503 }
504 } else {
505 encoding = cTEnc;
506 }
507 } else {
508 throw new XmlStreamReaderException(
509 HTTP_EX_3.format(new Object[] {cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}),
510 cTMime,
511 cTEnc,
512 bomEnc,
513 xmlGuessEnc,
514 xmlEnc,
515 is);
516 }
517 }
518 return encoding;
519 }
520
521
522 private static String getContentTypeMime(String httpContentType) {
523 String mime = null;
524 if (httpContentType != null) {
525 int i = httpContentType.indexOf(";");
526 mime = ((i == -1) ? httpContentType : httpContentType.substring(0, i)).trim();
527 }
528 return mime;
529 }
530
531 private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
532
533
534 private static String getContentTypeEncoding(String httpContentType) {
535 String encoding = null;
536 if (httpContentType != null) {
537 int i = httpContentType.indexOf(";");
538 if (i > -1) {
539 String postMime = httpContentType.substring(i + 1);
540 Matcher m = CHARSET_PATTERN.matcher(postMime);
541 encoding = (m.find()) ? m.group(1) : null;
542 encoding = (encoding != null) ? encoding.toUpperCase(Locale.ENGLISH) : null;
543 }
544 }
545 return encoding;
546 }
547
548
549
550 private static String getBOMEncoding(BufferedInputStream is) throws IOException {
551 String encoding = null;
552 int[] bytes = new int[3];
553 is.mark(3);
554 bytes[0] = is.read();
555 bytes[1] = is.read();
556 bytes[2] = is.read();
557
558 if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
559 encoding = UTF_16BE;
560 is.reset();
561 is.read();
562 is.read();
563 } else if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
564 encoding = UTF_16LE;
565 is.reset();
566 is.read();
567 is.read();
568 } else if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
569 encoding = UTF_8;
570 } else {
571 is.reset();
572 }
573 return encoding;
574 }
575
576
577 private static String getXMLGuessEncoding(BufferedInputStream is) throws IOException {
578 String encoding = null;
579 int[] bytes = new int[4];
580 is.mark(4);
581 bytes[0] = is.read();
582 bytes[1] = is.read();
583 bytes[2] = is.read();
584 bytes[3] = is.read();
585 is.reset();
586
587 if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) {
588 encoding = UTF_16BE;
589 } else if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) {
590 encoding = UTF_16LE;
591 } else if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
592 encoding = UTF_8;
593 } else if (bytes[0] == 0x4C && bytes[1] == 0x6F && bytes[2] == 0xA7 && bytes[3] == 0x94) {
594 encoding = EBCDIC;
595 }
596 return encoding;
597 }
598
599 static final Pattern ENCODING_PATTERN =
600 Pattern.compile("<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE);
601
602
603 private static String getXmlProlog(BufferedInputStream is, String guessedEnc) throws IOException {
604 String encoding = null;
605 if (guessedEnc != null) {
606 byte[] bytes = new byte[BUFFER_SIZE];
607 is.mark(BUFFER_SIZE);
608 int offset = 0;
609 int max = BUFFER_SIZE;
610 int c = is.read(bytes, offset, max);
611 int firstGT = -1;
612 String xmlProlog = null;
613 while (c != -1 && firstGT == -1 && offset < BUFFER_SIZE) {
614 offset += c;
615 max -= c;
616 c = is.read(bytes, offset, max);
617 xmlProlog = new String(bytes, 0, offset, guessedEnc);
618 firstGT = xmlProlog.indexOf('>');
619 }
620 if (firstGT == -1) {
621 if (c == -1) {
622 throw new IOException("Unexpected end of XML stream");
623 } else {
624 throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
625 }
626 }
627 int bytesRead = offset;
628 if (bytesRead > 0) {
629 is.reset();
630 BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
631 StringBuilder prolog = new StringBuilder();
632 String line = bReader.readLine();
633 while (line != null) {
634 prolog.append(line);
635 line = bReader.readLine();
636 }
637 Matcher m = ENCODING_PATTERN.matcher(prolog);
638 if (m.find()) {
639 encoding = m.group(1).toUpperCase(Locale.ENGLISH);
640 encoding = encoding.substring(1, encoding.length() - 1);
641 }
642 }
643 }
644 return encoding;
645 }
646
647
648 private static boolean isAppXml(String mime) {
649 return mime != null
650 && (mime.equals("application/xml")
651 || mime.equals("application/xml-dtd")
652 || mime.equals("application/xml-external-parsed-entity")
653 || (mime.startsWith("application/") && mime.endsWith("+xml")));
654 }
655
656
657 private static boolean isTextXml(String mime) {
658 return mime != null
659 && (mime.equals("text/xml")
660 || mime.equals("text/xml-external-parsed-entity")
661 || (mime.startsWith("text/") && mime.endsWith("+xml")));
662 }
663
664 private static final MessageFormat RAW_EX_1 =
665 new MessageFormat("Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
666
667 private static final MessageFormat RAW_EX_2 =
668 new MessageFormat("Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
669
670 private static final MessageFormat HTTP_EX_1 = new MessageFormat(
671 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
672
673 private static final MessageFormat HTTP_EX_2 = new MessageFormat(
674 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
675
676 private static final MessageFormat HTTP_EX_3 = new MessageFormat(
677 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
678 }