View Javadoc
1   package org.codehaus.plexus.util.xml.pull;
2   
3   /*
4    * Copyright The Codehaus Foundation.
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
/**
 * Table of named entities and their replacement texts.
 *
 * <p>The table is stored as parallel arrays: for every index {@code i < entityEnd} the name, the name as a
 * character buffer, the replacement text, the replacement text as a character buffer and a hash of the name
 * are found at position {@code i} of the respective array. Entries are stored in the order in which they
 * appear in the array passed to the constructor.</p>
 *
 * <p>A replacement text of the form {@code &name;} is treated as an alias: if {@code name} was defined by an
 * earlier row, the earlier entity's replacement text is stored instead. Texts starting with {@code &#} and
 * all other texts are stored literally.</p>
 */
public class EntityReplacementMap {
    /** Entity names; only indices below {@link #entityEnd} are populated. */
    final String[] entityName;

    /** Character-array form of each entry of {@link #entityName}. */
    final char[][] entityNameBuf;

    /** Replacement text of each entity, after alias resolution. */
    final String[] entityReplacement;

    /** Character-array form of each entry of {@link #entityReplacement}. */
    final char[][] entityReplacementBuf;

    /** Number of entities defined so far; also the index the next definition is written to. */
    int entityEnd;

    /** Result of {@link #fastHash(char[], int, int)} over each entity name. */
    final int[] entityNameHash;

    /**
     * Builds a map from a list of {@code {name, replacementText}} pairs.
     *
     * @param replacements rows whose element 0 is the entity name and element 1 the replacement text; rows are
     *            processed in order, so a row may alias (via {@code &name;}) any entity defined by an earlier row
     */
    public EntityReplacementMap(String[][] replacements) {
        int length = replacements.length;
        entityName = new String[length];
        entityNameBuf = new char[length][];
        entityReplacement = new String[length];
        entityReplacementBuf = new char[length][];
        entityNameHash = new int[length];

        for (String[] replacement : replacements) {
            defineEntityReplacementText(replacement[0], replacement[1]);
        }
    }

    /**
     * Appends one entity definition at index {@link #entityEnd} and advances {@link #entityEnd}.
     *
     * @param entityName name of the entity being defined
     * @param replacementText literal replacement text, or a reference {@code &other;} to an entity that has
     *            already been defined, in which case that entity's replacement text is stored
     */
    private void defineEntityReplacementText(String entityName, String replacementText) {
        // Only a genuine "&name;" reference is resolved. Blindly stripping the first and last character
        // would let a literal such as "1abc2" be mistaken for a reference to an entity named "abc".
        if (isNamedEntityReference(replacementText)) {
            String referencedName = replacementText.substring(1, replacementText.length() - 1);
            // No early exit: when a name was defined more than once, the most recent definition wins.
            for (int i = 0; i < entityEnd; i++) {
                if (this.entityName[i].equals(referencedName)) {
                    replacementText = entityReplacement[i];
                }
            }
        }

        // this is to make sure that if interning works we will take advantage of it ...
        char[] entityNameCharData = entityName.toCharArray();
        this.entityName[entityEnd] = newString(entityNameCharData, 0, entityName.length());
        entityNameBuf[entityEnd] = entityNameCharData;

        entityReplacement[entityEnd] = replacementText;
        entityReplacementBuf[entityEnd] = replacementText.toCharArray();
        entityNameHash[entityEnd] = fastHash(entityNameCharData, 0, entityNameCharData.length);
        ++entityEnd;
        // TODO disallow < or & in entity replacement text (or ]]>???)
        // TODO keepEntityNormalizedForAttributeValue cached as well ...
    }

    /**
     * Tells whether {@code text} has the shape {@code &...;} without being a character reference
     * ({@code &#...}).
     */
    private static boolean isNamedEntityReference(String text) {
        int length = text.length();
        return length > 1 && text.charAt(0) == '&' && text.charAt(1) != '#' && text.charAt(length - 1) == ';';
    }

    /** Creates a new string from the given slice of {@code cbuf}. */
    private String newString(char[] cbuf, int off, int len) {
        return new String(cbuf, off, len);
    }

    /**
     * Simplistic hash function that takes <b>constant</b> time to compute: it samples at most four characters
     * (first, last and, for longer inputs, the ones at 1/4 and 1/2 of the length). This means diminishing hash
     * quality for long strings, but for XML parsing it should be good enough.
     *
     * <p>NOTE(review): the stored hashes are presumably compared against hashes computed the same way by the
     * parser that consumes this map - confirm before changing the algorithm.</p>
     *
     * @param ch buffer holding the characters
     * @param off index of the first character to hash
     * @param len number of characters to hash
     * @return the hash, or {@code 0} when {@code len} is zero
     */
    private static int fastHash(char[] ch, int off, int len) {
        if (len == 0) {
            return 0;
        }
        int hash = ch[off]; // hash at beginning
        hash = (hash << 7) + ch[off + len - 1]; // hash at the end
        if (len > 16) {
            hash = (hash << 7) + ch[off + (len / 4)]; // 1/4 from beginning
        }
        if (len > 8) {
            hash = (hash << 7) + ch[off + (len / 2)]; // 1/2 of string size ...
        }
        // The hash is shifted by 7 bits at most three times (21 bits). For 8-bit characters the result
        // therefore stays within 29 bits; a char is 16 bits wide, though, so larger characters can make
        // the int wrap around.
        return hash;
    }

    /**
     * Predefined map of named character entities, grouped below into Latin-1, special and symbol entities
     * (the names appear to correspond to the HTML 4 character entity sets).
     */
    public static final EntityReplacementMap defaultEntityReplacementMap = new EntityReplacementMap(new String[][] {
        {"nbsp", "\u00a0"},
        {"iexcl", "\u00a1"},
        {"cent", "\u00a2"},
        {"pound", "\u00a3"},
        {"curren", "\u00a4"},
        {"yen", "\u00a5"},
        {"brvbar", "\u00a6"},
        {"sect", "\u00a7"},
        {"uml", "\u00a8"},
        {"copy", "\u00a9"},
        {"ordf", "\u00aa"},
        {"laquo", "\u00ab"},
        {"not", "\u00ac"},
        {"shy", "\u00ad"},
        {"reg", "\u00ae"},
        {"macr", "\u00af"},
        {"deg", "\u00b0"},
        {"plusmn", "\u00b1"},
        {"sup2", "\u00b2"},
        {"sup3", "\u00b3"},
        {"acute", "\u00b4"},
        {"micro", "\u00b5"},
        {"para", "\u00b6"},
        {"middot", "\u00b7"},
        {"cedil", "\u00b8"},
        {"sup1", "\u00b9"},
        {"ordm", "\u00ba"},
        {"raquo", "\u00bb"},
        {"frac14", "\u00bc"},
        {"frac12", "\u00bd"},
        {"frac34", "\u00be"},
        {"iquest", "\u00bf"},
        {"Agrave", "\u00c0"},
        {"Aacute", "\u00c1"},
        {"Acirc", "\u00c2"},
        {"Atilde", "\u00c3"},
        {"Auml", "\u00c4"},
        {"Aring", "\u00c5"},
        {"AElig", "\u00c6"},
        {"Ccedil", "\u00c7"},
        {"Egrave", "\u00c8"},
        {"Eacute", "\u00c9"},
        {"Ecirc", "\u00ca"},
        {"Euml", "\u00cb"},
        {"Igrave", "\u00cc"},
        {"Iacute", "\u00cd"},
        {"Icirc", "\u00ce"},
        {"Iuml", "\u00cf"},
        {"ETH", "\u00d0"},
        {"Ntilde", "\u00d1"},
        {"Ograve", "\u00d2"},
        {"Oacute", "\u00d3"},
        {"Ocirc", "\u00d4"},
        {"Otilde", "\u00d5"},
        {"Ouml", "\u00d6"},
        {"times", "\u00d7"},
        {"Oslash", "\u00d8"},
        {"Ugrave", "\u00d9"},
        {"Uacute", "\u00da"},
        {"Ucirc", "\u00db"},
        {"Uuml", "\u00dc"},
        {"Yacute", "\u00dd"},
        {"THORN", "\u00de"},
        {"szlig", "\u00df"},
        {"agrave", "\u00e0"},
        {"aacute", "\u00e1"},
        {"acirc", "\u00e2"},
        {"atilde", "\u00e3"},
        {"auml", "\u00e4"},
        {"aring", "\u00e5"},
        {"aelig", "\u00e6"},
        {"ccedil", "\u00e7"},
        {"egrave", "\u00e8"},
        {"eacute", "\u00e9"},
        {"ecirc", "\u00ea"},
        {"euml", "\u00eb"},
        {"igrave", "\u00ec"},
        {"iacute", "\u00ed"},
        {"icirc", "\u00ee"},
        {"iuml", "\u00ef"},
        {"eth", "\u00f0"},
        {"ntilde", "\u00f1"},
        {"ograve", "\u00f2"},
        {"oacute", "\u00f3"},
        {"ocirc", "\u00f4"},
        {"otilde", "\u00f5"},
        {"ouml", "\u00f6"},
        {"divide", "\u00f7"},
        {"oslash", "\u00f8"},
        {"ugrave", "\u00f9"},
        {"uacute", "\u00fa"},
        {"ucirc", "\u00fb"},
        {"uuml", "\u00fc"},
        {"yacute", "\u00fd"},
        {"thorn", "\u00fe"},
        {"yuml", "\u00ff"},

        // ----------------------------------------------------------------------
        // Special entities
        // ----------------------------------------------------------------------

        {"OElig", "\u0152"},
        {"oelig", "\u0153"},
        {"Scaron", "\u0160"},
        {"scaron", "\u0161"},
        {"Yuml", "\u0178"},
        {"circ", "\u02c6"},
        {"tilde", "\u02dc"},
        {"ensp", "\u2002"},
        {"emsp", "\u2003"},
        {"thinsp", "\u2009"},
        {"zwnj", "\u200c"},
        {"zwj", "\u200d"},
        {"lrm", "\u200e"},
        {"rlm", "\u200f"},
        {"ndash", "\u2013"},
        {"mdash", "\u2014"},
        {"lsquo", "\u2018"},
        {"rsquo", "\u2019"},
        {"sbquo", "\u201a"},
        {"ldquo", "\u201c"},
        {"rdquo", "\u201d"},
        {"bdquo", "\u201e"},
        {"dagger", "\u2020"},
        {"Dagger", "\u2021"},
        {"permil", "\u2030"},
        {"lsaquo", "\u2039"},
        {"rsaquo", "\u203a"},
        {"euro", "\u20ac"},

        // ----------------------------------------------------------------------
        // Symbol entities
        // ----------------------------------------------------------------------

        {"fnof", "\u0192"},
        {"Alpha", "\u0391"},
        {"Beta", "\u0392"},
        {"Gamma", "\u0393"},
        {"Delta", "\u0394"},
        {"Epsilon", "\u0395"},
        {"Zeta", "\u0396"},
        {"Eta", "\u0397"},
        {"Theta", "\u0398"},
        {"Iota", "\u0399"},
        {"Kappa", "\u039a"},
        {"Lambda", "\u039b"},
        {"Mu", "\u039c"},
        {"Nu", "\u039d"},
        {"Xi", "\u039e"},
        {"Omicron", "\u039f"},
        {"Pi", "\u03a0"},
        {"Rho", "\u03a1"},
        {"Sigma", "\u03a3"},
        {"Tau", "\u03a4"},
        {"Upsilon", "\u03a5"},
        {"Phi", "\u03a6"},
        {"Chi", "\u03a7"},
        {"Psi", "\u03a8"},
        {"Omega", "\u03a9"},
        {"alpha", "\u03b1"},
        {"beta", "\u03b2"},
        {"gamma", "\u03b3"},
        {"delta", "\u03b4"},
        {"epsilon", "\u03b5"},
        {"zeta", "\u03b6"},
        {"eta", "\u03b7"},
        {"theta", "\u03b8"},
        {"iota", "\u03b9"},
        {"kappa", "\u03ba"},
        {"lambda", "\u03bb"},
        {"mu", "\u03bc"},
        {"nu", "\u03bd"},
        {"xi", "\u03be"},
        {"omicron", "\u03bf"},
        {"pi", "\u03c0"},
        {"rho", "\u03c1"},
        {"sigmaf", "\u03c2"},
        {"sigma", "\u03c3"},
        {"tau", "\u03c4"},
        {"upsilon", "\u03c5"},
        {"phi", "\u03c6"},
        {"chi", "\u03c7"},
        {"psi", "\u03c8"},
        {"omega", "\u03c9"},
        {"thetasym", "\u03d1"},
        {"upsih", "\u03d2"},
        {"piv", "\u03d6"},
        {"bull", "\u2022"},
        {"hellip", "\u2026"},
        {"prime", "\u2032"},
        {"Prime", "\u2033"},
        {"oline", "\u203e"},
        {"frasl", "\u2044"},
        {"weierp", "\u2118"},
        {"image", "\u2111"},
        {"real", "\u211c"},
        {"trade", "\u2122"},
        {"alefsym", "\u2135"},
        {"larr", "\u2190"},
        {"uarr", "\u2191"},
        {"rarr", "\u2192"},
        {"darr", "\u2193"},
        {"harr", "\u2194"},
        {"crarr", "\u21b5"},
        {"lArr", "\u21d0"},
        {"uArr", "\u21d1"},
        {"rArr", "\u21d2"},
        {"dArr", "\u21d3"},
        {"hArr", "\u21d4"},
        {"forall", "\u2200"},
        {"part", "\u2202"},
        {"exist", "\u2203"},
        {"empty", "\u2205"},
        {"nabla", "\u2207"},
        {"isin", "\u2208"},
        {"notin", "\u2209"},
        {"ni", "\u220b"},
        {"prod", "\u220f"},
        {"sum", "\u2211"},
        {"minus", "\u2212"},
        {"lowast", "\u2217"},
        {"radic", "\u221a"},
        {"prop", "\u221d"},
        {"infin", "\u221e"},
        {"ang", "\u2220"},
        {"and", "\u2227"},
        {"or", "\u2228"},
        {"cap", "\u2229"},
        {"cup", "\u222a"},
        {"int", "\u222b"},
        {"there4", "\u2234"},
        {"sim", "\u223c"},
        {"cong", "\u2245"},
        {"asymp", "\u2248"},
        {"ne", "\u2260"},
        {"equiv", "\u2261"},
        {"le", "\u2264"},
        {"ge", "\u2265"},
        {"sub", "\u2282"},
        {"sup", "\u2283"},
        {"nsub", "\u2284"},
        {"sube", "\u2286"},
        {"supe", "\u2287"},
        {"oplus", "\u2295"},
        {"otimes", "\u2297"},
        {"perp", "\u22a5"},
        {"sdot", "\u22c5"},
        {"lceil", "\u2308"},
        {"rceil", "\u2309"},
        {"lfloor", "\u230a"},
        {"rfloor", "\u230b"},
        {"lang", "\u2329"},
        {"rang", "\u232a"},
        {"loz", "\u25ca"},
        {"spades", "\u2660"},
        {"clubs", "\u2663"},
        {"hearts", "\u2665"},
        {"diams", "\u2666"}
    });
}