2d9a832917ac8d252a9292fd4a90615267e08c3b
[debian/jabref.git] / src / java / net / sf / jabref / imports / HTMLConverter.java
1 /*  Copyright (C) 2003-2012 JabRef contributors.
2     This program is free software; you can redistribute it and/or modify
3     it under the terms of the GNU General Public License as published by
4     the Free Software Foundation; either version 2 of the License, or
5     (at your option) any later version.
6
7     This program is distributed in the hope that it will be useful,
8     but WITHOUT ANY WARRANTY; without even the implied warranty of
9     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10     GNU General Public License for more details.
11
12     You should have received a copy of the GNU General Public License along
13     with this program; if not, write to the Free Software Foundation, Inc.,
14     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
15 */
16 package net.sf.jabref.imports;
17
18 import java.util.HashMap;
19 import java.util.Set;
20 import java.util.regex.Matcher;
21 import java.util.regex.Pattern;
22
23 import net.sf.jabref.export.layout.LayoutFormatter;
24 import net.sf.jabref.Globals;
25
26 public class HTMLConverter implements LayoutFormatter {
27
28     /*   Portions © International Organization for Standardization 1986:
29      Permission to copy in any form is granted for use with
30      conforming SGML systems and applications as defined in
31      ISO 8879, provided this notice is included in all copies.
32     */
33
34
35         // Most of the LaTeX commands are listed at http://en.wikibooks.org/wiki/LaTeX/Accents
36         // Each symbol can be looked up at http://www.fileformat.info/info/unicode/char/a4/index.htm (replace "a4" with the symbol's U+ number).
37         // http://detexify.kirelabs.org/classify.html and http://www.ctan.org/tex-archive/info/symbols/comprehensive/ may help to find the right LaTeX command.
38         // http://llg.cubic.org/docs/ent2latex.html and http://www.w3.org/TR/xml-entity-names/byalpha.html are also useful,
39         // as is http://www.w3.org/Math/characters/unicode.xml
40     
41     
42     // A table of string triples, one row per HTML entity, in the format:
43     // {"decimal number of HTML entity", "text HTML entity", "corresponding LaTeX command"}
44     // A field may be left empty (""); an empty field is then not included.
45     private String[][] conversionList = new String[][]{
46         {"160", "nbsp", "\\{~\\}"}, // no-break space = non-breaking space, 
47         //                                 U+00A0 ISOnum 
48         {"161", "iexcl", "\\{\\\\textexclamdown\\}"}, // inverted exclamation mark, U+00A1 ISOnum
49         {"162", "cent", "\\{\\\\textcent\\}"}, // cent sign, U+00A2 ISOnum  
50         {"163", "pound", "\\{\\\\pounds\\}"}, // pound sign, U+00A3 ISOnum
51         {"164", "curren", "\\{\\\\textcurrency\\}"}, // currency sign, U+00A4 ISOnum  
52         {"165", "yen", "\\{\\\\textyen\\}"}, // yen sign = yuan sign, U+00A5 ISOnum  
53         {"166", "brvbar", "\\{\\\\textbrokenbar\\}"}, // broken bar = broken vertical bar, 
54         //                                 U+00A6 ISOnum 
55         {"167", "sect", "\\{\\\\S\\}"}, // section sign, U+00A7 ISOnum  
56         {"168", "uml", "\\{\\\\\"\\{\\}\\}"}, // diaeresis = spacing diaeresis, 
57         //                                 U+00A8 ISOdia 
58         {"169", "copy", "\\{\\\\copyright\\}"}, // copyright sign, U+00A9 ISOnum
59         {"170", "ordf", "\\{\\\\textordfeminine\\}"}, // feminine ordinal indicator, U+00AA ISOnum
60         {"171", "laquo", "\\{\\\\guillemotleft\\}"}, // left-pointing double angle quotation mark
61         //                                 = left pointing guillemet, U+00AB ISOnum 
62         {"172", "not", "\\$\\\\neg\\$"}, // not sign, U+00AC ISOnum  
63         {"173", "shy", "\\\\-"}, // soft hyphen = discretionary hyphen, 
64         //                                 U+00AD ISOnum 
65         {"174", "reg", "\\{\\\\textregistered\\}"}, // registered sign = registered trade mark sign,
66         //                                 U+00AE ISOnum 
67         {"175", "macr", "\\{\\\\=\\{\\}\\}"}, // macron = spacing macron = overline 
68         //                                 = APL overbar, U+00AF ISOdia 
69         {"176", "deg", "\\$\\\\deg\\$"}, // degree sign, U+00B0 ISOnum  
70         {"177", "plusmn", "\\$\\\\pm\\$"}, // plus-minus sign = plus-or-minus sign, 
71         //                                 U+00B1 ISOnum 
72         {"178", "sup2", "\\\\textsuperscript\\{2\\}"}, // superscript two = superscript digit two 
73         //                                 = squared, U+00B2 ISOnum 
74         {"179", "sup3", "\\\\textsuperscript\\{3\\}"}, // superscript three = superscript digit three 
75         //                                 = cubed, U+00B3 ISOnum 
76         {"180", "acute", "\\{\\\\'\\{\\}\\}"}, // acute accent = spacing acute, 
77         //                                 U+00B4 ISOdia 
78         {"181", "micro", "\\$\\\\mu\\$"}, // micro sign, U+00B5 ISOnum  
79         {"182", "para", "\\{\\\\P\\}"}, // pilcrow sign = paragraph sign, 
80         //                                 U+00B6 ISOnum 
81         {"183", "middot", "\\$\\\\cdot\\$"}, // middle dot = Georgian comma 
82         //                                 = Greek middle dot, U+00B7 ISOnum 
83         {"184", "cedil", "\\{\\\\c\\{\\}\\}"}, // cedilla = spacing cedilla, U+00B8 ISOdia  
84         {"185", "sup1", "\\\\textsuperscript\\{1\\}"}, // superscript one = superscript digit one,
85         //                                 U+00B9 ISOnum 
86         {"186", "ordm", "\\{\\\\textordmasculine\\}"}, // masculine ordinal indicator,
87         //                                 U+00BA ISOnum 
88         {"187", "raquo", "\\{\\\\guillemotright\\}"}, // right-pointing double angle quotation mark
89         //                                 = right pointing guillemet, U+00BB ISOnum 
90         {"188", "frac14", "\\$\\\\sfrac\\{1\\}\\{4\\}\\$"}, // vulgar fraction one quarter 
91         //                                 = fraction one quarter, U+00BC ISOnum 
92         {"189", "frac12", "\\$\\\\sfrac\\{1\\}\\{2\\}\\$"}, // vulgar fraction one half 
93         //                                 = fraction one half, U+00BD ISOnum 
94         {"190", "frac34", "\\$\\\\sfrac\\{3\\}\\{4\\}\\$"}, // vulgar fraction three quarters 
95         //                                 = fraction three quarters, U+00BE ISOnum 
96         {"191", "iquest", "\\{\\\\textquestiondown\\}"}, // inverted question mark 
97         //                                 = turned question mark, U+00BF ISOnum 
98         {"192", "Agrave", "\\{\\\\`\\{A\\}\\}"}, // latin capital letter A with grave
99         //                                 = latin capital letter A grave,
100         //                                 U+00C0 ISOlat1 
101         {"193", "Aacute", "\\{\\\\'\\{A\\}\\}"}, // latin capital letter A with acute, 
102         //                                 U+00C1 ISOlat1 
103         {"194", "Acirc", "\\{\\\\\\^\\{A\\}\\}"}, // latin capital letter A with circumflex, 
104         //                                 U+00C2 ISOlat1 
105         {"195", "Atilde", "\\{\\\\~\\{A\\}\\}"}, // latin capital letter A with tilde, 
106         //                                 U+00C3 ISOlat1 
107         {"196", "Auml", "\\{\\\\\"\\{A\\}\\}"}, // latin capital letter A with diaeresis, 
108         //                                 U+00C4 ISOlat1 
109         {"197", "Aring", "\\{\\\\AA\\}"}, // latin capital letter A with ring above 
110         //                                 = latin capital letter A ring,
111         //                                 U+00C5 ISOlat1 
112         {"198", "AElig", "\\{\\\\AE\\}"}, // latin capital letter AE 
113         //                                 = latin capital ligature AE,
114         //                                 U+00C6 ISOlat1 
115         {"199", "Ccedil", "\\{\\\\c\\{C\\}\\}"}, // latin capital letter C with cedilla,
116         //                                 U+00C7 ISOlat1 
117         {"200", "Egrave", "\\{\\\\`\\{E\\}\\}"}, // latin capital letter E with grave,
118         //                                 U+00C8 ISOlat1 
119         {"201", "Eacute", "\\{\\\\'\\{E\\}\\}"}, // latin capital letter E with acute, 
120         //                                 U+00C9 ISOlat1 
121         {"202", "Ecirc", "\\{\\\\\\^\\{E\\}\\}"}, // latin capital letter E with circumflex, 
122         //                                 U+00CA ISOlat1 
123         {"203", "Euml", "\\{\\\\\"\\{E\\}\\}"}, // latin capital letter E with diaeresis, 
124         //                                 U+00CB ISOlat1 
125         {"204", "Igrave", "\\{\\\\`\\{I\\}\\}"}, // latin capital letter I with grave,
126         //                                 U+00CC ISOlat1 
127         {"205", "Iacute", "\\{\\\\'\\{I\\}\\}"}, // latin capital letter I with acute, 
128         //                                 U+00CD ISOlat1 
129         {"206", "Icirc", "\\{\\\\\\^\\{I\\}\\}"}, // latin capital letter I with circumflex, 
130         //                                 U+00CE ISOlat1 
131         {"207", "Iuml", "\\{\\\\\"\\{I\\}\\}"}, // latin capital letter I with diaeresis, 
132         //                                 U+00CF ISOlat1 
133         {"208", "ETH", "\\{\\\\DH\\}"}, // latin capital letter ETH, U+00D0 ISOlat1  
134         {"209", "Ntilde", "\\{\\\\~\\{N\\}\\}"}, // latin capital letter N with tilde, 
135         //                                 U+00D1 ISOlat1 
136         {"210", "Ograve", "\\{\\\\`\\{O\\}\\}"}, // latin capital letter O with grave,
137         //                                 U+00D2 ISOlat1 
138         {"211", "Oacute", "\\{\\\\'\\{O\\}\\}"}, // latin capital letter O with acute, 
139         //                                 U+00D3 ISOlat1 
140         {"212", "Ocirc", "\\{\\\\\\^\\{O\\}\\}"}, // latin capital letter O with circumflex, 
141         //                                 U+00D4 ISOlat1 
142         {"213", "Otilde", "\\{\\\\~\\{O\\}\\}"}, // latin capital letter O with tilde, 
143         //                                 U+00D5 ISOlat1 
144         {"214", "Ouml", "\\{\\\\\"\\{O\\}\\}"}, // latin capital letter O with diaeresis, 
145         //                                 U+00D6 ISOlat1 
146         {"215", "times", "\\$\\\\times\\$"}, // multiplication sign, U+00D7 ISOnum  
147         {"216", "Oslash", "\\{\\\\O\\{\\}\\}"}, // latin capital letter O with stroke 
148         //                                 = latin capital letter O slash,
149         //                                 U+00D8 ISOlat1 
150         {"217", "Ugrave", "\\{\\\\`\\{U\\}\\}"}, // latin capital letter U with grave,
151         //                                 U+00D9 ISOlat1 
152         {"218", "Uacute", "\\{\\\\'\\{U\\}\\}"}, // latin capital letter U with acute, 
153         //                                 U+00DA ISOlat1 
154         {"219", "Ucirc", "\\{\\\\\\^\\{U\\}\\}"}, // latin capital letter U with circumflex, 
155         //                                 U+00DB ISOlat1 
156         {"220", "Uuml", "\\{\\\\\"\\{U\\}\\}"}, // latin capital letter U with diaeresis, 
157         //                                 U+00DC ISOlat1 
158         {"221", "Yacute", "\\{\\\\'\\{Y\\}\\}"}, // latin capital letter Y with acute, 
159         //                                 U+00DD ISOlat1 
160         {"222", "THORN", "\\{\\\\TH\\}"}, // latin capital letter THORN, 
161         //                                 U+00DE ISOlat1 
162         {"223", "szlig", "\\{\\\\ss\\}"}, // latin small letter sharp s = ess-zed,
163         //                                 U+00DF ISOlat1 
164         {"224", "agrave", "\\{\\\\`\\{a\\}\\}"}, // latin small letter a with grave
165         //                                 = latin small letter a grave,
166         //                                 U+00E0 ISOlat1 
167         {"225", "aacute", "\\{\\\\'\\{a\\}\\}"}, // latin small letter a with acute, 
168         //                                 U+00E1 ISOlat1 
169         {"226", "acirc", "\\{\\\\\\^\\{a\\}\\}"}, // latin small letter a with circumflex, 
170         //                                 U+00E2 ISOlat1 
171         {"227", "atilde", "\\{\\\\~\\{a\\}\\}"}, // latin small letter a with tilde, 
172         //                                 U+00E3 ISOlat1 
173         {"228", "auml", "\\{\\\\\"\\{a\\}\\}"}, // latin small letter a with diaeresis, 
174         //                                 U+00E4 ISOlat1 
175         {"229", "aring", "\\{\\\\aa\\}"}, // latin small letter a with ring above 
176         //                                 = latin small letter a ring,
177         //                                 U+00E5 ISOlat1 
178         {"230", "aelig", "\\{\\\\ae\\}"}, // latin small letter ae 
179         //                                 = latin small ligature ae, U+00E6 ISOlat1 
180         {"231", "ccedil", "\\{\\\\c\\{c\\}\\}"}, // latin small letter c with cedilla,
181         //                                 U+00E7 ISOlat1 
182         {"232", "egrave", "\\{\\\\`\\{e\\}\\}"}, // latin small letter e with grave,
183         //                                 U+00E8 ISOlat1 
184         {"233", "eacute", "\\{\\\\'\\{e\\}\\}"}, // latin small letter e with acute, 
185         //                                 U+00E9 ISOlat1 
186         {"234", "ecirc", "\\{\\\\\\^\\{e\\}\\}"}, // latin small letter e with circumflex, 
187         //                                 U+00EA ISOlat1 
188         {"235", "euml", "\\{\\\\\"\\{e\\}\\}"}, // latin small letter e with diaeresis, 
189         //                                 U+00EB ISOlat1 
190         {"236", "igrave", "\\{\\\\`\\{\\\\i\\}\\}"}, // latin small letter i with grave,
191         //                                 U+00EC ISOlat1 
192         {"237", "iacute", "\\{\\\\'\\{\\\\i\\}\\}"}, // latin small letter i with acute, 
193         //                                 U+00ED ISOlat1 
194         {"238", "icirc", "\\{\\\\\\^\\{\\\\i\\}\\}"}, // latin small letter i with circumflex, 
195         //                                 U+00EE ISOlat1 
196         {"239", "iuml", "\\{\\\\\"\\{\\\\i\\}\\}"}, // latin small letter i with diaeresis, 
197         //                                 U+00EF ISOlat1 
198         {"240", "eth", "\\{\\\\dh\\}"}, // latin small letter eth, U+00F0 ISOlat1  
199         {"241", "ntilde", "\\{\\\\~\\{n\\}\\}"}, // latin small letter n with tilde, 
200         //                                 U+00F1 ISOlat1 
201         {"242", "ograve", "\\{\\\\`\\{o\\}\\}"}, // latin small letter o with grave,
202         //                                 U+00F2 ISOlat1 
203         {"243", "oacute", "\\{\\\\'\\{o\\}\\}"}, // latin small letter o with acute, 
204         //                                 U+00F3 ISOlat1 
205         {"244", "ocirc", "\\{\\\\\\^\\{o\\}\\}"}, // latin small letter o with circumflex, 
206         //                                 U+00F4 ISOlat1 
207         {"245", "otilde", "\\{\\\\~\\{o\\}\\}"}, // latin small letter o with tilde, 
208         //                                 U+00F5 ISOlat1 
209         {"246", "ouml", "\\{\\\\\"\\{o\\}\\}"}, // latin small letter o with diaeresis, 
210         //                                 U+00F6 ISOlat1 
211         {"247", "divide", "\\$\\\\div\\$"}, // division sign, U+00F7 ISOnum  
212         {"248", "oslash", "\\{\\\\o\\{\\}\\}"}, // latin small letter o with stroke, 
213         //                                 = latin small letter o slash,
214         //                                 U+00F8 ISOlat1 
215         {"249", "ugrave", "\\{\\\\`\\{u\\}\\}"}, // latin small letter u with grave,
216         //                                 U+00F9 ISOlat1 
217         {"250", "uacute", "\\{\\\\'\\{u\\}\\}"}, // latin small letter u with acute, 
218         //                                 U+00FA ISOlat1 
219         {"251", "ucirc", "\\{\\\\\\^\\{u\\}\\}"}, // latin small letter u with circumflex, 
220         //                                 U+00FB ISOlat1 
221         {"252", "uuml", "\\{\\\\\"\\{u\\}\\}"}, // latin small letter u with diaeresis, 
222         //                                 U+00FC ISOlat1 
223         {"253", "yacute", "\\{\\\\'\\{y\\}\\}"}, // latin small letter y with acute, 
224         //                                 U+00FD ISOlat1 
225         {"254", "thorn", "\\{\\\\th\\}"}, // latin small letter thorn, 
226         //                                 U+00FE ISOlat1 
227         {"255", "yuml", "\\{\\\\\"\\{y\\}\\}"}, // latin small letter y with diaeresis, 
228         //                                 U+00FF ISOlat1 
229         {"402", "fnof", "\\$f\\$"}, // latin small f with hook = function 
230         //                                   = florin, U+0192 ISOtech 
231
232         /* Greek */
233         {"913", "Alpha", "\\{\\$\\\\Alpha\\$\\}"}, // greek capital letter alpha, U+0391  
234         {"914", "Beta", "\\{\\$\\\\Beta\\$\\}"}, // greek capital letter beta, U+0392  
235         {"915", "Gamma", "\\{\\$\\\\Gamma\\$\\}"}, // greek capital letter gamma, 
236         //                                   U+0393 ISOgrk3 
237         {"916", "Delta", "\\{\\$\\\\Delta\\$\\}"}, // greek capital letter delta, 
238         //                                   U+0394 ISOgrk3 
239         {"917", "Epsilon", "\\{\\$\\\\Epsilon\\$\\}"}, // greek capital letter epsilon, U+0395  
240         {"918", "Zeta", "\\{\\$\\\\Zeta\\$\\}"}, // greek capital letter zeta, U+0396  
241         {"919", "Eta", "\\{\\$\\\\Eta\\$\\}"}, // greek capital letter eta, U+0397  
242         {"920", "Theta", "\\{\\$\\\\Theta\\$\\}"}, // greek capital letter theta, 
243         //                                   U+0398 ISOgrk3 
244         {"921", "Iota", "\\{\\$\\\\Iota\\$\\}"}, // greek capital letter iota, U+0399  
245         {"922", "Kappa", "\\{\\$\\\\Kappa\\$\\}"}, // greek capital letter kappa, U+039A  
246         {"923", "Lambda", "\\{\\$\\\\Lambda\\$\\}"}, // greek capital letter lambda, 
247         //                                   U+039B ISOgrk3 
248         {"924", "Mu", "\\{\\$\\\\Mu\\$\\}"}, // greek capital letter mu, U+039C  
249         {"925", "Nu", "\\{\\$\\\\Nu\\$\\}"}, // greek capital letter nu, U+039D  
250         {"926", "Xi", "\\{\\$\\\\Xi\\$\\}"}, // greek capital letter xi, U+039E ISOgrk3  
251         {"927", "Omicron", "\\{\\$\\\\Omicron\\$\\}"}, // greek capital letter omicron, U+039F  
252         {"928", "Pi", "\\{\\$\\\\Pi\\$\\}"}, // greek capital letter pi, U+03A0 ISOgrk3  
253         {"929", "Rho", "\\{\\$\\\\Rho\\$\\}"}, // greek capital letter rho, U+03A1  
254         /* there is no Sigmaf, and no U+03A2 character either */
255         {"931", "Sigma", "\\{\\$\\\\Sigma\\$\\}"}, // greek capital letter sigma, 
256         //                                   U+03A3 ISOgrk3 
257         {"932", "Tau", "\\{\\$\\\\Tau\\$\\}"}, // greek capital letter tau, U+03A4  
258         {"933", "Upsilon", "\\{\\$\\\\Upsilon\\$\\}"}, // greek capital letter upsilon, 
259         //                                   U+03A5 ISOgrk3 
260         {"934", "Phi", "\\{\\$\\\\Phi\\$\\}"}, // greek capital letter phi, 
261         //                                   U+03A6 ISOgrk3 
262         {"935", "Chi", "\\{\\$\\\\Chi\\$\\}"}, // greek capital letter chi, U+03A7  
263         {"936", "Psi", "\\{\\$\\\\Psi\\$\\}"}, // greek capital letter psi, 
264         //                                   U+03A8 ISOgrk3 
265         {"937", "Omega", "\\{\\$\\\\Omega\\$\\}"}, // greek capital letter omega, 
266         //                                   U+03A9 ISOgrk3 
267
268         {"945", "alpha", "\\$\\\\alpha\\$"}, // greek small letter alpha, 
269         //                                   U+03B1 ISOgrk3 
270         {"946", "beta", "\\$\\\\beta\\$"}, // greek small letter beta, U+03B2 ISOgrk3  
271         {"947", "gamma", "\\$\\\\gamma\\$"}, // greek small letter gamma, 
272         //                                   U+03B3 ISOgrk3 
273         {"948", "delta", "\\$\\\\delta\\$"}, // greek small letter delta, 
274         //                                   U+03B4 ISOgrk3 
275         {"949", "epsilon", "\\$\\\\epsilon\\$"}, // greek small letter epsilon, 
276         //                                   U+03B5 ISOgrk3 
277         {"950", "zeta", "\\$\\\\zeta\\$"}, // greek small letter zeta, U+03B6 ISOgrk3  
278         {"951", "eta", "\\$\\\\eta\\$"}, // greek small letter eta, U+03B7 ISOgrk3  
279         {"952", "theta", "\\$\\\\theta\\$"}, // greek small letter theta, 
280         //                                   U+03B8 ISOgrk3 
281         {"953", "iota", "\\$\\\\iota\\$"}, // greek small letter iota, U+03B9 ISOgrk3  
282         {"954", "kappa", "\\$\\\\kappa\\$"}, // greek small letter kappa, 
283         //                                   U+03BA ISOgrk3 
284         {"955", "lambda", "\\$\\\\lambda\\$"}, // greek small letter lambda, 
285         //                                   U+03BB ISOgrk3 
286         {"956", "mu", "\\$\\\\mu\\$"}, // greek small letter mu, U+03BC ISOgrk3  
287         {"957", "nu", "\\$\\\\nu\\$"}, // greek small letter nu, U+03BD ISOgrk3  
288         {"958", "xi", "\\$\\\\xi\\$"}, // greek small letter xi, U+03BE ISOgrk3  
289         {"959", "omicron", "\\$\\\\omicron\\$"}, // greek small letter omicron, U+03BF NEW  
290         {"960", "pi", "\\$\\\\phi\\$"}, // greek small letter pi, U+03C0 ISOgrk3  
291         {"961", "rho", "\\$\\\\rho\\$"}, // greek small letter rho, U+03C1 ISOgrk3  
292         {"962", "sigmaf", "\\$\\\\varsigma\\$"}, // greek small letter final sigma, 
293         //                                   U+03C2 ISOgrk3 
294         {"963", "sigma", "\\$\\\\sigma\\$"}, // greek small letter sigma, 
295         //                                   U+03C3 ISOgrk3 
296         {"964", "tau", "\\$\\\\tau\\$"}, // greek small letter tau, U+03C4 ISOgrk3  
297         {"965", "upsilon", "\\$\\\\upsilon\\$"}, // greek small letter upsilon, 
298         {"", "upsi", "\\$\\\\upsilon\\$"}, // alias 
299         //                                   U+03C5 ISOgrk3 
300         {"966", "phi", "\\$\\\\phi\\$"}, // greek small letter phi, U+03C6 ISOgrk3  
301         {"967", "chi", "\\$\\\\chi\\$"}, // greek small letter chi, U+03C7 ISOgrk3  
302         {"968", "psi", "\\$\\\\psi\\$"}, // greek small letter psi, U+03C8 ISOgrk3  
303         {"969", "omega", "\\$\\\\omega\\$"}, // greek small letter omega, 
304         //                                   U+03C9 ISOgrk3 
305         {"977", "thetasym", "\\$\\\\vartheta\\$"}, // greek small letter theta symbol, 
306         {"", "thetav", "\\$\\\\vartheta\\$"}, // greek small letter theta symbol, 
307         {"", "vartheta", "\\$\\\\vartheta\\$"}, // greek small letter theta symbol, 
308         //                                   U+03D1 NEW 
309         {"978", "upsih", "\\{\\$\\\\Upsilon\\$\\}"}, // greek upsilon with hook symbol, 
310         //                                   U+03D2 NEW 
311         {"982", "piv", "\\$\\\\varphi\\$"}, // greek pi symbol, U+03D6 ISOgrk3  
312
313         /* General Punctuation */
314         {"8226", "bull", "\\$\\\\bullet\\$"}, // bullet = black small circle, 
315         //                                    U+2022 ISOpub  
316         /* bullet is NOT the same as bullet operator, U+2219 */
317         {"8230", "hellip", "\\{\\\\ldots\\}"}, // horizontal ellipsis = three dot leader, 
318         //                                    U+2026 ISOpub  
319         {"8242", "prime", "\\$\\\\prime\\$"}, // prime = minutes = feet, U+2032 ISOtech  
320         {"8243", "Prime", "\\$\\{''\\}\\$"}, // double prime = seconds = inches, 
321         //                                    U+2033 ISOtech 
322         {"8254", "oline", "\\{\\\\=\\{\\}\\}"}, // overline = spacing overscore, 
323         //                                    U+203E NEW 
324         {"8260", "frasl", "/"}, // fraction slash, U+2044 NEW  
325
326         /* Letterlike Symbols */
327         {"8472", "weierp", "\\$\\\\wp\\$"}, // script capital P = power set 
328         //                                    = Weierstrass p, U+2118 ISOamso 
329         {"8465", "image", "\\{\\$\\\\Im\\$\\}"}, // blackletter capital I = imaginary part, 
330         //                                    U+2111 ISOamso 
331         {"8476", "real", "\\{\\$\\\\Re\\$\\}"}, // blackletter capital R = real part symbol, 
332         //                                    U+211C ISOamso 
333         {"8482", "trade", "\\{\\\\texttrademark\\}"}, // trade mark sign, U+2122 ISOnum
334         {"8501", "alefsym", "\\$\\\\aleph\\$"}, // alef symbol = first transfinite cardinal, 
335         //                                    U+2135 NEW 
336         /*    alef symbol is NOT the same as hebrew letter alef,
337          U+05D0 although the same glyph could be used to depict both characters */
338         /* Arrows */
339         {"8592", "larr", "\\$\\\\leftarrow\\$"}, // leftwards arrow, U+2190 ISOnum
340         {"8593", "uarr", "\\$\\\\uparrow\\$"}, // upwards arrow, U+2191 ISOnum
341         {"8594", "rarr", "\\$\\\\rightarrow\\$"}, // rightwards arrow, U+2192 ISOnum
342         {"8595", "darr", "\\$\\\\downarrow\\$"}, // downwards arrow, U+2193 ISOnum
343         {"8596", "harr", "\\$\\\\leftrightarrow\\$"}, // left right arrow, U+2194 ISOamsa  
344         {"8629", "crarr", ""}, // downwards arrow with corner leftwards 
345         //                                    = carriage return, U+21B5 NEW 
346         {"8656", "lArr", "\\{\\$\\\\Leftarrow\\$\\}"}, // leftwards double arrow, U+21D0 ISOtech
347         /*  ISO 10646 does not say that lArr is the same as the 'is implied by' arrow,
348          but it also does not have any other character for that function. So lArr can
349          be used for 'is implied by', as ISOtech suggests. */
350         {"8657", "uArr", "\\{\\$\\\\Uparrow\\$\\}"}, // upwards double arrow, U+21D1 ISOamsa
351         {"8658", "rArr", "\\{\\$\\\\Rightarrow\\$\\}"}, // rightwards double arrow,
352         //                                     U+21D2 ISOtech 
353         /*   ISO 10646 does not say this is the 'implies' character, but it does not have
354          another character with this function, so
355          rArr can be used for 'implies', as ISOtech suggests. */
356         {"8659", "dArr", "\\{\\$\\\\Downarrow\\$\\}"}, // downwards double arrow, U+21D3 ISOamsa  
357         {"8660", "hArr", "\\{\\$\\\\Leftrightarrow\\$\\}"}, // left right double arrow, 
358         //                                     U+21D4 ISOamsa 
359
360         /* Mathematical Operators */
361         {"8704", "forall", "\\$\\\\forall\\$"}, // for all, U+2200 ISOtech  
362         {"8706", "part", "\\$\\\\partial\\$"}, // partial differential, U+2202 ISOtech
363         {"8707", "exist", "\\$\\\\exists\\$"}, // there exists, U+2203 ISOtech
364         {"8709", "empty", "\\$\\\\emptyset\\$"}, // empty set = null set = diameter,
365         //                                    U+2205 ISOamso 
366         {"8711", "nabla", "\\$\\\\nabla\\$"}, // nabla = backward difference, 
367         //                                    U+2207 ISOtech 
368         {"8712", "isin", "\\$\\\\in\\$"}, // element of, U+2208 ISOtech
369         {"8713", "notin", "\\$\\\\notin\\$"}, // not an element of, U+2209 ISOtech
370         {"8715", "ni", "\\$\\\\ni\\$"}, // contains as member, U+220B ISOtech
371         /* should there be a more memorable name than 'ni'? */
372         {"8719", "prod", "\\$\\\\prod\\$"}, // n-ary product = product sign,
373         //                                    U+220F ISOamsb 
374         /*    prod is NOT the same character as U+03A0 'greek capital letter pi' though
375          the same glyph might be used for both  */
376         {"8721", "sum", "\\$\\\\sum\\$"}, // n-ary sumation, U+2211 ISOamsb  
377         /*    sum is NOT the same character as U+03A3 'greek capital letter sigma'
378          though the same glyph might be used for both */
379         {"8722", "minus", "\\$-\\$"}, // minus sign, U+2212 ISOtech  
380         {"8727", "lowast", "\\$\\\\ast\\$"}, // asterisk operator, U+2217 ISOtech  
381         {"8730", "radic", "\\$\\\\sqrt{}\\$"}, // square root = radical sign, 
382         //                                    U+221A ISOtech 
383         {"8733", "prop", "\\$\\\\propto\\$"}, // proportional to, U+221D ISOtech  
384         {"8734", "infin", "\\$\\\\infty\\$"}, // infinity, U+221E ISOtech  
385         {"8736", "ang", "\\$\\\\angle\\$"}, // angle, U+2220 ISOamso
386         {"8743", "and", "\\$\\\\land\\$"}, // logical and = wedge, U+2227 ISOtech
387         {"8744", "or", "\\$\\\\lor\\$"}, // logical or = vee, U+2228 ISOtech
388         {"8745", "cap", "\\$\\\\cap\\$"}, // intersection = cap, U+2229 ISOtech
389         {"8746", "cup", "\\$\\\\cup\\$"}, // union = cup, U+222A ISOtech
390         {"8747", "int", "\\$\\\\int\\$"}, // integral, U+222B ISOtech
391         {"8756", "there4", "\\$\\\\uptherefore\\$"}, // therefore, U+2234 ISOtech; only in LaTeX package MnSymbol
392         {"8764", "sim", "\\$\\\\sim\\$"}, // tilde operator = varies with = similar to,
393         //                                    U+223C ISOtech 
394         /*  tilde operator is NOT the same character as the tilde, U+007E,
395          although the same glyph might be used to represent both   */
396         {"8773", "cong", "\\$\\\\cong\\$"}, // approximately equal to, U+2245 ISOtech  
397         {"8776", "asymp", "\\$\\\\approx\\$"}, // almost equal to = asymptotic to, 
398         //                                    U+2248 ISOamsr 
399         {"8800", "ne", "\\$\\\\neq\\$"}, // not equal to, U+2260 ISOtech  
400         {"8801", "equiv", "\\$\\\\equiv\\$"}, // identical to, U+2261 ISOtech  
401         {"8804", "le", "\\$\\\\leq\\$"}, // less-than or equal to, U+2264 ISOtech  
402         {"8805", "ge", "\\$\\\\geq\\$"}, // greater-than or equal to, 
403         //                                    U+2265 ISOtech 
404         {"8834", "sub", "\\$\\\\subset\\$"}, // subset of, U+2282 ISOtech  
405         {"8835", "sup", "\\$\\\\supset\\$"}, // superset of, U+2283 ISOtech  
406         /*    Note that nsup ('not a superset of', U+2285) is not covered by the Symbol
407          font encoding and is not included. Should it be, for symmetry?
408          It is in ISOamsn.   */
409         {"8836", "nsub", "\\$\\\\nsubset\\$"}, // not a subset of, U+2284 ISOamsn  
410         {"8838", "sube", "\\$\\\\subseteq\\$"}, // subset of or equal to, U+2286 ISOtech  
411         {"8839", "supe", "\\$\\\\supseteq\\$"}, // superset of or equal to, 
412         //                                    U+2287 ISOtech 
413         {"8853", "oplus", "\\$\\\\oplus\\$"}, // circled plus = direct sum, 
414         //                                    U+2295 ISOamsb 
415         {"8855", "otimes", "\\$\\\\otimes\\$"}, // circled times = vector product,
416         //                                    U+2297 ISOamsb 
417         {"8869", "perp", "\\$\\\\perp\\$"}, // up tack = orthogonal to = perpendicular, 
418         //                                    U+22A5 ISOtech 
419         {"8901", "sdot", "\\$\\\\cdot\\$"}, // dot operator, U+22C5 ISOamsb  
420         /* dot operator is NOT the same character as U+00B7 middle dot */
421         /* Miscellaneous Technical */
422         {"8968", "lceil", "\\$\\\\lceil\\$"}, // left ceiling = apl upstile, 
423         //                                    U+2308 ISOamsc  
424         {"8969", "rceil", "\\$\\\\rceil\\$"}, // right ceiling, U+2309 ISOamsc   
425         {"8970", "lfloor", "\\$\\\\lfloor\\$"}, // left floor = apl downstile, 
426         //                                    U+230A ISOamsc  
427         {"8971", "rfloor", "\\$\\\\rfloor\\$"}, // right floor, U+230B ISOamsc   
428         {"9001", "lang", "\\$\\\\langle\\$"}, // left-pointing angle bracket = bra, 
429         //                                    U+2329 ISOtech 
430         /*    lang is NOT the same character as U+003C 'less than' 
431          or U+2039 'single left-pointing angle quotation mark' */
432         {"9002", "rang", "\\$\\\\rangle\\$"}, // right-pointing angle bracket = ket, 
433         //                                    U+232A ISOtech 
434         /*    rang is NOT the same character as U+003E 'greater than' 
435          or U+203A 'single right-pointing angle quotation mark' */
436         /* Geometric Shapes */
437         {"9674", "loz", "\\$\\\\lozenge\\$"}, // lozenge, U+25CA ISOpub  
438
439         /* Miscellaneous Symbols */
440         {"9824", "spades", "\\$\\\\spadesuit\\$"}, // black spade suit, U+2660 ISOpub  
441         /* black here seems to mean filled as opposed to hollow */
442         {"9827", "clubs", "\\$\\\\clubsuit\\$"}, // black club suit = shamrock, 
443         //                                    U+2663 ISOpub 
444         {"9829", "hearts", "\\$\\\\heartsuit\\$"}, // black heart suit = valentine, 
445         //                                    U+2665 ISOpub 
446         {"9830", "diams", "\\$\\\\diamondsuit\\$"}, // black diamond suit, U+2666 ISOpub  
447         {"34", "quot", "\""}, // quotation mark = APL quote,
448         //                                   U+0022 ISOnum 
449         {"38", "amp", "\\\\&"}, // ampersand, U+0026 ISOnum 
450         {"60", "lt", "\\$<\\$"}, // less-than sign, U+003C ISOnum 
451         {"62", "gt", "\\$>\\$"}, // greater-than sign, U+003E ISOnum 
452
453         /* Latin Extended-A */
454         {"338", "OElig", "\\{\\\\OE\\}"}, // latin capital ligature OE,
455         //                                   U+0152 ISOlat2 
456         {"339", "oelig", "\\{\\\\oe\\}"}, // latin small ligature oe, U+0153 ISOlat2 
457         /* ligature is a misnomer, this is a separate character in some languages */
458         {"352", "Scaron", "\\{\\\\v\\{S\\}\\}"}, // latin capital letter S with caron,
459         //                                   U+0160 ISOlat2 
460         {"353", "scaron", "\\{\\\\v\\{s\\}\\}"}, // latin small letter s with caron,
461         //                                   U+0161 ISOlat2 
462         {"376", "Yuml", "\\{\\\\\"\\{Y\\}\\}"}, // latin capital letter Y with diaeresis,
463         //                                   U+0178 ISOlat2 
464
465         /* Spacing Modifier Letters */
466         {"710", "circ", "\\{\\\\textasciicircum\\}"}, // modifier letter circumflex accent,
467         //                                   U+02C6 ISOpub 
468         {"732", "tilde", "\\{\\\\textasciitilde\\}"}, // small tilde, U+02DC ISOdia 
469
470         /* General Punctuation */
471         {"8194", "ensp", "\\\\hspace\\{0.5em\\}"}, // en space, U+2002 ISOpub  
472         {"8195", "emsp", "\\\\hspace\\{1em\\}"}, // em space, U+2003 ISOpub  
473         {"8201", "thinsp", "\\\\hspace\\{0.167em\\}"}, // thin space, U+2009 ISOpub  
474         {"8204", "zwnj", ""}, // zero width non-joiner, 
475         //                                   U+200C NEW RFC 2070 
476         {"8205", "zwj", ""}, // zero width joiner, U+200D NEW RFC 2070  
477         {"8206", "lrm", ""}, // left-to-right mark, U+200E NEW RFC 2070  
478         {"8207", "rlm", ""}, // right-to-left mark, U+200F NEW RFC 2070  
479         {"8211", "ndash", "--"}, // en dash, U+2013 ISOpub  
480         {"8212", "mdash", "---"}, // em dash, U+2014 ISOpub  
481         {"8216", "lsquo", "\\{\\\\textquoteleft\\}"}, // left single quotation mark, 
482         //                                   U+2018 ISOnum 
483         {"8217", "rsquo", "\\{\\\\textquoteright\\}"}, // right single quotation mark, 
484         //                                   U+2019 ISOnum 
485         {"8218", "sbquo", "\\{\\\\quotesinglbase\\}"}, // single low-9 quotation mark, U+201A NEW  
486         {"8220", "ldquo", "\\{\\\\textquotedblleft\\}"}, // left double quotation mark, 
487         //                                   U+201C ISOnum 
488         {"8221", "rdquo", "\\{\\\\textquotedblright\\}"}, // right double quotation mark, 
489         //                                   U+201D ISOnum 
490         {"8222", "bdquo", "\\{\\\\quotedblbase\\}"}, // double low-9 quotation mark, U+201E NEW  
491         {"8224", "dagger", "\\{\\\\dag\\}"}, // dagger, U+2020 ISOpub  
492         {"8225", "Dagger", "\\{\\\\ddag\\}"}, // double dagger, U+2021 ISOpub  
493         {"8240", "permil", "\\{\\\\textperthousand\\}"}, // per mille sign, U+2030 ISOtech  
494         {"8249", "lsaquo", "\\{\\\\guilsinglleft\\}"}, // single left-pointing angle quotation mark, 
495         //                                   U+2039 ISO proposed 
496         /* lsaquo is proposed but not yet ISO standardized */
497         {"8250", "rsaquo", "\\{\\\\guilsinglright\\}"}, // single right-pointing angle quotation mark, 
498         //                                   U+203A ISO proposed 
499         /* rsaquo is proposed but not yet ISO standardized */
500         {"8364", "euro", "\\{\\\\texteuro\\}"}, // euro sign, U+20AC NEW 
501             
502         /* Manually added */
503         {"37", "percnt", "\\\\%"}, // Percent
504         {"39", "", "'"}, // Apostrophe
505         {"40", "", "("}, // Left bracket
506         {"41", "", ")"}, // Right bracket
507         {"43", "plus", "\\+"}, // Plus
508         {"44", "comma", ","}, // Comma
509         {"45", "hyphen", "-"}, // Hyphen
510         {"46", "period", "\\."}, // Period
511         {"47", "slash", "/"}, // Slash (solidus)
512         {"58", "colon", ":"}, // Colon
513         {"59", "semi", ";"}, // Semi colon
514         {"91", "lsqb", "\\["}, // Left square bracket
515         {"92", "bsol", "\\{\\\\textbackslash\\}"}, // Backslash
516         {"93", "rsqb", "\\]"}, // Right square bracket
517         {"94", "Hat", "\\{\\\\\\^\\{\\}\\}"}, // Circumflex
518         {"95", "lowbar", "\\\\_"}, // Underscore
519         {"96", "grave", "\\{\\\\`\\{\\}\\}"}, // Grave
520         {"123", "lbrace", "\\\\\\{"}, // Left curly bracket
521         {"", "lcub", "\\\\\\{"}, // Left curly bracket
522         {"124", "vert", "\\|"}, // Vertical bar
523         {"", "verbar", "\\|"}, // Vertical bar
524         {"", "VerticalLine", "\\|"}, // Vertical bar
525         {"125", "rbrace", "\\\\\\}"}, // Right curly bracket
526         {"", "rcub", "\\\\\\}"}, // Right curly bracket
527         {"138", "", "\\{\\\\v\\{S\\}\\}"}, // Line tabulation set   
528      // {"141", "", ""}, // Reverse line feed
529         {"145", "", "`"}, // Apostrophe
530         {"146", "", "'"}, // Apostrophe
531         {"147", "", "``"}, // Quotation mark
532         {"148", "", "''"}, // Quotation mark
533         {"150", "", "--"}, // En dash
534         {"154", "", "\\{\\\\v\\{s\\}\\}"}, // Single character introducer
535         {"262", "Cacute", "\\{\\\\'\\{C\\}\\}"}, // capital C with acute
536         {"263", "cacute", "\\{\\\\'\\{c\\}\\}"}, // small C with acute
537         {"264", "Ccirc", "\\{\\\\\\^\\{C\\}\\}"}, // capital C with circumflex
538         {"265", "ccirc", "\\{\\\\\\^\\{c\\}\\}"}, // small C with circumflex
539         {"266", "Cdot", "\\{\\\\\\.\\{C\\}\\}"}, // capital C with dot above
540         {"267", "cdot", "\\{\\\\\\.\\{c\\}\\}"}, // small C with dot above
541         {"268", "Ccaron", "\\{\\\\v\\{C\\}\\}"}, // capital C with caron
542         {"269", "ccaron", "\\{\\\\v\\{c\\}\\}"}, // small C with caron
543         {"298", "Imacr", "\\{\\\\=\\{I\\}\\}"}, // capital I with macron
544         {"299", "imacr", "\\{\\\\=\\{\\\\i\\}\\}"}, // small i with macron
545         {"305", "inodot", "\\{\\\\i\\}"},    // Small i without the dot
546         {"", "imath", "\\{\\\\i\\}"},    // Small i without the dot
547         {"321", "Lstrok", "\\{\\\\L\\}"},    // upper case l with stroke
548         {"322", "lstrok", "\\{\\\\l\\}"},    // lower case l with stroke
549         {"536", "", "\\{\\\\cb\\{S\\}\\}"},    // capital letter S with comma below, require combelow
550         {"537", "", "\\{\\\\cb\\{s\\}\\}"},    // small letter S with comma below, require combelow
551         {"727", "caron", "\\{\\\\v\\{\\}\\}"}, // Caron
552         {"", "Hacek", "\\{\\\\v\\{\\}\\}"}, // Caron
553         {"728", "breve", "\\{\\\\u\\{\\}\\}"}, // Breve
554         {"", "Breve", "\\{\\\\u\\{\\}\\}"}, // Breve
555         {"729", "dot", "\\{\\\\\\.\\{\\}\\}"}, // Dot above
556         {"730", "ring", "\\{\\\\r\\{\\}\\}"}, // Ring above
557         {"949", "epsi", "\\$\\\\epsilon\\$"},    // Epsilon - double check
558         {"1013", "epsiv", "\\$\\\\varepsilonup\\$"},    // lunate epsilon, requires txfonts
559         {"1055", "", "\\{\\\\cyrchar\\\\CYRP\\}"},    // Cyrillic capital Pe
560         {"1082", "", "\\{\\\\cyrchar\\\\cyrk\\}"},    // Cyrillic small Ka
561      // {"2013", "", ""},    // NKO letter FA -- Maybe en dash = 0x2013?
562      // {"2014", "", ""},    // NKO letter FA -- Maybe em dash = 0x2014?
563         {"8208", "hyphen", "-"},    // Hyphen
564         {"8229", "nldr", "\\.\\."},    // Double dots - en leader
565         {"8451", "", "\\$\\\\deg\\$\\{C\\}"}, // Degree Celsius
566         {"8459", "Hscr", "\\$\\\\mathcal\\{H\\}\\$"}, // script capital H -- possibly use \mathscr
567         {"8460", "Hfr", "\\$\\\\mathbb\\{H\\}\\$"}, // black letter capital H -- requires e.g. amsfonts
568         {"8466", "Lscr", "\\$\\\\mathcal\\{L\\}\\$"}, // script capital L -- possibly use \mathscr
569         {"8467", "ell", "\\{\\\\ell\\}"}, // script small l 
570         {"8469", "naturals", "\\$\\\\mathbb\\{N\\}\\$"}, // double struck capital N -- requires e.g. amsfonts
571         {"8486", "", "\\$\\{\\\\Omega\\}\\$"}, // Omega
572         {"8491", "angst", "\\{\\\\AA\\}"}, // Angstrom 
573         {"8496", "Escr", "\\$\\\\mathcal\\{E\\}\\$"}, // script capital E 
574         {"8714", "", "\\$\\\\in\\$"},    // Small element in
575         {"8729", "bullet", "\\$\\\\bullet\\$"},    // Bullet operator
576         {"8758", "ratio", ":"},    // Colon/ratio
577         {"8771", "sime", "\\$\\\\simeq\\$"}, // almost equal to = asymptotic to, 
578         {"8776", "ap", "\\$\\\\approx\\$"}, // almost equal to = asymptotic to, 
579         {"8810", "ll", "\\$\\\\ll\\$"}, // Much less than 
580         {"", "Lt", "\\$\\\\ll\\$"}, // Much less than 
581         {"8811", "gg", "\\$\\\\gg\\$"}, // Much greater than 
582         {"", "Gt", "\\$\\\\gg\\$"}, // Much greater than 
583         {"8819", "gsim", "\\$\\\\gtrsim\\$"}, // Greater than or equivalent to
584         {"8882", "vltri", "\\$\\\\triangleleft\\$"}, // Left triangle
585         {"8883", "vrtri", "\\$\\\\triangleright\\$"}, // Right triangle
586         {"8896", "xwedge", "\\$\\\\bigwedge\\$"}, // Big wedge
587         {"8897", "xvee", "\\$\\\\bigvee\\$"}, // Big vee
588         {"9426", "", "\\{\\\\copyright\\}"}, // circled small letter C
589         {"9633", "square", "\\$\\\\square\\$"}, // White square
590         {"9653", "utri", "\\$\\\\triangle\\$"}, // White up-pointing small triangle -- \vartriangle probably
591                                                 // better but requires amssymb
592         {"10877", "les", "\\$\\\\leqslant\\$"},    // Less than slanted equal -- requires amssymb 
593         {"10878", "ges", "\\$\\\\geqslant\\$"},    // Less than slanted equal -- requires amssymb 
594         {"119978", "Oscr", "\\$\\\\mathcal\\{O\\}\\$"} // script capital O -- possibly use \mathscr
595         
596     };
597     
598         // List of combining accents
599         private String[][] accentList = new String[][] {
600         {"768", "`"},    // Grave 
601         {"769", "'"},    // Acute
602         {"770", "\\^"},  // Circumflex
603         {"771", "~"},    // Tilde
604         {"772", "="},    // Macron
605         {"773", "="},     // Overline - not completely correct
606         {"774", "u"},    // Breve
607         {"775", "\\."},  // Dot above
608         {"776", "\""},   // Diaeresis
609         {"777", "h"},    // Hook above
610         {"778", "r"},    // Ring 
611         {"779", "H"},    // Double acute
612         {"780", "v"},    // Caron
613         {"781", "\\|"},  // Vertical line above
614         // {"782", ""},     // Double vertical line above
615         {"783", "G"},    // Double grave
616         {"803", "d"},    // Dot below
617         {"807", "c"},    // Cedilla
618           
619         };
620
621         private HashMap<String, String> escapedSymbols = new HashMap<String, String>();
622         private HashMap<Integer, String> escapedAccents = new HashMap<Integer, String>();
623         private HashMap<Integer, String> numSymbols = new HashMap<Integer, String>();
624         
625         
626         
627         public HTMLConverter() {
628                 super();
629                 for (int i=0;i<conversionList.length;i++) {
630                     if (conversionList[i][2].length() >= 1) {
631                         if (conversionList[i][1].length() >= 1) {
632                             escapedSymbols.put("&" + conversionList[i][1] + ";" , conversionList[i][2]);
633                         }
634                         if (conversionList[i][0].length() >= 1) {
635                             numSymbols.put(Integer.decode(conversionList[i][0]) , conversionList[i][2]);
636                         }
637                     }
638                 }
639                 for (int i=0;i<accentList.length;i++) {
640                     escapedAccents.put(Integer.decode(accentList[i][0]), accentList[i][1]);
641                 }
642         }
643         
644     public String format(String text) {
645         if (text == null)
646             return null;
647         StringBuffer sb = new StringBuffer();
648         // Deal with the form <sup>k</sup>and <sub>k</sub>
649         // If the result is in text or equation form can be controlled
650         // From the "Advanced settings" tab
651         if(Globals.prefs.getBoolean("useConvertToEquation")) {
652             text = text.replaceAll("<sup>([^<]+)</sup>", "\\$\\^\\{$1\\}\\$");
653             text = text.replaceAll("<sub>([^<]+)</sub>", "\\$_\\{$1\\}\\$");
654         } else {
655             text = text.replaceAll("<sup>([^<]+)</sup>", "\\\\textsuperscript\\{$1\\}");
656             text = text.replaceAll("<sub>([^<]+)</sub>", "\\\\textsubscript\\{$1\\}");
657         }
658         
659         // TODO: maybe rewrite this based on regular expressions instead
660         // Note that (at least) the IEEE Xplore fetcher must be fixed as it relies on the current way to 
661         // remove tags for its image alt-tag to equation converter
662         for (int i=0; i<text.length(); i++) {
663
664             int c = text.charAt(i);
665
666             if (c == '<') {
667                 i = readTag(text, sb, i);
668             } else
669                 sb.append((char)c);
670
671         }
672         text = sb.toString();
673         
674         // Handle text based HTML entities
675         Set<String> patterns = escapedSymbols.keySet();
676         for (String pattern: patterns) {
677                 text = text.replaceAll(pattern, escapedSymbols.get(pattern));
678         }
679         
680         // Handle numerical HTML entities
681         Pattern escapedPattern = Pattern.compile("&#([x]*)([0]*)(\\p{XDigit}+);");
682         Matcher m = escapedPattern.matcher(text);
683         while (m.find()) {
684             //      System.err.println("Found pattern: " + m.group(1));
685             //      System.err.println("Found pattern: " + m.group(2));
686             int num = Integer.decode(m.group(1).replace("x", "#") + m.group(3));
687             if(numSymbols.containsKey(num)) {
688                 text = text.replaceAll("&#" + m.group(1) + m.group(2) + m.group(3) + ";", numSymbols.get(num));
689             } 
690         }
691
692         escapedPattern = Pattern.compile("(.)&#([x]*)([0]*)(\\p{XDigit}+);");
693         m = escapedPattern.matcher(text);
694         while (m.find()) {
695             //      System.err.println("Found pattern: " + m.group(1));
696             //      System.err.println("Found pattern: " + m.group(2));
697             int num = Integer.decode(m.group(2).replace("x", "#") + m.group(4));
698             if(escapedAccents.containsKey(num)) {
699                 text = text.replaceAll(m.group(1) + "&#" + m.group(2) + m.group(3) + m.group(4) + ";", "\\{\\\\" + escapedAccents.get(num) + "\\{" + m.group(1) + "\\}\\}");
700             } 
701         }
702
703         escapedPattern = Pattern.compile("&#([x]*)([0]*)(\\p{XDigit}+);");
704         m = escapedPattern.matcher(text);
705         while (m.find()) {
706             //      System.err.println("Found pattern: " + m.group(1));
707             //      System.err.println("Found pattern: " + m.group(2));
708             int num = Integer.decode(m.group(1).replace("x", "#") + m.group(3));
709             System.err.println("HTML escaped char not converted: " + m.group(1) + m.group(2) + m.group(3) + " = " + Integer.toString(num));
710         }
711         
712         // Remove $$ in case of two adjacent conversions
713         text = text.replace("$$","");
714         
715        // Find non-covered special characters with alphabetic codes
716         escapedPattern = Pattern.compile("&(\\w+);");
717         m = escapedPattern.matcher(text);
718         while (m.find()) {
719             System.err.println("HTML escaped char not converted: " + m.group(1));
720         }
721
722         return text.trim();
723     }
724
725     private final int MAX_TAG_LENGTH = 30;
726     /*private final int MAX_CHAR_LENGTH = 10;
727
728     private int readHtmlChar(String text, StringBuffer sb, int position) {
729         // Have just read the < character that starts the tag.
730         int index = text.indexOf(';', position);
731         if ((index > position) && (index-position < MAX_CHAR_LENGTH)) {
732                 //String code = text.substring(position, index);
733             //System.out.println("Removed code: "+text.substring(position, index));
734             return index; // Just skip the tag.
735         } else return position; // Don't do anything.
736     }*/
737
738     private int readTag(String text, StringBuffer sb, int position) {
739         // Have just read the < character that starts the tag.
740         int index = text.indexOf('>', position);
741         if ((index > position) && (index-position < MAX_TAG_LENGTH)) {
742             //System.out.println("Removed tag: "+text.substring(position, index));
743             return index; // Just skip the tag.
744         } else return position; // Don't do anything.
745     }
746 }