56d8b4367c92da12f134acff90cb950009d9243a
[debian/jabref.git] / src / java / net / sf / jabref / imports / HTMLConverter.java
1 /*  Copyright (C) 2003-2012 JabRef contributors.
2     This program is free software; you can redistribute it and/or modify
3     it under the terms of the GNU General Public License as published by
4     the Free Software Foundation; either version 2 of the License, or
5     (at your option) any later version.
6
7     This program is distributed in the hope that it will be useful,
8     but WITHOUT ANY WARRANTY; without even the implied warranty of
9     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10     GNU General Public License for more details.
11
12     You should have received a copy of the GNU General Public License along
13     with this program; if not, write to the Free Software Foundation, Inc.,
14     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
15 */
16 package net.sf.jabref.imports;
17
18 import java.util.HashMap;
19 import java.util.Set;
20 import java.util.regex.Matcher;
21 import java.util.regex.Pattern;
22
23 import net.sf.jabref.export.layout.LayoutFormatter;
24 import net.sf.jabref.Globals;
25
26 public class HTMLConverter implements LayoutFormatter {
27
28     /*   Portions © International Organization for Standardization 1986:
29      Permission to copy in any form is granted for use with
30      conforming SGML systems and applications as defined in
31      ISO 8879, provided this notice is included in all copies.
32     */
33
34
35         // Most of the LaTeX commands are listed at http://en.wikibooks.org/wiki/LaTeX/Accents
36         // Symbols can be looked up at http://www.fileformat.info/info/unicode/char/a4/index.htm (replace "a4" with the character's U+ number).
37         // http://detexify.kirelabs.org/classify.html and http://www.ctan.org/tex-archive/info/symbols/comprehensive/ may help to find the right LaTeX command.
38         // http://llg.cubic.org/docs/ent2latex.html and http://www.w3.org/TR/xml-entity-names/byalpha.html are also useful,
39         // as is http://www.w3.org/Math/characters/unicode.xml
40     
41     
42     // An array of string arrays, each in the format:
43     // {"decimal number of HTML entity", "name of HTML entity", "corresponding LaTeX command"}
44     // A field may be left empty; it is then simply not included.
45     private String[][] conversionList = new String[][]{
46         {"160", "nbsp", "\\{~\\}"}, // no-break space = non-breaking space, 
47         //                                 U+00A0 ISOnum 
48         {"161", "iexcl", "\\{\\\\textexclamdown\\}"}, // inverted exclamation mark, U+00A1 ISOnum
49         {"162", "cent", "\\{\\\\textcent\\}"}, // cent sign, U+00A2 ISOnum  
50         {"163", "pound", "\\{\\\\pounds\\}"}, // pound sign, U+00A3 ISOnum
51         {"164", "curren", "\\{\\\\textcurrency\\}"}, // currency sign, U+00A4 ISOnum  
52         {"165", "yen", "\\{\\\\textyen\\}"}, // yen sign = yuan sign, U+00A5 ISOnum  
53         {"166", "brvbar", "\\{\\\\textbrokenbar\\}"}, // broken bar = broken vertical bar, 
54         //                                 U+00A6 ISOnum 
55         {"167", "sect", "\\{\\\\S\\}"}, // section sign, U+00A7 ISOnum  
56         {"168", "uml", "\\{\\\\\"\\{\\}\\}"}, // diaeresis = spacing diaeresis, 
57         //                                 U+00A8 ISOdia 
58         {"169", "copy", "\\{\\\\copyright\\}"}, // copyright sign, U+00A9 ISOnum
59         {"170", "ordf", "\\{\\\\textordfeminine\\}"}, // feminine ordinal indicator, U+00AA ISOnum
60         {"171", "laquo", "\\{\\\\guillemotleft\\}"}, // left-pointing double angle quotation mark
61         //                                 = left pointing guillemet, U+00AB ISOnum 
62         {"172", "not", "\\$\\\\neg\\$"}, // not sign, U+00AC ISOnum  
63         {"173", "shy", "\\\\-"}, // soft hyphen = discretionary hyphen, 
64         //                                 U+00AD ISOnum 
65         {"174", "reg", "\\{\\\\textregistered\\}"}, // registered sign = registered trade mark sign,
66         //                                 U+00AE ISOnum 
67         {"175", "macr", "\\{\\\\=\\{\\}\\}"}, // macron = spacing macron = overline 
68         //                                 = APL overbar, U+00AF ISOdia 
69         {"176", "deg", "\\$\\\\deg\\$"}, // degree sign, U+00B0 ISOnum  
70         {"177", "plusmn", "\\$\\\\pm\\$"}, // plus-minus sign = plus-or-minus sign, 
71         //                                 U+00B1 ISOnum 
72         {"178", "sup2", "\\\\textsuperscript\\{2\\}"}, // superscript two = superscript digit two 
73         //                                 = squared, U+00B2 ISOnum 
74         {"179", "sup3", "\\\\textsuperscript\\{3\\}"}, // superscript three = superscript digit three 
75         //                                 = cubed, U+00B3 ISOnum 
76         {"180", "acute", "\\{\\\\'\\{\\}\\}"}, // acute accent = spacing acute, 
77         //                                 U+00B4 ISOdia 
78         {"181", "micro", "\\$\\\\mu\\$"}, // micro sign, U+00B5 ISOnum  
79         {"182", "para", "\\{\\\\P\\}"}, // pilcrow sign = paragraph sign, 
80         //                                 U+00B6 ISOnum 
81         {"183", "middot", "\\$\\\\cdot\\$"}, // middle dot = Georgian comma 
82         //                                 = Greek middle dot, U+00B7 ISOnum 
83         {"184", "cedil", "\\{\\\\c\\{\\}\\}"}, // cedilla = spacing cedilla, U+00B8 ISOdia  
84         {"185", "sup1", "\\\\textsuperscript\\{1\\}"}, // superscript one = superscript digit one,
85         //                                 U+00B9 ISOnum 
86         {"186", "ordm", "\\{\\\\textordmasculine\\}"}, // masculine ordinal indicator,
87         //                                 U+00BA ISOnum 
88         {"187", "raquo", "\\{\\\\guillemotright\\}"}, // right-pointing double angle quotation mark
89         //                                 = right pointing guillemet, U+00BB ISOnum 
90         {"188", "frac14", "\\$\\\\sfrac\\{1\\}\\{4\\}\\$"}, // vulgar fraction one quarter 
91         //                                 = fraction one quarter, U+00BC ISOnum 
92         {"189", "frac12", "\\$\\\\sfrac\\{1\\}\\{2\\}\\$"}, // vulgar fraction one half 
93         //                                 = fraction one half, U+00BD ISOnum 
94         {"190", "frac34", "\\$\\\\sfrac\\{3\\}\\{4\\}\\$"}, // vulgar fraction three quarters 
95         //                                 = fraction three quarters, U+00BE ISOnum 
96         {"191", "iquest", "\\{\\\\textquestiondown\\}"}, // inverted question mark 
97         //                                 = turned question mark, U+00BF ISOnum 
98         {"192", "Agrave", "\\{\\\\`\\{A\\}\\}"}, // latin capital letter A with grave
99         //                                 = latin capital letter A grave,
100         //                                 U+00C0 ISOlat1 
101         {"193", "Aacute", "\\{\\\\'\\{A\\}\\}"}, // latin capital letter A with acute, 
102         //                                 U+00C1 ISOlat1 
103         {"194", "Acirc", "\\{\\\\\\^\\{A\\}\\}"}, // latin capital letter A with circumflex, 
104         //                                 U+00C2 ISOlat1 
105         {"195", "Atilde", "\\{\\\\~\\{A\\}\\}"}, // latin capital letter A with tilde, 
106         //                                 U+00C3 ISOlat1 
107         {"196", "Auml", "\\{\\\\\"\\{A\\}\\}"}, // latin capital letter A with diaeresis, 
108         //                                 U+00C4 ISOlat1 
109         {"197", "Aring", "\\{\\\\AA\\}"}, // latin capital letter A with ring above 
110         //                                 = latin capital letter A ring,
111         //                                 U+00C5 ISOlat1 
112         {"198", "AElig", "\\{\\\\AE\\}"}, // latin capital letter AE 
113         //                                 = latin capital ligature AE,
114         //                                 U+00C6 ISOlat1 
115         {"199", "Ccedil", "\\{\\\\c\\{C\\}\\}"}, // latin capital letter C with cedilla,
116         //                                 U+00C7 ISOlat1 
117         {"200", "Egrave", "\\{\\\\`\\{E\\}\\}"}, // latin capital letter E with grave,
118         //                                 U+00C8 ISOlat1 
119         {"201", "Eacute", "\\{\\\\'\\{E\\}\\}"}, // latin capital letter E with acute, 
120         //                                 U+00C9 ISOlat1 
121         {"202", "Ecirc", "\\{\\\\\\^\\{E\\}\\}"}, // latin capital letter E with circumflex, 
122         //                                 U+00CA ISOlat1 
123         {"203", "Euml", "\\{\\\\\"\\{E\\}\\}"}, // latin capital letter E with diaeresis, 
124         //                                 U+00CB ISOlat1 
125         {"204", "Igrave", "\\{\\\\`\\{I\\}\\}"}, // latin capital letter I with grave,
126         //                                 U+00CC ISOlat1 
127         {"205", "Iacute", "\\{\\\\'\\{I\\}\\}"}, // latin capital letter I with acute, 
128         //                                 U+00CD ISOlat1 
129         {"206", "Icirc", "\\{\\\\\\^\\{I\\}\\}"}, // latin capital letter I with circumflex, 
130         //                                 U+00CE ISOlat1 
131         {"207", "Iuml", "\\{\\\\\"\\{I\\}\\}"}, // latin capital letter I with diaeresis, 
132         //                                 U+00CF ISOlat1 
133         {"208", "ETH", "\\{\\\\DH\\}"}, // latin capital letter ETH, U+00D0 ISOlat1  
134         {"209", "Ntilde", "\\{\\\\~\\{N\\}\\}"}, // latin capital letter N with tilde, 
135         //                                 U+00D1 ISOlat1 
136         {"210", "Ograve", "\\{\\\\`\\{O\\}\\}"}, // latin capital letter O with grave,
137         //                                 U+00D2 ISOlat1 
138         {"211", "Oacute", "\\{\\\\'\\{O\\}\\}"}, // latin capital letter O with acute, 
139         //                                 U+00D3 ISOlat1 
140         {"212", "Ocirc", "\\{\\\\\\^\\{O\\}\\}"}, // latin capital letter O with circumflex, 
141         //                                 U+00D4 ISOlat1 
142         {"213", "Otilde", "\\{\\\\~\\{O\\}\\}"}, // latin capital letter O with tilde, 
143         //                                 U+00D5 ISOlat1 
144         {"214", "Ouml", "\\{\\\\\"\\{O\\}\\}"}, // latin capital letter O with diaeresis, 
145         //                                 U+00D6 ISOlat1 
146         {"215", "times", "\\$\\\\times\\$"}, // multiplication sign, U+00D7 ISOnum  
147         {"216", "Oslash", "\\{\\\\O\\}"}, // latin capital letter O with stroke 
148         //                                 = latin capital letter O slash,
149         //                                 U+00D8 ISOlat1 
150         {"217", "Ugrave", "\\{\\\\`\\{U\\}\\}"}, // latin capital letter U with grave,
151         //                                 U+00D9 ISOlat1 
152         {"218", "Uacute", "\\{\\\\'\\{U\\}\\}"}, // latin capital letter U with acute, 
153         //                                 U+00DA ISOlat1 
154         {"219", "Ucirc", "\\{\\\\\\^\\{U\\}\\}"}, // latin capital letter U with circumflex, 
155         //                                 U+00DB ISOlat1 
156         {"220", "Uuml", "\\{\\\\\"\\{U\\}\\}"}, // latin capital letter U with diaeresis, 
157         //                                 U+00DC ISOlat1 
158         {"221", "Yacute", "\\{\\\\'\\{Y\\}\\}"}, // latin capital letter Y with acute, 
159         //                                 U+00DD ISOlat1 
160         {"222", "THORN", "\\{\\\\TH\\}"}, // latin capital letter THORN, 
161         //                                 U+00DE ISOlat1 
162         {"223", "szlig", "\\{\\\\ss\\}"}, // latin small letter sharp s = ess-zed,
163         //                                 U+00DF ISOlat1 
164         {"224", "agrave", "\\{\\\\`\\{a\\}\\}"}, // latin small letter a with grave
165         //                                 = latin small letter a grave,
166         //                                 U+00E0 ISOlat1 
167         {"225", "aacute", "\\{\\\\'\\{a\\}\\}"}, // latin small letter a with acute, 
168         //                                 U+00E1 ISOlat1 
169         {"226", "acirc", "\\{\\\\\\^\\{a\\}\\}"}, // latin small letter a with circumflex, 
170         //                                 U+00E2 ISOlat1 
171         {"227", "atilde", "\\{\\\\~\\{a\\}\\}"}, // latin small letter a with tilde, 
172         //                                 U+00E3 ISOlat1 
173         {"228", "auml", "\\{\\\\\"\\{a\\}\\}"}, // latin small letter a with diaeresis, 
174         //                                 U+00E4 ISOlat1 
175         {"229", "aring", "\\{\\\\aa\\}"}, // latin small letter a with ring above 
176         //                                 = latin small letter a ring,
177         //                                 U+00E5 ISOlat1 
178         {"230", "aelig", "\\{\\\\ae\\}"}, // latin small letter ae 
179         //                                 = latin small ligature ae, U+00E6 ISOlat1 
180         {"231", "ccedil", "\\{\\\\c\\{c\\}\\}"}, // latin small letter c with cedilla,
181         //                                 U+00E7 ISOlat1 
182         {"232", "egrave", "\\{\\\\`\\{e\\}\\}"}, // latin small letter e with grave,
183         //                                 U+00E8 ISOlat1 
184         {"233", "eacute", "\\{\\\\'\\{e\\}\\}"}, // latin small letter e with acute, 
185         //                                 U+00E9 ISOlat1 
186         {"234", "ecirc", "\\{\\\\\\^\\{e\\}\\}"}, // latin small letter e with circumflex, 
187         //                                 U+00EA ISOlat1 
188         {"235", "euml", "\\{\\\\\"\\{e\\}\\}"}, // latin small letter e with diaeresis, 
189         //                                 U+00EB ISOlat1 
190         {"236", "igrave", "\\{\\\\`\\{\\\\i\\}\\}"}, // latin small letter i with grave,
191         //                                 U+00EC ISOlat1 
192         {"237", "iacute", "\\{\\\\'\\{\\\\i\\}\\}"}, // latin small letter i with acute, 
193         //                                 U+00ED ISOlat1 
194         {"238", "icirc", "\\{\\\\\\^\\{\\\\i\\}\\}"}, // latin small letter i with circumflex, 
195         //                                 U+00EE ISOlat1 
196         {"239", "iuml", "\\{\\\\\"\\{\\\\i\\}\\}"}, // latin small letter i with diaeresis, 
197         //                                 U+00EF ISOlat1 
198         {"240", "eth", "\\{\\\\dh\\}"}, // latin small letter eth, U+00F0 ISOlat1  
199         {"241", "ntilde", "\\{\\\\~\\{n\\}\\}"}, // latin small letter n with tilde, 
200         //                                 U+00F1 ISOlat1 
201         {"242", "ograve", "\\{\\\\`\\{o\\}\\}"}, // latin small letter o with grave,
202         //                                 U+00F2 ISOlat1 
203         {"243", "oacute", "\\{\\\\'\\{o\\}\\}"}, // latin small letter o with acute, 
204         //                                 U+00F3 ISOlat1 
205         {"244", "ocirc", "\\{\\\\\\^\\{o\\}\\}"}, // latin small letter o with circumflex, 
206         //                                 U+00F4 ISOlat1 
207         {"245", "otilde", "\\{\\\\~\\{o\\}\\}"}, // latin small letter o with tilde, 
208         //                                 U+00F5 ISOlat1 
209         {"246", "ouml", "\\{\\\\\"\\{o\\}\\}"}, // latin small letter o with diaeresis, 
210         //                                 U+00F6 ISOlat1 
211         {"247", "divide", "\\$\\\\div\\$"}, // division sign, U+00F7 ISOnum  
212         {"248", "oslash", "\\{\\\\o\\}"}, // latin small letter o with stroke, 
213         //                                 = latin small letter o slash,
214         //                                 U+00F8 ISOlat1 
215         {"249", "ugrave", "\\{\\\\`\\{u\\}\\}"}, // latin small letter u with grave,
216         //                                 U+00F9 ISOlat1 
217         {"250", "uacute", "\\{\\\\'\\{u\\}\\}"}, // latin small letter u with acute, 
218         //                                 U+00FA ISOlat1 
219         {"251", "ucirc", "\\{\\\\\\^\\{u\\}\\}"}, // latin small letter u with circumflex, 
220         //                                 U+00FB ISOlat1 
221         {"252", "uuml", "\\{\\\\\"\\{u\\}\\}"}, // latin small letter u with diaeresis, 
222         //                                 U+00FC ISOlat1 
223         {"253", "yacute", "\\{\\\\'\\{y\\}\\}"}, // latin small letter y with acute, 
224         //                                 U+00FD ISOlat1 
225         {"254", "thorn", "\\{\\\\th\\}"}, // latin small letter thorn, 
226         //                                 U+00FE ISOlat1 
227         {"255", "yuml", "\\{\\\\\"\\{y\\}\\}"}, // latin small letter y with diaeresis, 
228         //                                 U+00FF ISOlat1 
229         {"402", "fnof", "\\$f\\$"}, // latin small f with hook = function 
230         //                                   = florin, U+0192 ISOtech 
231
232         /* Greek */
233         {"913", "Alpha", "\\{\\$\\\\Alpha\\$\\}"}, // greek capital letter alpha, U+0391  
234         {"914", "Beta", "\\{\\$\\\\Beta\\$\\}"}, // greek capital letter beta, U+0392  
235         {"915", "Gamma", "\\{\\$\\\\Gamma\\$\\}"}, // greek capital letter gamma, 
236         //                                   U+0393 ISOgrk3 
237         {"916", "Delta", "\\{\\$\\\\Delta\\$\\}"}, // greek capital letter delta, 
238         //                                   U+0394 ISOgrk3 
239         {"917", "Epsilon", "\\{\\$\\\\Epsilon\\$\\}"}, // greek capital letter epsilon, U+0395  
240         {"918", "Zeta", "\\{\\$\\\\Zeta\\$\\}"}, // greek capital letter zeta, U+0396  
241         {"919", "Eta", "\\{\\$\\\\Eta\\$\\}"}, // greek capital letter eta, U+0397  
242         {"920", "Theta", "\\{\\$\\\\Theta\\$\\}"}, // greek capital letter theta, 
243         //                                   U+0398 ISOgrk3 
244         {"921", "Iota", "\\{\\$\\\\Iota\\$\\}"}, // greek capital letter iota, U+0399  
245         {"922", "Kappa", "\\{\\$\\\\Kappa\\$\\}"}, // greek capital letter kappa, U+039A  
246         {"923", "Lambda", "\\{\\$\\\\Lambda\\$\\}"}, // greek capital letter lambda, 
247         //                                   U+039B ISOgrk3 
248         {"924", "Mu", "\\{\\$\\\\Mu\\$\\}"}, // greek capital letter mu, U+039C  
249         {"925", "Nu", "\\{\\$\\\\Nu\\$\\}"}, // greek capital letter nu, U+039D  
250         {"926", "Xi", "\\{\\$\\\\Xi\\$\\}"}, // greek capital letter xi, U+039E ISOgrk3  
251         {"927", "Omicron", "\\{\\$\\\\Omicron\\$\\}"}, // greek capital letter omicron, U+039F  
252         {"928", "Pi", "\\{\\$\\\\Pi\\$\\}"}, // greek capital letter pi, U+03A0 ISOgrk3  
253         {"929", "Rho", "\\{\\$\\\\Rho\\$\\}"}, // greek capital letter rho, U+03A1  
254         /* there is no Sigmaf, and no U+03A2 character either */
255         {"931", "Sigma", "\\{\\$\\\\Sigma\\$\\}"}, // greek capital letter sigma, 
256         //                                   U+03A3 ISOgrk3 
257         {"932", "Tau", "\\{\\$\\\\Tau\\$\\}"}, // greek capital letter tau, U+03A4  
258         {"933", "Upsilon", "\\{\\$\\\\Upsilon\\$\\}"}, // greek capital letter upsilon, 
259         //                                   U+03A5 ISOgrk3 
260         {"934", "Phi", "\\{\\$\\\\Phi\\$\\}"}, // greek capital letter phi, 
261         //                                   U+03A6 ISOgrk3 
262         {"935", "Chi", "\\{\\$\\\\Chi\\$\\}"}, // greek capital letter chi, U+03A7  
263         {"936", "Psi", "\\{\\$\\\\Psi\\$\\}"}, // greek capital letter psi, 
264         //                                   U+03A8 ISOgrk3 
265         {"937", "Omega", "\\{\\$\\\\Omega\\$\\}"}, // greek capital letter omega, 
266         //                                   U+03A9 ISOgrk3 
267
268         {"945", "alpha", "\\$\\\\alpha\\$"}, // greek small letter alpha, 
269         //                                   U+03B1 ISOgrk3 
270         {"946", "beta", "\\$\\\\beta\\$"}, // greek small letter beta, U+03B2 ISOgrk3  
271         {"947", "gamma", "\\$\\\\gamma\\$"}, // greek small letter gamma, 
272         //                                   U+03B3 ISOgrk3 
273         {"948", "delta", "\\$\\\\delta\\$"}, // greek small letter delta, 
274         //                                   U+03B4 ISOgrk3 
275         {"949", "epsilon", "\\$\\\\epsilon\\$"}, // greek small letter epsilon, 
276         //                                   U+03B5 ISOgrk3 
277         {"950", "zeta", "\\$\\\\zeta\\$"}, // greek small letter zeta, U+03B6 ISOgrk3  
278         {"951", "eta", "\\$\\\\eta\\$"}, // greek small letter eta, U+03B7 ISOgrk3  
279         {"952", "theta", "\\$\\\\theta\\$"}, // greek small letter theta, 
280         //                                   U+03B8 ISOgrk3 
281         {"953", "iota", "\\$\\\\iota\\$"}, // greek small letter iota, U+03B9 ISOgrk3  
282         {"954", "kappa", "\\$\\\\kappa\\$"}, // greek small letter kappa, 
283         //                                   U+03BA ISOgrk3 
284         {"955", "lambda", "\\$\\\\lambda\\$"}, // greek small letter lambda, 
285         //                                   U+03BB ISOgrk3 
286         {"956", "mu", "\\$\\\\mu\\$"}, // greek small letter mu, U+03BC ISOgrk3  
287         {"957", "nu", "\\$\\\\nu\\$"}, // greek small letter nu, U+03BD ISOgrk3  
288         {"958", "xi", "\\$\\\\xi\\$"}, // greek small letter xi, U+03BE ISOgrk3  
289         {"959", "omicron", "\\$\\\\omicron\\$"}, // greek small letter omicron, U+03BF NEW  
290         {"960", "pi", "\\$\\\\phi\\$"}, // greek small letter pi, U+03C0 ISOgrk3  
291         {"961", "rho", "\\$\\\\rho\\$"}, // greek small letter rho, U+03C1 ISOgrk3  
292         {"962", "sigmaf", "\\$\\\\varsigma\\$"}, // greek small letter final sigma, 
293         //                                   U+03C2 ISOgrk3 
294         {"963", "sigma", "\\$\\\\sigma\\$"}, // greek small letter sigma, 
295         //                                   U+03C3 ISOgrk3 
296         {"964", "tau", "\\$\\\\tau\\$"}, // greek small letter tau, U+03C4 ISOgrk3  
297         {"965", "upsilon", "\\$\\\\upsilon\\$"}, // greek small letter upsilon, 
298         {"", "upsi", "\\$\\\\upsilon\\$"}, // alias 
299         //                                   U+03C5 ISOgrk3 
300         {"966", "phi", "\\$\\\\phi\\$"}, // greek small letter phi, U+03C6 ISOgrk3  
301         {"967", "chi", "\\$\\\\chi\\$"}, // greek small letter chi, U+03C7 ISOgrk3  
302         {"968", "psi", "\\$\\\\psi\\$"}, // greek small letter psi, U+03C8 ISOgrk3  
303         {"969", "omega", "\\$\\\\omega\\$"}, // greek small letter omega, 
304         //                                   U+03C9 ISOgrk3 
305         {"977", "thetasym", "\\$\\\\vartheta\\$"}, // greek small letter theta symbol, 
306         {"", "thetav", "\\$\\\\vartheta\\$"}, // greek small letter theta symbol, 
307         {"", "vartheta", "\\$\\\\vartheta\\$"}, // greek small letter theta symbol, 
308         //                                   U+03D1 NEW 
309         {"978", "upsih", "\\{\\$\\\\Upsilon\\$\\}"}, // greek upsilon with hook symbol, 
310         //                                   U+03D2 NEW 
311         {"982", "piv", "\\$\\\\varphi\\$"}, // greek pi symbol, U+03D6 ISOgrk3  
312
313         /* General Punctuation */
314         {"8226", "bull", "\\$\\\\bullet\\$"}, // bullet = black small circle, 
315         //                                    U+2022 ISOpub  
316         /* bullet is NOT the same as bullet operator, U+2219 */
317         {"8230", "hellip", "\\{\\\\ldots\\}"}, // horizontal ellipsis = three dot leader, 
318         //                                    U+2026 ISOpub  
319         {"8242", "prime", "\\$\\\\prime\\$"}, // prime = minutes = feet, U+2032 ISOtech  
320         {"8243", "Prime", "\\$\\{''\\}\\$"}, // double prime = seconds = inches, 
321         //                                    U+2033 ISOtech 
322         {"8254", "oline", "\\{\\\\=\\{\\}\\}"}, // overline = spacing overscore, 
323         //                                    U+203E NEW 
324         {"8260", "frasl", "/"}, // fraction slash, U+2044 NEW  
325
326         /* Letterlike Symbols */
327         {"8472", "weierp", "\\$\\\\wp\\$"}, // script capital P = power set 
328         //                                    = Weierstrass p, U+2118 ISOamso 
329         {"8465", "image", "\\{\\$\\\\Im\\$\\}"}, // blackletter capital I = imaginary part, 
330         //                                    U+2111 ISOamso 
331         {"8476", "real", "\\{\\$\\\\Re\\$\\}"}, // blackletter capital R = real part symbol, 
332         //                                    U+211C ISOamso 
333         {"8482", "trade", "\\{\\\\texttrademark\\}"}, // trade mark sign, U+2122 ISOnum
334         {"8501", "alefsym", "\\$\\\\aleph\\$"}, // alef symbol = first transfinite cardinal, 
335         //                                    U+2135 NEW 
336         /*    alef symbol is NOT the same as hebrew letter alef,
337          U+05D0 although the same glyph could be used to depict both characters */
338         /* Arrows */
339         {"8592", "larr", "\\$\\\\leftarrow\\$"}, // leftwards arrow, U+2190 ISOnum
340         {"8593", "uarr", "\\$\\\\uparrow\\$"}, // upwards arrow, U+2191 ISOnum
341         {"8594", "rarr", "\\$\\\\rightarrow\\$"}, // rightwards arrow, U+2192 ISOnum
342         {"8595", "darr", "\\$\\\\downarrow\\$"}, // downwards arrow, U+2193 ISOnum
343         {"8596", "harr", "\\$\\\\leftrightarrow\\$"}, // left right arrow, U+2194 ISOamsa  
344         {"8629", "crarr", "\\$\\\\dlsh\\$"}, // downwards arrow with corner leftwards 
345         //                                    = carriage return, U+21B5 NEW; requires the mathabx package
346         {"8656", "lArr", "\\{\\$\\\\Leftarrow\\$\\}"}, // leftwards double arrow, U+21D0 ISOtech
347         /*  ISO 10646 does not say that lArr is the same as the 'is implied by' arrow,
348          but it also does not have any other character for that function, so lArr can
349          be used for 'is implied by', as ISOtech suggests */
350         {"8657", "uArr", "\\{\\$\\\\Uparrow\\$\\}"}, // upwards double arrow, U+21D1 ISOamsa
351         {"8658", "rArr", "\\{\\$\\\\Rightarrow\\$\\}"}, // rightwards double arrow,
352         //                                     U+21D2 ISOtech 
353         /*   ISO 10646 does not say this is the 'implies' character, but it does not have
354          another character with this function, so
355          rArr can be used for 'implies', as ISOtech suggests */
356         {"8659", "dArr", "\\{\\$\\\\Downarrow\\$\\}"}, // downwards double arrow, U+21D3 ISOamsa  
357         {"8660", "hArr", "\\{\\$\\\\Leftrightarrow\\$\\}"}, // left right double arrow, 
358         //                                     U+21D4 ISOamsa 
359
360         /* Mathematical Operators */
361         {"8704", "forall", "\\$\\\\forall\\$"}, // for all, U+2200 ISOtech  
362         {"8706", "part", "\\$\\\\partial\\$"}, // partial differential, U+2202 ISOtech
363         {"8707", "exist", "\\$\\\\exists\\$"}, // there exists, U+2203 ISOtech
364         {"8709", "empty", "\\$\\\\emptyset\\$"}, // empty set = null set = diameter,
365         //                                    U+2205 ISOamso 
366         {"8711", "nabla", "\\$\\\\nabla\\$"}, // nabla = backward difference, 
367         //                                    U+2207 ISOtech 
368         {"8712", "isin", "\\$\\\\in\\$"}, // element of, U+2208 ISOtech
369         {"8713", "notin", "\\$\\\\notin\\$"}, // not an element of, U+2209 ISOtech
370         {"8715", "ni", "\\$\\\\ni\\$"}, // contains as member, U+220B ISOtech
371         /* should there be a more memorable name than 'ni'? */
372         {"8719", "prod", "\\$\\\\prod\\$"}, // n-ary product = product sign,
373         //                                    U+220F ISOamsb 
374         /*    prod is NOT the same character as U+03A0 'greek capital letter pi' though
375          the same glyph might be used for both  */
376         {"8721", "sum", "\\$\\\\sum\\$"}, // n-ary sumation, U+2211 ISOamsb  
377         /*    sum is NOT the same character as U+03A3 'greek capital letter sigma'
378          though the same glyph might be used for both */
379         {"8722", "minus", "\\$-\\$"}, // minus sign, U+2212 ISOtech  
380         {"8727", "lowast", "\\$\\\\ast\\$"}, // asterisk operator, U+2217 ISOtech  
381         {"8730", "radic", "\\$\\\\sqrt{}\\$"}, // square root = radical sign, 
382         //                                    U+221A ISOtech 
383         {"8733", "prop", "\\$\\\\propto\\$"}, // proportional to, U+221D ISOtech  
384         {"8734", "infin", "\\$\\\\infty\\$"}, // infinity, U+221E ISOtech  
385         {"8736", "ang", "\\$\\\\angle\\$"}, // angle, U+2220 ISOamso
386         {"8743", "and", "\\$\\\\land\\$"}, // logical and = wedge, U+2227 ISOtech
387         {"8744", "or", "\\$\\\\lor\\$"}, // logical or = vee, U+2228 ISOtech
388         {"8745", "cap", "\\$\\\\cap\\$"}, // intersection = cap, U+2229 ISOtech
389         {"8746", "cup", "\\$\\\\cup\\$"}, // union = cup, U+222A ISOtech
390         {"8747", "int", "\\$\\\\int\\$"}, // integral, U+222B ISOtech
391         {"8756", "there4", "\\$\\\\uptherefore\\$"}, // therefore, U+2234 ISOtech; only in LaTeX package MnSymbol
392         {"8764", "sim", "\\$\\\\sim\\$"}, // tilde operator = varies with = similar to,
393         //                                    U+223C ISOtech 
394         /*  tilde operator is NOT the same character as the tilde, U+007E,
395          although the same glyph might be used to represent both   */
396         {"8773", "cong", "\\$\\\\cong\\$"}, // approximately equal to, U+2245 ISOtech  
397         {"8776", "asymp", "\\$\\\\approx\\$"}, // almost equal to = asymptotic to, 
398         //                                    U+2248 ISOamsr 
399         {"8800", "ne", "\\$\\\\neq\\$"}, // not equal to, U+2260 ISOtech  
400         {"8801", "equiv", "\\$\\\\equiv\\$"}, // identical to, U+2261 ISOtech  
401         {"8804", "le", "\\$\\\\leq\\$"}, // less-than or equal to, U+2264 ISOtech  
402         {"8805", "ge", "\\$\\\\geq\\$"}, // greater-than or equal to, 
403         //                                    U+2265 ISOtech 
404         {"8834", "sub", "\\$\\\\subset\\$"}, // subset of, U+2282 ISOtech  
405         {"8835", "sup", "\\$\\\\supset\\$"}, // superset of, U+2283 ISOtech  
406         /*    note that nsup ('not a superset of', U+2285) is not covered by the Symbol
407          font encoding and is not included. Should it be, for symmetry?
408          It is in ISOamsn   */
409         {"8836", "nsub", "\\$\\\\nsubset\\$"}, // not a subset of, U+2284 ISOamsn  
410         {"8838", "sube", "\\$\\\\subseteq\\$"}, // subset of or equal to, U+2286 ISOtech  
411         {"8839", "supe", "\\$\\\\supseteq\\$"}, // superset of or equal to, 
412         //                                    U+2287 ISOtech 
413         {"8853", "oplus", "\\$\\\\oplus\\$"}, // circled plus = direct sum, 
414         //                                    U+2295 ISOamsb 
415         {"8855", "otimes", "\\$\\\\otimes\\$"}, // circled times = vector product,
416         //                                    U+2297 ISOamsb 
417         {"8869", "perp", "\\$\\\\perp\\$"}, // up tack = orthogonal to = perpendicular, 
418         //                                    U+22A5 ISOtech 
419         {"8901", "sdot", "\\$\\\\cdot\\$"}, // dot operator, U+22C5 ISOamsb  
420         /* dot operator is NOT the same character as U+00B7 middle dot */
421         /* Miscellaneous Technical */
422         {"8968", "lceil", "\\$\\\\lceil\\$"}, // left ceiling = apl upstile, 
423         //                                    U+2308 ISOamsc  
424         {"8969", "rceil", "\\$\\\\rceil\\$"}, // right ceiling, U+2309 ISOamsc   
425         {"8970", "lfloor", "\\$\\\\lfloor\\$"}, // left floor = apl downstile, 
426         //                                    U+230A ISOamsc  
427         {"8971", "rfloor", "\\$\\\\rfloor\\$"}, // right floor, U+230B ISOamsc   
428         {"9001", "lang", "\\$\\\\langle\\$"}, // left-pointing angle bracket = bra, 
429         //                                    U+2329 ISOtech 
430         /*    lang is NOT the same character as U+003C 'less than' 
431          or U+2039 'single left-pointing angle quotation mark' */
432         {"9002", "rang", "\\$\\\\rangle\\$"}, // right-pointing angle bracket = ket, 
433         //                                    U+232A ISOtech 
434         /*    rang is NOT the same character as U+003E 'greater than' 
435          or U+203A 'single right-pointing angle quotation mark' */
436         /* Geometric Shapes */
437         {"9674", "loz", "\\$\\\\lozenge\\$"}, // lozenge, U+25CA ISOpub  
438
439         /* Miscellaneous Symbols */
440         {"9824", "spades", "\\$\\\\spadesuit\\$"}, // black spade suit, U+2660 ISOpub  
441         /* black here seems to mean filled as opposed to hollow */
442         {"9827", "clubs", "\\$\\\\clubsuit\\$"}, // black club suit = shamrock, 
443         //                                    U+2663 ISOpub 
444         {"9829", "hearts", "\\$\\\\heartsuit\\$"}, // black heart suit = valentine, 
445         //                                    U+2665 ISOpub 
446         {"9830", "diams", "\\$\\\\diamondsuit\\$"}, // black diamond suit, U+2666 ISOpub  
447         {"34", "quot", "\""}, // quotation mark = APL quote,
448         //                                   U+0022 ISOnum 
449         {"38", "amp", "\\\\&"}, // ampersand, U+0026 ISOnum 
450         {"60", "lt", "\\$<\\$"}, // less-than sign, U+003C ISOnum 
451         {"62", "gt", "\\$>\\$"}, // greater-than sign, U+003E ISOnum 
452
453         /* Latin Extended-A */
454         {"338", "OElig", "\\{\\\\OE\\}"}, // latin capital ligature OE,
455         //                                   U+0152 ISOlat2 
456         {"339", "oelig", "\\{\\\\oe\\}"}, // latin small ligature oe, U+0153 ISOlat2 
457         /* ligature is a misnomer, this is a separate character in some languages */
458         {"352", "Scaron", "\\{\\\\v\\{S\\}\\}"}, // latin capital letter S with caron,
459         //                                   U+0160 ISOlat2 
460         {"353", "scaron", "\\{\\\\v\\{s\\}\\}"}, // latin small letter s with caron,
461         //                                   U+0161 ISOlat2 
462         {"376", "Yuml", "\\{\\\\\"\\{Y\\}\\}"}, // latin capital letter Y with diaeresis,
463         //                                   U+0178 ISOlat2 
464
465         /* Spacing Modifier Letters */
466         {"710", "circ", "\\{\\\\textasciicircum\\}"}, // modifier letter circumflex accent,
467         //                                   U+02C6 ISOpub 
468         {"732", "tilde", "\\{\\\\textasciitilde\\}"}, // small tilde, U+02DC ISOdia 
469
470         /* General Punctuation */
471         {"8194", "ensp", "\\\\hspace\\{0.5em\\}"}, // en space, U+2002 ISOpub  
472         {"8195", "emsp", "\\\\hspace\\{1em\\}"}, // em space, U+2003 ISOpub  
473         {"8201", "thinsp", "\\\\hspace\\{0.167em\\}"}, // thin space, U+2009 ISOpub  
474         {"8204", "zwnj", ""}, // zero width non-joiner, 
475         //                                   U+200C NEW RFC 2070 
476         {"8205", "zwj", ""}, // zero width joiner, U+200D NEW RFC 2070  
477         {"8206", "lrm", ""}, // left-to-right mark, U+200E NEW RFC 2070  
478         {"8207", "rlm", ""}, // right-to-left mark, U+200F NEW RFC 2070  
479         {"8211", "ndash", "--"}, // en dash, U+2013 ISOpub  
480         {"8212", "mdash", "---"}, // em dash, U+2014 ISOpub  
481         {"8216", "lsquo", "\\{\\\\textquoteleft\\}"}, // left single quotation mark, 
482         //                                   U+2018 ISOnum 
483         {"8217", "rsquo", "\\{\\\\textquoteright\\}"}, // right single quotation mark, 
484         //                                   U+2019 ISOnum 
485         {"8218", "sbquo", "\\{\\\\quotesinglbase\\}"}, // single low-9 quotation mark, U+201A NEW  
486         {"8220", "ldquo", "\\{\\\\textquotedblleft\\}"}, // left double quotation mark, 
487         //                                   U+201C ISOnum 
488         {"8221", "rdquo", "\\{\\\\textquotedblright\\}"}, // right double quotation mark, 
489         //                                   U+201D ISOnum 
490         {"8222", "bdquo", "\\{\\\\quotedblbase\\}"}, // double low-9 quotation mark, U+201E NEW  
491         {"8224", "dagger", "\\{\\\\dag\\}"}, // dagger, U+2020 ISOpub  
492         {"8225", "Dagger", "\\{\\\\ddag\\}"}, // double dagger, U+2021 ISOpub  
493         {"8240", "permil", "\\{\\\\textperthousand\\}"}, // per mille sign, U+2030 ISOtech  
494         {"8249", "lsaquo", "\\{\\\\guilsinglleft\\}"}, // single left-pointing angle quotation mark, 
495         //                                   U+2039 ISO proposed 
496         /* lsaquo is proposed but not yet ISO standardized */
497         {"8250", "rsaquo", "\\{\\\\guilsinglright\\}"}, // single right-pointing angle quotation mark, 
498         //                                   U+203A ISO proposed 
499         /* rsaquo is proposed but not yet ISO standardized */
500         {"8364", "euro", "\\{\\\\texteuro\\}"}, // euro sign, U+20AC NEW 
501             
502         /* Manually added */
503         {"24", "dollar", "\\\\$"}, // Percent
504         {"37", "percnt", "\\\\%"}, // Percent
505         {"39", "apos", "'"}, // Apostrophe
506         {"40", "lpar", "("}, // Left bracket
507         {"41", "rpar", ")"}, // Right bracket
508         {"43", "plus", "\\+"}, // Plus
509         {"44", "comma", ","}, // Comma
510         {"45", "hyphen", "-"}, // Hyphen
511         {"46", "period", "\\."}, // Period
512         {"47", "slash", "/"}, // Slash (solidus)
513         {"58", "colon", ":"}, // Colon
514         {"59", "semi", ";"}, // Semi colon
515         {"61", "equals", "="}, // Equals to
516         {"91", "lsqb", "\\["}, // Left square bracket
517         {"92", "bsol", "\\{\\\\textbackslash\\}"}, // Backslash
518         {"93", "rsqb", "\\]"}, // Right square bracket
519         {"94", "Hat", "\\{\\\\\\^\\{\\}\\}"}, // Circumflex
520         {"95", "lowbar", "\\\\_"}, // Underscore
521         {"96", "grave", "\\{\\\\`\\{\\}\\}"}, // Grave
522         {"123", "lbrace", "\\\\\\{"}, // Left curly bracket
523         {"", "lcub", "\\\\\\{"}, // Left curly bracket
524         {"124", "vert", "\\|"}, // Vertical bar
525         {"", "verbar", "\\|"}, // Vertical bar
526         {"", "VerticalLine", "\\|"}, // Vertical bar
527         {"125", "rbrace", "\\\\\\}"}, // Right curly bracket
528         {"", "rcub", "\\\\\\}"}, // Right curly bracket
529         {"138", "", "\\{\\\\v\\{S\\}\\}"}, // Line tabulation set   
530      // {"141", "", ""}, // Reverse line feed
531         {"145", "", "`"}, // Apostrophe
532         {"146", "", "'"}, // Apostrophe
533         {"147", "", "``"}, // Quotation mark
534         {"148", "", "''"}, // Quotation mark
535         {"150", "", "--"}, // En dash
536         {"154", "", "\\{\\\\v\\{s\\}\\}"}, // Single character introducer
537         {"260", "Aogon", "\\{\\\\k\\{A\\}\\}"}, // capital A with ogonek
538         {"261", "aogon", "\\{\\\\k\\{a\\}\\}"}, // small a with ogonek
539         {"262", "Cacute", "\\{\\\\'\\{C\\}\\}"}, // capital C with acute
540         {"263", "cacute", "\\{\\\\'\\{c\\}\\}"}, // small C with acute
541         {"264", "Ccirc", "\\{\\\\\\^\\{C\\}\\}"}, // capital C with circumflex
542         {"265", "ccirc", "\\{\\\\\\^\\{c\\}\\}"}, // small C with circumflex
543         {"266", "Cdot", "\\{\\\\\\.\\{C\\}\\}"}, // capital C with dot above
544         {"267", "cdot", "\\{\\\\\\.\\{c\\}\\}"}, // small C with dot above
545         {"268", "Ccaron", "\\{\\\\v\\{C\\}\\}"}, // capital C with caron
546         {"269", "ccaron", "\\{\\\\v\\{c\\}\\}"}, // small C with caron
547         {"272", "Dstrok", "\\{\\\\DJ\\}"}, // capital D with stroke
548         {"273", "dstrok", "\\{\\\\dj\\}"}, // small d with stroke
549         {"280", "Eogon", "\\{\\\\k\\{E\\}\\}"}, // capital E with ogonek
550         {"281", "eogon", "\\{\\\\k\\{e\\}\\}"}, // small e with ogonek
551         {"298", "Imacr", "\\{\\\\=\\{I\\}\\}"}, // capital I with macron
552         {"299", "imacr", "\\{\\\\=\\{\\\\i\\}\\}"}, // small i with macron
553         {"302", "Iogon", "\\{\\\\k\\{I\\}\\}"}, // capital I with ogonek
554         {"303", "iogon", "\\{\\\\k\\{i\\}\\}"}, // small i with ogonek
555         {"304", "Idot", "\\{\\\\.\\{I\\}\\}"},    // capital I with dot above
556         {"305", "inodot", "\\{\\\\i\\}"},    // Small i without the dot
557         {"", "imath", "\\{\\\\i\\}"},    // Small i without the dot
558         {"321", "Lstrok", "\\{\\\\L\\}"},    // upper case l with stroke
559         {"322", "lstrok", "\\{\\\\l\\}"},    // lower case l with stroke
560         {"370", "Uogon", "\\{\\\\k\\{U\\}\\}"}, // capital U with ogonek
561         {"371", "uogon", "\\{\\\\k\\{u\\}\\}"}, // small u with ogonek
562         {"381", "Zcaron", "\\{\\\\v\\{Z\\}\\}"}, // capital Z with caron
563         {"382", "zcaron", "\\{\\\\v\\{z\\}\\}"}, // small z with caron
564         {"490", "Oogon", "\\{\\\\k\\{O\\}\\}"},    // capital letter O with ogonek
565         {"491", "oogon", "\\{\\\\k\\{o\\}\\}"},    // small letter o with ogonek
566         {"492", "", "\\{\\\\k\\{\\\\=\\{O\\}\\}\\}"},    // capital letter O with ogonek and macron
567         {"493", "", "\\{\\\\k\\{\\\\=\\{o\\}\\}\\}"},    // small letter o with ogonek and macron
568         {"536", "", "\\{\\\\cb\\{S\\}\\}"},    // capital letter S with comma below, require combelow
569         {"537", "", "\\{\\\\cb\\{s\\}\\}"},    // small letter S with comma below, require combelow
570         {"538", "", "\\{\\\\cb\\{T\\}\\}"},    // capital letter T with comma below, require combelow
571         {"539", "", "\\{\\\\cb\\{t\\}\\}"},    // small letter T with comma below, require combelow
572         {"727", "caron", "\\{\\\\v\\{\\}\\}"}, // Caron
573         {"", "Hacek", "\\{\\\\v\\{\\}\\}"}, // Caron
574         {"728", "breve", "\\{\\\\u\\{\\}\\}"}, // Breve
575         {"", "Breve", "\\{\\\\u\\{\\}\\}"}, // Breve
576         {"729", "dot", "\\{\\\\\\.\\{\\}\\}"}, // Dot above
577         {"730", "ring", "\\{\\\\r\\{\\}\\}"}, // Ring above
578         {"731", "ogon", "\\{\\\\k\\{\\}\\}"}, // Ogonek
579         {"733", "dblac", "\\{\\\\H\\{\\}\\}"}, // Double acute
580         {"949", "epsi", "\\$\\\\epsilon\\$"},    // Epsilon - double check
581         {"1013", "epsiv", "\\$\\\\varepsilonup\\$"},    // lunate epsilon, requires txfonts
582         {"1055", "", "\\{\\\\cyrchar\\\\CYRP\\}"},    // Cyrillic capital Pe
583         {"1082", "", "\\{\\\\cyrchar\\\\cyrk\\}"},    // Cyrillic small Ka
584      // {"2013", "", ""},    // NKO letter FA -- Maybe en dash = 0x2013?
585      // {"2014", "", ""},    // NKO letter FA -- Maybe em dash = 0x2014?
586         {"8192", "", "\\\\hspace\\{0.5em\\}"}, // en quad
587         {"8193", "", "\\\\hspace\\{1em\\}"}, // em quad
588         {"8196", "", "\\\\hspace\\{0.333em\\}"}, // Three-Per-Em Space 
589         {"8197", "", "\\\\hspace\\{0.25em\\}"}, // Four-Per-Em Space 
590         {"8198", "", "\\\\hspace\\{0.167em\\}"}, // Six-Per-Em Space
591         {"8208", "hyphen", "-"},    // Hyphen
592         {"8229", "nldr", "\\.\\."},    // Double dots - en leader
593         {"8451", "", "\\$\\\\deg\\$\\{C\\}"}, // Degree Celsius
594         {"8459", "Hscr", "\\$\\\\mathcal\\{H\\}\\$"}, // script capital H -- possibly use \mathscr
595         {"8460", "Hfr", "\\$\\\\mathbb\\{H\\}\\$"}, // black letter capital H -- requires e.g. amsfonts
596         {"8466", "Lscr", "\\$\\\\mathcal\\{L\\}\\$"}, // script capital L -- possibly use \mathscr
597         {"8467", "ell", "\\{\\\\ell\\}"}, // script small l 
598         {"8469", "naturals", "\\$\\\\mathbb\\{N\\}\\$"}, // double struck capital N -- requires e.g. amsfonts
599         {"8486", "", "\\$\\{\\\\Omega\\}\\$"}, // Omega
600         {"8491", "angst", "\\{\\\\AA\\}"}, // Angstrom 
601         {"8496", "Escr", "\\$\\\\mathcal\\{E\\}\\$"}, // script capital E 
602         {"8531", "frac13", "\\$\\\\sfrac\\{1\\}\\{3\\}\\$"},    // Vulgar fraction one third
603         {"8532", "frac23", "\\$\\\\sfrac\\{2\\}\\{3\\}\\$"},    // Vulgar fraction two thirds
604         {"8533", "frac15", "\\$\\\\sfrac\\{1\\}\\{5\\}\\$"},    // Vulgar fraction one fifth
605         {"8534", "frac25", "\\$\\\\sfrac\\{2\\}\\{5\\}\\$"},    // Vulgar fraction two fifths
606         {"8535", "frac35", "\\$\\\\sfrac\\{3\\}\\{5\\}\\$"},    // Vulgar fraction three fifths
607         {"8536", "frac45", "\\$\\\\sfrac\\{4\\}\\{5\\}\\$"},    // Vulgar fraction four fifths
608         {"8537", "frac16", "\\$\\\\sfrac\\{1\\}\\{6\\}\\$"},    // Vulgar fraction one sixth
609         {"8538", "frac56", "\\$\\\\sfrac\\{5\\}\\{6\\}\\$"},    // Vulgar fraction five sixths
610         {"8539", "frac18", "\\$\\\\sfrac\\{1\\}\\{8\\}\\$"},    // Vulgar fraction one eighth
611         {"8540", "frac38", "\\$\\\\sfrac\\{3\\}\\{8\\}\\$"},    // Vulgar fraction three eighths
612         {"8541", "frac58", "\\$\\\\sfrac\\{5\\}\\{8\\}\\$"},    // Vulgar fraction five eighths
613         {"8542", "frac78", "\\$\\\\sfrac\\{7\\}\\{8\\}\\$"},    // Vulgar fraction seven eighths
614         {"8710", "", "\\$\\\\triangle\\$"},    // Increment - could use a more appropriate symbol
615         {"8714", "", "\\$\\\\in\\$"},    // Small element in
616         {"8723", "mp", "\\$\\\\mp\\$"},    // Minus-plus
617         {"8729", "bullet", "\\$\\\\bullet\\$"},    // Bullet operator
618         {"8758", "ratio", ":"},    // Colon/ratio
619         {"8771", "sime", "\\$\\\\simeq\\$"}, // almost equal to = asymptotic to, 
620         {"8776", "ap", "\\$\\\\approx\\$"}, // almost equal to = asymptotic to, 
621         {"8810", "ll", "\\$\\\\ll\\$"}, // Much less than 
622         {"", "Lt", "\\$\\\\ll\\$"}, // Much less than 
623         {"8811", "gg", "\\$\\\\gg\\$"}, // Much greater than 
624         {"", "Gt", "\\$\\\\gg\\$"}, // Much greater than 
625         {"8818", "lsim", "\\$\\\\lesssim\\$"}, // Less than or equivalent to
626         {"8819", "gsim", "\\$\\\\gtrsim\\$"}, // Greater than or equivalent to
627         {"8862", "boxplus", "\\$\\\\boxplus\\$"}, // Boxed plus -- requires amssymb 
628         {"8863", "boxminus", "\\$\\\\boxminus\\$"}, // Boxed minus -- requires amssymb 
629         {"8864", "boxtimes", "\\$\\\\boxtimes\\$"}, // Boxed times -- requires amssymb 
630         {"8882", "vltri", "\\$\\\\triangleleft\\$"}, // Left triangle
631         {"8883", "vrtri", "\\$\\\\triangleright\\$"}, // Right triangle
632         {"8896", "xwedge", "\\$\\\\bigwedge\\$"}, // Big wedge
633         {"8897", "xvee", "\\$\\\\bigvee\\$"}, // Big vee
634         {"9426", "circledc", "\\{\\\\copyright\\}"}, // circled small letter C
635         {"9633", "square", "\\$\\\\square\\$"}, // White square
636         {"9651", "xutri", "\\$\\\\bigtriangleup\\$"}, // White up-pointing big triangle 
637         {"9653", "utri", "\\$\\\\triangle\\$"}, // White up-pointing small triangle -- \vartriangle probably
638                                                 // better but requires amssymb
639         {"10877", "les", "\\$\\\\leqslant\\$"},    // Less than slanted equal -- requires amssymb 
640         {"10878", "ges", "\\$\\\\geqslant\\$"},    // Less than slanted equal -- requires amssymb 
641         {"119978", "Oscr", "\\$\\\\mathcal\\{O\\}\\$"}, // script capital O -- possibly use \mathscr
642         {"119984", "Uscr", "\\$\\\\mathcal\\{U\\}\\$"} // script capital U -- possibly use \mathscr
643         
644     };
645     
646         // List of combining accents
647         private String[][] accentList = new String[][] {
648         {"768", "`"},    // Grave 
649         {"769", "'"},    // Acute
650         {"770", "\\^"},  // Circumflex
651         {"771", "~"},    // Tilde
652         {"772", "="},    // Macron
653         {"773", "="},     // Overline - not completely correct
654         {"774", "u"},    // Breve
655         {"775", "\\."},  // Dot above
656         {"776", "\""},   // Diaeresis
657         {"777", "h"},    // Hook above
658         {"778", "r"},    // Ring 
659         {"779", "H"},    // Double acute
660         {"780", "v"},    // Caron
661         {"781", "\\|"},  // Vertical line above
662         {"782", "U"},     // Double vertical line above
663         {"783", "G"},    // Double grave
664         {"784", "textdotbreve"},    // Candrabindu
665         {"785", "t"},    // Inverted breve
666 //        {"786", ""},    // Turned comma above
667 //        {"787", ""},    // Comma above
668 //        {"788", ""},    // Reversed comma above
669 //        {"789", ""},    // Comma above right
670         {"790", "textsubgrave"},    // Grave accent below -requires tipa
671         {"791", "textsubacute"},    // Acute accent below - requires tipa
672         {"792", "textadvancing"},    // Left tack below - requires tipa
673         {"793", "textretracting"},    // Right tack below - requires tipa
674 //        {"794", ""},    // Left angle above
675 //        {"795", ""},    // Horn
676         {"796", "textsublhalfring"},    // Left half ring below - requires tipa
677         {"797", "textraising"},    // Up tack below - requires tipa
678         {"798", "textlowering"},    // Down tack below - requires tipa
679         {"799", "textsubplus"},    // Plus sign below - requires tipa
680 //        {"800", ""},    // Minus sign below
681 //        {"801", ""},    // Palatalized hook below
682 //        {"802", ""},    // Retroflex hook below
683         {"803", "d"},    // Dot below
684         {"804", "textsubumlaut"},    // Diaeresis below - requires tipa
685         {"805", "textsubring"},    // Ring below - requires tipa
686         {"806", "cb"},    // Comma below - requires combelow
687         {"807", "c"},    // Cedilla
688         {"808", "k"},    // Ogonek
689         {"809", "textsyllabic"},    // Vertical line below - requires tipa
690         {"810", "textsubbridge"},    // Bridge below - requires tipa
691         {"811", "textsubw"},    // Inverted double arch below - requires tipa
692         {"812", "textsubwedge"},    // Caron below
693         {"813", "textsubcircum"},    // Circumflex accent below - requires tipa
694 //        {"814", ""},    // Breve below
695         {"815", "textsubarch"},    // Inverted breve below - requires tipa
696         {"816", "textsubtilde"},    // Tilde below - requires tipa
697         {"817", "b"},    // Macron below - not completely correct
698         {"818", "b"},    // Underline
699         {"819", "subdoublebar"},    // Double low line -- requires extraipa
700         {"820", "textsuperimposetilde"},    // Tilde overlay - requires tipa
701 //        {"821", ""},    // Short stroke overlay
702 //        {"822", ""},    // Long stroke overlay
703 //        {"823", ""},    // Short solidus overlay
704 //        {"824", ""},    // Long solidus overlay
705         {"825", "textsubrhalfring"},    // Right half ring below - requires tipa
706         {"826", "textinvsubbridge"},    // inverted bridge below - requires tipa
707         {"827", "textsubsquare"},    // Square below - requires tipa
708         {"828", "textseagull"},    // Seagull below - requires tipa
709         {"829", "textovercross"},    // X above - requires tipa
710 //        {"830", ""},    // Vertical tilde
711 //        {"831", ""},    // Double overline
712 //        {"832", ""},    // Grave tone mark
713 //        {"833", ""},    // Acute tone mark
714 //        {"834", ""},    // Greek perispomeni
715 //        {"835", ""},    // Greek koronis
716 //        {"836", ""},    // Greek dialytika tonos
717 //        {"837", ""},    // Greek ypogegrammeni
718         {"838", "overbridge"},    // Bridge above - requires extraipa
719         {"839", "subdoublebar"},    // Equals sign below - requires extraipa
720         {"840", "subdoublevert"},    // Double vertical line below - requires extraipa
721         {"841", "subcorner"},    // Left angle below - requires extraipa
722         {"842", "crtilde"},    // Not tilde above - requires extraipa
723         {"843", "dottedtilde"},    // Homothetic above - requires extraipa
724         {"844", "doubletilde"},    // Almost equal to above - requires extraipa
725         {"845", "spreadlips"},    // Left right arrow below - requires extraipa
726         {"846", "whistle"},    // Upwards arrow below - requires extraipa
727 //        {"864", ""},    // Double tilde
728 //        {"865", ""},    // Double inverted breve
729         {"866", "sliding"},    // Double rightwards arrow below - requires extraipa
730         };
731
732         private HashMap<String, String> escapedSymbols = new HashMap<String, String>();
733         private HashMap<Integer, String> escapedAccents = new HashMap<Integer, String>();
734         private HashMap<Integer, String> numSymbols = new HashMap<Integer, String>();
735         private HashMap<Character, String> unicodeSymbols = new HashMap<Character, String>();
736         
737         
738         
739         public HTMLConverter() {
740                 super();
741                 for (int i=0;i<conversionList.length;i++) {
742                     if (conversionList[i][2].length() >= 1) {
743                         if (conversionList[i][1].length() >= 1) {
744                             escapedSymbols.put("&" + conversionList[i][1] + ";" , conversionList[i][2]);
745                         }
746                         if (conversionList[i][0].length() >= 1) {
747                             numSymbols.put(Integer.decode(conversionList[i][0]) , conversionList[i][2]);
748                             if(Integer.decode(conversionList[i][0]).intValue()>128) {
749                                 Character c = new Character((char) Integer.decode(conversionList[i][0]).intValue());
750                                 unicodeSymbols.put(c, conversionList[i][2]);
751                                 // System.err.println(Integer.decode(conversionList[i][0]).toString() + ": " + c.toString() + ": "+ conversionList[i][2]);
752                             }
753                         }
754                     }
755                 }
756                 for (int i=0;i<accentList.length;i++) {
757                     escapedAccents.put(Integer.decode(accentList[i][0]), accentList[i][1]);
758                 }
759         }
760         
761     public String formatUnicode(String text) {
762         if (text == null)
763             return null;    
764         Set<Character> chars = unicodeSymbols.keySet();
765         for (Character character: chars) {
766                 // System.err.println(new Integer((int) character).toString() + ": " + character.toString() + ": " + unicodeSymbols.get(character));
767                 text = text.replaceAll(character.toString(), unicodeSymbols.get(character));
768         }
769         return text;
770     };
771         
772     public String format(String text) {
773         if (text == null)
774             return null;
775         StringBuffer sb = new StringBuffer();
776         // Deal with the form <sup>k</sup>and <sub>k</sub>
777         // If the result is in text or equation form can be controlled
778         // From the "Advanced settings" tab
779         if(Globals.prefs.getBoolean("useConvertToEquation")) {
780             text = text.replaceAll("<[ ]?sup>([^<]+)</sup>", "\\$\\^\\{$1\\}\\$");
781             text = text.replaceAll("<[ ]?sub>([^<]+)</sub>", "\\$_\\{$1\\}\\$");
782         } else {
783             text = text.replaceAll("<[ ]?sup>([^<]+)</sup>", "\\\\textsuperscript\\{$1\\}");
784             text = text.replaceAll("<[ ]?sub>([^<]+)</sub>", "\\\\textsubscript\\{$1\\}");
785         }
786         
787         // TODO: maybe rewrite this based on regular expressions instead
788         // Note that (at least) the IEEE Xplore fetcher must be fixed as it relies on the current way to 
789         // remove tags for its image alt-tag to equation converter
790         for (int i=0; i<text.length(); i++) {
791
792             int c = text.charAt(i);
793
794             if (c == '<') {
795                 i = readTag(text, sb, i);
796             } else
797                 sb.append((char)c);
798
799         }
800         text = sb.toString();
801         
802         // Handle text based HTML entities
803         Set<String> patterns = escapedSymbols.keySet();
804         for (String pattern: patterns) {
805                 text = text.replaceAll(pattern, escapedSymbols.get(pattern));
806         }
807         
808         // Handle numerical HTML entities
809         Pattern escapedPattern = Pattern.compile("&#([x]*)([0]*)(\\p{XDigit}+);");
810         Matcher m = escapedPattern.matcher(text);
811         while (m.find()) {
812             //      System.err.println("Found pattern: " + m.group(1));
813             //      System.err.println("Found pattern: " + m.group(2));
814             int num = Integer.decode(m.group(1).replace("x", "#") + m.group(3));
815             if(numSymbols.containsKey(num)) {
816                 text = text.replaceAll("&#" + m.group(1) + m.group(2) + m.group(3) + ";", numSymbols.get(num));
817             } 
818         }
819
820         escapedPattern = Pattern.compile("(.)&#([x]*)([0]*)(\\p{XDigit}+);");
821         m = escapedPattern.matcher(text);
822         while (m.find()) {
823             //      System.err.println("Found pattern: " + m.group(1));
824             //      System.err.println("Found pattern: " + m.group(2));
825             int num = Integer.decode(m.group(2).replace("x", "#") + m.group(4));
826             if(escapedAccents.containsKey(num)) {
827                 if(m.group(1).equals("i")) {
828                     text = text.replaceAll(m.group(1) + "&#" + m.group(2) + m.group(3) + m.group(4) + ";", "\\{\\\\" + escapedAccents.get(num) + "\\{\\\\i\\}\\}");
829                 } else if(m.group(1).equals("j")){
830                     text = text.replaceAll(m.group(1) + "&#" + m.group(2) + m.group(3) + m.group(4) + ";", "\\{\\\\" + escapedAccents.get(num) + "\\{\\\\j\\}\\}");
831                 } else {
832                     text = text.replaceAll(m.group(1) + "&#" + m.group(2) + m.group(3) + m.group(4) + ";", "\\{\\\\" + escapedAccents.get(num) + "\\{" + m.group(1) + "\\}\\}");
833                 }
834             } 
835         }
836
837         escapedPattern = Pattern.compile("&#([x]*)([0]*)(\\p{XDigit}+);");
838         m = escapedPattern.matcher(text);
839         while (m.find()) {
840             //      System.err.println("Found pattern: " + m.group(1));
841             //      System.err.println("Found pattern: " + m.group(2));
842             int num = Integer.decode(m.group(1).replace("x", "#") + m.group(3));
843             System.err.println("HTML escaped char not converted: " + m.group(1) + m.group(2) + m.group(3) + " = " + Integer.toString(num));
844         }
845         
846         // Remove $$ in case of two adjacent conversions
847         text = text.replace("$$","");
848         
849        // Find non-covered special characters with alphabetic codes
850         escapedPattern = Pattern.compile("&(\\w+);");
851         m = escapedPattern.matcher(text);
852         while (m.find()) {
853             System.err.println("HTML escaped char not converted: " + m.group(1));
854         }
855
856         return text.trim();
857     }
858
859     private final int MAX_TAG_LENGTH = 100;
860     /*private final int MAX_TAG_LENGTH = 30;*/
861     /*private final int MAX_CHAR_LENGTH = 10;
862
863     private int readHtmlChar(String text, StringBuffer sb, int position) {
864         // Have just read the < character that starts the tag.
865         int index = text.indexOf(';', position);
866         if ((index > position) && (index-position < MAX_CHAR_LENGTH)) {
867                 //String code = text.substring(position, index);
868             //System.out.println("Removed code: "+text.substring(position, index));
869             return index; // Just skip the tag.
870         } else return position; // Don't do anything.
871     }*/
872
873     private int readTag(String text, StringBuffer sb, int position) {
874         // Have just read the < character that starts the tag.
875         int index = text.indexOf('>', position);
876         if ((index > position) && (index-position < MAX_TAG_LENGTH)) {
877             //System.out.println("Removed tag: "+text.substring(position, index));
878             return index; // Just skip the tag.
879         } else return position; // Don't do anything.
880     }
881 }