e11decfb1e8bd0b2e5f88e340fc17bca4e49ee52
[debian/jabref.git] / src / java / net / sf / jabref / imports / HTMLConverter.java
1 /*  Copyright (C) 2003-2012 JabRef contributors.
2     This program is free software; you can redistribute it and/or modify
3     it under the terms of the GNU General Public License as published by
4     the Free Software Foundation; either version 2 of the License, or
5     (at your option) any later version.
6
7     This program is distributed in the hope that it will be useful,
8     but WITHOUT ANY WARRANTY; without even the implied warranty of
9     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10     GNU General Public License for more details.
11
12     You should have received a copy of the GNU General Public License along
13     with this program; if not, write to the Free Software Foundation, Inc.,
14     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
15 */
16 package net.sf.jabref.imports;
17
18 import java.util.HashMap;
19 import java.util.Set;
20 import java.util.regex.Matcher;
21 import java.util.regex.Pattern;
22
23 import net.sf.jabref.export.layout.LayoutFormatter;
24 import net.sf.jabref.Globals;
25
26 public class HTMLConverter implements LayoutFormatter {
27
28     /*   Portions © International Organization for Standardization 1986:
29      Permission to copy in any form is granted for use with
30      conforming SGML systems and applications as defined in
31      ISO 8879, provided this notice is included in all copies.
32     */
33
34
35         // Most of the LaTeX commands can be found at http://en.wikibooks.org/wiki/LaTeX/Accents
36         // The symbols can be looked up at http://www.fileformat.info/info/unicode/char/a4/index.htm (replace "a4" with the hexadecimal U+ number)
37         // http://detexify.kirelabs.org/classify.html and http://www.ctan.org/tex-archive/info/symbols/comprehensive/ might help to find the right LaTeX command
38         // http://llg.cubic.org/docs/ent2latex.html and http://www.w3.org/TR/xml-entity-names/byalpha.html are also useful
39     
40     
41     // An array of arrays of strings in the format:
42     // {"decimal number of HTML entity", "text HTML entity", "corresponding LaTeX command"}
43     // Leaving a field empty is OK as it then will not be included
44     private String[][] conversionList = new String[][]{
45         {"160", "nbsp", "\\{~\\}"}, // no-break space = non-breaking space, 
46         //                                 U+00A0 ISOnum 
47         {"161", "iexcl", "\\\\textexclamdown"}, // inverted exclamation mark, U+00A1 ISOnum
48         {"162", "cent", "\\\\textcent"}, // cent sign, U+00A2 ISOnum  
49         {"163", "pound", "\\\\pounds"}, // pound sign, U+00A3 ISOnum
50         {"164", "curren", "\\\\textcurrency"}, // currency sign, U+00A4 ISOnum  
51         {"165", "yen", "\\\\textyen"}, // yen sign = yuan sign, U+00A5 ISOnum  
52         {"166", "brvbar", "\\\\textbrokenbar"}, // broken bar = broken vertical bar, 
53         //                                 U+00A6 ISOnum 
54         {"167", "sect", "\\{\\\\S\\}"}, // section sign, U+00A7 ISOnum  
55         {"168", "uml", "\\\\\"\\{\\}"}, // diaeresis = spacing diaeresis, 
56         //                                 U+00A8 ISOdia 
57         {"169", "copy", "\\\\copyright"}, // copyright sign, U+00A9 ISOnum
58         {"170", "ordf", "\\\\textordfeminine"}, // feminine ordinal indicator, U+00AA ISOnum
59         {"171", "laquo", "\\\\guillemotleft"}, // left-pointing double angle quotation mark
60         //                                 = left pointing guillemet, U+00AB ISOnum 
61         {"172", "not", "\\$\\\\neg\\$"}, // not sign, U+00AC ISOnum  
62         {"173", "shy", "\\\\-"}, // soft hyphen = discretionary hyphen, 
63         //                                 U+00AD ISOnum 
64         {"174", "reg", "\\\\textregistered"}, // registered sign = registered trade mark sign,
65         //                                 U+00AE ISOnum 
66         {"175", "macr", "\\\\=\\{\\}"}, // macron = spacing macron = overline 
67         //                                 = APL overbar, U+00AF ISOdia 
68         {"176", "deg", "\\$\\\\deg\\$"}, // degree sign, U+00B0 ISOnum  
69         {"177", "plusmn", "\\$\\\\pm\\$"}, // plus-minus sign = plus-or-minus sign, 
70         //                                 U+00B1 ISOnum 
71         {"178", "sup2", "\\$\\^2\\$"}, // superscript two = superscript digit two 
72         //                                 = squared, U+00B2 ISOnum 
73         {"179", "sup3", "\\$\\^3\\$"}, // superscript three = superscript digit three 
74         //                                 = cubed, U+00B3 ISOnum 
75         {"180", "acute", "\\\\'\\{\\}"}, // acute accent = spacing acute, 
76         //                                 U+00B4 ISOdia 
77         {"181", "micro", "\\$\\\\mu\\$"}, // micro sign, U+00B5 ISOnum  
78         {"182", "para", "\\{\\\\P\\}"}, // pilcrow sign = paragraph sign, 
79         //                                 U+00B6 ISOnum 
80         {"183", "middot", "\\$\\\\cdot\\$"}, // middle dot = Georgian comma 
81         //                                 = Greek middle dot, U+00B7 ISOnum 
82         {"184", "cedil", "\\\\c\\{\\}"}, // cedilla = spacing cedilla, U+00B8 ISOdia  
83         {"185", "sup1", "\\\\textsuperscript\\{1\\}"}, // superscript one = superscript digit one,
84         //                                 U+00B9 ISOnum 
85         {"186", "ordm", "\\\\textordmasculine"}, // masculine ordinal indicator,
86         //                                 U+00BA ISOnum 
87         {"187", "raquo", "\\\\guillemotright"}, // right-pointing double angle quotation mark
88         //                                 = right pointing guillemet, U+00BB ISOnum 
89         {"188", "frac14", "\\$\\sfrac\\{1\\}\\{4\\}\\$"}, // vulgar fraction one quarter 
90         //                                 = fraction one quarter, U+00BC ISOnum 
91         {"189", "frac12", "\\$\\sfrac\\{1\\}\\{2\\}\\$"}, // vulgar fraction one half 
92         //                                 = fraction one half, U+00BD ISOnum 
93         {"190", "frac34", "\\$\\sfrac\\{3\\}\\{4\\}\\$"}, // vulgar fraction three quarters 
94         //                                 = fraction three quarters, U+00BE ISOnum 
95         {"191", "iquest", "\\\\textquestiondown"}, // inverted question mark 
96         //                                 = turned question mark, U+00BF ISOnum 
97         {"192", "Agrave", "\\\\`\\{A\\}"}, // latin capital letter A with grave
98         //                                 = latin capital letter A grave,
99         //                                 U+00C0 ISOlat1 
100         {"193", "Aacute", "\\\\'\\{A\\}"}, // latin capital letter A with acute, 
101         //                                 U+00C1 ISOlat1 
102         {"194", "Acirc", "\\\\\\^\\{A\\}"}, // latin capital letter A with circumflex, 
103         //                                 U+00C2 ISOlat1 
104         {"195", "Atilde", "\\\\~\\{A\\}"}, // latin capital letter A with tilde, 
105         //                                 U+00C3 ISOlat1 
106         {"196", "Auml", "\\\"\\{A\\}"}, // latin capital letter A with diaeresis, 
107         //                                 U+00C4 ISOlat1 
108         {"197", "Aring", "\\{\\\\AA\\}"}, // latin capital letter A with ring above 
109         //                                 = latin capital letter A ring,
110         //                                 U+00C5 ISOlat1 
111         {"198", "AElig", "\\{\\\\AE\\}"}, // latin capital letter AE 
112         //                                 = latin capital ligature AE,
113         //                                 U+00C6 ISOlat1 
114         {"199", "Ccedil", "\\\\c\\{C\\}"}, // latin capital letter C with cedilla,
115         //                                 U+00C7 ISOlat1 
116         {"200", "Egrave", "\\\\`\\{E\\}"}, // latin capital letter E with grave,
117         //                                 U+00C8 ISOlat1 
118         {"201", "Eacute", "\\\\'\\{E\\}"}, // latin capital letter E with acute, 
119         //                                 U+00C9 ISOlat1 
120         {"202", "Ecirc", "\\\\\\^\\{E\\}"}, // latin capital letter E with circumflex, 
121         //                                 U+00CA ISOlat1 
122         {"203", "Euml", "\\\\\"\\{E\\}"}, // latin capital letter E with diaeresis, 
123         //                                 U+00CB ISOlat1 
124         {"204", "Igrave", "\\\\`\\{I\\}"}, // latin capital letter I with grave,
125         //                                 U+00CC ISOlat1 
126         {"205", "Iacute", "\\\\'\\{I\\}"}, // latin capital letter I with acute, 
127         //                                 U+00CD ISOlat1 
128         {"206", "Icirc", "\\\\\\^\\{I\\}"}, // latin capital letter I with circumflex, 
129         //                                 U+00CE ISOlat1 
130         {"207", "Iuml", "\\\\\"\\{I\\}"}, // latin capital letter I with diaeresis, 
131         //                                 U+00CF ISOlat1 
132         {"208", "ETH", "\\{\\\\DH\\}"}, // latin capital letter ETH, U+00D0 ISOlat1  
133         {"209", "Ntilde", "\\\\~\\{N\\}"}, // latin capital letter N with tilde, 
134         //                                 U+00D1 ISOlat1 
135         {"210", "Ograve", "\\\\`\\{O\\}"}, // latin capital letter O with grave,
136         //                                 U+00D2 ISOlat1 
137         {"211", "Oacute", "\\\\'\\{O\\}"}, // latin capital letter O with acute, 
138         //                                 U+00D3 ISOlat1 
139         {"212", "Ocirc", "\\\\\\^\\{O\\}"}, // latin capital letter O with circumflex, 
140         //                                 U+00D4 ISOlat1 
141         {"213", "Otilde", "\\\\~\\{O\\}"}, // latin capital letter O with tilde, 
142         //                                 U+00D5 ISOlat1 
143         {"214", "Ouml", "\\\\\"\\{O\\}"}, // latin capital letter O with diaeresis, 
144         //                                 U+00D6 ISOlat1 
145         {"215", "times", "\\$\\\\times\\$"}, // multiplication sign, U+00D7 ISOnum  
146         {"216", "Oslash", "\\{\\\\O\\{\\}\\}"}, // latin capital letter O with stroke 
147         //                                 = latin capital letter O slash,
148         //                                 U+00D8 ISOlat1 
149         {"217", "Ugrave", "\\\\`\\{U\\}"}, // latin capital letter U with grave,
150         //                                 U+00D9 ISOlat1 
151         {"218", "Uacute", "\\\\'\\{U\\}"}, // latin capital letter U with acute, 
152         //                                 U+00DA ISOlat1 
153         {"219", "Ucirc", "\\\\\\^\\{U\\}"}, // latin capital letter U with circumflex, 
154         //                                 U+00DB ISOlat1 
155         {"220", "Uuml", "\\\\\"\\{U\\}"}, // latin capital letter U with diaeresis, 
156         //                                 U+00DC ISOlat1 
157         {"221", "Yacute", "\\\\'\\{Y\\}"}, // latin capital letter Y with acute, 
158         //                                 U+00DD ISOlat1 
159         {"222", "THORN", "\\{\\\\TH\\}"}, // latin capital letter THORN, 
160         //                                 U+00DE ISOlat1 
161         {"223", "szlig", "\\\\ss\\{\\}"}, // latin small letter sharp s = ess-zed,
162         //                                 U+00DF ISOlat1 
163         {"224", "agrave", "\\\\`\\{a\\}"}, // latin small letter a with grave
164         //                                 = latin small letter a grave,
165         //                                 U+00E0 ISOlat1 
166         {"225", "aacute", "\\\\'\\{a\\}"}, // latin small letter a with acute, 
167         //                                 U+00E1 ISOlat1 
168         {"226", "acirc", "\\\\\\^\\{a\\}"}, // latin small letter a with circumflex, 
169         //                                 U+00E2 ISOlat1 
170         {"227", "atilde", "\\\\~\\{a\\}"}, // latin small letter a with tilde, 
171         //                                 U+00E3 ISOlat1 
172         {"228", "auml", "\\\\\"\\{a\\}"}, // latin small letter a with diaeresis, 
173         //                                 U+00E4 ISOlat1 
174         {"229", "aring", "\\{\\\\aa\\}"}, // latin small letter a with ring above 
175         //                                 = latin small letter a ring,
176         //                                 U+00E5 ISOlat1 
177         {"230", "aelig", "\\{\\\\ae\\}"}, // latin small letter ae 
178         //                                 = latin small ligature ae, U+00E6 ISOlat1 
179         {"231", "ccedil", "\\\\c\\{c\\}"}, // latin small letter c with cedilla,
180         //                                 U+00E7 ISOlat1 
181         {"232", "egrave", "\\\\`\\{e\\}"}, // latin small letter e with grave,
182         //                                 U+00E8 ISOlat1 
183         {"233", "eacute", "\\\\'\\{e\\}"}, // latin small letter e with acute, 
184         //                                 U+00E9 ISOlat1 
185         {"234", "ecirc", "\\\\\\^\\{e\\}"}, // latin small letter e with circumflex, 
186         //                                 U+00EA ISOlat1 
187         {"235", "euml", "\\\\\"\\{e\\}"}, // latin small letter e with diaeresis, 
188         //                                 U+00EB ISOlat1 
189         {"236", "igrave", "\\\\`\\{i\\}"}, // latin small letter i with grave,
190         //                                 U+00EC ISOlat1 
191         {"237", "iacute", "\\\\'\\{i\\}"}, // latin small letter i with acute, 
192         //                                 U+00ED ISOlat1 
193         {"238", "icirc", "\\\\\\^\\{i\\}"}, // latin small letter i with circumflex, 
194         //                                 U+00EE ISOlat1 
195         {"239", "iuml", "\\\\\"\\{\\\\i\\}"}, // latin small letter i with diaeresis, 
196         //                                 U+00EF ISOlat1 
197         {"240", "eth", "\\\\dh"}, // latin small letter eth, U+00F0 ISOlat1  
198         {"241", "ntilde", "\\\\~\\{n\\}"}, // latin small letter n with tilde, 
199         //                                 U+00F1 ISOlat1 
200         {"242", "ograve", "\\\\`\\{o\\}"}, // latin small letter o with grave,
201         //                                 U+00F2 ISOlat1 
202         {"243", "oacute", "\\\\'\\{o\\}"}, // latin small letter o with acute, 
203         //                                 U+00F3 ISOlat1 
204         {"244", "ocirc", "\\\\\\^\\{o\\}"}, // latin small letter o with circumflex, 
205         //                                 U+00F4 ISOlat1 
206         {"245", "otilde", "\\\\~\\{o\\}"}, // latin small letter o with tilde, 
207         //                                 U+00F5 ISOlat1 
208         {"246", "ouml", "\\\\\"\\{o\\}"}, // latin small letter o with diaeresis, 
209         //                                 U+00F6 ISOlat1 
210         {"247", "divide", "\\$\\\\div\\$"}, // division sign, U+00F7 ISOnum  
211         {"248", "oslash", "\\\\o\\{\\}"}, // latin small letter o with stroke, 
212         //                                 = latin small letter o slash,
213         //                                 U+00F8 ISOlat1 
214         {"249", "ugrave", "\\\\`\\{u\\}"}, // latin small letter u with grave,
215         //                                 U+00F9 ISOlat1 
216         {"250", "uacute", "\\\\'\\{u\\}"}, // latin small letter u with acute, 
217         //                                 U+00FA ISOlat1 
218         {"251", "ucirc", "\\\\\\^\\{u\\}"}, // latin small letter u with circumflex, 
219         //                                 U+00FB ISOlat1 
220         {"252", "uuml", "\\\\\"\\{u\\}"}, // latin small letter u with diaeresis, 
221         //                                 U+00FC ISOlat1 
222         {"253", "yacute", "\\\\'\\{y\\}"}, // latin small letter y with acute, 
223         //                                 U+00FD ISOlat1 
224         {"254", "thorn", "\\{\\\\th\\}"}, // latin small letter thorn, 
225         //                                 U+00FE ISOlat1 
226         {"255", "yuml", "\\\\\"\\{y\\}"}, // latin small letter y with diaeresis, 
227         //                                 U+00FF ISOlat1 
228         {"402", "fnof", "\\$f\\$"}, // latin small f with hook = function 
229         //                                   = florin, U+0192 ISOtech 
230
231         /* Greek */
232         {"913", "Alpha", "\\{\\$\\\\Alpha\\$\\}"}, // greek capital letter alpha, U+0391  
233         {"914", "Beta", "\\{\\$\\\\Beta\\$\\}"}, // greek capital letter beta, U+0392  
234         {"915", "Gamma", "\\{\\$\\\\Gamma\\$\\}"}, // greek capital letter gamma, 
235         //                                   U+0393 ISOgrk3 
236         {"916", "Delta", "\\{\\$\\\\Delta\\$\\}"}, // greek capital letter delta, 
237         //                                   U+0394 ISOgrk3 
238         {"917", "Epsilon", "\\{\\$\\\\Epsilon\\$\\}"}, // greek capital letter epsilon, U+0395  
239         {"918", "Zeta", "\\{\\$\\\\Zeta\\$\\}"}, // greek capital letter zeta, U+0396  
240         {"919", "Eta", "\\{\\$\\\\Eta\\$\\}"}, // greek capital letter eta, U+0397  
241         {"920", "Theta", "\\{\\$\\\\Theta\\$\\}"}, // greek capital letter theta, 
242         //                                   U+0398 ISOgrk3 
243         {"921", "Iota", "\\{\\$\\\\Iota\\$\\}"}, // greek capital letter iota, U+0399  
244         {"922", "Kappa", "\\{\\$\\\\Kappa\\$\\}"}, // greek capital letter kappa, U+039A  
245         {"923", "Lambda", "\\{\\$\\\\Lambda\\$\\}"}, // greek capital letter lambda, 
246         //                                   U+039B ISOgrk3 
247         {"924", "Mu", "\\{\\$\\\\Mu\\$\\}"}, // greek capital letter mu, U+039C  
248         {"925", "Nu", "\\{\\$\\\\Nu\\$\\}"}, // greek capital letter nu, U+039D  
249         {"926", "Xi", "\\{\\$\\\\Xi\\$\\}"}, // greek capital letter xi, U+039E ISOgrk3  
250         {"927", "Omicron", "\\{\\$\\\\Omicron\\$\\}"}, // greek capital letter omicron, U+039F  
251         {"928", "Pi", "\\{\\$\\\\Pi\\$\\}"}, // greek capital letter pi, U+03A0 ISOgrk3  
252         {"929", "Rho", "\\{\\$\\\\Rho\\$\\}"}, // greek capital letter rho, U+03A1  
253         /* there is no Sigmaf, and no U+03A2 character either */
254         {"931", "Sigma", "\\{\\$\\\\Sigma\\$\\}"}, // greek capital letter sigma, 
255         //                                   U+03A3 ISOgrk3 
256         {"932", "Tau", "\\{\\$\\\\Tau\\$\\}"}, // greek capital letter tau, U+03A4  
257         {"933", "Upsilon", "\\{\\$\\\\Upsilon\\$\\}"}, // greek capital letter upsilon, 
258         //                                   U+03A5 ISOgrk3 
259         {"934", "Phi", "\\{\\$\\\\Phi\\$\\}"}, // greek capital letter phi, 
260         //                                   U+03A6 ISOgrk3 
261         {"935", "Chi", "\\{\\$\\\\Chi\\$\\}"}, // greek capital letter chi, U+03A7  
262         {"936", "Psi", "\\{\\$\\\\Psi\\$\\}"}, // greek capital letter psi, 
263         //                                   U+03A8 ISOgrk3 
264         {"937", "Omega", "\\{\\$\\\\Omega\\$\\}"}, // greek capital letter omega, 
265         //                                   U+03A9 ISOgrk3 
266
267         {"945", "alpha", "\\$\\\\alpha\\$"}, // greek small letter alpha, 
268         //                                   U+03B1 ISOgrk3 
269         {"946", "beta", "\\$\\\\beta\\$"}, // greek small letter beta, U+03B2 ISOgrk3  
270         {"947", "gamma", "\\$\\\\gamma\\$"}, // greek small letter gamma, 
271         //                                   U+03B3 ISOgrk3 
272         {"948", "delta", "\\$\\\\delta\\$"}, // greek small letter delta, 
273         //                                   U+03B4 ISOgrk3 
274         {"949", "epsilon", "\\$\\\\epsilon\\$"}, // greek small letter epsilon, 
275         //                                   U+03B5 ISOgrk3 
276         {"950", "zeta", "\\$\\\\zeta\\$"}, // greek small letter zeta, U+03B6 ISOgrk3  
277         {"951", "eta", "\\$\\\\eta\\$"}, // greek small letter eta, U+03B7 ISOgrk3  
278         {"952", "theta", "\\$\\\\theta\\$"}, // greek small letter theta, 
279         //                                   U+03B8 ISOgrk3 
280         {"953", "iota", "\\$\\\\iota\\$"}, // greek small letter iota, U+03B9 ISOgrk3  
281         {"954", "kappa", "\\$\\\\kappa\\$"}, // greek small letter kappa, 
282         //                                   U+03BA ISOgrk3 
283         {"955", "lambda", "\\$\\\\lambda\\$"}, // greek small letter lambda, 
284         //                                   U+03BB ISOgrk3 
285         {"956", "mu", "\\$\\\\mu\\$"}, // greek small letter mu, U+03BC ISOgrk3  
286         {"957", "nu", "\\$\\\\nu\\$"}, // greek small letter nu, U+03BD ISOgrk3  
287         {"958", "xi", "\\$\\\\xi\\$"}, // greek small letter xi, U+03BE ISOgrk3  
288         {"959", "omicron", "\\$\\\\omicron\\$"}, // greek small letter omicron, U+03BF NEW  
289         {"960", "pi", "\\$\\\\phi\\$"}, // greek small letter pi, U+03C0 ISOgrk3  
290         {"961", "rho", "\\$\\\\rho\\$"}, // greek small letter rho, U+03C1 ISOgrk3  
291         {"962", "sigmaf", "\\$\\\\varsigma\\$"}, // greek small letter final sigma, 
292         //                                   U+03C2 ISOgrk3 
293         {"963", "sigma", "\\$\\\\sigma\\$"}, // greek small letter sigma, 
294         //                                   U+03C3 ISOgrk3 
295         {"964", "tau", "\\$\\\\tau\\$"}, // greek small letter tau, U+03C4 ISOgrk3  
296         {"965", "upsilon", "\\$\\\\upsilon\\$"}, // greek small letter upsilon, 
297         //                                   U+03C5 ISOgrk3 
298         {"966", "phi", "\\$\\\\phi\\$"}, // greek small letter phi, U+03C6 ISOgrk3  
299         {"967", "chi", "\\$\\\\chi\\$"}, // greek small letter chi, U+03C7 ISOgrk3  
300         {"968", "psi", "\\$\\\\psi\\$"}, // greek small letter psi, U+03C8 ISOgrk3  
301         {"969", "omega", "\\$\\\\omega\\$"}, // greek small letter omega, 
302         //                                   U+03C9 ISOgrk3 
303         {"977", "thetasym", "\\$\\\\vartheta\\$"}, // greek small letter theta symbol, 
304         //                                   U+03D1 NEW 
305         {"978", "upsih", "\\{\\$\\\\Upsilon\\$\\}"}, // greek upsilon with hook symbol, 
306         //                                   U+03D2 NEW 
307         {"982", "piv", "\\$\\\\varphi\\$"}, // greek pi symbol, U+03D6 ISOgrk3  
308
309         /* General Punctuation */
310         {"8226", "bull", "\\$\\\\bullet\\$"}, // bullet = black small circle, 
311         //                                    U+2022 ISOpub  
312         /* bullet is NOT the same as bullet operator, U+2219 */
313         {"8230", "hellip", "\\{\\\\ldots\\}"}, // horizontal ellipsis = three dot leader, 
314         //                                    U+2026 ISOpub  
315         {"8242", "prime", "\\$\\\\prime\\$"}, // prime = minutes = feet, U+2032 ISOtech  
316         {"8243", "Prime", "\\$\\{''\\}\\$"}, // double prime = seconds = inches, 
317         //                                    U+2033 ISOtech 
318         {"8254", "oline", "\\\\=\\{\\}"}, // overline = spacing overscore, 
319         //                                    U+203E NEW 
320         {"8260", "frasl", "/"}, // fraction slash, U+2044 NEW  
321
322         /* Letterlike Symbols */
323         {"8472", "weierp", "\\$\\\\wp\\$"}, // script capital P = power set 
324         //                                    = Weierstrass p, U+2118 ISOamso 
325         {"8465", "image", "\\{\\$\\\\Im\\$\\}"}, // blackletter capital I = imaginary part, 
326         //                                    U+2111 ISOamso 
327         {"8476", "real", "\\{\\$\\\\Re\\$\\}"}, // blackletter capital R = real part symbol, 
328         //                                    U+211C ISOamso 
329         {"8482", "trade", "\\\\texttrademark"}, // trade mark sign, U+2122 ISOnum
330         {"8501", "alefsym", "\\$\\\\aleph\\$"}, // alef symbol = first transfinite cardinal, 
331         //                                    U+2135 NEW 
332         /*    alef symbol is NOT the same as hebrew letter alef,
333          U+05D0 although the same glyph could be used to depict both characters */
334         /* Arrows */
335         {"8592", "larr", "\\$\\\\leftarrow\\$"}, // leftwards arrow, U+2190 ISOnum
336         {"8593", "uarr", "\\$\\\\uparrow\\$"}, // upwards arrow, U+2191 ISOnum
337         {"8594", "rarr", "\\$\\\\rightarrow\\$"}, // rightwards arrow, U+2192 ISOnum
338         {"8595", "darr", "\\$\\\\downarrow\\$"}, // downwards arrow, U+2193 ISOnum
339         {"8596", "harr", "\\$\\\\leftrightarrow\\$"}, // left right arrow, U+2194 ISOamsa  
340         {"8629", "crarr", ""}, // downwards arrow with corner leftwards 
341         //                                    = carriage return, U+21B5 NEW 
342         {"8656", "lArr", "\\{\\$\\\\Leftarrow\\$\\}"}, // leftwards double arrow, U+21D0 ISOtech
343         /*  ISO 10646 does not say that lArr is the same as the 'is implied by' arrow
344          but also does not have any other character for that function, so lArr can
345          be used for 'is implied by', as ISOtech suggests */
346         {"8657", "uArr", "\\{\\$\\\\Uparrow\\$\\}"}, // upwards double arrow, U+21D1 ISOamsa
347         {"8658", "rArr", "\\{\\$\\\\Rightarrow\\$\\}"}, // rightwards double arrow,
348         //                                     U+21D2 ISOtech 
349         /*   ISO 10646 does not say this is the 'implies' character but does not have
350          another character with this function, so
351          rArr can be used for 'implies', as ISOtech suggests */
352         {"8659", "dArr", "\\{\\$\\\\Downarrow\\$\\}"}, // downwards double arrow, U+21D3 ISOamsa  
353         {"8660", "hArr", "\\{\\$\\\\Leftrightarrow\\$\\}"}, // left right double arrow, 
354         //                                     U+21D4 ISOamsa 
355
356         /* Mathematical Operators */
357         {"8704", "forall", "\\$\\\\forall\\$"}, // for all, U+2200 ISOtech  
358         {"8706", "part", "\\$\\\\partial\\$"}, // partial differential, U+2202 ISOtech
359         {"8707", "exist", "\\$\\\\exists\\$"}, // there exists, U+2203 ISOtech
360         {"8709", "empty", "\\$\\\\emptyset\\$"}, // empty set = null set = diameter,
361         //                                    U+2205 ISOamso 
362         {"8711", "nabla", "\\$\\\\nabla\\$"}, // nabla = backward difference, 
363         //                                    U+2207 ISOtech 
364         {"8712", "isin", "\\$\\\\in\\$"}, // element of, U+2208 ISOtech
365         {"8713", "notin", "\\$\\\\notin\\$"}, // not an element of, U+2209 ISOtech
366         {"8715", "ni", "\\$\\\\ni\\$"}, // contains as member, U+220B ISOtech
367         /* should there be a more memorable name than 'ni'? */
368         {"8719", "prod", "\\$\\\\prod\\$"}, // n-ary product = product sign,
369         //                                    U+220F ISOamsb 
370         /*    prod is NOT the same character as U+03A0 'greek capital letter pi' though
371          the same glyph might be used for both  */
372         {"8721", "sum", "\\$\\\\sum\\$"}, // n-ary sumation, U+2211 ISOamsb  
373         /*    sum is NOT the same character as U+03A3 'greek capital letter sigma'
374          though the same glyph might be used for both */
375         {"8722", "minus", "\\$-\\$"}, // minus sign, U+2212 ISOtech  
376         {"8727", "lowast", "\\$\\\\ast\\$"}, // asterisk operator, U+2217 ISOtech  
377         {"8730", "radic", "\\$\\\\sqrt{}\\$"}, // square root = radical sign, 
378         //                                    U+221A ISOtech 
379         {"8733", "prop", "\\$\\\\propto\\$"}, // proportional to, U+221D ISOtech  
380         {"8734", "infin", "\\$\\\\infty\\$"}, // infinity, U+221E ISOtech  
381         {"8736", "ang", "\\$\\\\angle\\$"}, // angle, U+2220 ISOamso
382         {"8743", "and", "\\$\\\\land\\$"}, // logical and = wedge, U+2227 ISOtech
383         {"8744", "or", "\\$\\\\lor\\$"}, // logical or = vee, U+2228 ISOtech
384         {"8745", "cap", "\\$\\\\cap\\$"}, // intersection = cap, U+2229 ISOtech
385         {"8746", "cup", "\\$\\\\cup\\$"}, // union = cup, U+222A ISOtech
386         {"8747", "int", "\\$\\\\int\\$"}, // integral, U+222B ISOtech
387         {"8756", "there4", "\\$\\\\uptherefore\\$"}, // therefore, U+2234 ISOtech; only in LaTeX package MnSymbol
388         {"8764", "sim", "\\$\\\\sim\\$"}, // tilde operator = varies with = similar to,
389         //                                    U+223C ISOtech 
390         /*  tilde operator is NOT the same character as the tilde, U+007E,
391          although the same glyph might be used to represent both   */
392         {"8773", "cong", "\\$\\\\cong\\$"}, // approximately equal to, U+2245 ISOtech  
393         {"8776", "asymp", "\\$\\\\approx\\$"}, // almost equal to = asymptotic to, 
394         //                                    U+2248 ISOamsr 
395         {"8800", "ne", "\\$\\\\neq\\$"}, // not equal to, U+2260 ISOtech  
396         {"8801", "equiv", "\\$\\\\equiv\\$"}, // identical to, U+2261 ISOtech  
397         {"8804", "le", "\\$\\\\leq\\$"}, // less-than or equal to, U+2264 ISOtech  
398         {"8805", "ge", "\\$\\\\geq\\$"}, // greater-than or equal to, 
399         //                                    U+2265 ISOtech 
400         {"8834", "sub", "\\$\\\\subset\\$"}, // subset of, U+2282 ISOtech  
401         {"8835", "sup", "\\$\\\\supset\\$"}, // superset of, U+2283 ISOtech  
402         /*    note that nsup, 'not a superset of, U+2285' is not covered by the Symbol 
403          font encoding and is not included. Should it be, for symmetry?
404          It is in ISOamsn   */
405         {"8836", "nsub", "\\$\\\\nsubset\\$"}, // not a subset of, U+2284 ISOamsn  
406         {"8838", "sube", "\\$\\\\subseteq\\$"}, // subset of or equal to, U+2286 ISOtech  
407         {"8839", "supe", "\\$\\\\supseteq\\$"}, // superset of or equal to, 
408         //                                    U+2287 ISOtech 
409         {"8853", "oplus", "\\$\\\\oplus\\$"}, // circled plus = direct sum, 
410         //                                    U+2295 ISOamsb 
411         {"8855", "otimes", "\\$\\\\otimes\\$"}, // circled times = vector product,
412         //                                    U+2297 ISOamsb 
413         {"8869", "perp", "\\$\\\\perp\\$"}, // up tack = orthogonal to = perpendicular, 
414         //                                    U+22A5 ISOtech 
415         {"8901", "sdot", "\\$\\\\cdot\\$"}, // dot operator, U+22C5 ISOamsb  
416         /* dot operator is NOT the same character as U+00B7 middle dot */
417         /* Miscellaneous Technical */
418         {"8968", "lceil", "\\$\\\\lceil\\$"}, // left ceiling = apl upstile, 
419         //                                    U+2308 ISOamsc  
420         {"8969", "rceil", "\\$\\\\rceil\\$"}, // right ceiling, U+2309 ISOamsc   
421         {"8970", "lfloor", "\\$\\\\lfloor\\$"}, // left floor = apl downstile, 
422         //                                    U+230A ISOamsc  
423         {"8971", "rfloor", "\\$\\\\rfloor\\$"}, // right floor, U+230B ISOamsc   
424         {"9001", "lang", "\\$\\\\langle\\$"}, // left-pointing angle bracket = bra, 
425         //                                    U+2329 ISOtech 
426         /*    lang is NOT the same character as U+003C 'less than' 
427          or U+2039 'single left-pointing angle quotation mark' */
428         {"9002", "rang", "\\$\\\\rangle\\$"}, // right-pointing angle bracket = ket, 
429         //                                    U+232A ISOtech 
430         /*    rang is NOT the same character as U+003E 'greater than' 
431          or U+203A 'single right-pointing angle quotation mark' */
432         /* Geometric Shapes */
433         {"9674", "loz", "\\$\\\\lozenge\\$"}, // lozenge, U+25CA ISOpub  
434
435         /* Miscellaneous Symbols */
436         {"9824", "spades", "\\$\\\\spadesuit\\$"}, // black spade suit, U+2660 ISOpub  
437         /* black here seems to mean filled as opposed to hollow */
438         {"9827", "clubs", "\\$\\\\clubsuit\\$"}, // black club suit = shamrock, 
439         //                                    U+2663 ISOpub 
440         {"9829", "hearts", "\\$\\\\heartsuit\\$"}, // black heart suit = valentine, 
441         //                                    U+2665 ISOpub 
442         {"9830", "diams", "\\$\\\\diamondsuit\\$"}, // black diamond suit, U+2666 ISOpub  
443         {"34", "quot", "\""}, // quotation mark = APL quote,
444         //                                   U+0022 ISOnum 
445         {"38", "amp", "\\\\&"}, // ampersand, U+0026 ISOnum 
446         {"60", "lt", "\\$<\\$"}, // less-than sign, U+003C ISOnum 
447         {"62", "gt", "\\$>\\$"}, // greater-than sign, U+003E ISOnum 
448
449         /* Latin Extended-A */
450         {"338", "OElig", "\\{\\\\OE\\}"}, // latin capital ligature OE,
451         //                                   U+0152 ISOlat2 
452         {"339", "oelig", "\\{\\\\oe\\}"}, // latin small ligature oe, U+0153 ISOlat2 
453         /* ligature is a misnomer, this is a separate character in some languages */
454         {"352", "Scaron", "\\\\v\\{S\\}"}, // latin capital letter S with caron,
455         //                                   U+0160 ISOlat2 
456         {"353", "scaron", "\\\\v\\{s\\}"}, // latin small letter s with caron,
457         //                                   U+0161 ISOlat2 
458         {"376", "Yuml", "\\\\\"\\{Y\\}"}, // latin capital letter Y with diaeresis,
459         //                                   U+0178 ISOlat2 
460
461         /* Spacing Modifier Letters */
462         {"710", "circ", "\\\\textasciicircum"}, // modifier letter circumflex accent,
463         //                                   U+02C6 ISOpub 
464         {"732", "tilde", "\\\\textasciitilde"}, // small tilde, U+02DC ISOdia 
465
466         /* General Punctuation */
467         {"8194", "ensp", "\\\\hspace\\{0.5em\\}"}, // en space, U+2002 ISOpub  
468         {"8195", "emsp", "\\\\hspace\\{1em\\}"}, // em space, U+2003 ISOpub  
469         {"8201", "thinsp", "\\\\hspace\\{0.167em\\}"}, // thin space, U+2009 ISOpub  
470         {"8204", "zwnj", ""}, // zero width non-joiner, 
471         //                                   U+200C NEW RFC 2070 
472         {"8205", "zwj", ""}, // zero width joiner, U+200D NEW RFC 2070  
473         {"8206", "lrm", ""}, // left-to-right mark, U+200E NEW RFC 2070  
474         {"8207", "rlm", ""}, // right-to-left mark, U+200F NEW RFC 2070  
475         {"8211", "ndash", "--"}, // en dash, U+2013 ISOpub  
476         {"8212", "mdash", "---"}, // em dash, U+2014 ISOpub  
477         {"8216", "lsquo", "\\\\textquoteleft"}, // left single quotation mark, 
478         //                                   U+2018 ISOnum 
479         {"8217", "rsquo", "\\\\textquoteright"}, // right single quotation mark, 
480         //                                   U+2019 ISOnum 
481         {"8218", "sbquo", "\\\\quotesinglbase"}, // single low-9 quotation mark, U+201A NEW  
482         {"8220", "ldquo", "\\\\textquotedblleft"}, // left double quotation mark, 
483         //                                   U+201C ISOnum 
484         {"8221", "rdquo", "\\\\textquotedblright"}, // right double quotation mark, 
485         //                                   U+201D ISOnum 
486         {"8222", "bdquo", "\\\\quotedblbase"}, // double low-9 quotation mark, U+201E NEW  
487         {"8224", "dagger", "\\\\dag"}, // dagger, U+2020 ISOpub  
488         {"8225", "Dagger", "\\\\ddag"}, // double dagger, U+2021 ISOpub  
489         {"8240", "permil", "\\\\textperthousand"}, // per mille sign, U+2030 ISOtech  
490         {"8249", "lsaquo", "\\\\guilsinglleft"}, // single left-pointing angle quotation mark, 
491         //                                   U+2039 ISO proposed 
492         /* lsaquo is proposed but not yet ISO standardized */
493         {"8250", "rsaquo", "\\\\guilsinglright"}, // single right-pointing angle quotation mark, 
494         //                                   U+203A ISO proposed 
495         /* rsaquo is proposed but not yet ISO standardized */
496         {"8364", "euro", "\\\\texteuro"}, // euro sign, U+20AC NEW 
497             
498         /* Manually added */
499         {"37", "percnt", "\\\\%"}, // Percent
500         {"39", "", "'"}, // Apostrophe
501         {"40", "", "("}, // Left bracket
502         {"41", "", ")"}, // Right bracket
503         {"43", "plus", "\\+"}, // Plus
504         {"95", "lowbar", "\\\\_"}, // Underscore
505         {"123", "lbrace", "\\\\\\{"}, // Left curly bracket
506         {"125", "rbrace", "\\\\\\}"}, // Right curly bracket
507      // {"141", "", ""}, // Reverse line feed
508         {"146", "", "'"}, // Private use two ???
509         {"264", "Ccirc", "\\\\\\^\\{C\\}"}, // capital C with circumflex
510         {"305", "inodot", "\\{\\\\i\\}"},    // Small i without the dot
511         {"321", "Lstrok", "\\{\\\\L\\}"},    // upper case l with stroke
512         {"322", "lstrok", "\\{\\\\l\\}"},    // lower case l with stroke
513         {"536", "", "\\\\cb\\{S\\}"},    // capital letter S with comma below, require combelow
514         {"537", "", "\\\\cb\\{s\\}"},    // small letter S with comma below, require combelow
515         {"769", "", "'"},    // Can be solved better as it is a combining accent
516         {"774", "", "\\\\u\\{\\}"},    // FIX: Breve - Can be solved better as it is a combining accent
517         {"775", "", "\\\\\\.\\{\\}"},    // FIX: Dot above - Can be solved better as it is a combining accent
518         {"776", "", "\\\\\"\\{\\}"},    // FIX: Diaeresis - Can be solved better as it is a combining accent
519         {"780", "", "\\\\v\\{\\}"},    // FIX: Caron - Can be solved better as it is a combining accent
520         {"807", "", "\\\\c\\{\\}"},    // FIX: Cedilla - Can be solved better as it is a combining accent
521         {"949", "epsi", "\\$\\\\epsilon\\$"},    // Epsilon - double check
522         {"1013", "epsiv", "\\$\\\\varepsilonup\\$"},    // lunate epsilon, requires txfonts
523      // {"2013", "", ""},    // NKO letter FA
524         {"8208", "hyphen", "-"},    // Hyphen
525         {"8459", "Hscr", "\\$\\\\mathcal\\{H\\}\\$"}, // script capital H -- possibly use \mathscr
526         {"8460", "", "\\$\\\\mathbb\\{H\\}\\$"}, // black letter capital H -- requires e.g. amsfonts
527         {"8466", "Lscr", "\\$\\\\mathcal\\{L\\}\\$"}, // script capital L -- possibly use \mathscr
528         {"8467", "lscr", "\\{\\\\ell\\}"}, // script small l 
529         {"8491", "angst", "\\{\\\\AA\\}"}, // Angstrom 
530         {"8729", "bullet", "\\$\\\\bullet\\$"},    // Bullet operator
531         {"8776", "ap", "\\$\\\\approx\\$"}, // almost equal to = asymptotic to, 
532         {"8810", "ll", "\\$\\\\ll\\$"}, // Much less than 
533         {"8811", "gg", "\\$\\\\gg\\$"}, // Much greater than 
534         {"9426", "", "\\\\copyright"}, // circled small letter C
535         {"9653", "utri", "\\$\\\\triangle\\$"}, // White up-pointing small triangle -- \vartriangle probably
536                                                 // better but requires amssymb
537         {"10877", "les", "\\$\\\\leqslant\\$"},    // Less than slanted equal -- requires amssymb 
538         {"10878", "ges", "\\$\\\\geqslant\\$"},    // Less than slanted equal -- requires amssymb 
539         {"119978", "Oscr", "\\$\\\\mathcal\\{O\\}\\$"} // script capital O -- possibly use \mathscr
540         
541     };
542
543         private HashMap<String, String> escapedSymbols = new HashMap<String, String>();
544         private HashMap<Integer, String> numSymbols = new HashMap<Integer, String>();
545         
546         
547         
548         public HTMLConverter() {
549                 super();
550                 for (int i=0;i<conversionList.length;i++) {
551                     if (conversionList[i][2].length() >= 1) {
552                         if (conversionList[i][1].length() >= 1) {
553                             escapedSymbols.put("&" + conversionList[i][1] + ";" , conversionList[i][2]);
554                         }
555                         if (conversionList[i][0].length() >= 1) {
556                             numSymbols.put(Integer.decode(conversionList[i][0]) , conversionList[i][2]);
557                         }
558                     }
559                 }
560         }
561         
562     public String format(String text) {
563         if (text == null)
564             return null;
565         StringBuffer sb = new StringBuffer();
566         // Deal with the form <sup>k</sup>and <sub>k</sub>
567         // If the result is in text or equation form can be controlled
568         // From the "Advanced settings" tab
569         if(Globals.prefs.getBoolean("useConvertToEquation")) {
570             text = text.replaceAll("<sup>([^<]+)</sup>", "\\$\\^\\{$1\\}\\$");
571             text = text.replaceAll("<sub>([^<]+)</sub>", "\\$_\\{$1\\}\\$");
572         } else {
573             text = text.replaceAll("<sup>([^<]+)</sup>", "\\\\textsuperscript\\{$1\\}");
574             text = text.replaceAll("<sub>([^<]+)</sub>", "\\\\textsubscript\\{$1\\}");
575         }
576         
577         // TODO: maybe rewrite this based on regular expressions instead
578         // Note that (at least) the IEEE Xplore fetcher must be fixed as it relies on the current way to 
579         // remove tags for its image alt-tag to equation converter
580         for (int i=0; i<text.length(); i++) {
581
582             int c = text.charAt(i);
583
584             if (c == '<') {
585                 i = readTag(text, sb, i);
586             } else
587                 sb.append((char)c);
588
589         }
590         text = sb.toString();
591         
592         // Handle text based HTML entities
593         Set<String> patterns = escapedSymbols.keySet();
594         for (String pattern: patterns) {
595                 text = text.replaceAll(pattern, escapedSymbols.get(pattern));
596         }
597         
598         // Handle numerical HTML entities
599         Pattern escapedPattern = Pattern.compile("&#([x]*)([0]*)(\\p{XDigit}+);");
600         Matcher m = escapedPattern.matcher(text);
601         while (m.find()) {
602             //      System.err.println("Found pattern: " + m.group(1));
603             //      System.err.println("Found pattern: " + m.group(2));
604             int num = Integer.decode(m.group(1).replace("x", "#") + m.group(3));
605             if(numSymbols.containsKey(num)) {
606                 text = text.replaceAll("&#" + m.group(1) + m.group(2) + m.group(3) + ";", numSymbols.get(num));
607             } else {
608                 System.err.println("HTML escaped char not converted: " + m.group(1) + m.group(2) + m.group(3) + " = " + Integer.toString(num));
609             }
610         }
611         
612         // Find non-covered special characters with alphabetic codes
613         escapedPattern = Pattern.compile("&(\\w+);");
614         m = escapedPattern.matcher(text);
615         while (m.find()) {
616             System.err.println("HTML escaped char not converted: " + m.group(1));
617         }
618
619         return text.trim();
620     }
621
622     private final int MAX_TAG_LENGTH = 30;
623     /*private final int MAX_CHAR_LENGTH = 10;
624
625     private int readHtmlChar(String text, StringBuffer sb, int position) {
626         // Have just read the < character that starts the tag.
627         int index = text.indexOf(';', position);
628         if ((index > position) && (index-position < MAX_CHAR_LENGTH)) {
629                 //String code = text.substring(position, index);
630             //System.out.println("Removed code: "+text.substring(position, index));
631             return index; // Just skip the tag.
632         } else return position; // Don't do anything.
633     }*/
634
635     private int readTag(String text, StringBuffer sb, int position) {
636         // Have just read the < character that starts the tag.
637         int index = text.indexOf('>', position);
638         if ((index > position) && (index-position < MAX_TAG_LENGTH)) {
639             //System.out.println("Removed tag: "+text.substring(position, index));
640             return index; // Just skip the tag.
641         } else return position; // Don't do anything.
642     }
643 }