cd5328d6fa205cb48c0bf1f12a0f8fed75444827
[debian/jabref.git] / src / java / net / sf / jabref / imports / IEEEXploreFetcher.java
1 /*  Copyright (C) 2003-2011 JabRef contributors.
2     This program is free software; you can redistribute it and/or modify
3     it under the terms of the GNU General Public License as published by
4     the Free Software Foundation; either version 2 of the License, or
5     (at your option) any later version.
6
7     This program is distributed in the hope that it will be useful,
8     but WITHOUT ANY WARRANTY; without even the implied warranty of
9     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10     GNU General Public License for more details.
11
12     You should have received a copy of the GNU General Public License along
13     with this program; if not, write to the Free Software Foundation, Inc.,
14     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
15 */
16 package net.sf.jabref.imports;
17
18 import java.awt.BorderLayout;
19
20 import java.io.BufferedReader;
21 import java.io.BufferedInputStream;
22 import java.io.File;
23 import java.io.FileInputStream;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.InputStreamReader;
27 import java.io.PrintWriter;
28
29 import java.net.ConnectException;
30 import java.net.CookieHandler;
31 import java.net.CookieManager;
32 import java.net.MalformedURLException;
33 import java.net.URL;
34 import java.net.URLConnection;
35
36 import java.util.ArrayList;
37 import java.util.Collection;
38 import java.util.HashMap;
39 import java.util.Iterator;
40 import java.util.List;
41 import java.util.Set;
42 import java.util.regex.Matcher;
43 import java.util.regex.Pattern;
44
45 import javax.swing.ButtonGroup;
46 import javax.swing.JCheckBox;
47 import javax.swing.JOptionPane;
48 import javax.swing.JPanel;
49 import javax.swing.JRadioButton;
50
51 import net.sf.jabref.BibtexDatabase;
52 import net.sf.jabref.BibtexEntry;
53 import net.sf.jabref.BibtexEntryType;
54 import net.sf.jabref.GUIGlobals;
55 import net.sf.jabref.Globals;
56 import net.sf.jabref.OutputPrinter;
57 import net.sf.jabref.Util;
58
59 public class IEEEXploreFetcher implements EntryFetcher {
60
61     final CaseKeeperList caseKeeperList = new CaseKeeperList();
62     final CaseKeeper caseKeeper = new CaseKeeper();
63     final UnitFormatter unitFormatter = new UnitFormatter();
64     
65     ImportInspector dialog = null;
66         OutputPrinter status;
67     final HTMLConverter htmlConverter = new HTMLConverter();
68     
69     private JCheckBox absCheckBox = new JCheckBox(Globals.lang("Include abstracts"), false);
70     private JRadioButton htmlButton = new JRadioButton(Globals.lang("HTML parser"));
71     private JRadioButton bibButton = new JRadioButton(Globals.lang("BibTeX importer"));
72     
73     private CookieManager cm = new CookieManager();
74     
75     private static final int MAX_FETCH = 100;
76     private int perPage = MAX_FETCH, hits = 0, unparseable = 0, parsed = 0;
77     private int piv = 0;
78     private boolean shouldContinue = false;
79     private boolean includeAbstract = false;
80     private boolean importBibtex = false;
81     
82     private String terms;
83     private final String startUrl = "http://ieeexplore.ieee.org/search/freesearchresult.jsp?queryText=";
84     private final String endUrl = "&rowsPerPage=" + Integer.toString(perPage) + "&pageNumber=";
85     private String searchUrl;
86     private final String importUrl = "http://ieeexplore.ieee.org/xpls/downloadCitations";
87     
88     private final Pattern hitsPattern = Pattern.compile("([0-9,]+) Results");
89     private final Pattern idPattern = Pattern.compile("<input name=\'\' title=\'.*\' type=\'checkbox\'" + 
90                                                       "value=\'\'\\s*id=\'([0-9]+)\'/>");
91     private final Pattern typePattern = Pattern.compile("<span class=\"type\">\\s*(.+)");
92     private HashMap<String, String> fieldPatterns = new HashMap<String, String>();
93     private final Pattern absPattern = Pattern.compile("<p>\\s*(.+)");
94     
95     Pattern stdEntryPattern = Pattern.compile(".*<strong>(.+)</strong><br>"
96                         + "\\s+(.+)");
97     
98     Pattern publicationPattern = Pattern.compile("(.*), \\d*\\.*\\s?(.*)");
99     Pattern proceedingPattern = Pattern.compile("(.*?)\\.?\\s?Proceedings\\s?(.*)");
100     Pattern abstractLinkPattern = Pattern.compile(
101            "<a href=\'(.+)\'>\\s*<span class=\"more\">View full.*</span> </a>");
102     String abrvPattern = ".*[^,] '?\\d+\\)?";
103
104     Pattern ieeeArticleNumberPattern = Pattern.compile("<a href=\".*arnumber=(\\d+).*\">");
105     
106     // Common words in IEEE Xplore that should always be 
107     
108     public IEEEXploreFetcher() {
109         super();
110         CookieHandler.setDefault(cm);
111         
112         fieldPatterns.put("title", "<a\\s*href=[^<]+>\\s*(.+)\\s*</a>");
113         //fieldPatterns.put("author", "</h3>\\s*(.+)");
114         fieldPatterns.put("author", "(?s)</h3>\\s*(.+)</br>");
115         fieldPatterns.put("volume", "Volume:\\s*([A-Za-z-]*\\d+)");
116         fieldPatterns.put("number", "Issue:\\s*(\\d+)");
117         //fieldPatterns.put("part", "Part (\\d+),&nbsp;(.+)");
118         fieldPatterns.put("year", "(?:Copyright|Publication) Year:\\s*(\\d{4})");
119         fieldPatterns.put("pages", "Page\\(s\\):\\s*(\\d+)\\s*-\\s*(\\d*)");
120         //fieldPatterns.put("doi", "Digital Object Identifier:\\s*<a href=.*>(.+)</a>");
121         fieldPatterns.put("doi", "<a href=\"http://dx.doi.org/(.+)\" target");
122         fieldPatterns.put("url", "<a href=\"(/stamp/stamp[^\"]+)");       
123     }
124     public JPanel getOptionsPanel() {
125         JPanel pan = new JPanel();
126         pan.setLayout(new BorderLayout());
127         htmlButton.setSelected(true);
128         htmlButton.setEnabled(false);
129         bibButton.setEnabled(false);
130         
131         ButtonGroup group = new ButtonGroup();
132         group.add(htmlButton);
133         group.add(bibButton);
134         pan.add(absCheckBox, BorderLayout.NORTH);
135         pan.add(htmlButton, BorderLayout.CENTER);
136         pan.add(bibButton, BorderLayout.EAST);
137                 
138         return pan;
139     }
140
141     public boolean processQuery(String query, ImportInspector dialog, OutputPrinter status) {
142         this.dialog = dialog;
143         this.status = status;
144         terms = query;
145         piv = 0;
146         shouldContinue = true;
147         parsed = 0;
148         unparseable = 0;
149         int pageNumber = 1;
150         
151         searchUrl = makeUrl(pageNumber);//start at page 1
152         
153         try {
154                 URL url = new URL(searchUrl);
155                 String page = getResults(url);
156             
157             if (page.indexOf("You have entered an invalid search") >= 0) {
158                 status.showMessage(Globals.lang("You have entered an invalid search '%0'.",
159                         terms),
160                         Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE);
161                 return false;
162             }
163             
164             if (page.indexOf("Bad request") >= 0) {
165                 status.showMessage(Globals.lang("Bad Request '%0'.",
166                         terms),
167                         Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE);
168                 return false;
169             }
170             
171             if (page.indexOf("No results were found.") >= 0) {
172                 status.showMessage(Globals.lang("No entries found for the search string '%0'",
173                         terms),
174                         Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE);
175                 return false;
176             }
177                         
178             if (page.indexOf("Error Page") >= 0) {
179                 status.showMessage(Globals.lang("Intermittent errors on the IEEE Xplore server. Please try again in a while."),
180                         Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE);
181                 return false;
182             }
183             
184             hits = getNumberOfHits(page, "display-status", hitsPattern);
185
186
187             includeAbstract = absCheckBox.isSelected();
188             importBibtex = bibButton.isSelected();
189             
190             if (hits > MAX_FETCH) {
191                 status.showMessage(Globals.lang("%0 entries found. To reduce server load, "
192                        +"only %1 will be downloaded.",
193                                 new String[] {String.valueOf(hits), String.valueOf(MAX_FETCH)}),
194                         Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE);
195                         hits = MAX_FETCH;
196             }
197
198             parse(dialog, page, 0, 1);
199             int firstEntry = perPage;
200             while (shouldContinue && firstEntry < hits) {
201                 pageNumber++;
202                 searchUrl = makeUrl(pageNumber);
203                 page = getResults(new URL(searchUrl));
204
205                 if (!shouldContinue)
206                     break;
207
208                 parse(dialog, page, 0, firstEntry + 1);
209                 firstEntry += perPage;
210
211             }
212             return true;
213         } catch (MalformedURLException e) {
214             e.printStackTrace();
215         } catch (ConnectException e) {
216             status.showMessage(Globals.lang("Connection to IEEEXplore failed"),
217                     Globals.lang("Search IEEEXplore"), JOptionPane.ERROR_MESSAGE);
218         } catch (IOException e) {
219                 status.showMessage(Globals.lang(e.getMessage()),
220                     Globals.lang("Search IEEEXplore"), JOptionPane.ERROR_MESSAGE);
221             e.printStackTrace();
222         }
223         return false;
224     }
225
226     public String getTitle() {
227         return "IEEEXplore";
228     }
229
230     public URL getIcon() {
231         return GUIGlobals.getIconUrl("www");
232     }
233
234     public String getHelpPage() {
235         return "IEEEXploreHelp.html";
236     }
237
238     public String getKeyName() {
239         return "IEEEXplore";
240     }
241
242     /**
243      * This method is called by the dialog when the user has cancelled the import.
244      */
245     public void stopFetching() {
246         shouldContinue = false;
247     }
248
249     private String makeUrl(int startIndex) {
250         StringBuffer sb = new StringBuffer(startUrl);
251         sb.append(terms.replaceAll(" ", "+"));
252         sb.append(endUrl);
253         sb.append(String.valueOf(startIndex));
254         return sb.toString();
255     }
256
257     
258
259     private void parse(ImportInspector dialog, String text, int startIndex, int firstEntryNumber) {
260         piv = startIndex;
261         int entryNumber = firstEntryNumber;
262         
263         if (importBibtex) {
264                         //TODO: Login
265                 ArrayList<String> idSelected = new ArrayList<String>();
266                 String id;
267                         while ((id = parseNextEntryId(text, piv)) != null && shouldContinue) {
268                         idSelected.add(id);
269                         entryNumber++;
270                 }
271                         try {
272                                 BibtexDatabase dbase = parseBibtexDatabase(idSelected, includeAbstract);
273                                 Collection<BibtexEntry> items = dbase.getEntries();
274                                 Iterator<BibtexEntry> iter = items.iterator();
275                                 while (iter.hasNext()) {
276                                         BibtexEntry entry = iter.next();
277                                         dialog.addEntry(cleanup(entry));
278                         dialog.setProgress(parsed + unparseable, hits);
279                         parsed++;
280                                 }
281                         } catch (IOException e) {
282                                 e.printStackTrace();
283                         }
284                         //for
285         } else {
286                 BibtexEntry entry;
287                 while (((entry = parseNextEntry(text, piv)) != null) && shouldContinue) {
288                     if (entry.getField("title") != null) {
289                         dialog.addEntry(entry);
290                         dialog.setProgress(parsed + unparseable, hits);
291                         parsed++;
292                     }
293                     entryNumber++;
294                 }
295         }
296     }
297
298     private BibtexDatabase parseBibtexDatabase(List<String> id, boolean abs) throws IOException {
299         if (id.isEmpty())
300                 return null;
301         URL url;
302         URLConnection conn;
303         try {
304             url = new URL(importUrl);
305             conn = url.openConnection();
306         } catch (MalformedURLException e) {
307             e.printStackTrace();
308             return null;
309         }
310         conn.setDoInput(true);
311         conn.setDoOutput(true);
312         conn.setRequestProperty("Content-Type",
313                 "application/x-www-form-urlencoded");
314         conn.setRequestProperty("Referer", searchUrl);
315         PrintWriter out = new PrintWriter(
316                 conn.getOutputStream());
317
318                 String recordIds = "";
319                 Iterator<String> iter = id.iterator();
320                 while (iter.hasNext()) { 
321                 recordIds += iter.next() + " ";
322                 }
323                 recordIds = recordIds.trim();
324                 String citation = abs ? "citation-abstract" : "citation-only";
325                 
326                 String content = "recordIds=" + recordIds.replaceAll(" ", "%20") + "&fromPageName=&citations-format=" + citation + "&download-format=download-bibtex";
327                 System.out.println(content);
328         out.write(content);
329         out.flush();
330         out.close();
331
332         BufferedReader bufr = new BufferedReader(new InputStreamReader(conn.getInputStream()));
333         StringBuffer sb = new StringBuffer();
334         char[] buffer = new char[256];
335         while(true) {
336             int bytesRead = bufr.read(buffer);
337             if(bytesRead == -1) break;
338             for (int i=0; i<bytesRead; i++)
339                 sb.append((char)buffer[i]);
340         }
341         System.out.println(sb.toString());
342         
343         ParserResult results = new BibtexParser(bufr).parse();
344         bufr.close();
345         return results.getDatabase();
346     }
347
348     private BibtexEntry cleanup(BibtexEntry entry) {
349         if (entry == null)
350                 return null;
351         
352         // clean up title
353         String title = (String)entry.getField("title");
354         if (title != null) {
355             // USe the alt-text and replace image links
356             title = title.replaceAll("[ ]?img src=[^ ]+ alt=\"([^\"]+)\">[ ]?", "\\$$1\\$");
357             // Try to sort out most of the /spl / conversions
358             // Deal with this specific nested type first
359             title = title.replaceAll("/sub /spl infin//", "\\$_\\\\infty\\$");
360             title = title.replaceAll("/sup /spl infin//", "\\$\\^\\\\infty\\$");
361             // Replace general expressions
362             title = title.replaceAll("/[sS]pl ([^/]+)/", "\\$\\\\$1\\$");
363             // Deal with subscripts and superscripts       
364             if (Globals.prefs.getBoolean("useConvertToEquation")) {
365                 title = title.replaceAll("/sup ([^/]+)/", "\\$\\^\\{$1\\}\\$");
366                 title = title.replaceAll("/sub ([^/]+)/", "\\$_\\{$1\\}\\$");
367                 title = title.replaceAll("\\(sup\\)([^(]+)\\(/sup\\)", "\\$\\^\\{$1\\}\\$");
368                 title = title.replaceAll("\\(sub\\)([^(]+)\\(/sub\\)", "\\_\\{$1\\}\\$");
369             } else {
370                 title = title.replaceAll("/sup ([^/]+)/", "\\\\textsuperscript\\{$1\\}");
371                 title = title.replaceAll("/sub ([^/]+)/", "\\\\textsubscript\\{$1\\}");
372                 title = title.replaceAll("\\(sup\\)([^(]+)\\(/sup\\)", "\\\\textsuperscript\\{$1\\}");
373                 title = title.replaceAll("\\(sub\\)([^(]+)\\(/sub\\)", "\\\\textsubscript\\{$1\\}");
374             }
375
376             // Replace \infin with \infty
377             title = title.replaceAll("\\\\infin", "\\\\infty");
378             
379             // Unit formatting
380             if (Globals.prefs.getBoolean("useUnitFormatterOnSearch")) {
381                 title = unitFormatter.format(title);
382             }
383             
384             // Automatic case keeping
385             if (Globals.prefs.getBoolean("useCaseKeeperOnSearch")) {
386                 title = caseKeeper.format(title);
387             }
388             // Write back
389             entry.setField("title", title);
390         }
391         
392         // clean up author
393         String author = (String)entry.getField("author");
394         if (author != null) {
395             if (author.indexOf("a href=") >= 0) {  // Author parsing failed because it was empty
396                 entry.setField("author","");  // Maybe not needed anymore due to another change
397             } else {
398                 author = author.replaceAll("\\s+", " ");
399                 author = author.replaceAll("\\.", ". ");
400                 author = author.replaceAll("([^;]+),([^;]+),([^;]+)","$1,$3,$2"); // Change order in case of Jr. etc
401                 author = author.replaceAll("  ", " ");
402                 author = author.replaceAll("\\. -", ".-");
403                 author = author.replaceAll("; ", " and ");
404                 author = author.replaceAll(" ,", ",");
405                 author = author.replaceAll("  ", " ");
406                 author = author.replaceAll("[ ,;]+$", "");
407                 entry.setField("author", author);
408             }
409         }
410         // clean up month
411         String month = (String)entry.getField("month");
412         if ((month != null) && (month.length() > 0)) {
413                 month = month.replaceAll("\\.", "");
414                 month = month.toLowerCase();
415
416                 Pattern monthPattern = Pattern.compile("(\\d*+)\\s*([a-z]*+)-*(\\d*+)\\s*([a-z]*+)");
417                 Matcher mm = monthPattern.matcher(month);
418                 String date = month;
419                 if (mm.find()) {
420                         if (mm.group(3).length() == 0) {
421                                 if (mm.group(2).length() > 0) {
422                                         date = "#" + mm.group(2).substring(0, 3) + "#";
423                                         if (mm.group(1).length() > 0) {
424                                                 date += " " + mm.group(1) + ",";
425                                         }
426                                 } else {
427                                         date = mm.group(1) + ",";
428                                 }
429                         } else if (mm.group(2).length() == 0) {
430                                 if (mm.group(4).length() > 0) {
431                                         date = "#" + mm.group(4).substring(0, 3) + "# " + mm.group(1) + "--" + mm.group(3) + ",";
432                                 } else
433                                         date += ",";
434                         } else {
435                                 date = "#" + mm.group(2).substring(0, 3) + "# " + mm.group(1) + "--#" + mm.group(4).substring(0, 3) + "# " + mm.group(3) + ",";
436                         }
437                 }
438                 //date = date.trim();
439                 //if (!date.isEmpty()) {
440                 entry.setField("month", date);
441                 //}
442         }
443         
444         // clean up pages
445         String field = "pages";
446         String pages = entry.getField(field);
447         if (pages != null) {
448                 String [] pageNumbers = pages.split("-");
449                 if (pageNumbers.length == 2) {
450                         if (pageNumbers[0].equals(pageNumbers[1])) {// single page
451                                 entry.setField(field, pageNumbers[0]);
452                         } else {
453                                 entry.setField(field, pages.replaceAll("-", "--"));
454                         }
455                 }
456         }
457         
458         // clean up publication field
459         BibtexEntryType type = entry.getType();
460         String sourceField = "";
461                 if (type.getName() == "Article") {
462                 sourceField = "journal";
463                         entry.clearField("booktitle");
464                 } else if (type.getName() == "Inproceedings"){
465             sourceField = "booktitle";
466                 }
467         String fullName = entry.getField(sourceField);
468         if (fullName != null) {
469                 if (type.getName() == "Article") {
470                         int ind = fullName.indexOf(": Accepted for future publication");
471                                 if (ind > 0) {
472                                         fullName = fullName.substring(0, ind);
473                                         entry.setField("year", "to be published");
474                                         entry.clearField("month");
475                                         entry.clearField("pages");
476                                         entry.clearField("number");
477                                 }
478                         String[] parts = fullName.split("[\\[\\]]"); //[see also...], [legacy...]
479                         fullName = parts[0];
480                         if (parts.length == 3) {
481                                         fullName += parts[2];
482                                 }
483                         if(entry.getField("note") ==  "Early Access") {
484                                         entry.setField("year", "to be published");
485                                         entry.clearField("month");
486                                         entry.clearField("pages");
487                                         entry.clearField("number");
488                         }
489                 } else {
490                         fullName = fullName.replace("Conference Proceedings", "Proceedings").
491                                         replace("Proceedings of", "Proceedings").replace("Proceedings.", "Proceedings");
492                         fullName = fullName.replaceAll("International", "Int.");
493                         fullName = fullName.replaceAll("Symposium", "Symp.");
494                         fullName = fullName.replaceAll("Conference", "Conf.");
495                         fullName = fullName.replaceAll(" on", " ").replace("  ", " ");
496                 }
497                 
498                 Matcher m1 = publicationPattern.matcher(fullName);
499                         if (m1.find()) {
500                                 String prefix = m1.group(2).trim();
501                                 String postfix = m1.group(1).trim();
502                                 String abrv = "";
503                                 String[] parts = prefix.split("\\. ", 2);
504                                 if (parts.length == 2) {
505                                         if (parts[0].matches(abrvPattern)) {
506                                                 prefix = parts[1];
507                                                 abrv = parts[0];
508                                         } else {
509                                                 prefix = parts[0];
510                                                 abrv = parts[1];
511                                         }
512                                 }
513                                 if (prefix.matches(abrvPattern) == false) {
514                                         fullName = prefix + " " + postfix + " " + abrv;
515                                         fullName = fullName.trim();
516                                 } else {
517                                         fullName = postfix + " " + prefix;
518                                 }
519                         }
520                         if (type.getName() == "Article") {
521                                 fullName = fullName.replace(" - ", "-"); //IEE Proceedings-
522                                 
523                                 fullName = fullName.trim();
524                                 if (Globals.prefs.getBoolean("useIEEEAbrv")) {
525                                         String id = Globals.journalAbbrev.getAbbreviatedName(fullName, false);
526                                         if (id != null)
527                                                 fullName = id;
528                                 }
529                 }
530                         if (type.getName() == "Inproceedings") {
531                     Matcher m2 = proceedingPattern.matcher(fullName);
532                                 if (m2.find()) {
533                                         String prefix = m2.group(2); 
534                                         String postfix = m2.group(1).replaceAll("\\.$", "");
535                                         if (prefix.matches(abrvPattern) == false) {
536                                                 String abrv = "";
537                                         
538                                                 String[] parts = postfix.split("\\. ", 2);
539                                                 if (parts.length == 2) {
540                                                         if (parts[0].matches(abrvPattern)) {
541                                                                 postfix = parts[1];
542                                                                 abrv = parts[0];
543                                                         } else {
544                                                                 postfix = parts[0];
545                                                                 abrv = parts[1];
546                                                         }
547                                                 }
548                                                 fullName = prefix.trim() + " " + postfix.trim() + " " + abrv;
549                                                 
550                                         } else {
551                                                 fullName = postfix.trim() + " " + prefix.trim();
552                                         }
553                                         
554                                 }
555                                 
556                                 fullName = fullName.trim();
557                                 
558                                 fullName = fullName.replaceAll("^[tT]he ", "").replaceAll("^\\d{4} ", "").replaceAll("[,.]$", "");
559                                 String year = entry.getField("year");
560                                 fullName = fullName.replaceAll(", " + year + "\\.?", "");
561                                 
562                         if (fullName.contains("Abstract") == false && fullName.contains("Summaries") == false && fullName.contains("Conference Record") == false)
563                                 fullName = "Proc. " + fullName;
564                 }
565                         entry.setField(sourceField, fullName);
566         }
567         
568         // clean up abstract
569         String abstr = (String) entry.getField("abstract");
570         if (abstr != null) {
571             // Try to sort out most of the /spl / conversions
572             // Deal with this specific nested type first
573             abstr = abstr.replaceAll("/sub /spl infin//", "\\$_\\\\infty\\$");
574             abstr = abstr.replaceAll("/sup /spl infin//", "\\$\\^\\\\infty\\$");
575             // Replace general expressions
576             abstr = abstr.replaceAll("/[sS]pl ([^/]+)/", "\\$\\\\$1\\$");
577             // Deal with subscripts and superscripts       
578             if (Globals.prefs.getBoolean("useConvertToEquation")) {
579                 abstr = abstr.replaceAll("/sup ([^/]+)/", "\\$\\^\\{$1\\}\\$");
580                 abstr = abstr.replaceAll("/sub ([^/]+)/", "\\$_\\{$1\\}\\$");
581                 abstr = abstr.replaceAll("\\(sup\\)([^(]+)\\(/sup\\)", "\\$\\^\\{$1\\}\\$");
582                 abstr = abstr.replaceAll("\\(sub\\)([^(]+)\\(/sub\\)", "\\_\\{$1\\}\\$");
583             } else {
584                 abstr = abstr.replaceAll("/sup ([^/]+)/", "\\\\textsuperscript\\{$1\\}");
585                 abstr = abstr.replaceAll("/sub ([^/]+)/", "\\\\textsubscript\\{$1\\}");
586                 abstr = abstr.replaceAll("\\(sup\\)([^(]+)\\(/sup\\)", "\\\\textsuperscript\\{$1\\}");
587                 abstr = abstr.replaceAll("\\(sub\\)([^(]+)\\(/sub\\)", "\\\\textsubscript\\{$1\\}");
588             }
589             // Replace \infin with \infty
590             abstr = abstr.replaceAll("\\\\infin", "\\\\infty");
591             // Write back
592             entry.setField("abstract", abstr);
593         }
594         
595         // Clean up url
596         String url = (String) entry.getField("url");
597         if (url != null) {
598             entry.setField("url","http://ieeexplore.ieee.org"+url.replace("tp=&",""));
599         }
600         return entry;
601     }
602
603     private String parseNextEntryId(String allText, int startIndex) {
604             int index = allText.indexOf("<div class=\"select", startIndex);
605             int endIndex = allText.indexOf("</div>", index);
606             
607             if (index >= 0 && endIndex > 0) {
608                 String text = allText.substring(index, endIndex);
609                 endIndex += 6;
610                 piv = endIndex;
611                 //parse id
612                 Matcher idMatcher = idPattern.matcher(text);
613                 //add id into a vector
614                 if (idMatcher.find()) {
615                         return idMatcher.group(1);
616                 }
617             }
618             return null;
619     }
620     
621     private BibtexEntry parseNextEntry(String allText, int startIndex) {
622         BibtexEntry entry = null;
623         
624         int index = allText.indexOf("<div class=\"detail", piv);
625         int endIndex = allText.indexOf("</div>", index);
626
627         if (index >= 0 && endIndex > 0) {
628                 endIndex += 6;
629                 piv = endIndex;
630                 String text = allText.substring(index, endIndex);
631             
632             BibtexEntryType type = null;
633             String sourceField = null;
634             
635             String typeName = "";
636             Matcher typeMatcher = typePattern.matcher(text);
637             if (typeMatcher.find()) {
638                     typeName = typeMatcher.group(1);
639                     if (typeName.equalsIgnoreCase("IEEE Journals &amp; Magazines") || typeName.equalsIgnoreCase("IEEE Early Access Articles") ||
640                                 typeName.equalsIgnoreCase("IET Journals &amp; Magazines") || typeName.equalsIgnoreCase("AIP Journals &amp; Magazines") ||
641                                 typeName.equalsIgnoreCase("AVS Journals &amp; Magazines") || typeName.equalsIgnoreCase("IBM Journals &amp; Magazines") || 
642                                 typeName.equalsIgnoreCase("TUP Journals &amp; Magazines") || typeName.equalsIgnoreCase("BIAI Journals &amp; Magazines")) {
643                         type = BibtexEntryType.getType("article");
644                         sourceField = "journal";
645                     } else if (typeName.equalsIgnoreCase("IEEE Conference Publications") || typeName.equalsIgnoreCase("IET Conference Publications") || typeName.equalsIgnoreCase("VDE Conference Publications")) {
646                         type = BibtexEntryType.getType("inproceedings");
647                         sourceField = "booktitle";
648                         } else if (typeName.equalsIgnoreCase("IEEE Standards") || typeName.equalsIgnoreCase("Standards")) {
649                         type = BibtexEntryType.getType("standard");
650                         sourceField = "number";
651                         } else if (typeName.equalsIgnoreCase("IEEE eLearning Library Courses")) {
652                                 type = BibtexEntryType.getType("Electronic");
653                                 sourceField = "note";
654                         } else if (typeName.equalsIgnoreCase("Wiley-IEEE Press eBook Chapters") || typeName.equalsIgnoreCase("MIT Press eBook Chapters") ||
655                                 typeName.equalsIgnoreCase("IEEE USA Books &amp; eBooks")) {
656                                 type = BibtexEntryType.getType("inCollection");
657                                 sourceField = "booktitle";
658                         }
659             } 
660             
661             if (type == null) {
662                 type = BibtexEntryType.getType("misc");
663                 sourceField = "note";
664                 System.err.println("Type detection failed. Use MISC instead.");
665                 unparseable++;
666                 System.err.println(text);
667             }
668             
669             entry = new BibtexEntry(Util.createNeutralId(), type);
670             
671             if (typeName.equalsIgnoreCase("IEEE Standards")) {
672                 entry.setField("organization", "IEEE");
673             }
674             
675             if (typeName.equalsIgnoreCase("Wiley-IEEE Press eBook Chapters")) {
676                 entry.setField("publisher", "Wiley-IEEE Press");
677             } else if(typeName.equalsIgnoreCase("MIT Press eBook Chapters")) {
678                 entry.setField("publisher", "MIT Press");
679             } else if(typeName.equalsIgnoreCase("IEEE USA Books &amp; eBooks")) {
680                 entry.setField("publisher", "IEEE USA");
681             }
682             
683             if (typeName.equalsIgnoreCase("IEEE Early Access Articles")) {
684                 entry.setField("note", "Early Access");
685             }
686             
687             Set<String> fields = fieldPatterns.keySet();
688             for (String field: fields) {
689                 Matcher fieldMatcher = Pattern.compile(fieldPatterns.get(field)).matcher(text);
690                 if (fieldMatcher.find()) {
691                         entry.setField(field, htmlConverter.format(fieldMatcher.group(1)));
692                         if (field.equals("title") && fieldMatcher.find()) {
693                                 String sec_title = htmlConverter.format(fieldMatcher.group(1));
694                                 if (entry.getType() == BibtexEntryType.getStandardType("standard")) {
695                                         sec_title = sec_title.replaceAll("IEEE Std ", "");
696                                 }
697                                 entry.setField(sourceField, sec_title);
698                                 
699                         }
700                         if (field.equals("pages") && fieldMatcher.groupCount() == 2) {
701                                 entry.setField(field, fieldMatcher.group(1) + "-" + fieldMatcher.group(2));
702                         }
703                 } 
704             }
705             if (entry.getField("author") == null || entry.getField("author").startsWith("a href") ||
706                     entry.getField("author").startsWith("Topic(s)")) {  // Fix for some documents without authors
707                 entry.setField("author","");
708             }
709             if (entry.getType() == BibtexEntryType.getStandardType("inproceedings") && entry.getField("author").equals("")) {
710                 entry.setType(BibtexEntryType.getStandardType("proceedings"));
711             }
712         
713             if (includeAbstract) {
714                     index = text.indexOf("id=\"abstract");
715                     if (index >= 0) {
716                         endIndex = text.indexOf("</div>", index) + 6;
717                             
718                         text = text.substring(index, endIndex);
719                         Matcher absMatcher = absPattern.matcher(text);
720                         if (absMatcher.find()) {
721                                 // Clean-up abstract
722                             String abstr=absMatcher.group(1);
723                             abstr = abstr.replaceAll("<span class='snippet'>([\\w]+)</span>","$1");
724                                 
725                             entry.setField("abstract", htmlConverter.format(abstr));
726                         }
727                     }
728             }
729         }
730         
731         if (entry == null) {
732                 return null;
733         } else {
734             return cleanup(entry);
735         }
736     }
737
738     /**
739      * Find out how many hits were found.
740      * @param page
741      */
742     private int getNumberOfHits(String page, String marker, Pattern pattern) throws IOException {
743         int ind = page.indexOf(marker);
744         if (ind < 0) {
745                 System.out.println(page);
746             throw new IOException(Globals.lang("Could not parse number of hits"));
747         }
748         String substring = page.substring(ind, page.length());
749         Matcher m = pattern.matcher(substring);
750         if (m.find())
751             return Integer.parseInt(m.group(1));
752         else
753                 throw new IOException(Globals.lang("Could not parse number of hits"));
754     }
755
756     /**
757      * Download the URL and return contents as a String.
758      * @param source
759      * @return
760      * @throws IOException
761      */
762     public String getResults(URL source) throws IOException {
763         
764         InputStream in = source.openStream();
765         StringBuffer sb = new StringBuffer();
766         byte[] buffer = new byte[256];
767         while(true) {
768             int bytesRead = in.read(buffer);
769             if(bytesRead == -1) break;
770             for (int i=0; i<bytesRead; i++)
771                 sb.append((char)buffer[i]);
772         }
773         return sb.toString();
774     }
775
776     /**
777      * Read results from a file instead of an URL. Just for faster debugging.
778      * @param f
779      * @return
780      * @throws IOException
781      */
782     public String getResultsFromFile(File f) throws IOException {
783         InputStream in = new BufferedInputStream(new FileInputStream(f));
784         StringBuffer sb = new StringBuffer();
785         byte[] buffer = new byte[256];
786         while(true) {
787             int bytesRead = in.read(buffer);
788             if(bytesRead == -1) break;
789             for (int i=0; i<bytesRead; i++)
790                 sb.append((char)buffer[i]);
791         }
792         return sb.toString();
793     }
794 }