d132579b928c07960870a09247f5643d1fd8abf2
[debian/jabref.git] / src / java / net / sf / jabref / imports / IEEEXploreFetcher.java
1 /*  Copyright (C) 2003-2011 JabRef contributors.
2     This program is free software; you can redistribute it and/or modify
3     it under the terms of the GNU General Public License as published by
4     the Free Software Foundation; either version 2 of the License, or
5     (at your option) any later version.
6
7     This program is distributed in the hope that it will be useful,
8     but WITHOUT ANY WARRANTY; without even the implied warranty of
9     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10     GNU General Public License for more details.
11
12     You should have received a copy of the GNU General Public License along
13     with this program; if not, write to the Free Software Foundation, Inc.,
14     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
15 */
16 package net.sf.jabref.imports;
17
18 import java.awt.BorderLayout;
19
20 import java.io.BufferedReader;
21 import java.io.BufferedInputStream;
22 import java.io.File;
23 import java.io.FileInputStream;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.InputStreamReader;
27 import java.io.PrintWriter;
28
29 import java.net.ConnectException;
30 import java.net.MalformedURLException;
31 import java.net.URL;
32 import java.net.URLConnection;
33 import java.net.URLEncoder;
34
35 import java.util.ArrayList;
36 import java.util.Collection;
37 import java.util.HashMap;
38 import java.util.Iterator;
39 import java.util.List;
40 import java.util.Set;
41 import java.util.regex.Matcher;
42 import java.util.regex.Pattern;
43
44 import javax.swing.ButtonGroup;
45 import javax.swing.JCheckBox;
46 import javax.swing.JOptionPane;
47 import javax.swing.JPanel;
48 import javax.swing.JRadioButton;
49
50 import net.sf.jabref.BibtexDatabase;
51 import net.sf.jabref.BibtexEntry;
52 import net.sf.jabref.BibtexEntryType;
53 import net.sf.jabref.GUIGlobals;
54 import net.sf.jabref.Globals;
55 import net.sf.jabref.OutputPrinter;
56 import net.sf.jabref.Util;
57
58 public class IEEEXploreFetcher implements EntryFetcher {
59
60     final CaseKeeperList caseKeeperList = new CaseKeeperList();
61     final CaseKeeper caseKeeper = new CaseKeeper();
62     
63     ImportInspector dialog = null;
64         OutputPrinter status;
65     final HTMLConverter htmlConverter = new HTMLConverter();
66     
67     private JCheckBox absCheckBox = new JCheckBox(Globals.lang("Include abstracts"), false);
68     private JRadioButton htmlButton = new JRadioButton(Globals.lang("HTML parser"));
69     private JRadioButton bibButton = new JRadioButton(Globals.lang("BibTeX importer"));
70     
71     private static final int MAX_FETCH = 100;
72     private int perPage = MAX_FETCH, hits = 0, unparseable = 0, parsed = 0;
73     private int piv = 0;
74     private boolean shouldContinue = false;
75     private boolean includeAbstract = false;
76     private boolean importBibtex = false;
77     
78     private String terms;
79     private final String startUrl = "http://ieeexplore.ieee.org/search/freesearchresult.jsp?queryText=";
80     private final String endUrl = "&rowsPerPage=" + Integer.toString(perPage) + "&pageNumber=";
81     private String searchUrl;
82     private final String importUrl = "http://ieeexplore.ieee.org/xpls/downloadCitations";
83     
84     private final Pattern hitsPattern = Pattern.compile("([0-9,]+) Results");
85     private final Pattern idPattern = Pattern.compile("<input name=\'\' title=\'.*\' type=\'checkbox\'" + 
86                                                       "value=\'\'\\s*id=\'([0-9]+)\'/>");
87     private final Pattern typePattern = Pattern.compile("<span class=\"type\">\\s*(.+)");
88     private HashMap<String, String> fieldPatterns = new HashMap<String, String>();
89     private final Pattern absPattern = Pattern.compile("<p>\\s*(.+)");
90     
91     Pattern stdEntryPattern = Pattern.compile(".*<strong>(.+)</strong><br>"
92                         + "\\s+(.+)");
93     
94     Pattern publicationPattern = Pattern.compile("(.*), \\d*\\.*\\s?(.*)");
95     Pattern proceedingPattern = Pattern.compile("(.*?)\\.?\\s?Proceedings\\s?(.*)");
96     Pattern abstractLinkPattern = Pattern.compile(
97            "<a href=\'(.+)\'>\\s*<span class=\"more\">View full.*</span> </a>");
98     String abrvPattern = ".*[^,] '?\\d+\\)?";
99
100     Pattern ieeeArticleNumberPattern = Pattern.compile("<a href=\".*arnumber=(\\d+).*\">");
101     
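    // NOTE(review): the original comment here was cut off mid-sentence ("Common words in IEEE Xplore
    // that should always be ..."). It presumably referred to the case-keeper word list
    // (caseKeeperList.wordListIEEEXplore) -- confirm and complete, or remove.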
103     
104     public IEEEXploreFetcher() {
105         super();
106         
107         fieldPatterns.put("title", "<a\\s*href=[^<]+>\\s*(.+)\\s*</a>");
108         fieldPatterns.put("author", "</h3>\\s*(.+)");
109         fieldPatterns.put("volume", "Volume:\\s*([A-Za-z-]*\\d+)");
110         fieldPatterns.put("number", "Issue:\\s*(\\d+)");
111         //fieldPatterns.put("part", "Part (\\d+),&nbsp;(.+)");
112         fieldPatterns.put("year", "(?:Copyright|Publication) Year:\\s*(\\d{4})");
113         fieldPatterns.put("pages", "Page\\(s\\):\\s*(\\d+)\\s*-\\s*(\\d*)");
114         //fieldPatterns.put("doi", "Digital Object Identifier:\\s*<a href=.*>(.+)</a>");
115         fieldPatterns.put("doi", "<a href=\"http://dx.doi.org/(.+)\" target");
116         fieldPatterns.put("url", "<a href=\"(/stamp/stamp[^\"]+)");       
117     }
118     public JPanel getOptionsPanel() {
119         JPanel pan = new JPanel();
120         pan.setLayout(new BorderLayout());
121         htmlButton.setSelected(true);
122         htmlButton.setEnabled(false);
123         bibButton.setEnabled(false);
124         
125         ButtonGroup group = new ButtonGroup();
126         group.add(htmlButton);
127         group.add(bibButton);
128         pan.add(absCheckBox, BorderLayout.NORTH);
129         pan.add(htmlButton, BorderLayout.CENTER);
130         pan.add(bibButton, BorderLayout.EAST);
131                 
132         return pan;
133     }
134
135     public boolean processQuery(String query, ImportInspector dialog, OutputPrinter status) {
136         this.dialog = dialog;
137         this.status = status;
138         terms = query;
139         piv = 0;
140         shouldContinue = true;
141         parsed = 0;
142         unparseable = 0;
143         int pageNumber = 1;
144         
145         searchUrl = makeUrl(pageNumber);//start at page 1
146         
147         try {
148                 URL url = new URL(searchUrl);
149                 String page = getResults(url);
150             
151             if (page.indexOf("You have entered an invalid search") >= 0) {
152                 status.showMessage(Globals.lang("You have entered an invalid search '%0'.",
153                         terms),
154                         Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE);
155                 return false;
156             }
157             
158             if (page.indexOf("Bad request") >= 0) {
159                 status.showMessage(Globals.lang("Bad Request '%0'.",
160                         terms),
161                         Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE);
162                 return false;
163             }
164             
165             if (page.indexOf("No results were found.") >= 0) {
166                 status.showMessage(Globals.lang("No entries found for the search string '%0'",
167                         terms),
168                         Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE);
169                 return false;
170             }
171                         
172             if (page.indexOf("Error Page") >= 0) {
173                 status.showMessage(Globals.lang("Intermittent errors on the IEEE Xplore server. Please try again in a while."),
174                         Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE);
175                 return false;
176             }
177             
178             hits = getNumberOfHits(page, "display-status", hitsPattern);
179
180
181             includeAbstract = absCheckBox.isSelected();
182             importBibtex = bibButton.isSelected();
183             
184             if (hits > MAX_FETCH) {
185                 status.showMessage(Globals.lang("%0 entries found. To reduce server load, "
186                        +"only %1 will be downloaded.",
187                                 new String[] {String.valueOf(hits), String.valueOf(MAX_FETCH)}),
188                         Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE);
189                         hits = MAX_FETCH;
190             }
191
192             parse(dialog, page, 0, 1);
193             int firstEntry = perPage;
194             while (shouldContinue && firstEntry < hits) {
195                 pageNumber++;
196                 searchUrl = makeUrl(pageNumber);
197                 page = getResults(new URL(searchUrl));
198
199                 if (!shouldContinue)
200                     break;
201
202                 parse(dialog, page, 0, firstEntry + 1);
203                 firstEntry += perPage;
204
205             }
206             return true;
207         } catch (MalformedURLException e) {
208             e.printStackTrace();
209         } catch (ConnectException e) {
210             status.showMessage(Globals.lang("Connection to IEEEXplore failed"),
211                     Globals.lang("Search IEEEXplore"), JOptionPane.ERROR_MESSAGE);
212         } catch (IOException e) {
213                 status.showMessage(Globals.lang(e.getMessage()),
214                     Globals.lang("Search IEEEXplore"), JOptionPane.ERROR_MESSAGE);
215             e.printStackTrace();
216         }
217         return false;
218     }
219
220     public String getTitle() {
221         return "IEEEXplore";
222     }
223
224     public URL getIcon() {
225         return GUIGlobals.getIconUrl("www");
226     }
227
228     public String getHelpPage() {
229         return "IEEEXploreHelp.html";
230     }
231
232     public String getKeyName() {
233         return "IEEEXplore";
234     }
235
236     /**
237      * This method is called by the dialog when the user has cancelled the import.
238      */
239     public void stopFetching() {
240         shouldContinue = false;
241     }
242
243     private String makeUrl(int startIndex) {
244         StringBuffer sb = new StringBuffer(startUrl);
245         sb.append(terms.replaceAll(" ", "+"));
246         sb.append(endUrl);
247         sb.append(String.valueOf(startIndex));
248         return sb.toString();
249     }
250
251     
252
253     private void parse(ImportInspector dialog, String text, int startIndex, int firstEntryNumber) {
254         piv = startIndex;
255         int entryNumber = firstEntryNumber;
256         
257         if (importBibtex) {
258                         //TODO: Login
259                 ArrayList<String> idSelected = new ArrayList<String>();
260                 String id;
261                         while ((id = parseNextEntryId(text, piv)) != null && shouldContinue) {
262                         idSelected.add(id);
263                         entryNumber++;
264                 }
265                         try {
266                                 BibtexDatabase dbase = parseBibtexDatabase(idSelected, includeAbstract);
267                                 Collection<BibtexEntry> items = dbase.getEntries();
268                                 Iterator<BibtexEntry> iter = items.iterator();
269                                 while (iter.hasNext()) {
270                                         BibtexEntry entry = iter.next();
271                                         dialog.addEntry(cleanup(entry));
272                         dialog.setProgress(parsed + unparseable, hits);
273                         parsed++;
274                                 }
275                         } catch (IOException e) {
276                                 e.printStackTrace();
277                         }
278                         //for
279         } else {
280                 BibtexEntry entry;
281                 while (((entry = parseNextEntry(text, piv)) != null) && shouldContinue) {
282                     if (entry.getField("title") != null) {
283                         dialog.addEntry(entry);
284                         dialog.setProgress(parsed + unparseable, hits);
285                         parsed++;
286                     }
287                     entryNumber++;
288                 }
289         }
290     }
291
292     private BibtexDatabase parseBibtexDatabase(List<String> id, boolean abs) throws IOException {
293         if (id.isEmpty())
294                 return null;
295         URL url;
296         URLConnection conn;
297         try {
298             url = new URL(importUrl);
299             conn = url.openConnection();
300         } catch (MalformedURLException e) {
301             e.printStackTrace();
302             return null;
303         }
304         conn.setDoInput(true);
305         conn.setDoOutput(true);
306         conn.setRequestProperty("Content-Type",
307                 "application/x-www-form-urlencoded");
308         conn.setRequestProperty("Referer", searchUrl);
309         PrintWriter out = new PrintWriter(
310                 conn.getOutputStream());
311
312                 String recordIds = "";
313                 Iterator<String> iter = id.iterator();
314                 while (iter.hasNext()) { 
315                 recordIds += iter.next() + " ";
316                 }
317                 recordIds = recordIds.trim();
318                 String citation = abs ? "citation-abstract" : "citation-only";
319                 
320                 String content = "recordIds=" + recordIds.replaceAll(" ", "%20") + "&fromPageName=&citations-format=" + citation + "&download-format=download-bibtex";
321                 System.out.println(content);
322         out.write(content);
323         out.flush();
324         out.close();
325
326         BufferedReader bufr = new BufferedReader(new InputStreamReader(conn.getInputStream()));
327         StringBuffer sb = new StringBuffer();
328         char[] buffer = new char[256];
329         while(true) {
330             int bytesRead = bufr.read(buffer);
331             if(bytesRead == -1) break;
332             for (int i=0; i<bytesRead; i++)
333                 sb.append((char)buffer[i]);
334         }
335         System.out.println(sb.toString());
336         
337         ParserResult results = new BibtexParser(bufr).parse();
338         bufr.close();
339         return results.getDatabase();
340     }
341
342     private BibtexEntry cleanup(BibtexEntry entry) {
343         if (entry == null)
344                 return null;
345         
346         // clean up title
347         String title = (String)entry.getField("title");
348         if (title != null) {
349             // USe the alt-text and replace image links
350             title = title.replaceAll("[ ]?img src=[^ ]+ alt=\"([^\"]+)\">[ ]?", "\\$$1\\$");
351             // Try to sort out most of the /spl / conversions
352             // Deal with this specific nested type first
353             title = title.replaceAll("/sub /spl infin//", "\\$_\\\\infty\\$");
354             title = title.replaceAll("/sup /spl infin//", "\\$\\^\\\\infty\\$");
355             // Replace general expressions
356             title = title.replaceAll("/[sS]pl ([^/]+)/", "\\$\\\\$1\\$");
357             // Deal with subscripts and superscripts       
358             if (Globals.prefs.getBoolean("useConvertToEquation")) {
359                 title = title.replaceAll("/sup ([^/]+)/", "\\$\\^\\{$1\\}\\$");
360                 title = title.replaceAll("/sub ([^/]+)/", "\\$_\\{$1\\}\\$");
361                 title = title.replaceAll("\\(sup\\)([^(]+)\\(/sup\\)", "\\$\\^\\{$1\\}\\$");
362                 title = title.replaceAll("\\(sub\\)([^(]+)\\(/sub\\)", "\\_\\{$1\\}\\$");
363             } else {
364                 title = title.replaceAll("/sup ([^/]+)/", "\\\\textsuperscript\\{$1\\}");
365                 title = title.replaceAll("/sub ([^/]+)/", "\\\\textsubscript\\{$1\\}");
366                 title = title.replaceAll("\\(sup\\)([^(]+)\\(/sup\\)", "\\\\textsuperscript\\{$1\\}");
367                 title = title.replaceAll("\\(sub\\)([^(]+)\\(/sub\\)", "\\\\textsubscript\\{$1\\}");
368             }
369
370             // Replace \infin with \infty
371             title = title.replaceAll("\\\\infin", "\\\\infty");
372             // Automatic case keeping
373             if (Globals.prefs.getBoolean("useCaseKeeperOnSearch")) {
374                 title = caseKeeper.format(title, caseKeeperList.wordListIEEEXplore);
375             }
376             // Write back
377             entry.setField("title", title);
378         }
379         
380         // clean up author
381         String author = (String)entry.getField("author");
382         if (author != null) {
383             if (author.indexOf("a href=") >= 0) {  // Author parsing failed because it was empty
384                 entry.setField("author","");  // Maybe not needed anymore due to another change
385             } else {
386                 author = author.replaceAll("\\.", ". ");
387                 author = author.replaceAll("  ", " ");
388                 author = author.replaceAll("\\. -", ".-");
389                 author = author.replaceAll("; ", " and ");
390                 author = author.replaceAll("[,;]$", "");
391                 entry.setField("author", author);
392             }
393         }
394         // clean up month
395         String month = (String)entry.getField("month");
396         if ((month != null) && (month.length() > 0)) {
397                 month = month.replaceAll("\\.", "");
398                 month = month.toLowerCase();
399
400                 Pattern monthPattern = Pattern.compile("(\\d*+)\\s*([a-z]*+)-*(\\d*+)\\s*([a-z]*+)");
401                 Matcher mm = monthPattern.matcher(month);
402                 String date = month;
403                 if (mm.find()) {
404                         if (mm.group(3).length() == 0) {
405                                 if (mm.group(2).length() > 0) {
406                                         date = "#" + mm.group(2).substring(0, 3) + "#";
407                                         if (mm.group(1).length() > 0) {
408                                                 date += " " + mm.group(1) + ",";
409                                         }
410                                 } else {
411                                         date = mm.group(1) + ",";
412                                 }
413                         } else if (mm.group(2).length() == 0) {
414                                 if (mm.group(4).length() > 0) {
415                                         date = "#" + mm.group(4).substring(0, 3) + "# " + mm.group(1) + "--" + mm.group(3) + ",";
416                                 } else
417                                         date += ",";
418                         } else {
419                                 date = "#" + mm.group(2).substring(0, 3) + "# " + mm.group(1) + "--#" + mm.group(4).substring(0, 3) + "# " + mm.group(3) + ",";
420                         }
421                 }
422                 //date = date.trim();
423                 //if (!date.isEmpty()) {
424                 entry.setField("month", date);
425                 //}
426         }
427         
428         // clean up pages
429         String field = "pages";
430         String pages = entry.getField(field);
431         if (pages != null) {
432                 String [] pageNumbers = pages.split("-");
433                 if (pageNumbers.length == 2) {
434                         if (pageNumbers[0].equals(pageNumbers[1])) {// single page
435                                 entry.setField(field, pageNumbers[0]);
436                         } else {
437                                 entry.setField(field, pages.replaceAll("-", "--"));
438                         }
439                 }
440         }
441         
442         // clean up publication field
443         BibtexEntryType type = entry.getType();
444         String sourceField = "";
445                 if (type.getName() == "Article") {
446                 sourceField = "journal";
447                         entry.clearField("booktitle");
448                 } else if (type.getName() == "Inproceedings"){
449             sourceField = "booktitle";
450                 }
451         String fullName = entry.getField(sourceField);
452         if (fullName != null) {
453                 if (type.getName() == "Article") {
454                         int ind = fullName.indexOf(": Accepted for future publication");
455                                 if (ind > 0) {
456                                         fullName = fullName.substring(0, ind);
457                                         entry.setField("year", "to be published");
458                                         entry.clearField("month");
459                                         entry.clearField("pages");
460                                         entry.clearField("number");
461                                 }
462                         String[] parts = fullName.split("[\\[\\]]"); //[see also...], [legacy...]
463                         fullName = parts[0];
464                         if (parts.length == 3) {
465                                         fullName += parts[2];
466                                 }
467                         if(entry.getField("note") ==  "Early Access") {
468                                         entry.setField("year", "to be published");
469                                         entry.clearField("month");
470                                         entry.clearField("pages");
471                                         entry.clearField("number");
472                         }
473                 } else {
474                         fullName = fullName.replace("Conference Proceedings", "Proceedings").
475                                         replace("Proceedings of", "Proceedings").replace("Proceedings.", "Proceedings");
476                         fullName = fullName.replaceAll("International", "Int.");
477                         fullName = fullName.replaceAll("Symposium", "Symp.");
478                         fullName = fullName.replaceAll("Conference", "Conf.");
479                         fullName = fullName.replaceAll(" on", " ").replace("  ", " ");
480                 }
481                 
482                 Matcher m1 = publicationPattern.matcher(fullName);
483                         if (m1.find()) {
484                                 String prefix = m1.group(2).trim();
485                                 String postfix = m1.group(1).trim();
486                                 String abrv = "";
487                                 String[] parts = prefix.split("\\. ", 2);
488                                 if (parts.length == 2) {
489                                         if (parts[0].matches(abrvPattern)) {
490                                                 prefix = parts[1];
491                                                 abrv = parts[0];
492                                         } else {
493                                                 prefix = parts[0];
494                                                 abrv = parts[1];
495                                         }
496                                 }
497                                 if (prefix.matches(abrvPattern) == false) {
498                                         fullName = prefix + " " + postfix + " " + abrv;
499                                         fullName = fullName.trim();
500                                 } else {
501                                         fullName = postfix + " " + prefix;
502                                 }
503                         }
504                         if (type.getName() == "Article") {
505                                 fullName = fullName.replace(" - ", "-"); //IEE Proceedings-
506                                 
507                                 fullName = fullName.trim();
508                                 if (Globals.prefs.getBoolean("useIEEEAbrv")) {
509                                         String id = Globals.journalAbbrev.getAbbreviatedName(fullName, false);
510                                         if (id != null)
511                                                 fullName = id;
512                                 }
513                 }
514                         if (type.getName() == "Inproceedings") {
515                     Matcher m2 = proceedingPattern.matcher(fullName);
516                                 if (m2.find()) {
517                                         String prefix = m2.group(2); 
518                                         String postfix = m2.group(1).replaceAll("\\.$", "");
519                                         if (prefix.matches(abrvPattern) == false) {
520                                                 String abrv = "";
521                                         
522                                                 String[] parts = postfix.split("\\. ", 2);
523                                                 if (parts.length == 2) {
524                                                         if (parts[0].matches(abrvPattern)) {
525                                                                 postfix = parts[1];
526                                                                 abrv = parts[0];
527                                                         } else {
528                                                                 postfix = parts[0];
529                                                                 abrv = parts[1];
530                                                         }
531                                                 }
532                                                 fullName = prefix.trim() + " " + postfix.trim() + " " + abrv;
533                                                 
534                                         } else {
535                                                 fullName = postfix.trim() + " " + prefix.trim();
536                                         }
537                                         
538                                 }
539                                 
540                                 fullName = fullName.trim();
541                                 
542                                 fullName = fullName.replaceAll("^[tT]he ", "").replaceAll("^\\d{4} ", "").replaceAll("[,.]$", "");
543                                 String year = entry.getField("year");
544                                 fullName = fullName.replaceAll(", " + year + "\\.?", "");
545                                 
546                         if (fullName.contains("Abstract") == false && fullName.contains("Summaries") == false && fullName.contains("Conference Record") == false)
547                                 fullName = "Proc. " + fullName;
548                 }
549                         entry.setField(sourceField, fullName);
550         }
551         
552         // clean up abstract
553         String abstr = (String) entry.getField("abstract");
554         if (abstr != null) {
555             // Try to sort out most of the /spl / conversions
556             // Deal with this specific nested type first
557             abstr = abstr.replaceAll("/sub /spl infin//", "\\$_\\\\infty\\$");
558             abstr = abstr.replaceAll("/sup /spl infin//", "\\$\\^\\\\infty\\$");
559             // Replace general expressions
560             abstr = abstr.replaceAll("/[sS]pl ([^/]+)/", "\\$\\\\$1\\$");
561             // Deal with subscripts and superscripts       
562             if (Globals.prefs.getBoolean("useConvertToEquation")) {
563                 abstr = abstr.replaceAll("/sup ([^/]+)/", "\\$\\^\\{$1\\}\\$");
564                 abstr = abstr.replaceAll("/sub ([^/]+)/", "\\$_\\{$1\\}\\$");
565                 abstr = abstr.replaceAll("\\(sup\\)([^(]+)\\(/sup\\)", "\\$\\^\\{$1\\}\\$");
566                 abstr = abstr.replaceAll("\\(sub\\)([^(]+)\\(/sub\\)", "\\_\\{$1\\}\\$");
567             } else {
568                 abstr = abstr.replaceAll("/sup ([^/]+)/", "\\\\textsuperscript\\{$1\\}");
569                 abstr = abstr.replaceAll("/sub ([^/]+)/", "\\\\textsubscript\\{$1\\}");
570                 abstr = abstr.replaceAll("\\(sup\\)([^(]+)\\(/sup\\)", "\\\\textsuperscript\\{$1\\}");
571                 abstr = abstr.replaceAll("\\(sub\\)([^(]+)\\(/sub\\)", "\\\\textsubscript\\{$1\\}");
572             }
573             // Replace \infin with \infty
574             abstr = abstr.replaceAll("\\\\infin", "\\\\infty");
575             // Write back
576             entry.setField("abstract", abstr);
577         }
578         
579         // Clean up url
580         String url = (String) entry.getField("url");
581         if (url != null) {
582             entry.setField("url","http://ieeexplore.ieee.org"+url);
583         }
584         return entry;
585     }
586
587     private String parseNextEntryId(String allText, int startIndex) {
588             int index = allText.indexOf("<div class=\"select", startIndex);
589             int endIndex = allText.indexOf("</div>", index);
590             
591             if (index >= 0 && endIndex > 0) {
592                 String text = allText.substring(index, endIndex);
593                 endIndex += 6;
594                 piv = endIndex;
595                 //parse id
596                 Matcher idMatcher = idPattern.matcher(text);
597                 //add id into a vector
598                 if (idMatcher.find()) {
599                         return idMatcher.group(1);
600                 }
601             }
602             return null;
603     }
604     
605     private BibtexEntry parseNextEntry(String allText, int startIndex) {
606         BibtexEntry entry = null;
607         
608         int index = allText.indexOf("<div class=\"detail", piv);
609         int endIndex = allText.indexOf("</div>", index);
610
611         if (index >= 0 && endIndex > 0) {
612                 endIndex += 6;
613                 piv = endIndex;
614                 String text = allText.substring(index, endIndex);
615             
616             BibtexEntryType type = null;
617             String sourceField = null;
618             
619             String typeName = "";
620             Matcher typeMatcher = typePattern.matcher(text);
621             if (typeMatcher.find()) {
622                     typeName = typeMatcher.group(1);
623                     if (typeName.equalsIgnoreCase("IEEE Journals &amp; Magazines") || typeName.equalsIgnoreCase("IEEE Early Access Articles") ||
624                                 typeName.equalsIgnoreCase("IET Journals &amp; Magazines") || typeName.equalsIgnoreCase("AIP Journals &amp; Magazines") ||
625                                 typeName.equalsIgnoreCase("AVS Journals &amp; Magazines") || typeName.equalsIgnoreCase("IBM Journals &amp; Magazines") || 
626                                 typeName.equalsIgnoreCase("TUP Journals &amp; Magazines") || typeName.equalsIgnoreCase("BIAI Journals &amp; Magazines")) {
627                         type = BibtexEntryType.getType("article");
628                         sourceField = "journal";
629                     } else if (typeName.equalsIgnoreCase("IEEE Conference Publications") || typeName.equalsIgnoreCase("IET Conference Publications") || typeName.equalsIgnoreCase("VDE Conference Publications")) {
630                         type = BibtexEntryType.getType("inproceedings");
631                         sourceField = "booktitle";
632                         } else if (typeName.equalsIgnoreCase("IEEE Standards") || typeName.equalsIgnoreCase("Standards")) {
633                         type = BibtexEntryType.getType("standard");
634                         sourceField = "number";
635                         } else if (typeName.equalsIgnoreCase("IEEE eLearning Library Courses")) {
636                                 type = BibtexEntryType.getType("Electronic");
637                                 sourceField = "note";
638                         } else if (typeName.equalsIgnoreCase("Wiley-IEEE Press eBook Chapters") || typeName.equalsIgnoreCase("MIT Press eBook Chapters")) {
639                                 type = BibtexEntryType.getType("inCollection");
640                                 sourceField = "booktitle";
641                         }
642             } 
643             
644             if (type == null) {
645                 type = BibtexEntryType.getType("misc");
646                 sourceField = "note";
647                 System.err.println("Type detection failed. Use MISC instead.");
648                 unparseable++;
649                 System.err.println(text);
650             }
651             
652             entry = new BibtexEntry(Util.createNeutralId(), type);
653             
654             if (typeName.equalsIgnoreCase("IEEE Standards")) {
655                 entry.setField("organization", "IEEE");
656             }
657             
658             if (typeName.equalsIgnoreCase("Wiley-IEEE Press eBook Chapters")) {
659                 entry.setField("publisher", "Wiley-IEEE Press");
660             } else if(typeName.equalsIgnoreCase("MIT Press eBook Chapters")) {
661                 entry.setField("publisher", "MIT Press");
662             }
663             
664             if (typeName.equalsIgnoreCase("IEEE Early Access Articles")) {
665                 entry.setField("note", "Early Access");
666             }
667             
668             Set<String> fields = fieldPatterns.keySet();
669             for (String field: fields) {
670                 Matcher fieldMatcher = Pattern.compile(fieldPatterns.get(field)).matcher(text);
671                 if (fieldMatcher.find()) {
672                         entry.setField(field, htmlConverter.format(fieldMatcher.group(1)));
673                         if (field.equals("title") && fieldMatcher.find()) {
674                                 String sec_title = htmlConverter.format(fieldMatcher.group(1));
675                                 if (entry.getType() == BibtexEntryType.getStandardType("standard")) {
676                                         sec_title = sec_title.replaceAll("IEEE Std ", "");
677                                 }
678                                 entry.setField(sourceField, sec_title);
679                                 
680                         }
681                         if (field.equals("pages") && fieldMatcher.groupCount() == 2) {
682                                 entry.setField(field, fieldMatcher.group(1) + "-" + fieldMatcher.group(2));
683                         }
684                 } 
685             }
686             if (entry.getField("author") == null || entry.getField("author").startsWith("a href")) {  // Fix for some documents without authors
687                 entry.setField("author","");
688             }
689             if (entry.getType() == BibtexEntryType.getStandardType("inproceedings") && entry.getField("author").equals("")) {
690                 entry.setType(BibtexEntryType.getStandardType("proceedings"));
691             }
692         
693             if (includeAbstract) {
694                     index = text.indexOf("id=\"abstract");
695                     if (index >= 0) {
696                         endIndex = text.indexOf("</div>", index) + 6;
697                             
698                         text = text.substring(index, endIndex);
699                         Matcher absMatcher = absPattern.matcher(text);
700                         if (absMatcher.find()) {
701                                 // Clean-up abstract
702                             String abstr=absMatcher.group(1);
703                             abstr = abstr.replaceAll("<span class='snippet'>([\\w]+)</span>","$1");
704                                 
705                             entry.setField("abstract", htmlConverter.format(abstr));
706                         }
707                     }
708             }
709         }
710         
711         if (entry == null) {
712                 return null;
713         } else {
714             return cleanup(entry);
715         }
716     }
717
718     /**
719      * Find out how many hits were found.
720      * @param page
721      */
722     private int getNumberOfHits(String page, String marker, Pattern pattern) throws IOException {
723         int ind = page.indexOf(marker);
724         if (ind < 0) {
725                 System.out.println(page);
726             throw new IOException(Globals.lang("Could not parse number of hits"));
727         }
728         String substring = page.substring(ind, page.length());
729         Matcher m = pattern.matcher(substring);
730         if (m.find())
731             return Integer.parseInt(m.group(1));
732         else
733                 throw new IOException(Globals.lang("Could not parse number of hits"));
734     }
735
736     /**
737      * Download the URL and return contents as a String.
738      * @param source
739      * @return
740      * @throws IOException
741      */
742     public String getResults(URL source) throws IOException {
743         
744         InputStream in = source.openStream();
745         StringBuffer sb = new StringBuffer();
746         byte[] buffer = new byte[256];
747         while(true) {
748             int bytesRead = in.read(buffer);
749             if(bytesRead == -1) break;
750             for (int i=0; i<bytesRead; i++)
751                 sb.append((char)buffer[i]);
752         }
753         return sb.toString();
754     }
755
756     /**
757      * Read results from a file instead of an URL. Just for faster debugging.
758      * @param f
759      * @return
760      * @throws IOException
761      */
762     public String getResultsFromFile(File f) throws IOException {
763         InputStream in = new BufferedInputStream(new FileInputStream(f));
764         StringBuffer sb = new StringBuffer();
765         byte[] buffer = new byte[256];
766         while(true) {
767             int bytesRead = in.read(buffer);
768             if(bytesRead == -1) break;
769             for (int i=0; i<bytesRead; i++)
770                 sb.append((char)buffer[i]);
771         }
772         return sb.toString();
773     }
774 }