54c1f97b78c71a8de0e032844432b300d8743704
[debian/jabref.git] / src / java / net / sf / jabref / imports / IEEEXploreFetcher.java
1 package net.sf.jabref.imports;
2
3 import net.sf.jabref.net.URLDownload;
4 import net.sf.jabref.*;
5 import net.sf.jabref.gui.ImportInspectionDialog;
6
7 import javax.swing.*;
8 import java.net.URL;
9 import java.net.MalformedURLException;
10 import java.io.*;
11 import java.util.regex.Pattern;
12 import java.util.regex.Matcher;
13 import java.util.List;
14 import java.util.ArrayList;
15 import java.awt.*;
16
17 /**
18  * Created by IntelliJ IDEA.
19  * User: alver
20  * Date: Mar 25, 2006
21  * Time: 1:09:32 PM
22  * To change this template use File | Settings | File Templates.
23  */
24 public class IEEEXploreFetcher implements Runnable, EntryFetcher {
25
26     ImportInspectionDialog dialog = null;
27     JabRefFrame frame = null;
28     HTMLConverter htmlConverter = new HTMLConverter();
29     private String terms;
30     String startUrl = "http://ieeexplore.ieee.org";
31     String searchUrlPart = "/search/freesearchresult.jsp?queryText=";
32     String endUrl = "+%3Cin%3E+metadata&ResultCount=25&ResultStart=";
33     private int perPage = 25, hits = 0, unparseable = 0, parsed = 0;
34     private boolean shouldContinue = false;
35     private JCheckBox fetchAstracts = new JCheckBox(Globals.lang("Include abstracts"), false);
36     private boolean fetchingAbstracts = false;
37     private static final int MAX_ABSTRACT_FETCH = 5;
38
39     public IEEEXploreFetcher() {
40     }
41
42
43     //Pattern hitsPattern = Pattern.compile("Your search matched <strong>(\\d+)</strong>");
44     Pattern hitsPattern = Pattern.compile(".*Your search matched <strong>(\\d+)</strong>.*");
45     Pattern maxHitsPattern = Pattern.compile(".*A maximum of <strong>(\\d+)</strong>.*");
46     Pattern entryPattern1 = Pattern.compile(".*<strong>(.+)</strong><br>\\s+(.+)<br>"
47                 +"\\s+<A href='(.+)'>(.+)</A><br>\\s+Volume (.+),&nbsp;\\s*"
48                 +"(.+)? (\\d\\d\\d\\d)\\s+Page\\(s\\):.*");
49
50     Pattern entryPattern2 = Pattern.compile(".*<strong>(.+)</strong><br>\\s+(.+)<br>"
51                     +"\\s+<A href='(.+)'>(.+)</A><br>\\s+Volume (.+),&nbsp;\\s+Issue (\\d+),&nbsp;\\s*"
52                     +"(.+)? (\\d\\d\\d\\d)\\s+Page\\(s\\):.*");
53
54
55     Pattern entryPattern3 = Pattern.compile(".*<strong>(.+)</strong><br>\\s+(.+)<br>"
56                     +"\\s+<A href='(.+)'>(.+)</A><br>\\s+Volume (.+),&nbsp;\\s+Issue (\\d+),&nbsp;" +
57                     "\\s+Part (\\d+),&nbsp;\\s*" //"[\\s-\\d]+"
58                     +"(.+)? (\\d\\d\\d\\d)\\s+Page\\(s\\):.*");
59
60     Pattern entryPattern4 = Pattern.compile(".*<strong>(.+)</strong><br>\\s+(.+)<br>"
61                     +"\\s+<A href='(.+)'>(.+)</A><br>\\s*" //[\\s-\\da-z]+"
62                     +"(.+)? (\\d\\d\\d\\d)\\s+Page\\(s\\):.*");
63
64     Pattern abstractLinkPattern = Pattern.compile(
65             "<a href=\"(.+)\" class=\"bodyCopySpaced\">Abstract</a>");
66
67     public JPanel getOptionsPanel() {
68         JPanel pan = new JPanel();
69         pan.setLayout(new BorderLayout());
70         pan.add(fetchAstracts, BorderLayout.CENTER);
71         return pan;
72     }
73
74     public void processQuery(String query, ImportInspectionDialog dialog, JabRefFrame frame) {
75         this.dialog = dialog;
76         this.frame =frame;
77         this.terms = query;
78         piv = 0;
79         (new Thread(this)).start();
80     }
81
82
83
84     public String getTitle() {
85         return Globals.menuTitle("Search IEEEXplore");
86     }
87
88
89     public URL getIcon() {
90         return GUIGlobals.wwwIcon;
91     }
92
93     public String getHelpPage() {
94         return "IEEEXplorerHelp.html";
95     }
96
97     public String getKeyName() {
98         return "Search IEEXplore";
99     }
100
101     // This method is called by the dialog when the user has cancelled the import.
102     public void cancelled() {
103         shouldContinue = false;
104     }
105
106     // This method is called by the dialog when the user has selected the
107 // wanted entries, and clicked Ok. The callback object can update status
108 // line etc.
109     public void done(int entriesImported) {
110         //System.out.println("Number of entries parsed: "+parsed);
111         //System.out.println("Parsing failed for "+unparseable+" entries");
112     }
113
114     // This method is called by the dialog when the user has cancelled or
115 // signalled a stop. It is expected that any long-running fetch operations
116 // will stop after this method is called.
117     public void stopFetching() {
118         shouldContinue = false;
119     }
120
121     /**
122      * The code that runs the actual search and fetch operation.
123      */
124     public void run() {
125         frame.block();
126         shouldContinue = true;
127         parsed = 0;
128         unparseable = 0;
129         String address = makeUrl(0);
130         try {
131             URL url = new URL(address);
132             // Fetch the search page and put the contents in a String:
133             //String page = getResultsFromFile(new File("/home/alver/div/temp.txt"));
134             //URLDownload ud = new URLDownload(new JPanel(), url, new File("/home/alver/div/temp.txt"));
135             //ud.download();
136
137             //dialog.setVisible(true);
138             String page = getResults(url);
139             hits = getNumberOfHits(page, "Your search matched", hitsPattern);
140
141             frame.unblock();
142             if (hits == 0) {
143                 dialog.dispose();
144                 JOptionPane.showMessageDialog(frame, Globals.lang("No entries found for the search string '%0'",
145                         terms),
146                         Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE);
147                 return;
148             } else {
149                 fetchingAbstracts = fetchAstracts.isSelected();
150                 if (fetchingAbstracts && (hits > MAX_ABSTRACT_FETCH)) {
151                     fetchingAbstracts = false;
152                     JOptionPane.showMessageDialog(frame,
153                             Globals.lang("%0 entries found. To reduce server load, abstracts "
154                             +"will only be downloaded for searches returning %1 hits or less.",
155                                     new String[] {String.valueOf(hits), String.valueOf(MAX_ABSTRACT_FETCH)}),
156                             Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE);
157                 }
158                 dialog.setVisible(true);
159             }
160
161             int maxHits = getNumberOfHits(page, "A maximum of", maxHitsPattern);
162             //String page = getResultsFromFile(new File("/home/alver/div/temp50.txt"));
163
164             //List entries = new ArrayList();
165             //System.out.println("Number of hits: "+hits);
166             //System.out.println("Maximum returned: "+maxHits);
167             if (hits > maxHits)
168                 hits = maxHits;
169             //parse(dialog, page, 0, 51);
170             //dialog.setProgress(perPage/2, hits);
171             parse(dialog, page, 0, 1);
172             int firstEntry = perPage;
173             while (shouldContinue && (firstEntry < hits)) {
174                 //System.out.println("Fetching from: "+firstEntry);
175                 address = makeUrl(firstEntry);
176                 //System.out.println(address);
177                 page = getResults(new URL(address));
178                 //dialog.setProgress(firstEntry+perPage/2, hits);
179                 if (!shouldContinue)
180                     break;
181
182                 parse(dialog, page, 0, 1+firstEntry);
183                 firstEntry += perPage;
184
185             }
186             dialog.entryListComplete();
187         } catch (MalformedURLException e) {
188             e.printStackTrace();
189         } catch (IOException e) {
190             e.printStackTrace();
191         }
192
193
194     }
195
196     private String makeUrl(int startIndex) {
197         StringBuffer sb = new StringBuffer(startUrl).append(searchUrlPart);
198         sb.append(terms.replaceAll(" ", "+"));
199         sb.append(endUrl);
200         sb.append(String.valueOf(startIndex));
201         return sb.toString();
202     }
203
204     int piv = 0;
205
206     private void parse(ImportInspectionDialog dialog, String text, int startIndex, int firstEntryNumber) {
207         piv = startIndex;
208         int entryNumber = firstEntryNumber;
209         List entries = new ArrayList();
210         BibtexEntry entry;
211         while (((entry = parseNextEntry(text, piv, entryNumber)) != null)
212             && (shouldContinue)) {
213             if (entry.getField("title") != null) {
214                 entries.add(entry);
215                 dialog.addEntries(entries);
216                 dialog.setProgress(parsed+unparseable, hits);
217                 entries.clear();
218                 parsed++;
219             }
220             entryNumber++;
221             //break;
222         }
223
224
225     }
226
227     private BibtexEntry parseNextEntry(String allText, int startIndex, int entryNumber) {
228         String toFind = new StringBuffer().append("<div align=\"left\"><strong>")
229                 .append(entryNumber).append(".</strong></div>").toString();
230         int index = allText.indexOf(toFind, startIndex);
231         int endIndex = allText.indexOf("</table>", index+1);
232         if (endIndex < 0)
233             endIndex = allText.length();
234
235         if (index >= 0) {
236             piv = index+1;
237             String text = allText.substring(index, endIndex);
238             BibtexEntryType type;
239             String sourceField;
240             if (text.indexOf("IEEE JNL") >= 0) {
241                 type = BibtexEntryType.getType("article");
242                 sourceField = "journal";
243             } else {
244                 type = BibtexEntryType.getType("inproceedings");
245                 sourceField = "booktitle";
246             }
247
248             index = 0;
249             BibtexEntry entry = new BibtexEntry(Util.createNeutralId(), type);
250             //System.out.println(text);
251             Matcher m1 = entryPattern1.matcher(text);
252             Matcher m2 = entryPattern2.matcher(text);
253             Matcher m3 = entryPattern3.matcher(text);
254             Matcher m4 = entryPattern4.matcher(text);
255             Matcher m;
256             String tmp;
257             String rest = "";
258             if (m1.find()) {
259                 m = m1;
260                 // Title:
261                 entry.setField("title", convertHTMLChars(m.group(1)));
262                 // Author:
263                 tmp = convertHTMLChars(m.group(2));
264                 if (tmp.charAt(tmp.length()-1) == ';')
265                     tmp= tmp.substring(0, tmp.length()-1);
266                 entry.setField("author", tmp.replaceAll("; ", " and "));
267                 // Publication:
268                 tmp = m.group(4);
269                 entry.setField(sourceField, convertHTMLChars(tmp));
270                 // Volume:
271                 entry.setField("volume", convertHTMLChars(m.group(5)));
272                 // Month:
273                 entry.setField("month", convertHTMLChars(m.group(6)));
274                 // Year
275                 entry.setField("year", m.group(7));
276
277             }
278             else if (m2.find()) {
279                 m = m2;
280                 // Title:
281                 entry.setField("title", convertHTMLChars(m.group(1)));
282                 // Author:
283                 tmp = convertHTMLChars(m.group(2));
284                 if (tmp.charAt(tmp.length()-1) == ';')
285                     tmp= tmp.substring(0, tmp.length()-1);
286                 entry.setField("author", tmp.replaceAll("; ", " and "));
287                 // Publication:
288                 tmp = m.group(4);
289                 entry.setField(sourceField, convertHTMLChars(tmp));
290                 // Volume:
291                 entry.setField("volume", convertHTMLChars(m.group(5)));
292                 // Number:
293                 entry.setField("number", convertHTMLChars(m.group(6)));
294                 // Month:
295                 entry.setField("month", convertHTMLChars(m.group(7)));
296                 // Year:
297                 entry.setField("year", m.group(8));
298
299             }
300             else if (m3.find()) {
301                 m = m3;
302                 // Title:
303                 entry.setField("title", convertHTMLChars(m.group(1)));
304                 // Author:
305                 tmp = convertHTMLChars(m.group(2));
306                 if (tmp.charAt(tmp.length()-1) == ';')
307                     tmp= tmp.substring(0, tmp.length()-1);
308                 entry.setField("author", tmp.replaceAll("; ", " and "));
309                 // Publication:
310                 tmp = m.group(4);
311                 entry.setField(sourceField, convertHTMLChars(tmp));
312                 // Volume:
313                 entry.setField("volume", convertHTMLChars(m.group(5)));
314                 // Number:
315                 entry.setField("number", convertHTMLChars(m.group(6)));
316                 // Month:
317                 entry.setField("month", convertHTMLChars(m.group(8)));
318                 // Year
319                 entry.setField("year", m.group(9));
320
321             }
322             else if (m4.find()) {
323                 m = m4;
324                 // Title:
325                 entry.setField("title", convertHTMLChars(m.group(1)));
326                 // Author:
327                 tmp = convertHTMLChars(m.group(2));
328                 if (tmp.charAt(tmp.length()-1) == ';')
329                     tmp= tmp.substring(0, tmp.length()-1);
330                 entry.setField("author", tmp.replaceAll("; ", " and "));
331                 // Publication:
332                 tmp = m.group(4);
333                 entry.setField(sourceField, convertHTMLChars(tmp));
334                 // Month:
335                 entry.setField("month", convertHTMLChars(m.group(5)));
336                 // Year
337                 entry.setField("year", m.group(6));
338
339             } else {
340                 System.err.println("---no structure match---");
341                 System.err.println(text);
342                 unparseable++;
343             }
344             int pgInd = text.indexOf("Page(s):");
345             if (pgInd >= 0) {
346                 // Try to set pages:
347                 rest = text.substring(pgInd+8);
348                 pgInd = rest.indexOf("<br>");
349                 if (pgInd >= 0) {
350                     tmp = rest.substring(0, pgInd);
351                     entry.setField("pages", tmp.replaceAll("\\s+", "").replaceAll("-","--"));
352                 }
353                 // Try to set doi:
354                 pgInd = rest.indexOf("Digital Object Identifier ", pgInd);
355                 if (pgInd >= 0) {
356                     int fieldEnd = rest.indexOf("<br>", pgInd);
357                     if (fieldEnd >= 0) {
358                         entry.setField("doi", rest.substring(pgInd+26, fieldEnd).trim());
359                     }
360                 }
361             }
362
363             // Fetching abstracts takes time, and causes a lot of requests, so this should
364             // be optional or disabled:
365             if (fetchingAbstracts) {
366                 Matcher abstractLink = abstractLinkPattern.matcher(text);
367                 if (abstractLink.find()) {
368                     StringBuffer sb = new StringBuffer(startUrl).append(abstractLink.group(1));
369                     //System.out.println(sb.toString());
370                     try {
371                         String abstractText = fetchAbstract(sb.toString());
372                         if ((abstractText != null) && (abstractText.length() > 0) &&
373                                 !abstractText.equalsIgnoreCase("not available")) {
374                             entry.setField("abstract", convertHTMLChars(abstractText));
375                         }
376                     } catch (IOException e) {
377                         e.printStackTrace();  //To change body of catch statement use File | Settings | File Templates.
378                     }
379                 }
380             }
381
382             return entry;
383         }
384         return null;
385     }
386
387     /**
388      * This method must convert HTML style char sequences to normal characters.
389      * @param text The text to handle.
390      * @return The converted text.
391      */
392     private String convertHTMLChars(String text) {
393
394         return htmlConverter.format(text);
395     }
396
397
398     /**
399      * Find out how many hits were found.
400      * @param page
401      */
402     private int getNumberOfHits(String page, String marker, Pattern pattern) throws IOException {
403         int ind = page.indexOf(marker);
404         if (ind < 0)
405             throw new IOException(Globals.lang("Could not parse number of hits"));
406         String substring = page.substring(ind, Math.min(ind+42, page.length()));
407         Matcher m = pattern.matcher(substring);
408         m.find();
409         if (m.groupCount() >= 1) {
410             try {
411                 return Integer.parseInt(m.group(1));
412             } catch (NumberFormatException ex) {
413                 throw new IOException(Globals.lang("Could not parse number of hits"));
414             }
415         }
416         throw new IOException(Globals.lang("Could not parse number of hits"));
417     }
418
419     /**
420      * Download the URL and return contents as a String.
421      * @param source
422      * @return
423      * @throws IOException
424      */
425     public String getResults(URL source) throws IOException {
426         InputStream in = source.openStream();
427         StringBuffer sb = new StringBuffer();
428         byte[] buffer = new byte[256];
429         while(true) {
430             int bytesRead = in.read(buffer);
431             if(bytesRead == -1) break;
432             for (int i=0; i<bytesRead; i++)
433                 sb.append((char)buffer[i]);
434         }
435         return sb.toString();
436     }
437
438     /**
439      * Read results from a file instead of an URL. Just for faster debugging.
440      * @param f
441      * @return
442      * @throws IOException
443      */
444     public String getResultsFromFile(File f) throws IOException {
445         InputStream in = new BufferedInputStream(new FileInputStream(f));
446         StringBuffer sb = new StringBuffer();
447         byte[] buffer = new byte[256];
448         while(true) {
449             int bytesRead = in.read(buffer);
450             if(bytesRead == -1) break;
451             for (int i=0; i<bytesRead; i++)
452                 sb.append((char)buffer[i]);
453         }
454         return sb.toString();
455     }
456
457
458     /**
459      * Download and parse the web page containing an entry's Abstract:
460      * @param link
461      * @return
462      * @throws IOException
463      */
464     public String fetchAbstract(String link) throws IOException {
465         URL url = new URL(link);
466         String page = getResults(url);
467         //System.out.println(link);
468
469         //System.out.println("Fetched abstract page.");
470
471         String marker = "Abstract</span><br>";
472         int index = page.indexOf(marker);
473         int endIndex = page.indexOf("</td>", index + 1);
474         if ((index >= 0) && (endIndex > index)) {
475             return new String(page.substring(index + marker.length(), endIndex).trim());
476         }
477
478         return null;
479     }
480
481 }