a3b89b940b913e3d71c13418a7f0cbfa6b6401bf
[debian/jabref.git] / src / java / net / sf / jabref / imports / IEEEXploreFetcher.java
1 package net.sf.jabref.imports;
2
3 import net.sf.jabref.net.URLDownload;
4 import net.sf.jabref.*;
5 import net.sf.jabref.gui.ImportInspectionDialog;
6
7 import javax.swing.*;
8 import java.net.URL;
9 import java.net.URLConnection;
10 import java.net.URLEncoder;
11 import java.net.MalformedURLException;
12 import java.net.ConnectException;
13 import java.io.*;
14 import java.util.regex.Pattern;
15 import java.util.regex.Matcher;
16 import java.util.List;
17 import java.util.ArrayList;
18 import java.awt.*;
19
/**
 * Entry fetcher that searches IEEE Xplore and imports the hits.
 * The search result pages are downloaded and parsed on a separate thread, and
 * the entries found are handed to an ImportInspectionDialog.
 *
 * Created by alver, Mar 25, 2006, 1:09:32 PM.
 */
27 public class IEEEXploreFetcher implements Runnable, EntryFetcher {
28
29     ImportInspectionDialog dialog = null;
30     JabRefFrame frame = null;
31     HTMLConverter htmlConverter = new HTMLConverter();
32     private String terms;
33     String startUrl = "http://ieeexplore.ieee.org";
34     String searchUrlPart = "/search/freesearchresult.jsp?queryText=";
35     String endUrl = "+%3Cin%3E+metadata&ResultCount=25&ResultStart=";
36     String risUrl = "http://ieeexplore.ieee.org/xpls/citationAct";
37     private int perPage = 25, hits = 0, unparseable = 0, parsed = 0;
38     private boolean shouldContinue = false;
39     private JCheckBox fetchAstracts = new JCheckBox(Globals.lang("Include abstracts"), false);
40     private boolean fetchingAbstracts = false;
41     private static final int MAX_ABSTRACT_FETCH = 5;
42
43     public IEEEXploreFetcher() {
44     }
45
46
47     //Pattern hitsPattern = Pattern.compile("Your search matched <strong>(\\d+)</strong>");
48     Pattern hitsPattern = Pattern.compile(".*Your search matched <strong>(\\d+)</strong>.*");
49     Pattern maxHitsPattern = Pattern.compile(".*A maximum of <strong>(\\d+)</strong>.*");
50     Pattern entryPattern1 = Pattern.compile(".*<strong>(.+)</strong><br>\\s+(.+)<br>"
51                 +"\\s+<A href='(.+)'>(.+)</A><br>\\s+Volume (.+),&nbsp;\\s*"
52                 +"(.+)? (\\d\\d\\d\\d)\\s+Page\\(s\\):.*");
53
54     Pattern entryPattern2 = Pattern.compile(".*<strong>(.+)</strong><br>\\s+(.+)<br>"
55                     +"\\s+<A href='(.+)'>(.+)</A><br>\\s+Volume (.+),&nbsp;\\s+Issue (\\d+),&nbsp;\\s*"
56                     +"(.+)? (\\d\\d\\d\\d)\\s+Page\\(s\\):.*");
57
58
59     Pattern entryPattern3 = Pattern.compile(".*<strong>(.+)</strong><br>\\s+(.+)<br>"
60                     +"\\s+<A href='(.+)'>(.+)</A><br>\\s+Volume (.+),&nbsp;\\s+Issue (\\d+),&nbsp;" +
61                     "\\s+Part (\\d+),&nbsp;\\s*" //"[\\s-\\d]+"
62                     +"(.+)? (\\d\\d\\d\\d)\\s+Page\\(s\\):.*");
63
64     Pattern entryPattern4 = Pattern.compile(".*<strong>(.+)</strong><br>\\s+(.+)<br>"
65                     +"\\s+<A href='(.+)'>(.+)</A><br>\\s*" //[\\s-\\da-z]+"
66                     +"(.+)? (\\d\\d\\d\\d)\\s+Page\\(s\\):.*");
67
68     Pattern abstractLinkPattern = Pattern.compile(
69             "<a href=\"(.+)\" class=\"bodyCopySpaced\">Abstract</a>");
70
71     Pattern ieeeArticleNumberPattern =
72         Pattern.compile("<a href=\".*arnumber=(\\d+).*\">");
73
74     public JPanel getOptionsPanel() {
75         JPanel pan = new JPanel();
76         pan.setLayout(new BorderLayout());
77         pan.add(fetchAstracts, BorderLayout.CENTER);
78         return pan;
79     }
80
81     public void processQuery(String query, ImportInspectionDialog dialog, JabRefFrame frame) {
82         this.dialog = dialog;
83         this.frame =frame;
84         this.terms = query;
85         piv = 0;
86         (new Thread(this)).start();
87     }
88
89
90
91     public String getTitle() {
92         return Globals.menuTitle("Search IEEEXplore");
93     }
94
95
96     public URL getIcon() {
97         return GUIGlobals.getIconUrl("www");
98     }
99
100     public String getHelpPage() {
101         return "IEEEXploreHelp.html";
102     }
103
104     public String getKeyName() {
105         return "Search IEEXplore";
106     }
107
108     // This method is called by the dialog when the user has cancelled the import.
109     public void cancelled() {
110         shouldContinue = false;
111     }
112
113     // This method is called by the dialog when the user has selected the
114 // wanted entries, and clicked Ok. The callback object can update status
115 // line etc.
116     public void done(int entriesImported) {
117         //System.out.println("Number of entries parsed: "+parsed);
118         //System.out.println("Parsing failed for "+unparseable+" entries");
119     }
120
121     // This method is called by the dialog when the user has cancelled or
122 // signalled a stop. It is expected that any long-running fetch operations
123 // will stop after this method is called.
124     public void stopFetching() {
125         shouldContinue = false;
126     }
127
128     /**
129      * The code that runs the actual search and fetch operation.
130      */
131     public void run() {
132         frame.block();
133         shouldContinue = true;
134         parsed = 0;
135         unparseable = 0;
136         String address = makeUrl(0);
137         try {
138             URL url = new URL(address);
139             // Fetch the search page and put the contents in a String:
140             //String page = getResultsFromFile(new File("/home/alver/div/temp.txt"));
141             //URLDownload ud = new URLDownload(new JPanel(), url, new File("/home/alver/div/temp.txt"));
142             //ud.download();
143
144             //dialog.setVisible(true);
145             String page = getResults(url);
146             hits = getNumberOfHits(page, "Your search matched", hitsPattern);
147
148             frame.unblock();
149             if (hits == 0) {
150                 dialog.dispose();
151                 JOptionPane.showMessageDialog(frame, Globals.lang("No entries found for the search string '%0'",
152                         terms),
153                         Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE);
154                 return;
155             } else {
156                 fetchingAbstracts = fetchAstracts.isSelected();
157                 if (fetchingAbstracts && (hits > MAX_ABSTRACT_FETCH)) {
158                     fetchingAbstracts = false;
159                     JOptionPane.showMessageDialog(frame,
160                             Globals.lang("%0 entries found. To reduce server load, abstracts "
161                             +"will only be downloaded for searches returning %1 hits or less.",
162                                     new String[] {String.valueOf(hits), String.valueOf(MAX_ABSTRACT_FETCH)}),
163                             Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE);
164                 }
165                 dialog.setVisible(true);
166             }
167
168             int maxHits = getNumberOfHits(page, "A maximum of", maxHitsPattern);
169             //String page = getResultsFromFile(new File("/home/alver/div/temp50.txt"));
170
171             //List entries = new ArrayList();
172             //System.out.println("Number of hits: "+hits);
173             //System.out.println("Maximum returned: "+maxHits);
174             if (hits > maxHits)
175                 hits = maxHits;
176             //parse(dialog, page, 0, 51);
177             //dialog.setProgress(perPage/2, hits);
178             parse(dialog, page, 0, 1);
179             int firstEntry = perPage;
180             while (shouldContinue && (firstEntry < hits)) {
181                 //System.out.println("Fetching from: "+firstEntry);
182                 address = makeUrl(firstEntry);
183                 //System.out.println(address);
184                 page = getResults(new URL(address));
185                 //dialog.setProgress(firstEntry+perPage/2, hits);
186                 if (!shouldContinue)
187                     break;
188
189                 parse(dialog, page, 0, 1+firstEntry);
190                 firstEntry += perPage;
191
192             }
193             dialog.entryListComplete();
194         } catch (MalformedURLException e) {
195             e.printStackTrace();
196         } catch (ConnectException e) {
197             JOptionPane.showMessageDialog(frame, Globals.lang("Connection to IEEEXplore failed"),
198                     Globals.lang("Search IEEExplore"), JOptionPane.ERROR_MESSAGE);
199         } catch (IOException e) {
200             e.printStackTrace();
201         } finally {
202             frame.unblock(); // We call this to ensure no lockup.
203         }
204
205
206     }
207
208     private String makeUrl(int startIndex) {
209         StringBuffer sb = new StringBuffer(startUrl).append(searchUrlPart);
210         sb.append(terms.replaceAll(" ", "+"));
211         sb.append(endUrl);
212         sb.append(String.valueOf(startIndex));
213         return sb.toString();
214     }
215
216     int piv = 0;
217
218     private void parse(ImportInspectionDialog dialog, String text, int startIndex, int firstEntryNumber) {
219         piv = startIndex;
220         int entryNumber = firstEntryNumber;
221         List entries = new ArrayList();
222         BibtexEntry entry;
223         while (((entry = parseNextEntry(text, piv, entryNumber)) != null)
224             && (shouldContinue)) {
225             if (entry.getField("title") != null) {
226                 entries.add(entry);
227                 dialog.addEntries(entries);
228                 dialog.setProgress(parsed+unparseable, hits);
229                 entries.clear();
230                 parsed++;
231             }
232             entryNumber++;
233             //break;
234         }
235
236
237     }
238
239     private BibtexEntry parseEntryRis(String number)
240         throws IOException
241     {
242         URL url;
243         URLConnection conn;
244         try {
245             url = new URL(risUrl);
246             conn = url.openConnection();
247         } catch (MalformedURLException e) {
248             e.printStackTrace();
249             return null;
250         }
251         conn.setDoInput(true);
252         conn.setDoOutput(true);
253         conn.setRequestProperty("Content-Type",
254                 "application/x-www-form-urlencoded");
255         PrintWriter out = new PrintWriter(
256                 conn.getOutputStream());
257         out.write(
258                 "fileFormate=ris&arnumber="+
259                 URLEncoder.encode(
260                     "<arnumber>"+number+"</arnumber>",
261                     "UTF-8"));
262         out.flush();
263         out.close();
264         InputStream inp = conn.getInputStream();
265         List items = new RisImporter().importEntries(inp);
266         inp.close();
267         if (items.size() > 0)
268             return (BibtexEntry)items.get(0);
269         else
270             return null;
271     }
272
273     private BibtexEntry parseNextEntry(String allText, int startIndex, int entryNumber)
274     {
275         BibtexEntry entry = null;
276         String toFind = new StringBuffer().append("<div align=\"left\"><strong>")
277                 .append(entryNumber).append(".</strong></div>").toString();
278         int index = allText.indexOf(toFind, startIndex);
279         int endIndex = allText.indexOf("</table>", index+1);
280         if (endIndex < 0)
281             endIndex = allText.length();
282
283         if (index >= 0) {
284             piv = index+1;
285             String text = allText.substring(index, endIndex);
286             // Fetching abstracts takes time, and causes a lot
287             // of requests, so this should be optional or disabled:
288             if (fetchingAbstracts) {
289                 Matcher number =
290                     ieeeArticleNumberPattern.matcher(text);
291                 if (number.find()) {
292                     try {
293                         entry = parseEntryRis(number.group(1));
294                     } catch (IOException e) {
295                         e.printStackTrace();  //To change body of catch statement use File | Settings | File Templates.
296                     }
297                 }
298             }
299             if (entry != null) { // fetch successful
300                 // we just need to add DOI, it is not included in RIS.
301                 int pgInd = text.indexOf("Digital Object Identifier ");
302                 if (pgInd >= 0) {
303                     int fieldEnd = text.indexOf("<br>", pgInd);
304                     if (fieldEnd >= 0) {
305                         entry.setField("doi",
306                             text.substring(pgInd+26, fieldEnd).trim());
307                     }
308                 }
309                 return entry;
310             }
311             BibtexEntryType type;
312             String sourceField;
313             if (text.indexOf("IEEE JNL") >= 0) {
314                 type = BibtexEntryType.getType("article");
315                 sourceField = "journal";
316             } else {
317                 type = BibtexEntryType.getType("inproceedings");
318                 sourceField = "booktitle";
319             }
320
321             index = 0;
322             entry = new BibtexEntry(Util.createNeutralId(), type);
323             //System.out.println(text);
324             Matcher m1 = entryPattern1.matcher(text);
325             Matcher m2 = entryPattern2.matcher(text);
326             Matcher m3 = entryPattern3.matcher(text);
327             Matcher m4 = entryPattern4.matcher(text);
328             Matcher m;
329             String tmp;
330             String rest = "";
331             if (m1.find()) {
332                 m = m1;
333                 // Title:
334                 entry.setField("title", convertHTMLChars(m.group(1)));
335                 // Author:
336                 tmp = convertHTMLChars(m.group(2));
337                 if (tmp.charAt(tmp.length()-1) == ';')
338                     tmp= tmp.substring(0, tmp.length()-1);
339                 entry.setField("author", tmp.replaceAll("; ", " and "));
340                 // Publication:
341                 tmp = m.group(4);
342                 entry.setField(sourceField, convertHTMLChars(tmp));
343                 // Volume:
344                 entry.setField("volume", convertHTMLChars(m.group(5)));
345                 // Month:
346                 entry.setField("month", convertHTMLChars(m.group(6)));
347                 // Year
348                 entry.setField("year", m.group(7));
349
350             }
351             else if (m2.find()) {
352                 m = m2;
353                 // Title:
354                 entry.setField("title", convertHTMLChars(m.group(1)));
355                 // Author:
356                 tmp = convertHTMLChars(m.group(2));
357                 if (tmp.charAt(tmp.length()-1) == ';')
358                     tmp= tmp.substring(0, tmp.length()-1);
359                 entry.setField("author", tmp.replaceAll("; ", " and "));
360                 // Publication:
361                 tmp = m.group(4);
362                 entry.setField(sourceField, convertHTMLChars(tmp));
363                 // Volume:
364                 entry.setField("volume", convertHTMLChars(m.group(5)));
365                 // Number:
366                 entry.setField("number", convertHTMLChars(m.group(6)));
367                 // Month:
368                 entry.setField("month", convertHTMLChars(m.group(7)));
369                 // Year:
370                 entry.setField("year", m.group(8));
371
372             }
373             else if (m3.find()) {
374                 m = m3;
375                 // Title:
376                 entry.setField("title", convertHTMLChars(m.group(1)));
377                 // Author:
378                 tmp = convertHTMLChars(m.group(2));
379                 if (tmp.charAt(tmp.length()-1) == ';')
380                     tmp= tmp.substring(0, tmp.length()-1);
381                 entry.setField("author", tmp.replaceAll("; ", " and "));
382                 // Publication:
383                 tmp = m.group(4);
384                 entry.setField(sourceField, convertHTMLChars(tmp));
385                 // Volume:
386                 entry.setField("volume", convertHTMLChars(m.group(5)));
387                 // Number:
388                 entry.setField("number", convertHTMLChars(m.group(6)));
389                 // Month:
390                 entry.setField("month", convertHTMLChars(m.group(8)));
391                 // Year
392                 entry.setField("year", m.group(9));
393
394             }
395             else if (m4.find()) {
396                 m = m4;
397                 // Title:
398                 entry.setField("title", convertHTMLChars(m.group(1)));
399                 // Author:
400                 tmp = convertHTMLChars(m.group(2));
401                 if (tmp.charAt(tmp.length()-1) == ';')
402                     tmp= tmp.substring(0, tmp.length()-1);
403                 entry.setField("author", tmp.replaceAll("; ", " and "));
404                 // Publication:
405                 tmp = m.group(4);
406                 entry.setField(sourceField, convertHTMLChars(tmp));
407                 // Month:
408                 entry.setField("month", convertHTMLChars(m.group(5)));
409                 // Year
410                 entry.setField("year", m.group(6));
411
412             } else {
413                 System.err.println("---no structure match---");
414                 System.err.println(text);
415                 unparseable++;
416             }
417             int pgInd = text.indexOf("Page(s):");
418             if (pgInd >= 0) {
419                 // Try to set pages:
420                 rest = text.substring(pgInd+8);
421                 pgInd = rest.indexOf("<br>");
422                 if (pgInd >= 0) {
423                     tmp = rest.substring(0, pgInd);
424                     entry.setField("pages", tmp.replaceAll("\\s+", "").replaceAll("-","--"));
425                 }
426                 // Try to set doi:
427                 pgInd = rest.indexOf("Digital Object Identifier ", pgInd);
428                 if (pgInd >= 0) {
429                     int fieldEnd = rest.indexOf("<br>", pgInd);
430                     if (fieldEnd >= 0) {
431                         entry.setField("doi", rest.substring(pgInd+26, fieldEnd).trim());
432                     }
433                 }
434             }
435
436
437             return entry;
438         }
439         return null;
440     }
441
442     /**
443      * This method must convert HTML style char sequences to normal characters.
444      * @param text The text to handle.
445      * @return The converted text.
446      */
447     private String convertHTMLChars(String text) {
448
449         return htmlConverter.format(text);
450     }
451
452
453     /**
454      * Find out how many hits were found.
455      * @param page
456      */
457     private int getNumberOfHits(String page, String marker, Pattern pattern) throws IOException {
458         int ind = page.indexOf(marker);
459         if (ind < 0)
460             throw new IOException(Globals.lang("Could not parse number of hits"));
461         String substring = page.substring(ind, Math.min(ind+42, page.length()));
462         Matcher m = pattern.matcher(substring);
463         if (!m.find())
464             return 0;
465         if (m.groupCount() >= 1) {
466             try {
467                 return Integer.parseInt(m.group(1));
468             } catch (NumberFormatException ex) {
469                 throw new IOException(Globals.lang("Could not parse number of hits"));
470             }
471         }
472         throw new IOException(Globals.lang("Could not parse number of hits"));
473     }
474
475     /**
476      * Download the URL and return contents as a String.
477      * @param source
478      * @return
479      * @throws IOException
480      */
481     public String getResults(URL source) throws IOException {
482         
483         InputStream in = source.openStream();
484         StringBuffer sb = new StringBuffer();
485         byte[] buffer = new byte[256];
486         while(true) {
487             int bytesRead = in.read(buffer);
488             if(bytesRead == -1) break;
489             for (int i=0; i<bytesRead; i++)
490                 sb.append((char)buffer[i]);
491         }
492         return sb.toString();
493     }
494
495     /**
496      * Read results from a file instead of an URL. Just for faster debugging.
497      * @param f
498      * @return
499      * @throws IOException
500      */
501     public String getResultsFromFile(File f) throws IOException {
502         InputStream in = new BufferedInputStream(new FileInputStream(f));
503         StringBuffer sb = new StringBuffer();
504         byte[] buffer = new byte[256];
505         while(true) {
506             int bytesRead = in.read(buffer);
507             if(bytesRead == -1) break;
508             for (int i=0; i<bytesRead; i++)
509                 sb.append((char)buffer[i]);
510         }
511         return sb.toString();
512     }
513
514
515     /**
516      * Download and parse the web page containing an entry's Abstract:
517      * @param link
518      * @return
519      * @throws IOException
520      */
521     public String fetchAbstract(String link) throws IOException {
522         URL url = new URL(link);
523         String page = getResults(url);
524         //System.out.println(link);
525
526         //System.out.println("Fetched abstract page.");
527
528         String marker = "Abstract</span><br>";
529         int index = page.indexOf(marker);
530         int endIndex = page.indexOf("</td>", index + 1);
531         if ((index >= 0) && (endIndex > index)) {
532             return new String(page.substring(index + marker.length(), endIndex).trim());
533         }
534
535         return null;
536     }
537
538 }