1 package net.sf.jabref.imports;
3 import net.sf.jabref.net.URLDownload;
4 import net.sf.jabref.*;
5 import net.sf.jabref.gui.ImportInspectionDialog;
9 import java.net.URLConnection;
10 import java.net.URLEncoder;
11 import java.net.MalformedURLException;
12 import java.net.ConnectException;
14 import java.util.regex.Pattern;
15 import java.util.regex.Matcher;
16 import java.util.List;
17 import java.util.ArrayList;
21 * Created by IntelliJ IDEA.
25 * To change this template use File | Settings | File Templates.
// Fetcher that searches IEEE Xplore and hands the parsed entries to an import
// inspection dialog. The class is a Runnable: processQuery() starts a new
// thread on this object, so the fetch itself does not run on the caller's thread.
27 public class IEEEXploreFetcher implements Runnable, EntryFetcher {
// Dialog and frame used for progress and message boxes; both start out null.
29 ImportInspectionDialog dialog = null;
30 JabRefFrame frame = null;
// Converter used by convertHTMLChars().
31 HTMLConverter htmlConverter = new HTMLConverter();
// Pieces of the search URL. makeUrl() concatenates startUrl and searchUrlPart
// with the search terms and the start index. Note that endUrl hard-codes
// ResultCount=25, the same value as perPage below.
33 String startUrl = "http://ieeexplore.ieee.org";
34 String searchUrlPart = "/search/freesearchresult.jsp?queryText=";
35 String endUrl = "+%3Cin%3E+metadata&ResultCount=25&ResultStart=";
// Address that parseEntryRis() posts to in order to download a citation.
36 String risUrl = "http://ieeexplore.ieee.org/xpls/citationAct";
// perPage is the paging step used by the fetch loop; hits is the number parsed
// from the first result page; parsed + unparseable is reported as progress.
37 private int perPage = 25, hits = 0, unparseable = 0, parsed = 0;
// Set to true when a fetch starts and to false by cancelled()/stopFetching();
// the fetch and parse loops check it before continuing.
38 private boolean shouldContinue = false;
// Checkbox shown in the options panel (the field name is spelled "Astracts").
39 private JCheckBox fetchAstracts = new JCheckBox(Globals.lang("Include abstracts"), false);
// Snapshot of the checkbox taken at the start of a fetch; it is forced to
// false when the search returns more than MAX_ABSTRACT_FETCH hits.
40 private boolean fetchingAbstracts = false;
41 private static final int MAX_ABSTRACT_FETCH = 5;
// No-argument constructor.
43 public IEEEXploreFetcher() {
// Regular expressions used to scrape the HTML of the search result pages.
47 //Pattern hitsPattern = Pattern.compile("Your search matched <strong>(\\d+)</strong>");
// Group 1: the number inside <strong> that follows "Your search matched".
48 Pattern hitsPattern = Pattern.compile(".*Your search matched <strong>(\\d+)</strong>.*");
// Group 1: the number inside <strong> that follows "A maximum of".
49 Pattern maxHitsPattern = Pattern.compile(".*A maximum of <strong>(\\d+)</strong>.*");
// Entry with a volume but no issue. Groups: 1 title, 2 author line,
// 3 link target, 4 link text, 5 volume, 6 optional text before the year,
// 7 four-digit year.
50 Pattern entryPattern1 = Pattern.compile(".*<strong>(.+)</strong><br>\\s+(.+)<br>"
51 +"\\s+<A href='(.+)'>(.+)</A><br>\\s+Volume (.+), \\s*"
52 +"(.+)? (\\d\\d\\d\\d)\\s+Page\\(s\\):.*");
// Entry with volume and issue. Groups 1-5 as above, 6 issue number,
// 7 optional text before the year, 8 four-digit year.
54 Pattern entryPattern2 = Pattern.compile(".*<strong>(.+)</strong><br>\\s+(.+)<br>"
55 +"\\s+<A href='(.+)'>(.+)</A><br>\\s+Volume (.+), \\s+Issue (\\d+), \\s*"
56 +"(.+)? (\\d\\d\\d\\d)\\s+Page\\(s\\):.*");
// Entry with volume, issue and part. Groups 1-6 as above, 7 part number,
// 8 optional text before the year, 9 four-digit year.
59 Pattern entryPattern3 = Pattern.compile(".*<strong>(.+)</strong><br>\\s+(.+)<br>"
60 +"\\s+<A href='(.+)'>(.+)</A><br>\\s+Volume (.+), \\s+Issue (\\d+), " +
61 "\\s+Part (\\d+), \\s*" //"[\\s-\\d]+"
62 +"(.+)? (\\d\\d\\d\\d)\\s+Page\\(s\\):.*");
// Entry without a "Volume" part. Groups 1-4 as above, 5 optional text before
// the year, 6 four-digit year.
64 Pattern entryPattern4 = Pattern.compile(".*<strong>(.+)</strong><br>\\s+(.+)<br>"
65 +"\\s+<A href='(.+)'>(.+)</A><br>\\s*" //[\\s-\\da-z]+"
66 +"(.+)? (\\d\\d\\d\\d)\\s+Page\\(s\\):.*");
// Group 1: target of the link labelled "Abstract".
68 Pattern abstractLinkPattern = Pattern.compile(
69 "<a href=\"(.+)\" class=\"bodyCopySpaced\">Abstract</a>");
// Group 1: the digits following "arnumber=" inside a link target; this is the
// article number passed to parseEntryRis().
71 Pattern ieeeArticleNumberPattern =
72 Pattern.compile("<a href=\".*arnumber=(\\d+).*\">");
// Builds the options panel: a JPanel with a BorderLayout that holds the
// "Include abstracts" checkbox in its center.
74 public JPanel getOptionsPanel() {
75 JPanel pan = new JPanel();
76 pan.setLayout(new BorderLayout());
77 pan.add(fetchAstracts, BorderLayout.CENTER);
// Entry point for a search: receives the query string, the dialog and the frame.
81 public void processQuery(String query, ImportInspectionDialog dialog, JabRefFrame frame) {
// Hand the fetch over to a new thread that runs this object.
86 (new Thread(this)).start();
// Returns the title for this fetcher, obtained via Globals.menuTitle().
91 public String getTitle() {
92 return Globals.menuTitle("Search IEEEXplore");
// Returns the URL of the icon registered under the name "www".
96 public URL getIcon() {
97 return GUIGlobals.getIconUrl("www");
// Returns the file name of the help page for this fetcher.
100 public String getHelpPage() {
101 return "IEEEXploreHelp.html";
// Returns the key name of this fetcher.
// NOTE(review): the string is spelled "IEEXplore" (two E's after the I), unlike
// "IEEEXplore" used for the title. It may be stored or looked up elsewhere, so
// confirm before changing it.
104 public String getKeyName() {
105 return "Search IEEXplore";
108 // This method is called by the dialog when the user has cancelled the import.
// Clearing the flag makes the fetch and parse loops stop at their next check.
109 public void cancelled() {
110 shouldContinue = false;
113 // This method is called by the dialog when the user has selected the
114 // wanted entries, and clicked Ok. The callback object can update status
// entriesImported is not used by any statement visible here; the body shows
// only commented-out debug output.
116 public void done(int entriesImported) {
117 //System.out.println("Number of entries parsed: "+parsed);
118 //System.out.println("Parsing failed for "+unparseable+" entries");
121 // This method is called by the dialog when the user has cancelled or
122 // signalled a stop. It is expected that any long-running fetch operations
123 // will stop after this method is called.
// Same effect as cancelled(): the loops stop at their next shouldContinue check.
124 public void stopFetching() {
125 shouldContinue = false;
129 * The code that runs the actual search and fetch operation.
// Body of the fetch: download the first result page, read the hit count,
// then fetch and parse the remaining pages until all hits are covered or the
// user stops the operation.
// Allow the loops below to run until cancelled()/stopFetching() clears the flag.
133 shouldContinue = true;
// First page starts at result index 0.
136 String address = makeUrl(0);
138 URL url = new URL(address);
139 // Fetch the search page and put the contents in a String:
140 //String page = getResultsFromFile(new File("/home/alver/div/temp.txt"));
141 //URLDownload ud = new URLDownload(new JPanel(), url, new File("/home/alver/div/temp.txt"));
144 //dialog.setVisible(true);
145 String page = getResults(url);
// Total number of hits as reported on the page; throws IOException if the
// number cannot be parsed.
146 hits = getNumberOfHits(page, "Your search matched", hitsPattern);
// Message shown when the search gave no entries (the guarding condition is on
// a line not shown in this view).
151 JOptionPane.showMessageDialog(frame, Globals.lang("No entries found for the search string '%0'",
153 Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE);
// Abstracts are only downloaded for small result sets; otherwise the option is
// switched off for this fetch and the user is told why.
156 fetchingAbstracts = fetchAstracts.isSelected();
157 if (fetchingAbstracts && (hits > MAX_ABSTRACT_FETCH)) {
158 fetchingAbstracts = false;
159 JOptionPane.showMessageDialog(frame,
160 Globals.lang("%0 entries found. To reduce server load, abstracts "
161 +"will only be downloaded for searches returning %1 hits or less.",
162 new String[] {String.valueOf(hits), String.valueOf(MAX_ABSTRACT_FETCH)}),
163 Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE);
165 dialog.setVisible(true);
// Maximum number of results the server reports it will return.
168 int maxHits = getNumberOfHits(page, "A maximum of", maxHitsPattern);
169 //String page = getResultsFromFile(new File("/home/alver/div/temp50.txt"));
171 //List entries = new ArrayList();
172 //System.out.println("Number of hits: "+hits);
173 //System.out.println("Maximum returned: "+maxHits);
176 //parse(dialog, page, 0, 51);
177 //dialog.setProgress(perPage/2, hits);
// Entries on the first page are numbered from 1.
178 parse(dialog, page, 0, 1);
// Remaining pages: advance by perPage until all hits are covered or the user
// stops the fetch. Entry numbers on a page start at 1 + its start index.
179 int firstEntry = perPage;
180 while (shouldContinue && (firstEntry < hits)) {
181 //System.out.println("Fetching from: "+firstEntry);
182 address = makeUrl(firstEntry);
183 //System.out.println(address);
184 page = getResults(new URL(address));
185 //dialog.setProgress(firstEntry+perPage/2, hits);
189 parse(dialog, page, 0, 1+firstEntry);
190 firstEntry += perPage;
// Tell the dialog that no more entries will arrive.
193 dialog.entryListComplete();
194 } catch (MalformedURLException e) {
196 } catch (ConnectException e) {
// The dialog title uses the same "Search IEEEXplore" key as the other
// message boxes in this method.
197 JOptionPane.showMessageDialog(frame, Globals.lang("Connection to IEEEXplore failed"),
198 Globals.lang("Search IEEEXplore"), JOptionPane.ERROR_MESSAGE);
199 } catch (IOException e) {
202 frame.unblock(); // We call this to ensure no lockup.
// Builds the search URL for the result page that starts at startIndex:
// startUrl + searchUrlPart + the search terms, ending with the start index.
// NOTE(review): in the lines visible here only spaces in the terms are escaped
// (replaced by "+"); other reserved characters would go into the URL as they
// are. java.net.URLEncoder is imported by this file - confirm whether it should
// be used here.
208 private String makeUrl(int startIndex) {
209 StringBuffer sb = new StringBuffer(startUrl).append(searchUrlPart);
210 sb.append(terms.replaceAll(" ", "+"));
212 sb.append(String.valueOf(startIndex));
213 return sb.toString();
// Parses one result page: extracts entries one at a time with
// parseNextEntry(), starting at entry number firstEntryNumber, and reports
// them to the dialog together with the current progress.
218 private void parse(ImportInspectionDialog dialog, String text, int startIndex, int firstEntryNumber) {
220 int entryNumber = firstEntryNumber;
221 List entries = new ArrayList();
// Stop when no further entry is found or when the user has stopped the fetch.
// (piv and entry are declared on lines not shown in this view.)
223 while (((entry = parseNextEntry(text, piv, entryNumber)) != null)
224 && (shouldContinue)) {
// The statements below are guarded by the entry having a title.
225 if (entry.getField("title") != null) {
227 dialog.addEntries(entries);
// Progress counts both successfully parsed and unparseable entries.
228 dialog.setProgress(parsed+unparseable, hits);
// Downloads the RIS citation for the article with the given number from risUrl
// and returns the first entry the RIS importer produces from the response.
239 private BibtexEntry parseEntryRis(String number)
245 url = new URL(risUrl);
246 conn = url.openConnection();
247 } catch (MalformedURLException e) {
// The request is sent as a form-encoded body, so the connection is opened for
// both output (request) and input (response).
251 conn.setDoInput(true);
252 conn.setDoOutput(true);
253 conn.setRequestProperty("Content-Type",
254 "application/x-www-form-urlencoded");
// NOTE(review): the request body below spells the parameter "fileFormate";
// presumably that is what the server expects - verify before "correcting" it.
255 PrintWriter out = new PrintWriter(
256 conn.getOutputStream());
258 "fileFormate=ris&arnumber="+
260 "<arnumber>"+number+"</arnumber>",
// Parse the response as RIS and use the first entry, if there is one.
264 InputStream inp = conn.getInputStream();
265 List items = new RisImporter().importEntries(inp);
267 if (items.size() > 0)
268 return (BibtexEntry)items.get(0);
// Extracts the entry with the given number from the result page text, searching
// from startIndex. When abstracts are being fetched the entry is downloaded as
// RIS; otherwise it is built by matching the entry's HTML against
// entryPattern1..4.
273 private BibtexEntry parseNextEntry(String allText, int startIndex, int entryNumber)
275 BibtexEntry entry = null;
// Each entry starts with its number rendered as
// <div align="left"><strong>N.</strong></div>
276 String toFind = new StringBuffer().append("<div align=\"left\"><strong>")
277 .append(entryNumber).append(".</strong></div>").toString();
278 int index = allText.indexOf(toFind, startIndex);
// The entry's text runs up to the next "</table>"; the assignment on the line
// after this one falls back to the end of the page.
279 int endIndex = allText.indexOf("</table>", index+1);
281 endIndex = allText.length();
285 String text = allText.substring(index, endIndex);
286 // Fetching abstracts takes time, and causes a lot
287 // of requests, so this should be optional or disabled:
288 if (fetchingAbstracts) {
// Find the article number in the entry's HTML and download its RIS record.
290 ieeeArticleNumberPattern.matcher(text);
293 entry = parseEntryRis(number.group(1));
294 } catch (IOException e) {
295 e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
299 if (entry != null) { // fetch successful
300 // we just need to add DOI, it is not included in RIS.
301 int pgInd = text.indexOf("Digital Object Identifier ");
303 int fieldEnd = text.indexOf("<br>", pgInd);
// 26 is the length of "Digital Object Identifier ".
305 entry.setField("doi",
306 text.substring(pgInd+26, fieldEnd).trim());
// Pick the entry type and the field that holds the publication name: text
// containing "IEEE JNL" is treated as a journal article, the other branch as a
// conference paper.
311 BibtexEntryType type;
313 if (text.indexOf("IEEE JNL") >= 0) {
314 type = BibtexEntryType.getType("article");
315 sourceField = "journal";
317 type = BibtexEntryType.getType("inproceedings");
318 sourceField = "booktitle";
322 entry = new BibtexEntry(Util.createNeutralId(), type);
323 //System.out.println(text);
// One matcher per known entry layout; the branches below differ only in which
// capture groups hold volume, number, month and year.
324 Matcher m1 = entryPattern1.matcher(text);
325 Matcher m2 = entryPattern2.matcher(text);
326 Matcher m3 = entryPattern3.matcher(text);
327 Matcher m4 = entryPattern4.matcher(text);
// Layout 1 (volume, no issue): volume = group 5, month = group 6, year = group 7.
// (m is assigned on a line not shown in this view.)
334 entry.setField("title", convertHTMLChars(m.group(1)));
// Authors: drop a trailing ';' and turn "; " separators into " and ".
336 tmp = convertHTMLChars(m.group(2));
337 if (tmp.charAt(tmp.length()-1) == ';')
338 tmp= tmp.substring(0, tmp.length()-1);
339 entry.setField("author", tmp.replaceAll("; ", " and "));
342 entry.setField(sourceField, convertHTMLChars(tmp));
344 entry.setField("volume", convertHTMLChars(m.group(5)));
346 entry.setField("month", convertHTMLChars(m.group(6)));
348 entry.setField("year", m.group(7));
// Layout 2 (volume and issue): number = group 6, month = group 7, year = group 8.
351 else if (m2.find()) {
354 entry.setField("title", convertHTMLChars(m.group(1)));
356 tmp = convertHTMLChars(m.group(2));
357 if (tmp.charAt(tmp.length()-1) == ';')
358 tmp= tmp.substring(0, tmp.length()-1);
359 entry.setField("author", tmp.replaceAll("; ", " and "));
362 entry.setField(sourceField, convertHTMLChars(tmp));
364 entry.setField("volume", convertHTMLChars(m.group(5)));
366 entry.setField("number", convertHTMLChars(m.group(6)));
368 entry.setField("month", convertHTMLChars(m.group(7)));
370 entry.setField("year", m.group(8));
// Layout 3 (volume, issue and part): group 7 is the part number and is not
// stored in the lines visible here; month = group 8, year = group 9.
373 else if (m3.find()) {
376 entry.setField("title", convertHTMLChars(m.group(1)));
378 tmp = convertHTMLChars(m.group(2));
379 if (tmp.charAt(tmp.length()-1) == ';')
380 tmp= tmp.substring(0, tmp.length()-1);
381 entry.setField("author", tmp.replaceAll("; ", " and "));
384 entry.setField(sourceField, convertHTMLChars(tmp));
386 entry.setField("volume", convertHTMLChars(m.group(5)));
388 entry.setField("number", convertHTMLChars(m.group(6)));
390 entry.setField("month", convertHTMLChars(m.group(8)));
392 entry.setField("year", m.group(9));
// Layout 4 (no volume): month = group 5, year = group 6.
395 else if (m4.find()) {
398 entry.setField("title", convertHTMLChars(m.group(1)));
400 tmp = convertHTMLChars(m.group(2));
401 if (tmp.charAt(tmp.length()-1) == ';')
402 tmp= tmp.substring(0, tmp.length()-1);
403 entry.setField("author", tmp.replaceAll("; ", " and "));
406 entry.setField(sourceField, convertHTMLChars(tmp));
408 entry.setField("month", convertHTMLChars(m.group(5)));
410 entry.setField("year", m.group(6));
// Dump of the raw text to stderr for entries that match none of the layouts.
413 System.err.println("---no structure match---");
414 System.err.println(text);
// Pages: the text between "Page(s):" (8 characters) and the next "<br>", with
// whitespace removed and "-" doubled to "--".
417 int pgInd = text.indexOf("Page(s):");
420 rest = text.substring(pgInd+8);
421 pgInd = rest.indexOf("<br>");
423 tmp = rest.substring(0, pgInd);
424 entry.setField("pages", tmp.replaceAll("\\s+", "").replaceAll("-","--"));
// DOI: the text between "Digital Object Identifier " (26 characters) and the
// next "<br>", searched for after the pages field.
427 pgInd = rest.indexOf("Digital Object Identifier ", pgInd);
429 int fieldEnd = rest.indexOf("<br>", pgInd);
431 entry.setField("doi", rest.substring(pgInd+26, fieldEnd).trim());
443 * This method must convert HTML style char sequences to normal characters.
444 * @param text The text to handle.
445 * @return The converted text.
// Delegates to the shared HTMLConverter instance.
447 private String convertHTMLChars(String text) {
449 return htmlConverter.format(text);
454 * Find out how many hits were found.
// Locates marker in the page, applies pattern to a short window of text that
// starts at the marker, and returns capture group 1 as an int. Every failure
// path visible here throws an IOException with the same message.
457 private int getNumberOfHits(String page, String marker, Pattern pattern) throws IOException {
458 int ind = page.indexOf(marker);
460 throw new IOException(Globals.lang("Could not parse number of hits"));
// Only up to 42 characters from the start of the marker are examined.
// NOTE(review): with "Your search matched <strong>" (28 chars) and "</strong>"
// (9 chars) that leaves room for at most 5 digits - confirm this is enough.
461 String substring = page.substring(ind, Math.min(ind+42, page.length()));
462 Matcher m = pattern.matcher(substring);
465 if (m.groupCount() >= 1) {
467 return Integer.parseInt(m.group(1));
468 } catch (NumberFormatException ex) {
469 throw new IOException(Globals.lang("Could not parse number of hits"));
472 throw new IOException(Globals.lang("Could not parse number of hits"));
476 * Download the URL and return contents as a String.
479 * @throws IOException
// Reads the content behind the URL in 256-byte chunks until end of stream and
// returns it as a String.
// NOTE(review): each byte is cast directly to char, so no charset decoding
// takes place; bytes >= 0x80 are sign-extended and end up as chars in the
// 0xFF80-0xFFFF range. This is only correct for pure ASCII content.
// NOTE(review): no close() call on the stream is visible in this view - confirm
// that the stream is released.
481 public String getResults(URL source) throws IOException {
483 InputStream in = source.openStream();
484 StringBuffer sb = new StringBuffer();
485 byte[] buffer = new byte[256];
// read() returns -1 at end of stream.
487 int bytesRead = in.read(buffer);
488 if(bytesRead == -1) break;
489 for (int i=0; i<bytesRead; i++)
490 sb.append((char)buffer[i]);
492 return sb.toString();
496 * Read results from a file instead of an URL. Just for faster debugging.
499 * @throws IOException
// File-based counterpart of getResults(): reads the file in 256-byte chunks
// through a BufferedInputStream and returns the content as a String.
// NOTE(review): bytes are cast directly to char (no charset decoding; bytes
// >= 0x80 become chars in the 0xFF80-0xFFFF range), and no close() call on the
// stream is visible in this view.
501 public String getResultsFromFile(File f) throws IOException {
502 InputStream in = new BufferedInputStream(new FileInputStream(f));
503 StringBuffer sb = new StringBuffer();
504 byte[] buffer = new byte[256];
// read() returns -1 at end of stream.
506 int bytesRead = in.read(buffer);
507 if(bytesRead == -1) break;
508 for (int i=0; i<bytesRead; i++)
509 sb.append((char)buffer[i]);
511 return sb.toString();
516 * Download and parse the web page containing an entry's Abstract:
519 * @throws IOException
// Downloads the page at the given link and cuts out the abstract: the trimmed
// text between the marker "Abstract</span><br>" and the next "</td>".
521 public String fetchAbstract(String link) throws IOException {
522 URL url = new URL(link);
523 String page = getResults(url);
524 //System.out.println(link);
526 //System.out.println("Fetched abstract page.");
528 String marker = "Abstract</span><br>";
529 int index = page.indexOf(marker);
530 int endIndex = page.indexOf("</td>", index + 1);
// Only extract when the marker was found and the closing tag lies after it.
531 if ((index >= 0) && (endIndex > index)) {
532 return new String(page.substring(index + marker.length(), endIndex).trim());