/* Copyright 2014 Samuel Gesche

   This file is part of Continuous RSS Reader.

   Continuous RSS Reader is free software: you can redistribute it and/or
   modify it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or (at your
   option) any later version.

   Continuous RSS Reader is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
   Public License for more details.

   You should have received a copy of the GNU General Public License along
   with Continuous RSS Reader. If not, see <http://www.gnu.org/licenses/>. */
package continuousrssreader;

import gate.Annotation;
import gate.Corpus;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.GateException;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;

/**
 * Embeds a GATE instance and uses the Toolkit_RSS plugin's RSSParser to turn
 * an RSS feed into a set of HTML-formatted entry strings.
 *
 * <p>Typical usage: {@link #init()}, then {@link #setFeed(URL)}, then
 * {@link #readFeed()} (possibly repeatedly), then {@link #getEntries()}.
 *
 * <p>Not thread-safe: callers must externally synchronise concurrent access.
 */
public class GateEmbedder {

    /** True once {@link #init()} has completed successfully. */
    private boolean initialised;

    /** URL of the RSS feed to read; supplied via {@link #setFeed(URL)}. */
    private URL feed;

    /**
     * HTML-formatted feed entries. A Set deduplicates identical items, since
     * the same feed is often re-read and mostly contains the same content.
     */
    private final Set<String> entries;

    /** The RSSParser processing resource, created during {@link #init()}. */
    private AbstractLanguageAnalyser parser;

    /** Creates an embedder; GATE itself is only initialised by {@link #init()}. */
    public GateEmbedder() {
        initialised = false;
        entries = new HashSet<>();
    }

    /**
     * Initialises the embedded GATE, registers the Toolkit_RSS plugin and
     * creates the RSSParser processing resource.
     *
     * <p>Note: the plugin lives in a sibling 'plugins_dire' directory next to
     * GATE's normal 'plugins' directory, hence the relative path redirection.
     *
     * @throws GateException if GATE or the plugin cannot be initialised
     * @throws MalformedURLException if the plugin directory URL is invalid
     */
    public void init() throws GateException, MalformedURLException {
        // Initialising the embedded GATE itself.
        Gate.init();
        // Loading the Toolkit_RSS plugin into the embedded GATE.
        Gate.getCreoleRegister().registerDirectories(
                new File(Gate.getPluginsHome() + "/../plugins_dire/", "Toolkit_RSS")
                        .toURI().toURL());
        // Creating the RSSParser Processing Resource (a Language Analyser).
        parser = (AbstractLanguageAnalyser) Factory.createResource(
                "fr.cnrs.liris.drim.rsstoolkit.RSSParser");
        initialised = true;
    }

    /**
     * Sets the feed URL to read on the next {@link #readFeed()} call.
     *
     * @param feed the RSS feed URL
     * @throws IllegalStateException if {@link #init()} has not been called
     */
    public void setFeed(URL feed) throws IllegalStateException {
        requireInitialised();
        this.feed = feed;
    }

    /** Discards all entries accumulated so far. */
    public void clearEntries() {
        entries.clear();
    }

    /**
     * Downloads and parses the configured feed, appending its items (as HTML
     * snippets) to the entry set.
     *
     * @throws IllegalStateException if {@link #init()} has not been called
     * @throws IOException if the feed cannot be fetched
     * @throws ResourceInstantiationException if the GATE document fails to build
     * @throws ExecutionException if the RSSParser fails
     */
    public void readFeed()
            throws IOException, ResourceInstantiationException, ExecutionException {
        requireInitialised();

        // A LanguageAnalyser needs both a Corpus and a Document to work on.
        Corpus corpus = Factory.newCorpus("Corpus");

        FeatureMap params = Factory.newFeatureMap();
        params.put("sourceUrl", feed);
        // We assume UTF-8, which may be wrong for some feeds.
        params.put("encoding", "utf-8");
        // Keep the markup in the text so RSSParser can see it (markupAware=true
        // would strip it into annotations, which is not what RSSParser needs).
        params.put("markupAware", false);
        // Irrelevant for our use but still required document parameters.
        params.put("preserveOriginalContent", false);
        params.put("collectRepositioningInfo", false);

        Document rss =
                (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
        try {
            corpus.add(rss);
            parser.setCorpus(corpus);
            parser.setDocument(rss);
            // The tiny central piece of code that needed all that setup.
            parser.execute();
            collectEntries(rss);
        } finally {
            // GATE keeps every Factory-created resource registered until it is
            // explicitly deleted; without this, repeated readFeed() calls leak.
            Factory.deleteResource(rss);
            Factory.deleteResource(corpus);
        }
    }

    /**
     * Walks the parser's annotations on {@code rss} and assembles one HTML
     * entry per RSSItem.
     *
     * <p>The annotations are flat, not structured into a graph: we only know
     * each annotation's start offset. So we record where each item starts,
     * then for each item look up the first title/description/link/date whose
     * start offset falls within that item's [start, nextItemStart) range.
     */
    private void collectEntries(Document rss) {
        List<Long> itemStarts = new ArrayList<>();
        // TreeMaps give us an efficient "first key >= start" lookup below.
        NavigableMap<Long, String> titles = new TreeMap<>();
        NavigableMap<Long, String> descriptions = new TreeMap<>();
        NavigableMap<Long, String> links = new TreeMap<>();
        NavigableMap<Long, String> dates = new TreeMap<>();

        for (Annotation annotation : rss.getAnnotations()) {
            // Start offset = character index of the annotation's beginning.
            long offset = annotation.getStartNode().getOffset();
            FeatureMap features = annotation.getFeatures();
            switch (annotation.getType()) {
                case "RSSItem":
                    itemStarts.add(offset);
                    break;
                case "RSSTitle":
                    // The 'string' feature holds the title text; guard against
                    // a missing feature rather than risk an NPE.
                    titles.put(offset, Objects.toString(features.get("string"), ""));
                    break;
                case "RSSDescription":
                    descriptions.put(offset, Objects.toString(features.get("text"), ""));
                    break;
                case "RSSLink":
                    links.put(offset, Objects.toString(features.get("url"), ""));
                    break;
                case "RSSDate":
                    // The date is split over several numeric features.
                    dates.put(offset, formatDate(features));
                    break;
                default:
                    // Other annotation types are irrelevant here.
                    break;
            }
        }

        Collections.sort(itemStarts);
        for (int i = 0; i < itemStarts.size(); i++) {
            long start = itemStarts.get(i);
            // The item's features lie before the next item's start (the last
            // item extends to the end of the document). NOTE(review): the
            // range is half-open [start, end) so a feature starting exactly at
            // the next item's offset belongs to that next item, not this one.
            long end = (i + 1 < itemStarts.size()) ? itemStarts.get(i + 1) : Long.MAX_VALUE;
            String title = firstInRange(titles, start, end);
            String desc = firstInRange(descriptions, start, end);
            String link = firstInRange(links, start, end);
            String date = firstInRange(dates, start, end);
            // Store the result with a simple HTML layout.
            entries.add("<h2>" + title + "</h2><p>" + desc
                    + "</p><p><a href=\"" + link + "\">" + link
                    + "</a></p><p><em>Published on " + date + "</em></p>");
        }
    }

    /**
     * Returns the first value whose key falls in [start, end), or "" if none.
     */
    private static String firstInRange(
            NavigableMap<Long, String> byOffset, long start, long end) {
        Map.Entry<Long, String> entry = byOffset.ceilingEntry(start);
        return (entry != null && entry.getKey() < end) ? entry.getValue() : "";
    }

    /** Assembles a d/m/y h:m:s string from an RSSDate annotation's features. */
    private static String formatDate(FeatureMap features) {
        return features.get("dayOfMonth") + "/" + features.get("month") + "/"
                + features.get("year") + " " + features.get("hourOfDay") + ":"
                + features.get("minute") + ":" + features.get("second");
    }

    /** Fails fast when {@link #init()} has not yet been called. */
    private void requireInitialised() {
        if (!initialised) {
            throw new IllegalStateException("Gate embedder not initialised");
        }
    }

    /**
     * Returns the accumulated entries as an array, hiding the internal
     * collection. Order is unspecified (backed by a HashSet).
     */
    public String[] getEntries() {
        return entries.toArray(new String[0]);
    }
}