User Tools

Site Tools


code_continuousrssreader
GateEmbedder.java
/*
Copyright 2014 Samuel Gesche
 
This file is part of Continuous RSS Reader.
 
Continuous RSS Reader is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
 
Continuous RSS Reader is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
 
You should have received a copy of the GNU General Public License
along with Continuous RSS Reader.  If not, see <http://www.gnu.org/licenses/>.
*/
package continuousrssreader;
 
import gate.Annotation;
import gate.Corpus;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.GateException;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
 
public class GateEmbedder {
    private boolean initialised;
 
    private URL feed;
    private Set<String> entries;
    private AbstractLanguageAnalyser parser;
 
    public GateEmbedder() {
 
        // GATE still needs to be initialised
        initialised = false;
 
        // The structure in which RSS items will be stored
        // Using a Set to avoid multiple identical items (since we will often
        // reread the same information)
        entries = new HashSet();
    }
 
    public void init() throws GateException, MalformedURLException {
 
        // Initialising the embedded GATE itself
        Gate.init();
 
        // Loading the Toolkit_RSS plugin into the embedded GATE
        // (note that in our case, we have a directory 'plugins_dire' in our
        // GATE installation directory, in addition to the normal 'plugins' 
        // directory: this means that we have to redirect the path a little).
        Gate.getCreoleRegister().registerDirectories(
                new File(Gate.getPluginsHome() + "/../plugins_dire/", "Toolkit_RSS").toURI().toURL()
        );
 
        // Creating a RSSParser Processing Resource 
        // (which occurs to be, more specifically, a Language Analyser)
        parser = (AbstractLanguageAnalyser) Factory.createResource("fr.cnrs.liris.drim.rsstoolkit.RSSParser");
 
        // Initialisation done
        initialised = true;
    }
 
    public void setFeed(URL feed) throws IllegalStateException {
 
        // We could of course init here, but exception handling is 
        // much easier without having to worry about the various 
        // initialisation errors at every step
        if(!initialised) {
            throw new IllegalStateException("Gate embedder not initialised");
        }
 
        this.feed = feed;
    }
 
    public void clearEntries() {
 
        // Straightforward, isn't it?
        entries.clear();
    }
 
    public void readFeed() throws IOException, ResourceInstantiationException, ExecutionException {
 
        // We could of course init here, but exception handling is 
        // much easier without having to worry about the various 
        // initialisation errors at every step
        if(!initialised) {
            throw new IllegalStateException("Gate embedder not initialised");
        }
 
 
        // Lesson I - Getting GATE to process a document
 
        // We must build a Corpus and a Document from the feed content: 
        // a LanguageAnalyser must be given both to be able to work.
 
        // The corpus
        Corpus corpus = Factory.newCorpus("Corpus");
 
        // The document parameters
        FeatureMap params = Factory.newFeatureMap();
        // Source url (the feed url)
        params.put("sourceUrl", feed);
        // Encoding (we assume utf-8, which may be false)
        params.put("encoding", "utf-8");
        // We want to keep the markup for RSSParser to be able to work
        // (else GATE removes the markup and replaces it with annotations;
        // of course RSSParser could use these but it is not relevant here).
        params.put("markupAware", false);
        // Other parameters (irrelevant in our case but still required)
        params.put("preserveOriginalContent", false);
        params.put("collectRepositioningInfo", false);
 
        // Creating the document and feeding it to the Corpus
        Document rss = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
        corpus.add(rss);
 
        // Feeding the whole result to the resource
        parser.setCorpus(corpus);
        parser.setDocument(rss);
 
        // And here is the tiny central piece of code that needed all that!
        parser.execute();
 
 
 
 
        // Lesson II - Fiddling with the annotations to build a consistent output
 
 
        // Initialising some structures to carry the information
        // A feed is composed of items that have a title, a description, a link 
        // and a date of publishing.
        // All of those are for now annotations that the resource has added to 
        // the document.
        // The deal with Longs is that we know where these annotations begin, 
        // but they are not structured into a graph per se. So we will have 
        // to get the annotations, then find out for each item which ones among 
        // the four others kinds are within its bounds.
        ArrayList<Long> items = new ArrayList<>();
        Map<Long, String> titles = new HashMap<>();
        Map<Long, String> descriptions = new HashMap<>();
        Map<Long, String> links = new HashMap<>();
        Map<Long, String> dates = new HashMap<>();
 
        // Parsing the document for annotations
        for(Annotation item: rss.getAnnotations()) {
            switch(item.getType()) {
                case "RSSItem":
                    // An annotation has a start and end node that are 
                    // characterised by offsets. The offsets are the 
                    // character indexes of the beginning and end within the 
                    // document.
                    items.add(item.getStartNode().getOffset());
                    break;
                case "RSSTitle":
                    // An annotation also has features, that describe it. Here
                    // we access the 'string' feature of an RSSTitle, which
                    // contains the title text.
                    titles.put(item.getStartNode().getOffset(), item.getFeatures().get("string").toString());
                    break;
                case "RSSDescription":
                    // And so on
                    descriptions.put(item.getStartNode().getOffset(), item.getFeatures().get("text").toString());
                    break;
                case "RSSLink":
                    links.put(item.getStartNode().getOffset(), item.getFeatures().get("url").toString());
                    break;
                case "RSSDate":
                    // And so on (a bit more complicated here, right?)
                    dates.put(item.getStartNode().getOffset(), 
                            item.getFeatures().get("dayOfMonth") + "/" + 
                            item.getFeatures().get("month") + "/" + 
                            item.getFeatures().get("year") + " " + 
                            item.getFeatures().get("hourOfDay") + ":" + 
                            item.getFeatures().get("minute") + ":" + 
                            item.getFeatures().get("second"));
                    break;
            }
        }
 
        // And here comes the boring part where we find out which of the titles
        // is the title of each item, and so on for every rss feature and every 
        // item.
        Long[] iItems = new Long[items.size()];
        items.toArray(iItems);
        Arrays.sort(iItems);
        Long[] iTitles = new Long[titles.keySet().size()];
        titles.keySet().toArray(iTitles);
        Arrays.sort(iTitles);
        Long[] iDesc = new Long[descriptions.keySet().size()];
        descriptions.keySet().toArray(iDesc);
        Arrays.sort(iDesc);
        Long[] iLink = new Long[links.keySet().size()];
        links.keySet().toArray(iLink);
        Arrays.sort(iLink);
        Long[] iDates = new Long[dates.keySet().size()];
        dates.keySet().toArray(iDates);
        Arrays.sort(iDates);
 
        for(int i=0; i<iItems.length; i++) {
            String title = "", desc = "", link = "", date = "";
            long start = iItems[i];
            long end;
            try {
                end = iItems[i+1];
            } catch(ArrayIndexOutOfBoundsException ex) {
                end = Long.MAX_VALUE;
            }
            for(long x: iTitles) {
                if(x>=start && x<=end) {
                    title = titles.get(x);
                    break;
                }
            }
            for(long x: iDesc) {
                if(x>=start && x<=end) {
                    desc = descriptions.get(x);
                    break;
                }
            }
            for(long x: iLink) {
                if(x>=start && x<=end) {
                    link = links.get(x);
                    break;
                }
            }
            for(long x: iDates) {
                if(x>=start && x<=end) {
                    date = dates.get(x);
                    break;
                }
            }
 
            // And let's put the result into the entries with a nice html layout.
            entries.add("<h2>" + title + "</h2><p>" + desc + 
                    "</p><p><a href=\"" + link + "\">" + link + 
                    "</a></p><p><em>Published on " + date + "</em></p>");
        }
 
        // The end. Fertig. Fini.
    }
 
    public String[] getEntries() {
 
        // Returning an array to not expose the internal method of indexing.
        // We could sort them, too. But this is for another time.
        String[] result = new String[entries.size()];
        entries.toArray(result);
        return result;
    }
 
 
}
code_continuousrssreader.txt · Last modified: 2014/04/08 15:02 by sgesche