/*
Copyright 2012-2014 Samuel Gesche

This file is part of the Greek Reuse Toolkit.

The Greek Reuse Toolkit is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

The Greek Reuse Toolkit is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with the Greek Reuse Toolkit.  If not, see <http://www.gnu.org/licenses/>.
*/

package fr.cnrs.liris.drim.grt.proc.similarite;

import edu.ucla.sspace.common.SemanticSpace;
import edu.ucla.sspace.common.Similarity;
import edu.ucla.sspace.lsa.LatentSemanticAnalysis;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Locale;
import java.util.Locale;
import java.util.Properties;
import fr.cnrs.liris.drim.grt.modele.Passage;
import fr.cnrs.liris.drim.grt.modele.Terme;
import fr.cnrs.liris.drim.grt.proc.Texte;

/**
 * Wrapper around an S-Space {@link SemanticSpace} that answers term-to-term
 * similarity queries.
 *
 * <p>An instance created with the no-argument constructor has no semantic
 * space: it knows no term, and two terms are considered similar only when
 * they are equal. An instance created from a corpus builds a semantic space
 * with the requested algorithm and uses cosine similarity between the term
 * vectors of that space.
 *
 * @author sgesche
 */
public class AnalyseurSSpace {
    /** Underlying semantic space; {@code null} when no corpus was supplied. */
    private final SemanticSpace analyseur;

    /** Algorithm identifier selecting Latent Semantic Analysis. */
    public final static int LSA = 0;

    /** Value set for {@link LatentSemanticAnalysis#LSA_DIMENSIONS_PROPERTY}. */
    private static final String DIMENSIONS_LSA = "300";

    /**
     * Creates an analyser without any semantic space. Such an instance
     * contains no term and falls back to plain equality in
     * {@link #getSimilarite(Terme, Terme)}.
     */
    public AnalyseurSSpace() {
        analyseur = null;
    }

    /**
     * Builds a semantic space from a set of passages, one document per passage.
     *
     * @param algorithme identifier of the algorithm to use (see {@link #LSA})
     * @param documents  passages whose text feeds the semantic space
     * @throws IOException if the space cannot be created or a document cannot be read
     * @throws IllegalArgumentException if {@code algorithme} is not a known identifier
     */
    public AnalyseurSSpace(int algorithme, Passage[] documents) throws IOException {
        Properties proprietes = new Properties();
        analyseur = creeEspace(algorithme, proprietes);
        for (Passage document : documents) {
            String texte = new Texte(document).getTexte();
            analyseur.processDocument(new BufferedReader(new StringReader(texte)));
        }
        finaliseEspace(analyseur, proprietes);
    }

    /**
     * Builds a semantic space from documents given as term sequences. The
     * expressions of a document's terms are joined with single spaces to form
     * the document text. Documents without any term are skipped.
     *
     * @param algorithme identifier of the algorithm to use (see {@link #LSA})
     * @param documents  documents, each one an array of terms
     * @throws IOException if the space cannot be created or a document cannot be read
     * @throws IllegalArgumentException if {@code algorithme} is not a known identifier
     */
    public AnalyseurSSpace(int algorithme, ArrayList<Terme[]> documents) throws IOException {
        Properties proprietes = new Properties();
        analyseur = creeEspace(algorithme, proprietes);
        for (Terme[] document : documents) {
            if (document.length == 0) {
                // A document without terms contributes nothing to the space.
                continue;
            }
            analyseur.processDocument(new BufferedReader(new StringReader(concatene(document))));
        }
        finaliseEspace(analyseur, proprietes);
    }

    /**
     * Tells whether the semantic space knows the given term.
     *
     * @param terme term to look up
     * @return {@code true} if the expression of the term is one of the words of
     *         the space; always {@code false} when there is no space
     */
    public boolean contientTerme(Terme terme) {
        if (analyseur == null) {
            return false;
        }
        return analyseur.getWords().contains(terme.getExpression());
    }

    /**
     * Computes the similarity between two terms.
     *
     * <p>When both terms belong to the semantic space, the result is the cosine
     * similarity of their vectors. Otherwise (no space, or at least one term
     * unknown to it) the result is {@code 1.0} for equal terms and {@code 0.0}
     * for different ones.
     *
     * @param terme1 first term
     * @param terme2 second term
     * @return the similarity between the two terms
     */
    public double getSimilarite(Terme terme1, Terme terme2) {
        // NOTE(review): S-Space is expected to return no vector for a word that
        // is not in the space, which would make the cosine computation fail;
        // confirm against the SemanticSpace.getVector documentation.
        if (analyseur == null || !contientTerme(terme1) || !contientTerme(terme2)) {
            return terme1.equals(terme2) ? 1.0 : 0.0;
        }
        return Similarity.getSimilarity(
                Similarity.SimType.COSINE,
                analyseur.getVector(terme1.getExpression()),
                analyseur.getVector(terme2.getExpression()));
    }

    /**
     * Lists the terms of the semantic space whose similarity with the given
     * term reaches a threshold.
     *
     * @param terme reference term
     * @param seuil minimal similarity (inclusive) for a term to be returned
     * @return the matching terms, in the iteration order of the words of the
     *         space; an empty array when there is no space
     */
    public Terme[] getTermesSimilaires(Terme terme, double seuil) {
        if (analyseur == null) {
            // No space means no vocabulary to search in.
            return new Terme[0];
        }
        ArrayList<Terme> proches = new ArrayList<>();
        for (String mot : analyseur.getWords()) {
            Terme candidat = Terme.cree(mot);
            if (getSimilarite(terme, candidat) >= seuil) {
                proches.add(candidat);
            }
        }
        return proches.toArray(new Terme[proches.size()]);
    }

    /**
     * Instantiates the semantic space for an algorithm and records the
     * properties that algorithm needs.
     *
     * @param algorithme identifier of the algorithm
     * @param proprietes properties to fill in for the later space processing
     * @return the new, still empty semantic space
     * @throws IOException if the space cannot be created
     * @throws IllegalArgumentException if {@code algorithme} is not a known identifier
     */
    private static SemanticSpace creeEspace(int algorithme, Properties proprietes) throws IOException {
        switch (algorithme) {
            case LSA:
                SemanticSpace espace = new LatentSemanticAnalysis();
                proprietes.setProperty(LatentSemanticAnalysis.LSA_DIMENSIONS_PROPERTY, DIMENSIONS_LSA);
                return espace;
            default:
                throw new IllegalArgumentException("Unknown algorithm: " + algorithme);
        }
    }

    /**
     * Runs the final processing of a space once all documents have been fed.
     *
     * <p>The JVM default locale is switched to English beforehand and is not
     * restored afterwards. NOTE(review): presumably the processing formats or
     * parses numbers in a locale-sensitive way; confirm before removing.
     *
     * @param espace     space that received all the documents
     * @param proprietes properties configuring the processing
     * @throws IOException if the processing fails on an I/O error
     */
    private static void finaliseEspace(SemanticSpace espace, Properties proprietes) throws IOException {
        Locale.setDefault(Locale.ENGLISH);
        espace.processSpace(proprietes);
    }

    /** Joins the expressions of the terms of a document with single spaces. */
    private static String concatene(Terme[] document) {
        StringBuilder texte = new StringBuilder();
        for (int i = 0; i < document.length; i++) {
            if (i > 0) {
                texte.append(' ');
            }
            texte.append(document[i].getExpression());
        }
        return texte.toString();
    }

}
