/*
 * Decompiled with CFR 0.152.
 */
package edu.ucla.sspace.tools;

import edu.ucla.sspace.common.ArgOptions;
import edu.ucla.sspace.text.IteratorFactory;
import edu.ucla.sspace.util.LoggerUtil;
import edu.ucla.sspace.util.TrieMap;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

/**
 * Counts how many times each token occurs in one or more inputs.
 *
 * <p>Input is split into tokens by {@link IteratorFactory}; each token is
 * optionally lower-cased, tokens consisting only of ASCII digits are folded
 * into the single token {@code <NUM>}, and single-character tokens outside a
 * fixed set of accepted characters are skipped.  Counts accumulate across
 * calls on the same instance.
 *
 * <p>The class can also be run from the command line (see
 * {@link #main(String[])}) to write the counts for a set of files to an
 * output file, one {@code token count} pair per line.
 */
public class TokenCounter {
    /** Number of counted tokens between progress messages logged at FINE. */
    private static final int UPDATE_INTERVAL = 10000;
    private static final Logger LOGGER = Logger.getLogger(TokenCounter.class.getName());

    /** Matches a token made up solely of ASCII digits. */
    private static final Pattern ALL_DIGITS = Pattern.compile("[0-9]+");

    /**
     * Matches a token that is exactly one character long where that character
     * is not a word character, whitespace, or one of the listed punctuation
     * marks.  Compiled once because it is applied to every token.
     */
    private static final Pattern UNWANTED_SINGLE_CHAR =
        Pattern.compile("[^\\w\\s;:\\(\\)\\[\\]'!/&?\",\\.<>]");

    /** Running count for every token seen so far. */
    private final Map<String, Integer> tokenToCount;

    /** Whether tokens are lower-cased before being counted. */
    private final boolean doLowerCasing;

    /**
     * Creates a counter that does not lower-case tokens.
     */
    public TokenCounter() {
        this(false);
    }

    /**
     * Creates a counter.
     *
     * @param bl {@code true} if every token should be lower-cased before it
     *        is counted
     */
    public TokenCounter(boolean bl) {
        this.doLowerCasing = bl;
        this.tokenToCount = new TrieMap<Integer>();
    }

    /**
     * Returns a read-only view of the counts gathered so far.
     *
     * @return an unmodifiable view of the token-to-count mapping; the view is
     *         backed by this counter's map, so later processing is reflected
     *         in it
     */
    public Map<String, Integer> getTokenCounts() {
        return Collections.unmodifiableMap(this.tokenToCount);
    }

    /**
     * Counts the tokens in the named file.  The file is closed before this
     * method returns, whether or not processing succeeds.
     *
     * @param string the path of the file to read
     * @throws IOException if the file cannot be opened or closed
     */
    public void processFile(String string) throws IOException {
        this.processAndClose(new BufferedReader(new FileReader(string)));
    }

    /**
     * Counts the tokens in the given file.  The file is closed before this
     * method returns, whether or not processing succeeds.
     *
     * @param file the file to read
     * @throws IOException if the file cannot be opened or closed
     */
    public void processFile(File file) throws IOException {
        this.processAndClose(new BufferedReader(new FileReader(file)));
    }

    /**
     * Counts the tokens produced by tokenizing the reader's contents.  The
     * reader is not closed by this method; the caller retains ownership.
     *
     * @param bufferedReader the source of text to tokenize
     */
    public void process(BufferedReader bufferedReader) {
        this.process(IteratorFactory.tokenize(bufferedReader));
    }

    /**
     * Counts the tokens produced by tokenizing the given text.
     *
     * @param string the text to tokenize
     */
    public void process(String string) {
        this.process(IteratorFactory.tokenize(string));
    }

    /**
     * Counts the tokens from a reader this class opened itself and always
     * closes that reader afterwards so file handles are not leaked.
     */
    private void processAndClose(BufferedReader reader) throws IOException {
        try {
            this.process(reader);
        } finally {
            reader.close();
        }
    }

    /**
     * Consumes every token from the iterator and updates the counts.
     */
    private void process(Iterator<String> iterator) {
        long counted = 0L;
        while (iterator.hasNext()) {
            String token = iterator.next();
            if (this.doLowerCasing) {
                // Uses the JVM's default locale, as String.toLowerCase() does.
                token = token.toLowerCase();
            }
            // All purely numeric tokens share one bucket.
            if (ALL_DIGITS.matcher(token).matches()) {
                token = "<NUM>";
            }
            // Skip lone characters that are not in the accepted set; skipped
            // tokens do not advance the progress counter.
            if (UNWANTED_SINGLE_CHAR.matcher(token).matches()) {
                continue;
            }
            Integer previous = this.tokenToCount.get(token);
            this.tokenToCount.put(token, previous == null ? 1 : 1 + previous);
            if (++counted % UPDATE_INTERVAL == 0L && LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("Processed " + counted + " tokens.  Currently " + this.tokenToCount.size() + " unique tokens");
            }
        }
    }

    /**
     * Command-line entry point.  Expects the output file followed by one or
     * more input files as positional arguments; prints usage and returns if
     * fewer than two positional arguments are given.  Any failure while
     * processing or writing is reported by printing its stack trace.
     *
     * @param stringArray the command-line arguments
     */
    public static void main(String[] stringArray) {
        ArgOptions argOptions = buildOptions();
        argOptions.parseOptions(stringArray);
        if (argOptions.numPositionalArgs() < 2) {
            printUsage(argOptions);
            return;
        }
        if (argOptions.hasOption("verbose")) {
            LoggerUtil.setLevel(Level.FINE);
        }
        boolean lowerCase = argOptions.hasOption("lowerCase");
        configureTokenizer(argOptions);
        try {
            TokenCounter tokenCounter = new TokenCounter(lowerCase);
            // Positional argument 0 is the output file; the rest are inputs.
            for (int i = 1; i < argOptions.numPositionalArgs(); ++i) {
                tokenCounter.processFile(argOptions.getPositionalArg(i));
            }
            writeCounts(tokenCounter.tokenToCount, argOptions.getPositionalArg(0));
        }
        catch (Throwable throwable) {
            throwable.printStackTrace();
        }
    }

    /** Declares the command-line options understood by {@link #main(String[])}. */
    private static ArgOptions buildOptions() {
        ArgOptions argOptions = new ArgOptions();
        argOptions.addOption('Z', "stemmingAlgorithm", "specifices the stemming algorithm to use on tokens while iterating.  (default: none)", true, "CLASSNAME", "Tokenizing Options");
        argOptions.addOption('F', "tokenFilter", "filters to apply to the input token stream", true, "FILTER_SPEC", "Tokenizing Options");
        argOptions.addOption('C', "compoundWords", "a file where each line is a recognized compound word", true, "FILE", "Tokenizing Options");
        argOptions.addOption('L', "lowerCase", "lower-cases each token after all other filtering has been applied", false, null, "Tokenizing Options");
        argOptions.addOption('z', "wordLimit", "Set the maximum number of words an document can return", true, "INT", "Tokenizing Options");
        argOptions.addOption('v', "verbose", "Print verbose output about counting status", false, null, "Optional");
        return argOptions;
    }

    /** Prints the usage message, including the option summary, to standard output. */
    private static void printUsage(ArgOptions argOptions) {
        System.out.println("usage: java TokenCounter [options] <output-file> <input-file> [<input-file>]*\n"
            + argOptions.prettyPrint() + "\n"
            + "The compound word option specifies a file whose contents are compount tokens,\n"
            + "e.g. white house.  Each compound token should be specified on its own line.\n"
            + "Compount tokenization is greedy and will select the longest compound token\n"
            + "present.  For example if \"bar exam\" and \"California bar exam\" are both\n"
            + "compound tokens, the latter will always be returned as a single token, rather\n"
            + "than returning the two tokens \"California\" and \"bar exam\"."
            + "\n\n"
            + "token configuration lists sets of files that contain tokens to be included or\n"
            + "excluded.  The behavior, \"include\" or \"exclude\" is specified\n"
            + "first, followed by one or more file names, each separated by colons.\n"
            + "Multiple behaviors may be specified one after the other using a ','\n"
            + "character to separate them.  For example, a typicaly configuration may\n"
            + "look like: include=top-tokens.txt:test-words.txt,exclude=stop-words.txt\n"
            + "Note behaviors are applied in the order they are presented on the command-line.");
    }

    /**
     * Copies the tokenizing options that were supplied on the command line
     * into the system properties and hands those properties to
     * {@link IteratorFactory}.
     */
    private static void configureTokenizer(ArgOptions argOptions) {
        Properties properties = System.getProperties();
        if (argOptions.hasOption("tokenFilter")) {
            properties.setProperty("edu.ucla.sspace.text.TokenizerFactory.tokenFilter", argOptions.getStringOption("tokenFilter"));
        }
        if (argOptions.hasOption("stemmingAlgorithm")) {
            properties.setProperty("edu.ucla.sspace.text.TokenizerFactory.stemmer", argOptions.getStringOption("stemmingAlgorithm"));
        }
        if (argOptions.hasOption("compoundWords")) {
            properties.setProperty("edu.ucla.sspace.text.TokenizerFactory.compoundTokens", argOptions.getStringOption("compoundWords"));
        }
        if (argOptions.hasOption("wordLimit")) {
            properties.setProperty("edu.ucla.sspace.text.TokenizerFactory.tokenCountLimit", argOptions.getStringOption("wordLimit"));
        }
        IteratorFactory.setProperties(properties);
    }

    /**
     * Writes one "token count" line per entry to the named file.  The writer
     * is always closed, and a write failure is surfaced as an exception
     * because PrintWriter otherwise records I/O errors without reporting them.
     */
    private static void writeCounts(Map<String, Integer> counts, String outputFile) throws IOException {
        PrintWriter printWriter = new PrintWriter(outputFile);
        try {
            for (Map.Entry<String, Integer> entry : counts.entrySet()) {
                printWriter.println(entry.getKey() + " " + entry.getValue());
            }
            // checkError() flushes and reports whether any write has failed.
            if (printWriter.checkError()) {
                throw new IOException("Failed to write token counts to " + outputFile);
            }
        } finally {
            printWriter.close();
        }
    }
}

