/*
 * Decompiled with CFR 0.152.
 */
package edu.ucla.sspace.tools;

import edu.ucla.sspace.common.ArgOptions;
import edu.ucla.sspace.text.DocumentPreprocessor;
import edu.ucla.sspace.text.IteratorFactory;
import edu.ucla.sspace.text.StringUtils;
import edu.ucla.sspace.util.LoggerUtil;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOError;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Strips wiki markup from the pages of a Wikipedia XML dump and writes the
 * cleaned text of every retained article to an output file.
 *
 * <p>{@code main} reads pages from the dump file, hands each one to
 * {@code processDocument}, and that method cleans the page text in place
 * before printing it with the configured {@code CleanerOption}s applied.
 */
public class WikipediaCleaner {
    /** Logger for per-article progress and skip messages (FINE / FINER levels). */
    private static final Logger LOGGER = Logger.getLogger(WikipediaCleaner.class.getName());
    /** Destination for cleaned articles; opened in the constructor. */
    private PrintWriter processedArticleWriter;
    /** Options that switch individual cleaning and output steps on or off. */
    private final Set<CleanerOption> options;
    /** Articles whose cleaned text has fewer tokens than this are not written. */
    private final int minTokensPerArticle;

    /**
     * Creates a cleaner that writes processed articles to the given file.
     *
     * @param string path of the output file; it is opened (and truncated)
     *        immediately
     * @param set the options controlling which cleaning steps are applied;
     *        the set is stored by reference, not copied
     * @param n the minimum number of tokens a cleaned article must contain
     *        to be written
     * @throws IOError if the output file cannot be opened
     */
    public WikipediaCleaner(String string, Set<CleanerOption> set, int n) {
        PrintWriter writer;
        try {
            FileOutputStream fileStream = new FileOutputStream(string);
            writer = new PrintWriter(new BufferedOutputStream(fileStream));
        }
        catch (IOException openFailure) {
            // Surface the failure unchecked, wrapping the original cause.
            throw new IOError(openFailure);
        }
        this.processedArticleWriter = writer;
        this.options = set;
        this.minTokensPerArticle = n;
    }

    /**
     * Matches the MediaWiki redirect keyword in any letter case, so that
     * {@code #redirect} and {@code #Redirect} pages are recognised as well as
     * {@code #REDIRECT}.
     */
    private static final Pattern REDIRECT_MARKER =
        Pattern.compile("#REDIRECT", Pattern.CASE_INSENSITIVE);

    /**
     * Cleans a single dump page and, if it qualifies, writes it to the output.
     *
     * <p>The page is skipped when its title is not an article title (see
     * {@code isArticleLink}), when its text contains a redirect marker, or when
     * the cleaned text has fewer than {@code minTokensPerArticle} tokens.
     * Otherwise the text is cleaned in place (the {@code text} builder of the
     * document is modified) and printed on one {@code println}, preceded by the
     * lower-cased title when {@code INCLUDE_TITLES} is set. The writer is
     * flushed after every printed article.
     *
     * @param wikiDoc the page to process; its {@code text} is consumed
     */
    public void processDocument(WikiDoc wikiDoc) {
        String articleName = StringUtils.unescapeHTML(wikiDoc.name).trim().toLowerCase();
        if (!WikipediaCleaner.isArticleLink(articleName)) {
            LOGGER.fine("skipping non-article document: " + articleName);
            return;
        }
        // The title was lower-cased above, so searching it for "#REDIRECT"
        // could never succeed; only the page text can carry the marker, and
        // the keyword has to be matched regardless of case.
        if (REDIRECT_MARKER.matcher(wikiDoc.text).find()) {
            LOGGER.fine("skipping redirect: " + articleName);
            return;
        }
        LOGGER.log(Level.FINE, "Procesing article {0} with {1} characters", new Object[]{articleName, wikiDoc.text.length()});

        // Every step below edits the same builder in place; the order matters
        // because later steps assume earlier markup is already gone.
        StringBuilder rawText = wikiDoc.text;
        LOGGER.finer("extracting raw article text");
        this.extractArticle(rawText);
        LOGGER.finer("removing tables");
        this.removeTables(rawText);
        LOGGER.finer("removing {{text}} from article");
        this.removeDoubleBraceMarkup(rawText);
        LOGGER.finer("removing [[wiki-link]] from article");
        this.removeWikiLinkMarkup(rawText, articleName);
        LOGGER.finer("removing [external-link] from article");
        this.removeExternalLinkMarkup(rawText);
        LOGGER.finer("unescaping HTML");
        StringUtils.unescapeHTML(rawText);
        // Comments only become visible as "<!--" after entities are unescaped.
        LOGGER.finer("removing HTML comments");
        this.removeHtmlComments(rawText);

        String article = rawText.toString();
        if (this.options.contains(CleanerOption.USE_PREPROCESSOR)) {
            LOGGER.finer("applying preprocessor");
            article = new DocumentPreprocessor().process(article);
        }
        if (this.options.contains(CleanerOption.FILTER_TOKENS)) {
            LOGGER.finer("filtering tokens");
            article = this.filterTokens(article);
        }

        int tokenCount = this.getTokenCount(article);
        if (tokenCount < this.minTokensPerArticle) {
            LOGGER.log(Level.FINE, "Document {0} contained only {1} tokens and was not printed", new Object[]{articleName, tokenCount});
            return;
        }
        if (this.options.contains(CleanerOption.INCLUDE_TITLES)) {
            this.processedArticleWriter.print(articleName);
            this.processedArticleWriter.print(" ");
        }
        this.processedArticleWriter.println(article);
        this.processedArticleWriter.flush();
    }

    /**
     * Reduces the page XML held in the builder to the content of its
     * {@code <text ...>} element, editing the builder in place.
     *
     * <p>Everything up to and including the {@code >} that ends the opening
     * tag is removed, as is everything from {@code </text} onwards. When the
     * page has no {@code <text} element at all, or the element is self-closing
     * ({@code <text ... />}), there is no article body and the builder is
     * emptied so that page metadata is not treated as article text.
     *
     * @param stringBuilder the raw page contents; modified in place
     */
    private void extractArticle(StringBuilder stringBuilder) {
        int textTagStart = stringBuilder.indexOf("<text");
        if (textTagStart < 0) {
            // Searching for ">" from -1 would start at 0 and strip an
            // arbitrary metadata prefix instead of the text tag.
            stringBuilder.setLength(0);
            return;
        }
        int textTagEnd = stringBuilder.indexOf(">", textTagStart);
        if (textTagEnd > textTagStart && stringBuilder.charAt(textTagEnd - 1) == '/') {
            // Self-closing element: it has no content and no closing tag.
            stringBuilder.setLength(0);
            return;
        }
        int closingTag = stringBuilder.indexOf("</text", textTagStart);
        if (closingTag >= 0) {
            stringBuilder.delete(closingTag, stringBuilder.length());
        }
        // When no ">" was found textTagEnd is -1 and this deletes nothing.
        stringBuilder.delete(0, textTagEnd + 1);
    }

    /**
     * Runs the text through {@code IteratorFactory.tokenize} and joins the
     * tokens it yields with single spaces.
     *
     * <p>Which tokens survive is decided by the {@code IteratorFactory}
     * configuration, not by this method.
     *
     * @param string the text to tokenize
     * @return the yielded tokens separated by one space each, or an empty
     *         string when no token is yielded
     */
    private String filterTokens(String string) {
        StringBuilder joined = new StringBuilder(string.length());
        Iterator<String> tokens = IteratorFactory.tokenize(string);
        if (tokens.hasNext()) {
            joined.append(tokens.next());
        }
        // Every further token is preceded by exactly one separator.
        while (tokens.hasNext()) {
            joined.append(" ").append(tokens.next());
        }
        return joined.toString();
    }

    /**
     * Deletes every <code>{{...}}</code> span from the builder, including
     * spans nested inside other spans.
     *
     * <p>Processing stops at the first <code>{{</code> that has no closing
     * <code>}}</code> after it; that opener and everything after it are left
     * untouched.
     *
     * @param stringBuilder the text to edit in place
     */
    private void removeDoubleBraceMarkup(StringBuilder stringBuilder) {
        for (int open = stringBuilder.indexOf("{{"); open >= 0; open = stringBuilder.indexOf("{{", open)) {
            int close = stringBuilder.indexOf("}}", open);
            // While another opener sits between this opener and the nearest
            // closer, that closer belongs to the inner span: remove the inner
            // span first, then look again.
            for (int nested = stringBuilder.indexOf("{{", open + 1);
                 nested > open && nested < close;
                 nested = stringBuilder.indexOf("{{", open + 1)) {
                this.removeEmbeddedBrace(stringBuilder, nested);
                close = stringBuilder.indexOf("}}", open);
            }
            if (close < 0) {
                return;
            }
            stringBuilder.delete(open, close + 2);
        }
    }

    /**
     * Deletes the <code>{{...}}</code> span that opens at the given index,
     * first recursively deleting any spans nested inside it.
     *
     * <p>If no closing <code>}}</code> follows the opener, nothing is deleted.
     *
     * @param stringBuilder the text to edit in place
     * @param n index of the <code>{{</code> that opens the span to delete
     */
    private void removeEmbeddedBrace(StringBuilder stringBuilder, int n) {
        int close = stringBuilder.indexOf("}}", n);
        int inner = stringBuilder.indexOf("{{", n + 1);
        while (inner > n && inner < close) {
            // An opener before the nearest closer means that closer ends the
            // inner span, so the inner span has to go first.
            this.removeEmbeddedBrace(stringBuilder, inner);
            close = stringBuilder.indexOf("}}", n);
            inner = stringBuilder.indexOf("{{", n + 1);
        }
        if (close >= 0) {
            stringBuilder.delete(n, close + 2);
        }
    }

    /**
     * Deletes every span that starts with <code>{|</code> and runs to the
     * next <code>|}</code>, editing the builder in place.
     *
     * <p>Spans are matched to the nearest closer, so nesting is not tracked.
     * Processing stops at the first opener with no closer after it.
     *
     * @param stringBuilder the text to edit in place
     */
    private void removeTables(StringBuilder stringBuilder) {
        int open = stringBuilder.indexOf("{|");
        while (open >= 0) {
            int close = stringBuilder.indexOf("|}", open);
            if (close <= open) {
                // No closer after this opener: leave the remainder as is.
                return;
            }
            stringBuilder.delete(open, close + 2);
            open = stringBuilder.indexOf("{|", open);
        }
    }

    /**
     * Deletes every {@code <!-- ... -->} comment from the builder.
     *
     * <p>Processing stops at the first comment opener that is never closed;
     * that opener and the text after it are kept.
     *
     * @param stringBuilder the text to edit in place
     */
    private void removeHtmlComments(StringBuilder stringBuilder) {
        for (int open = stringBuilder.indexOf("<!--"); open >= 0; open = stringBuilder.indexOf("<!--", open)) {
            int close = stringBuilder.indexOf("-->", open);
            if (close <= open) {
                break;
            }
            // 3 == "-->".length()
            stringBuilder.delete(open, close + 3);
        }
    }

    /**
     * Removes {@code [[...]]} wiki links from the builder.
     *
     * <p>When {@code INCLUDE_LINK_TEXT} is set and the link target passes
     * {@code isArticleLink(target, string)}, the link is replaced by its
     * text: the part after the first {@code |} inside the link, or the whole
     * link body when there is no {@code |}. Every other link is deleted
     * outright. Processing stops at the first {@code [[} with no {@code ]]}
     * after it.
     *
     * @param stringBuilder the text to edit in place
     * @param string the title of the article being processed, passed to the
     *        link check
     */
    public void removeWikiLinkMarkup(StringBuilder stringBuilder, String string) {
        final boolean keepLinkText = this.options.contains(CleanerOption.INCLUDE_LINK_TEXT);
        int open = stringBuilder.indexOf("[[");
        while (open >= 0) {
            int close = stringBuilder.indexOf("]]", open);
            if (close < 0) {
                break;
            }
            // An empty replacement is the same as deleting the link.
            String replacement = "";
            if (keepLinkText && WikipediaCleaner.isArticleLink(stringBuilder.substring(open + 2, close), string)) {
                int pipe = stringBuilder.indexOf("|", open);
                int textStart = open + 2;
                if (pipe >= 0 && pipe < close) {
                    textStart = pipe + 1;
                }
                replacement = stringBuilder.substring(textStart, close);
            }
            stringBuilder.replace(open, close + 2, replacement);
            // Resume at the same index: the replacement may itself contain "[[".
            open = stringBuilder.indexOf("[[", open);
        }
    }

    /**
     * Removes single-bracket {@code [...]} spans from the builder.
     *
     * <p>When {@code INCLUDE_LINK_TEXT} is set, a span is replaced by the part
     * that starts at its first space (the space itself is kept), or by its
     * whole body when it contains no space. Otherwise the span is deleted.
     * Processing stops at the first {@code [} with no {@code ]} after it.
     *
     * @param stringBuilder the text to edit in place
     */
    public void removeExternalLinkMarkup(StringBuilder stringBuilder) {
        final boolean keepLinkText = this.options.contains(CleanerOption.INCLUDE_LINK_TEXT);
        for (int open = stringBuilder.indexOf("["); open >= 0; open = stringBuilder.indexOf("[", open)) {
            int close = stringBuilder.indexOf("]", open);
            if (close < 0) {
                return;
            }
            if (!keepLinkText) {
                stringBuilder.delete(open, close + 1);
                continue;
            }
            int space = stringBuilder.indexOf(" ", open);
            boolean spaceInsideSpan = space >= 0 && space < close;
            int keepFrom = spaceInsideSpan ? space : open + 1;
            String kept = stringBuilder.substring(keepFrom, close);
            stringBuilder.replace(open, close + 1, kept);
        }
    }

    /** A token is a maximal run of non-whitespace characters. Compiled once. */
    private static final Pattern TOKEN_PATTERN = Pattern.compile("\\S+");

    /**
     * Counts the whitespace-delimited tokens in the given text.
     *
     * @param string the text to count tokens in
     * @return the number of maximal non-whitespace runs; 0 for empty or
     *         all-whitespace text
     */
    private int getTokenCount(String string) {
        // This runs once per article, so the pattern is shared rather than
        // recompiled on every call.
        Matcher tokens = TOKEN_PATTERN.matcher(string);
        int count = 0;
        while (tokens.find()) {
            ++count;
        }
        return count;
    }

    /**
     * Command-line entry point: cleans every page of a Wikipedia dump file
     * and writes the retained articles to an output file.
     *
     * <p>Expects exactly two positional arguments, the dump file and the
     * output file; otherwise the usage text is printed and nothing is done.
     * Processing stops when the document queue reports no more pages, or when
     * it fails to deliver a page (signalled by a {@code null} document). The
     * output writer is closed on every exit path once it has been opened.
     *
     * @param stringArray the command-line arguments
     */
    public static void main(String[] stringArray) {
        ArgOptions argOptions = new ArgOptions();
        argOptions.addOption('t', "includeTitles", "Prints article and section titles as a part of the document", false, null, "Document Processing");
        argOptions.addOption('c', "includeCaptions", "Prints image and table captions as a part of the document", false, null, "Document Processing");
        argOptions.addOption('w', "includeLinkText", "Prints text in the Wikipedia links as a part of the document", false, null, "Document Processing");
        argOptions.addOption('F', "tokenFilter", "Specifies a filter to remove or retain certain tokens", true, "FILTER_SPEC", "Filtering");
        argOptions.addOption('M', "minTokens", "Records only those documents with at least the minimum number of tokens", true, "INT", "Filtering");
        argOptions.addOption('P', "applyPreprocessor", "Applies the DocumentPreprocessor to the documents", false, null, "Filtering");
        argOptions.addOption('v', "verbose", "Print verbose output about article cleaning", false, null, "Optional");
        argOptions.addOption('V', "veryVerbose", "Print lots of verbose output about article cleaning", false, null, "Optional");
        argOptions.parseOptions(stringArray);
        if (argOptions.numPositionalArgs() != 2) {
            System.out.println("usage java [OPTIONS] <wikifile> <output-file>\n" + argOptions.prettyPrint());
            return;
        }

        // "verbose" takes precedence when both verbosity flags are given.
        Level level = null;
        if (argOptions.hasOption("verbose")) {
            level = Level.FINE;
        } else if (argOptions.hasOption("veryVerbose")) {
            level = Level.FINER;
        }
        if (level != null) {
            LoggerUtil.setLevel(level);
        }

        EnumSet<CleanerOption> enumSet = EnumSet.noneOf(CleanerOption.class);
        if (argOptions.hasOption("includeTitles")) {
            enumSet.add(CleanerOption.INCLUDE_TITLES);
        }
        if (argOptions.hasOption("includeCaptions")) {
            enumSet.add(CleanerOption.INCLUDE_CAPTIONS);
        }
        if (argOptions.hasOption("includeLinkText")) {
            enumSet.add(CleanerOption.INCLUDE_LINK_TEXT);
        }
        if (argOptions.hasOption("tokenFilter")) {
            // The filter spec is handed to the tokenizer through its property.
            Properties properties = new Properties();
            properties.setProperty("edu.ucla.sspace.text.TokenizerFactory.tokenFilter", argOptions.getStringOption("tokenFilter"));
            IteratorFactory.setProperties(properties);
            enumSet.add(CleanerOption.FILTER_TOKENS);
        }
        if (argOptions.hasOption("applyPreprocessor")) {
            enumSet.add(CleanerOption.USE_PREPROCESSOR);
        }
        int n = argOptions.hasOption("minTokens") ? argOptions.getIntOption("minTokens") : 0;

        WikipediaCleaner wikipediaCleaner = null;
        try {
            // Open the input first so a missing dump file does not leave an
            // empty output file behind.
            DocumentBufferedQueue documentBufferedQueue = new DocumentBufferedQueue(argOptions.getPositionalArg(0));
            wikipediaCleaner = new WikipediaCleaner(argOptions.getPositionalArg(1), enumSet, n);
            while (documentBufferedQueue.hasNext()) {
                WikiDoc wikiDoc = documentBufferedQueue.next();
                if (wikiDoc == null) {
                    // next() yields null when no page could be delivered;
                    // passing it on would only fail with a NullPointerException.
                    LOGGER.warning("no further document could be read from the input; stopping");
                    break;
                }
                wikipediaCleaner.processDocument(wikiDoc);
            }
        }
        catch (InterruptedException interruptedException) {
            // Preserve the interrupt for whoever owns this thread.
            Thread.currentThread().interrupt();
            interruptedException.printStackTrace();
        }
        catch (Exception exception) {
            exception.printStackTrace();
        }
        finally {
            if (wikipediaCleaner != null) {
                wikipediaCleaner.processedArticleWriter.close();
            }
        }
    }

    /** Lower-case title prefixes that mark a page or link as a non-article. */
    private static final String[] NON_ARTICLE_PREFIXES = {
        "image:", "wikipedia:", "template:", "category:", "portal:"
    };

    /**
     * Tells whether a page title or link target names a regular article.
     *
     * <p>A name is rejected when, ignoring case, it starts with one of the
     * {@code NON_ARTICLE_PREFIXES} or contains {@code "(disambiguation)"}.
     *
     * @param string the title or link target to test
     * @return {@code true} if none of the exclusion rules applies
     */
    private static boolean isArticleLink(String string) {
        // Locale.ROOT keeps the ASCII prefixes matchable under locales with
        // special casing rules (e.g. Turkish maps 'I' to a dotless 'ı').
        String lowered = string.toLowerCase(java.util.Locale.ROOT);
        for (String prefix : NON_ARTICLE_PREFIXES) {
            if (lowered.startsWith(prefix)) {
                return false;
            }
        }
        return !lowered.contains("(disambiguation)");
    }

    /** Matches a (possibly empty) run of lower-case ASCII letters. Compiled once. */
    private static final Pattern LOWERCASE_PREFIX = Pattern.compile("[a-z]*");

    /**
     * Tells whether a link found inside an article points at another regular
     * article.
     *
     * <p>On top of the single-argument check, a link is rejected when the
     * text before its first {@code :} consists only of lower-case ASCII
     * letters (this includes an empty prefix) — presumably an interwiki or
     * language prefix such as {@code fr:}; confirm against the dump format —
     * or when the link ends with {@code ":" + string2}.
     *
     * @param string the link body, i.e. the text between {@code [[} and
     *        {@code ]]}
     * @param string2 the title of the article the link was found in
     * @return {@code true} if the link is treated as an article link
     */
    private static boolean isArticleLink(String string, String string2) {
        if (!WikipediaCleaner.isArticleLink(string)) {
            return false;
        }
        int colon = string.indexOf(":");
        // This check runs for every link, so the pattern is shared rather
        // than recompiled on each call.
        if (colon >= 0 && LOWERCASE_PREFIX.matcher(string.substring(0, colon)).matches()) {
            return false;
        }
        return !string.endsWith(":" + string2);
    }

    /**
     * A single page read from the dump: its title and its raw contents.
     *
     * <p>The {@code text} builder is shared, not copied, so whoever holds the
     * document can edit the page contents in place.
     */
    private static class WikiDoc {
        /** Title of the page. */
        public final String name;
        /** Mutable contents of the page. */
        public final StringBuilder text;

        /**
         * @param title the page title
         * @param contents the page contents; stored by reference
         */
        public WikiDoc(String title, StringBuilder contents) {
            name = title;
            text = contents;
        }
    }

    /**
     * Reads pages from a Wikipedia XML dump and hands them out one at a time,
     * keeping a buffer of already-parsed pages.
     *
     * <p>The constructor pre-reads up to {@code DOCS_TO_CACHE} pages. Every
     * call to {@link #next()} starts a background thread that reads one more
     * page, then takes the oldest buffered page. Reading and enqueueing happen
     * under this object's monitor, so a page is always in the queue before the
     * reader can be marked closed.
     *
     * <p>NOTE(review): {@code FileReader} decodes with the platform default
     * charset; confirm that this matches the encoding of the dump file.
     */
    private static class DocumentBufferedQueue {
        /** Number of read attempts made up front by the constructor. */
        private static final int DOCS_TO_CACHE = 100;
        /** Length of the indented opening tag that precedes a page title. */
        private static final int TITLE_HTML_LENGTH = "    <title>".length();
        /** Longest time {@link #next()} waits for a page, in milliseconds. */
        private static final long POLL_TIMEOUT_MILLIS = 600000L;
        /** Interval at which a waiting {@link #next()} re-checks for end of input. */
        private static final long POLL_SLICE_MILLIS = 100L;

        private final BufferedReader wikiReader;
        private final BlockingQueue<WikiDoc> cachedDocs;
        /** False once the end of the dump has been seen or reading has failed. */
        private final AtomicBoolean isReaderOpen;
        /** True once the reader has been closed; guarded by {@code this}. */
        private boolean exhausted;

        /**
         * Opens the dump file and pre-reads the first pages.
         *
         * @param string path of the dump file
         * @throws IOException if the file cannot be opened or read
         */
        public DocumentBufferedQueue(String string) throws IOException {
            this.wikiReader = new BufferedReader(new FileReader(string));
            this.cachedDocs = new LinkedBlockingQueue<WikiDoc>();
            this.isReaderOpen = new AtomicBoolean(true);
            for (int i = 0; i < DOCS_TO_CACHE; ++i) {
                this.cacheNextDoc();
            }
        }

        /**
         * Reads one page and, if there was one, appends it to the buffer.
         *
         * <p>Reading and enqueueing happen under the same lock, so a page read
         * by one thread is in the queue before a later read marks the reader
         * closed.
         */
        private synchronized void cacheNextDoc() throws IOException {
            WikiDoc doc = this.cacheDoc();
            if (doc != null) {
                this.cachedDocs.offer(doc);
            }
        }

        /**
         * Reads lines up to the end of the next {@code <page>} element and
         * returns that page.
         *
         * @return the next page, or {@code null} if the page's title line is
         *         malformed (the page is skipped) or the input has ended
         * @throws IOException if reading fails; the reader is closed first so
         *         that consumers stop waiting for more pages
         */
        private synchronized WikiDoc cacheDoc() throws IOException {
            if (this.exhausted) {
                return null;
            }
            try {
                String line;
                while ((line = this.wikiReader.readLine()) != null) {
                    if (line.startsWith("</mediawiki>")) {
                        this.isReaderOpen.set(false);
                        continue;
                    }
                    if (!line.startsWith("  <page>")) {
                        continue;
                    }
                    // The title is expected on the line right after <page>.
                    String titleLine = this.wikiReader.readLine();
                    if (titleLine == null) {
                        break;
                    }
                    String title = DocumentBufferedQueue.parseTitle(titleLine);
                    if (title == null) {
                        LOGGER.warning("Malformed title: " + titleLine);
                        return null;
                    }
                    // Page lines are joined with spaces, so a page never
                    // contains a line break of its own.
                    StringBuilder text = new StringBuilder();
                    while ((line = this.wikiReader.readLine()) != null && !line.startsWith("  </page>")) {
                        text.append(line).append(" ");
                    }
                    return new WikiDoc(title, text);
                }
            }
            catch (IOException readFailure) {
                this.closeReader();
                throw readFailure;
            }
            // End of input, with or without a closing </mediawiki> line.
            this.closeReader();
            return null;
        }

        /**
         * Extracts the title from a title line.
         *
         * @return the text between the leading title tag and the next
         *         {@code <}, or {@code null} if the line is too short or has
         *         no {@code <} after the title
         */
        private static String parseTitle(String titleLine) {
            if (titleLine.length() < TITLE_HTML_LENGTH) {
                return null;
            }
            String afterTag = titleLine.substring(TITLE_HTML_LENGTH);
            int end = afterTag.indexOf("<");
            return end < 0 ? null : afterTag.substring(0, end);
        }

        /** Marks the input as finished and releases the file handle. */
        private synchronized void closeReader() {
            this.exhausted = true;
            this.isReaderOpen.set(false);
            try {
                this.wikiReader.close();
            }
            catch (IOException closeFailure) {
                LOGGER.log(Level.WARNING, "could not close the wiki dump reader", closeFailure);
            }
        }

        /**
         * @return {@code true} while pages are buffered or more may still be
         *         read from the dump
         */
        public boolean hasNext() {
            // Read the flag before the queue: once the flag is false every
            // page has already been enqueued, so the emptiness check that
            // follows cannot miss one.
            return this.isReaderOpen.get() || !this.cachedDocs.isEmpty();
        }

        /**
         * Starts reading one more page in the background and returns the
         * oldest buffered page.
         *
         * @return the next page, or {@code null} if the input ended without
         *         another page or none arrived within ten minutes
         * @throws InterruptedException if interrupted while waiting
         */
        public WikiDoc next() throws InterruptedException {
            new Thread(){

                @Override
                public void run() {
                    try {
                        DocumentBufferedQueue.this.cacheNextDoc();
                    }
                    catch (IOException iOException) {
                        LOGGER.log(Level.WARNING, "failed to read from the wiki dump", iOException);
                    }
                }
            }.start();
            long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(POLL_TIMEOUT_MILLIS);
            while (true) {
                WikiDoc doc = this.cachedDocs.poll(POLL_SLICE_MILLIS, TimeUnit.MILLISECONDS);
                if (doc != null) {
                    return doc;
                }
                if (!this.isReaderOpen.get()) {
                    // Nothing more will be read; hand out a page that arrived
                    // in the meantime, if any, instead of waiting further.
                    return this.cachedDocs.poll();
                }
                if (System.nanoTime() - deadline >= 0L) {
                    return null;
                }
            }
        }
    }

    /**
     * Switches that control how articles are cleaned and written.
     */
    public enum CleanerOption {
        /** Print the lower-cased article title in front of the article text. */
        INCLUDE_TITLES,
        /** Selected by {@code --includeCaptions}; not read by the cleaning code in this class. */
        INCLUDE_CAPTIONS,
        /** Keep the text of removed links instead of deleting the links entirely. */
        INCLUDE_LINK_TEXT,
        /** Re-tokenize the cleaned text with {@code IteratorFactory} before writing it. */
        FILTER_TOKENS,
        /** Run the cleaned text through {@code DocumentPreprocessor} before writing it. */
        USE_PREPROCESSOR
    }
}

