/*
 * Decompiled with CFR 0.152.
 */
package edu.ucla.sspace.text;

import edu.ucla.sspace.text.StringUtils;
import edu.ucla.sspace.text.WordIterator;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

/**
 * Normalizes raw document text into a whitespace-separated, lower-case token
 * stream.
 *
 * <p>Processing replaces URLs/e-mail addresses, integers, lone slashes and a
 * fixed set of emoticons with placeholder tokens, surrounds punctuation with
 * spaces, drops over-long tokens and strips characters outside a small
 * allowed set.  When the instance was built with a list of valid words, a
 * document whose fraction of valid tokens falls below
 * {@link #MIN_VALID_TOKEN_RATIO} is rejected (an empty string is returned).
 */
public class DocumentPreprocessor {
    /** Minimum fraction of tokens that must be valid words for a document to be kept. */
    private static final double MIN_VALID_TOKEN_RATIO = 0.4;

    /** Tokens longer than this many characters are discarded. */
    private static final int MAX_TOKEN_LENGTH = 20;

    /** Matches anything that looks like a markup tag (non-greedy, single line). */
    private static final Pattern HTML_TAG = Pattern.compile("<.*?>");

    /** Matches a token made up solely of ASCII digits. */
    private static final Pattern DIGITS = Pattern.compile("[0-9]+");

    /** Matches every character that is not allowed in the final output. */
    private static final Pattern DISALLOWED_CHARS =
        Pattern.compile("[^\\w\\s;:\\(\\)\\[\\]'!/&?\",\\.<>]");

    /** Emoticons that are collapsed into the {@code <EMOTE>} placeholder. */
    private static final Set<String> EMOTICONS = new HashSet<String>(Arrays.asList(
        ":)", ":(", ":/", ":\\", ":|", ":[", ":]", ":X", ":D"));

    /** Punctuation marks that get a space inserted on both sides. */
    private static final String[] SPACED_PUNCTUATION = {
        "'", "!", ".", "?", ";", ",", "(", ")", "[", "]", "/", ":", "\"", "-", "="
    };

    /**
     * Punctuation and placeholder tokens that always count as valid words.
     * Placeholders are listed in lower case because the text is lower-cased
     * before the valid-word check runs.
     */
    private static final String[] KEY_TOKENS = {
        "'", "!", ".", "?", ",", ";", "(", ")", "[", "]", "/", ":", "\"", "&",
        "<", ">", "<num>", "<url>", "<emote>", "<slash>", "dollars"
    };

    /** Hashes of documents already seen; not referenced by the methods of this class. */
    private final Set<DocHash> processedDocs =
        Collections.synchronizedSet(new HashSet<DocHash>());

    /** Accepted tokens; when empty, no valid-word filtering is performed. */
    private final Set<String> validWords = new HashSet<String>();

    /** Creates a preprocessor that performs no valid-word filtering. */
    public DocumentPreprocessor() {
    }

    /**
     * Creates a preprocessor whose valid words are the tokens that
     * {@link WordIterator} yields for the given file, plus the built-in key
     * tokens.
     *
     * @param file the word-list file to read
     * @throws IOException if the file cannot be opened, read or closed
     */
    public DocumentPreprocessor(File file) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            WordIterator wordIterator = new WordIterator(reader);
            while (wordIterator.hasNext()) {
                this.validWords.add(wordIterator.next());
            }
        } finally {
            // Always release the file handle, even if reading fails part-way.
            reader.close();
        }
        this.addKeyTokens();
    }

    /**
     * Creates a preprocessor whose valid words are the given strings plus the
     * built-in key tokens.
     *
     * @param stringArray the words to accept
     */
    public DocumentPreprocessor(String[] stringArray) {
        this.validWords.addAll(Arrays.asList(stringArray));
        this.addKeyTokens();
    }

    /** Registers the punctuation and placeholder tokens as valid words. */
    private void addKeyTokens() {
        this.validWords.addAll(Arrays.asList(KEY_TOKENS));
    }

    /**
     * Processes a document without removing invalid words; equivalent to
     * {@code process(string, false)}.
     *
     * @param string the raw document text
     * @return the normalized text, or an empty string if the document is rejected
     */
    public String process(String string) {
        return this.process(string, false);
    }

    /**
     * Normalizes a document.
     *
     * <p>Steps, in order:
     * <ol>
     *   <li>pass the text through {@code StringUtils.unescapeHTML} (presumably
     *       decodes HTML entities), delete tag-like spans and put spaces
     *       around any remaining angle brackets;</li>
     *   <li>rewrite individual tokens (see {@link #normalizeToken});</li>
     *   <li>surround punctuation marks with spaces;</li>
     *   <li>drop tokens longer than {@value #MAX_TOKEN_LENGTH} characters and
     *       lower-case the text;</li>
     *   <li>turn {@code $<digits>} into {@code <num> dollars} and drop every
     *       other token that starts with {@code $};</li>
     *   <li>delete all characters outside the allowed set;</li>
     *   <li>if valid words were configured, reject the document when too few
     *       tokens are valid.</li>
     * </ol>
     *
     * @param string the raw document text
     * @param bl if {@code true} and valid words were configured, the result
     *        contains only the valid tokens, each followed by one space
     * @return the normalized text, or an empty string if valid words were
     *         configured and fewer than 40% of the tokens are valid
     */
    public String process(String string, boolean bl) {
        String text = StringUtils.unescapeHTML(string);
        text = HTML_TAG.matcher(text).replaceAll("");
        text = text.replace("<", " < ").replace(">", " > ");

        text = replaceSpecialTokens(text);
        for (String mark : SPACED_PUNCTUATION) {
            text = text.replace(mark, " " + mark + " ");
        }

        // Fixed locale: the default locale could map ASCII letters to
        // non-ASCII ones, which the character filter below would then delete.
        text = dropLongTokens(text).toLowerCase(Locale.ENGLISH);
        text = rewriteDollarAmounts(text);
        text = DISALLOWED_CHARS.matcher(text).replaceAll("");

        if (this.validWords.isEmpty()) {
            return text;
        }
        return this.filterByValidWords(text, bl);
    }

    /** Applies {@link #normalizeToken} to every whitespace-separated token. */
    private static String replaceSpecialTokens(String text) {
        StringBuilder out = new StringBuilder(text.length());
        StringTokenizer tokens = new StringTokenizer(text);
        while (tokens.hasMoreTokens()) {
            out.append(normalizeToken(tokens.nextToken())).append(" ");
        }
        return out.toString().trim();
    }

    /**
     * Rewrites a single token.  The checks are ordered: a trailing
     * {@code ?}, {@code ,} or {@code .} is split off first, so such a token is
     * never turned into a placeholder.
     */
    private static String normalizeToken(String token) {
        if (token.endsWith("?")) {
            return token.substring(0, token.length() - 1) + " ?";
        }
        if (token.endsWith(",")) {
            return token.substring(0, token.length() - 1) + " ,";
        }
        if (token.endsWith(".")) {
            return token.substring(0, token.length() - 1) + " .";
        }
        // E-mail-like tokens share the URL placeholder.
        if (token.contains("@") && token.contains(".")) {
            return "<URL>";
        }
        if (token.startsWith("http") || token.startsWith("ftp")) {
            return "<URL>";
        }
        if (DIGITS.matcher(token).matches()) {
            return "<NUM>";
        }
        if (token.equals("/")) {
            return "<slash>";
        }
        if (EMOTICONS.contains(token)) {
            return "<EMOTE>";
        }
        return token;
    }

    /** Removes every token longer than {@link #MAX_TOKEN_LENGTH} characters. */
    private static String dropLongTokens(String text) {
        StringBuilder out = new StringBuilder(text.length());
        StringTokenizer tokens = new StringTokenizer(text);
        while (tokens.hasMoreTokens()) {
            String token = tokens.nextToken();
            if (token.length() <= MAX_TOKEN_LENGTH) {
                out.append(token).append(" ");
            }
        }
        return out.toString().trim();
    }

    /**
     * Replaces {@code $<digits>} with {@code <num> dollars}; any other token
     * that starts with {@code $} is dropped.
     */
    private static String rewriteDollarAmounts(String text) {
        StringBuilder out = new StringBuilder(text.length());
        StringTokenizer tokens = new StringTokenizer(text);
        while (tokens.hasMoreTokens()) {
            String token = tokens.nextToken();
            if (!token.startsWith("$")) {
                out.append(token).append(" ");
            } else if (DIGITS.matcher(token.substring(1)).matches()) {
                out.append("<num>").append(" dollars ");
            }
        }
        return out.toString().trim();
    }

    /**
     * Applies the valid-word check to already-normalized text.
     *
     * @param text the normalized text
     * @param keepOnlyValid whether to return only the valid tokens
     * @return an empty string if the valid fraction is below the threshold;
     *         otherwise the valid tokens (each followed by a space) when
     *         {@code keepOnlyValid} is set, or {@code text} unchanged
     */
    private String filterByValidWords(String text, boolean keepOnlyValid) {
        int total = 0;
        int valid = 0;
        StringBuilder kept = new StringBuilder(text.length());
        StringTokenizer tokens = new StringTokenizer(text);
        while (tokens.hasMoreTokens()) {
            String token = tokens.nextToken();
            ++total;
            if (!this.validWords.contains(token)) {
                continue;
            }
            ++valid;
            if (keepOnlyValid) {
                kept.append(token).append(" ");
            }
        }
        // With no tokens the ratio is NaN, which is not below the threshold,
        // so a token-free document is passed through rather than rejected.
        if ((double) valid / (double) total < MIN_VALID_TOKEN_RATIO) {
            return "";
        }
        return keepOnlyValid ? kept.toString() : text;
    }

    /** MD5-based identity of a document's text, usable as a hash-set element. */
    private static class DocHash {
        private final byte[] hash;
        private final int hashCode;

        /**
         * @param string the document text to fingerprint
         */
        public DocHash(String string) {
            this.hash = DocHash.hash(string);
            // Mask each byte so a negative value cannot sign-extend over the
            // higher-order bits when the four bytes are packed into an int.
            this.hashCode = (this.hash[3] & 0xFF) << 24
                | (this.hash[2] & 0xFF) << 16
                | (this.hash[1] & 0xFF) << 8
                | (this.hash[0] & 0xFF);
        }

        /** Two hashes are equal when their digest bytes are identical. */
        @Override
        public boolean equals(Object object) {
            return object instanceof DocHash
                && Arrays.equals(this.hash, ((DocHash) object).hash);
        }

        /** Computes the MD5 digest of the string's bytes in the platform default charset. */
        private static byte[] hash(String string) {
            try {
                MessageDigest messageDigest = MessageDigest.getInstance("MD5");
                return messageDigest.digest(string.getBytes());
            }
            catch (NoSuchAlgorithmException noSuchAlgorithmException) {
                throw new Error(noSuchAlgorithmException);
            }
        }

        @Override
        public int hashCode() {
            return this.hashCode;
        }
    }
}

