package edu.stanford.nlp.process;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.objectbank.XMLBeginEndIterator;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.util.Function;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.web.HTMLParser;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/process/DocumentPreprocessor.class */
/**
 * Turns a document into a list of words or a list of sentences (each sentence being a list of
 * {@link HasWord}).
 *
 * <p>Input may be supplied as a {@link Reader}, or as a {@code String} naming either a local file
 * or an {@code http(s)://} / {@code ftp(s)://} URL. Plain text, XML (content of a given element)
 * and HTML input are supported. Readers opened by this class from a path/URL are closed by this
 * class; readers passed in by the caller are left open for the caller to close.
 */
public class DocumentPreprocessor {
    private static final boolean DEBUG = false;
    private TokenizerFactory tokenizerFactory;
    private String encoding;
    private String[] sentenceFinalPuncWords;
    /** Matches strings that should be fetched as URLs rather than opened as local files. */
    private static final Pattern urlPattern = Pattern.compile("(?:ht|f)tps?://.*?");
    /** Input formats understood by {@link #main(String[])}. */
    private static final int PLAIN = 0;
    private static final int XML = 1;
    private static final int HTML = 2;

    /** Applies a per-sentence function to every sentence of a document. */
    public static class ListEscaper implements Function<List<List<HasWord>>, List<List<HasWord>>> {
        Function<List<HasWord>, List<HasWord>> f;

        /**
         * @param function the function applied to each sentence
         */
        public ListEscaper(Function<List<HasWord>, List<HasWord>> function) {
            this.f = function;
        }

        /**
         * Returns a new list holding the result of applying the wrapped function to each
         * sentence of {@code list}, in order.
         */
        @Override // edu.stanford.nlp.util.Function
        public List<List<HasWord>> apply(List<List<HasWord>> list) {
            List<List<HasWord>> escaped = new ArrayList<List<HasWord>>(list.size());
            for (List<HasWord> sentence : list) {
                escaped.add(this.f.apply(sentence));
            }
            return escaped;
        }
    }

    /** Identity function: returns each sentence unchanged. */
    public static class NullEscaper implements Function<List<HasWord>, List<HasWord>> {
        private NullEscaper() {
        }

        @Override // edu.stanford.nlp.util.Function
        public List<HasWord> apply(List<HasWord> list) {
            return list;
        }
    }

    /**
     * Creates a preprocessor that tokenizes with the given factory.
     */
    public DocumentPreprocessor(TokenizerFactory tokenizerFactory) {
        this.tokenizerFactory = tokenizerFactory;
    }

    /** Creates a preprocessor that tokenizes with {@code PTBTokenizer.factory()}. */
    public DocumentPreprocessor() {
        this.tokenizerFactory = PTBTokenizer.factory();
    }

    /**
     * Creates a preprocessor that tokenizes with {@code PTBTokenizer.factory(false, false, z)}.
     *
     * @param z forwarded as the third factory argument. NOTE(review): {@link #main(String[])}
     *     passes its "-suppressEscaping" flag here, so this presumably disables escaping —
     *     confirm against PTBTokenizer.
     */
    public DocumentPreprocessor(boolean z) {
        this.tokenizerFactory = PTBTokenizer.factory(false, false, z);
    }

    /**
     * Sets the character encoding used when opening local files by path. When unset
     * ({@code null}), files are opened with {@link FileReader}.
     */
    public void setEncoding(String str) {
        this.encoding = str;
    }

    /**
     * Sets the words handed to the sentence splitter's constructor; {@code null} makes the
     * splitter use its no-argument constructor.
     */
    public void setSentenceFinalPuncWords(String[] strArr) {
        this.sentenceFinalPuncWords = strArr;
    }

    /** Replaces the tokenizer factory. */
    public void setTokenizerFactory(TokenizerFactory tokenizerFactory) {
        this.tokenizerFactory = tokenizerFactory;
    }

    /** Switches tokenization to {@code PTBTokenizer.factory()}. */
    public void usePTBTokenizer() {
        this.tokenizerFactory = PTBTokenizer.factory();
    }

    /** Switches tokenization to {@code WhitespaceTokenizer.factory()}. */
    public void useWhitespaceTokenizer() {
        this.tokenizerFactory = WhitespaceTokenizer.factory();
    }

    /**
     * Tokenizes the plain-text document at the given file path or URL.
     *
     * @throws IOException if the file or URL cannot be opened
     */
    public List<Word> getWordsFromText(String str) throws IOException {
        Reader reader = fileOrURLToReader(str);
        try {
            return getWordsFromText(reader);
        } finally {
            closeQuietly(reader);
        }
    }

    /** Tokenizes everything readable from {@code reader} with the current tokenizer factory. */
    @SuppressWarnings("unchecked")
    public List<Word> getWordsFromText(Reader reader) {
        return this.tokenizerFactory.getTokenizer(new BufferedReader(reader)).tokenize();
    }

    /**
     * Splits the plain-text document at the given file path or URL into sentences, without
     * escaping, without a sentence delimiter and without tags.
     *
     * @throws IOException if the file or URL cannot be opened
     */
    public List<List<? extends HasWord>> getSentencesFromText(String str) throws IOException {
        Reader reader = fileOrURLToReader(str);
        try {
            return getSentencesFromText(reader);
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Splits the plain-text document at the given file path or URL into sentences.
     *
     * @param str file path or URL
     * @param z {@code true} to pass each sentence through a {@code PTBEscapingProcessor}
     * @param str2 sentence delimiter token, or {@code null} to use the sentence splitter
     * @param i tag delimiter character as an int, or a negative value for untagged input
     * @throws IOException if the file or URL cannot be opened
     */
    public List<List<? extends HasWord>> getSentencesFromText(String str, boolean z, String str2, int i) throws IOException {
        Reader reader = fileOrURLToReader(str);
        try {
            return getSentencesFromText(reader, z, str2, i);
        } finally {
            closeQuietly(reader);
        }
    }

    /** Splits the text from {@code reader} into sentences with no escaping, delimiter or tags. */
    public List<List<? extends HasWord>> getSentencesFromText(Reader reader) {
        return getSentencesFromText(reader, false, (String) null, -1);
    }

    /**
     * Splits the plain-text document at the given file path or URL into sentences.
     *
     * @param str file path or URL
     * @param function applied to token lists before sentence splitting; {@code null} means identity
     * @param str2 sentence delimiter token, or {@code null} to use the sentence splitter
     * @param i tag delimiter character as an int, or a negative value for untagged input
     * @throws IOException if the file or URL cannot be opened
     */
    public List<List<? extends HasWord>> getSentencesFromText(String str, Function<List<HasWord>, List<HasWord>> function, String str2, int i) throws IOException {
        Reader reader = fileOrURLToReader(str);
        try {
            return getSentencesFromText(reader, function, str2, i);
        } finally {
            closeQuietly(reader);
        }
    }

    /** Splits the text from {@code reader} into sentences on the delimiter token {@code str}. */
    public List<List<? extends HasWord>> getSentencesFromText(Reader reader, String str) {
        return getSentencesFromText(reader, (Function<List<HasWord>, List<HasWord>>) null, str, -1);
    }

    /**
     * Core sentence extraction.
     *
     * <p>Behaviour depends on the configured tokenizer factory:
     * <ul>
     *   <li>Not a whitespace tokenizer factory: tags cannot be read ({@code i >= 0} is rejected).
     *       With a delimiter, the text is whitespace-split, cut on the delimiter token, each
     *       piece is glued back into a string and re-tokenized with the configured factory.
     *       Without a delimiter, the text is tokenized and handed to the sentence splitter.</li>
     *   <li>Whitespace tokenizer factory: with a delimiter, the tokens are cut on the delimiter
     *       token (line ends are tokens only when the delimiter is {@code "\n"}), optionally
     *       tagged, then escaped per sentence. Without a delimiter, the tokens are optionally
     *       tagged, escaped and handed to the sentence splitter.</li>
     * </ul>
     *
     * @param reader source text; not closed by this method
     * @param function applied to token lists; {@code null} means identity
     * @param str sentence delimiter token, or {@code null}
     * @param i tag delimiter character as an int, or a negative value for untagged input
     * @throws RuntimeException if {@code i >= 0} and the tokenizer factory is not a whitespace
     *     tokenizer factory
     */
    @SuppressWarnings("unchecked")
    public List<List<? extends HasWord>> getSentencesFromText(Reader reader, Function<List<HasWord>, List<HasWord>> function, String str, int i) {
        if (function == null) {
            function = new NullEscaper();
        }
        // The (List) casts below deliberately go through the raw type: the tokenizer and
        // processor classes are used as raw types in this file, so element types are unchecked.
        if (!(this.tokenizerFactory instanceof WhitespaceTokenizer.WhitespaceTokenizerFactory)) {
            if (i >= 0) {
                throw new RuntimeException("Can't read tags from untokenized document.");
            }
            if (str != null) {
                List<HasWord> roughTokens = (List) new WhitespaceTokenizer(reader, true).tokenize();
                return tokenizeSentences(glueSentences(splitListsOnToken(function.apply(roughTokens), str)));
            }
            WordToSentenceProcessor splitter = newSentenceSplitter();
            List<HasWord> tokens = (List) this.tokenizerFactory.getTokenizer(new BufferedReader(reader)).tokenize();
            return (List) splitter.process(function.apply(tokens));
        }
        if (str != null) {
            List<HasWord> tokens = (List) new WhitespaceTokenizer(reader, str.equals("\n")).tokenize();
            List sentences = splitListsOnToken(tokens, str);
            if (i >= 0) {
                sentences = tagSplitSentences(sentences, i);
            }
            return (List) new ListEscaper(function).apply(sentences);
        }
        List<HasWord> words = (List) new WhitespaceTokenizer(reader, false).tokenize();
        if (i >= 0) {
            words = (List) new WordToTaggedWordProcessor((char) i).process(words);
        }
        WordToSentenceProcessor splitter = newSentenceSplitter();
        return (List) splitter.process(function.apply(words));
    }

    /** Tokenizes the given string itself (not a path) with the current tokenizer factory. */
    @SuppressWarnings("unchecked")
    public List<Word> getWordsFromString(String str) {
        return this.tokenizerFactory.getTokenizer(new BufferedReader(new StringReader(str))).tokenize();
    }

    /**
     * Extracts sentences from the content of XML element {@code str2} in the document at file
     * path or URL {@code str}, with PTB escaping enabled and no sentence delimiter.
     *
     * @throws IOException if the file or URL cannot be opened
     */
    public List<List<? extends HasWord>> getSentencesFromXML(String str, String str2) throws IOException {
        return getSentencesFromXML(str, str2, (String) null, true);
    }

    /**
     * As {@link #getSentencesFromXML(String, String)}, with PTB escaping controlled by {@code z}.
     *
     * @throws IOException if the file or URL cannot be opened
     */
    public List<List<? extends HasWord>> getSentencesFromXML(String str, String str2, boolean z) throws IOException {
        return getSentencesFromXML(str, str2, (String) null, z);
    }

    /**
     * Extracts sentences from the content of an XML element.
     *
     * @param str file path or URL of the XML document
     * @param str2 element name passed to {@code XMLBeginEndIterator}
     * @param str3 sentence delimiter token, {@code "onePerElement"}, or {@code null}
     * @param z {@code true} to pass each sentence through a {@code PTBEscapingProcessor}
     * @throws IOException if the file or URL cannot be opened
     */
    public List<List<? extends HasWord>> getSentencesFromXML(String str, String str2, String str3, boolean z) throws IOException {
        Reader reader = fileOrURLToReader(str);
        try {
            return getSentencesFromXML(reader, str2, str3, z);
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Extracts sentences from the content of XML element {@code str} read from {@code reader}.
     *
     * @param str2 sentence delimiter token, {@code "onePerElement"}, or {@code null}
     * @param z {@code true} to pass each sentence through a {@code PTBEscapingProcessor}
     */
    public List<List<? extends HasWord>> getSentencesFromXML(Reader reader, String str, String str2, boolean z) {
        return getSentencesFromXML(reader, escaperFor(z), str, str2);
    }

    /**
     * Extracts sentences from XML element {@code str2} of the document at file path or URL
     * {@code str}, applying {@code function} and using no sentence delimiter.
     *
     * @throws IOException if the file or URL cannot be opened
     */
    public List<List<? extends HasWord>> getSentencesFromXML(String str, Function<List<HasWord>, List<HasWord>> function, String str2) throws IOException {
        Reader reader = fileOrURLToReader(str);
        try {
            return getSentencesFromXML(reader, function, str2, (String) null);
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Extracts sentences from XML element {@code str2} of the document at file path or URL
     * {@code str}, applying {@code function} and splitting on delimiter {@code str3}.
     *
     * @throws IOException if the file or URL cannot be opened
     */
    public List<List<? extends HasWord>> getSentencesFromXML(String str, Function<List<HasWord>, List<HasWord>> function, String str2, String str3) throws IOException {
        Reader reader = fileOrURLToReader(str);
        try {
            return getSentencesFromXML(reader, function, str2, str3);
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Extracts sentences from every occurrence of XML element {@code str} in {@code reader} and
     * concatenates them in document order.
     *
     * @param reader source document; not closed by this method
     * @param function applied to token lists; {@code null} means identity
     * @param str element name passed to {@code XMLBeginEndIterator}
     * @param str2 sentence delimiter token, or {@code null}. The special value
     *     {@code "onePerElement"} is replaced by a delimiter that is not expected to occur as a
     *     token, so the element text is never cut on it.
     */
    public List<List<? extends HasWord>> getSentencesFromXML(Reader reader, Function<List<HasWord>, List<HasWord>> function, String str, String str2) {
        List<List<? extends HasWord>> sentences = new ArrayList<List<? extends HasWord>>();
        if ("onePerElement".equals(str2)) {
            str2 = ".$.onePerElement.$.";
        }
        XMLBeginEndIterator elements = new XMLBeginEndIterator(reader, str);
        while (elements.hasNext()) {
            String elementText = (String) elements.next();
            sentences.addAll(getSentencesFromText(new BufferedReader(new StringReader(elementText)), function, str2, -1));
        }
        return sentences;
    }

    /**
     * Tokenizes the text of the HTML document at the given file path or URL.
     *
     * @return the words, or {@code null} if parsing the HTML failed
     * @throws IOException if the file or URL cannot be opened
     */
    public List<Word> getWordsFromHTML(String str) throws IOException {
        Reader reader = fileOrURLToReader(str);
        try {
            return getWordsFromHTML(reader);
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Tokenizes the text that {@code HTMLParser} extracts from {@code reader}.
     *
     * @return the words, or {@code null} if {@code HTMLParser.parse} throws an
     *     {@link IOException} (the message is printed to {@code System.err})
     */
    public List<Word> getWordsFromHTML(Reader reader) {
        try {
            return getWordsFromText(new StringReader(new HTMLParser().parse(reader)));
        } catch (IOException e) {
            System.err.println("IOException" + e.getMessage());
            return null;
        }
    }

    /**
     * Splits the text of the HTML document at the given file path or URL into sentences.
     *
     * @return the sentences, or {@code null} if parsing the HTML failed
     * @throws IOException if the file or URL cannot be opened
     */
    public List<List<? extends HasWord>> getSentencesFromHTML(String str) throws IOException {
        Reader reader = fileOrURLToReader(str);
        try {
            return getSentencesFromHTML(reader);
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Splits the text that {@code HTMLParser} extracts from {@code reader} into sentences.
     *
     * @return the sentences, or {@code null} if {@code HTMLParser.parse} throws an
     *     {@link IOException} (the message is printed to {@code System.err})
     */
    public List<List<? extends HasWord>> getSentencesFromHTML(Reader reader) {
        try {
            return getSentencesFromText(new StringReader(new HTMLParser().parse(reader)));
        } catch (IOException e) {
            System.err.println("IOException" + e.getMessage());
            return null;
        }
    }

    /** Boolean-flag convenience overload of the core sentence extraction. */
    private List<List<? extends HasWord>> getSentencesFromText(Reader reader, boolean z, String str, int i) {
        return getSentencesFromText(reader, escaperFor(z), str, i);
    }

    /** Returns a {@code PTBEscapingProcessor} when escaping is requested, else the identity. */
    @SuppressWarnings("unchecked")
    private static Function<List<HasWord>, List<HasWord>> escaperFor(boolean escape) {
        Function<List<HasWord>, List<HasWord>> escaper;
        if (escape) {
            escaper = new PTBEscapingProcessor();
        } else {
            escaper = new NullEscaper();
        }
        return escaper;
    }

    /** Builds the sentence splitter, honouring {@link #setSentenceFinalPuncWords} when set. */
    @SuppressWarnings("unchecked")
    private WordToSentenceProcessor newSentenceSplitter() {
        if (this.sentenceFinalPuncWords != null) {
            return new WordToSentenceProcessor(new HashSet<String>(Arrays.asList(this.sentenceFinalPuncWords)));
        }
        return new WordToSentenceProcessor();
    }

    /**
     * Cuts {@code list} into sentences at every token whose word equals {@code str}. The
     * delimiter tokens are dropped. A delimiter always closes the current sentence, even an
     * empty one; a trailing sentence is kept only if it is non-empty.
     */
    private static List<List<HasWord>> splitListsOnToken(List<HasWord> list, String str) {
        List<List<HasWord>> sentences = new ArrayList<List<HasWord>>();
        List<HasWord> current = new ArrayList<HasWord>();
        for (HasWord token : list) {
            if (token.word().equals(str)) {
                sentences.add(current);
                current = new ArrayList<HasWord>();
            } else {
                current.add(token);
            }
        }
        if (!current.isEmpty()) {
            sentences.add(current);
        }
        return sentences;
    }

    /** Joins each sentence's words with single spaces. */
    private static List<String> glueSentences(List<List<HasWord>> list) {
        List<String> glued = new ArrayList<String>();
        for (List<HasWord> sentence : list) {
            glued.add(glueSentence(sentence));
        }
        return glued;
    }

    /** Joins the words of one sentence with single spaces; an empty sentence gives "". */
    private static String glueSentence(List<HasWord> list) {
        StringBuilder sb = new StringBuilder();
        if (!list.isEmpty()) {
            sb.append(list.get(0).word());
            int size = list.size();
            for (int i = 1; i < size; i++) {
                sb.append(" ").append(list.get(i).word());
            }
        }
        return sb.toString();
    }

    /** Tokenizes each sentence string with the configured tokenizer factory. */
    @SuppressWarnings("unchecked")
    private List<List<? extends HasWord>> tokenizeSentences(List<String> list) {
        List<List<? extends HasWord>> tokenized = new ArrayList<List<? extends HasWord>>();
        for (String sentence : list) {
            tokenized.add((List) this.tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize());
        }
        return tokenized;
    }

    /** Runs a {@code WordToTaggedWordProcessor} with delimiter {@code (char) i} over each sentence. */
    @SuppressWarnings("unchecked")
    private static List<List<? extends HasWord>> tagSplitSentences(List<List<HasWord>> list, int i) {
        List<List<? extends HasWord>> tagged = new ArrayList<List<? extends HasWord>>();
        WordToTaggedWordProcessor tagger = new WordToTaggedWordProcessor((char) i);
        for (List<HasWord> sentence : list) {
            tagged.add((List) tagger.process(sentence));
        }
        return tagged;
    }

    /**
     * Opens {@code str} for reading. Strings matching {@link #urlPattern} are downloaded in full
     * and served from memory; anything else is opened as a local file, using the configured
     * encoding when one is set.
     *
     * @throws IOException if the URL or file cannot be read, or the encoding is unsupported
     */
    private Reader fileOrURLToReader(String str) throws IOException {
        System.err.println(str);
        if (urlPattern.matcher(str).matches()) {
            return new BufferedReader(new StringReader(StringUtils.slurpURL(new URL(str))));
        }
        if (this.encoding == null) {
            return new FileReader(str);
        }
        FileInputStream in = new FileInputStream(str);
        try {
            return new BufferedReader(new InputStreamReader(in, this.encoding));
        } catch (IOException e) {
            // An unsupported encoding name must not leave the file handle open.
            try {
                in.close();
            } catch (IOException ignored) {
                // The encoding failure is the error worth reporting.
            }
            throw e;
        }
    }

    /** Closes a reader opened by this class, ignoring failures: its content was already read. */
    private static void closeQuietly(Reader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close of an input source.
        }
    }

    /**
     * Command-line driver. Recognised options: {@code -file <path-or-url>} (required),
     * {@code -xml <tag>}, {@code -html}, {@code -noSplitSentence}, {@code -suppressEscaping},
     * {@code -noTokenization}, {@code -plainOutput}. Unknown options are ignored. Sentences are
     * written to standard output, diagnostics to standard error.
     *
     * @throws IOException if the input cannot be opened
     */
    public static void main(String[] strArr) throws IOException {
        final String usage = "usage: DocumentPreprocessor -file filename [-xml tag|-html] [-noSplitSentence]";
        if (strArr.length == 0) {
            System.err.println(usage);
            return;
        }
        boolean splitSentences = true;
        boolean suppressEscaping = false;
        boolean noTokenization = false;
        boolean plainOutput = false;
        String xmlTag = null;
        int inputFormat = PLAIN;
        String file = null;
        for (int i = 0; i < strArr.length; i++) {
            String arg = strArr[i];
            if (arg.equals("-file")) {
                if (++i >= strArr.length) {
                    // "-file" given as the last argument: no value to read.
                    System.err.println(usage);
                    return;
                }
                file = strArr[i];
            } else if (arg.equals("-xml")) {
                if (++i >= strArr.length) {
                    System.err.println(usage);
                    return;
                }
                inputFormat = XML;
                xmlTag = strArr[i];
            } else if (arg.equals("-html")) {
                inputFormat = HTML;
            } else if (arg.equals("-noSplitSentence")) {
                splitSentences = false;
            } else if (arg.equals("-suppressEscaping")) {
                suppressEscaping = true;
            } else if (arg.equals("-noTokenization")) {
                noTokenization = true;
            } else if (arg.equals("-plainOutput")) {
                plainOutput = true;
            }
        }
        if (file == null) {
            System.err.println(usage);
            return;
        }
        DocumentPreprocessor documentPreprocessor = noTokenization
                ? new DocumentPreprocessor(WhitespaceTokenizer.factory(true))
                : new DocumentPreprocessor(suppressEscaping);
        System.err.println("Tokenizer: " + documentPreprocessor.tokenizerFactory.getClass());
        List<List<? extends HasWord>> sentences = new ArrayList<List<? extends HasWord>>();
        switch (inputFormat) {
            case PLAIN:
                if (splitSentences) {
                    sentences = documentPreprocessor.getSentencesFromText(file);
                } else {
                    sentences.add(documentPreprocessor.getWordsFromText(file));
                }
                break;
            case XML:
                sentences = documentPreprocessor.getSentencesFromXML(file, xmlTag, !suppressEscaping);
                break;
            case HTML:
                if (splitSentences) {
                    sentences = documentPreprocessor.getSentencesFromHTML(file);
                } else {
                    List<Word> words = documentPreprocessor.getWordsFromHTML(file);
                    if (words == null) {
                        // HTML parsing failed; the error was already reported on System.err.
                        return;
                    }
                    sentences.add(words);
                }
                break;
            default:
                break;
        }
        if (sentences == null) {
            // HTML parsing failed; the error was already reported on System.err.
            return;
        }
        System.err.println("Read in " + sentences.size() + " sentences.");
        for (List<? extends HasWord> sentence : sentences) {
            System.err.println("Length: " + sentence.size());
            if (plainOutput) {
                for (int j = 0; j < sentence.size(); j++) {
                    if (j > 0) {
                        System.out.print(" ");
                    }
                    System.out.print(sentence.get(j));
                }
                System.out.println();
            } else {
                System.out.println(sentence);
            }
        }
    }
}
