package bios.tokenizer;

import bios.common.WordToken;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:bios/tokenizer/Tokenizer.class */
/**
 * Regular-expression based word tokenizer.
 *
 * <p>Tokenization runs in three steps (see {@link #tokenizeWords(String)}):
 * <ol>
 *   <li>the input is scanned with {@link #RECOGNISED_PATTERN}, one match per raw token;</li>
 *   <li>raw tokens ending in {@code n't}, or carrying a trailing apostrophe + letters suffix,
 *       are split in two;</li>
 *   <li>runs of up to {@link #MAX_MULTI_WORD_SIZE} adjacent tokens whose concatenation is known
 *       to the {@code AbbreviationMap} are merged back into a single token.</li>
 * </ol>
 *
 * <p>The public {@code String} constants are regular-expression fragments assembled with the
 * static combinators ({@link #block}, {@link #or(String, String)}, {@link #oneOrMore}, ...).
 * Every combinator wraps its result in a capturing group, so the fragments can be concatenated
 * freely; only the whole match (group 0) is ever read by this class.
 */
public class Tokenizer {
    /** Lookup used to decide whether a run of adjacent tokens must be merged into one token. */
    private final AbbreviationMap mAbbreviations;

    /** Maximum number of adjacent tokens that are tried as one multi-word unit. */
    public static final int MAX_MULTI_WORD_SIZE = 20;

    // --- single characters -------------------------------------------------------------------
    public static final String DOT = block("\\.");
    /** A colon (the name is historical). */
    public static final String DOTDOT = block("\\:");
    public static final String APOSTROPHE = block("\\'");
    public static final String SLASH = block("\\/");
    public static final String UNDERSCORE = block("\\_");
    public static final String MINUS = block("\\-");
    public static final String PLUS = block("\\+");
    public static final String COMMA = block("\\,");
    /** A semicolon (the name is historical). */
    public static final String DOTCOMMA = block("\\;");
    /** Any quote mark: double quote, two apostrophes, apostrophe, two backticks, or backtick. */
    public static final String QUOTES = block(or("\\\"", "\\'\\'", "\\'", "\\`\\`", "\\`"));
    /** A double quote written either as one character or as two apostrophes. */
    public static final String DOUBLE_QUOTES = block(or("\\\"", "\\'\\'"));
    public static final String LRB = block("\\(");
    public static final String RRB = block("\\)");
    public static final String LCB = block("\\{");
    public static final String RCB = block("\\}");
    public static final String GREATER = block("\\>");
    public static final String LOWER = block("\\<");
    public static final String AMPERSAND = block("\\&");
    public static final String AT = block("\\@");
    /** Case-insensitive {@code http://} prefix. */
    public static final String HTTP = block("[hH][tT][tT][pP]\\:\\/\\/");
    public static final String WHITE_SPACE = block("\\s");
    public static final String DIGIT = block("\\d");
    /** An ASCII letter. */
    public static final String LETTER = block("[a-zA-Z]");
    /** An ASCII upper-case letter. */
    public static final String UPPER = block("[A-Z]");

    // --- numbers, dates, times ---------------------------------------------------------------
    /**
     * A single sign character, {@code -} or {@code +}.
     *
     * <p>The class is written with raw escapes instead of {@code MINUS + PLUS}: those fragments
     * carry group parentheses, and parentheses are ordinary characters inside {@code [...]}, so
     * the class would also have matched a literal {@code (} or {@code )}.
     */
    public static final String SIGN = range("\\-\\+");
    /** Optional sign, digits, then any number of digit groups optionally led by {@code . , /}. */
    public static final String FULLNUM = block(zeroOrOne(SIGN) + oneOrMore(DIGIT) + zeroOrMore(zeroOrOne(or(DOT, COMMA, SLASH)) + oneOrMore(DIGIT)));
    /** A dot followed by digits, e.g. {@code .5}. */
    public static final String DECNUM = block(DOT + oneOrMore(DIGIT));
    public static final String NUM = or(FULLNUM, DECNUM);
    /** Three digit groups separated by slashes. */
    public static final String DATE = block(oneOrMore(DIGIT) + SLASH + oneOrMore(DIGIT) + SLASH + oneOrMore(DIGIT));
    /** Digits followed by one or more colon + digits groups. */
    public static final String TIME = block(oneOrMore(DIGIT) + oneOrMore(block(DOTDOT + oneOrMore(DIGIT))));

    // --- words and punctuation ---------------------------------------------------------------
    /** A quote mark, a run of two or more minus signs, or a run of two or more dots. */
    public static final String PUNC = or(QUOTES, block(MINUS + oneOrMore(MINUS)), block(DOT + oneOrMore(DOT)));
    public static final String LETTERS = oneOrMore(LETTER);
    /** A number or a run of letters: the building block of {@link #WORD}. */
    public static final String BLOCK = or(NUM, LETTERS);
    /**
     * Optional leading apostrophe, a {@link #BLOCK}, then further blocks each optionally joined
     * by one of underscore, minus, apostrophe, slash or ampersand.
     */
    public static final String WORD = block(zeroOrOne(APOSTROPHE) + BLOCK + zeroOrMore(block(zeroOrOne(or(UNDERSCORE, MINUS, APOSTROPHE, SLASH, AMPERSAND)) + BLOCK)));
    /** One or more (single letter, dot) pairs, e.g. {@code U.S.}. */
    public static final String ACRONYM = block(oneOrMore(LETTER + DOT));
    /** One or more (letters, dot) groups followed by optional trailing letters. */
    public static final String LOOSE_ACRONYM = block(oneOrMore(oneOrMore(LETTER) + DOT) + zeroOrMore(LETTER));
    public static final String PAREN = or(LRB, RRB, LCB, RCB);
    /** Ampersand, one upper-case letter, semicolon. */
    public static final String HTMLCODE = block(AMPERSAND + UPPER + DOTCOMMA);
    /** Any single non-whitespace character; the fallback alternative. */
    public static final String ANY = block("\\S");
    public static final String EMAIL = block(LETTER + zeroOrMore(or(LETTER, DIGIT, DOT, MINUS, UNDERSCORE)) + AT + zeroOrMore(or(LETTER, DIGIT, DOT, MINUS, UNDERSCORE)) + LETTER);
    public static final String URL = block(HTTP + oneOrMore(or(LETTER, DIGIT, DOT, UNDERSCORE, SLASH, AMPERSAND, MINUS, PLUS)));
    /** An angle-bracketed tag containing no nested angle brackets. */
    public static final String SGML = "<[^<>]+>";
    /**
     * The full token pattern. Alternatives are tried left to right, so the more specific shapes
     * come first and {@link #ANY} catches whatever is left.
     */
    public static final String RECOGNISED_PATTERN = or(TIME, EMAIL, URL, ACRONYM, DATE, WORD, PUNC, PAREN, SGML, HTMLCODE, ANY);

    // Compiled once: Pattern instances are immutable and safe to share.
    private static final Pattern wordPattern = Pattern.compile(RECOGNISED_PATTERN);
    private static final Pattern sgmlPattern = Pattern.compile(SGML);
    private static final Pattern slashDatePattern = Pattern.compile(DATE);
    private static final Pattern acronymPattern = Pattern.compile(LOOSE_ACRONYM);
    private static final Pattern urlPattern = Pattern.compile(URL);
    private static final Pattern emailPattern = Pattern.compile(EMAIL);

    /**
     * Creates a tokenizer backed by an {@code AbbreviationMap} built from {@code str}.
     *
     * @param str argument handed unchanged to the {@code AbbreviationMap} constructor
     *            (presumably the path of the multi-words file, as suggested by the usage
     *            message of {@link #main} - confirm against {@code AbbreviationMap})
     * @throws IOException if the {@code AbbreviationMap} constructor fails
     */
    public Tokenizer(String str) throws IOException {
        this.mAbbreviations = new AbbreviationMap(str);
    }

    /**
     * Builds a grouped character class {@code ([str])}.
     *
     * @param str the class contents, already escaped; it must not contain grouping
     *            parentheses, which are literal characters inside a character class
     * @return the grouped character class
     */
    public static String range(String str) {
        return block("[" + str + "]");
    }

    /** Returns {@code str} made optional: {@code ((str)?)}. */
    public static String zeroOrOne(String str) {
        return block(block(str) + "?");
    }

    /** Returns {@code str} repeated zero or more times: {@code ((str)*)}. */
    public static String zeroOrMore(String str) {
        return block(block(str) + "*");
    }

    /** Returns {@code str} repeated one or more times: {@code ((str)+)}. */
    public static String oneOrMore(String str) {
        return block(block(str) + "+");
    }

    /** Wraps {@code str} in a capturing group: {@code (str)}. */
    public static String block(String str) {
        return "(" + str + ")";
    }

    /**
     * Joins the given fragments into one grouped alternation, {@code ((a)|(b)|...)}.
     * Shared implementation of every {@code or} overload.
     */
    private static String alternation(String... alternatives) {
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < alternatives.length; i++) {
            if (i > 0) {
                joined.append('|');
            }
            joined.append(block(alternatives[i]));
        }
        return block(joined.toString());
    }

    /** Returns the grouped alternation of the two fragments: {@code ((str)|(str2))}. */
    public static String or(String str, String str2) {
        return alternation(str, str2);
    }

    /** Returns the grouped alternation of the three fragments. */
    public static String or(String str, String str2, String str3) {
        return alternation(str, str2, str3);
    }

    /** Returns the grouped alternation of the four fragments. */
    public static String or(String str, String str2, String str3, String str4) {
        return alternation(str, str2, str3, str4);
    }

    /** Returns the grouped alternation of the five fragments. */
    public static String or(String str, String str2, String str3, String str4, String str5) {
        return alternation(str, str2, str3, str4, str5);
    }

    /** Returns the grouped alternation of the six fragments. */
    public static String or(String str, String str2, String str3, String str4, String str5, String str6) {
        return alternation(str, str2, str3, str4, str5, str6);
    }

    /** Returns the grouped alternation of the seven fragments. */
    public static String or(String str, String str2, String str3, String str4, String str5, String str6, String str7) {
        return alternation(str, str2, str3, str4, str5, str6, str7);
    }

    /** Returns the grouped alternation of the eight fragments. */
    public static String or(String str, String str2, String str3, String str4, String str5, String str6, String str7, String str8) {
        return alternation(str, str2, str3, str4, str5, str6, str7, str8);
    }

    /** Returns the grouped alternation of the nine fragments. */
    public static String or(String str, String str2, String str3, String str4, String str5, String str6, String str7, String str8, String str9) {
        return alternation(str, str2, str3, str4, str5, str6, str7, str8, str9);
    }

    /** Returns the grouped alternation of the ten fragments. */
    public static String or(String str, String str2, String str3, String str4, String str5, String str6, String str7, String str8, String str9, String str10) {
        return alternation(str, str2, str3, str4, str5, str6, str7, str8, str9, str10);
    }

    /** Returns the grouped alternation of the eleven fragments. */
    public static String or(String str, String str2, String str3, String str4, String str5, String str6, String str7, String str8, String str9, String str10, String str11) {
        return alternation(str, str2, str3, str4, str5, str6, str7, str8, str9, str10, str11);
    }

    /**
     * Builds a grouped negated character class {@code ([^str])}.
     *
     * <p>The caret is emitted directly after the opening bracket: it only negates a class when
     * it is the first character, and a group wrapped around it would turn both the caret and
     * the parentheses into literal class members.
     *
     * @param str the characters to exclude, already escaped and without grouping parentheses
     * @return a fragment matching any single character not listed in {@code str}
     */
    public static String rangeNot(String str) {
        return block("[^" + str + "]");
    }

    /**
     * Locates a trailing "apostrophe + letters" suffix such as {@code 's} or {@code 'll}.
     *
     * <p>The string is scanned from its end towards index 1; index 0 is never inspected, so an
     * apostrophe that opens the string is not reported.
     *
     * @param str the raw token text
     * @return the index of the apostrophe when everything after it is one or more letters,
     *         otherwise {@code -1}
     */
    private static int hasApostropheBlock(String str) {
        int last = str.length() - 1;
        for (int i = last; i > 0; i--) {
            char c = str.charAt(i);
            // An apostrophe in final position has no suffix; it falls through to the letter
            // check below and ends the scan with -1.
            if (c == '\'' && i < last) {
                return i;
            }
            if (!Character.isLetter(c)) {
                return -1;
            }
        }
        return -1;
    }

    /**
     * Concatenates, without any separator, the words of {@code list[i .. i2)}.
     *
     * @param list the tokens
     * @param i    index of the first token, inclusive
     * @param i2   index past the last token, exclusive
     * @return the joined words
     */
    private static String concatenate(List<WordToken> list, int i, int i2) {
        StringBuilder joined = new StringBuilder();
        for (int k = i; k < i2; k++) {
            joined.append(list.get(k).getWord());
        }
        return joined.toString();
    }

    /**
     * Tells whether {@code str} contains a substring matching {@link #URL}.
     * This is a search, not a whole-string match.
     */
    public static boolean isUrl(String str) {
        return urlPattern.matcher(str).find();
    }

    /**
     * Tells whether {@code str} contains a substring matching {@link #EMAIL}.
     * This is a search, not a whole-string match.
     */
    public static boolean isEmail(String str) {
        return emailPattern.matcher(str).find();
    }

    /**
     * Tells whether {@code str} contains a substring matching {@link #SGML}.
     * This is a search, not a whole-string match.
     */
    public static boolean isSgml(String str) {
        return sgmlPattern.matcher(str).find();
    }

    /**
     * Tells whether {@code str} contains a substring matching {@link #DATE}.
     * This is a search, not a whole-string match.
     */
    public static boolean isSlashDate(String str) {
        return slashDatePattern.matcher(str).find();
    }

    /**
     * Tells whether {@code str} contains a substring matching {@link #LOOSE_ACRONYM}.
     * This is a search, not a whole-string match.
     */
    public static boolean isAcronym(String str) {
        return acronymPattern.matcher(str).find();
    }

    /**
     * Splits {@code str} into word tokens carrying their character offsets.
     *
     * @param str the text to tokenize
     * @return the tokens in text order, after suffix splitting and multi-word merging
     * @throws IOException declared for interface compatibility; nothing in this method's own
     *                     code raises it
     */
    public List<WordToken> tokenizeWords(String str) throws IOException {
        return mergeMultiWords(scanAndSplit(str));
    }

    /**
     * Scans {@code text} with the token pattern and splits off {@code n't} and
     * apostrophe + letters suffixes, keeping offsets relative to {@code text}.
     */
    private static List<WordToken> scanAndSplit(String text) {
        List<WordToken> tokens = new ArrayList<WordToken>();
        Matcher matcher = wordPattern.matcher(text);
        while (matcher.find()) {
            String raw = matcher.group();
            int start = matcher.start();
            int end = matcher.end();
            if (raw.endsWith("n't")) {
                // "don't" -> "do" + "n't"; a bare "n't" yields only the suffix token.
                int cut = raw.length() - 3;
                if (cut > 0) {
                    tokens.add(new WordToken(raw.substring(0, cut), start, end - 3));
                }
                tokens.add(new WordToken(raw.substring(cut), end - 3, end));
                continue;
            }
            int apostrophe = hasApostropheBlock(raw);
            if (apostrophe == -1) {
                tokens.add(new WordToken(raw, start, end));
            } else {
                // "John's" -> "John" + "'s"
                tokens.add(new WordToken(raw.substring(0, apostrophe), start, start + apostrophe));
                tokens.add(new WordToken(raw.substring(apostrophe), start + apostrophe, end));
            }
        }
        return tokens;
    }

    /**
     * Greedily merges adjacent tokens whose concatenation is contained in the abbreviation map.
     * At each position the longest window (at most {@link #MAX_MULTI_WORD_SIZE} tokens, at
     * least two) is tried first; tokens that start no known unit are copied through unchanged.
     */
    private List<WordToken> mergeMultiWords(List<WordToken> tokens) {
        List<WordToken> merged = new ArrayList<WordToken>();
        int i = 0;
        while (i < tokens.size()) {
            int limit = Math.min(tokens.size(), i + MAX_MULTI_WORD_SIZE);
            boolean found = false;
            for (int end = limit; end > i + 1; end--) {
                String candidate = concatenate(tokens, i, end);
                if (this.mAbbreviations.contains(candidate)) {
                    WordToken first = tokens.get(i);
                    WordToken last = tokens.get(end - 1);
                    merged.add(new WordToken(candidate, first.getStart(), last.getEnd()));
                    // Resume after the merged window (the i++ below steps past its last token).
                    i = end - 1;
                    found = true;
                    break;
                }
            }
            if (!found) {
                merged.add(tokens.get(i));
            }
            i++;
        }
        return merged;
    }

    /**
     * Tokenizes {@code str} and renders the tokens on one line.
     *
     * @param str the text to tokenize
     * @return the string form of every token joined by single spaces, with any run of two or
     *         more whitespace characters collapsed to one space
     * @throws IOException propagated from {@link #tokenizeWords(String)}
     */
    public String tokenizeText(String str) throws IOException {
        List<WordToken> tokens = tokenizeWords(str);
        StringBuilder rendered = new StringBuilder();
        for (int i = 0; i < tokens.size(); i++) {
            if (i > 0) {
                rendered.append(" ");
            }
            rendered.append(tokens.get(i));
        }
        return rendered.toString().replaceAll("\\s\\s+", " ");
    }

    /**
     * Command-line entry point: tokenizes a file and prints one token per line on standard
     * output.
     *
     * @param strArr {@code [0]} the multi-words file, {@code [1]} the file to tokenize
     * @throws Exception if either file cannot be read
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length != 2) {
            System.err.println("Usage: java bios.tokenizer.Tokenizer <multiwords file> <file to tokenize>");
            System.exit(1);
        }
        Tokenizer tokenizer = new Tokenizer(strArr[0]);
        StringBuilder text = new StringBuilder();
        BufferedReader reader = new BufferedReader(new FileReader(strArr[1]));
        try {
            int c;
            while ((c = reader.read()) != -1) {
                text.append((char) c);
            }
        } finally {
            // Release the file handle even when reading fails.
            reader.close();
        }
        List<WordToken> tokens = tokenizer.tokenizeWords(text.toString());
        for (int i = 0; i < tokens.size(); i++) {
            System.out.println(tokens.get(i));
        }
    }
}
