package edu.stanford.nlp.trees.international.arabic;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.PTBLexer;
import edu.stanford.nlp.util.Function;
import edu.stanford.nlp.util.StringUtils;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/trees/international/arabic/IBMArabicEscaper.class */
/**
 * Normalizes Arabic tokens coming from IBM MT-style segmented text: collapses
 * alef/yaa variants, deletes the characters matched by {@code pDel} and tatweel,
 * maps Arabic/Devanagari digits and assorted punctuation to ASCII, unwraps
 * {@code $type_(...)} entity markers, strips a leading {@code +} or trailing
 * {@code #} clitic indicator, and escapes lone parentheses and plus signs.
 *
 * <p>Each kind of rewrite is reported on {@code System.err} at most once per
 * instance. The "already warned" flags are plain instance fields with no
 * synchronization.
 */
public class IBMArabicEscaper implements Function<List<HasWord>, List<HasWord>> {
    /** IBM MT-style entity wrapper, e.g. {@code $num_(123)}; group 1 is the wrapped text. */
    private static final Pattern pEnt = Pattern.compile("\\$[a-z]+_\\((.*?)\\)");
    /** Characters reported (but not rewritten) as Arabic presentation forms. */
    private static final Pattern presForms = Pattern.compile("[ﭐ-﷿ﹰ-\ufefe]");
    /** Characters reported (but not rewritten) as extended Arabic. */
    private static final Pattern extendedArabic = Pattern.compile("[ػ-ؿٱ-ۿݐ-ݿ]");
    /** Alef variants that are collapsed to a bare alef. */
    private static final Pattern alefVariants = Pattern.compile("[آأإ]");
    /** Character that is rewritten to yaa. */
    private static final Pattern pAM = Pattern.compile("ى");
    /** Characters that are deleted outright, regardless of token length. */
    private static final Pattern pDel = Pattern.compile("[ً-ٰٕ]");
    /** Tatweel; deleted only from tokens longer than one character. */
    private static final Pattern pTatweel = Pattern.compile("ـ");
    /** Two-character sequence that is rewritten to a single character. */
    private static final Pattern pYaaHamza = Pattern.compile("يء");

    /** Text shared by the three equivalence-classing notes. */
    private static final String NORMALIZATION_NOTE = "IBMArabicEscaper Note: equivalence classing certain characters, such as Alef with madda/hamza, e.g., in: ";
    /** Text shared by the two deletion notes. */
    private static final String DELETION_NOTE = "IBMArabicEscaper Note: deleting certain characters, such as tatweel, fatHa, kasra, damma, e.g., in: ";
    /** Text shared by the proclitic and enclitic notes. */
    private static final String CLITIC_NOTE = "IBMArabicEscaper Note: removing IBM MT-style proclitic/enclitic indicators, e.g., on ";

    private boolean warnedPresentationForms;
    private boolean warnedExtendedArabic;
    private boolean warnedEntityEscaping;
    private boolean warnedNormalization;
    private boolean warnedDeletion;
    private boolean warnedProcliticEnclitic;

    /** Emits the equivalence-classing note the first time it is needed. */
    private void noteNormalization(String token) {
        if (!this.warnedNormalization) {
            System.err.println(NORMALIZATION_NOTE + token);
            this.warnedNormalization = true;
        }
    }

    /** Emits the character-deletion note the first time it is needed. */
    private void noteDeletion(String token) {
        if (!this.warnedDeletion) {
            System.err.println(DELETION_NOTE + token);
            this.warnedDeletion = true;
        }
    }

    /** Emits the clitic-indicator note the first time it is needed. */
    private void noteCliticRemoval(String token) {
        if (!this.warnedProcliticEnclitic) {
            this.warnedProcliticEnclitic = true;
            System.err.println(CLITIC_NOTE + token);
        }
    }

    /**
     * Normalizes a single token.
     *
     * @param str the raw token; must not be {@code null}
     * @return the normalized token (possibly the empty string)
     */
    private String escapeString(String str) {
        // These two checks only warn; the offending characters are left in place.
        if (!this.warnedPresentationForms && presForms.matcher(str).find()) {
            System.err.println("IBMArabicEscaper Warning: encountering Arabic presentation form characters which are NOT mapped but just treated as unknown characters: " + str);
            this.warnedPresentationForms = true;
        }
        if (!this.warnedExtendedArabic && extendedArabic.matcher(str).find()) {
            // Previously a verbatim copy of the presentation-form warning above.
            System.err.println("IBMArabicEscaper Warning: encountering extended Arabic characters which are NOT mapped but just treated as unknown characters: " + str);
            this.warnedExtendedArabic = true;
        }

        // Equivalence classing. Order matters: pAM is rewritten before the
        // pYaaHamza check, so the latter sees the already-rewritten text.
        Matcher alefMatcher = alefVariants.matcher(str);
        if (alefMatcher.find()) {
            noteNormalization(str);
            str = alefMatcher.replaceAll("ا");
        }
        Matcher maqsuraMatcher = pAM.matcher(str);
        if (maqsuraMatcher.find()) {
            noteNormalization(str);
            str = maqsuraMatcher.replaceAll("ي");
        }
        Matcher yaaHamzaMatcher = pYaaHamza.matcher(str);
        if (yaaHamzaMatcher.find()) {
            noteNormalization(str);
            str = yaaHamzaMatcher.replaceAll("ئ");
        }

        Matcher deletionMatcher = pDel.matcher(str);
        if (deletionMatcher.find()) {
            noteDeletion(str);
            str = deletionMatcher.replaceAll("");
        }

        // Map punctuation and digits to ASCII. Both tables hold 38 characters;
        // the target table used to hold 37 because the '?' paired with the
        // Arabic question mark was missing, leaving the two tables misaligned.
        String tr = StringUtils.tr(str, "،؛؟٪٫٬٭۔٠١٢٣٤٥٦٧٨٩०१२३४५६७८९–—\u0091\u0092‘’\u0093\u0094“”", ",;?%.,*.01234567890123456789--''''\"\"\"\"");

        int length = tr.length();
        if (length > 1) {
            Matcher entityMatcher = pEnt.matcher(tr);
            if (entityMatcher.matches()) {
                if (!this.warnedEntityEscaping) {
                    System.err.println("IBMArabicEscaper Note: escaping IBM MT-style entities: " + entityMatcher.group(0) + " --> " + entityMatcher.group(1));
                    this.warnedEntityEscaping = true;
                }
                tr = entityMatcher.replaceAll("$1");
            } else if (tr.charAt(0) == '+') {
                // Leading '+' is dropped.
                noteCliticRemoval(tr);
                tr = tr.substring(1);
            } else if (tr.charAt(length - 1) == '#') {
                // Trailing '#' is dropped.
                noteCliticRemoval(tr);
                tr = tr.substring(0, length - 1);
            }
            Matcher tatweelMatcher = pTatweel.matcher(tr);
            if (tatweelMatcher.find()) {
                noteDeletion(tr);
                tr = tatweelMatcher.replaceAll("");
            }
        } else if (tr.equals("(")) {
            tr = PTBLexer.openparen;
        } else if (tr.equals(")")) {
            tr = PTBLexer.closeparen;
        } else if (tr.equals("+")) {
            tr = "-PLUS-";
        }
        return tr;
    }

    /**
     * Normalizes every word of a sentence.
     *
     * <p>The returned list is a new list, but it holds the same {@link HasWord}
     * objects as {@code list}; their words are overwritten in place.
     *
     * @param list the sentence to normalize
     * @return a new list containing the (mutated) elements of {@code list}
     */
    @Override // edu.stanford.nlp.util.Function
    public List<HasWord> apply(List<HasWord> list) {
        List<HasWord> escaped = new ArrayList<HasWord>(list);
        for (HasWord token : escaped) {
            token.setWord(escapeString(token.word()));
        }
        return escaped;
    }

    /**
     * Normalizes each file named on the command line, reading it as UTF-8 and
     * writing the result as UTF-8 to {@code <file>.sent}. Tokens are split on
     * runs of whitespace and re-joined with single spaces, one output line per
     * input line.
     *
     * @param strArr paths of the files to convert
     * @throws IOException if a file cannot be read or written
     */
    public static void main(String[] strArr) throws IOException {
        IBMArabicEscaper escaper = new IBMArabicEscaper();
        for (String path : strArr) {
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
            try {
                PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path + ".sent"), "UTF-8")));
                try {
                    String line;
                    // Stop at end of file; the previous loop had no exit.
                    while ((line = reader.readLine()) != null) {
                        String[] tokens = line.split("\\s+");
                        for (int i = 0; i < tokens.length; i++) {
                            writer.print(escaper.escapeString(tokens[i]));
                            if (i != tokens.length - 1) {
                                writer.print(" ");
                            }
                        }
                        writer.println();
                    }
                } finally {
                    writer.close();
                }
            } finally {
                reader.close();
            }
        }
    }
}
