package de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.tokenizer;

import com.neovisionaries.i18n.LanguageCode;
import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.exceptions.SaltTokenizerException;
import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.SDocumentGraph;
import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.STextualDS;
import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.SToken;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.codehaus.jackson.util.MinimalPrettyPrinter;
import org.eclipse.emf.common.util.BasicEList;
import org.eclipse.emf.common.util.EList;
import org.knallgrau.utils.textcat.TextCategorizer;

/* loaded from: input_file:WEB-INF/lib/salt-saltCommon-1.1.6.jar:de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tokenizer/Tokenizer.class */
/**
 * Rule-based tokenizer that splits the text of an {@link STextualDS} into token strings and
 * creates one {@link SToken} per token string in the configured {@link SDocumentGraph}.
 *
 * <p>Tokenization works on whitespace-separated chunks from which leading punctuation
 * ({@link #P_CHAR}), trailing punctuation ({@link #F_CHAR}), sentence-final periods and
 * language-specific clitics are split off. Known abbreviations (per language) keep their
 * trailing period.
 *
 * <p>NOTE(review): the rules look like a port of the TreeTagger tokenizer script - confirm
 * against upstream before changing any of the regular expressions.
 */
public class Tokenizer {
    /** Characters that are split off from the beginning of a chunk (regex character-class content). */
    protected static final String P_CHAR = "\\[\\{\\(´`\"»«‚„†‡‹‘’“”•–—›";
    /** Characters that are split off from the end of a chunk (regex character-class content). */
    protected static final String F_CHAR = "\\]\\}'`\"\\),;:!\\?%»«‚„…†‡‰‹‘’“”•–—›";

    /** One leading punctuation character followed by a non-empty remainder. */
    private static final Pattern LEADING_PUNCTUATION = Pattern.compile("^([" + P_CHAR + "])(.+)");
    /** A non-empty remainder followed by one trailing punctuation character. */
    private static final Pattern TRAILING_PUNCTUATION = Pattern.compile("^(.+)([" + F_CHAR + "])$");
    /** A chunk ending in trailing punctuation plus a period, e.g. {@code word).} */
    private static final Pattern TRAILING_PUNCTUATION_AND_PERIOD = Pattern.compile("^(.+[" + F_CHAR + "])(\\.)$");
    /**
     * Sequences of single letters each followed by a period (e.g. {@code U.S.A.}); these keep their periods.
     * NOTE(review): the non-ASCII part of this character class looks mis-encoded; it is kept unchanged on
     * purpose - confirm against upstream.
     */
    private static final Pattern LETTER_PERIOD_SEQUENCE = Pattern.compile("^([A-Za-zÁÂÃÈý®Ð×ÝÞÍðÎÓÔÕØÙãõš›€ß‚ƒ„‡ˆ‰Š‹Œ\u008dŽ\u008fø\u007fŸ\u0015÷·”“’]\\.)+$");
    /** A non-empty remainder followed by a final period. */
    private static final Pattern FINAL_PERIOD = Pattern.compile("^(.+)(\\.)$");
    /** An ellipsis or a number followed by a period; the period is not split off from these. */
    private static final Pattern ELLIPSIS_OR_ORDINAL = Pattern.compile("^(\\.\\.\\.|[0-9]+\\.)$");

    private SDocumentGraph sDocumentGraph = null;
    /** Abbreviation sets per language. ConcurrentHashMap rejects null keys, see {@link #getAbbreviations}. */
    private final Map<LanguageCode, HashSet<String>> abbreviations = new ConcurrentHashMap<LanguageCode, HashSet<String>>();
    /** Regex (one parenthesised alternation) for clitics at the start of a chunk; empty if none. */
    private String pClitic = "";
    /** Regex (one parenthesised alternation) for clitics at the end of a chunk; empty if none. */
    private String fClitic = "";

    /**
     * Sets the document graph in which the tokens are created.
     *
     * @param sDocumentGraph the graph used by {@link #tokenizeToToken}
     */
    public void setsDocumentGraph(SDocumentGraph sDocumentGraph) {
        this.sDocumentGraph = sDocumentGraph;
    }

    /**
     * @return the document graph in which the tokens are created, may be {@code null}
     */
    public SDocumentGraph getsDocumentGraph() {
        return this.sDocumentGraph;
    }

    /**
     * Tokenizes the whole text of the given data source; the language is detected automatically.
     *
     * @param sTextualDS the textual data source to tokenize
     * @return the created tokens, or {@code null} if no token was created
     */
    public EList<SToken> tokenize(STextualDS sTextualDS) {
        return tokenize(sTextualDS, null);
    }

    /**
     * Tokenizes the whole text of the given data source.
     *
     * @param sTextualDS   the textual data source to tokenize
     * @param languageCode language of the text, or {@code null} to detect it
     * @return the created tokens, or {@code null} if no token was created
     */
    public EList<SToken> tokenize(STextualDS sTextualDS, LanguageCode languageCode) {
        return tokenize(sTextualDS, languageCode, null, null);
    }

    /**
     * Tokenizes the text of the given data source between two character offsets.
     *
     * @param sTextualDS   the textual data source to tokenize
     * @param languageCode language of the text, or {@code null} to detect it from the text range
     * @param num          start offset (inclusive), or {@code null} for the beginning of the text
     * @param num2         end offset (exclusive), or {@code null} for the end of the text
     * @return the created tokens, or {@code null} if the data source has no text or no token was created
     * @throws SaltTokenizerException if {@code sTextualDS} is {@code null}
     */
    public EList<SToken> tokenize(STextualDS sTextualDS, LanguageCode languageCode, Integer num, Integer num2) {
        if (sTextualDS == null) {
            throw new SaltTokenizerException("Cannot tokenize an empty 'SSTextualDS' object.");
        }
        final String text = sTextualDS.getSText();
        if (text == null) {
            return null;
        }
        final Integer start = (num != null) ? num : Integer.valueOf(0);
        final Integer end = (num2 != null) ? num2 : Integer.valueOf(text.length());
        final LanguageCode language = (languageCode != null)
                ? languageCode
                : checkLanguage(text.substring(start.intValue(), end.intValue()));
        loadDefaultAbbreviations(language);
        setClitics(language);
        return tokenizeToToken(sTextualDS, language, start, end);
    }

    /** Registers the built-in abbreviation list for the language unless one is already registered. */
    private void loadDefaultAbbreviations(LanguageCode language) {
        if (language == null || getAbbreviations(language) != null) {
            return;
        }
        if (LanguageCode.de.equals(language)) {
            addAbbreviation(LanguageCode.de, AbbreviationDE.createAbbriviations());
        } else if (LanguageCode.en.equals(language)) {
            addAbbreviation(LanguageCode.en, AbbreviationEN.createAbbriviations());
        } else if (LanguageCode.fr.equals(language)) {
            addAbbreviation(LanguageCode.fr, AbbreviationFR.createAbbriviations());
        } else if (LanguageCode.it.equals(language)) {
            addAbbreviation(LanguageCode.it, AbbreviationIT.createAbbriviations());
        }
    }

    /**
     * Guesses the language of a text with a {@link TextCategorizer} and maps the result with
     * {@link #mapISOLanguageCode(String)}.
     *
     * @param str the text to inspect
     * @return the detected language, or {@code null} if {@code str} is {@code null} or the
     *         category is not mapped
     */
    public LanguageCode checkLanguage(String str) {
        if (str == null) {
            return null;
        }
        return mapISOLanguageCode(new TextCategorizer().categorize(str));
    }

    /**
     * Maps a lower-case English language name (e.g. {@code "german"}) to its {@link LanguageCode}.
     *
     * @param str the language name
     * @return the matching code, or {@code null} if the name is not known
     */
    public LanguageCode mapISOLanguageCode(String str) {
        if ("german".equals(str)) {
            return LanguageCode.de;
        }
        if ("english".equals(str)) {
            return LanguageCode.en;
        }
        if ("french".equals(str)) {
            return LanguageCode.fr;
        }
        if ("spanish".equals(str)) {
            return LanguageCode.es;
        }
        if ("italian".equals(str)) {
            return LanguageCode.it;
        }
        if ("swedish".equals(str)) {
            return LanguageCode.sv;
        }
        if ("polish".equals(str)) {
            return LanguageCode.pl;
        }
        if ("dutch".equals(str)) {
            return LanguageCode.nl;
        }
        if ("norwegian".equals(str)) {
            return LanguageCode.no;
        }
        if ("finnish".equals(str)) {
            return LanguageCode.fi;
        }
        if ("albanian".equals(str)) {
            return LanguageCode.sq;
        }
        if ("slovakian".equals(str)) {
            return LanguageCode.sk;
        }
        if ("slovenian".equals(str)) {
            return LanguageCode.sl;
        }
        if ("danish".equals(str)) {
            return LanguageCode.da;
        }
        if ("hungarian".equals(str)) {
            return LanguageCode.hu;
        }
        return null;
    }

    /**
     * Registers a set of abbreviations for a language. Does nothing if either argument is
     * {@code null} or if abbreviations are already registered for that language.
     *
     * @param languageCode the language the abbreviations belong to
     * @param hashSet      the abbreviations, compared verbatim against token strings
     */
    public void addAbbreviation(LanguageCode languageCode, HashSet<String> hashSet) {
        if (languageCode == null || hashSet == null) {
            return;
        }
        if (!this.abbreviations.containsKey(languageCode)) {
            this.abbreviations.put(languageCode, hashSet);
        }
    }

    /**
     * Reads abbreviations from a UTF-8 file (one abbreviation per line) and registers them via
     * {@link #addAbbreviation(LanguageCode, HashSet)}.
     *
     * @param languageCode the language the abbreviations belong to
     * @param file         the file to read
     * @throws SaltTokenizerException if the file does not exist or cannot be read
     */
    public void addAbbreviation(LanguageCode languageCode, File file) {
        final HashSet<String> entries = new HashSet<String>();
        FileInputStream stream = null;
        try {
            stream = new FileInputStream(file);
            BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF8"));
            String line;
            while ((line = reader.readLine()) != null) {
                entries.add(line);
            }
        } catch (FileNotFoundException e) {
            throw new SaltTokenizerException("Cannot tokenize the given text, because the file for abbreviation '" + file.getAbsolutePath() + "' was not found.");
        } catch (IOException e2) {
            throw new SaltTokenizerException("Cannot tokenize the given text, because can not read file '" + file.getAbsolutePath() + "'.");
        } finally {
            // Release the file handle on every path, including read failures.
            if (stream != null) {
                try {
                    stream.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
        addAbbreviation(languageCode, entries);
    }

    /**
     * Returns the abbreviations registered for a language.
     *
     * @param languageCode the language to look up, may be {@code null}
     * @return the registered abbreviations, or {@code null} if none are registered or
     *         {@code languageCode} is {@code null}
     */
    public HashSet<String> getAbbreviations(LanguageCode languageCode) {
        // ConcurrentHashMap.get(null) throws a NullPointerException, and the language is
        // legitimately null whenever detection fails.
        if (languageCode == null) {
            return null;
        }
        return this.abbreviations.get(languageCode);
    }

    /**
     * Selects the clitic patterns for the given language. Both patterns are cleared first so
     * that patterns of a previously tokenized language are never applied to another language.
     */
    private void setClitics(LanguageCode languageCode) {
        this.pClitic = "";
        this.fClitic = "";
        if (LanguageCode.en.equals(languageCode)) {
            this.fClitic = "('(s|re|ve|d|m|em|ll)|n't)";
        } else if (LanguageCode.fr.equals(languageCode)) {
            this.pClitic = "([dcjlmnstDCJLNMST]'|[Qq]u'|[Jj]usqu'|[Ll]orsqu')";
            this.fClitic = "(-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m'|-moi|-nous|-on|-toi|-tu|-t'|-vous|-en|-y|-ci|-là)";
        } else if (LanguageCode.it.equals(languageCode) || LanguageCode.es.equals(languageCode)) {
            // The alternation lists Italian elided forms (dell', nell', quest', ...). It used to be
            // attached to Spanish only; Spanish is kept so existing behaviour does not change.
            this.pClitic = "([dD][ae]ll'|[nN]ell'|[Aa]ll'|[lLDd]'|[Ss]ull'|[Qq]uest'|[Uu]n'|[Ss]enz'|[Tt]utt')";
        }
    }

    /**
     * Tokenizes a range of the data source's text and creates an {@link SToken} for every token
     * string, with offsets relative to the full text of the data source.
     *
     * <p>Token strings are located in the text from left to right. Empty token strings are
     * skipped, and locating stops at the first token string that cannot be found.
     *
     * @param sTextualDS   the textual data source, must have a text
     * @param languageCode language used for abbreviation lookup, may be {@code null}
     * @param num          start offset (inclusive) in the text, must not be {@code null}
     * @param num2         end offset (exclusive) in the text, must not be {@code null}
     * @return the created tokens in text order, or {@code null} if no token was created
     * @throws SaltTokenizerException if a token has to be created but no document graph is set
     */
    public EList<SToken> tokenizeToToken(STextualDS sTextualDS, LanguageCode languageCode, Integer num, Integer num2) {
        final int offset = num.intValue();
        final String text = sTextualDS.getSText().substring(offset, num2.intValue());
        final List<String> tokenStrings = tokenizeToString(text, languageCode);
        if (tokenStrings == null || tokenStrings.isEmpty()) {
            return null;
        }
        EList<SToken> sTokens = null;
        int searchFrom = 0;
        for (String tokenString : tokenStrings) {
            if (tokenString == null || tokenString.isEmpty()) {
                // An empty string would yield a zero-length token.
                continue;
            }
            final int start = text.indexOf(tokenString, searchFrom);
            if (start < 0) {
                // The remaining token strings cannot be aligned with the text.
                break;
            }
            final int end = start + tokenString.length();
            final SDocumentGraph graph = getsDocumentGraph();
            if (graph == null) {
                throw new SaltTokenizerException("Cannot create tokens, because no 'SDocumentGraph' is set for this tokenizer.");
            }
            final SToken sToken = graph.createSToken(sTextualDS, Integer.valueOf(offset + start), Integer.valueOf(offset + end));
            if (sTokens == null) {
                sTokens = new BasicEList<SToken>();
            }
            sTokens.add(sToken);
            searchFrom = end;
        }
        return sTokens;
    }

    /**
     * Splits a text into token strings.
     *
     * <p>The text is first normalised: {@code ...} is surrounded by blanks, a blank is inserted
     * after {@code ;}, {@code !} and {@code ?}, and whitespace runs are collapsed. Each
     * whitespace-separated chunk is then repeatedly split at leading punctuation, trailing
     * punctuation, final periods (unless the chunk is a registered abbreviation, a
     * letter-period sequence, an ellipsis or a number followed by a period) and clitics.
     *
     * <p>The clitic patterns are instance state that is selected by the {@code tokenize}
     * methods; calling this method directly uses whatever was selected last.
     *
     * @param str          the text to split, may be {@code null}
     * @param languageCode language used for abbreviation lookup, may be {@code null}
     * @return the token strings in text order; empty if the text is {@code null} or blank
     */
    public List<String> tokenizeToString(String str, LanguageCode languageCode) {
        final List<String> tokens = new ArrayList<String>();
        if (str == null) {
            return tokens;
        }
        final String normalized = str.replaceAll("\\.\\.\\.", " ... ")
                .replaceAll("([;\\!\\?])([^ ])", "$1 $2")
                .replaceAll("\\s+", " ")
                .trim();
        if (normalized.isEmpty()) {
            // "".split(" ") would produce a single empty token.
            return tokens;
        }
        tokens.addAll(Arrays.asList(normalized.split(" ")));

        final HashSet<String> knownAbbreviations = getAbbreviations(languageCode);
        final Pattern proclitic = this.pClitic.isEmpty() ? null : Pattern.compile("^(" + this.pClitic + ")(.+)$");
        final Pattern enclitic = this.fClitic.isEmpty() ? null : Pattern.compile("(.+)(" + this.fClitic + ")$");

        int i = 0;
        while (i < tokens.size()) {
            final String token = tokens.get(i);

            Matcher m = LEADING_PUNCTUATION.matcher(token);
            if (m.find()) {
                // Punctuation is final; continue with the remainder.
                splitAt(tokens, i, m.group(1), m.group(2));
                i++;
                continue;
            }
            m = TRAILING_PUNCTUATION.matcher(token);
            if (m.find()) {
                // Re-examine the remainder at the same index; it may end in more punctuation.
                splitAt(tokens, i, m.group(1), m.group(2));
                continue;
            }
            m = TRAILING_PUNCTUATION_AND_PERIOD.matcher(token);
            if (m.find()) {
                splitAt(tokens, i, m.group(1), m.group(2));
                continue;
            }
            final boolean keepsPeriod = (knownAbbreviations != null && knownAbbreviations.contains(token))
                    || LETTER_PERIOD_SEQUENCE.matcher(token).find();
            if (keepsPeriod) {
                i++;
                continue;
            }
            m = FINAL_PERIOD.matcher(token);
            if (m.find() && !ELLIPSIS_OR_ORDINAL.matcher(token).find()) {
                // Sentence-final period: both parts are final.
                splitAt(tokens, i, m.group(1), m.group(2));
                i += 2;
                continue;
            }
            if (proclitic != null) {
                m = proclitic.matcher(token);
                if (m.find()) {
                    // The clitic pattern contains its own groups, so the remainder is the LAST
                    // group, not group 2. The clitic is final; continue with the remainder.
                    splitAt(tokens, i, m.group(1), m.group(m.groupCount()));
                    i++;
                    continue;
                }
            }
            if (enclitic != null) {
                m = enclitic.matcher(token);
                if (m.find()) {
                    // Groups 1 and 2 precede the pattern's own groups; both parts are final.
                    splitAt(tokens, i, m.group(1), m.group(2));
                    i += 2;
                    continue;
                }
            }
            i++;
        }
        return tokens;
    }

    /** Replaces the element at {@code index} by {@code first} followed by {@code second}. */
    private static void splitAt(List<String> tokens, int index, String first, String second) {
        tokens.set(index, first);
        tokens.add(index + 1, second);
    }
}
