package com.token.sentiment.utils;

import com.google.common.base.Optional;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObjectFactory;
import java.io.IOException;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:com/token/sentiment/utils/DetectedLanguageUtil.class */
/**
 * Lazily created singleton that guesses the language of a piece of text.
 *
 * <p>Detection is delegated to the optimaize language detector built from all of its built-in
 * profiles. Two shortcuts surround that call: a leading token made up only of CJK ideographs
 * short-circuits to {@code "zh-CN"}, and when the detector gives no answer the leading token is
 * checked against simple Latin and Russian letter classes.
 */
public class DetectedLanguageUtil {
    private static final Logger log = LoggerFactory.getLogger(DetectedLanguageUtil.class);

    /** ASCII and full-width punctuation; every match is replaced by a single space before detection. */
    private static final Pattern PUNCTUATION = Pattern.compile("[`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~！@#￥%……*（）——+|{}【】《》‘；：”“’。，、？|-]");

    /** Digits are removed outright (not replaced by a space). */
    private static final Pattern DIGIT = Pattern.compile("[\\d]");

    private static final Pattern ENGLISH_WORD = Pattern.compile("[a-zA-Z]+");

    /**
     * Russian letters. 'ё' (U+0451) and 'Ё' (U+0401) lie outside the contiguous А-я range
     * (U+0410..U+044F), so they are listed explicitly.
     */
    private static final Pattern RUSSIAN_WORD = Pattern.compile("[а-яА-ЯёЁ]+");

    private static final Pattern CHINESE_WORD = Pattern.compile("[\\u4e00-\\u9fa5]+");

    private static final Pattern CYRILLIC_TEXT = Pattern.compile("^[\\p{InCyrillic}\\s]+$");

    private static volatile DetectedLanguageUtil instance = null;

    private final List<LanguageProfile> languageProfiles;
    private final LanguageDetector languageDetector;
    private final TextObjectFactory textObjectFactory;

    /**
     * Returns the shared instance, creating it on first use.
     *
     * <p>Creation happens while holding the class lock. If construction fails, no instance is
     * stored and the next call tries again.
     *
     * @return the shared instance
     * @throws IOException if the built-in language profiles cannot be read
     */
    public static DetectedLanguageUtil getInstance() throws IOException {
        DetectedLanguageUtil current = instance;
        if (current == null) {
            synchronized (DetectedLanguageUtil.class) {
                current = instance;
                if (current == null) {
                    current = new DetectedLanguageUtil();
                    instance = current;
                }
            }
        }
        return current;
    }

    /**
     * Reads all built-in language profiles and builds the detector from them.
     *
     * @throws IOException if the built-in language profiles cannot be read
     */
    private DetectedLanguageUtil() throws IOException {
        languageProfiles = new LanguageProfileReader().readAllBuiltIn();
        languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
                .withProfiles(languageProfiles)
                .build();
        textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
    }

    /**
     * Replaces the entities {@code &lt;}, {@code &gt;}, {@code &apos;} and {@code &quot;} with the
     * characters they stand for. No other entity is touched.
     *
     * @param str text to unescape; may be {@code null}
     * @return the unescaped text, or {@code null} if {@code str} is {@code null}
     */
    public static String revertSimple(String str) {
        if (str == null) {
            return null;
        }
        // Literal replacement: none of these tokens need regex semantics.
        return str.replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&apos;", "'")
                .replace("&quot;", "\"");
    }

    /**
     * Guesses the language of {@code str}.
     *
     * <p>The text is unescaped with {@link #revertSimple(String)}, punctuation is replaced by
     * spaces and digits are removed. The first space-delimited token of the result is then used
     * for the shortcut checks described on the class.
     *
     * @param str text to inspect; may be {@code null}
     * @return {@code "zh-CN"}, the detector's locale string, {@code "en"}, {@code "ru"}, or
     *     {@code null} when the input is {@code null} or no language could be determined
     */
    public String detected(String str) {
        if (str == null) {
            return null;
        }
        String withoutPunctuation = PUNCTUATION.matcher(revertSimple(str)).replaceAll(" ").trim();
        String cleaned = DIGIT.matcher(withoutPunctuation).replaceAll("").trim();

        int firstSpace = cleaned.indexOf(' ');
        String firstWord = firstSpace >= 0 ? cleaned.substring(0, firstSpace) : cleaned;

        if (checkChineseChar(firstWord)) {
            return "zh-CN";
        }

        Optional<LdLocale> detectedLocale = languageDetector.detect(textObjectFactory.forText(cleaned));
        if (detectedLocale.isPresent()) {
            String ldLocale = detectedLocale.get().toString();
            log.info("Result : {}", ldLocale);
            return ldLocale;
        }

        // The detector gave no answer; fall back to inspecting the first word only.
        log.warn("语种识别失败，可能文本太短或混合了多国语言，针对第一个单词再次检查。");
        if (checkEnglishChar(firstWord)) {
            return "en";
        }
        if (checkRussiaChar(firstWord)) {
            return "ru";
        }
        return null;
    }

    /**
     * @param str text to test; may be {@code null}
     * @return {@code true} if {@code str} is non-empty and consists only of ASCII letters
     */
    boolean checkEnglishChar(String str) {
        return str != null && ENGLISH_WORD.matcher(str).matches();
    }

    /**
     * @param str text to test; may be {@code null}
     * @return {@code true} if {@code str} is non-empty and consists only of Russian letters
     *     (including 'ё' and 'Ё')
     */
    boolean checkRussiaChar(String str) {
        return str != null && RUSSIAN_WORD.matcher(str).matches();
    }

    /**
     * @param str text to test; may be {@code null}
     * @return {@code true} if {@code str} is non-empty and every character is in the range
     *     U+4E00..U+9FA5
     */
    public boolean checkChineseChar(String str) {
        return str != null && CHINESE_WORD.matcher(str).matches();
    }

    /**
     * @param str text to test; may be {@code null}
     * @return {@code true} if {@code str} is non-empty and consists only of characters from the
     *     Unicode Cyrillic block and whitespace
     */
    boolean checkCyrilMongoliaChar(String str) {
        return str != null && CYRILLIC_TEXT.matcher(str).matches();
    }
}
