EncoderDetector.java

package edu.jiangxin.apktoolbox.file.core;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.concurrent.CountDownLatch;

import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.mozilla.universalchardet.UniversalDetector;

import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;

/**
 * Guesses the character encoding of a file by running two independent
 * detectors (cpdetector and juniversalchardet) on their own threads and
 * electing one of the two answers.
 * <p>
 * All per-call state is local to {@link #judgeFile(String)}, so overlapping
 * calls do not overwrite each other's results. The cpdetector proxy itself is
 * still shared between calls; its thread-safety is not established here.
 *
 * @author jiangxin
 * @author 2018-09-09
 *
 */
public class EncoderDetector {
    private static final Logger logger = LogManager.getLogger(EncoderDetector.class.getSimpleName());

    /** Index of the cpdetector answer in the per-call result array. */
    private static final int CP_DETECTOR = 0;

    /** Index of the juniversalchardet answer in the per-call result array. */
    private static final int UNIVERSAL_DETECTOR = 1;

    /**
     * Lazily configures the cpdetector proxy exactly once.
     * <p>
     * The proxy is obtained through {@code getInstance()}, i.e. it is shared
     * by every caller. Registering the detectors on each call would hand it a
     * brand-new {@code ParsingDetector} every time.
     * NOTE(review): presumably that makes its detector list grow without
     * bound - confirm against cpdetector's {@code add()} semantics.
     */
    private static final class CpDetectorHolder {
        private static final CodepageDetectorProxy PROXY = createProxy();

        private static CodepageDetectorProxy createProxy() {
            CodepageDetectorProxy proxy = CodepageDetectorProxy.getInstance();
            // first one returning non-null wins the decision
            proxy.add(new ParsingDetector(false));
            proxy.add(JChardetFacade.getInstance());
            proxy.add(ASCIIDetector.getInstance());
            proxy.add(UnicodeDetector.getInstance());
            return proxy;
        }
    }

    /**
     * Detect the charset of some file.
     * <p>
     * Both detectors run concurrently and this method blocks until both have
     * finished. If the calling thread is interrupted while waiting, the
     * interrupt flag is restored and the election is made from whatever
     * results are visible at that point.
     *
     * @param fileName path of the file to inspect
     * @return the elected charset name, or {@code null} when the file does not
     *         exist or neither detector produced an answer
     */
    public static String judgeFile(String fileName) {
        File file = new File(fileName);
        if (!file.exists()) {
            logger.error("Can't find the file: {}", fileName);
            return null;
        }

        // Per-call state: keeping these local (instead of in static fields)
        // prevents overlapping calls from clobbering each other's results.
        final String[] results = new String[2];
        final CountDownLatch done = new CountDownLatch(2);

        Thread cpDetectorThread = new Thread(() -> {
            try {
                results[CP_DETECTOR] = detectWithCpDetector(file);
            } finally {
                // Always release the latch, even if the detector throws an
                // unchecked exception; otherwise await() would block forever.
                done.countDown();
            }
        });

        Thread universalDetectorThread = new Thread(() -> {
            try {
                results[UNIVERSAL_DETECTOR] = detectWithUniversalDetector(file);
            } finally {
                done.countDown();
            }
        });

        cpDetectorThread.start();
        universalDetectorThread.start();
        try {
            done.await();
        } catch (InterruptedException e) {
            logger.error("await InterruptedException");
            Thread.currentThread().interrupt();
        }
        return electBestCharset(results[CP_DETECTOR], results[UNIVERSAL_DETECTOR]);
    }

    /**
     * Runs the cpdetector chain on the file.
     *
     * @param file file to inspect
     * @return the detected charset name, or {@code null} if nothing was
     *         detected or an I/O error occurred
     */
    private static String detectWithCpDetector(File file) {
        try {
            // File.toURL() is deprecated; convert indirectly through toURI()
            Charset charset = CpDetectorHolder.PROXY.detectCodepage(file.toURI().toURL());
            return charset == null ? null : charset.name();
        } catch (IOException e) {
            logger.error("cpDetector failed", e);
            return null;
        }
    }

    /**
     * Feeds the file to juniversalchardet in 4 KiB chunks until the detector
     * reports it is done or the file is exhausted.
     *
     * @param file file to inspect
     * @return the detected charset name, or {@code null} if nothing was
     *         detected or reading the file failed
     */
    private static String detectWithUniversalDetector(File file) {
        UniversalDetector universalDetector = new UniversalDetector(null);
        byte[] buf = new byte[4096];
        String detected = null;
        try (FileInputStream fis = new FileInputStream(file)) {
            int nread;
            while ((nread = fis.read(buf)) > 0 && !universalDetector.isDone()) {
                universalDetector.handleData(buf, 0, nread);
            }
            universalDetector.dataEnd();
            // Assigned before the stream is closed, so a failure in close()
            // is logged below but does not discard a successful detection.
            detected = universalDetector.getDetectedCharset();
        } catch (IOException e) {
            logger.error("universalDetector failed", e);
        }
        return detected;
    }

    /**
     * Chooses between the two detector answers.
     * <ul>
     * <li>neither answered: {@code null} (logged as a warning)</li>
     * <li>only one answered: that answer</li>
     * <li>both answered: the universalDetector answer; a disagreement is
     * logged as a warning</li>
     * </ul>
     *
     * @param cpCharset        answer of cpdetector, may be {@code null} or empty
     * @param universalCharset answer of juniversalchardet, may be {@code null}
     *                         or empty
     * @return the elected charset name, or {@code null} if there is none
     */
    private static String electBestCharset(String cpCharset, String universalCharset) {
        boolean cpMissing = StringUtils.isEmpty(cpCharset);
        boolean universalMissing = StringUtils.isEmpty(universalCharset);

        if (cpMissing && universalMissing) {
            logger.warn("cpDetector: {}, universalDetector: {}", cpCharset, universalCharset);
            return null;
        }
        if (cpMissing) {
            logger.info("cpDetector: {}, universalDetector: {}", cpCharset, universalCharset);
            return universalCharset;
        }
        if (universalMissing) {
            logger.info("cpDetector: {}, universalDetector: {}", cpCharset, universalCharset);
            return cpCharset;
        }
        // Charset names are case-insensitive, so a pure case difference is
        // not treated as a disagreement; this only affects the log level.
        if (cpCharset.equalsIgnoreCase(universalCharset)) {
            logger.info("cpDetector: {}, universalDetector: {}", cpCharset, universalCharset);
        } else {
            logger.warn("cpDetector: {}, universalDetector: {}", cpCharset, universalCharset);
        }
        return universalCharset;
    }
}