// WordUtils.java

package edu.jiangxin.apktoolbox.word;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Locale;

/**
 * Static helpers for inspecting Word documents ({@code .doc} and {@code .docx}).
 */
public final class WordUtils {

    private static final Logger LOGGER = LogManager.getLogger(WordUtils.class.getSimpleName());

    private WordUtils() {
        // utility class; not instantiable
    }

    /**
     * Reads the page count of a Word document.
     *
     * <ul>
     *   <li>{@code .doc} files are opened with {@link HWPFDocument} (poi-scratchpad).</li>
     *   <li>{@code .docx} files are opened with {@link XWPFDocument} (poi-ooxml).</li>
     * </ul>
     *
     * The value is taken from the document's stored properties; the document is not
     * laid out to compute it. The file type is chosen from the file name extension,
     * compared case-insensitively.
     *
     * @param file the Word document; may be {@code null}
     * @return the stored page count, or {@code 0} when the file is {@code null}, is not
     *         a regular file, has an unsupported extension, cannot be read, or records
     *         a page count that is missing or not positive
     */
    public static int getPageCount(File file) {
        // isFile() is already false for paths that do not exist.
        if (file == null || !file.isFile()) {
            LOGGER.warn("Invalid file: {}", file);
            return 0;
        }

        // Locale.ROOT keeps the extension match independent of the JVM's default locale.
        String name = file.getName().toLowerCase(Locale.ROOT);
        if (name.endsWith(".doc")) {
            return getDocPageCount(file);
        } else if (name.endsWith(".docx")) {
            return getDocxPageCount(file);
        } else {
            LOGGER.warn("Unsupported file type: {}", file.getPath());
            return 0;
        }
    }

    /**
     * Reads the page count from the summary information of a binary {@code .doc} file.
     *
     * @param file the {@code .doc} file to open
     * @return the positive page count, or {@code 0} if it is unavailable or the file
     *         cannot be read
     */
    private static int getDocPageCount(File file) {
        try (FileInputStream fis = new FileInputStream(file);
             HWPFDocument document = new HWPFDocument(fis)) {
            SummaryInformation si = document.getSummaryInformation();
            int pageCount = (si != null) ? si.getPageCount() : 0;
            if (pageCount <= 0) {
                LOGGER.info("Page count <= 0 for .doc file: {}, returning 0", file.getPath());
                return 0;
            }
            LOGGER.info("Processing .doc file: {}, page count: {}", file.getPath(), pageCount);
            return pageCount;
        } catch (IOException | RuntimeException e) {
            // POI signals many unreadable-file conditions (wrong container format,
            // encrypted or corrupt content) with unchecked exceptions; they are handled
            // here so the "return 0 on failure" contract holds. The trailing throwable
            // makes Log4j record the stack trace as well as the message.
            LOGGER.error("Error reading .doc file: {}, message: {}", file.getPath(), e.getMessage(), e);
            return 0;
        }
    }

    /**
     * Reads the page count from the extended properties of an OOXML {@code .docx} file.
     *
     * @param file the {@code .docx} file to open
     * @return the positive page count, or {@code 0} if it is unavailable or the file
     *         cannot be read
     */
    private static int getDocxPageCount(File file) {
        try (FileInputStream fis = new FileInputStream(file);
             XWPFDocument document = new XWPFDocument(fis)) {
            int pageCount = 0;
            if (document.getProperties() != null
                    && document.getProperties().getExtendedProperties() != null
                    && document.getProperties().getExtendedProperties().getUnderlyingProperties() != null) {
                pageCount = document.getProperties().getExtendedProperties().getUnderlyingProperties().getPages();
            }
            // The core-properties revision is deliberately not used as a fallback: it is
            // a revision counter, not a page count, and would produce a plausible-looking
            // but wrong result.
            if (pageCount <= 0) {
                LOGGER.info("Page count <= 0 for .docx file: {}, returning 0", file.getPath());
                return 0;
            }
            LOGGER.info("Processing .docx file: {}, page count: {}", file.getPath(), pageCount);
            return pageCount;
        } catch (IOException | RuntimeException e) {
            // See getDocPageCount: POI's unchecked failures are treated as "cannot read",
            // and the throwable is passed last so the stack trace is logged.
            LOGGER.error("Error reading .docx file: {}, message: {}", file.getPath(), e.getMessage(), e);
            return 0;
        }
    }
}