View Javadoc
1   package edu.jiangxin.apktoolbox.word;
2   
3   import org.apache.logging.log4j.LogManager;
4   import org.apache.logging.log4j.Logger;
5   import org.apache.poi.hpsf.SummaryInformation;
6   import org.apache.poi.hwpf.HWPFDocument;
7   import org.apache.poi.xwpf.usermodel.XWPFDocument;
8   
9   import java.io.File;
10  import java.io.FileInputStream;
11  import java.io.IOException;
12  
13  public class WordUtils {
14  
15      private static final Logger LOGGER = LogManager.getLogger(WordUtils.class.getSimpleName());
16  
17      private WordUtils() {
18          // utility class
19      }
20  
21      /**
22       * 读取 Word 文档的页数。
23       * - .doc  使用 HWPFDocument(poi-scratchpad)
24       * - .docx 使用 XWPFDocument(poi-ooxml)
25       * 若读取失败或页数 <= 0,返回 0。
26       *
27       * @param file Word 文档文件
28       * @return 文档页数,若无法读取则返回 0
29       */
30      public static int getPageCount(File file) {
31          if (file == null || !file.exists() || !file.isFile()) {
32              LOGGER.warn("Invalid file: {}", file);
33              return 0;
34          }
35  
36          String name = file.getName().toLowerCase();
37          if (name.endsWith(".doc")) {
38              return getDocPageCount(file);
39          } else if (name.endsWith(".docx")) {
40              return getDocxPageCount(file);
41          } else {
42              LOGGER.warn("Unsupported file type: {}", file.getPath());
43              return 0;
44          }
45      }
46  
47      private static int getDocPageCount(File file) {
48          try (FileInputStream fis = new FileInputStream(file);
49               HWPFDocument document = new HWPFDocument(fis)) {
50              SummaryInformation si = document.getSummaryInformation();
51              int pageCount = (si != null) ? si.getPageCount() : 0;
52              if (pageCount <= 0) {
53                  LOGGER.info("Page count <= 0 for .doc file: {}, returning 0", file.getPath());
54                  return 0;
55              }
56              LOGGER.info("Processing .doc file: {}, page count: {}", file.getPath(), pageCount);
57              return pageCount;
58          } catch (IOException e) {
59              LOGGER.error("Error reading .doc file: {}, message: {}", file.getPath(), e.getMessage());
60              return 0;
61          }
62      }
63  
64      private static int getDocxPageCount(File file) {
65          try (FileInputStream fis = new FileInputStream(file);
66               XWPFDocument document = new XWPFDocument(fis)) {
67              // 优先从 ExtendedProperties 读取页数
68              int pageCount = 0;
69              if (document.getProperties() != null
70                      && document.getProperties().getExtendedProperties() != null
71                      && document.getProperties().getExtendedProperties().getUnderlyingProperties() != null) {
72                  pageCount = document.getProperties().getExtendedProperties().getUnderlyingProperties().getPages();
73              }
74              // 若 ExtendedProperties 无效,尝试从 CoreProperties 读取 revision 作为备选
75              if (pageCount <= 0) {
76                  String revision = null;
77                  if (document.getProperties() != null
78                          && document.getProperties().getCoreProperties() != null) {
79                      revision = document.getProperties().getCoreProperties().getRevision();
80                  }
81                  if (revision != null && !revision.isEmpty()) {
82                      try {
83                          pageCount = Integer.parseInt(revision);
84                      } catch (NumberFormatException e) {
85                          LOGGER.warn("Cannot parse revision as page count for .docx file: {}", file.getPath());
86                          pageCount = 0;
87                      }
88                  }
89              }
90              if (pageCount <= 0) {
91                  LOGGER.info("Page count <= 0 for .docx file: {}, returning 0", file.getPath());
92                  return 0;
93              }
94              LOGGER.info("Processing .docx file: {}, page count: {}", file.getPath(), pageCount);
95              return pageCount;
96          } catch (IOException e) {
97              LOGGER.error("Error reading .docx file: {}, message: {}", file.getPath(), e.getMessage());
98              return 0;
99          }
100     }
101 }