View Javadoc
1   package edu.jiangxin.apktoolbox.pdf;
2   
import com.itextpdf.kernel.pdf.*;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.encryption.ProtectionPolicy;
import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.File;
import java.io.IOException;
import java.util.List;
17  
18  public class PdfUtils {
19      private static final Logger LOGGER = LogManager.getLogger(PdfUtils.class.getSimpleName());
20      public static boolean isScannedPdf(File file, int threshold) {
21          int length = 0;
22  
23          try (PDDocument document = Loader.loadPDF(file)) {
24              boolean isEncrypted = document.isEncrypted();
25              if (isEncrypted) {
26                  document.setAllSecurityToBeRemoved(true);
27              }
28  
29              PDFTextStripper stripper = new PDFTextStripper();
30              String text = stripper.getText(document).trim();
31              length = text.length();
32          } catch (IOException e) {
33              LOGGER.error("Error reading PDF file: {}", e.getMessage());
34              return false;
35          }
36          LOGGER.info("Processing file: {}, text size: {}", file.getPath(), length);
37          return length < threshold;
38      }
39  
40      public static boolean isEncryptedPdf(File file) {
41          boolean isEncrypted;
42  
43          try (PDDocument document = Loader.loadPDF(file)) {
44              isEncrypted = document.isEncrypted();
45          } catch (IOException e) {
46              LOGGER.error("Error reading PDF file: {}", e.getMessage());
47              return false;
48          }
49          LOGGER.info("Processing file: {}, is encrypted: {}", file.getPath(), isEncrypted);
50          return isEncrypted;
51      }
52  
53      public static boolean isNonOutlinePdf(File file) {
54          boolean hasOutline = false;
55  
56          try (PDDocument document = Loader.loadPDF(file)) {
57              boolean isEncrypted = document.isEncrypted();
58              if (isEncrypted) {
59                  document.setAllSecurityToBeRemoved(true);
60              }
61  
62              if (document.getDocumentCatalog() != null && document.getDocumentCatalog().getDocumentOutline() != null) {
63                  hasOutline = true;
64              }
65          } catch (IOException e) {
66              LOGGER.error("Error reading PDF file: {}", e.getMessage());
67              return false;
68          }
69          LOGGER.info("Processing file: {}, has outline: {}", file.getPath(), hasOutline);
70          return !hasOutline;
71      }
72  
73      public static boolean hasAnnotations(File file) {
74          boolean hasAnnotations = false;
75  
76          try (PDDocument document = Loader.loadPDF(file)) {
77              boolean isEncrypted = document.isEncrypted();
78              if (isEncrypted) {
79                  document.setAllSecurityToBeRemoved(true);
80              }
81              PDDocumentCatalog catalog = document.getDocumentCatalog();
82              if (catalog == null) {
83                  return false;
84              }
85              PDPageTree pages = document.getDocumentCatalog().getPages();
86              if (pages == null || pages.getCount() == 0) {
87                  return false;
88              }
89  
90              for (PDPage page : pages) {
91                  if (page.getAnnotations() != null && !page.getAnnotations().isEmpty()) {
92                      int pageNumber = page.getCOSObject().getInt("PageNumber", 0);
93                      String subType = page.getAnnotations().get(0).getSubtype();
94                      LOGGER.info("Found annotations on page: {}, subType: {}", pageNumber, subType);
95                      if (!subType.equals("Link")) {
96                          hasAnnotations = true;
97                          break; // No need to check further if we found annotations
98                      }
99                  }
100             }
101         } catch (IOException e) {
102             LOGGER.error("Error reading PDF file: {}", e.getMessage());
103             return hasAnnotations;
104         }
105         LOGGER.info("Processing file: {}, has annotations: {}", file.getPath(), hasAnnotations);
106         return hasAnnotations;
107     }
108 
109     public static void removePassword(File encryptedFile, File targetDir) {
110         try (PDDocument document = Loader.loadPDF(encryptedFile)) {
111             boolean isEncrypted = document.isEncrypted();
112             if (isEncrypted) {
113                 document.setAllSecurityToBeRemoved(true);
114             }
115             String targetFilePath = targetDir.getAbsolutePath() + File.separator + encryptedFile.getName();
116             document.save(targetFilePath);
117             LOGGER.info("Remove password success: {}", targetFilePath);
118         } catch (IOException e) {
119             LOGGER.error("Error processing PDF file: {}", e.getMessage());
120         }
121     }
122 
123     public static void removePasswordWithIText(File encryptedFile, File targetDir) {
124         PdfReader reader;
125         try {
126             reader = new PdfReader(encryptedFile).setUnethicalReading(true);
127         } catch (IOException e) {
128             LOGGER.error("Error reading PDF file: {}", e.getMessage());
129             return;
130         }
131         String targetFilePath = targetDir.getAbsolutePath() + File.separator + encryptedFile.getName();
132 
133         try (PdfDocument pdfDoc = new PdfDocument(reader,
134                 new PdfWriter(targetFilePath))) {
135             LOGGER.info("Remove password success: {}", targetFilePath);
136         } catch (IOException e) {
137             LOGGER.error("Error writing PDF file: {}", e.getMessage());
138         }
139     }
140 }