PdfUtils.java

package edu.jiangxin.apktoolbox.pdf;

import com.itextpdf.kernel.pdf.*;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.encryption.ProtectionPolicy;
import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.File;
import java.io.IOException;
import java.util.List;

/**
 * Static helpers for inspecting PDF files (text amount, encryption, outline,
 * annotations) and for writing copies with the encryption removed.
 *
 * <p>All inspection methods log I/O failures and fall back to a default
 * return value instead of propagating the {@link IOException}.
 */
public class PdfUtils {
    private static final Logger LOGGER = LogManager.getLogger(PdfUtils.class.getSimpleName());

    /** Annotation subtype that is ignored when looking for annotations. */
    private static final String LINK_SUBTYPE = "Link";

    /**
     * Tells whether the PDF contains less extractable text than {@code threshold}.
     *
     * @param file      the PDF file to inspect
     * @param threshold number of characters; a document whose trimmed text is
     *                  shorter than this is reported as scanned
     * @return {@code true} if the trimmed text length is below {@code threshold};
     *         {@code false} otherwise or if the file cannot be read
     */
    public static boolean isScannedPdf(File file, int threshold) {
        int length;

        try (PDDocument document = Loader.loadPDF(file)) {
            removeSecurityIfEncrypted(document);

            PDFTextStripper stripper = new PDFTextStripper();
            length = stripper.getText(document).trim().length();
        } catch (IOException e) {
            // Pass the exception itself so the stack trace is not lost.
            LOGGER.error("Error reading PDF file: {}", e.getMessage(), e);
            return false;
        }
        LOGGER.info("Processing file: {}, text size: {}", file.getPath(), length);
        return length < threshold;
    }

    /**
     * Tells whether the PDF is encrypted.
     *
     * @param file the PDF file to inspect
     * @return the document's encryption flag, or {@code false} if the file
     *         cannot be read
     */
    public static boolean isEncryptedPdf(File file) {
        boolean isEncrypted;

        try (PDDocument document = Loader.loadPDF(file)) {
            isEncrypted = document.isEncrypted();
        } catch (IOException e) {
            LOGGER.error("Error reading PDF file: {}", e.getMessage(), e);
            return false;
        }
        LOGGER.info("Processing file: {}, is encrypted: {}", file.getPath(), isEncrypted);
        return isEncrypted;
    }

    /**
     * Tells whether the PDF lacks a document outline (bookmarks).
     *
     * @param file the PDF file to inspect
     * @return {@code true} if the document has no outline; {@code false} if it
     *         has one or if the file cannot be read
     */
    public static boolean isNonOutlinePdf(File file) {
        boolean hasOutline;

        try (PDDocument document = Loader.loadPDF(file)) {
            removeSecurityIfEncrypted(document);

            PDDocumentCatalog catalog = document.getDocumentCatalog();
            hasOutline = catalog != null && catalog.getDocumentOutline() != null;
        } catch (IOException e) {
            LOGGER.error("Error reading PDF file: {}", e.getMessage(), e);
            return false;
        }
        LOGGER.info("Processing file: {}, has outline: {}", file.getPath(), hasOutline);
        return !hasOutline;
    }

    /**
     * Tells whether any page of the PDF carries an annotation other than a link.
     *
     * <p>Every annotation of every page is examined, so a page whose first
     * annotation is a link but which also holds e.g. a highlight is detected.
     * An annotation without a subtype counts as a non-link annotation.
     *
     * @param file the PDF file to inspect
     * @return {@code true} if at least one non-link annotation exists;
     *         {@code false} if none exists, the document has no pages, or the
     *         file cannot be read
     */
    public static boolean hasAnnotations(File file) {
        boolean hasAnnotations = false;

        try (PDDocument document = Loader.loadPDF(file)) {
            removeSecurityIfEncrypted(document);

            PDDocumentCatalog catalog = document.getDocumentCatalog();
            if (catalog == null) {
                return false;
            }
            PDPageTree pages = catalog.getPages();
            if (pages == null || pages.getCount() == 0) {
                return false;
            }

            // 1-based position in iteration order; the page dictionary holds no
            // usable page-number entry, so it is counted here.
            int pageNumber = 0;
            for (PDPage page : pages) {
                pageNumber++;
                PDAnnotation annotation = findNonLinkAnnotation(page);
                if (annotation != null) {
                    LOGGER.info("Found annotations on page: {}, subType: {}", pageNumber, annotation.getSubtype());
                    hasAnnotations = true;
                    break; // One match is enough to answer the question.
                }
            }
        } catch (IOException e) {
            LOGGER.error("Error reading PDF file: {}", e.getMessage(), e);
            return hasAnnotations;
        }
        LOGGER.info("Processing file: {}, has annotations: {}", file.getPath(), hasAnnotations);
        return hasAnnotations;
    }

    /**
     * Saves a copy of the PDF without encryption into {@code targetDir}, using PDFBox.
     *
     * <p>The copy keeps the source file's name. Nothing is written when the
     * target would be the source file itself, because the source is still being
     * read while the copy is saved. Failures are logged, not thrown.
     *
     * @param encryptedFile the PDF file to read
     * @param targetDir     the directory that receives the copy
     */
    public static void removePassword(File encryptedFile, File targetDir) {
        File targetFile = resolveTargetFile(encryptedFile, targetDir);
        if (isSameFile(encryptedFile, targetFile)) {
            LOGGER.error("Error processing PDF file: {}",
                    "target file is the source file: " + targetFile.getAbsolutePath());
            return;
        }

        try (PDDocument document = Loader.loadPDF(encryptedFile)) {
            removeSecurityIfEncrypted(document);
            document.save(targetFile);
            LOGGER.info("Remove password success: {}", targetFile.getAbsolutePath());
        } catch (IOException e) {
            LOGGER.error("Error processing PDF file: {}", e.getMessage(), e);
        }
    }

    /**
     * Saves a copy of the PDF without encryption into {@code targetDir}, using iText.
     *
     * <p>The reader is opened with {@code setUnethicalReading(true)}. The copy
     * keeps the source file's name. Nothing is written when the target would be
     * the source file itself. I/O failures are logged, not thrown; unchecked
     * exceptions raised by iText still propagate to the caller.
     *
     * @param encryptedFile the PDF file to read
     * @param targetDir     the directory that receives the copy
     */
    public static void removePasswordWithIText(File encryptedFile, File targetDir) {
        File targetFile = resolveTargetFile(encryptedFile, targetDir);
        if (isSameFile(encryptedFile, targetFile)) {
            LOGGER.error("Error writing PDF file: {}",
                    "target file is the source file: " + targetFile.getAbsolutePath());
            return;
        }
        String targetFilePath = targetFile.getAbsolutePath();

        PdfReader openedReader;
        try {
            openedReader = new PdfReader(encryptedFile).setUnethicalReading(true);
        } catch (IOException e) {
            LOGGER.error("Error reading PDF file: {}", e.getMessage(), e);
            return;
        }

        // Reader and writer are managed as separate resources so that neither
        // leaks when creating the writer or the document fails.
        try (PdfReader reader = openedReader;
             PdfWriter writer = new PdfWriter(targetFilePath);
             PdfDocument pdfDoc = new PdfDocument(reader, writer)) {
            // Nothing to do here: the decrypted copy is written when pdfDoc is closed.
        } catch (IOException e) {
            LOGGER.error("Error writing PDF file: {}", e.getMessage(), e);
            return;
        }
        // Reported only after the document has been closed, i.e. actually written.
        LOGGER.info("Remove password success: {}", targetFilePath);
    }

    /** Marks the document so that it is saved without encryption, if it is encrypted. */
    private static void removeSecurityIfEncrypted(PDDocument document) {
        if (document.isEncrypted()) {
            document.setAllSecurityToBeRemoved(true);
        }
    }

    /** Returns the first annotation on the page whose subtype is not "Link", or {@code null}. */
    private static PDAnnotation findNonLinkAnnotation(PDPage page) throws IOException {
        List<PDAnnotation> annotations = page.getAnnotations();
        if (annotations == null) {
            return null;
        }
        for (PDAnnotation annotation : annotations) {
            // Constant-first equals keeps a missing subtype from causing an NPE.
            if (annotation != null && !LINK_SUBTYPE.equals(annotation.getSubtype())) {
                return annotation;
            }
        }
        return null;
    }

    /** Returns the file inside {@code targetDir} that carries the source file's name. */
    private static File resolveTargetFile(File sourceFile, File targetDir) {
        return new File(targetDir, sourceFile.getName());
    }

    /** Tells whether both paths denote the same file, comparing canonical paths when possible. */
    private static boolean isSameFile(File first, File second) {
        try {
            return first.getCanonicalFile().equals(second.getCanonicalFile());
        } catch (IOException e) {
            // Canonicalization failed; fall back to a purely lexical comparison.
            return first.getAbsoluteFile().equals(second.getAbsoluteFile());
        }
    }
}