# glimpse2/clean.py

import os
import shutil
import subprocess
import tempfile
from pathlib import Path

import img2pdf
import PyPDF2

def is_image_only_pdf(pdf_path):
    """
    Check if a PDF contains only image-based content.

    A document qualifies when it has at least one page and every page both
    mentions an image in its resources and yields no extractable text.

    Args:
        pdf_path: Path to the PDF file to inspect.

    Returns:
        True if every page looks image-only. False if any page lacks an image
        marker, any page has extractable text, the document has no pages, or
        the file cannot be opened or parsed (the error is printed).
    """
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            page_count = 0
            for page in reader.pages:
                page_count += 1
                # Heuristic: substring search over the stringified resources
                # dictionary of the page.
                if '/Image' not in str(page.get('/Resources', {})):
                    return False
                # A page that also carries text is mixed content, not
                # image-only, even when it embeds images.
                if (page.extract_text() or "").strip():
                    return False
        # A document without pages has no image content to speak of.
        return page_count > 0
    except Exception as e:
        # Deliberately broad: any read/parse failure means "not image-only".
        print(f"Error reading PDF: {e}")
        return False

def extract_images_with_pdftoppm(pdf_path, output_dir, output_format="png", resolution=72):
    """
    Extract images from a PDF using pdftoppm.

    The external ``pdftoppm`` program is invoked with ``<output_dir>/page`` as
    the output prefix, so the generated files land inside ``output_dir``.

    Args:
        pdf_path: Path to the source PDF.
        output_dir: Directory that receives the images; created if missing.
        output_format: Name of the pdftoppm format flag without the leading
            dash (e.g. "png" becomes ``-png``).
        resolution: Rendering resolution in DPI, passed to ``-r``. Defaults
            to 72, the value this function previously hard-coded.

    Returns:
        None. Failures of the pdftoppm invocation are reported on stdout
        rather than raised.
    """
    os.makedirs(output_dir, exist_ok=True)
    command = [
        "pdftoppm",
        "-r", str(resolution),  # Resolution in DPI
        pdf_path,
        f"{output_dir}/page",
        f"-{output_format}"
    ]
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError:
        # Raised when the pdftoppm executable itself cannot be located.
        print("Error during image extraction: 'pdftoppm' executable not found.")
        return
    except subprocess.CalledProcessError as e:
        print(f"Error during image extraction: {e}")
        return
    print(f"Images extracted to {output_dir}")

def reassemble_pdf(image_dir, output_pdf_path):
    """
    Reassemble a new PDF from extracted images.

    Collects every ``page-*.png`` file in ``image_dir``, orders them by the
    numeric suffix after the last dash, and writes them as a single PDF.

    Args:
        image_dir: Directory containing the ``page-*.png`` images.
        output_pdf_path: Destination path of the resulting PDF.

    Returns:
        None. If no images are found or the conversion fails, a message is
        printed and no output file is written.
    """
    def page_order(image_path):
        # Sort numerically so "page-10" follows "page-2" even without
        # zero padding; names without a numeric suffix go last.
        suffix = image_path.stem.rpartition("-")[2]
        if suffix.isdecimal():
            return (0, int(suffix), image_path.name)
        return (1, 0, image_path.name)

    try:
        images = sorted(Path(image_dir).glob("page-*.png"), key=page_order)
        if not images:
            print("No images found to reassemble.")
            return
        # Convert before opening the destination so a failed conversion
        # never leaves an empty or truncated output file behind.
        pdf_bytes = img2pdf.convert([str(img) for img in images])
        with open(output_pdf_path, "wb") as f:
            f.write(pdf_bytes)
        print(f"Reassembled PDF saved at {output_pdf_path}")
    except Exception as e:
        print(f"Error reassembling PDF: {e}")

def main(input_pdf, output_pdf):
    """
    Produce ``output_pdf`` from ``input_pdf``.

    Image-only PDFs are rendered to images and reassembled into a new PDF;
    anything else is copied to the output path unchanged.

    Args:
        input_pdf: Path to the existing source PDF.
        output_pdf: Path where the result is written.

    Returns:
        None. Problems (missing input, identical input/output paths) are
        reported on stdout.
    """
    if not os.path.isfile(input_pdf):
        print(f"Error: Input file '{input_pdf}' does not exist.")
        return

    if is_image_only_pdf(input_pdf):
        print("The PDF contains only images. Extracting and reassembling...")
        # Use a fresh scratch directory per run: a fixed, never-cleaned
        # directory would let page images left over from an earlier run be
        # merged into this run's output. It is removed on exit.
        with tempfile.TemporaryDirectory(prefix="extracted_images_") as temp_dir:
            extract_images_with_pdftoppm(input_pdf, temp_dir)
            reassemble_pdf(temp_dir, output_pdf)
    else:
        print("The PDF contains text or mixed content. Copying input to output...")
        try:
            shutil.copy(input_pdf, output_pdf)
        except shutil.SameFileError:
            # Input and output refer to the same file; it is already in place.
            print(f"'{input_pdf}' and '{output_pdf}' are the same file; nothing to copy.")
            return
        print(f"Copied '{input_pdf}' to '{output_pdf}'.")

if __name__ == "__main__":
    # Command-line entry point: takes two positional paths and hands them
    # to main().
    import argparse

    cli = argparse.ArgumentParser(
        description="Process PDF files with image-only detection.",
    )
    positional_args = (
        ("input_pdf", "Path to the input PDF file"),
        ("output_pdf", "Path to the output PDF file"),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, help=arg_help)

    options = cli.parse_args()
    main(options.input_pdf, options.output_pdf)