changes

2025-01-26 20:24:37 -08:00
parent 12d59f4379
commit a5714cafdc
3 changed files with 85 additions and 0 deletions
--- a/clean.py
+++ b/clean.py
@@ -0,0 +1,85 @@
+import os
+import shutil
+import subprocess
+import PyPDF2
+import img2pdf
+from pathlib import Path
+
+def _resolve_pdf_object(obj):
+    """
+    Return the object behind an indirect PDF reference, or obj itself if it is direct.
+    """
+    return obj.get_object() if hasattr(obj, "get_object") else obj
+
+def is_image_only_pdf(pdf_path):
+    """
+    Check if a PDF contains only image-based content.
+
+    A page passes when its /Resources entry declares no /Font and holds at
+    least one /XObject whose /Subtype is /Image. Searching for the text
+    "/Image" in the stringified resources is not enough: it also matches
+    /ProcSet names such as /ImageC, accepts pages that carry fonts, and misses
+    images that are only reachable through indirect references.
+
+    Images nested inside form XObjects are not searched for, so a page that
+    only has those is reported as not image-only.
+
+    Returns True if no page fails the check. Returns False if any page fails,
+    or if the file cannot be opened or parsed (the error is printed, not
+    raised).
+    """
+    try:
+        with open(pdf_path, "rb") as f:
+            reader = PyPDF2.PdfReader(f)
+            for page in reader.pages:
+                resources = _resolve_pdf_object(page.get('/Resources', {}))
+                if '/Font' in resources:
+                    # Declared fonts mean the page can draw text.
+                    return False
+                xobjects = _resolve_pdf_object(resources.get('/XObject', {}))
+                page_has_image = any(
+                    _resolve_pdf_object(xobj).get('/Subtype') == '/Image'
+                    for xobj in xobjects.values()
+                )
+                if not page_has_image:
+                    return False
+        return True
+    except Exception as e:
+        # Best-effort probe: any failure is treated as "not image-only".
+        print(f"Error reading PDF: {e}")
+        return False
+
+def extract_images_with_pdftoppm(pdf_path, output_dir, output_format="png", resolution=72):
+    """
+    Render the pages of a PDF to image files using pdftoppm.
+
+    pdf_path is the PDF to render. output_dir receives the images and is
+    created if it does not exist; pdftoppm is given "<output_dir>/page" as the
+    output file prefix. output_format is the pdftoppm format flag without its
+    leading dash (for example "png"). resolution is the DPI handed to the -r
+    option and defaults to 72, the value that used to be hard-coded.
+
+    Returns None. Failures (pdftoppm not installed, a non-zero exit status, or
+    the directory not being creatable) are printed rather than raised.
+    """
+    try:
+        os.makedirs(output_dir, exist_ok=True)
+        command = [
+            "pdftoppm",
+            "-r", str(resolution),
+            f"-{output_format}",
+            pdf_path,
+            os.path.join(output_dir, "page"),
+        ]
+        subprocess.run(command, check=True)
+        print(f"Images extracted to {output_dir}")
+    except (subprocess.CalledProcessError, OSError) as e:
+        # OSError covers a missing pdftoppm executable, which subprocess.run
+        # reports as FileNotFoundError instead of CalledProcessError.
+        print(f"Error during image extraction: {e}")
+
+def reassemble_pdf(image_dir, output_pdf_path):
+    """
+    Reassemble a new PDF from extracted images.
+
+    Collects the files in image_dir that match "page-*.png", orders them by
+    sorted path, converts them with img2pdf and writes the result to
+    output_pdf_path.
+
+    Returns None. If no image matches, or any step raises, a message is
+    printed and output_pdf_path is left untouched.
+    """
+    try:
+        images = sorted(Path(image_dir).glob("page-*.png"))
+        if not images:
+            print("No images found to reassemble.")
+            return
+        # Convert before opening the output: opening first would leave an
+        # empty file behind whenever the conversion fails.
+        pdf_bytes = img2pdf.convert([str(img) for img in images])
+        with open(output_pdf_path, "wb") as f:
+            f.write(pdf_bytes)
+        print(f"Reassembled PDF saved at {output_pdf_path}")
+    except Exception as e:
+        print(f"Error reassembling PDF: {e}")
+
+def main(input_pdf, output_pdf):
+    """
+    Process input_pdf and write the result to output_pdf.
+
+    If is_image_only_pdf reports the input as image-only, its pages are
+    rendered to images and a new PDF is rebuilt from them. Otherwise the input
+    is copied to output_pdf unchanged.
+
+    Returns None. A missing input file, or input and output naming the same
+    file, is reported with a printed message instead of an exception.
+    """
+    import tempfile
+
+    if not os.path.isfile(input_pdf):
+        print(f"Error: Input file '{input_pdf}' does not exist.")
+        return
+
+    if is_image_only_pdf(input_pdf):
+        print("The PDF contains only images. Extracting and reassembling...")
+        # Use a private directory per run and remove it afterwards. A fixed
+        # directory that is never emptied lets page images left by an earlier
+        # run be picked up and merged into this run's output.
+        with tempfile.TemporaryDirectory(prefix="extracted_images_") as temp_dir:
+            extract_images_with_pdftoppm(input_pdf, temp_dir)
+            reassemble_pdf(temp_dir, output_pdf)
+    else:
+        print("The PDF contains text or mixed content. Copying input to output...")
+        try:
+            shutil.copy(input_pdf, output_pdf)
+        except shutil.SameFileError:
+            print(f"'{input_pdf}' and '{output_pdf}' are the same file. Nothing to copy.")
+        else:
+            print(f"Copied '{input_pdf}' to '{output_pdf}'.")
+
+if __name__ == "__main__":
+    import argparse
+
+    # Command-line entry point: two required positional paths, handed to main().
+    cli = argparse.ArgumentParser(description="Process PDF files with image-only detection.")
+    for arg_name, arg_help in (
+        ("input_pdf", "Path to the input PDF file"),
+        ("output_pdf", "Path to the output PDF file"),
+    ):
+        cli.add_argument(arg_name, help=arg_help)
+    options = cli.parse_args()
+
+    main(options.input_pdf, options.output_pdf)
+