This commit is contained in:
2025-01-26 20:24:37 -08:00
parent 12d59f4379
commit a5714cafdc
3 changed files with 85 additions and 0 deletions

85
clean.py Normal file
View File

@@ -0,0 +1,85 @@
import os
import shutil
import subprocess
import PyPDF2
import img2pdf
from pathlib import Path
def _page_is_image_only(page):
    """
    Return True if a single page references an image and declares no fonts.

    The page and its nested dictionaries are read with ``[]`` indexing so
    that PyPDF2 resolves indirect references before the value is inspected;
    ``dict.get`` can hand back the unresolved reference instead.

    Known limits: images nested inside Form XObjects are not detected, and
    vector drawing operators in the content stream are not inspected.
    """
    if "/Resources" not in page:
        return False
    resources = page["/Resources"]

    # A non-empty /Font dictionary means the page is able to draw text.
    if "/Font" in resources and resources["/Font"]:
        return False

    if "/XObject" not in resources:
        return False
    xobjects = resources["/XObject"]
    for name in xobjects:
        xobject = xobjects[name]
        if "/Subtype" in xobject and xobject["/Subtype"] == "/Image":
            return True
    return False


def is_image_only_pdf(pdf_path):
    """
    Check if a PDF contains only image-based content.

    A document qualifies when it has at least one page and every page
    references at least one image XObject while declaring no fonts.

    Args:
        pdf_path: Path to the PDF file to inspect.

    Returns:
        True if every page looks image-only; False otherwise, including
        when the file cannot be opened or parsed (the error is printed).
    """
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            saw_page = False
            for page in reader.pages:
                saw_page = True
                if not _page_is_image_only(page):
                    return False
            # An empty document must not be reported as image-only:
            # there would be nothing to extract and reassemble.
            return saw_page
    except Exception as e:
        # Best-effort probe: any read/parse failure means "not image-only".
        print(f"Error reading PDF: {e}")
        return False
def extract_images_with_pdftoppm(
    pdf_path, output_dir, output_format="png", resolution=72
):
    """
    Render the pages of a PDF to image files using pdftoppm.

    Output files are written into ``output_dir`` with the name prefix
    ``page`` (pdftoppm appends the page number and file extension).

    Args:
        pdf_path: Path to the source PDF.
        output_dir: Directory that receives the images; created if missing.
        output_format: pdftoppm format flag without the dash (e.g. "png").
        resolution: Rendering resolution in DPI passed to ``-r``.

    Returns:
        None. Failures to launch or run pdftoppm are printed, not raised.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Options first, then the two positional arguments, matching the
    # documented synopsis: pdftoppm [options] PDF-file PPM-root
    command = [
        "pdftoppm",
        f"-{output_format}",
        "-r", str(resolution),
        os.fspath(pdf_path),
        os.path.join(output_dir, "page"),
    ]
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError as e:
        # Raised when the pdftoppm executable itself cannot be found.
        print(f"Error during image extraction: could not run pdftoppm: {e}")
    except subprocess.CalledProcessError as e:
        print(f"Error during image extraction: {e}")
    else:
        print(f"Images extracted to {output_dir}")
def _page_sort_key(image_path):
    """Order ``page-<n>.png`` files by numeric page number, then by name."""
    suffix = image_path.stem.rpartition("-")[2]
    if suffix.isdecimal():
        return (0, int(suffix), image_path.name)
    # Names without a numeric suffix sort after all numbered pages.
    return (1, 0, image_path.name)


def reassemble_pdf(image_dir, output_pdf_path):
    """
    Reassemble a new PDF from extracted images.

    Collects ``page-*.png`` files from ``image_dir`` in page order and
    writes them as a single PDF.

    Args:
        image_dir: Directory containing the extracted page images.
        output_pdf_path: Destination path of the PDF to write.

    Returns:
        None. Errors are printed rather than raised; if no images are
        found or conversion fails, the output file is not created.
    """
    try:
        images = sorted(Path(image_dir).glob("page-*.png"), key=_page_sort_key)
        if not images:
            print("No images found to reassemble.")
            return
        # Convert before opening the destination so a failed conversion
        # does not leave an empty or truncated output file behind.
        pdf_bytes = img2pdf.convert([str(img) for img in images])
        with open(output_pdf_path, "wb") as f:
            f.write(pdf_bytes)
        print(f"Reassembled PDF saved at {output_pdf_path}")
    except Exception as e:
        print(f"Error reassembling PDF: {e}")
def main(input_pdf, output_pdf):
    """
    Produce ``output_pdf`` from ``input_pdf``.

    If the input is detected as image-only, its pages are rendered to
    images and reassembled into a new PDF. Otherwise the input file is
    copied to the output path unchanged.

    Args:
        input_pdf: Path to the existing source PDF.
        output_pdf: Path where the resulting PDF is written.

    Returns:
        None. A missing input file is reported with a printed message.
    """
    import tempfile  # function-scope import: only needed by this entry point

    if not os.path.isfile(input_pdf):
        print(f"Error: Input file '{input_pdf}' does not exist.")
        return

    if not is_image_only_pdf(input_pdf):
        print("The PDF contains text or mixed content. Copying input to output...")
        shutil.copy(input_pdf, output_pdf)
        print(f"Copied '{input_pdf}' to '{output_pdf}'.")
        return

    print("The PDF contains only images. Extracting and reassembling...")
    # A fresh, automatically removed directory per run keeps images left
    # over from an earlier run out of the reassembled PDF.
    with tempfile.TemporaryDirectory(prefix="extracted_images_") as temp_dir:
        extract_images_with_pdftoppm(input_pdf, temp_dir)
        reassemble_pdf(temp_dir, output_pdf)
if __name__ == "__main__":
    # Command-line entry point: two positional paths, handed straight to main().
    import argparse

    cli = argparse.ArgumentParser(
        description="Process PDF files with image-only detection."
    )
    positional_args = (
        ("input_pdf", "Path to the input PDF file"),
        ("output_pdf", "Path to the output PDF file"),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, help=arg_help)

    options = cli.parse_args()
    main(options.input_pdf, options.output_pdf)