"""Rebuild image-only PDFs from rasterised pages; copy every other PDF as-is."""

import argparse
import os
import shutil
import subprocess
from pathlib import Path

import PyPDF2
import img2pdf

# File-name prefix handed to pdftoppm; rendered pages are named
# "<prefix>-<page number>.<extension>".
_PAGE_PREFIX = "page"


def _page_number(path):
    """Return the page number encoded in a ``page-<n>.<ext>`` name, else None."""
    suffix = Path(path).stem.rpartition("-")[2]
    return int(suffix) if suffix.isdecimal() else None


def _page_has_image(page):
    """Return True if *page* references at least one image XObject.

    The XObject entries are inspected one by one instead of searching the
    stringified resource dictionary, because a substring search also matches
    unrelated names (for example ProcSet entries such as ``/ImageC``).
    Images nested inside form XObjects and inline images are not detected.
    """
    resources = page.get("/Resources")
    if resources is None:
        return False
    # dict.get may hand back an unresolved indirect reference, so resolve
    # explicitly before descending.
    xobjects = resources.get_object().get("/XObject")
    if xobjects is None:
        return False
    xobjects = xobjects.get_object()
    # Indexing resolves each entry to the underlying stream dictionary.
    return any(xobjects[name].get("/Subtype") == "/Image" for name in xobjects)


def is_image_only_pdf(pdf_path):
    """Check whether a PDF consists purely of image-based pages.

    A PDF qualifies when it has at least one page and every page references
    an image XObject while yielding no extractable text.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        True if the PDF looks image-only; False otherwise, including when
        the file cannot be read or parsed (the error is printed).
    """
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            pages = reader.pages
            # An empty document has nothing to rasterise; let the caller
            # fall back to a plain copy.
            if len(pages) == 0:
                return False
            for page in pages:
                if not _page_has_image(page):
                    return False
                if (page.extract_text() or "").strip():
                    return False
            return True
    except Exception as e:
        # Best effort: any parsing problem means "not image-only".
        print(f"Error reading PDF: {e}")
        return False


def _remove_stale_pages(output_dir):
    """Delete ``page-<n>.*`` files left in *output_dir* by an earlier run."""
    for candidate in Path(output_dir).glob(f"{_PAGE_PREFIX}-*"):
        if candidate.is_file() and _page_number(candidate) is not None:
            candidate.unlink()


def extract_images_with_pdftoppm(pdf_path, output_dir, output_format="png", dpi=72):
    """Render each page of a PDF to an image file using ``pdftoppm``.

    Page images from a previous run found in *output_dir* are removed first
    so they cannot be mixed into the new result.

    Args:
        pdf_path: Path of the PDF to render.
        output_dir: Directory receiving the page images (created if needed).
        output_format: pdftoppm output flag without the dash, e.g. "png".
        dpi: Rendering resolution passed to ``pdftoppm -r``.

    Returns:
        True if pdftoppm ran successfully, False otherwise (the error is
        printed).
    """
    try:
        os.makedirs(output_dir, exist_ok=True)
        _remove_stale_pages(output_dir)
        command = [
            "pdftoppm",
            "-r", str(dpi),
            f"-{output_format}",
            pdf_path,
            f"{output_dir}/{_PAGE_PREFIX}",
        ]
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing pdftoppm binary (FileNotFoundError) and
        # failures to prepare the output directory.
        print(f"Error during image extraction: {e}")
        return False
    print(f"Images extracted to {output_dir}")
    return True


def reassemble_pdf(image_dir, output_pdf_path, image_glob=f"{_PAGE_PREFIX}-*.png"):
    """Build a new PDF from previously extracted page images.

    Args:
        image_dir: Directory containing the page images.
        output_pdf_path: Path of the PDF to write.
        image_glob: Glob pattern selecting the page images inside
            *image_dir*.

    Returns:
        True if the PDF was written, False if no images were found or the
        conversion failed (the reason is printed).
    """
    try:
        # Order by numeric page number instead of relying on the
        # lexicographic order of the file names.
        images = sorted(
            Path(image_dir).glob(image_glob),
            key=lambda p: (_page_number(p) or 0, p.name),
        )
        if not images:
            print("No images found to reassemble.")
            return False
        with open(output_pdf_path, "wb") as f:
            f.write(img2pdf.convert([str(img) for img in images]))
        print(f"Reassembled PDF saved at {output_pdf_path}")
        return True
    except Exception as e:
        print(f"Error reassembling PDF: {e}")
        return False


def main(input_pdf, output_pdf):
    """Rasterise and rebuild an image-only PDF, or copy any other PDF.

    Args:
        input_pdf: Path of the PDF to process.
        output_pdf: Path where the resulting PDF is written.
    """
    if not os.path.isfile(input_pdf):
        print(f"Error: Input file '{input_pdf}' does not exist.")
        return

    temp_dir = "extracted_images"

    if is_image_only_pdf(input_pdf):
        print("The PDF contains only images. Extracting and reassembling...")
        # Skip reassembly when extraction failed; there is nothing valid
        # to assemble.
        if extract_images_with_pdftoppm(input_pdf, temp_dir):
            reassemble_pdf(temp_dir, output_pdf)
    else:
        print("The PDF contains text or mixed content. Copying input to output...")
        try:
            shutil.copy(input_pdf, output_pdf)
        except shutil.SameFileError:
            print(f"Error: '{input_pdf}' and '{output_pdf}' are the same file.")
            return
        print(f"Copied '{input_pdf}' to '{output_pdf}'.")

    # Cleanup of the temporary directory is currently disabled, so the
    # extracted page images stay on disk after the run.
    # if os.path.exists(temp_dir):
    #     shutil.rmtree(temp_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process PDF files with image-only detection.")
    parser.add_argument("input_pdf", help="Path to the input PDF file")
    parser.add_argument("output_pdf", help="Path to the output PDF file")
    args = parser.parse_args()

    main(args.input_pdf, args.output_pdf)