diff --git a/clean.py b/clean.py new file mode 100644 index 0000000..7b1aea4 --- /dev/null +++ b/clean.py @@ -0,0 +1,85 @@ +import os +import shutil +import subprocess +import PyPDF2 +import img2pdf +from pathlib import Path + +def is_image_only_pdf(pdf_path): + """ + Check if a PDF contains only image-based content. + """ + try: + with open(pdf_path, "rb") as f: + reader = PyPDF2.PdfReader(f) + for page in reader.pages: + if '/Image' not in str(page.get('/Resources', {})): + return False + return True + except Exception as e: + print(f"Error reading PDF: {e}") + return False + +def extract_images_with_pdftoppm(pdf_path, output_dir, output_format="png"): + """ + Extract images from a PDF using pdftoppm. + """ + try: + os.makedirs(output_dir, exist_ok=True) + command = [ + "pdftoppm", + "-r", "72", # Resolution: 300 DPI + pdf_path, + f"{output_dir}/page", + f"-{output_format}" + ] + subprocess.run(command, check=True) + print(f"Images extracted to {output_dir}") + except subprocess.CalledProcessError as e: + print(f"Error during image extraction: {e}") + +def reassemble_pdf(image_dir, output_pdf_path): + """ + Reassemble a new PDF from extracted images. + """ + try: + images = sorted(Path(image_dir).glob("page-*.png")) + if not images: + print("No images found to reassemble.") + return + with open(output_pdf_path, "wb") as f: + f.write(img2pdf.convert([str(img) for img in images])) + print(f"Reassembled PDF saved at {output_pdf_path}") + except Exception as e: + print(f"Error reassembling PDF: {e}") + +def main(input_pdf, output_pdf): + if not os.path.isfile(input_pdf): + print(f"Error: Input file '{input_pdf}' does not exist.") + return + + temp_dir = "extracted_images" + + if is_image_only_pdf(input_pdf): + print("The PDF contains only images. Extracting and reassembling...") + extract_images_with_pdftoppm(input_pdf, temp_dir) + reassemble_pdf(temp_dir, output_pdf) + else: + print("The PDF contains text or mixed content. Copying input to output...") + shutil.copy(input_pdf, output_pdf) + print(f"Copied '{input_pdf}' to '{output_pdf}'.") + + # Clean up temporary directory + #if os.path.exists(temp_dir): + # shutil.rmtree(temp_dir) + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Process PDF files with image-only detection.") + parser.add_argument("input_pdf", help="Path to the input PDF file") + parser.add_argument("output_pdf", help="Path to the output PDF file") + args = parser.parse_args() + + main(args.input_pdf, args.output_pdf) + diff --git a/output.pdf b/output.pdf new file mode 100644 index 0000000..6f86358 Binary files /dev/null and b/output.pdf differ diff --git a/tests/SWCPI25011315283.pdf b/tests/SWCPI25011315283.pdf new file mode 100644 index 0000000..681b81f Binary files /dev/null and b/tests/SWCPI25011315283.pdf differ