changes
This commit is contained in:
85
clean.py
Normal file
85
clean.py
Normal file
@@ -0,0 +1,85 @@
|
||||
import os
import shutil
import subprocess
import tempfile
from pathlib import Path

import PyPDF2
import img2pdf
|
||||
|
||||
def is_image_only_pdf(pdf_path):
    """
    Check if a PDF contains only image-based content.

    A PDF is reported as image-only when it has at least one page and every
    page both references at least one image XObject and yields no
    extractable text.

    Args:
        pdf_path: Path to the PDF file to inspect.

    Returns:
        True if every page is image-only. False otherwise, including when
        the file cannot be opened or parsed (the error is printed).
    """
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            pages = list(reader.pages)
            # all() is True for an empty sequence; a document without pages
            # has nothing to rasterize, so it is not treated as image-only.
            if not pages:
                return False
            return all(_page_is_image_only(page) for page in pages)
    except Exception as e:
        # Boundary handler: any open/parse failure is reported and treated
        # as "not image-only" so the caller falls back to a plain copy.
        print(f"Error reading PDF: {e}")
        return False


def _resolve_pdf_object(obj):
    """Return the object behind a PyPDF2 indirect reference, or obj itself."""
    return obj.get_object() if hasattr(obj, "get_object") else obj


def _page_is_image_only(page):
    """Return True if the page has an image XObject and no extractable text."""
    resources = _resolve_pdf_object(page.get("/Resources", {}))
    xobjects = _resolve_pdf_object(resources.get("/XObject", {}))
    # Look at each XObject's real /Subtype instead of string-matching the
    # resources dict, which can match /ProcSet names such as /ImageC.
    # NOTE(review): images nested inside Form XObjects are not searched; such
    # pages are reported as not image-only, i.e. the conservative answer.
    has_image = any(
        _resolve_pdf_object(
            _resolve_pdf_object(xobject).get("/Subtype")
        ) == "/Image"
        for xobject in xobjects.values()
    )
    if not has_image:
        return False
    # Any extractable text (including an OCR text layer) means mixed content.
    return not (page.extract_text() or "").strip()
|
||||
|
||||
def extract_images_with_pdftoppm(pdf_path, output_dir, output_format="png", resolution=72):
    """
    Render the pages of a PDF to image files using pdftoppm.

    The output directory is created if needed and pdftoppm is invoked with
    ``<output_dir>/page`` as the output file prefix.

    Args:
        pdf_path: Path to the source PDF.
        output_dir: Directory that receives the rendered page images.
        output_format: pdftoppm format flag name without the leading dash
            (passed as ``-<output_format>``). Defaults to "png".
        resolution: Rendering resolution in DPI, passed to ``-r``.
            Defaults to 72.

    Returns:
        None. Failures of the pdftoppm process, or a missing pdftoppm
        executable, are printed rather than raised.
    """
    os.makedirs(output_dir, exist_ok=True)
    command = [
        "pdftoppm",
        "-r", str(resolution),  # Rendering resolution in DPI
        pdf_path,
        os.path.join(output_dir, "page"),
        f"-{output_format}",
    ]
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError:
        # Raised when the pdftoppm executable itself cannot be found.
        print("Error during image extraction: 'pdftoppm' executable not found.")
    except subprocess.CalledProcessError as e:
        print(f"Error during image extraction: {e}")
    else:
        print(f"Images extracted to {output_dir}")
|
||||
|
||||
def reassemble_pdf(image_dir, output_pdf_path):
    """
    Reassemble a new PDF from extracted images.

    Only files in ``image_dir`` matching ``page-*.png`` are used, in sorted
    path order.

    Args:
        image_dir: Directory containing the extracted page images.
        output_pdf_path: Destination path of the generated PDF.

    Returns:
        None. If no images are found, or conversion/writing fails, a message
        is printed and no exception is raised.
    """
    try:
        images = sorted(Path(image_dir).glob("page-*.png"))
        if not images:
            print("No images found to reassemble.")
            return
        # Convert before opening the destination: opening with "wb" first
        # would truncate/create the output even when conversion fails.
        pdf_bytes = img2pdf.convert([str(img) for img in images])
        with open(output_pdf_path, "wb") as f:
            f.write(pdf_bytes)
        print(f"Reassembled PDF saved at {output_pdf_path}")
    except Exception as e:
        print(f"Error reassembling PDF: {e}")
|
||||
|
||||
def main(input_pdf, output_pdf):
    """
    Produce output_pdf from input_pdf, re-rendering image-only PDFs.

    If the input is detected as image-only, its pages are rendered to images
    and reassembled into a new PDF. Otherwise the input is copied to the
    output path unchanged.

    Args:
        input_pdf: Path to the existing input PDF.
        output_pdf: Path where the resulting PDF is written.

    Returns:
        None. A missing input file is reported with a printed message.
    """
    if not os.path.isfile(input_pdf):
        print(f"Error: Input file '{input_pdf}' does not exist.")
        return

    if not is_image_only_pdf(input_pdf):
        print("The PDF contains text or mixed content. Copying input to output...")
        shutil.copy(input_pdf, output_pdf)
        print(f"Copied '{input_pdf}' to '{output_pdf}'.")
        return

    print("The PDF contains only images. Extracting and reassembling...")
    # Use a fresh, private directory per run: a fixed shared directory would
    # let page images left over from an earlier run end up in this PDF. The
    # context manager removes the directory even if a step fails.
    with tempfile.TemporaryDirectory(prefix="extracted_images_") as temp_dir:
        extract_images_with_pdftoppm(input_pdf, temp_dir)
        reassemble_pdf(temp_dir, output_pdf)
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Command-line entry point: two positional paths, input then output.
    cli = argparse.ArgumentParser(
        description="Process PDF files with image-only detection."
    )
    for arg_name, arg_help in (
        ("input_pdf", "Path to the input PDF file"),
        ("output_pdf", "Path to the output PDF file"),
    ):
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()

    main(options.input_pdf, options.output_pdf)
|
||||
|
||||
Reference in New Issue
Block a user