"""Rebuild image-only PDFs from page renderings; copy all other PDFs unchanged."""

import os
import shutil
import subprocess
import tempfile
from pathlib import Path

import img2pdf
import PyPDF2


def is_image_only_pdf(pdf_path):
    """
    Check if a PDF contains only image-based content.

    A PDF qualifies when it has at least one page and every page both
    shows an "/Image" marker in its resource dictionary and yields no
    extractable text.

    Args:
        pdf_path: Path to the PDF file to inspect.

    Returns:
        True if every page looks image-only. False otherwise, including
        when the PDF has no pages or cannot be opened or parsed (in which
        case the error is printed).
    """
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            page_count = 0
            for page in reader.pages:
                page_count += 1
                # Heuristic: look for an "/Image" marker in the string form
                # of the page's resource dictionary.
                if "/Image" not in str(page.get("/Resources", {})):
                    return False
                # A page that has an image *and* a text layer is mixed
                # content; rasterizing it would throw the text away.
                if (page.extract_text() or "").strip():
                    return False
            # A document without pages has nothing to rasterize, so it must
            # not be reported as image-only.
            return page_count > 0
    except Exception as e:
        # Any failure to read the file is treated as "not image-only" so the
        # caller falls back to its non-destructive path.
        print(f"Error reading PDF: {e}")
        return False
def extract_images_with_pdftoppm(
    pdf_path, output_dir, output_format="png", resolution=72
):
    """
    Render the pages of a PDF to image files using pdftoppm.

    The output directory is created if needed and pdftoppm is invoked with
    ``<output_dir>/page`` as the file-name root. Failures of the external
    tool are printed rather than raised.

    Args:
        pdf_path: Path to the PDF file to render.
        output_dir: Directory that receives the rendered images.
        output_format: Name of the pdftoppm format flag without the leading
            dash (for example "png"); it is passed as ``-<output_format>``.
        resolution: Rendering resolution in DPI, passed to pdftoppm's ``-r``
            option. Defaults to 72; pass a higher value (for example 300)
            for better quality at the cost of larger files.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)
    command = [
        "pdftoppm",
        "-r", str(resolution),
        f"-{output_format}",
        # Options go before the positional arguments, matching the tool's
        # documented synopsis: pdftoppm [options] PDF-file PPM-root.
        pdf_path,
        os.path.join(output_dir, "page"),
    ]
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError:
        # Raised when the pdftoppm executable itself cannot be started.
        print("Error during image extraction: 'pdftoppm' executable not found.")
    except subprocess.CalledProcessError as e:
        print(f"Error during image extraction: {e}")
    else:
        print(f"Images extracted to {output_dir}")
def _page_sort_key(path):
    """Order page images by the number after the last dash in the file stem."""
    suffix = path.stem.rpartition("-")[2]
    if suffix.isdecimal():
        return (int(suffix), path.name)
    # Names without a numeric suffix sort after all numbered pages.
    return (float("inf"), path.name)


def reassemble_pdf(image_dir, output_pdf_path):
    """
    Reassemble a new PDF from extracted images.

    Collects the ``page-*.png`` files in ``image_dir``, orders them by page
    number, and writes them as a single PDF. Errors are printed rather than
    raised.

    Args:
        image_dir: Directory containing the ``page-*.png`` images.
        output_pdf_path: Path of the PDF file to write.

    Returns:
        None. Nothing is written when no images are found or when the
        conversion fails.
    """
    try:
        # Sort numerically so that page-10 follows page-9 even when the
        # page numbers are not zero-padded.
        images = sorted(Path(image_dir).glob("page-*.png"), key=_page_sort_key)
        if not images:
            print("No images found to reassemble.")
            return
        # Convert before opening the destination: if the conversion fails,
        # no empty or truncated output file is left behind.
        pdf_bytes = img2pdf.convert([str(img) for img in images])
        with open(output_pdf_path, "wb") as f:
            f.write(pdf_bytes)
        print(f"Reassembled PDF saved at {output_pdf_path}")
    except Exception as e:
        print(f"Error reassembling PDF: {e}")
def main(input_pdf, output_pdf):
    """
    Produce ``output_pdf`` from ``input_pdf``.

    If the input is detected as image-only, its pages are rendered to images
    and reassembled into a new PDF. Otherwise the input is copied to the
    output path unchanged.

    Args:
        input_pdf: Path to the existing PDF to process.
        output_pdf: Path where the resulting PDF is written.

    Returns:
        None. A missing input file is reported with a printed message.
    """
    if not os.path.isfile(input_pdf):
        print(f"Error: Input file '{input_pdf}' does not exist.")
        return

    if is_image_only_pdf(input_pdf):
        print("The PDF contains only images. Extracting and reassembling...")
        # Use a fresh, private directory for every run: a fixed directory
        # that is never emptied would let page images left over from an
        # earlier run end up in this run's output. The directory is removed
        # automatically when the block exits.
        with tempfile.TemporaryDirectory(prefix="extracted_images_") as temp_dir:
            extract_images_with_pdftoppm(input_pdf, temp_dir)
            reassemble_pdf(temp_dir, output_pdf)
    else:
        print("The PDF contains text or mixed content. Copying input to output...")
        shutil.copy(input_pdf, output_pdf)
        print(f"Copied '{input_pdf}' to '{output_pdf}'.")
if __name__ == "__main__":
    import argparse

    # Command-line entry point: two positional paths handed straight to main().
    cli = argparse.ArgumentParser(
        description="Process PDF files with image-only detection."
    )
    for arg_name, arg_help in (
        ("input_pdf", "Path to the input PDF file"),
        ("output_pdf", "Path to the output PDF file"),
    ):
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()

    main(options.input_pdf, options.output_pdf)