88 lines
4.0 KiB
Python
Executable File
88 lines
4.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import base64
import json
import os
import re
import tempfile
import urllib.parse
import urllib.request

import openai
|
|
|
|
# SECURITY: an API key used to be hard-coded on this line. Secrets must never
# live in source control -- the previously committed key should be treated as
# leaked and revoked/rotated. The key is now supplied through the
# OPENAI_API_KEY environment variable; if it is unset the OpenAI client raises
# its own configuration error at construction time.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
|
|
|
|
|
|
def slurp_file(filename):
    """Return the entire contents of *filename* as text.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the locale/platform default encoding of the host running the script.

    Args:
        filename: Path of the text file to read.

    Returns:
        The file's full contents as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filename, "r", encoding="utf-8") as file:
        return file.read()
|
|
|
|
|
|
# System prompt passed as `instructions` on every extraction request. It lists
# the document types to expect, the per-field extraction rules, and the
# output contract (a bare JSON array, no Markdown fences, no prose).
BASE_PROMPT = """You are an invoice extraction assistant. Your job is to read PDF documents and extract all invoice and credit note details into structured JSON.

DOCUMENT TYPES YOU MAY ENCOUNTER:
1. **Single Invoice** — one invoice with line items, totals, dates, etc.
2. **Credit Note** — similar to an invoice but represents a credit/refund. Extract it the same way; the total will be positive (the credit amount).
3. **Statement / Summary** — a document listing multiple invoices or credits in a table or list format. Each row or entry represents a separate invoice/credit. Extract EACH one as a separate object in the output array.
4. **Mixed** — a document containing both invoices and credits.

EXTRACTION RULES:
- Extract EVERY invoice or credit you can find. If the document is a statement listing 10 invoices, return all 10.
- **customer_identifier**: The name of the customer/buyer. Look for "Bill To", "Customer", "Sold To", or the company name at the top.
- **vendor_identifier**: The name of the vendor/seller. Look for "From", "Vendor", "Supplier", letterhead, or the company issuing the document.
- **date**: The invoice date in ISO 8601 format (YYYY-MM-DD). If multiple dates exist, use the invoice date, not the due date or statement period.
- **invoice_number**: The unique invoice or credit note number. Look for labels like "Invoice #", "Inv No", "Credit Note #", "Reference", "Doc #".
- **account_number**: The customer's account number if present. Not required — omit if not found.
- **total**: The total amount as a decimal string (e.g., "1234.56"). Use the grand total or amount due. For credits, use the credit amount as a positive number. Numbers in parentheses indicate credits — extract them as positive values.
- **explanation**: Only use this when you cannot find any valid invoices. Provide a detailed reason (e.g., "document is blank", "PDF contains only images with no extractable text", "document is a cover letter with no invoice data").

IMPORTANT:
- Do NOT skip entries because some fields are missing. Extract what you can.
- For statements/summaries, each row in an invoice table is a separate invoice entry.
- If OCR fails completely and no text can be extracted at all, return an array with one object containing only the explanation field.
- Your FINAL response to the user must be ONLY a JSON array. Do NOT wrap it in markdown code blocks. Do NOT add any prose before or after the JSON."""
|
|
|
|
|
|
# Matches a Markdown code fence (optionally tagged "json") anywhere in the
# reply. Whitespace around the payload is optional so that both
# "```json\n[...]\n```" and "```[...]```" are unwrapped.
_CODE_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.DOTALL | re.IGNORECASE)


def _strip_code_fence(text):
    """Return the payload inside the first Markdown code fence, else *text* unchanged."""
    fenced = _CODE_FENCE_RE.search(text)
    if fenced:
        return fenced.group(1)
    return text


def analyze_pdf(pdf_path, *, model="gpt-4o"):
    """Send a local PDF to the model and return its extraction reply as text.

    The file is read in full, base64-encoded, and attached as an
    ``input_file`` part of a single user message; ``BASE_PROMPT`` is supplied
    as the request instructions.

    Args:
        pdf_path: Path of the PDF file to analyze. Its basename is sent as
            the attachment's filename.
        model: Name of the model to query. Defaults to ``"gpt-4o"``.

    Returns:
        The model's output text. The prompt asks for a bare JSON array, but
        if the reply is wrapped in a Markdown code fence anyway, only the
        fenced payload is returned. The text is NOT parsed or validated as
        JSON here.

    Raises:
        OSError: If *pdf_path* cannot be read.
        Errors raised by the OpenAI client propagate unchanged.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_data = pdf_file.read()

    base64_string = base64.b64encode(pdf_data).decode("utf-8")

    response = client.responses.create(
        model=model,
        instructions=BASE_PROMPT,
        input=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_file",
                        "filename": os.path.basename(pdf_path),
                        "file_data": f"data:application/pdf;base64,{base64_string}",
                    },
                    {
                        "type": "input_text",
                        "text": "extract the invoice(s) and/or credit(s) details from this document.",
                    },
                ],
            }
        ],
    )
    # Models sometimes ignore the "no code blocks" instruction; unwrap if so.
    return _strip_code_fence(response.output_text)
|
|
|
|
|
|
def analyze_url(url):
    """Download the PDF at *url* and return the result of ``analyze_pdf`` on it.

    Args:
        url: An ``http`` or ``https`` URL pointing at the PDF to analyze.

    Returns:
        Whatever ``analyze_pdf`` returns for the downloaded file.

    Raises:
        ValueError: If *url* does not use the http or https scheme.
        urllib.error.URLError: If the download fails or times out.
    """
    # SECURITY: the URL comes from the caller's event payload. urlopen() also
    # accepts schemes such as file://, which would read local files and ship
    # them to a third-party API, so only plain web URLs are allowed.
    scheme = urllib.parse.urlsplit(url).scheme.lower()
    if scheme not in ("http", "https"):
        raise ValueError(
            f"unsupported URL scheme {scheme!r}; expected http or https"
        )

    # A timeout keeps an unresponsive server from hanging the call forever.
    with urllib.request.urlopen(url, timeout=30) as response:
        data = response.read()

    # Use a private, self-cleaning directory instead of a fixed shared path so
    # concurrent calls cannot overwrite each other's download and nothing is
    # left behind. The file keeps the name "test.pdf" because its basename is
    # what analyze_pdf reports as the attachment filename.
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "test.pdf")
        with open(pdf_path, "wb") as f:
            f.write(data)
        return analyze_pdf(pdf_path)
|
|
|
|
|
|
def handler(event, context):
    """Entry point: analyze the PDF referenced by ``event['url']``.

    The ``(event, context)`` signature looks like an AWS Lambda handler --
    NOTE(review): confirm against the deployment configuration.

    Args:
        event: Mapping that must contain a ``'url'`` key.
        context: Unused.

    Returns:
        The value returned by ``analyze_url`` for the event's URL.

    Raises:
        KeyError: If *event* has no ``'url'`` entry.
    """
    # Dump the raw event first so a malformed payload is still visible in logs.
    print(event)
    target_url = event['url']
    print("URL IS", target_url)
    result = analyze_url(target_url)
    return result