#!/usr/bin/env python3
"""Extract invoice and credit-note details from PDF documents.

A PDF (read from a local path, downloaded from a URL, or supplied inline as
base64) is sent to a chat-completions model through the OpenRouter API. The
model is given the ``parsed_invoices`` tool and the arguments of its tool call
are returned as a list of invoice dictionaries.

Environment:
    OPENROUTER_API_KEY: required; read when this module is imported.
    OPENROUTER_MODEL: optional model id; defaults to "openai/gpt-5.4-mini".

A ``.env`` file located next to this module is loaded first when it exists.
"""
import base64
import json
import os
import urllib.request

from dotenv import load_dotenv

_env_path = os.path.join(os.path.dirname(__file__), '.env')
if os.path.exists(_env_path):
    load_dotenv(_env_path)

# Imported after the .env file has been loaded, as in the original ordering.
import openai

client = openai.OpenAI(
    api_key=os.environ["OPENROUTER_API_KEY"],
    base_url="https://openrouter.ai/api/v1",
)


def slurp_file(filename):
    """Return the full contents of *filename*, opened in text mode."""
    with open(filename, 'r') as file:
        return file.read()


# NOTE(review): the source this was recovered from had its newlines flattened.
# The wording below is unchanged; the line breaks were restored from the
# markdown list structure and should be confirmed against the deployed prompt.
BASE_PROMPT = """You are an invoice extraction assistant. Your job is to read PDF documents and extract all invoice and credit note details.

DOCUMENT TYPES YOU MAY ENCOUNTER:
1. **Single Invoice** — one invoice with line items, totals, dates, etc.
2. **Credit Note** — similar to an invoice but represents a credit/refund. Extract it the same way; the total will be positive (the credit amount).
3. **Statement / Summary** — a document listing multiple invoices or credits in a table or list format. Each row or entry represents a separate invoice/credit.
4. **Mixed** — a document containing both invoices and credits.

EXTRACTION RULES:
- Extract EVERY invoice or credit you can find. If the document is a statement listing 10 invoices, include all 10 in the invoices array.
- **customer_identifier**: The name of the customer/buyer. Look for "Bill To", "Customer", "Sold To", or the company name at the top.
- **vendor_identifier**: The name of the vendor/seller. Look for "From", "Vendor", "Supplier", letterhead, or the company issuing the document.
- **date**: The invoice date in ISO 8601 format (YYYY-MM-DD). If multiple dates exist, use the invoice date, not the due date or statement period.
- **invoice_number**: The unique invoice or credit note number. Look for labels like "Invoice #", "Inv No", "Credit Note #", "Reference", "Doc #".
- **account_number**: The customer's account number if present. Not required — omit if not found.
- **total**: The total amount as a decimal string (e.g., "1234.56"). Use the grand total or amount due. For credits, use the credit amount as a positive number. Numbers in parentheses indicate credits — extract them as positive values.

IMPORTANT:
- Do NOT skip entries because some fields are missing. Extract what you can.
- For statements/summaries, each row in an invoice table is a separate entry in the invoices array.
- If OCR fails completely and no text can be extracted at all, set the explanation field to indicate why."""

# Tool (function-calling) schema the model is asked to fill in.
INVOICE_TOOL = {
    "type": "function",
    "function": {
        "name": "parsed_invoices",
        "description": "Record all extracted invoices and credit notes from the document. Include every invoice found in the invoices array.",
        "parameters": {
            "type": "object",
            "properties": {
                "invoices": {
                    "description": "Array of all invoices and credit notes extracted from the document. Include every one you find.",
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "customer_identifier": {
                                "description": "The customer's name. e.g., ABC Corporation, Microsoft, etc.",
                                "type": "string"
                            },
                            "vendor_identifier": {
                                "description": "The vendor's name",
                                "type": "string"
                            },
                            "date": {
                                "description": "Invoice date in ISO 8601 format (YYYY-MM-DD).",
                                "type": "string",
                                "format": "date"
                            },
                            "invoice_number": {
                                "description": "Unique invoice number for the transaction.",
                                "type": "string"
                            },
                            "account_number": {
                                "description": "Customer's account number associated with the invoice. Not always present on the invoice.",
                                "type": "string"
                            },
                            "total": {
                                "description": "Total amount of the invoice, including taxes and fees. It should be a decimal number as a string.",
                                "type": "string",
                                "pattern": "^\\d+(\\.\\d{1,2})?$"
                            }
                        },
                        "required": ["customer_identifier", "vendor_identifier", "date", "invoice_number", "total"],
                        "additionalProperties": False
                    }
                },
                "explanation": {
                    "description": "Only use this when you cannot find any valid invoices. Provide a detailed reason (e.g., 'document is blank', 'PDF contains only images with no extractable text', 'document is a cover letter with no invoice data').",
                    "type": "string"
                }
            }
        }
    }
}


def _extract_invoices(pdf_data, filename):
    """Send raw PDF bytes to the model and return the invoices it reports.

    Args:
        pdf_data: The PDF content as bytes.
        filename: File name attached to the upload in the request.

    Returns:
        The ``invoices`` list from the first ``parsed_invoices`` tool call, or
        an empty list when the model made no such call or the field is
        missing or null.

    Raises:
        json.JSONDecodeError: If the tool-call arguments are not valid JSON.
    """
    model = os.environ.get("OPENROUTER_MODEL", "openai/gpt-5.4-mini")
    base64_string = base64.b64encode(pdf_data).decode("utf-8")

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": BASE_PROMPT,
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "file",
                        "file": {
                            "filename": filename,
                            "file_data": f"data:application/pdf;base64,{base64_string}",
                        },
                    },
                    {
                        "type": "text",
                        "text": "extract the invoice(s) and/or credit(s) details from this document.",
                    },
                ],
            },
        ],
        tools=[INVOICE_TOOL],
    )

    message = response.choices[0].message
    for tool_call in message.tool_calls or []:
        if tool_call.function.name == "parsed_invoices":
            data = json.loads(tool_call.function.arguments)
            # `or []` keeps the return type a list even when the model emits
            # `"invoices": null`.
            return data.get("invoices") or []
    return []


def _loggable_event(event):
    """Return *event* with any inline PDF payload replaced by a size summary."""
    if not isinstance(event, dict) or "pdf_base64" not in event:
        return event
    payload = event["pdf_base64"]
    size = len(payload) if isinstance(payload, (str, bytes)) else "unknown"
    redacted = dict(event)
    redacted["pdf_base64"] = f"<{size} base64 characters omitted>"
    return redacted


def analyze_pdf(pdf_path):
    """Extract invoices from the PDF file at *pdf_path*.

    Args:
        pdf_path: Path of a PDF file on the local filesystem.

    Returns:
        A list of invoice dictionaries (possibly empty).
    """
    with open(pdf_path, 'rb') as f:
        pdf_data = f.read()
    return _extract_invoices(pdf_data, os.path.basename(pdf_path))


def analyze_url(url, timeout=60):
    """Download the PDF at *url* and extract its invoices.

    The download is kept in memory; nothing is written to a temporary file.

    Args:
        url: Location of the PDF document.
        timeout: Seconds to wait on blocking network operations before
            giving up, so a stalled server cannot hang the caller forever.

    Returns:
        A list of invoice dictionaries (possibly empty).
    """
    # NOTE(security): urlopen also accepts non-HTTP schemes such as file://.
    # If `url` can come from an untrusted caller, validate the scheme and
    # host before reaching this point.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        data = response.read()
    # "test.pdf" is the upload name the previous file-based flow sent.
    return _extract_invoices(data, "test.pdf")


def handler(event, context):
    """Entry point taking an ``event`` mapping and an unused ``context``.

    Args:
        event: Mapping with either ``pdf_base64`` (base64-encoded PDF) or
            ``url`` (location to download the PDF from). ``pdf_base64`` wins
            when both are present.
        context: Not used.

    Returns:
        A list of invoice dictionaries (possibly empty).

    Raises:
        ValueError: If *event* contains neither ``url`` nor ``pdf_base64``.
    """
    # Log the event without the inline document: the raw payload can be
    # megabytes of base64 and may hold sensitive invoice data.
    print(_loggable_event(event))

    if "pdf_base64" in event:
        pdf_data = base64.b64decode(event["pdf_base64"])
        # "invoice.pdf" is the upload name the previous file-based flow sent.
        return _extract_invoices(pdf_data, "invoice.pdf")

    if "url" in event:
        return analyze_url(event["url"])

    raise ValueError("event must contain 'url' or 'pdf_base64'")