# glimpse2/code/main.py

#!/usr/bin/env python3
import base64
import json
import os
import tempfile
import urllib.parse
import urllib.request

from dotenv import load_dotenv

# Load variables from a ``.env`` file located next to this module, when one
# exists. This must run before the OpenAI client further down is constructed,
# because that client reads OPENROUTER_API_KEY from the process environment.
_env_path = os.path.join(os.path.dirname(__file__), '.env')
if os.path.exists(_env_path):
    load_dotenv(_env_path)

import openai

# Module-wide OpenAI-SDK client pointed at the OpenRouter endpoint. It is
# created at import time, so a missing OPENROUTER_API_KEY raises KeyError as
# soon as this module is imported.
client = openai.OpenAI(
    api_key=os.environ["OPENROUTER_API_KEY"],
    base_url="https://openrouter.ai/api/v1",
)


def slurp_file(filename, encoding="utf-8"):
    """Return the entire contents of the text file ``filename`` as a string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The decoded file contents (an empty string for an empty file).

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
    """
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()


# System prompt for the extraction request. It lists the kinds of document the
# model may receive and the per-field extraction rules; the field names used
# here are the same ones declared in the ``parsed_invoices`` tool schema.
BASE_PROMPT="""You are an invoice extraction assistant. Your job is to read PDF documents and extract all invoice and credit note details.

DOCUMENT TYPES YOU MAY ENCOUNTER:
1. **Single Invoice** — one invoice with line items, totals, dates, etc.
2. **Credit Note** — similar to an invoice but represents a credit/refund. Extract it the same way; the total will be positive (the credit amount).
3. **Statement / Summary** — a document listing multiple invoices or credits in a table or list format. Each row or entry represents a separate invoice/credit.
4. **Mixed** — a document containing both invoices and credits.

EXTRACTION RULES:
- Extract EVERY invoice or credit you can find. If the document is a statement listing 10 invoices, include all 10 in the invoices array.
- **customer_identifier**: The name of the customer/buyer. Look for "Bill To", "Customer", "Sold To", or the company name at the top.
- **vendor_identifier**: The name of the vendor/seller. Look for "From", "Vendor", "Supplier", letterhead, or the company issuing the document.
- **date**: The invoice date in ISO 8601 format (YYYY-MM-DD). If multiple dates exist, use the invoice date, not the due date or statement period.
- **invoice_number**: The unique invoice or credit note number. Look for labels like "Invoice #", "Inv No", "Credit Note #", "Reference", "Doc #".
- **account_number**: The customer's account number if present. Not required — omit if not found.
- **total**: The total amount as a decimal string (e.g., "1234.56"). Use the grand total or amount due. For credits, use the credit amount as a positive number. Numbers in parentheses indicate credits — extract them as positive values.

IMPORTANT:
- Do NOT skip entries because some fields are missing. Extract what you can.
- For statements/summaries, each row in an invoice table is a separate entry in the invoices array.
- If OCR fails completely and no text can be extracted at all, set the explanation field to indicate why."""

def _string_field(description, **extra):
    """Build a string-typed JSON-schema property.

    Keys are emitted as ``description``, ``type``, then any extra keywords, so
    the resulting dict keeps the same insertion order as a hand-written one.
    """
    return {"description": description, "type": "string", **extra}


# Schema of a single extracted invoice / credit note. ``account_number`` is
# deliberately absent from ``required``.
_INVOICE_ITEM_SCHEMA = {
    "type": "object",
    "properties": {
        "customer_identifier": _string_field(
            "The customer's name. e.g., ABC Corporation, Microsoft, etc."
        ),
        "vendor_identifier": _string_field("The vendor's name"),
        "date": _string_field(
            "Invoice date in ISO 8601 format (YYYY-MM-DD).",
            format="date",
        ),
        "invoice_number": _string_field(
            "Unique invoice number for the transaction."
        ),
        "account_number": _string_field(
            "Customer's account number associated with the invoice. Not always present on the invoice."
        ),
        "total": _string_field(
            "Total amount of the invoice, including taxes and fees. It should be a decimal number as a string.",
            pattern="^\\d+(\\.\\d{1,2})?$",
        ),
    },
    "required": ["customer_identifier", "vendor_identifier", "date", "invoice_number", "total"],
    "additionalProperties": False,
}

# Function-calling tool definition through which the model reports its
# results: an ``invoices`` array, or an ``explanation`` when nothing was found.
INVOICE_TOOL = {
    "type": "function",
    "function": {
        "name": "parsed_invoices",
        "description": "Record all extracted invoices and credit notes from the document. Include every invoice found in the invoices array.",
        "parameters": {
            "type": "object",
            "properties": {
                "invoices": {
                    "description": "Array of all invoices and credit notes extracted from the document. Include every one you find.",
                    "type": "array",
                    "items": _INVOICE_ITEM_SCHEMA,
                },
                "explanation": _string_field(
                    "Only use this when you cannot find any valid invoices. Provide a detailed reason (e.g., 'document is blank', 'PDF contains only images with no extractable text', 'document is a cover letter with no invoice data')."
                ),
            },
        },
    },
}


def analyze_pdf(pdf_path):
    """Send the PDF at ``pdf_path`` to the model and return the extracted invoices.

    The file is base64-encoded into a ``data:application/pdf`` URL and attached
    to a chat-completion request together with ``BASE_PROMPT`` (system message)
    and ``INVOICE_TOOL`` (the only tool offered). The model name is taken from
    the ``OPENROUTER_MODEL`` environment variable, with a built-in default.

    Args:
        pdf_path: Path of the PDF file to analyse. Its basename is forwarded
            to the API as the attachment's filename.

    Returns:
        The ``invoices`` list from the first ``parsed_invoices`` tool call.
        An empty list is returned when the model made no such call, or when
        the call's arguments carry no list under ``invoices``.

    Raises:
        OSError: If ``pdf_path`` cannot be read.
        RuntimeError: If the API response contains no choices.
        json.JSONDecodeError: If the tool-call arguments are not valid JSON.
    """
    model = os.environ.get("OPENROUTER_MODEL", "openai/gpt-5.4-mini")

    with open(pdf_path, 'rb') as f:
        pdf_data = f.read()
    base64_string = base64.b64encode(pdf_data).decode("utf-8")

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": BASE_PROMPT,
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "file",
                        "file": {
                            "filename": os.path.basename(pdf_path),
                            "file_data": f"data:application/pdf;base64,{base64_string}",
                        },
                    },
                    {
                        "type": "text",
                        "text": "extract the invoice(s) and/or credit(s) details from this document.",
                    },
                ],
            },
        ],
        tools=[INVOICE_TOOL],
    )

    # A response without choices (for example an error payload) would
    # otherwise surface as an opaque IndexError/TypeError on ``choices[0]``.
    if not response.choices:
        raise RuntimeError("model response contained no choices")
    message = response.choices[0].message

    for tool_call in message.tool_calls or []:
        if tool_call.function.name != "parsed_invoices":
            continue
        data = json.loads(tool_call.function.arguments)
        # Callers expect a list: guard against arguments that are not a JSON
        # object and against ``"invoices": null`` (or any other non-list).
        invoices = data.get("invoices") if isinstance(data, dict) else None
        return invoices if isinstance(invoices, list) else []

    return []


def analyze_url(url, timeout=30):
    """Download the PDF at ``url`` and return the invoices extracted from it.

    Args:
        url: ``http``/``https`` URL of the PDF, as a string or a
            ``urllib.request.Request``.
        timeout: Socket timeout in seconds passed to ``urlopen`` so a stalled
            server cannot block the call indefinitely.

    Returns:
        Whatever ``analyze_pdf`` returns for the downloaded file.

    Raises:
        ValueError: If the URL scheme is not ``http`` or ``https``.
        urllib.error.URLError: If the download fails.
    """
    target = url.full_url if isinstance(url, urllib.request.Request) else url
    scheme = urllib.parse.urlsplit(target).scheme.lower()
    # The URL comes from the caller's event; without this check ``urlopen``
    # would also accept schemes such as ``file://`` and read local files.
    if scheme not in ("http", "https"):
        raise ValueError(
            f"unsupported URL scheme {scheme!r}; only http and https are allowed"
        )

    with urllib.request.urlopen(url, timeout=timeout) as response:
        data = response.read()

    # A private temporary directory replaces the fixed /tmp/test.pdf path, so
    # overlapping calls cannot overwrite each other's download and the file is
    # removed afterwards. The name "test.pdf" is kept because analyze_pdf
    # forwards the basename to the API.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, "test.pdf")
        with open(pdf_path, "wb") as f:
            f.write(data)
        return analyze_pdf(pdf_path)


def handler(event, context):
    """Extract invoices from the PDF described by ``event``.

    The ``(event, context)`` signature follows the AWS Lambda handler
    convention (presumably how this module is deployed; confirm).

    Args:
        event: Mapping containing either ``pdf_base64`` (base64-encoded PDF
            bytes) or ``url`` (where to download the PDF from). When both are
            present, ``pdf_base64`` takes precedence.
        context: Unused.

    Returns:
        The list of extracted invoices from ``analyze_pdf`` / ``analyze_url``.

    Raises:
        ValueError: If ``event`` contains neither ``url`` nor ``pdf_base64``.
    """
    # Log only the key names: the event may carry a very large base64 payload
    # or a URL with embedded credentials, neither of which belongs in logs.
    print(f"handler event keys: {list(event)}")
    if "pdf_base64" in event:
        pdf_bytes = base64.b64decode(event["pdf_base64"])
        # A private temporary directory replaces the fixed /tmp/invoice.pdf
        # path, so overlapping calls cannot clobber each other and the file is
        # removed afterwards. The name "invoice.pdf" is kept because
        # analyze_pdf forwards the basename to the API.
        with tempfile.TemporaryDirectory() as tmp_dir:
            pdf_path = os.path.join(tmp_dir, "invoice.pdf")
            with open(pdf_path, "wb") as f:
                f.write(pdf_bytes)
            return analyze_pdf(pdf_path)
    if "url" in event:
        return analyze_url(event["url"])
    raise ValueError("event must contain 'url' or 'pdf_base64'")