new glimpse approach

2026-05-27 10:12:45 -07:00
parent 6e65df2591
commit 2b9648dd1a
9 changed files with 768 additions and 23 deletions
--- a/code/main.py
+++ b/code/main.py
@@ -2,7 +2,6 @@
 import base64
 import json
 import os
-import re
 import urllib.request

 from dotenv import load_dotenv
@@ -24,29 +23,81 @@ def slurp_file(filename):
        return file.read()


-BASE_PROMPT="""You are an invoice extraction assistant. Your job is to read PDF documents and extract all invoice and credit note details into structured JSON.
+BASE_PROMPT="""You are an invoice extraction assistant. Your job is to read PDF documents and extract all invoice and credit note details.

 DOCUMENT TYPES YOU MAY ENCOUNTER:
 1. **Single Invoice** — one invoice with line items, totals, dates, etc.
 2. **Credit Note** — similar to an invoice but represents a credit/refund. Extract it the same way; the total will be positive (the credit amount).
-3. **Statement / Summary** — a document listing multiple invoices or credits in a table or list format. Each row or entry represents a separate invoice/credit. Extract EACH one as a separate object in the output array.
+3. **Statement / Summary** — a document listing multiple invoices or credits in a table or list format. Each row or entry represents a separate invoice/credit.
 4. **Mixed** — a document containing both invoices and credits.

 EXTRACTION RULES:
-- Extract EVERY invoice or credit you can find. If the document is a statement listing 10 invoices, return all 10.
+- Extract EVERY invoice or credit you can find. If the document is a statement listing 10 invoices, include all 10 in the invoices array.
 - **customer_identifier**: The name of the customer/buyer. Look for "Bill To", "Customer", "Sold To", or the company name at the top.
 - **vendor_identifier**: The name of the vendor/seller. Look for "From", "Vendor", "Supplier", letterhead, or the company issuing the document.
 - **date**: The invoice date in ISO 8601 format (YYYY-MM-DD). If multiple dates exist, use the invoice date, not the due date or statement period.
 - **invoice_number**: The unique invoice or credit note number. Look for labels like "Invoice #", "Inv No", "Credit Note #", "Reference", "Doc #".
 - **account_number**: The customer's account number if present. Not required — omit if not found.
 - **total**: The total amount as a decimal string (e.g., "1234.56"). Use the grand total or amount due. For credits, use the credit amount as a positive number. Numbers in parentheses indicate credits — extract them as positive values.
-- **explanation**: Only use this when you cannot find any valid invoices. Provide a detailed reason (e.g., "document is blank", "PDF contains only images with no extractable text", "document is a cover letter with no invoice data").

 IMPORTANT:
 - Do NOT skip entries because some fields are missing. Extract what you can.
-- For statements/summaries, each row in an invoice table is a separate invoice entry.
-- If OCR fails completely and no text can be extracted at all, return an array with one object containing only the explanation field.
-- Your FINAL response must be ONLY a JSON array. Do NOT wrap it in markdown code blocks. Do NOT add any prose before or after the JSON."""
+- For statements/summaries, each row in an invoice table is a separate entry in the invoices array.
+- If OCR fails completely and no text can be extracted at all, set the explanation field to indicate why."""
+
+INVOICE_TOOL = {
+    "type": "function",
+    "function": {
+        "name": "parsed_invoices",
+        "description": "Record all extracted invoices and credit notes from the document. Include every invoice found in the invoices array.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "invoices": {
+                    "description": "Array of all invoices and credit notes extracted from the document. Include every one you find.",
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "customer_identifier": {
+                                "description": "The customer's name. e.g., ABC Corporation, Microsoft, etc.",
+                                "type": "string"
+                            },
+                            "vendor_identifier": {
+                                "description": "The vendor's name",
+                                "type": "string"
+                            },
+                            "date": {
+                                "description": "Invoice date in ISO 8601 format (YYYY-MM-DD).",
+                                "type": "string",
+                                "format": "date"
+                            },
+                            "invoice_number": {
+                                "description": "Unique invoice number for the transaction.",
+                                "type": "string"
+                            },
+                            "account_number": {
+                                "description": "Customer's account number associated with the invoice. Not always present on the invoice.",
+                                "type": "string"
+                            },
+                            "total": {
+                                "description": "Total amount of the invoice, including taxes and fees. It should be a decimal number as a string.",
+                                "type": "string",
+                                "pattern": "^\\d+(\\.\\d{1,2})?$"
+                            }
+                        },
+                        "required": ["customer_identifier", "vendor_identifier", "date", "invoice_number", "total"],
+                        "additionalProperties": False
+                    }
+                },
+                "explanation": {
+                    "description": "Only use this when you cannot find any valid invoices. Provide a detailed reason (e.g., 'document is blank', 'PDF contains only images with no extractable text', 'document is a cover letter with no invoice data').",
+                    "type": "string"
+                }
+            }
+        }
+    }
+}


 def analyze_pdf(pdf_path):
@@ -80,12 +131,18 @@ def analyze_pdf(pdf_path):
                ],
            },
        ],
+        tools=[INVOICE_TOOL],
    )
-    text = response.choices[0].message.content
-    match = re.search(r'```(?:json)?\s*\n(.*?)\n```', text, re.DOTALL)
-    if match:
-        text = match.group(1)
-    return text
+
+    message = response.choices[0].message
+
+    if message.tool_calls:
+        for tool_call in message.tool_calls:
+            if tool_call.function.name == "parsed_invoices":
+                data = json.loads(tool_call.function.arguments)
+                return data.get("invoices", [])
+
+    return []


 def analyze_url(url):
@@ -98,6 +155,29 @@ def analyze_url(url):

 def handler(event, context):
-    print(event)
-    url = event['url']
-    print("URL IS", url)
-    return analyze_url(url)
+    """Entry point: analyze a PDF supplied inline (base64) or by URL.
+
+    ``event["pdf_base64"]`` is checked first, then ``event["url"]``; the
+    result of analyze_pdf / analyze_url is returned unchanged. Raises
+    ValueError when neither key is present.
+    """
+    if "pdf_base64" in event:
+        import tempfile  # only this branch needs it
+
+        # Keep the inline PDF out of the logs: the base64 payload can be
+        # large and carries the document contents.
+        print({k: v for k, v in event.items() if k != "pdf_base64"})
+        pdf_bytes = base64.b64decode(event["pdf_base64"])
+        # Unique temp file rather than a fixed /tmp/invoice.pdf, removed once
+        # analysis finishes so decoded invoices do not linger on disk.
+        fd, pdf_path = tempfile.mkstemp(suffix=".pdf")
+        try:
+            with os.fdopen(fd, "wb") as f:
+                f.write(pdf_bytes)
+            return analyze_pdf(pdf_path)
+        finally:
+            os.remove(pdf_path)
+    print(event)
+    if "url" in event:
+        return analyze_url(event["url"])
+    raise ValueError("event must contain 'url' or 'pdf_base64'")