new glimpse approach
This commit is contained in:
94
code/main.py
94
code/main.py
@@ -2,7 +2,6 @@
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import urllib.request
|
||||
|
||||
from dotenv import load_dotenv
|
||||
@@ -24,29 +23,81 @@ def slurp_file(filename):
|
||||
return file.read()
|
||||
|
||||
|
||||
# System prompt for the invoice-extraction model. The model reports its
# findings through the `parsed_invoices` tool call (an `invoices` array plus an
# optional `explanation`), so the prompt must NOT ask for a bare JSON array in
# the message text -- that older instruction contradicts the tool-call flow.
BASE_PROMPT = """You are an invoice extraction assistant. Your job is to read PDF documents and extract all invoice and credit note details.

DOCUMENT TYPES YOU MAY ENCOUNTER:
1. **Single Invoice** — one invoice with line items, totals, dates, etc.
2. **Credit Note** — similar to an invoice but represents a credit/refund. Extract it the same way; the total will be positive (the credit amount).
3. **Statement / Summary** — a document listing multiple invoices or credits in a table or list format. Each row or entry represents a separate invoice/credit.
4. **Mixed** — a document containing both invoices and credits.

EXTRACTION RULES:
- Extract EVERY invoice or credit you can find. If the document is a statement listing 10 invoices, include all 10 in the invoices array.
- **customer_identifier**: The name of the customer/buyer. Look for "Bill To", "Customer", "Sold To", or the company name at the top.
- **vendor_identifier**: The name of the vendor/seller. Look for "From", "Vendor", "Supplier", letterhead, or the company issuing the document.
- **date**: The invoice date in ISO 8601 format (YYYY-MM-DD). If multiple dates exist, use the invoice date, not the due date or statement period.
- **invoice_number**: The unique invoice or credit note number. Look for labels like "Invoice #", "Inv No", "Credit Note #", "Reference", "Doc #".
- **account_number**: The customer's account number if present. Not required — omit if not found.
- **total**: The total amount as a decimal string (e.g., "1234.56"). Use the grand total or amount due. For credits, use the credit amount as a positive number. Numbers in parentheses indicate credits — extract them as positive values.
- **explanation**: Only use this when you cannot find any valid invoices. Provide a detailed reason (e.g., "document is blank", "PDF contains only images with no extractable text", "document is a cover letter with no invoice data").

IMPORTANT:
- Do NOT skip entries because some fields are missing. Extract what you can.
- For statements/summaries, each row in an invoice table is a separate entry in the invoices array.
- If OCR fails completely and no text can be extracted at all, set the explanation field to indicate why."""
|
||||
|
||||
# Function-calling tool definition passed to the chat completion request via
# `tools=[INVOICE_TOOL]`. The model returns its extraction as the arguments of
# a `parsed_invoices` call, whose shape is described by the JSON Schema below.
INVOICE_TOOL = {
    "type": "function",
    "function": {
        "name": "parsed_invoices",
        "description": "Record all extracted invoices and credit notes from the document. Include every invoice found in the invoices array.",
        "parameters": {
            "type": "object",
            "properties": {
                # One entry per invoice / credit note found in the document.
                "invoices": {
                    "description": "Array of all invoices and credit notes extracted from the document. Include every one you find.",
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "customer_identifier": {
                                "description": "The customer's name. e.g., ABC Corporation, Microsoft, etc.",
                                "type": "string",
                            },
                            "vendor_identifier": {
                                "description": "The vendor's name",
                                "type": "string",
                            },
                            "date": {
                                "description": "Invoice date in ISO 8601 format (YYYY-MM-DD).",
                                "type": "string",
                                "format": "date",
                            },
                            "invoice_number": {
                                "description": "Unique invoice number for the transaction.",
                                "type": "string",
                            },
                            # Optional: deliberately absent from "required".
                            "account_number": {
                                "description": "Customer's account number associated with the invoice. Not always present on the invoice.",
                                "type": "string",
                            },
                            # Unsigned decimal string with at most two
                            # fraction digits, e.g. "1234.56".
                            "total": {
                                "description": "Total amount of the invoice, including taxes and fees. It should be a decimal number as a string.",
                                "type": "string",
                                "pattern": "^\\d+(\\.\\d{1,2})?$",
                            },
                        },
                        "required": ["customer_identifier", "vendor_identifier", "date", "invoice_number", "total"],
                        "additionalProperties": False,
                    },
                },
                # Free-text reason, sibling of "invoices", for documents that
                # yield no invoices at all.
                "explanation": {
                    "description": "Only use this when you cannot find any valid invoices. Provide a detailed reason (e.g., 'document is blank', 'PDF contains only images with no extractable text', 'document is a cover letter with no invoice data').",
                    "type": "string",
                },
            },
        },
    },
}
|
||||
|
||||
|
||||
def analyze_pdf(pdf_path):
|
||||
@@ -80,12 +131,18 @@ def analyze_pdf(pdf_path):
|
||||
],
|
||||
},
|
||||
],
|
||||
tools=[INVOICE_TOOL],
|
||||
)
|
||||
text = response.choices[0].message.content
|
||||
match = re.search(r'```(?:json)?\s*\n(.*?)\n```', text, re.DOTALL)
|
||||
if match:
|
||||
text = match.group(1)
|
||||
return text
|
||||
|
||||
message = response.choices[0].message
|
||||
|
||||
if message.tool_calls:
|
||||
for tool_call in message.tool_calls:
|
||||
if tool_call.function.name == "parsed_invoices":
|
||||
data = json.loads(tool_call.function.arguments)
|
||||
return data.get("invoices", [])
|
||||
|
||||
return []
|
||||
|
||||
|
||||
def analyze_url(url):
|
||||
@@ -98,6 +155,11 @@ def analyze_url(url):
|
||||
|
||||
def handler(event, context):
    """Entry point: route an invocation event to the matching analyzer.

    The event may carry the document either inline or by reference:

    * ``event["pdf_base64"]`` -- base64-encoded PDF bytes. They are decoded,
      written to ``/tmp/invoice.pdf`` and passed to ``analyze_pdf``.
    * ``event["url"]`` -- a location handed to ``analyze_url``.

    When both keys are present, ``pdf_base64`` takes precedence.

    Args:
        event: Mapping describing the request (see above).
        context: Invocation context supplied by the caller; not used here.

    Returns:
        Whatever ``analyze_pdf`` / ``analyze_url`` returns for the document.

    Raises:
        ValueError: If the event contains neither ``url`` nor ``pdf_base64``.
    """
    # NOTE(review): this logs the entire event, including the full base64
    # payload when one is supplied -- consider trimming if logs get noisy.
    print(event)

    # Inline payload first. The previous unconditional event['url'] lookup
    # raised KeyError for PDF-only events and returned before this branch
    # could ever run.
    if "pdf_base64" in event:
        pdf_path = "/tmp/invoice.pdf"
        with open(pdf_path, "wb") as f:
            f.write(base64.b64decode(event["pdf_base64"]))
        return analyze_pdf(pdf_path)

    if "url" in event:
        return analyze_url(event["url"])

    raise ValueError("event must contain 'url' or 'pdf_base64'")
|
||||
Reference in New Issue
Block a user