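progress: switch analyze_pdf from the Assistants API to the Responses API (inline base64 PDF input), rewrite the extraction prompt, and drop debug output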

2026-05-27 09:21:06 -07:00
parent 127f1486be
commit 3d9f82f1ea
2 changed files with 48 additions and 41 deletions
--- a/code/main.py
+++ b/code/main.py
@@ -1,69 +1,76 @@
-
 #!/usr/bin/env python3
+import base64
 import json
 import os
-import pprint
 import urllib.request

 import openai

-openai.api_key = "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"
+client = openai.OpenAI(api_key="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")


 def slurp_file(filename):
    with open(filename, 'r') as file:
-        data = file.read()
-    return data
+        return file.read()


-BASE_PROMPT="""You extract invoice details from pdfs. Some pdfs are invoices, some are credits, and some are statements that may contain statements or credits. Numbers in parenthesis typically indicate credits. Always follow this json schema. Do not respond with anything except the raw json response. Do not respond in code blocks(```). If you don't find any invoices, make sure to fill out the explanation field at least.
-```
-{}
-```
-""".format(slurp_file(os.path.join(os.path.dirname(__file__), 'schema.json')))
+BASE_PROMPT="""You are an invoice extraction assistant. Your job is to read PDF documents and extract all invoice and credit note details into structured JSON.

+DOCUMENT TYPES YOU MAY ENCOUNTER:
+1. **Single Invoice** — one invoice with line items, totals, dates, etc.
+2. **Credit Note** — similar to an invoice but represents a credit/refund. Extract it the same way; the total will be positive (the credit amount).
+3. **Statement / Summary** — a document listing multiple invoices or credits in a table or list format. Each row or entry represents a separate invoice/credit. Extract EACH one as a separate object in the output array.
+4. **Mixed** — a document containing both invoices and credits.

-client = openai.OpenAI(api_key="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")
-client.api_key = "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"
+EXTRACTION RULES:
+- Extract EVERY invoice or credit you can find. If the document is a statement listing 10 invoices, return all 10.
+- **customer_identifier**: The name of the customer/buyer. Look for "Bill To", "Customer", "Sold To", or the company name at the top.
+- **vendor_identifier**: The name of the vendor/seller. Look for "From", "Vendor", "Supplier", letterhead, or the company issuing the document.
+- **date**: The invoice date in ISO 8601 format (YYYY-MM-DD). If multiple dates exist, use the invoice date, not the due date or statement period.
+- **invoice_number**: The unique invoice or credit note number. Look for labels like "Invoice #", "Inv No", "Credit Note #", "Reference", "Doc #".
+- **account_number**: The customer's account number if present. Not required — omit if not found.
+- **total**: The total amount as a decimal string (e.g., "1234.56"). Use the grand total or amount due. For credits, use the credit amount as a positive number. Numbers in parentheses indicate credits — extract them as positive values.
+- **explanation**: Only use this when you cannot find any valid invoices. Provide a detailed reason (e.g., "document is blank", "PDF contains only images with no extractable text", "document is a cover letter with no invoice data").
+
+IMPORTANT:
+- Do NOT skip entries because some fields are missing. Extract what you can.
+- For statements/summaries, each row in an invoice table is a separate invoice entry.
+- If OCR fails completely and no text can be extracted at all, return an array with one object containing only the explanation field.
+- Your FINAL response to the user must be ONLY a JSON array. Do NOT wrap it in markdown code blocks. Do NOT add any prose before or after the JSON."""


 def analyze_pdf(pdf_path):
-    assistant = client.beta.assistants.create(
-        name="pdf-reader",
-        instructions=BASE_PROMPT,
-        model="gpt-4o",
-        tools=[{"type": "file_search"}],
-    )
-
    with open(pdf_path, 'rb') as f:
-        message_file = client.files.create(file=f, purpose="assistants")
+        pdf_data = f.read()

-    thread = client.beta.threads.create(
-        messages=[
+    base64_string = base64.b64encode(pdf_data).decode("utf-8")
+
+    response = client.responses.create(
+        model="gpt-4o",
+        instructions=BASE_PROMPT,
+        input=[
            {
                "role": "user",
-                "content": "extract the invoice(s) and/or credit(s) details from this invoice or statement",
-                "attachments": [
-                    {"file_id": message_file.id, "tools": [{"type": "file_search"}]}
+                "content": [
+                    {
+                        "type": "input_file",
+                        "filename": os.path.basename(pdf_path),
+                        "file_data": f"data:application/pdf;base64,{base64_string}",
+                    },
+                    {
+                        "type": "input_text",
+                        "text": "extract the invoice(s) and/or credit(s) details from this document.",
+                    },
                ],
            }
-        ]
+        ],
    )
-    print(thread.id)
-
-    run = client.beta.threads.runs.create_and_poll(
-        thread_id=thread.id, assistant_id=assistant.id
-    )
-
-    messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))
-    print("MESSAGES")
-    pprint.pprint(messages)
-    print("\n\n")
-    print("status", run.status)
-    print("\n\n")
-    print("full run")
-    pprint.pprint(run)
-    return json.loads(messages[0].content[0].text.value)
+    text = response.output_text
+    import re
+    match = re.search(r'```(?:json)?\s*\n(.*?)\n```', text, re.DOTALL)
+    if match:
+        text = match.group(1)
+    return text


 def analyze_url(url):