81 lines
2.4 KiB
Python
Executable File
81 lines
2.4 KiB
Python
Executable File
|
|
#!/usr/bin/env python3
|
|
import json
|
|
import os
|
|
import pprint
|
|
import urllib.request
|
|
|
|
import openai
|
|
|
|
# Read the API key from the environment rather than embedding it in the source.
# NOTE(review): the key that was previously committed on this line should be
# treated as compromised and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
|
|
|
|
|
|
def slurp_file(filename):
    """Return the entire contents of the text file *filename* as a string.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the locale of the machine the code runs on.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()
|
|
|
|
|
|
# Contents of schema.json, which is expected to sit next to this module.
_SCHEMA_TEXT = slurp_file(os.path.join(os.path.dirname(__file__), 'schema.json'))

# Instructions given to the assistant. The schema text is substituted for the
# single `{}` placeholder between the code fences.
BASE_PROMPT = """You extract invoice details from pdfs. Some pdfs are invoices, some are credits, and some are statements that may contain statements or credits. Numbers in parenthesis typically indicate credits. Always follow this json schema. Do not respond with anything except the raw json response. Do not respond in code blocks(```). If you don't find any invoices, make sure to fill out the explanation field at least.
```
{}
```
""".format(_SCHEMA_TEXT)
|
|
|
|
|
|
# Shared OpenAI client used by analyze_pdf. The API key comes from the
# environment; it must never be hard-coded in the source.
# NOTE(review): the key that was previously committed here should be revoked.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
|
|
|
|
|
|
def _delete_quietly(delete, resource):
    """Best-effort deletion of a remote resource; API errors are printed, not raised."""
    if resource is None:
        return
    try:
        delete(resource.id)
    except openai.OpenAIError as exc:
        print("cleanup failed", exc)


def analyze_pdf(pdf_path):
    """Extract invoice/credit details from the PDF at *pdf_path*.

    Creates an assistant configured with BASE_PROMPT and the file_search tool,
    uploads the PDF, runs a thread asking for extraction, and parses the first
    returned message as JSON.

    Args:
        pdf_path: Path to a local PDF file.

    Returns:
        The object decoded from the assistant's JSON reply.

    Raises:
        RuntimeError: if the run does not complete or yields no message.
        json.JSONDecodeError: if the reply is not valid JSON.
    """
    assistant = client.beta.assistants.create(
        name="pdf-reader",
        instructions=BASE_PROMPT,
        model="gpt-4o",
        tools=[{"type": "file_search"}],
    )
    message_file = None
    thread = None
    try:
        with open(pdf_path, 'rb') as f:
            message_file = client.files.create(file=f, purpose="assistants")

        thread = client.beta.threads.create(
            messages=[
                {
                    "role": "user",
                    "content": "extract the invoice(s) and/or credit(s) details from this invoice or statement",
                    "attachments": [
                        {"file_id": message_file.id, "tools": [{"type": "file_search"}]}
                    ],
                }
            ]
        )
        print(thread.id)

        run = client.beta.threads.runs.create_and_poll(
            thread_id=thread.id, assistant_id=assistant.id
        )

        messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))
        print("MESSAGES")
        pprint.pprint(messages)
        print("\n\n")
        print("status", run.status)
        print("\n\n")
        print("full run")
        pprint.pprint(run)

        # A run that failed, expired or was cancelled produces no usable reply;
        # report that explicitly instead of failing on messages[0] below.
        if run.status != "completed" or not messages:
            raise RuntimeError(
                f"assistant run {run.id} ended with status {run.status!r} "
                f"and {len(messages)} message(s)"
            )

        return json.loads(messages[0].content[0].text.value)
    finally:
        # Everything above is created per call; remove it so assistants,
        # files and threads do not accumulate on the account.
        _delete_quietly(client.beta.threads.delete, thread)
        _delete_quietly(client.files.delete, message_file)
        _delete_quietly(client.beta.assistants.delete, assistant)
|
|
|
|
|
|
def analyze_url(url):
    """Download the PDF at *url* and return the result of ``analyze_pdf`` on it.

    The download is stored in a private temporary directory that is removed
    when the analysis finishes, instead of a fixed shared path in /tmp.

    Args:
        url: An http or https URL pointing at the PDF.

    Returns:
        Whatever ``analyze_pdf`` returns for the downloaded file.

    Raises:
        ValueError: if *url* does not use the http or https scheme.
    """
    import tempfile
    from urllib.parse import urlsplit

    # urlopen also accepts schemes such as file:// and ftp://; the URL comes
    # from the incoming event, so only allow plain web downloads.
    scheme = urlsplit(url).scheme
    if scheme not in ("http", "https"):
        raise ValueError(f"unsupported URL scheme: {scheme!r}")

    with urllib.request.urlopen(url) as response:
        data = response.read()

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Keep the original base name, including the .pdf extension.
        # NOTE(review): presumably the upload derives its file name from this
        # path. Confirm before renaming.
        pdf_path = os.path.join(tmp_dir, "test.pdf")
        with open(pdf_path, "wb") as f:
            f.write(data)
        return analyze_pdf(pdf_path)
|
|
|
|
|
|
def handler(event, context):
    """Entry point: analyze the PDF referenced by ``event['url']``.

    Prints the incoming event and the URL, then returns the result of
    ``analyze_url`` for that URL. ``context`` is not used.
    """
    print(event)
    pdf_url = event['url']
    print("URL IS", pdf_url)
    result = analyze_url(pdf_url)
    return result