Begins the process of AI-powered invoices

This commit is contained in:
Bryce
2023-08-02 22:34:56 -07:00
parent 7ed6a11ffd
commit 460e2077b9
9 changed files with 599 additions and 88 deletions

View File

@@ -0,0 +1,12 @@
;; Wrapper namespace for the AWS Textract Java client, following the
;; amazonica.aws.* naming convention.
(ns amazonica.aws.textract
(:require [amazonica.core :as amz])
(:import [com.amazonaws.services.textract AmazonTextractClient ]))
;; Every form below that is prefixed with the `#_` reader macro is discarded by
;; the reader, so these imports are currently disabled. AmazonTextractClient is
;; already imported by the ns form above.
#_
(import '[com.amazonaws.services.textract AmazonTextractClient ])
#_(import '[com.amazonaws.services.textract.model S3Object ])
#_(import '[com.amazonaws.services.textract.model StartExpenseAnalysisRequest ])
#_(import '[com.amazonaws.services.textract.model GetExpenseAnalysisRequest ])
#_(import '[com.amazonaws.services.textract.model DocumentLocation])
;; Hands the Textract client class and this namespace to amazonica.core/set-client.
;; NOTE(review): presumably this is what makes functions such as
;; `start-expense-analysis` / `get-expense-analysis` available on this
;; namespace -- confirm against amazonica.core.
(amz/set-client AmazonTextractClient *ns*)

View File

@@ -207,5 +207,5 @@
"EZCater XLS Import"])
(page*))
"EZCater upload")))
"Invoice Glimpse")))

View File

@@ -11,6 +11,7 @@
[auto-ap.ssr.search :as search]
[auto-ap.ssr.company-dropdown :as company-dropdown]
[auto-ap.ssr.company.reports :as company-reports]
[auto-ap.ssr.invoice.glimpse :as invoice-glimpse]
[auto-ap.routes.ezcater-xls :as ezcater-xls]
[auto-ap.ssr.company :as company]))
@@ -39,6 +40,9 @@
:company-reports (wrap-client-redirect-unauthenticated (wrap-secure company-reports/page))
:company-reports-table (wrap-client-redirect-unauthenticated (wrap-secure company-reports/table))
:company-reports-delete (wrap-client-redirect-unauthenticated (wrap-admin company-reports/delete-report))
:invoice-glimpse (wrap-client-redirect-unauthenticated (wrap-admin invoice-glimpse/page))
:invoice-glimpse-upload (wrap-client-redirect-unauthenticated (wrap-admin invoice-glimpse/upload))
:invoice-glimpse-job (wrap-client-redirect-unauthenticated (wrap-admin invoice-glimpse/job-progress))
:transaction-insights (wrap-client-redirect-unauthenticated (wrap-admin insights/page))
:transaction-insight-table (wrap-client-redirect-unauthenticated (wrap-admin insights/insight-table))
:transaction-insight-rows (wrap-client-redirect-unauthenticated (wrap-admin insights/transaction-rows))

View File

@@ -0,0 +1,295 @@
(ns auto-ap.ssr.invoice.glimpse
(:require
[amazonica.aws.s3 :as s3]
[amazonica.aws.textract :as textract]
[auto-ap.datomic :refer [conn pull-attr pull-id]]
[auto-ap.datomic.clients :as d-clients]
[auto-ap.logging :as alog]
[auto-ap.solr :as solr]
[auto-ap.ssr-routes :as ssr-routes]
[auto-ap.ssr.components :as com]
[auto-ap.ssr.ui :refer [base-page]]
[auto-ap.ssr.utils :refer [html-response path->name]]
[auto-ap.time :as atime]
[bidi.bidi :as bidi]
[cemerick.url :as url]
[clojure.java.io :as io]
[clojure.string :as str]
[com.brunobonacci.mulog :as mu]
[config.core :refer [env]]
[datomic.api :as dc]
[hiccup2.core :as hiccup]
[iol-ion.tx :refer [random-tempid]])
(:import
(java.util UUID)))
(def bucket-name
  "Value of the :data-bucket key in the application config, read once when this
  namespace is loaded."
  (get env :data-bucket))
(defn lookup
  "Flattens an expense-analysis result into one lazy sequence of maps.

  The sequence holds the entries of (:expense-documents tx) themselves,
  followed by every entry of their :summary-fields. For each map, the
  :geometry key is removed from its :label-detection and :value-detection
  values; a map lacking one of those keys ends up with that key set to nil."
  [tx]
  (let [documents      (:expense-documents tx)
        summary-fields (mapcat :summary-fields documents)
        strip-geometry (fn [entry]
                         (-> entry
                             (update :label-detection dissoc :geometry)
                             (update :value-detection dissoc :geometry)))]
    ;; NOTE(review): the documents are prepended as well as their summary
    ;; fields; they carry no :type here, so type-based filters skip them.
    (map strip-geometry (concat documents summary-fields))))
(defn find-best
  "Picks the most trustworthy value among `field-descriptors`.

  Each descriptor is scored as the product of its :type confidence and its
  :value-detection confidence. Returns a map with:
    :raw  - the descriptors exactly as passed in
    :best - the :value-detection :text of the highest-scoring descriptor
            (the last one when scores tie), or nil when there are none."
  [field-descriptors]
  (let [score  (fn [descriptor]
                 (* (get-in descriptor [:type :confidence])
                    (get-in descriptor [:value-detection :confidence])))
        winner (last (sort-by score field-descriptors))]
    {:raw  field-descriptors
     :best (get-in winner [:value-detection :text])}))
(defn textract->coalesced
  "Reduces an expense-analysis result to one best guess per invoice attribute.

  The fields produced by `lookup` are grouped by the text of their :type and
  each group is passed to `find-best`, so every value in the returned map has
  the {:raw ... :best ...} shape that `find-best` produces."
  [tx]
  (let [fields       (lookup tx)
        best-of-type (fn [type-text]
                       (find-best
                        (filter #(= type-text (:text (:type %))) fields)))]
    {:total               (best-of-type "TOTAL")
     :account-number      (best-of-type "CUSTOMER_NUMBER")
     :customer-identifier (best-of-type "RECEIVER_NAME")
     :vendor-name         (best-of-type "VENDOR_NAME")
     :date                (best-of-type "ORDER_DATE")
     :invoice-number      (best-of-type "INVOICE_RECEIPT_ID")}))
(defn clean-customer
  "Replaces every run of non-word characters in the string `c` with a single
  space."
  [c]
  (str/replace c #"\W+" " "))
(defn- parse-total
  "Parses the amount out of the text detected for the invoice total.

  Commas are removed before the first run of digits, dots and dashes is
  extracted, so \"$1,234.56\" yields 1234.56 instead of stopping at the comma.
  Returns nil when `text` is nil, contains no such run, or the run is not a
  valid number (for example \"-\" or \"1.2.3\")."
  [text]
  (when text
    (when-let [candidate (re-find #"[0-9.\-]+" (str/replace text "," ""))]
      (try
        (Double/parseDouble candidate)
        (catch NumberFormatException _
          nil)))))

(defn coalesced->invoice
  "Turns the coalesced inference `i` (the map built by `textract->coalesced`)
  into a candidate invoice map.

  Resolution steps:
    - vendor: Solr query on the \"vendors\" index by the best vendor name,
      keeping the first hit whose score is above 4.0
    - client: exact match on the best account number, falling back to the
      first hit of a Solr query on the \"clients\" index by the cleaned
      customer identifier (no score threshold is applied to that fallback)
    - location: the first of the resolved client's :client/locations
    - total: numeric amount parsed from the best total text
    - date: the best date text parsed as MM/dd/yyyy, then as MM/dd/yy

  A warning is logged when the vendor or the client cannot be resolved.
  Returns a map of :invoice/* attributes with a fresh temp id, or nil unless
  client, date, invoice number, vendor and total were all resolved."
  [i]
  (mu/with-context {:inference i}
    (let [vendor-name         (:best (:vendor-name i))
          ;; Queried once and reused for the warning below; skipped entirely
          ;; when no vendor name was detected, instead of searching for "null".
          vendor-results      (when (not-empty vendor-name)
                                (solr/query solr/impl "vendors"
                                            {"query"  (format "name:(%s) " vendor-name)
                                             "fields" "score, *"}))
          vendor-id           (->> vendor-results
                                   (filter (fn [d] (> (:score d) 4.0)))
                                   (map (comp #(Long/parseLong %) :id))
                                   first)
          account-number      (:best (:account-number i))
          customer-identifier (:best (:customer-identifier i))
          client-id           (or
                               (when (not-empty account-number)
                                 (:db/id (d-clients/exact-match account-number)))
                               (when customer-identifier
                                 (->> (solr/query solr/impl "clients"
                                                  {"query"  (format "name:(%s) " (clean-customer customer-identifier))
                                                   "fields" "score, *"})
                                      (map (comp #(Long/parseLong %) :id))
                                      first)))
          location            (when client-id
                                (->> (dc/pull (dc/db conn) '[:client/locations] client-id)
                                     :client/locations
                                     first))
          invoice-number      (:best (:invoice-number i))
          ;; nil (rather than an exception) when no usable total was detected;
          ;; the final guard below then yields no candidate invoice.
          total               (parse-total (:best (:total i)))
          date-text           (:best (:date i))
          ;; Only attempt parsing when a date was actually detected.
          date                (when date-text
                                (or (atime/parse date-text "MM/dd/yyyy")
                                    (atime/parse date-text "MM/dd/yy")))]
      (when-not vendor-id
        (alog/warn ::cant-find-vendor
                   :search-results vendor-results
                   :vendor-name (:vendor-name i)))
      (when-not client-id
        (alog/warn ::cant-find-customer))
      (when (and client-id date invoice-number vendor-id total)
        {:db/id                       (random-tempid)
         :invoice/client              client-id
         :invoice/client-identifier   (or account-number customer-identifier)
         :invoice/vendor              vendor-id
         :invoice/invoice-number      invoice-number
         :invoice/total               total
         :invoice/date                date
         :invoice/location            location
         :invoice/import-status       :import-status/pending
         :invoice/outstanding-balance total
         :invoice/status              :invoice-status/unpaid}))))
(defn upload-form*
  "Hiccup for the invoice drop area: a form that POSTs to the
  :invoice-glimpse-upload route, plus an inline script that attaches a
  Dropzone to it and navigates to the response URL after a successful upload."
  []
  (let [upload-path (bidi/path-for ssr-routes/only-routes
                                   :invoice-glimpse-upload)
        form-attrs  {:action upload-path
                     :method "POST"
                     :id     "invoice"}
        dropzone-js (hiccup/raw
                     "
invoice_dropzone = new Dropzone(\"#invoice\", {
success: function(file, response) {
window.location.href = file.xhr.responseURL;
},
disablePreviews: true
}); ")]
    [:div
     [:form.bg-blue-100.border-2.border-dashed.rounded-lg.border-blue-300.p-4.max-w-md.w-md.text-center.cursor-pointer
      form-attrs
      "Drop an invoice here"]
     [:script dropzone-js]]))
(defn refresh-job
  "Synchronises the stored status of Textract job `job-id` and returns the
  full stored entity.

  When the stored :textract-invoice/textract-status is \"IN_PROGRESS\", the
  job's current :job-status is fetched from Textract and written back before
  the entity is pulled again with a wildcard pattern."
  [job-id]
  (let [job-ref [:textract-invoice/job-id job-id]
        stored  (dc/pull (dc/db conn)
                         '[:db/id :textract-invoice/textract-status]
                         job-ref)]
    (when (= "IN_PROGRESS" (:textract-invoice/textract-status stored))
      (let [latest-status (:job-status (textract/get-expense-analysis {:job-id job-id}))]
        @(dc/transact conn [{:db/id                            (:db/id stored)
                             :textract-invoice/textract-status latest-status}])))
    (dc/pull (dc/db conn) '[*] job-ref)))
(defn textract->invoice-form*
  "Hiccup form showing the invoice inferred from Textract job `job-id`.

  The expense analysis is fetched, coalesced and converted to a candidate
  invoice. Client, vendor, date, total and invoice number are rendered as
  disabled text inputs; under the date and the total, every distinct detected
  text for that field is listed as a comma-separated row of links."
  [job-id]
  (let [coalesced         (textract->coalesced
                           (textract/get-expense-analysis {:job-id job-id}))
        candidate-invoice (coalesced->invoice coalesced)
        ;; One labelled, disabled input; the label doubles as the placeholder.
        locked-field      (fn [label attr value]
                            (com/field {:label label}
                                       (com/text-input {:name        (path->name [attr])
                                                        :value       value
                                                        :placeholder label
                                                        :disabled    true
                                                        :autofocus   true})))
        ;; Distinct detected texts for one coalesced field, as links joined by ", ".
        alternates        (fn [field-key]
                            (let [texts (set (map (comp :text :value-detection)
                                                  (:raw (get coalesced field-key))))
                                  links (map (fn [x] (com/link {:href "#"} (pr-str x)))
                                             texts)]
                              (butlast (interleave links (repeat ", ")))))]
    [:form
     [:div.grid.grid-cols-6.gap-4
      [:div.col-span-6
       (locked-field "Client"
                     :invoice/client
                     (pull-attr (dc/db conn) :client/name (:invoice/client candidate-invoice)))]
      [:div.col-span-6
       (locked-field "Vendor"
                     :invoice/vendor
                     (pull-attr (dc/db conn) :vendor/name (:invoice/vendor candidate-invoice)))]
      [:div.col-span-3
       (locked-field "Date"
                     :invoice/date
                     (atime/unparse-local (:invoice/date candidate-invoice)
                                          atime/normal-date))]
      [:div.col-span-3.col-start-1.text-xs
       "Alternates: "
       (alternates :date)]
      [:div.col-span-2.col-start-1
       (locked-field "Total"
                     :invoice/total
                     (:invoice/total candidate-invoice))]
      [:div.col-span-3.col-start-1.text-xs
       "Alternates: "
       (alternates :total)]
      [:div.col-span-2.col-start-1
       (locked-field "Invoice Number"
                     :invoice/invoice-number
                     (:invoice/invoice-number candidate-invoice))]]]))
(defn job-progress*
  "Hiccup describing the state of Textract job `job-id`.

  Returns nil when no stored entity has that job id. Otherwise the job is
  refreshed and:
    - \"IN_PROGRESS\" renders a placeholder that re-requests the
      :invoice-glimpse-job route 5 seconds after load and swaps itself out
    - \"SUCCEEDED\" renders a \"New import\" button, the stored PDF in an
      iframe and the inferred invoice form
    - any other status renders nothing."
  [job-id]
  (when (pull-id (dc/db conn) [:textract-invoice/job-id job-id])
    (let [textract-invoice (refresh-job job-id)
          status           (:textract-invoice/textract-status textract-invoice)]
      (case status
        "IN_PROGRESS"
        (let [poll-url (str (bidi/path-for ssr-routes/only-routes
                                           :invoice-glimpse-job)
                            "?" (url/map->query {:job-id job-id}))]
          [:div.bg-blue-100.border-2.border-dashed.rounded-lg.border-blue-300.p-4.max-w-md.w-md.text-center.cursor-pointer
           {:hx-get     poll-url
            :hx-trigger "load delay:5s"
            :hx-swap    "outerHTML"}
           ;; Only the first 8 characters of the job id are shown.
           "Analyzing job " (subs (:textract-invoice/job-id textract-invoice) 0 8) "..."])

        "SUCCEEDED"
        [:div.px-4
         [:a.mb-2 {:href (bidi/path-for ssr-routes/only-routes
                                        :invoice-glimpse)}
          (com/button {:color :secondary} "New import")]
         [:div.flex.flex-row.space-x-4
          [:div {:style {:width "805"}}
           (com/card {}
                     [:iframe.p-4 {:src    (:textract-invoice/pdf-url textract-invoice)
                                   :width  791
                                   :height 1024}])]
          [:div {:class "basis-1/4"}
           (com/card {}
                     [:div.p-4
                      (textract->invoice-form* job-id)])]]]

        ;; Any other status: nothing to show.
        nil))))
(defn job-progress
  "Handler for the job-progress fragment: reads the \"job-id\" query parameter
  from `request` and returns the `job-progress*` markup as an HTML response."
  [request]
  (let [job-id (-> request :query-params (get "job-id"))]
    (html-response (job-progress* job-id))))
(defn page*
  "Hiccup for the body of the Invoice Glimpse page.

  Shows the heading, tagline and beta notice, followed by the progress/result
  view for `job-id` when one is given, or the upload form when it is nil."
  [job-id]
  [:div.mt-4
   (com/card {}
             [:div.px-4.py-3.space-y-4.flex.flex-col
              [:h1.text-2xl.mb-3.font-bold "Invoice Glimpse"]
              [:p.text-sm.italic "Import your invoices with the power of AI."]
              [:div.flex.flex-row.space-x-4 (com/pill {:color :primary} "Beta")
               [:span "Note: This upload is experimental. Please only use PDFs with a single invoice in them."]]
              ;; Exactly one of the two views is rendered.
              (if job-id
                (job-progress* job-id)
                (upload-form*))])])
(defn begin-textract-file
  "Starts a Textract expense analysis for the object stored under
  `s3-location` in the configured data bucket, and records the job.

  The transacted (and returned) map holds the Textract job id, an initial
  status of \"IN_PROGRESS\" and a URL for the uploaded file."
  [s3-location]
  (let [document   {:s3-object {:bucket (:data-bucket env)
                                :name   s3-location}}
        started    (textract/start-expense-analysis {:document-location document})
        pdf-url    (str "http://" bucket-name ".s3-website-us-east-1.amazonaws.com/" s3-location)
        job-record {:textract-invoice/job-id          (:job-id started)
                    :textract-invoice/textract-status "IN_PROGRESS"
                    :textract-invoice/pdf-url         pdf-url}]
    @(dc/transact conn [job-record])
    job-record))
(defn upload
  "Handler for an invoice upload.

  Reads the uploaded file from the request params (under :file or \"file\"),
  stores it in the data bucket under textract-files/<uuid>.<extension>, starts
  a Textract expense analysis for it and redirects (302) to the Invoice
  Glimpse page with the new job id in the query string.

  Any failure, including a request with no uploaded file, is logged and
  answered with an HTML response containing the exception message."
  [request]
  (let [file (or (get (:params request) :file)
                 (get (:params request) "file"))]
    (mu/log ::uploading-file
            :file file)
    (try
      ;; Fail inside the try so a missing upload goes through the same
      ;; logging and error response as every other failure.
      (when-not (:tempfile file)
        (throw (ex-info "No file was uploaded" {})))
      (let [;; The extension is whatever follows the last dot or backslash.
            s3-location      (str "textract-files/" (UUID/randomUUID) "." (last (str/split (:filename file) #"[\\.]")))
            ;; The stream is opened only for the duration of the S3 upload.
            _                (with-open [stream (io/input-stream (:tempfile file))]
                               (s3/put-object (:data-bucket env)
                                              s3-location
                                              stream
                                              {:content-type   "application/pdf"
                                               :content-length (.length (:tempfile file))}))
            textract-invoice (begin-textract-file s3-location)]
        {:headers {"Location"
                   (str (bidi/path-for ssr-routes/only-routes
                                       :invoice-glimpse)
                        "?" (url/map->query {:job-id (:textract-invoice/job-id textract-invoice)}))}
         :status  302})
      (catch Exception e
        (alog/error ::cant-begin-textract
                    :error e)
        (html-response [:div (.getMessage e)])))))
(defn page
  "Handler for the full Invoice Glimpse page.

  Logs the request method, then wraps the `page*` body (for the \"job-id\"
  query parameter, if any) in the admin layout with breadcrumbs. The page is
  set up to re-fetch itself when a \"clientSelected\" event fires on the body."
  [request]
  (mu/log ::method
          :method (:request-method request))
  (let [glimpse-path (bidi/path-for ssr-routes/only-routes
                                    :invoice-glimpse)
        admin-path   (bidi/path-for ssr-routes/only-routes
                                    :admin)
        job-id       (get (:query-params request) "job-id")]
    (base-page
     request
     (com/page {:nav           (com/admin-aside-nav)
                :active-client (:client (:session request))
                :identity      (:identity request)
                :app-params    {:hx-get     glimpse-path
                                :hx-trigger "clientSelected from:body"
                                :hx-select  "#app-contents"
                                :hx-swap    "outerHTML swap:300ms"}}
               (com/breadcrumbs {}
                                [:a {:href admin-path}
                                 "Invoice"]
                                [:a {:href glimpse-path}
                                 "Glimpse"])
               (page* job-id))
     "Invoice Glimpse")))

View File

@@ -2,6 +2,9 @@
(def routes {"logout" :logout
"search" :search
"invoice" {"/glimpse" {"" {:get :invoice-glimpse
:post :invoice-glimpse-upload}
"/job" {:get :invoice-glimpse-job}}}
"admin" {"/history" {"" :admin-history
"/" :admin-history
#"/search/?" :admin-history-search