Begins the process of AI-powered invoices
This commit is contained in:
162
scratch-sessions/textract.repl
Normal file
162
scratch-sessions/textract.repl
Normal file
@@ -0,0 +1,162 @@
|
||||
|
||||
(ns amazonica.aws.textract
|
||||
(:require
|
||||
[auto-ap.solr :as solr]
|
||||
[unilog.context :as lc]))
|
||||
(require '[amazonica.core :as amz])
|
||||
(import '[com.amazonaws.services.textract AmazonTextractClient ])
|
||||
|
||||
(import '[com.amazonaws.services.textract AmazonTextractClient ])
|
||||
(import '[com.amazonaws.services.textract.model S3Object ])
|
||||
(import '[com.amazonaws.services.textract.model StartExpenseAnalysisRequest ])
|
||||
(import '[com.amazonaws.services.textract.model GetExpenseAnalysisRequest ])
|
||||
|
||||
(import '[com.amazonaws.services.textract.model DocumentLocation])
|
||||
;; Registers AmazonTextractClient with amazonica for the amazonica.aws.textract namespace.
;; Later code calls start-expense-analysis / get-expense-analysis through the `txtract` alias.
;; NOTE(review): presumably set-client is what interns those wrapper fns into this ns — confirm
;; against the amazonica version in use.
(amz/set-client AmazonTextractClient *ns*)
|
||||
|
||||
(in-ns 'user)
|
||||
(require '[clojure.java.io :as io])
|
||||
(require '[cheshire.core :as cheshire])
|
||||
(require '[amazonica.aws.s3 :as s3])
|
||||
(require '[auto-ap.graphql.utils :refer [cleanse-query]])
|
||||
(require '[iol-ion.tx :as itx])
|
||||
|
||||
(require '[config.core :refer [env]])
|
||||
(require '[amazonica.aws.textract :as txtract])
|
||||
|
||||
|
||||
(import '[com.amazonaws.services.textract AmazonTextractClient ])
|
||||
(import '[com.amazonaws.services.textract.model S3Object ])
|
||||
(import '[com.amazonaws.services.textract.model StartExpenseAnalysisRequest ])
|
||||
(import '[com.amazonaws.services.textract.model GetExpenseAnalysisRequest ])
|
||||
|
||||
(import '[com.amazonaws.services.textract.model DocumentLocation])
|
||||
(import '[java.util UUID])
|
||||
|
||||
|
||||
|
||||
(defn textract-file
  "Starts a Textract expense analysis for the object stored under key `s3-location`
  in the bucket named by (:data-bucket env), then polls until the job is no longer
  \"IN_PROGRESS\".

  Returns the last get-expense-analysis response, whatever its :job-status is; callers
  that care about failures must inspect :job-status themselves.

  Blocks the calling thread, sleeping 2 seconds between polls. The sleep only happens
  while the job is still running, so a finished job is returned immediately.

  NOTE(review): only the first response is returned; if the service paginates results
  (e.g. via a next-token), later pages are not fetched — confirm for large documents."
  [s3-location]
  (let [request {:document-location {:s3-object {:bucket (:data-bucket env)
                                                 :name s3-location}}}
        job-id (:job-id (txtract/start-expense-analysis request))]
    (loop [result (txtract/get-expense-analysis {:job-id job-id})]
      (println "checking..." (:job-status result))
      (if (= "IN_PROGRESS" (:job-status result))
        (do
          ;; Wait before asking again; skipped once the job has left IN_PROGRESS.
          (Thread/sleep 2000)
          (recur (txtract/get-expense-analysis {:job-id job-id})))
        result))))
|
||||
|
||||
(defn lookup
  "Flattens an expense-analysis result `tx` into a single lazy sequence of maps.

  The sequence contains, in order:
    1. every map under (:expense-documents tx), then
    2. every map under the :summary-fields of those documents.

  Each emitted map has :geometry removed from its :label-detection and
  :value-detection values (a map lacking those keys gains them with nil values)."
  [tx]
  (let [documents      (:expense-documents tx)
        summary-fields (mapcat :summary-fields documents)
        strip-geometry (fn [entry]
                         (reduce (fn [acc k] (update acc k dissoc :geometry))
                                 entry
                                 [:label-detection :value-detection]))]
    ;; NOTE(review): the document maps themselves are emitted ahead of the summary
    ;; fields. This looks like an unfinished thread (line items, perhaps?) — confirm
    ;; whether the documents are meant to be part of the output.
    (map strip-geometry (concat documents summary-fields))))
|
||||
|
||||
(defn find-best
  "Picks the most trustworthy candidate out of `field-descriptors`.

  Each descriptor is scored as (type confidence) * (value-detection confidence);
  a missing confidence counts as 0 so incomplete descriptors rank last instead of
  throwing during the sort.

  Returns a map with:
    :raw  - the candidates exactly as passed in
    :best - the :text of the highest-scoring candidate's :value-detection,
            or nil when there are no candidates."
  [field-descriptors]
  (let [confidence-score (fn [descriptor]
                           (* (or (-> descriptor :type :confidence) 0)
                              (or (-> descriptor :value-detection :confidence) 0)))]
    {:raw field-descriptors
     :best (->> field-descriptors
                ;; sort-by is stable and ascending, so `last` is the top score
                ;; (and the later candidate on ties).
                (sort-by confidence-score)
                last
                :value-detection
                :text)}))
|
||||
|
||||
(require '[auto-ap.solr :as solr])
|
||||
(require '[auto-ap.logging :as alog])
|
||||
|
||||
(require '[com.brunobonacci.mulog :as mu])
|
||||
(require '[auto-ap.datomic.clients :as d-clients])
|
||||
(require '[auto-ap.time :as atime])
|
||||
|
||||
(defn textract->coalesced
  "Reduces an expense-analysis result `tx` to one find-best result per invoice
  attribute of interest.

  The entries produced by `lookup` are filtered by the :text of their :type and
  each group is handed to `find-best`. The returned map has the keys :total,
  :account-number, :customer-identifier, :vendor-name, :date and :invoice-number."
  [tx]
  (let [entries      (lookup tx)
        best-of-type (fn [type-text]
                       (find-best (filter #(= type-text (-> % :type :text)) entries)))]
    {:total               (best-of-type "TOTAL")
     :account-number      (best-of-type "CUSTOMER_NUMBER")
     :customer-identifier (best-of-type "RECEIVER_NAME")
     :vendor-name         (best-of-type "VENDOR_NAME")
     :date                (best-of-type "ORDER_DATE")
     :invoice-number      (best-of-type "INVOICE_RECEIPT_ID")}))
|
||||
|
||||
(defn clean-customer
  "Returns `c` with every run of non-word characters (regex \\W+) collapsed into a
  single space. Leading/trailing runs become a space too; the result is not trimmed."
  [c]
  (let [non-word-run #"\W+"
        separator    " "]
    (clojure.string/replace c non-word-run separator)))
|
||||
|
||||
(require '[datomic.api :as dc])
|
||||
|
||||
(require '[auto-ap.datomic :refer [conn]])
|
||||
|
||||
(defn- parse-invoice-total
  "Extracts a numeric amount from extracted total text, e.g. \"$1,234.56\" -> 1234.56.
  Thousands separators (commas) are dropped before parsing. Returns nil when `raw`
  is nil, contains no numeric run, or the run is not a parseable number."
  [raw]
  (when-let [numeric (some->> raw (re-find #"[0-9.,\-]+"))]
    (try
      (Double/parseDouble (clojure.string/replace numeric "," ""))
      (catch NumberFormatException _
        nil))))

(defn coalesced->invoice
  "Turns the coalesced extraction `i` (the map built by textract->coalesced) into an
  invoice entity map, or nil when it cannot be resolved.

  Resolution steps:
    - vendor: Solr search of \"vendors\" by the best vendor name, keeping the first
      hit scoring above 4.0. Skipped when no vendor name was extracted.
    - client: exact match on the best account number, otherwise the first Solr hit
      in \"clients\" for the cleaned customer identifier.
    - location: the first of the client's :client/locations.
    - total: numeric part of the best total text (see parse-invoice-total).
    - date: the best date text parsed as MM/dd/yyyy, falling back to MM/dd/yy.

  Logs a warning when the vendor or the client cannot be found. Returns nil unless
  client, date, invoice number, vendor and total are all present."
  [i]
  (mu/with-context {:inference i}
    (let [vendor-name (:best (:vendor-name i))
          ;; Queried once and reused for the warning below.
          ;; NOTE(review): the vendor name is interpolated into the query as-is, unlike
          ;; the customer identifier which goes through clean-customer — confirm whether
          ;; special characters in vendor names can break the query.
          vendor-results (when (not-empty vendor-name)
                           (solr/query solr/impl "vendors" {"query" (format "name:(%s) " vendor-name)
                                                            "fields" "score, *"}))
          vendor-id (->> vendor-results
                         (filter (fn [d] (> (:score d) 4.0)))
                         (map (comp #(Long/parseLong %) :id))
                         first)
          account-number (:best (:account-number i))
          customer-identifier (:best (:customer-identifier i))
          client-id (or
                     (when (not-empty account-number)
                       (:db/id (d-clients/exact-match account-number)))
                     (when (not-empty customer-identifier)
                       (->> (solr/query solr/impl "clients" {"query" (format "name:(%s) " (clean-customer customer-identifier))
                                                             "fields" "score, *"})
                            #_(filter (fn [d] (> (:score d) 4.0)))
                            (map (comp #(Long/parseLong %) :id))
                            first)))
          location (when client-id
                     (->> (dc/pull (dc/db conn) '[:client/locations] client-id)
                          :client/locations
                          first))
          invoice-number (:best (:invoice-number i))
          total (parse-invoice-total (:best (:total i)))
          ;; NOTE(review): relies on atime/parse returning nil (not throwing) when the
          ;; text does not match the first pattern — confirm.
          date (or (atime/parse (:best (:date i)) "MM/dd/yyyy")
                   (atime/parse (:best (:date i)) "MM/dd/yy"))]
      (when-not vendor-id
        (alog/warn ::cant-find-vendor
                   :search-results vendor-results
                   :vendor-name (:vendor-name i)))
      (when-not client-id
        (alog/warn ::cant-find-customer))
      (when (and client-id date invoice-number vendor-id total)
        {:db/id (itx/random-tempid)
         :invoice/client client-id
         :invoice/client-identifier (or account-number customer-identifier)
         :invoice/vendor vendor-id
         :invoice/invoice-number invoice-number
         :invoice/total total
         :invoice/date date
         :invoice/location location
         :invoice/import-status :import-status/pending
         :invoice/outstanding-balance total
         :invoice/status :invoice-status/unpaid}))))
|
||||
|
||||
(defn file->textract->invoice
  "Uploads the local file at path `f` to the data bucket, runs it through Textract
  and converts the result into an invoice entity map.

  The object is stored under \"textract-files/<random uuid>.<ext>\", where <ext> is
  whatever follows the last \".\" (or backslash) in `f`. Returns what
  coalesced->invoice returns: the invoice map, or nil when it could not be resolved.

  NOTE(review): the upload is always tagged \"application/pdf\" regardless of the
  file's extension — confirm whether non-PDF input is expected."
  [f]
  (let [;; Fully qualified: no `str` alias is declared in this file.
        extension   (last (clojure.string/split f #"[\\.]"))
        s3-location (str "textract-files/" (UUID/randomUUID) "." extension)
        file        (io/file f)]
    (with-open [stream (io/input-stream f)]
      (s3/put-object (:data-bucket env)
                     s3-location
                     stream
                     {:content-type "application/pdf"
                      :content-length (.length file)}))
    (-> (textract-file s3-location)
        (textract->coalesced)
        (coalesced->invoice))))
|
||||
|
||||
|
||||
|
||||
|
||||
#_(def result (with-open [x (io/reader "batch.json")]
|
||||
(json/parse-stream x)))
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user