;; Files
;; integreat/scratch-sessions/textract.repl
;; 2023-10-30 12:35:18 -07:00
;;
;; 162 lines
;; 6.8 KiB
;; Plaintext

;; Hand-builds an `amazonica.aws.textract` namespace for this session.
;; NOTE(review): presumably `amz/set-client` interns wrapper functions for
;; AmazonTextractClient into this namespace (code further down calls
;; `txtract/start-expense-analysis` and `txtract/get-expense-analysis` through
;; the `txtract` alias) — confirm against the amazonica documentation.
(ns amazonica.aws.textract
(:require
[auto-ap.solr :as solr]))
(require '[amazonica.core :as amz])
(import '[com.amazonaws.services.textract AmazonTextractClient ])
;; NOTE(review): exact duplicate of the import on the previous line.
(import '[com.amazonaws.services.textract AmazonTextractClient ])
(import '[com.amazonaws.services.textract.model S3Object ])
(import '[com.amazonaws.services.textract.model StartExpenseAnalysisRequest ])
(import '[com.amazonaws.services.textract.model GetExpenseAnalysisRequest ])
(import '[com.amazonaws.services.textract.model DocumentLocation])
(amz/set-client AmazonTextractClient *ns*)
;; Switch to the `user` namespace; every definition below lives there.
(in-ns 'user)
(require '[clojure.java.io :as io])
(require '[cheshire.core :as cheshire])
(require '[amazonica.aws.s3 :as s3])
(require '[auto-ap.graphql.utils :refer [cleanse-query]])
(require '[iol-ion.tx :as itx])
(require '[config.core :refer [env]])
(require '[amazonica.aws.textract :as txtract])
(import '[com.amazonaws.services.textract AmazonTextractClient ])
(import '[com.amazonaws.services.textract.model S3Object ])
(import '[com.amazonaws.services.textract.model StartExpenseAnalysisRequest ])
(import '[com.amazonaws.services.textract.model GetExpenseAnalysisRequest ])
(import '[com.amazonaws.services.textract.model DocumentLocation])
(import '[java.util UUID])
(defn textract-file
  "Starts a Textract expense analysis for the object named `s3-location` in
  the bucket configured under `:data-bucket` in `env`, then polls the job
  every two seconds until its status is no longer \"IN_PROGRESS\".

  Prints the job status on every poll. Returns the last
  `get-expense-analysis` response, whatever its final status is.

  NOTE(review): only the first response is returned; if the service pages
  large results, following pages are not fetched here — confirm whether that
  matters for the documents being processed."
  [s3-location]
  (let [job-id (:job-id (txtract/start-expense-analysis
                         {:document-location
                          {:s3-object {:bucket (:data-bucket env)
                                       :name s3-location}}}))]
    (loop [result (txtract/get-expense-analysis {:job-id job-id})]
      (println "checking..." (:job-status result))
      (if (= "IN_PROGRESS" (:job-status result))
        (do
          ;; Wait only when another poll is actually needed, so a finished
          ;; job is returned immediately instead of after a pointless delay.
          (Thread/sleep 2000)
          (recur (txtract/get-expense-analysis {:job-id job-id})))
        result))))
(defn lookup
  "Flattens a Textract response `tx` into one lazy sequence of nodes: the
  expense documents themselves, followed by every summary field of every
  document. Each node has `:geometry` removed from its `:label-detection`
  and `:value-detection` entries (a node lacking one of those keys ends up
  with that key mapped to nil)."
  [tx]
  (let [documents      (:expense-documents tx)
        summary-fields (mapcat :summary-fields documents)
        strip-geometry (fn [node]
                         (-> node
                             (update :label-detection dissoc :geometry)
                             (update :value-detection dissoc :geometry)))]
    ;; Grouping by [type, label] was tried here at one point and is disabled.
    (map strip-geometry (concat documents summary-fields))))
(defn find-best
  "Picks the most trustworthy value among `field-descriptors`.

  Each descriptor is scored by multiplying the confidence of its `:type` with
  the confidence of its `:value-detection`; a missing confidence counts as 0
  so an incomplete descriptor ranks last instead of raising a
  NullPointerException during the sort.

  Returns a map with:
    :raw  - the descriptors exactly as passed in
    :best - the `:text` of the highest-scoring descriptor's
            `:value-detection`, or nil when there are no descriptors."
  [field-descriptors]
  (let [score (fn [descriptor]
                (* (or (-> descriptor :type :confidence) 0)
                   (or (-> descriptor :value-detection :confidence) 0)))]
    {:raw  field-descriptors
     :best (->> field-descriptors
                (sort-by score)
                last
                :value-detection
                :text)}))
(require '[auto-ap.solr :as solr])
(require '[auto-ap.logging :as alog])
(require '[com.brunobonacci.mulog :as mu])
(require '[auto-ap.datomic.clients :as d-clients])
(require '[auto-ap.time :as atime])
(defn textract->coalesced
  "Reduces a Textract response `tx` to one `find-best` result per invoice
  attribute. Nodes come from `lookup` and are selected by the text of their
  `:type`. Returns a map keyed by :total, :account-number,
  :customer-identifier, :vendor-name, :date and :invoice-number."
  [tx]
  (let [nodes        (lookup tx)
        best-of-type (fn [type-text]
                       (find-best
                        (filter #(= type-text (-> % :type :text)) nodes)))]
    {:total               (best-of-type "TOTAL")
     :account-number      (best-of-type "CUSTOMER_NUMBER")
     :customer-identifier (best-of-type "RECEIVER_NAME")
     :vendor-name         (best-of-type "VENDOR_NAME")
     :date                (best-of-type "ORDER_DATE")
     :invoice-number      (best-of-type "INVOICE_RECEIPT_ID")}))
(defn clean-customer
  "Returns `c` with every run of non-word characters collapsed into a single
  space."
  [c]
  (let [non-word-run #"\W+"]
    (clojure.string/replace c non-word-run " ")))
(require '[datomic.api :as dc])
(require '[auto-ap.datomic :refer [conn]])
(defn coalesced->invoice
  "Builds a pending, unpaid invoice entity map from `i`, a map of `find-best`
  results keyed by :vendor-name, :account-number, :customer-identifier,
  :invoice-number, :total and :date.

  How each part is resolved:
    vendor   - first Solr \"vendors\" hit for the vendor name scoring above 4.0
    client   - `d-clients/exact-match` on the account number when one was
               extracted, otherwise the first Solr \"clients\" hit for the
               cleaned customer identifier (no score threshold)
    location - first entry of the client's :client/locations
    total    - first run of digits, dots and minus signs in the total text,
               parsed as a double; nil when the text is missing or has no
               such run
    date     - parsed as MM/dd/yyyy, falling back to MM/dd/yy; nil when no
               date text was extracted

  Logs a warning when the vendor or the client cannot be found. Returns nil
  unless client, date, invoice number, vendor and total are all present."
  [i]
  (mu/with-context {:inference i}
    (let [vendor-name (:best (:vendor-name i))
          ;; Query Solr once; the same results feed the warning below instead
          ;; of issuing an identical second query just for logging.
          vendor-results (solr/query solr/impl "vendors" {"query" (format "name:(%s) " vendor-name) "fields" "score, *"})
          vendor-id (->> vendor-results
                         (filter (fn [d] (> (:score d) 4.0)))
                         (map (comp #(Long/parseLong %) :id))
                         first)
          account-number (:best (:account-number i))
          customer-identifier (:best (:customer-identifier i))
          client-id (or
                     (when (not-empty account-number)
                       (:db/id (d-clients/exact-match account-number)))
                     (when customer-identifier
                       ;; The score threshold used for vendors is disabled here.
                       (->> (solr/query solr/impl "clients" {"query" (format "name:(%s) " (clean-customer customer-identifier)) "fields" "score, *"})
                            #_(filter (fn [d] (> (:score d) 4.0)))
                            (map (comp #(Long/parseLong %) :id))
                            first)))
          location (when client-id
                     (->> (dc/pull (dc/db conn) '[:client/locations] client-id)
                          :client/locations
                          first))
          invoice-number (:best (:invoice-number i))
          ;; Parsing happens inside the nil-safe thread so a missing or
          ;; non-numeric total yields nil (and therefore no invoice) rather
          ;; than a NullPointerException from Double/parseDouble.
          total (some->> i
                         :total
                         :best
                         (re-find #"([0-9.\-]+)")
                         second
                         Double/parseDouble)
          date-text (:best (:date i))
          ;; Skip parsing entirely when no date text was extracted.
          date (when date-text
                 (or (atime/parse date-text "MM/dd/yyyy")
                     (atime/parse date-text "MM/dd/yy")))]
      (when-not vendor-id
        (alog/warn ::cant-find-vendor
                   :search-results vendor-results
                   :vendor-name (:vendor-name i)))
      (when-not client-id
        (alog/warn ::cant-find-customer))
      (when (and client-id date invoice-number vendor-id total)
        {:db/id (itx/random-tempid)
         :invoice/client client-id
         :invoice/client-identifier (or account-number customer-identifier)
         :invoice/vendor vendor-id
         :invoice/invoice-number invoice-number
         :invoice/total total
         :invoice/date date
         :invoice/location location
         :invoice/import-status :import-status/pending
         :invoice/outstanding-balance total
         :invoice/status :invoice-status/unpaid}))))
(defn file->textract->invoice
  "Uploads the local file at path `f` (a string) to the `:data-bucket` bucket
  from `env` under a random `textract-files/<uuid>.<ext>` key, runs it
  through `textract-file`, and converts the result with
  `textract->coalesced` and `coalesced->invoice`.

  `<ext>` is the last piece of `f` after splitting on dots or backslashes.
  The upload is always labelled with content type \"application/pdf\".
  Returns whatever `coalesced->invoice` returns."
  [f]
  (let [;; Fully qualified: no `str` alias for clojure.string is required in
        ;; this session, so `str/split` would not resolve.
        extension (last (clojure.string/split f #"[\\.]"))
        s3-location (str "textract-files/" (UUID/randomUUID) "." extension)
        file (io/file f)]
    (with-open [stream (io/input-stream f)]
      (s3/put-object (:data-bucket env)
                     s3-location
                     stream
                     {:content-type "application/pdf"
                      :content-length (.length file)}))
    (-> (textract-file s3-location)
        (textract->coalesced)
        (coalesced->invoice))))
;; Disabled scratch form (discarded by the reader via `#_`): would parse
;; "batch.json" into `result`.
;; NOTE(review): no `json` alias is required in the visible part of this file
;; (cheshire.core is aliased as `cheshire`), so this needs adjusting before it
;; is re-enabled.
#_(def result (with-open [x (io/reader "batch.json")]
(json/parse-stream x)))