162 lines
6.8 KiB
Plaintext
162 lines
6.8 KiB
Plaintext
|
|
(ns amazonica.aws.textract
|
|
(:require
|
|
[auto-ap.solr :as solr]))
|
|
(require '[amazonica.core :as amz])
|
|
(import '[com.amazonaws.services.textract AmazonTextractClient ])
|
|
|
|
(import '[com.amazonaws.services.textract AmazonTextractClient ])
|
|
(import '[com.amazonaws.services.textract.model S3Object ])
|
|
(import '[com.amazonaws.services.textract.model StartExpenseAnalysisRequest ])
|
|
(import '[com.amazonaws.services.textract.model GetExpenseAnalysisRequest ])
|
|
|
|
(import '[com.amazonaws.services.textract.model DocumentLocation])
|
|
;; Registers AmazonTextractClient with amazonica for this namespace.
;; NOTE(review): presumably this is what generates the
;; start-expense-analysis / get-expense-analysis functions called later via
;; the `txtract` alias, since they are not defined anywhere in this file --
;; confirm against amazonica.core/set-client.
(amz/set-client AmazonTextractClient *ns*)
|
|
|
|
(in-ns 'user)
|
|
(require '[clojure.java.io :as io])
|
|
(require '[cheshire.core :as cheshire])
|
|
(require '[amazonica.aws.s3 :as s3])
|
|
(require '[auto-ap.graphql.utils :refer [cleanse-query]])
|
|
(require '[iol-ion.tx :as itx])
|
|
|
|
(require '[config.core :refer [env]])
|
|
(require '[amazonica.aws.textract :as txtract])
|
|
|
|
|
|
(import '[com.amazonaws.services.textract AmazonTextractClient ])
|
|
(import '[com.amazonaws.services.textract.model S3Object ])
|
|
(import '[com.amazonaws.services.textract.model StartExpenseAnalysisRequest ])
|
|
(import '[com.amazonaws.services.textract.model GetExpenseAnalysisRequest ])
|
|
|
|
(import '[com.amazonaws.services.textract.model DocumentLocation])
|
|
(import '[java.util UUID])
|
|
|
|
|
|
|
|
(defn textract-file
  "Starts a Textract expense-analysis job for the S3 object named
  `s3-location` in the bucket `(:data-bucket env)` and polls every two
  seconds until the job status is no longer \"IN_PROGRESS\".

  Returns the last get-expense-analysis response (whatever its final status
  is). Prints the status on every poll.

  NOTE(review): there is no timeout, so this blocks for as long as the job
  keeps reporting \"IN_PROGRESS\". Only the response of a single
  get-expense-analysis call is returned; no pagination token is followed --
  confirm that is sufficient for the documents being processed."
  [s3-location]
  (let [job-id (:job-id (txtract/start-expense-analysis
                         {:document-location
                          {:s3-object {:bucket (:data-bucket env)
                                       :name s3-location}}}))]
    (loop [result (txtract/get-expense-analysis {:job-id job-id})]
      (println "checking..." (:job-status result))
      (if (= "IN_PROGRESS" (:job-status result))
        ;; Only wait when another poll is actually needed; once the job has
        ;; left IN_PROGRESS the result is returned immediately.
        (do
          (Thread/sleep 2000)
          (recur (txtract/get-expense-analysis {:job-id job-id})))
        result))))
|
|
|
|
(defn lookup
  "Flattens a Textract expense response `tx` into one lazy sequence of maps.

  The sequence holds every entry of `(:expense-documents tx)` followed by
  every entry of those documents' `:summary-fields`. Each element has the
  `:geometry` key removed from its `:label-detection` and `:value-detection`
  values (an element lacking those keys ends up with them set to nil).

  NOTE(review): the expense documents themselves are included ahead of the
  summary fields; this may be a leftover from an unfinished change --
  confirm whether it is intended."
  [tx]
  (let [documents      (:expense-documents tx)
        summary-fields (mapcat :summary-fields documents)
        drop-geometry  (fn [field]
                         (reduce (fn [acc k] (update acc k dissoc :geometry))
                                 field
                                 [:label-detection :value-detection]))]
    ;; A grouping step by [\"Type\" \"Text\"] / [\"LabelDetection\" \"Text\"] was
    ;; disabled (reader-discarded) in the original and is not performed.
    (map drop-geometry (concat documents summary-fields))))
|
|
|
|
(defn find-best
  "Picks the most confident candidate among `field-descriptors`.

  Each descriptor is scored as the product of its `[:type :confidence]` and
  `[:value-detection :confidence]`. Returns a map with:
    :raw  - the descriptors exactly as passed in
    :best - the `[:value-detection :text]` of the highest-scoring descriptor
            (the last one in input order on ties), or nil when there are no
            descriptors."
  [field-descriptors]
  (let [score  (fn [descriptor]
                 (* (get-in descriptor [:type :confidence])
                    (get-in descriptor [:value-detection :confidence])))
        ;; sort-by is stable, so taking the last element keeps the original
        ;; tie-breaking behaviour.
        winner (last (sort-by score field-descriptors))]
    {:raw  field-descriptors
     :best (get-in winner [:value-detection :text])}))
|
|
|
|
(require '[auto-ap.solr :as solr])
|
|
(require '[auto-ap.logging :as alog])
|
|
|
|
(require '[com.brunobonacci.mulog :as mu])
|
|
(require '[auto-ap.datomic.clients :as d-clients])
|
|
(require '[auto-ap.time :as atime])
|
|
|
|
(defn textract->coalesced
  "Reduces a Textract expense response `tx` to one best guess per invoice
  attribute.

  The flattened fields from `lookup` are filtered by the text of their
  `:type`, and each group is passed to `find-best`. Returns a map whose
  values are `find-best` results:
    :total               <- \"TOTAL\"
    :account-number      <- \"CUSTOMER_NUMBER\"
    :customer-identifier <- \"RECEIVER_NAME\"
    :vendor-name         <- \"VENDOR_NAME\"
    :date                <- \"ORDER_DATE\"
    :invoice-number      <- \"INVOICE_RECEIPT_ID\""
  [tx]
  (let [fields       (lookup tx)
        best-of-type (fn [type-text]
                       (find-best
                        (filter (fn [field]
                                  (= type-text (:text (:type field))))
                                fields)))]
    {:total               (best-of-type "TOTAL")
     :account-number      (best-of-type "CUSTOMER_NUMBER")
     :customer-identifier (best-of-type "RECEIVER_NAME")
     :vendor-name         (best-of-type "VENDOR_NAME")
     :date                (best-of-type "ORDER_DATE")
     :invoice-number      (best-of-type "INVOICE_RECEIPT_ID")}))
|
|
|
|
(defn clean-customer
  "Returns the string `c` with every run of non-word characters (regex `\\W+`)
  collapsed into a single space."
  [c]
  (let [non-word-run #"\W+"]
    (-> c
        (clojure.string/replace non-word-run " "))))
|
|
|
|
(require '[datomic.api :as dc])
|
|
|
|
(require '[auto-ap.datomic :refer [conn]])
|
|
|
|
(defn- parse-total-amount
  "Parses a detected total such as \"$1,234.56\" into a double.

  Thousands separators (commas) are removed before the first run of digits,
  dots and minus signs is extracted. Returns nil when `s` is nil, contains no
  such run, or the run is not a valid number."
  [s]
  (when-let [token (some->> (some-> s (clojure.string/replace "," ""))
                            (re-find #"([0-9.\-]+)")
                            second)]
    (try
      (Double/parseDouble token)
      (catch NumberFormatException _
        nil))))

(defn coalesced->invoice
  "Turns the coalesced Textract fields `i` (the map produced by
  `textract->coalesced`) into an invoice entity map.

  Resolution steps:
    - vendor: first Solr \"vendors\" hit for the detected vendor name whose
      score is above 4.0
    - client: `d-clients/exact-match` on the detected account number, falling
      back to the first Solr \"clients\" hit for the cleaned customer name
    - location: first of the client's `:client/locations`
    - total: first numeric token of the detected total
    - date: the detected date parsed as MM/dd/yyyy, then as MM/dd/yy

  Logs a warning when no vendor or no client can be found. Returns the
  invoice map when client, date, invoice number, vendor and total are all
  present; otherwise returns nil."
  [i]
  (mu/with-context {:inference i}
    (let [vendor-name         (:best (:vendor-name i))
          ;; Queried once and reused for the warning below. Skipped when no
          ;; vendor name was detected, so we never search for the literal
          ;; text "null".
          vendor-results      (when (not-empty vendor-name)
                                (solr/query solr/impl "vendors"
                                            {"query"  (format "name:(%s) " vendor-name)
                                             "fields" "score, *"}))
          vendor-id           (->> vendor-results
                                   (filter (fn [d] (> (:score d) 4.0)))
                                   (map (comp #(Long/parseLong %) :id))
                                   first)
          account-number      (:best (:account-number i))
          customer-identifier (:best (:customer-identifier i))
          client-id           (or
                               (when (not-empty account-number)
                                 (:db/id (d-clients/exact-match account-number)))
                               (when customer-identifier
                                 ;; No score threshold is applied to client
                                 ;; matches (it was disabled in the original).
                                 (->> (solr/query solr/impl "clients"
                                                  {"query"  (format "name:(%s) " (clean-customer customer-identifier))
                                                   "fields" "score, *"})
                                      (map (comp #(Long/parseLong %) :id))
                                      first)))
          location            (when client-id
                                (->> (dc/pull (dc/db conn) '[:client/locations] client-id)
                                     :client/locations
                                     first))
          invoice-number      (:best (:invoice-number i))
          ;; nil (rather than an exception) when the total is missing or
          ;; unparsable, so the guard below can reject the invoice.
          total               (parse-total-amount (:best (:total i)))
          ;; Only attempt parsing when a date was actually detected.
          date                (when-let [raw-date (:best (:date i))]
                                (or (atime/parse raw-date "MM/dd/yyyy")
                                    (atime/parse raw-date "MM/dd/yy")))]
      (when-not vendor-id
        (alog/warn ::cant-find-vendor
                   :search-results vendor-results
                   :vendor-name (:vendor-name i)))
      (when-not client-id
        (alog/warn ::cant-find-customer))
      (when (and client-id date invoice-number vendor-id total)
        {:db/id                       (itx/random-tempid)
         :invoice/client              client-id
         :invoice/client-identifier   (or account-number customer-identifier)
         :invoice/vendor              vendor-id
         :invoice/invoice-number      invoice-number
         :invoice/total               total
         :invoice/date                date
         :invoice/location            location
         :invoice/import-status       :import-status/pending
         :invoice/outstanding-balance total
         :invoice/status              :invoice-status/unpaid}))))
|
|
|
|
(defn file->textract->invoice
  "Uploads the local file at path `f` to S3 and runs it through the Textract
  pipeline.

  The file is stored in the bucket `(:data-bucket env)` under
  \"textract-files/<random-uuid>.<extension of f>\", then passed through
  `textract-file`, `textract->coalesced` and `coalesced->invoice`. Returns
  whatever `coalesced->invoice` returns.

  `f` must be a string path, since its extension is taken by splitting the
  string.

  NOTE(review): the upload is always tagged with content type
  \"application/pdf\" regardless of the file's extension -- confirm this is
  intended."
  [f]
  (let [;; Fully-qualified: no `str` alias for clojure.string is declared in
        ;; this file, so `str/split` would not resolve.
        extension   (last (clojure.string/split f #"[\\.]"))
        s3-location (str "textract-files/" (UUID/randomUUID) "." extension)
        file        (io/file f)]
    (with-open [stream (io/input-stream f)]
      (s3/put-object (:data-bucket env)
                     s3-location
                     stream
                     {:content-type   "application/pdf"
                      :content-length (.length file)}))
    (-> (textract-file s3-location)
        (textract->coalesced)
        (coalesced->invoice))))
|
|
|
|
|
|
|
|
|
|
#_(def result (with-open [x (io/reader "batch.json")]
|
|
(json/parse-stream x)))
|
|
|
|
|
|
|