;; REPL-oriented script: uploads a local file to S3, runs an Amazon Textract
;; expense analysis on it, and turns the result into an invoice entity map.

;; --- Textract client namespace -------------------------------------------

(ns amazonica.aws.textract
  (:require [auto-ap.solr :as solr]
            [amazonica.core :as amz])
  (:import (com.amazonaws.services.textract AmazonTextractClient)
           (com.amazonaws.services.textract.model DocumentLocation
                                                  GetExpenseAnalysisRequest
                                                  S3Object
                                                  StartExpenseAnalysisRequest)))

;; NOTE(review): presumably this interns the Textract client operations
;; (e.g. start-expense-analysis / get-expense-analysis, used below through the
;; `txtract` alias) into this namespace -- confirm against amazonica docs.
(amz/set-client AmazonTextractClient *ns*)

;; --- Everything below lives in the `user` namespace -----------------------

(in-ns 'user)

(require '[clojure.java.io :as io])
;; clojure.string was previously used via the `str` alias without being
;; required, which fails with "No such namespace: str" in a fresh REPL.
(require '[clojure.string :as str])
(require '[cheshire.core :as cheshire])
(require '[amazonica.aws.s3 :as s3])
(require '[auto-ap.graphql.utils :refer [cleanse-query]])
(require '[iol-ion.tx :as itx])
(require '[config.core :refer [env]])
(require '[amazonica.aws.textract :as txtract])
(require '[auto-ap.solr :as solr])
(require '[auto-ap.logging :as alog])
(require '[com.brunobonacci.mulog :as mu])
(require '[auto-ap.datomic.clients :as d-clients])
(require '[auto-ap.time :as atime])
(require '[datomic.api :as dc])
(require '[auto-ap.datomic :refer [conn]])

(import '[com.amazonaws.services.textract AmazonTextractClient])
(import '[com.amazonaws.services.textract.model
          DocumentLocation
          GetExpenseAnalysisRequest
          S3Object
          StartExpenseAnalysisRequest])
(import '[java.util UUID])

(defn textract-file
  "Starts an expense analysis for the object named `s3-location` in the bucket
  `(:data-bucket env)` and polls until the job status is no longer
  \"IN_PROGRESS\". Prints the status on every poll and returns the final
  get-expense-analysis response (whatever its status is).

  Blocks the calling thread; there is no timeout."
  [s3-location]
  (let [job-id (:job-id (txtract/start-expense-analysis
                         {:document-location
                          {:s3-object {:bucket (:data-bucket env)
                                       :name   s3-location}}}))
        poll   #(txtract/get-expense-analysis {:job-id job-id})]
    (loop [result (poll)]
      (println "checking..." (:job-status result))
      (if (= "IN_PROGRESS" (:job-status result))
        ;; Only wait when another poll is actually needed; a finished job is
        ;; returned immediately.
        (do (Thread/sleep 2000)
            (recur (poll)))
        result))))

(defn lookup
  "Flattens a Textract expense response into one sequence: the expense
  documents themselves followed by all of their :summary-fields. The
  :geometry key is removed from each element's :label-detection and
  :value-detection.

  Note that the document maps are part of the result as well; since they are
  passed through the same `update` calls they end up with nil
  :label-detection / :value-detection entries."
  [tx]
  (let [documents (:expense-documents tx)
        strip-geometry (fn [field]
                         (-> field
                             (update :label-detection dissoc :geometry)
                             (update :value-detection dissoc :geometry)))]
    (->> documents
         (mapcat :summary-fields)
         (concat documents)
         (map strip-geometry)
         #_(group-by (fn [sf] [(get-in sf ["Type" "Text"]) (get-in sf ["LabelDetection" "Text"])] )))))

(defn find-best
  "Given candidate field descriptors, returns
  {:raw <the candidates> :best <text of the highest scoring candidate>}.

  The score is (type confidence) * (value-detection confidence); a missing
  confidence counts as 0 instead of throwing. On ties the later candidate
  wins. :best is nil when there are no candidates."
  [field-descriptors]
  (let [confidence (fn [descriptor k] (or (get-in descriptor [k :confidence]) 0))
        score      (fn [descriptor]
                     (* (confidence descriptor :type)
                        (confidence descriptor :value-detection)))]
    {:raw  field-descriptors
     :best (->> field-descriptors
                (sort-by score)
                last
                :value-detection
                :text)}))

(defn textract->coalesced
  "Picks, for each invoice attribute of interest, the best candidate (see
  `find-best`) among the fields of `tx` whose type text matches the
  corresponding Textract field type."
  [tx]
  (let [fields  (lookup tx)
        best-of (fn [type-text]
                  (find-best (filter (fn [node] (= type-text (:text (:type node))))
                                     fields)))]
    {:total               (best-of "TOTAL")
     :account-number      (best-of "CUSTOMER_NUMBER")
     :customer-identifier (best-of "RECEIVER_NAME")
     :vendor-name         (best-of "VENDOR_NAME")
     :date                (best-of "ORDER_DATE")
     :invoice-number      (best-of "INVOICE_RECEIPT_ID")}))

(defn clean-customer
  "Replaces every run of non-word characters in `c` with a single space."
  [c]
  (str/replace c #"\W+" " "))

(defn- parse-total
  "Extracts the first numeric token from `raw` and parses it as a double.
  Commas are removed first so that \"1,234.56\" parses as 1234.56 rather than
  1.0. Returns nil when `raw` is nil, contains no numeric token, or the token
  is not a valid number (e.g. a lone \"-\")."
  [raw]
  (when raw
    (when-let [token (re-find #"[0-9.\-]+" (str/replace raw "," ""))]
      (try
        (Double/parseDouble token)
        (catch NumberFormatException _ nil)))))

(defn coalesced->invoice
  "Builds a pending, unpaid invoice entity map from the output of
  `textract->coalesced`.

  - vendor: first Solr \"vendors\" hit for the vendor name with score > 4.0
  - client: exact match on the account number, otherwise the first Solr
    \"clients\" hit for the cleaned customer identifier
  - location: first of the client's :client/locations
  - date: parsed as MM/dd/yyyy, falling back to MM/dd/yy

  Logs a warning when the vendor or the client cannot be resolved. Returns nil
  unless client, date, invoice number, vendor and total are all present."
  [i]
  (mu/with-context {:inference i}
    (let [parse-id            (comp #(Long/parseLong %) :id)
          vendor-name         (:best (:vendor-name i))
          ;; Query once and reuse the hits for the warning below. Skipped when
          ;; no vendor name was extracted, to avoid searching for "null".
          vendor-results      (when vendor-name
                                (solr/query solr/impl "vendors"
                                            {"query"  (format "name:(%s) " vendor-name)
                                             "fields" "score, *"}))
          vendor-id           (->> vendor-results
                                   (filter (fn [d] (> (:score d) 4.0)))
                                   (map parse-id)
                                   first)
          account-number      (:best (:account-number i))
          customer-identifier (:best (:customer-identifier i))
          client-id           (or (when (not-empty account-number)
                                    (:db/id (d-clients/exact-match account-number)))
                                  (when customer-identifier
                                    (->> (solr/query solr/impl "clients"
                                                     {"query"  (format "name:(%s) " (clean-customer customer-identifier))
                                                      "fields" "score, *"})
                                         #_(filter (fn [d] (> (:score d) 4.0)))
                                         (map parse-id)
                                         first)))
          location            (when client-id
                                (->> (dc/pull (dc/db conn) '[:client/locations] client-id)
                                     :client/locations
                                     first))
          invoice-number      (:best (:invoice-number i))
          ;; nil (instead of an exception) when no usable total was extracted.
          total               (parse-total (:best (:total i)))
          date-text           (:best (:date i))
          date                (or (atime/parse date-text "MM/dd/yyyy")
                                  (atime/parse date-text "MM/dd/yy"))]
      (when-not vendor-id
        (alog/warn ::cant-find-vendor
                   :search-results vendor-results
                   :vendor-name (:vendor-name i)))
      (when-not client-id
        (alog/warn ::cant-find-customer))
      (when (and client-id date invoice-number vendor-id total)
        {:db/id                      (itx/random-tempid)
         :invoice/client             client-id
         :invoice/client-identifier  (or account-number customer-identifier)
         :invoice/vendor             vendor-id
         :invoice/invoice-number     invoice-number
         :invoice/total              total
         :invoice/date               date
         :invoice/location           location
         :invoice/import-status      :import-status/pending
         :invoice/outstanding-balance total
         :invoice/status             :invoice-status/unpaid}))))

(defn file->textract->invoice
  "Uploads the local file at path `f` (a string) to
  textract-files/<random uuid>.<extension of f> in `(:data-bucket env)`, runs
  it through Textract and returns the resulting invoice map (or nil, see
  `coalesced->invoice`).

  The object is always uploaded with content type \"application/pdf\"."
  [f]
  (let [extension   (last (str/split f #"[\\.]"))
        s3-location (str "textract-files/" (UUID/randomUUID) "." extension)
        file        (io/file f)]
    (with-open [stream (io/input-stream f)]
      (s3/put-object (:data-bucket env)
                     s3-location
                     stream
                     {:content-type   "application/pdf"
                      :content-length (.length file)}))
    (-> (textract-file s3-location)
        (textract->coalesced)
        (coalesced->invoice))))

#_(def result (with-open [x (io/reader "batch.json")] (json/parse-stream x)))