211 lines
8.6 KiB
Clojure
211 lines
8.6 KiB
Clojure
(ns auto-ap.parse
|
|
(:require [amazonica.aws.lambda :as lambda]
|
|
[amazonica.aws.s3 :as s3]
|
|
[auto-ap.logging :as alog]
|
|
[auto-ap.parse.csv :as csv]
|
|
[auto-ap.parse.excel :as excel]
|
|
[auto-ap.parse.templates :as t]
|
|
[auto-ap.parse.util :as u]
|
|
[auto-ap.ssr.vendor :as vendors]
|
|
[clj-fuzzy.metrics :as m]
|
|
[clojure.data.json :as json]
|
|
[clojure.java.io :as io]
|
|
[clojure.java.shell :as sh]
|
|
[clojure.set :as set]
|
|
[clojure.string :as str]))
|
|
|
|
;; Holds the most recent text handed to `parse` (it is reset! there on every call).
;; defonce keeps the atom, and its current value, across namespace reloads.
(defonce last-text (atom nil))
|
|
|
|
|
|
(defn template-applies?
  "Returns true when every regex in the template's :keywords is found somewhere
  in `text`. A template with no keywords (nil or empty) always applies."
  [text {:keys [keywords]}]
  ;; re-find yields nil exactly when a pattern does not occur, so the template
  ;; applies when no keyword pattern comes back nil.
  (not-any? (fn [keyword-pattern]
              (nil? (re-find keyword-pattern text)))
            keywords))
|
|
|
|
(defn extract-template
  "Applies `template` to invoice text and returns the extracted field maps.

  [text template]
    Logs the template that was chosen. When the template has a :multi regex,
    `text` is split on it and each piece is extracted on its own (with the
    complete text passed along as `full-text`, and :multi removed from the
    template); the per-piece results are concatenated. Otherwise the whole
    text is extracted once.

  [text full-text template]
    Returns nil when `template` is nil, or when the template has a
    :multi-match? regex that is not found in `text`. Otherwise returns a
    one-element vector holding a map with :vendor-code, :template, :text and
    :full-text plus one entry per key of the template's :extract map.

    For each :extract key the raw value is the first capture group of the
    first match in `text`, falling back to `full-text`, then trimmed. It is
    handed to u/parse-value together with the [parser params] pair found under
    that key in the template's :parser map. If parsing throws, a warning is
    logged and the key is associated with whatever alog/warn returns."
  ([text template]
   (alog/info ::template-determined
              :template (str template))
   (if-let [splitter (:multi template)]
     (let [piece-template (dissoc template :multi)]
       (mapcat (fn [piece] (extract-template piece text piece-template))
               (str/split text splitter)))
     (extract-template text text template)))
  ([text full-text template]
   (let [required-pattern (:multi-match? template)]
     (when (and template
                (or (not required-pattern)
                    (re-find required-pattern text)))
       (let [;; Fields every result carries, regardless of what is extracted.
             seed {:vendor-code (:vendor template)
                   :template (str (:vendor template) " - selected because it matched "
                                  (pr-str (:keywords template)))
                   :text text
                   :full-text full-text}
             ;; First capture group of the first match of `pattern` in `haystack`.
             first-capture (fn [pattern haystack]
                             (first (map second (re-seq pattern haystack))))
             extract-field (fn [acc field pattern]
                             (let [raw (some-> (or (first-capture pattern text)
                                                   (first-capture pattern full-text))
                                               str/trim)
                                   [value-parser parser-params] (field (:parser template))]
                               (assoc acc field
                                      (try
                                        (u/parse-value value-parser parser-params raw)
                                        (catch Exception e
                                          (alog/warn ::cant-parse-value :error e :raw raw))))))]
         [(reduce-kv extract-field seed (:extract template))])))))
|
|
|
|
(defn parse
  "Remembers `text` in the `last-text` atom, then extracts it with the first
  entry of t/pdf-templates whose keywords all occur in it. When no template
  applies, extract-template is called with a nil template."
  [text]
  (reset! last-text text)
  (let [applicable (filter (fn [template] (template-applies? text template))
                           t/pdf-templates)]
    (extract-template text (first applicable))))
|
|
|
|
|
|
|
|
(defmulti parse-file
  "Parses a file, choosing the implementation from the filename's extension.

  Called as (parse-file file filename & options). The dispatch value is the
  lower-cased text after the last \".\" in `filename`.

  Options:
  - :allow-glimpse? (default false) - If true, allows parsing a glimpse of the file."
  (fn [_ filename & _opts]
    (-> filename
        (str/split #"\.")
        last
        .toLowerCase)))
|
|
|
|
(defn invoke-glimpse2
  "Invokes the \"glimpse2\" lambda for the object key `f` and returns its
  response decoded with json/read-str.

  The request payload is a JSON object whose \"url\" points at `f` under
  https://data.prod.app.integreatconsult.com/. The raw response text is logged
  before it is decoded."
  [f]
  (let [;; NOTE(review): timeouts are presumably milliseconds (2 minutes) - confirm.
        client-options {:client-config {:request-timeout 120000
                                        :socket-timeout 120000}}
        request-json (json/write-str
                      (alog/peek ::x {"url" (str "https://" "data.prod.app.integreatconsult.com" "/" f)}))
        response (lambda/invoke client-options
                                {:function-name "glimpse2"
                                 :payload request-json})
        result (slurp (:payload response))]
    (alog/info ::glimpse2-payload :payload result)
    (json/read-str result)))
|
|
|
|
(defn glimpse2
  "Fallback invoice extraction through the glimpse2 lambda.

  Uploads `file` to the data.prod.app.integreatconsult.com bucket under a
  random glimpse2/import/<uuid>.pdf key, invokes the lambda on that key and
  turns every record it returns into an invoice map (:date,
  :customer-identifier, :account-number, :vendor-search, :vendor-code, :total,
  :invoice-number, :template).

  Returns a fully realized sequence of those maps. If any step throws, whether
  upload, invocation or building a record, the error is logged as a warning
  and nil is returned."
  [file]
  (try
    (let [tmp-key (str "glimpse2/import/" (java.util.UUID/randomUUID) ".pdf")
          _ (with-open [f (io/input-stream file)]
              (s3/put-object {:bucket-name "data.prod.app.integreatconsult.com"
                              :key tmp-key
                              :input-stream f}))
          is (invoke-glimpse2 tmp-key)]
      (alog/peek ::glimpse2-result is)
      ;; `for` is lazy: without doall the records would be built only after
      ;; this try has been exited, so a bad record (e.g. a missing "date")
      ;; would throw in the caller and skip the catch below.
      (doall
       (for [i is]
         {:date (u/parse-value :clj-time "yyyy-MM-dd" (str/trim (get i "date")))
          :customer-identifier (get i "customer_identifier")
          :account-number (not-empty (get i "account_number"))
          :vendor-search (get i "vendor_identifier")
          :vendor-code (-> (vendors/best-match (get i "vendor_identifier"))
                           (get "label"))
          :total (get i "total")
          :invoice-number (get i "invoice_number")
          :template "None found - defaulting to ChatGPT"})))
    (catch Exception e
      (alog/warn ::glimpse2-not-work :error e)
      nil)))
|
|
|
|
;; PDF: run `pdftotext -layout <file> -`, feed its stdout to `parse`, and use
;; that result when it is truthy. Otherwise, when :allow-glimpse? is set, fall
;; back to glimpse2 (whose result is logged); without the option the fallback
;; value is false.
(defmethod parse-file "pdf"
  [file _ & {:keys [allow-glimpse?] :or {allow-glimpse? false}}]
  (let [pdf-text (:out (sh/sh "pdftotext" "-layout" file "-"))
        templated (parse pdf-text)]
    (if templated
      templated
      (and allow-glimpse?
           (alog/peek ::glimpse2-result (glimpse2 file))))))
|
|
|
|
;; CSV files are delegated to auto-ap.parse.csv; any extra options are ignored.
(defmethod parse-file "csv"
  [file filename & _opts]
  (csv/parse-file file filename))
|
|
|
|
;; Legacy Excel (.xls) files are delegated to auto-ap.parse.excel; any extra
;; options are ignored.
(defmethod parse-file "xls"
  [file filename & _opts]
  (excel/parse-file file filename))
|
|
|
|
|
|
;; Excel (.xlsx) files use the same auto-ap.parse.excel parser as .xls; any
;; extra options are ignored.
(defmethod parse-file "xlsx"
  [file filename & _opts]
  (excel/parse-file file filename))
|
|
|
|
(defn best-match
  "Finds the client that most plausibly corresponds to `invoice-client-name`.

  Each client is compared through its :client/name and every entry of
  :client/matches (nil entries are skipped), case-insensitively.

  1. Fuzzy pass: m/jaccard is used as a distance (lower is closer). A distance
     of 0 between strings that are not actually equal is bumped to 0.1 so a
     true exact match always sorts first. The closest candidate whose distance
     is <= `threshold` (default 0.25) wins.
  2. Word pass (only when the fuzzy pass finds nothing): the invoice name is
     split on whitespace, \":\" and \"-\"; the client sharing the most words
     with it wins, provided at least one word is shared. On a tie the client
     appearing last in `clients` is chosen.

  Returns a pair [client score], where score is the distance (fuzzy pass) or
  the shared word count (word pass), or nil when nothing matches or
  `invoice-client-name` is nil."
  ([clients invoice-client-name]
   (best-match clients invoice-client-name 0.25))
  ([clients invoice-client-name threshold]
   ;; A nil name cannot match anything; return nil (as exact-match does)
   ;; instead of throwing a NullPointerException.
   (when invoice-client-name
     (let [;; Lower-cased once rather than for every candidate comparison.
           wanted (str/lower-case invoice-client-name)
           ;; All lower-cased labels a client may be known by, matches first
           ;; and name last, skipping missing ones.
           labels-of (fn [{matches :client/matches
                           client-name :client/name
                           :or {matches []}}]
                       (->> (conj matches client-name)
                            (remove nil?)
                            (map str/lower-case)))
           distance-to (fn [label]
                         (let [distance (m/jaccard wanted label)]
                           (cond
                             (not= 0 distance) distance
                             (= wanted label) 0
                             ;; zero distance without equality: rank it just
                             ;; behind a genuine exact match
                             :else 0.1)))
           fuzzy-match (->> clients
                            (mapcat (fn [client]
                                      (map (fn [label] [client (distance-to label)])
                                           (labels-of client))))
                            (filter (fn [[_ distance]] (<= distance threshold)))
                            (sort-by second)
                            first)]
       (or fuzzy-match
           ;; Word-overlap fallback, computed only when actually needed.
           (let [word-set (->> (str/split wanted #"[\s:\-]")
                               (remove str/blank?)
                               set)]
             (->> clients
                  (map (fn [client]
                         (let [client-words (set (mapcat (fn [label] (str/split label #"\s"))
                                                         (labels-of client)))]
                           [client (count (set/intersection client-words word-set))])))
                  (filter (fn [[_ shared]] (> shared 0)))
                  (sort-by second)
                  ;; last of the stable ascending sort: the highest count,
                  ;; with the latest client winning ties
                  last)))))))
|
|
|
|
(defn exact-match
  "Returns the first client in `clients` whose :client/name or any entry of
  :client/matches equals `invoice-client-name`, ignoring case. Returns nil
  when no client matches or when `invoice-client-name` is nil; nil names and
  nil match entries are skipped."
  [clients invoice-client-name]
  (letfn [(same-name? [candidate]
            (and candidate
                 invoice-client-name
                 (= (.toLowerCase invoice-client-name) (.toLowerCase candidate))))
          (known-as? [{:keys [:client/matches :client/name] :or {matches []}}]
            (->> (conj matches name)
                 (filter same-name?)
                 seq))]
    (first (filter known-as? clients))))
|
|
|
|
(defn best-location-match
  "Picks a location for `client`, trying in order:

  1. the location of the first :client/location-matches pattern found in `text`,
  2. the same search against `full-text`,
  3. the client's :client/default-location,
  4. the first of the client's :client/locations.

  Each entry of a :location-match/matches list is used as a case-insensitive
  regex. Returns nil when none of the above yields a value."
  [client text full-text]
  (let [;; Flattened [location pattern] pairs, in declaration order.
        location-patterns (mapcat (fn [{:keys [:location-match/location :location-match/matches]}]
                                    (map (fn [pattern] [location pattern]) matches))
                                  (:client/location-matches client))
        location-in (fn [haystack]
                      (ffirst
                       (filter (fn [[_ pattern]]
                                 (re-find (re-pattern (str "(?i)" pattern)) haystack))
                               location-patterns)))]
    (or (location-in text)
        (location-in full-text)
        (:client/default-location client)
        (first (:client/locations client)))))
|
|
|
|
#_{:clj-kondo/ignore [:clojure-lsp/unused-public-var]}
(defn dbg-parse
  "Debug helper: prints `v`, parses it with `parse`, and returns the results
  with the bulky :text and :full-text entries removed."
  [v]
  (println v)
  (let [parsed (parse v)
        without-text (fn [result] (dissoc result :full-text :text))]
    (map without-text parsed)))