(ns auto-ap.parse
  (:require
   [amazonica.aws.lambda :as lambda]
   [amazonica.aws.s3 :as s3]
   [auto-ap.logging :as alog]
   [auto-ap.parse.csv :as csv]
   [auto-ap.parse.excel :as excel]
   [auto-ap.parse.templates :as t]
   [auto-ap.parse.util :as u]
   [auto-ap.ssr.vendor :as vendors]
   [clj-fuzzy.metrics :as m]
   [clojure.data.json :as json]
   [clojure.java.io :as io]
   [clojure.java.shell :as sh]
   [clojure.set :as set]
   [clojure.string :as str]))

;; Holds the most recent text passed to `parse`.
(defonce last-text (atom nil))

(defn template-applies?
  "Returns true when every regex in the template's :keywords finds a match in
  `text`. A template with no :keywords applies to any text."
  [text {:keys [keywords]}]
  (every? #(re-find % text) keywords))

(defn extract-template
  "Applies `template` to `text` and returns a sequence of extracted maps, or nil.

  Two-arity: logs the chosen template. When the template has a :multi regex,
  the text is split on it and every piece is extracted separately (with the
  whole text available as the fallback `full-text`); otherwise the text is
  extracted as a single piece.

  Three-arity: returns nil when `template` is nil or when the template has a
  :multi-match? regex that does not match `text`. Otherwise returns a
  one-element vector holding a map with :vendor-code, :template, :text,
  :full-text plus one entry per key of the template's :extract map. Each
  :extract regex is tried against `text` first and `full-text` second; the
  first capture group of the first match is trimmed and handed to
  `u/parse-value` with the parser configured under :parser for that key."
  ([text template]
   (alog/info ::template-determined :template (str template))
   (if (:multi template)
     (mapcat
      #(extract-template % text (dissoc template :multi))
      (str/split text (:multi template)))
     (extract-template text text template)))
  ([text full-text template]
   (when (and template
              (or (not (:multi-match? template))
                  (re-find (:multi-match? template) text)))
     [(->> template
           :extract
           (reduce-kv
            (fn [result k v]
              (let [value (some-> (or (first (map second (re-seq v text)))
                                      (first (map second (re-seq v full-text))))
                                  str/trim)
                    [value-parser parser-params] (-> template :parser k)]
                (assoc result k
                       (try
                         (u/parse-value value-parser parser-params value)
                         ;; A value that cannot be parsed is logged; the key is
                         ;; then bound to whatever `alog/warn` returns.
                         (catch Exception e
                           (alog/warn ::cant-parse-value :error e :raw value))))))
            {:vendor-code (:vendor template)
             :template (str (:vendor template)
                            " - selected because it matched "
                            (pr-str (:keywords template)))
             :text text
             :full-text full-text}))])))

(defn parse
  "Stores `text` in `last-text`, picks the first template in `t/pdf-templates`
  whose keywords all match, and extracts with it. Returns nil when no template
  applies."
  [text]
  (reset! last-text text)
  (->> t/pdf-templates
       (filter (partial template-applies? text))
       first
       (extract-template text)))

(defmulti parse-file
  "Parses a file based on its extension: dispatches on the lower-cased part of
  `filename` after the last dot. Accepts options as additional keyword
  arguments.

  Options:
  - :allow-glimpse? (default false) - pdf only. If true and no template
    produces a result, falls back to `glimpse2`."
  (fn [_ filename & _opts]
    (.toLowerCase (last (str/split filename #"\.")))))

;; Used both as the S3 bucket for uploads and as the host name of the URL that
;; is handed to the glimpse2 lambda.
(def ^:private glimpse2-bucket "data.prod.app.integreatconsult.com")

(defn invoke-glimpse2
  "Invokes the glimpse2 lambda with the https URL of object key `f` in the
  glimpse2 bucket, logs the raw payload and returns it parsed from JSON
  (string keys)."
  [f]
  (let [request {"url" (str "https://" glimpse2-bucket "/" f)}
        result (slurp
                (:payload
                 (lambda/invoke
                  {:client-config {:request-timeout 120000
                                   :socket-timeout 120000}}
                  {:function-name "glimpse2"
                   :payload (json/write-str (alog/peek ::x request))})))]
    (alog/info ::glimpse2-payload :payload result)
    (json/read-str result)))

(defn glimpse2
  "Uploads `file` to S3 under a random glimpse2/import/<uuid>.pdf key, runs the
  glimpse2 lambda on it and converts each returned item into an invoice map.

  Returns a fully realized sequence of maps, or nil (after logging a warning)
  when anything fails, including the conversion of an individual item."
  [file]
  (try
    (let [tmp-key (str "glimpse2/import/" (java.util.UUID/randomUUID) ".pdf")
          _ (with-open [f (io/input-stream file)]
              (s3/put-object {:bucket-name glimpse2-bucket
                              :key tmp-key
                              :input-stream f}))
          is (invoke-glimpse2 tmp-key)]
      (alog/peek ::glimpse2-result is)
      ;; `for` is lazy: without `doall` the rows would be converted only after
      ;; this `try` has been left, so a failing row (for example a missing
      ;; "date") would throw at the caller instead of reaching the catch below.
      (doall
       (for [i is]
         {:date (u/parse-value :clj-time "yyyy-MM-dd" (str/trim (get i "date")))
          :customer-identifier (get i "customer_identifier")
          :account-number (not-empty (get i "account_number"))
          :vendor-search (get i "vendor_identifier")
          :vendor-code (-> (vendors/best-match (get i "vendor_identifier"))
                           (get "label"))
          :total (get i "total")
          :invoice-number (get i "invoice_number")
          :template "None found - defaulting to ChatGPT"})))
    (catch Exception e
      (alog/warn ::glimpse2-not-work :error e)
      nil)))

;; pdf: convert to text with the `pdftotext -layout` command and run the
;; template parser on its output; when that yields nothing and :allow-glimpse?
;; is true, fall back to `glimpse2`.
;; NOTE(review): relies on `alog/peek` returning the value it is given - confirm.
(defmethod parse-file "pdf"
  [file _ & {:keys [allow-glimpse?] :or {allow-glimpse? false}}]
  (or (-> (sh/sh "pdftotext" "-layout" file "-")
          :out
          parse)
      (and allow-glimpse?
           (alog/peek ::glimpse2-result (glimpse2 file)))))

;; csv: delegated to auto-ap.parse.csv.
(defmethod parse-file "csv"
  [file filename & _]
  (csv/parse-file file filename))

;; xls / xlsx: delegated to auto-ap.parse.excel.
(defmethod parse-file "xls"
  [file filename & _]
  (excel/parse-file file filename))

(defmethod parse-file "xlsx"
  [file filename & _]
  (excel/parse-file file filename))

(defn best-match
  "Finds the client that best matches `invoice-client-name`.

  Every client is compared through its :client/name and each of its
  :client/matches strings, case-insensitively.

  1. Fuzzy pass: the candidate with the smallest `m/jaccard` value that is
     <= `threshold` (default 0.25) wins. A value of zero for strings that are
     not actually equal is bumped to 0.1 so that a true equality sorts first.
  2. Word pass (used only when the fuzzy pass finds nothing): the client that
     shares the most whole words with the invoice name wins; the invoice name
     is split on whitespace, colon and hyphen, client strings on whitespace.

  Returns a `[client score]` pair (score is the jaccard value or the shared
  word count), or nil when neither pass finds a client."
  ([clients invoice-client-name]
   (best-match clients invoice-client-name 0.25))
  ([clients invoice-client-name threshold]
   (let [needle (.toLowerCase invoice-client-name)
         fuzzy-match (->> clients
                          (mapcat
                           (fn [{:keys [:client/matches :client/name]
                                 :as client
                                 :or {matches []}}]
                             (map (fn [candidate]
                                    (let [candidate (.toLowerCase candidate)
                                          distance (m/jaccard needle candidate)]
                                      [client (cond
                                                (not (zero? distance)) distance
                                                (= needle candidate) 0
                                                :else 0.1)]))
                                  (conj matches name))))
                          (filter #(<= (second %) threshold))
                          (sort-by second)
                          first)
         word-set (set (remove str/blank? (str/split needle #"[\s:\-]")))
         client-word-match (->> clients
                                (map
                                 (fn [{:keys [:client/matches :client/name]
                                       :as client
                                       :or {matches []}}]
                                   (let [client-words
                                         (-> #{}
                                             (into (mapcat
                                                    (fn [match]
                                                      (str/split (.toLowerCase match) #"\s"))
                                                    matches))
                                             (into (str/split (.toLowerCase name) #"\s")))]
                                     [client (count (set/intersection client-words word-set))])))
                                (filter (fn [[_ c]] (> c 0)))
                                (sort-by (fn [[_ c]] c))
                                reverse
                                first)]
     (or fuzzy-match client-word-match))))

(defn exact-match
  "Returns the first client whose :client/name or one of whose :client/matches
  equals `invoice-client-name` ignoring case, or nil. Nil names and a nil
  `invoice-client-name` never match."
  ([clients invoice-client-name]
   (->> clients
        (filter
         (fn [{:keys [:client/matches :client/name] :or {matches []}}]
           (some (fn [candidate]
                   (and candidate
                        invoice-client-name
                        (= (.toLowerCase invoice-client-name)
                           (.toLowerCase candidate))))
                 (conj matches name))))
        first)))

(defn- location-matching
  "Returns the location of the first of the client's :client/location-matches
  entries having a match string that, read as a case-insensitive regex, is
  found in `text`; nil when there is none."
  [client text]
  (->> (:client/location-matches client)
       (mapcat (fn [{:keys [:location-match/location :location-match/matches]}]
                 (map (fn [match] [location match]) matches)))
       (filter (fn [[_ match]]
                 (re-find (re-pattern (str "(?i)" match)) text)))
       first
       first))

(defn best-location-match
  "Picks a location for `client`, trying in order: a location match found in
  `text`, a location match found in `full-text`, the client's
  :client/default-location, and finally the first of :client/locations."
  [client text full-text]
  (or (location-matching client text)
      (location-matching client full-text)
      (:client/default-location client)
      (first (:client/locations client))))

#_{:clj-kondo/ignore [:clojure-lsp/unused-public-var]}
(defn dbg-parse
  "Prints `v`, parses it and returns the results without the :text and
  :full-text entries."
  [v]
  (println v)
  (map (fn [x] (dissoc x :full-text :text))
       (parse v)))