102 lines
2.8 KiB
Clojure
102 lines
2.8 KiB
Clojure
(ns auto-ap.parse
|
|
(:require [auto-ap.parse.excel :as excel]
|
|
[auto-ap.parse.templates :as t]
|
|
[clj-fuzzy.metrics :as m]
|
|
[clojure.java.shell :as sh]
|
|
[clojure.string :as str]
|
|
[clj-time.format :as f]
|
|
[clj-time.core :as time]))
|
|
|
|
|
|
;; Multimethod that post-processes an extracted field value.
;; Dispatch looks only at the first argument (the parser key); the other two
;; arguments (parser parameters and the raw value) are ignored for dispatch
;; and handed to the selected method as-is.
(defmulti parse-value
  (fn [parser-key _params _raw]
    parser-key))
|
|
|
|
|
|
;; :trim-commas -- removes every comma from the extracted string
;; (e.g. "1,234.50" becomes "1234.50"). Parser parameters are unused.
;;
;; `value` is nil when the field's regex found nothing in the text
;; (extract-template builds it with some->), so nil is passed through
;; instead of throwing a NullPointerException inside str/replace.
(defmethod parse-value :trim-commas
  [_ _ value]
  (some-> value
          (str/replace #"," "")))
|
|
|
|
;; :clj-time -- parses `value` using a formatter built from the pattern
;; string `format`, then passes the parsed date-time through
;; time/from-time-zone with the America/Los_Angeles zone.
;;
;; `value` is nil when the field's regex found nothing in the text; in that
;; case nil is returned (matching the behaviour of the nil parser) rather
;; than letting f/parse throw on a missing string.
(defmethod parse-value :clj-time
  [_ format value]
  (when value
    (let [parsed  (f/parse (f/formatter format) value)
          la-zone (time/time-zone-for-id "America/Los_Angeles")]
      (time/from-time-zone parsed la-zone))))
|
|
|
|
;; nil parser key -- no parser is configured for the field, so the raw
;; extracted value is returned untouched.
(defmethod parse-value nil
  [_parser-key _params raw]
  raw)
|
|
|
|
;; Atom, initially nil; `parse` resets it to the text it was most recently
;; called with. NOTE(review): presumably kept for inspecting the last input
;; from a REPL -- confirm before relying on it elsewhere.
(def last-text (atom nil))
|
|
|
|
|
|
(defn template-applies?
  "True when every regex in the template's :keywords is found somewhere in
  `text`. A template with no (or empty) :keywords therefore always applies."
  [text {:keys [keywords]}]
  (every? (fn [keyword-pattern]
            (re-find keyword-pattern text))
          keywords))
|
|
|
|
(defn extract-template
  "Applies `template` to `text` and returns a sequence of result maps, or nil
  when `template` is nil.

  When the template carries a :multi regex, `text` is split on it and every
  piece is extracted on its own with the same template minus :multi; the
  per-piece results are concatenated.

  Otherwise a one-element vector is returned. Its map starts out as
  {:vendor-code (:vendor template), :text text} and gains one entry per key of
  the template's :extract map: the first capture group of that key's regex in
  `text`, trimmed (nil when there is no match), run through `parse-value`
  with the [method params] pair stored under the same key in :parser."
  [text template]
  (cond
    (:multi template)
    (let [single-template (dissoc template :multi)]
      (mapcat (fn [section] (extract-template section single-template))
              (str/split text (:multi template))))

    template
    (let [parsers   (:parser template)
          add-field (fn [acc field pattern]
                      (let [;; group 1 of the first match, trimmed; nil if absent
                            raw             (some-> (re-find pattern text) second str/trim)
                            [method params] (field parsers)]
                        (assoc acc field (parse-value method params raw))))]
      [(reduce-kv add-field
                  {:vendor-code (:vendor template)
                   :text        text}
                  (:extract template))])))
|
|
|
|
(defn parse
  "Remembers `text` in the `last-text` atom, then extracts it with the first
  entry of t/pdf-templates whose keywords all occur in the text. Returns
  whatever `extract-template` returns (nil when no template applies)."
  [text]
  (reset! last-text text)
  (let [applicable (filter #(template-applies? text %) t/pdf-templates)]
    (extract-template text (first applicable))))
|
|
|
|
|
|
;; Multimethod that parses an uploaded file. Dispatch value is the lower-cased
;; text after the last "." in `filename` (the whole lower-cased name when it
;; contains no dot); `file` itself plays no part in dispatch.
(defmulti parse-file
  (fn [_file filename]
    (let [segments (str/split filename #"\.")]
      (str/lower-case (last segments)))))
|
|
|
|
;; "pdf" -- runs `pdftotext -layout <file> -` and feeds the captured standard
;; output to `parse`. The exit code and stderr of the command are not examined.
(defmethod parse-file "pdf"
  [file _filename]
  (let [{pdf-text :out} (sh/sh "pdftotext" "-layout" file "-")]
    (parse pdf-text)))
|
|
|
|
;; "xls" -- files with this extension are delegated unchanged to
;; excel/parse-file, which receives both the file and its name.
(defmethod parse-file "xls"
  [workbook workbook-name]
  (excel/parse-file workbook workbook-name))
|
|
|
|
|
|
;; "xlsx" -- files with this extension are delegated unchanged to
;; excel/parse-file, which receives both the file and its name.
(defmethod parse-file "xlsx"
  [workbook workbook-name]
  (excel/parse-file workbook workbook-name))
|
|
|
|
(defn best-match
  "Returns the client from `clients` whose name or alias is closest to
  `invoice-client-name`, or nil when nothing is close enough.

  Each client contributes its :client/matches entries plus its :client/name
  as candidate strings. Candidates are compared case-insensitively with
  m/jaccard; only scores strictly below 0.25 are kept and the client with the
  lowest score wins (ties keep the original client/candidate order, since
  sort-by is stable).

  A nil `invoice-client-name` yields nil, and nil candidates (for example a
  client without a :client/name) are skipped instead of raising a
  NullPointerException."
  [clients invoice-client-name]
  (when invoice-client-name
    ;; Lower-case the invoice name once rather than once per candidate.
    (let [wanted (str/lower-case invoice-client-name)]
      (->> clients
           (mapcat (fn [{aliases :client/matches
                         primary :client/name
                         :or     {aliases []}
                         :as     client}]
                     (for [candidate (conj aliases primary)
                           :when (some? candidate)]
                       [client (m/jaccard wanted (str/lower-case candidate))])))
           (filter (fn [[_ score]] (< score 0.25)))
           (sort-by second)
           ffirst))))
|
|
|
|
(defn best-location-match
  "Chooses a location for `client` based on `text`.

  Every entry of :client/location-matches pairs a :location-match/location
  with a collection of :location-match/matches. Each match string is compiled
  as a case-insensitive regex, and the location belonging to the first one
  found in `text` is returned. When none is found (or that location is nil)
  the result falls back to :client/default-location, and then to the first of
  :client/locations."
  [client text]
  (let [candidates (for [{location :location-match/location
                          patterns :location-match/matches} (:client/location-matches client)
                         pattern patterns]
                     [location pattern])
        found-in-text? (fn [[_ pattern]]
                         (re-find (re-pattern (str "(?i)" pattern)) text))
        [matched-location] (first (filter found-in-text? candidates))]
    (or matched-location
        (:client/default-location client)
        (first (:client/locations client)))))
|