Files
integreat/src/clj/auto_ap/parse.clj
Bryce Covert 9cb42be9e8 lots of fixes.
2020-04-21 07:13:42 -07:00

138 lines
5.0 KiB
Clojure

(ns auto-ap.parse
  (:require [auto-ap.parse.csv :as csv]
            [auto-ap.parse.excel :as excel]
            [auto-ap.parse.templates :as t]
            [auto-ap.parse.util :as u]
            [clj-fuzzy.metrics :as m]
            [clj-time.core :as time]
            [clj-time.format :as f]
            [clojure.java.shell :as sh]
            [clojure.pprint :as pprint]
            [clojure.set :as set]
            [clojure.string :as str]))
;; Holds the text most recently handed to `parse`, which resets it on every
;; call. Presumably kept so the last input can be inspected while debugging.
(def last-text (atom nil))
(defn template-applies?
  "True when every regex in the template's `:keywords` is found somewhere in
  `text`. A template with no keywords therefore applies to any text."
  [text {:keys [keywords]}]
  (let [missing? (fn [pattern] (not (re-find pattern text)))]
    (not-any? missing? keywords)))
(defn extract-template
  "Run `template` against `text` and return the extracted records.

  [text template]
    When the template has a `:multi` regex, `text` is split on it and every
    piece is extracted on its own (with the complete text as fallback source);
    the per-piece results are concatenated. Otherwise the whole text is
    extracted once.

  [text full-text template]
    Returns nil when `template` is nil, or when it has a `:multi-match?` regex
    that is not found in `text`. Otherwise returns a one-element vector holding
    a map with `:vendor-code`, `:text`, `:full-text` and one entry per key of
    the template's `:extract` map. Each field takes the first capture group of
    the first match in `text`, falling back to `full-text`; the trimmed capture
    is handed to `u/parse-value` together with the parser spec found under
    `:parser` for that key. If parsing throws, the exception is printed and the
    field is nil."
  ([text template]
   (println "template" template)
   (if-let [separator (:multi template)]
     (let [single-template (dissoc template :multi)]
       (mapcat (fn [section] (extract-template section text single-template))
               (str/split text separator)))
     (extract-template text text template)))
  ([text full-text template]
   (let [required      (:multi-match? template)
         ;; First capture group of the first match, or nil when nothing matches.
         first-capture (fn [pattern source] (second (re-find pattern source)))
         extract-field (fn [acc field pattern]
                         (let [raw   (or (first-capture pattern text)
                                         (first-capture pattern full-text))
                               value (some-> raw str/trim)
                               [value-parser parser-params] (field (:parser template))
                               parsed (try
                                        (println "applying parser" value-parser "to value" value)
                                        (u/parse-value value-parser parser-params value)
                                        (catch Exception e
                                          (println e)))]
                           (assoc acc field parsed)))
         seed          {:vendor-code (:vendor template)
                        :text        text
                        :full-text   full-text}]
     (when (and template
                (or (not required)
                    (re-find required text)))
       [(reduce-kv extract-field seed (:extract template))]))))
(defn parse
  "Extract records from PDF text.

  Logs the text, remembers it in `last-text`, then runs `extract-template`
  with the first entry of `t/pdf-templates` whose keywords all occur in the
  text. When no template applies, `extract-template` receives nil."
  [text]
  (println "Parsing PDF " text)
  (reset! last-text text)
  (let [matching-template (first (filter #(template-applies? text %)
                                         t/pdf-templates))]
    (extract-template text matching-template)))
;; Dispatches on the lower-cased text after the last "." in `filename`,
;; so "Report.PDF" and "report.pdf" reach the same method.
(defmulti parse-file
  (fn [file filename]
    (-> filename
        (str/split #"\.")
        last
        str/lower-case)))
;; PDF: convert to layout-preserving text with the external `pdftotext`
;; program, then run the text through `parse`.
;;
;; `file` is coerced with `str` because `sh/sh` treats the first non-string
;; argument as the start of its option list; a java.io.File would otherwise be
;; dropped from the command line without any error.
;; The trailing "-" is the output target; the converted text is read from :out.
;; A non-zero exit status is reported on stdout, and whatever was captured is
;; still parsed, so the return value keeps its previous shape.
(defmethod parse-file
  "pdf"
  [file filename]
  (let [{:keys [exit out err]} (sh/sh "pdftotext" "-layout" (str file) "-")]
    (when-not (zero? exit)
      (println "pdftotext exited with status" exit "for" filename ":" err))
    (parse out)))
;; CSV: handled entirely by auto-ap.parse.csv.
(defmethod parse-file "csv"
  [source original-name]
  (csv/parse-file source original-name))
;; Excel: both the "xls" and "xlsx" extensions are handed to
;; auto-ap.parse.excel, so one loop registers the same method for each.
(doseq [extension ["xls" "xlsx"]]
  (defmethod parse-file extension
    [file filename]
    (excel/parse-file file filename)))
(defn best-match
  "Pick the client in `clients` that best corresponds to `invoice-client-name`.

  Each client is compared through its candidate names: every entry of
  `:client/matches` plus its `:client/name` (nil candidates are skipped).
  Comparison is case-insensitive. Two strategies are tried in order:

  1. Fuzzy: the `m/jaccard` score between the invoice name and each candidate.
     Scores of 0.25 or more are discarded and the lowest remaining score wins.
  2. Word overlap (only when 1. finds nothing): the invoice name is split on
     whitespace, ':' and '-', candidates on whitespace; the client sharing the
     most words wins, provided it shares at least one. On a tie the client
     appearing last in `clients` wins.

  Returns the winning client map, or nil when nothing matches or when
  `invoice-client-name` is nil."
  [clients invoice-client-name]
  (when invoice-client-name
    (let [needle     (str/lower-case invoice-client-name)
          ;; Lower-cased candidate names for one client.
          candidates (fn [{:keys [:client/matches :client/name]}]
                       (->> (conj matches name)
                            (remove nil?)
                            (map str/lower-case)))
          fuzzy-match (->> clients
                           (mapcat (fn [client]
                                     (map (fn [candidate]
                                            [client (m/jaccard needle candidate)])
                                          (candidates client))))
                           (filter #(< (second %) 0.25))
                           (sort-by second)
                           ffirst)
          word-match (fn []
                       (let [word-set (->> (str/split needle #"[\s:\-]")
                                           (remove str/blank?)
                                           set)]
                         (->> clients
                              (map (fn [client]
                                     (let [client-words (set (mapcat #(str/split % #"\s")
                                                                     (candidates client)))]
                                       [client (count (set/intersection client-words word-set))])))
                              (filter (fn [[_ shared]] (pos? shared)))
                              ;; sort-by is stable, so `last` keeps the
                              ;; last-listed client among equal counts.
                              (sort-by second)
                              last
                              first)))]
      (or fuzzy-match (word-match)))))
(defn best-location-match
  "Choose a location for `client`.

  Every entry of `:client/location-matches` contributes its
  `:location-match/matches` patterns, each compiled as a case-insensitive
  regex. The first pattern (in entry order) found in `text` decides the
  location. If none is found there, `full-text` is searched the same way.
  Failing both, the client's `:client/default-location` is used, and finally
  the first of `:client/locations`."
  [client text full-text]
  (letfn [(location-pairs []
            ;; Flatten to [location pattern] pairs, preserving entry order.
            (mapcat (fn [{:keys [:location-match/location :location-match/matches]}]
                      (map #(vector location %) matches))
                    (:client/location-matches client)))
          (matched-location [haystack]
            (let [hit? (fn [[_ pattern]]
                         (re-find (re-pattern (str "(?i)" pattern)) haystack))]
              (ffirst (filter hit? (location-pairs)))))]
    (or (matched-location text)
        (matched-location full-text)
        (:client/default-location client)
        (first (:client/locations client)))))
(defn dbg-parse
  "Debugging helper: run `parse` on `v`, strip the bulky `:text` and
  `:full-text` entries from every record, pretty-print the result and return
  it.

  Relies on `clojure.pprint` being loaded; the namespace must be required by
  this file's `ns` form, since a fully-qualified reference does not load it."
  [v]
  (let [trimmed (map #(dissoc % :full-text :text) (parse v))]
    (clojure.pprint/pprint trimmed)
    trimmed))