(ns auto-ap.parse
  "Parses uploaded files (PDF, CSV, XLS, XLSX) into extracted maps and
  matches the extracted client name / text against known clients and
  their locations."
  (:require [auto-ap.parse.excel :as excel]
            [auto-ap.parse.templates :as t]
            [auto-ap.parse.util :as u]
            [auto-ap.parse.csv :as csv]
            [clj-fuzzy.metrics :as m]
            [clojure.java.shell :as sh]
            [clojure.pprint :as pprint]
            [clojure.string :as str]
            [clj-time.format :as f]
            [clj-time.core :as time]
            [clojure.set :as set]))

;; Holds the text most recently passed to `parse`; written on every call.
(def last-text (atom nil))

(defn template-applies?
  "Returns true when every regex in the template's :keywords finds a match
  in `text` (vacuously true when there are no keywords)."
  [text {:keys [keywords]}]
  (every? #(re-find % text) keywords))

(defn extract-template
  "Applies `template` to `text` and returns a sequence of result maps, or
  nil when `template` is nil or its :multi-match? regex does not match.

  Two-argument form: when the template has a :multi regex, `text` is split
  on it and every piece is extracted separately (with the original text
  available as the fallback `full-text`); otherwise the text is extracted
  as a single piece.

  Three-argument form: for each key/regex pair in the template's :extract
  map, takes the second element of the first match in `text`, falling back
  to the first match in `full-text`, trims it, and runs it through
  `u/parse-value` with the [parser params] pair found under the same key in
  the template's :parser map. A parser exception is printed and the key is
  stored as nil. Each result map also carries :vendor-code, :text and
  :full-text."
  ([text template]
   (println "template" template)
   (if (:multi template)
     (mapcat #(extract-template % text (dissoc template :multi))
             (str/split text (:multi template)))
     (extract-template text text template)))
  ([text full-text template]
   (when (and template
              (or (not (:multi-match? template))
                  (re-find (:multi-match? template) text)))
     [(->> template
           :extract
           (reduce-kv
            (fn [result k v]
              (let [value (some-> (or (first (map second (re-seq v text)))
                                      (first (map second (re-seq v full-text))))
                                  str/trim)
                    [value-parser parser-params] (-> template :parser k)]
                (assoc result k
                       ;; Best effort: a failing parser leaves the key nil
                       ;; instead of aborting the whole extraction.
                       (try
                         (println "applying parser" value-parser "to value" value)
                         (u/parse-value value-parser parser-params value)
                         (catch Exception e
                           (println e))))))
            {:vendor-code (:vendor template)
             :text text
             :full-text full-text}))])))

(defn parse
  "Records `text` in `last-text`, picks the first entry of
  `t/pdf-templates` whose keywords all match, and returns the result of
  `extract-template` for it (nil when no template applies)."
  [text]
  (println "Parsing PDF " text)
  (reset! last-text text)
  (->> t/pdf-templates
       (filter (partial template-applies? text))
       first
       (extract-template text)))

(defmulti parse-file
  "Parses `file`, dispatching on the lower-cased text after the last dot
  in `filename`."
  (fn [file filename]
    (str/lower-case (last (str/split filename #"\.")))))

;; PDFs are converted to text by shelling out to `pdftotext -layout`, and
;; the command's standard output is handed to `parse`.
(defmethod parse-file "pdf"
  [file filename]
  (-> (sh/sh "pdftotext" "-layout" file "-")
      :out
      parse))

(defmethod parse-file "csv"
  [file filename]
  (csv/parse-file file filename))

(defmethod parse-file "xls"
  [file filename]
  (excel/parse-file file filename))

(defmethod parse-file "xlsx"
  [file filename]
  (excel/parse-file file filename))

(defn best-match
  "Returns the client from `clients` that best matches
  `invoice-client-name`, or nil when nothing matches.

  First choice is the client with the lowest `m/jaccard` score between the
  lower-cased invoice name and any of its :client/matches or its
  :client/name, provided that score is below 0.25. Otherwise the client
  sharing the most whitespace-separated lower-cased words with the invoice
  name (which is split on whitespace, ':' and '-') is returned, provided
  at least one word is shared."
  [clients invoice-client-name]
  (let [invoice-name (str/lower-case invoice-client-name)
        fuzzy-match (->> clients
                         (mapcat (fn [{:keys [:client/matches :client/name]
                                       :as client
                                       :or {matches []}}]
                                   (map (fn [candidate]
                                          [client (m/jaccard invoice-name
                                                             (str/lower-case candidate))])
                                        (conj matches name))))
                         (filter #(< (second %) 0.25))
                         (sort-by second)
                         ffirst)
        word-set (->> (str/split invoice-name #"[\s:\-]")
                      (remove str/blank?)
                      set)
        client-word-match (->> clients
                               (map (fn [{:keys [:client/matches :client/name]
                                          :as client
                                          :or {matches []}}]
                                      (let [client-words (-> #{}
                                                             (into (mapcat (fn [match]
                                                                             (str/split (str/lower-case match) #"\s"))
                                                                           matches))
                                                             (into (str/split (str/lower-case name) #"\s")))]
                                        [client (count (set/intersection client-words word-set))])))
                               (filter (fn [[_ c]] (> c 0)))
                               ;; Ascending sort followed by reverse is kept
                               ;; on purpose: it fixes which client wins
                               ;; when several share the same word count.
                               (sort-by (fn [[_ c]] c))
                               reverse
                               ffirst)]
    (or fuzzy-match client-word-match)))

(defn- first-matching-location
  "Returns the :location-match/location of the first entry in the client's
  :client/location-matches having a match string that, used as a
  case-insensitive regex, is found in `text`; nil when none is."
  [client text]
  (->> (:client/location-matches client)
       (mapcat (fn [{:keys [:location-match/location :location-match/matches]}]
                 (map (fn [match] [location match]) matches)))
       (filter (fn [[_ match]]
                 (re-find (re-pattern (str "(?i)" match)) text)))
       first
       first))

(defn best-location-match
  "Picks a location for `client`, trying in order: a location match found
  in `text`, a location match found in `full-text`, the client's
  :client/default-location, and finally the first of :client/locations."
  [client text full-text]
  (or (first-matching-location client text)
      (first-matching-location client full-text)
      (:client/default-location client)
      (first (:client/locations client))))

(defn dbg-parse
  "Debug helper: parses `v`, strips :full-text and :text from every result,
  pretty-prints the results and returns them."
  [v]
  (doto (map (fn [x] (dissoc x :full-text :text)) (parse v))
    pprint/pprint))