Files
integreat/src/clj/auto_ap/parse.clj
2018-05-17 20:06:46 -07:00

81 lines
2.1 KiB
Clojure

(ns auto-ap.parse
(:require [auto-ap.parse.excel :as excel]
[auto-ap.parse.templates :as t]
[clj-fuzzy.metrics :as m]
[clojure.java.shell :as sh]
[clojure.string :as str]
[clj-time.format :as f]
[clj-time.core :as time]))
(defmulti parse-value
  "Converts a raw extracted `value` using the conversion named by the first
  argument. Dispatch looks only at that first argument; the second argument
  carries conversion-specific parameters and the third is the value itself."
  (fn [parser-kind _params _value]
    parser-kind))
;; Date conversion: parses `value` with the pattern string `format` and
;; returns the result re-labelled into the America/Los_Angeles time zone
;; via `time/from-time-zone`.
;; Returns nil when `value` is nil instead of failing inside the parser.
(defmethod parse-value :clj-time
  [_ format value]
  ;; extract-template passes nil for a field whose regex did not match; the
  ;; nil parser already lets nil through, so do the same here rather than
  ;; handing nil to f/parse.
  (when (some? value)
    (-> (f/formatter format)
        (f/parse value)
        (time/from-time-zone (time/time-zone-for-id "America/Los_Angeles")))))
;; Default conversion for a nil dispatch value (no parser configured for the
;; field): the raw value is handed back unchanged, parameters are ignored.
(defmethod parse-value nil
  [_method _params value]
  value)
(defn template-applies?
  "True when every regex in the template's `:keywords` is found somewhere in
  `text`. A template with no keywords (nil or empty) applies to any text."
  [text {:keys [keywords]}]
  (let [missing? (fn [pattern] (nil? (re-find pattern text)))]
    (not-any? missing? keywords)))
(defn extract-template
  "Applies `template` to `text` and returns a sequence of extracted records.

  When the template has a `:multi` regex, `text` is split on it and every
  piece is processed with the same template minus `:multi`; the results are
  concatenated. Otherwise a single-element vector is returned. The record
  starts as {:vendor-code (:vendor template)} and gains one key per entry in
  `:extract`: the first capture group of the first match of that entry's
  regex, trimmed, then converted by `parse-value` using the
  [parser-kind parser-arg] pair found under the same key in `:parser`.
  A field whose regex does not match is passed to `parse-value` as nil."
  [text template]
  (if-let [separator (:multi template)]
    (let [single-template (dissoc template :multi)]
      (mapcat (fn [section] (extract-template section single-template))
              (str/split text separator)))
    (let [parsers (:parser template)
          add-field (fn [record field pattern]
                      (let [[parser-kind parser-arg] (field parsers)
                            ;; first match only; `second` picks capture group 1
                            raw (some-> (re-find pattern text) second str/trim)]
                        (assoc record field (parse-value parser-kind parser-arg raw))))]
      [(reduce-kv add-field
                  {:vendor-code (:vendor template)}
                  (:extract template))])))
(defn parse
  "Extracts records from `text` using the first entry of `t/pdf-templates`
  for which `template-applies?` is true.

  When no template applies the selected template is nil and
  `extract-template` is still called with it."
  [text]
  (let [applicable? (fn [template] (template-applies? text template))
        template    (first (filter applicable? t/pdf-templates))]
    (extract-template text template)))
(defmulti parse-file
  "Parses `file`, choosing the implementation from `filename`. The dispatch
  value is the lower-cased text after the last \".\" in `filename`; a name
  without a dot dispatches on the whole lower-cased name."
  (fn [_file filename]
    (-> filename
        (str/split #"\.")
        last
        str/lower-case)))
;; PDF files: run `pdftotext -layout <file> -` and feed what it prints on
;; stdout to `parse`. Only :out of the shell result is used; the exit status
;; and stderr are not inspected.
(defmethod parse-file "pdf"
  [file filename]
  (let [{extracted-text :out} (sh/sh "pdftotext" "-layout" file "-")]
    (parse extracted-text)))
;; Excel files: both the "xls" and "xlsx" extensions are handled the same
;; way, by delegating to excel/parse-file with the arguments unchanged.
(doseq [extension ["xls" "xlsx"]]
  (defmethod parse-file extension
    [file filename]
    (excel/parse-file file filename)))
(defn best-match
  "Returns the entry of `companies` that best matches `company-identifier`,
  or nil when nothing is close enough.

  Each company is scored with the smallest `m/jaccard` value between the
  lower-cased identifier and any string in its `:matches`; a company with no
  (or empty) `:matches` scores 1. Only scores strictly below 0.25 qualify and
  the lowest score wins. A nil `company-identifier` matches nothing.

  NOTE(review): only the identifier is lower-cased here; the `:matches`
  strings are compared as stored - presumably they are kept lower-case,
  confirm against the data."
  [companies company-identifier]
  (when (some? company-identifier)
    (let [identifier (str/lower-case company-identifier) ; hoisted: once, not per match
          score      (fn [company]
                       ;; `seq` makes an empty :matches behave like a missing
                       ;; one; (apply min ...) on no arguments would throw.
                       (if-let [candidates (seq (:matches company))]
                         (apply min (map #(m/jaccard identifier %) candidates))
                         1))]
      (->> companies
           (map (fn [company] [company (score company)]))
           (filter #(< (second %) 0.25))
           (sort-by second)
           ffirst))))