(ns auto-ap.parse
  (:require [auto-ap.parse.excel :as excel]
            [auto-ap.parse.templates :as t]
            [clj-fuzzy.metrics :as m]
            [clojure.java.shell :as sh]
            [clojure.string :as str]
            [clj-time.format :as f]
            [clj-time.core :as time]))

(defmulti parse-value
  "Converts a raw extracted string into its final value.

  Dispatches on `method` (the first argument). The remaining arguments are
  the method-specific parameters and the raw value; the raw value is nil when
  the extraction regex did not match."
  (fn [method _ _] method))

;; Parses `value` using the clj-time format pattern `format` and attaches the
;; America/Los_Angeles time zone to the parsed date-time via
;; `time/from-time-zone`.
(defmethod parse-value :clj-time
  [_ format value]
  ;; `value` is nil when the field was not found in the text. Return nil in
  ;; that case (as the default method does) instead of failing inside the
  ;; date parser.
  (when value
    (time/from-time-zone (f/parse (f/formatter format) value)
                         (time/time-zone-for-id "America/Los_Angeles"))))

;; No parser configured for the field: the raw value is returned untouched.
(defmethod parse-value nil
  [_ _ value]
  value)

(defn template-applies?
  "Returns true when every regex in the template's `:keywords` is found
  somewhere in `text` (vacuously true when there are no keywords)."
  [text {:keys [keywords]}]
  (every? #(re-find % text) keywords))

(defn extract-template
  "Applies `template` to `text` and returns a sequence of extracted maps.

  When the template has a `:multi` regex, `text` is split on it and every
  piece is extracted separately with the same template (minus `:multi`).
  Otherwise a single-element vector is returned.

  Each result map starts as `{:vendor-code (:vendor template)}` and gains one
  entry per key of the template's `:extract` map. The entry's value is the
  trimmed second element of the first regex match, run through `parse-value`
  with the `[method params]` pair found under the same key in `:parser`.
  Fields whose regex does not match are passed to `parse-value` as nil."
  [text template]
  (if (:multi template)
    (mapcat #(extract-template % (dissoc template :multi))
            (str/split text (:multi template)))
    [(->> template
          :extract
          (reduce-kv
           (fn [result k v]
             (let [;; First match only; `second` picks the element after the
                   ;; whole match (presumably capture group 1 - the extract
                   ;; regexes are expected to contain one; TODO confirm).
                   value (some-> (re-find v text) second str/trim)
                   [value-parser parser-params] (-> template :parser k)]
               (assoc result k (parse-value value-parser parser-params value))))
           {:vendor-code (:vendor template)}))]))

(defn parse
  "Extracts data from `text` using the first entry of `t/pdf-templates` whose
  keywords all occur in it.

  When no template applies, `extract-template` is called with a nil template
  and the result is `[{:vendor-code nil}]`."
  [text]
  (->> t/pdf-templates
       (filter (partial template-applies? text))
       first
       (extract-template text)))

(defmulti parse-file
  "Parses `file`, dispatching on the lower-cased text after the last `.` in
  `filename`."
  (fn [_file filename]
    (str/lower-case (last (str/split filename #"\.")))))

;; Converts the PDF to text with the external `pdftotext -layout` command and
;; parses the command's standard output.
;; NOTE(review): the exit code and stderr of `pdftotext` are ignored, so a
;; failed conversion is parsed as whatever (possibly empty) output it produced.
(defmethod parse-file "pdf"
  [file _filename]
  (-> (sh/sh "pdftotext" "-layout" file "-")
      :out
      parse))

;; Spreadsheet formats are delegated to the excel parser.
(defmethod parse-file "xls"
  [file filename]
  (excel/parse-file file filename))

(defmethod parse-file "xlsx"
  [file filename]
  (excel/parse-file file filename))

(defn best-match
  "Returns the company from `companies` that best matches
  `company-identifier`, or nil when none matches well enough.

  A company's score is the smallest `m/jaccard` value between the lower-cased
  identifier and any entry of its `:matches`; a company with no (or empty)
  `:matches` scores 1. Only companies scoring below 0.25 are candidates, and
  the lowest score wins. Returns nil when `company-identifier` is nil.

  NOTE(review): only the identifier is lower-cased, so `:matches` entries are
  presumably stored lower-case already - confirm."
  [companies company-identifier]
  (when company-identifier
    (let [needle (str/lower-case company-identifier)
          score  (fn [company]
                   ;; `seq` guards against an empty `:matches`, for which
                   ;; `(apply min ...)` would throw an arity error.
                   (if-let [matches (seq (:matches company))]
                     [company (apply min (map #(m/jaccard needle %) matches))]
                     [company 1]))]
      (->> companies
           (map score)
           (filter #(< (second %) 0.25))
           (sort-by second)
           ffirst))))