(ns auto-ap.parse
  (:require [clojure.java.io :as io]
            [clojure.string :as str]
            [clojure.java.shell :as sh]))

;; Each template is a map with:
;;   :keywords - regexes that must ALL be found in the text for the template
;;               to apply (see `template-applies?`).
;;   :extract  - map of result key -> regex; the first capture group of the
;;               first match becomes the (trimmed) value for that key.
;;   :multi    - optional regex; when present the text is split on it and the
;;               template is applied to every piece separately.
;; Templates are tried in order; `parse` uses the first one that applies.
(def templates
  [{:keywords [#"CHEF'S WAREHOUSE"]
    :extract {:total #"2 WKS C\.C\.\s+([\d.,]+)"
              :customer-identifier #"\n([A-Z][A-Z ]+)\s{2,}"
              :date #"\s+([0-9]+/[0-9]+/[0-9]+)"
              :invoice-number #"\s+[0-9]+/[0-9]+/[0-9]+\s+([0-9]+)"}}

   {:keywords [#"Golden Gate Meat"]
    :extract {:total #"Invoice Total\:\s+\$([\d.,]+)"
              :customer-identifier #"Bill To\s*:\s*([\w ]+)\s{2,}"
              :date #"Printed:\s+([0-9]+/[0-9]+/[0-9]+)"
              :invoice-number #"Invoice\s+[^\n]+\n[^\n]+\n\s+([0-9]+)"}}

   {:keywords [#"CINTAS CORPORATION"]
    :extract {:invoice-number #"INVOICE\s#\s+([\d.,]+)"
              :customer-identifier #"BILL TO\s*:\s{2,}([\w ]+)\s{2,}"
              :date #"INVOICE DATE\s*\n.*\s+([0-9]+/[0-9]+/[0-9]+)"
              :total #"INVOICE TOTAL\s+([0-9.]+)"}
    :multi #"\f\f"}])

(defn template-applies?
  "Returns truthy when every regex in the template's :keywords is found
  somewhere in `text`. A template without :keywords applies to any text."
  [text {:keys [keywords]}]
  (every? #(re-find % text) keywords))

(defn- extract-field
  "Returns the trimmed first capture group of the first match of `pattern`
  in `text`, or nil when there is no match."
  [text pattern]
  (some-> (re-find pattern text)
          second
          str/trim))

(defn extract-template
  "Applies `template` to `text` and returns a sequence of result maps.

  Without :multi the result is a one-element vector holding a map with one
  entry per key of the template's :extract map; keys whose regex does not
  match are present with a nil value.

  With :multi, `text` is split on that regex and the template (minus :multi)
  is applied to each piece, yielding one result map per piece."
  [text template]
  (if-let [separator (:multi template)]
    (let [single (dissoc template :multi)]
      (mapcat #(extract-template % single)
              (str/split text separator)))
    [(reduce-kv (fn [result field pattern]
                  (assoc result field (extract-field text pattern)))
                {}
                (:extract template))]))

(defn parse
  "Extracts data from `text` using the first entry of `templates` whose
  keywords all match. Returns a sequence of result maps (see
  `extract-template`). When no template applies the result is `[{}]`."
  [text]
  (->> templates
       (filter (partial template-applies? text))
       first
       (extract-template text)))

(defn parse-file
  "Runs `pdftotext -layout <file> -` and parses the text it writes to stdout.

  `file` may be a path string or anything whose `str` is a path (for example
  a java.io.File). It is coerced with `str` because `clojure.java.shell/sh`
  only treats the leading run of *string* arguments as the command line; a
  non-string argument would be taken as the start of the options and the
  file would silently be left off the command.

  The exit status of pdftotext is not inspected; only its :out is parsed."
  [file]
  (-> (sh/sh "pdftotext" "-layout" (str file) "-")
      :out
      parse))