54 lines
1.7 KiB
Clojure
54 lines
1.7 KiB
Clojure
(ns auto-ap.parse
|
|
(:require [clojure.java.io :as io]
|
|
[clojure.string :as str]
|
|
[clojure.java.shell :as sh]))
|
|
|
|
(def templates
  "Vendor invoice templates, tried in order by `parse`.

  Each template is a map with:
    :keywords  regexes that must all be found in the text for the template
               to apply (see `template-applies?`)
    :extract   map of field name to a regex whose first capture group is
               the field value (see `extract-template`)
    :multi     optional regex; when present the text is split on it and
               every section is extracted separately"
  [;; Chef's Warehouse
   {:keywords [#"CHEF'S WAREHOUSE"]
    :extract  {:total               #"2 WKS C\.C\.\s+([\d.,]+)"
               :customer-identifier #"\n([A-Z][A-Z ]+)\s{2,}"
               :date                #"\s+([0-9]+/[0-9]+/[0-9]+)"
               :invoice-number      #"\s+[0-9]+/[0-9]+/[0-9]+\s+([0-9]+)"}}

   ;; Golden Gate Meat
   {:keywords [#"Golden Gate Meat"]
    :extract  {:total               #"Invoice Total\:\s+\$([\d.,]+)"
               :customer-identifier #"Bill To\s*:\s*([\w ]+)\s{2,}"
               :date                #"Printed:\s+([0-9]+/[0-9]+/[0-9]+)"
               :invoice-number      #"Invoice\s+[^\n]+\n[^\n]+\n\s+([0-9]+)"}}

   ;; Cintas: one document may hold several invoices, separated by two
   ;; form feeds.
   {:keywords [#"CINTAS CORPORATION"]
    :extract  {:invoice-number      #"INVOICE\s#\s+([\d.,]+)"
               :customer-identifier #"BILL TO\s*:\s{2,}([\w ]+)\s{2,}"
               :date                #"INVOICE DATE\s*\n.*\s+([0-9]+/[0-9]+/[0-9]+)"
               ;; Accepts thousands separators like the other :total
               ;; patterns, so 1,234.56 is not cut short at the comma.
               :total               #"INVOICE TOTAL\s+([\d.,]+)"}
    :multi    #"\f\f"}])
|
|
|
|
(defn template-applies?
  "Returns true when every regex under the template's :keywords is found
  somewhere in `text`, false otherwise. A template with no keywords
  applies to any text."
  [text template]
  (let [patterns (:keywords template)]
    ;; The template applies unless some keyword pattern fails to match.
    (not-any? (fn [pattern] (nil? (re-find pattern text)))
              patterns)))
|
|
|
|
(defn extract-template
  "Applies `template` to `text` and returns a sequence of field maps.

  Without :multi the result is a one-element vector holding a map with one
  entry per key of the template's :extract map. Each value is the trimmed
  first capture group of the first match of that key's regex in `text`, or
  nil when there is no match.

  With :multi, `text` is split on that regex and each section is extracted
  on its own (with :multi removed); the per-section results are
  concatenated."
  [text template]
  (letfn [(capture [pattern]
            ;; Only the first match counts; group 1 carries the value.
            (some-> (re-find pattern text) second str/trim))]
    (if-let [separator (:multi template)]
      (let [single-template (dissoc template :multi)]
        (mapcat (fn [section] (extract-template section single-template))
                (str/split text separator)))
      [(into {}
             (for [[field pattern] (:extract template)]
               [field (capture pattern)]))])))
|
|
|
|
(defn parse
  "Extracts invoice fields from `text` using the first entry of `templates`
  for which `template-applies?` holds, and returns what `extract-template`
  produces for it.

  When no template applies, `extract-template` is called with a nil
  template, which yields a vector containing a single empty map."
  [text]
  (let [matching (some (fn [candidate]
                         (when (template-applies? text candidate)
                           candidate))
                       templates)]
    (extract-template text matching)))
|
|
|
|
(defn parse-file
  "Runs the external `pdftotext` program in -layout mode on `file`, with the
  converted text written to stdout, and returns the result of `parse` on
  that text.

  `file` may be a path string or any value whose `str` form is the path,
  such as a java.io.File. The exit status and stderr of `pdftotext` are not
  inspected; only its stdout is parsed."
  [file]
  ;; sh/sh treats the first non-string argument as the start of its option
  ;; key/value pairs, so a File passed as-is would be dropped from the
  ;; command line. Coerce it to a string path first.
  (let [result (sh/sh "pdftotext" "-layout" (str file) "-")]
    (parse (:out result))))
|