commit 0c442b29e08b9ab33cfe1962b2930c6c12e78d3f Author: Bryce Covert Date: Wed Dec 6 17:55:09 2017 -0800 first version of sample parser. diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..22d6a481 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +/target +/lib +/classes +/checkouts +pom.xml +pom.xml.asc +*.jar +*.class +/.lein-* +/.nrepl-port diff --git a/README.md b/README.md new file mode 100644 index 00000000..f0f0adec --- /dev/null +++ b/README.md @@ -0,0 +1,19 @@ +# auto-ap + +FIXME + +## Prerequisites + +You will need [Leiningen][] 2.0.0 or above installed. + +[leiningen]: https://github.com/technomancy/leiningen + +## Running + +To start a web server for the application, run: + + lein ring server + +## License + +Copyright © 2017 FIXME diff --git a/project.clj b/project.clj new file mode 100644 index 00000000..104ad572 --- /dev/null +++ b/project.clj @@ -0,0 +1,13 @@ +(defproject auto-ap "0.1.0-SNAPSHOT" + :description "FIXME: write description" + :url "http://example.com/FIXME" + :min-lein-version "2.0.0" + :dependencies [[org.clojure/clojure "1.8.0"] + [compojure "1.6.0"] + [ring/ring-defaults "0.2.1"] + [ring/ring-json "0.4.0"]] + :plugins [[lein-ring "0.9.7"]] + :ring {:handler auto-ap.handler/app} + :profiles + {:dev {:dependencies [[javax.servlet/servlet-api "2.5"] + [ring/ring-mock "0.3.0"]]}}) diff --git a/resources/public/chefswarehouse.PDF b/resources/public/chefswarehouse.PDF new file mode 100644 index 00000000..f2ed61e1 Binary files /dev/null and b/resources/public/chefswarehouse.PDF differ diff --git a/resources/public/cintas.pdf b/resources/public/cintas.pdf new file mode 100644 index 00000000..4c506f05 Binary files /dev/null and b/resources/public/cintas.pdf differ diff --git a/resources/public/golden-gate-meats.pdf b/resources/public/golden-gate-meats.pdf new file mode 100644 index 00000000..d93f0392 Binary files /dev/null and b/resources/public/golden-gate-meats.pdf differ diff --git a/resources/public/index.html 
b/resources/public/index.html new file mode 100644 index 00000000..c1b32626 --- /dev/null +++ b/resources/public/index.html @@ -0,0 +1,37 @@ + + + + + + + + + + +
+

Invoice Parsing Demo

+
+

Drop invoice pdfs here

+ +
+

Found invoices:

+ + + +
+ + + +
diff --git a/src/auto_ap/handler.clj b/src/auto_ap/handler.clj
new file mode 100644
index 00000000..c3779792
--- /dev/null
+++ b/src/auto_ap/handler.clj
@@ -0,0 +1,42 @@
+(ns auto-ap.handler
+  (:require [compojure.core :refer :all]
+            [compojure.route :as route]
+            [clojure.java.io :as io]
+            [auto-ap.parse :as parse]
+            [ring.middleware.multipart-params :as mp]
+            [ring.util.response :as response]
+            [ring.middleware.defaults :refer [wrap-defaults site-defaults]]
+            [ring.middleware.json :refer [wrap-json-response]]))
+
+;; Routes for the demo: "/" serves index.html from the "public" resource root,
+;; and POST "/pdf-upload" saves the uploaded PDF under resources/public, runs
+;; auto-ap.parse/parse-file on it and returns one summary string per invoice.
+;; NOTE(review): uploads are written into the same directory the static
+;; routes appear to serve from, so an upload can replace index.html - confirm.
+(defroutes app-routes
+  (GET "/" [] (response/resource-response "index.html" {:root "public"}))
+  (POST "/pdf-upload"
+        {{ files "file"} :params :as params}
+        ;; A lone "file" part is a map; otherwise keep the original choice of
+        ;; the second element. NOTE(review): confirm why the second part is
+        ;; the one that carries the PDF.
+        (let [{:keys [filename tempfile]} (if (map? files) files (second files))
+              ;; filename is client-supplied: keep only its last path segment
+              ;; so values like "../x.pdf" cannot escape resources/public.
+              safe-name (.getName (io/file filename))]
+          (io/copy tempfile (io/file "resources" "public" safe-name))
+          (for [{:keys [total date invoice-number customer-identifier]} (parse/parse-file (str "resources/public/" safe-name))]
+            (do
+              (println (str "An invoice #" invoice-number " on " date " for " total))
+              (str "An invoice for customer " customer-identifier " #" invoice-number " on " date " for " total )))))
+  (route/resources "/")
+  (route/not-found "Not Found"))
+
+#_(defroutes routes
+    (GET "/" [] (resource-response "index.html" {:root "public"}))
+    (resources "/"))
+
+;; Ring entry point: app-routes wrapped first in multipart-params handling,
+;; then in wrap-json-response.
+(def app
+  (wrap-json-response (mp/wrap-multipart-params app-routes)))
diff --git a/src/auto_ap/parse.clj b/src/auto_ap/parse.clj
new file mode 100644
index 00000000..ad15bc36
--- /dev/null
+++ b/src/auto_ap/parse.clj
@@ -0,0 +1,54 @@
+(ns auto-ap.parse
+  (:require [clojure.java.io :as io]
+            [clojure.string :as str]
+            [clojure.java.shell :as sh]))
+
+(def templates
+  [{:keywords [#"CHEF'S WAREHOUSE"]
+    :extract {:total #"2 WKS C\.C\.\s+([\d.,]+)"
+              :customer-identifier #"\n([A-Z][A-Z ]+)\s{2,}"
+              :date #"\s+([0-9]+/[0-9]+/[0-9]+)"
+              :invoice-number #"\s+[0-9]+/[0-9]+/[0-9]+\s+([0-9]+)"}}
+
+   {:keywords [#"Golden Gate Meat"]
+    :extract {:total #"Invoice Total\:\s+\$([\d.,]+)"
+              :customer-identifier #"Bill To\s*:\s*([\w 
]+)\s{2,}"
+              :date #"Printed:\s+([0-9]+/[0-9]+/[0-9]+)"
+              :invoice-number #"Invoice\s+[^\n]+\n[^\n]+\n\s+([0-9]+)"}}
+
+   {:keywords [#"CINTAS CORPORATION"]
+    :extract {:invoice-number #"INVOICE\s#\s+([\d.,]+)"
+              :customer-identifier #"BILL TO\s*:\s{2,}([\w ]+)\s{2,}"
+              :date #"INVOICE DATE\s*\n.*\s+([0-9]+/[0-9]+/[0-9]+)"
+              :total #"INVOICE TOTAL\s+([0-9.]+)"}
+    :multi #"\f\f"}])
+
+(defn template-applies?
+  "True when every regex in the template's :keywords matches within text."
+  [text {:keys [keywords]}]
+  (every? #(re-find % text) keywords))
+
+(defn extract-template
+  "Returns one map per :multi-delimited piece of text (or a single map), each
+  mapping :extract keys to the trimmed first capture of the first match, or nil."
+  [text {:keys [multi extract] :as template}]
+  (if multi
+    (mapcat #(extract-template % (dissoc template :multi)) (str/split text multi))
+    [(reduce-kv (fn [result k v]
+                  (assoc result k (some-> (re-find v text) second str/trim)))
+                {} extract)]))
+
+(defn parse
+  "Extracts invoice maps from text using the first template that applies.
+  Returns nil when none applies, instead of a single map with no fields."
+  [text]
+  (some->> templates
+           (filter (partial template-applies? text))
+           first
+           (extract-template text)))
+
+(defn parse-file
+  "Runs `pdftotext -layout` on file, prints the extracted text to stdout,
+  and returns (parse text)."
+  [file]
+  (-> (sh/sh "pdftotext" "-layout" file "-") :out (doto println) parse))
diff --git a/test/auto_ap/handler_test.clj b/test/auto_ap/handler_test.clj
new file mode 100644
index 00000000..906bf302
--- /dev/null
+++ b/test/auto_ap/handler_test.clj
@@ -0,0 +1,16 @@
+(ns auto-ap.handler-test
+  (:require [clojure.test :refer :all]
+            [ring.mock.request :as mock]
+            [auto-ap.handler :refer :all]))
+
+(deftest test-app
+  (testing "main route"
+    (let [response (app (mock/request :get "/"))]
+      (is (= (:status response) 200))
+      ;; "/" serves index.html via resource-response, so the body is the page
+      ;; resource rather than the literal string "Hello World".
+      (is (some? (:body response)))))
+
+  (testing "not-found route"
+    (let [response (app (mock/request :get "/invalid"))]
+      (is (= (:status response) 404)))))