first version of sample parser.
This commit is contained in:
10
.gitignore
vendored
Normal file
10
.gitignore
vendored
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
/target
|
||||||
|
/lib
|
||||||
|
/classes
|
||||||
|
/checkouts
|
||||||
|
pom.xml
|
||||||
|
pom.xml.asc
|
||||||
|
*.jar
|
||||||
|
*.class
|
||||||
|
/.lein-*
|
||||||
|
/.nrepl-port
|
||||||
19
README.md
Normal file
19
README.md
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
# auto-ap
|
||||||
|
|
||||||
|
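A demo web application that extracts invoice details (customer, invoice number, date and total) from uploaded PDF invoices.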
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
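You will need [Leiningen][] 2.0.0 or above installed, as well as the `pdftotext` command-line tool on your `PATH` (it is used to extract text from uploaded PDFs).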
|
||||||
|
|
||||||
|
[leiningen]: https://github.com/technomancy/leiningen
|
||||||
|
|
||||||
|
## Running
|
||||||
|
|
||||||
|
To start a web server for the application, run:
|
||||||
|
|
||||||
|
lein ring server
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
Copyright © 2017 FIXME
|
||||||
13
project.clj
Normal file
13
project.clj
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
;; Leiningen project definition for auto-ap.
;; Runtime dependencies: Clojure, Compojure, ring-defaults and ring-json.
;; The lein-ring plugin is configured with auto-ap.handler/app as its Ring handler.
;; The :dev profile additionally pulls in servlet-api and ring-mock.
(defproject auto-ap "0.1.0-SNAPSHOT"
|
||||||
|
:description "FIXME: write description"
|
||||||
|
:url "http://example.com/FIXME"
|
||||||
|
:min-lein-version "2.0.0"
|
||||||
|
:dependencies [[org.clojure/clojure "1.8.0"]
|
||||||
|
[compojure "1.6.0"]
|
||||||
|
[ring/ring-defaults "0.2.1"]
|
||||||
|
[ring/ring-json "0.4.0"]]
|
||||||
|
:plugins [[lein-ring "0.9.7"]]
|
||||||
|
:ring {:handler auto-ap.handler/app}
|
||||||
|
:profiles
|
||||||
|
{:dev {:dependencies [[javax.servlet/servlet-api "2.5"]
|
||||||
|
[ring/ring-mock "0.3.0"]]}})
|
||||||
BIN
resources/public/chefswarehouse.PDF
Normal file
BIN
resources/public/chefswarehouse.PDF
Normal file
Binary file not shown.
BIN
resources/public/cintas.pdf
Normal file
BIN
resources/public/cintas.pdf
Normal file
Binary file not shown.
BIN
resources/public/golden-gate-meats.pdf
Normal file
BIN
resources/public/golden-gate-meats.pdf
Normal file
Binary file not shown.
37
resources/public/index.html
Normal file
37
resources/public/index.html
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
<!doctype html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>Invoice Parsing Demo</title>
    <script src="https://cdn.jsdelivr.net/npm/dropzone@5.2.0/dist/dropzone.min.js"></script>
    <!-- Loaded over HTTPS: a plain-http script is blocked as mixed content when this page is served over HTTPS. -->
    <script src="https://code.jquery.com/jquery-3.2.1.min.js" integrity="sha256-hwg4gsxgFZhOsEEamdOYGBf13FyQuiTwlAQgxVSNgt4=" crossorigin="anonymous"></script>
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/bulma/0.6.1/css/bulma.css" />
    <style>
      /* Hide Dropzone's default per-file status widgets. */
      .dz-error-mark { display:none} .dz-details {display:none}
      form { border: 3px solid lightgray; padding: 25px; width: 100%;}
      .dz-success-mark {display:none}
    </style>
  </head>
  <body>
    <div id="app" class="container">
      <h1>Invoice Parsing Demo</h1>
      <form action="/pdf-upload" id="my-dropzone">
        <h3>Drop invoice pdfs here</h3>
        <!-- NOTE(review): the /pdf-upload handler reads the SECOND "file" part of the request;
             presumably this hidden input supplies the first one. Confirm before removing it. -->
        <input type="file" name="file" style="display:none"/>
      </form>
      <!-- Revealed after the first successful upload. -->
      <h2 style="display:none">Found invoices:</h2>
      <ul>
      </ul>
    </div>

    <script>
      // Turn the form into a Dropzone upload target.
      var myDropzone = new Dropzone("#my-dropzone");

      // On a successful upload the server replies with a JSON array of
      // invoice descriptions; append each one to the result list.
      myDropzone.on("success", function(file, a) {
        $("h2").show();
        JSON.parse(a).forEach(function(x) {
          // .text() escapes the value, so server output is never interpreted as HTML.
          $("ul").append($("<li>").text(x));
        });
      });
    </script>
  </body>
</html>
|
||||||
29
src/auto_ap/handler.clj
Normal file
29
src/auto_ap/handler.clj
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
(ns auto-ap.handler
|
||||||
|
(:require [compojure.core :refer :all]
|
||||||
|
[compojure.route :as route]
|
||||||
|
[clojure.java.io :as io]
|
||||||
|
[auto-ap.parse :as parse]
|
||||||
|
[ring.middleware.multipart-params :as mp]
|
||||||
|
[ring.util.response :as response]
|
||||||
|
[ring.middleware.defaults :refer [wrap-defaults site-defaults]]
|
||||||
|
[ring.middleware.json :refer [wrap-json-response]]))
|
||||||
|
|
||||||
|
(defroutes app-routes
  ;; Landing page: the upload form in resources/public/index.html.
  (GET "/" [] (response/resource-response "index.html" {:root "public"}))

  ;; Receives an uploaded PDF, stores it under resources/public, runs the
  ;; invoice parser on it and responds with one description string per
  ;; invoice found.
  (POST "/pdf-upload"
        {{files "file"} :params}
        ;; The request carries several "file" parts; the uploaded file is the second one.
        (let [{:keys [filename tempfile]} (second files)
              ;; The filename is supplied by the client. Keep only its last path
              ;; segment so a name such as "../../x" cannot escape resources/public.
              safe-name (.getName (io/file filename))]
          (io/copy tempfile (io/file "resources" "public" safe-name))
          ;; `for` yields a (lazy) seq, which is what gets rendered as the response body.
          (for [{:keys [total date invoice-number customer-identifier]} (parse/parse-file (str "resources/public/" safe-name))]
            (do
              (println (str "An invoice #" invoice-number " on " date " for " total))
              (str "An invoice for customer " customer-identifier " #" invoice-number " on " date " for " total)))))

  ;; Static files from the classpath, then a catch-all 404.
  (route/resources "/")
  (route/not-found "Not Found"))
|
||||||
|
|
||||||
|
;; Disabled alternative route table: the #_ reader macro discards the whole
;; form that follows, so nothing below is ever evaluated.
#_(defroutes routes
|
||||||
|
(GET "/" [] (resource-response "index.html" {:root "public"}))
|
||||||
|
(resources "/"))
|
||||||
|
|
||||||
|
(def app
  "Ring handler for the application: `app-routes` wrapped first with
  multipart-params parsing (innermost) and then with JSON response
  encoding (outermost)."
  (-> app-routes
      mp/wrap-multipart-params
      wrap-json-response))
|
||||||
54
src/auto_ap/parse.clj
Normal file
54
src/auto_ap/parse.clj
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
(ns auto-ap.parse
|
||||||
|
(:require [clojure.java.io :as io]
|
||||||
|
[clojure.string :as str]
|
||||||
|
[clojure.java.shell :as sh]))
|
||||||
|
|
||||||
|
(def templates
  "Vendor-specific invoice templates.

  Each template is a map with:
    :keywords - regexes that must all match the document text for the
                template to apply
    :extract  - map of field name -> regex; capture group 1 of the first
                match becomes the field value
    :multi    - (optional) regex on which the document text is split, each
                piece then being extracted as a separate invoice"
  [{:keywords [#"CHEF'S WAREHOUSE"]
    :extract {:total #"2 WKS C\.C\.\s+([\d.,]+)"
              :customer-identifier #"\n([A-Z][A-Z ]+)\s{2,}"
              :date #"\s+([0-9]+/[0-9]+/[0-9]+)"
              :invoice-number #"\s+[0-9]+/[0-9]+/[0-9]+\s+([0-9]+)"}}

   {:keywords [#"Golden Gate Meat"]
    :extract {:total #"Invoice Total\:\s+\$([\d.,]+)"
              :customer-identifier #"Bill To\s*:\s*([\w ]+)\s{2,}"
              :date #"Printed:\s+([0-9]+/[0-9]+/[0-9]+)"
              :invoice-number #"Invoice\s+[^\n]+\n[^\n]+\n\s+([0-9]+)"}}

   {:keywords [#"CINTAS CORPORATION"]
    :extract {:invoice-number #"INVOICE\s#\s+([\d.,]+)"
              :customer-identifier #"BILL TO\s*:\s{2,}([\w ]+)\s{2,}"
              :date #"INVOICE DATE\s*\n.*\s+([0-9]+/[0-9]+/[0-9]+)"
              ;; Accept thousands separators like the other templates do;
              ;; [0-9.]+ would capture only "1" from "1,234.56".
              :total #"INVOICE TOTAL\s+([\d.,]+)"}
    :multi #"\f\f"}])
|
||||||
|
|
||||||
|
(defn template-applies?
  "Returns true when every regex in the template's :keywords matches
  somewhere in `text` (vacuously true when there are no keywords)."
  [text {:keys [keywords]}]
  (every? (fn [pattern] (some? (re-find pattern text)))
          keywords))
|
||||||
|
|
||||||
|
(defn extract-template
  "Applies `template` to `text` and returns a sequence of field maps.

  When the template has a :multi regex, the text is split on it and each
  piece is extracted on its own (with :multi removed), the results being
  concatenated. Otherwise a one-element vector is returned whose map has,
  for every key in :extract, the trimmed first capture group of the first
  match of that key's regex, or nil when there is no match."
  [text template]
  (if-let [separator (:multi template)]
    (let [single-template (dissoc template :multi)]
      (mapcat (fn [chunk] (extract-template chunk single-template))
              (str/split text separator)))
    (letfn [(capture [pattern]
              ;; First match only; group 1 is the value of interest.
              (some-> (re-find pattern text) second str/trim))]
      [(reduce-kv (fn [fields field pattern]
                    (assoc fields field (capture pattern)))
                  {}
                  (:extract template))])))
|
||||||
|
|
||||||
|
(defn parse
  "Extracts invoice data from document `text`.

  Uses the first entry of `templates` whose keywords all match the text and
  returns the sequence of field maps produced by `extract-template`.
  Returns an empty vector when no template matches, so callers never see a
  phantom invoice with no fields."
  [text]
  (if-let [template (first (filter (partial template-applies? text) templates))]
    (extract-template text template)
    []))
|
||||||
|
|
||||||
|
(defn parse-file
  "Runs `pdftotext -layout <file> -` to convert the PDF at path `file` to
  text on stdout, prints that text, and returns the result of `parse` on it."
  [file]
  (let [{text :out} (sh/sh "pdftotext" "-layout" file "-")]
    (println text)
    (parse text)))
|
||||||
14
test/auto_ap/handler_test.clj
Normal file
14
test/auto_ap/handler_test.clj
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
(ns auto-ap.handler-test
|
||||||
|
(:require [clojure.test :refer :all]
|
||||||
|
[ring.mock.request :as mock]
|
||||||
|
[auto-ap.handler :refer :all]))
|
||||||
|
|
||||||
|
(deftest test-app
  (testing "main route"
    (let [response (app (mock/request :get "/"))]
      (is (= (:status response) 200))
      ;; "/" serves resources/public/index.html, so check for the page
      ;; heading instead of comparing against a fixed string body.
      (is (re-find #"Invoice Parsing Demo" (slurp (:body response))))))

  (testing "not-found route"
    (let [response (app (mock/request :get "/invalid"))]
      (is (= (:status response) 404)))))
|
||||||
Reference in New Issue
Block a user