first version of sample parser.
This commit is contained in:
29
src/auto_ap/handler.clj
Normal file
29
src/auto_ap/handler.clj
Normal file
@@ -0,0 +1,29 @@
|
||||
(ns auto-ap.handler
|
||||
(:require [compojure.core :refer :all]
|
||||
[compojure.route :as route]
|
||||
[clojure.java.io :as io]
|
||||
[auto-ap.parse :as parse]
|
||||
[ring.middleware.multipart-params :as mp]
|
||||
[ring.util.response :as response]
|
||||
[ring.middleware.defaults :refer [wrap-defaults site-defaults]]
|
||||
[ring.middleware.json :refer [wrap-json-response]]))
|
||||
|
||||
;; HTTP routes for the application.
;;
;;   GET  /            serves index.html from the "public" resource root.
;;   POST /pdf-upload  copies the uploaded PDF into resources/public, runs it
;;                     through parse/parse-file and responds with one summary
;;                     string per invoice that was extracted.
;;   anything else     falls through to static resources, then to a 404.
(defroutes app-routes
  (GET "/" [] (response/resource-response "index.html" {:root "public"}))
  (POST "/pdf-upload"
        {{files "file"} :params}
        (let [{:keys [filename tempfile]} (second files)
              ;; The filename comes from the client. Keep only its last path
              ;; component so values such as "../../x.pdf" cannot write (or
              ;; later be read from) a location outside resources/public.
              safe-name (.getName (io/file filename))]
          (io/copy tempfile (io/file "resources" "public" safe-name))
          ;; `for` is lazy: the println below runs when the response body is
          ;; realized, not when the handler returns.
          (for [{:keys [total date invoice-number customer-identifier]} (parse/parse-file (str "resources/public/" safe-name))]
            (do
              (println (str "An invoice #" invoice-number " on " date " for " total))
              (str "An invoice for customer " customer-identifier " #" invoice-number " on " date " for " total)))))
  (route/resources "/")
  (route/not-found "Not Found"))
|
||||
|
||||
;; Disabled route table: the `#_` reader macro discards the whole form, so
;; `routes` is never defined. It calls `resource-response` and `resources`
;; unqualified, whereas the ns form above only aliases their namespaces
;; (`response/`, `route/`), so those calls would need qualifying before this
;; form could be re-enabled.
#_(defroutes routes
|
||||
(GET "/" [] (resource-response "index.html" {:root "public"}))
|
||||
(resources "/"))
|
||||
|
||||
;; Ring handler: app-routes wrapped first in multipart-params handling
;; (innermost) and then in JSON response encoding (outermost).
(def app
  (-> app-routes
      mp/wrap-multipart-params
      wrap-json-response))
|
||||
54
src/auto_ap/parse.clj
Normal file
54
src/auto_ap/parse.clj
Normal file
@@ -0,0 +1,54 @@
|
||||
(ns auto-ap.parse
|
||||
(:require [clojure.java.io :as io]
|
||||
[clojure.string :as str]
|
||||
[clojure.java.shell :as sh]))
|
||||
|
||||
(def templates
  "Vendor invoice templates, tried in order by `parse`.

  Each template is a map with:
    :keywords  regexes that must all match the text for the template to apply
    :extract   map of field name -> regex; the first capture group of the
               first match becomes the field value
    :multi     (optional) regex used to split the text into chunks, each of
               which is extracted as a separate invoice"
  [;; Chef's Warehouse
   {:keywords [#"CHEF'S WAREHOUSE"]
    :extract  {:total               #"2 WKS C\.C\.\s+([\d.,]+)"
               :customer-identifier #"\n([A-Z][A-Z ]+)\s{2,}"
               :date                #"\s+([0-9]+/[0-9]+/[0-9]+)"
               :invoice-number      #"\s+[0-9]+/[0-9]+/[0-9]+\s+([0-9]+)"}}

   ;; Golden Gate Meat
   {:keywords [#"Golden Gate Meat"]
    :extract  {:total               #"Invoice Total\:\s+\$([\d.,]+)"
               :customer-identifier #"Bill To\s*:\s*([\w ]+)\s{2,}"
               :date                #"Printed:\s+([0-9]+/[0-9]+/[0-9]+)"
               :invoice-number      #"Invoice\s+[^\n]+\n[^\n]+\n\s+([0-9]+)"}}

   ;; Cintas: one document may hold several invoices, hence :multi.
   ;; NOTE(review): unlike the other templates, :total here does not accept
   ;; commas -- confirm totals of 1,000 or more are printed without one.
   {:keywords [#"CINTAS CORPORATION"]
    :extract  {:invoice-number      #"INVOICE\s#\s+([\d.,]+)"
               :customer-identifier #"BILL TO\s*:\s{2,}([\w ]+)\s{2,}"
               :date                #"INVOICE DATE\s*\n.*\s+([0-9]+/[0-9]+/[0-9]+)"
               :total               #"INVOICE TOTAL\s+([0-9.]+)"}
    :multi    #"\f\f"}])
|
||||
|
||||
(defn template-applies?
  "Returns true when every regex in the template's :keywords matches
  somewhere in `text` (and therefore also when there are no keywords)."
  [text {:keys [keywords]}]
  (every? (fn [pattern] (some? (re-find pattern text)))
          keywords))
|
||||
|
||||
(defn extract-template
  "Applies `template` to `text` and returns a sequence of extracted records.

  When the template has a :multi regex, the text is split on it and each
  chunk is extracted with the same template minus :multi, giving one record
  per chunk. Otherwise a one-element vector is returned.

  A record maps every key of the template's :extract map to the trimmed
  first capture group of that key's first regex match, or nil if the regex
  does not match."
  [text template]
  (if-let [separator (:multi template)]
    (let [single-template (dissoc template :multi)]
      (mapcat (fn [chunk] (extract-template chunk single-template))
              (str/split text separator)))
    [(into {}
           (for [[field pattern] (:extract template)]
             [field (some-> (re-find pattern text)
                            second
                            str/trim)]))]))
|
||||
|
||||
(defn parse
  "Extracts invoice records from the plain text of a document.

  The first entry of `templates` whose keywords all match `text` is applied
  via `extract-template`, and its records are returned. When no template
  matches, an empty vector is returned; passing a nil template on to
  `extract-template` would instead produce a single empty record, which
  callers would report as an invoice with no data."
  [text]
  (if-let [template (first (filter #(template-applies? text %) templates))]
    (extract-template text template)
    []))
|
||||
|
||||
(defn parse-file
  "Converts the PDF at path `file` to text with the external `pdftotext`
  program (layout-preserving mode, written to stdout), prints that text and
  returns the records `parse` extracts from it.

  Throws ex-info, carrying :file, :exit and :err, when pdftotext exits with
  a non-zero status, so a failed conversion is not parsed as an empty
  document."
  [file]
  (let [{:keys [exit out err]} (sh/sh "pdftotext" "-layout" file "-")]
    (when-not (zero? exit)
      (throw (ex-info (str "pdftotext failed for " file)
                      {:file file :exit exit :err err})))
    ;; Echo the converted text; useful while tuning the template regexes.
    (println out)
    (parse out)))
|
||||
Reference in New Issue
Block a user