diff --git a/.gitignore b/.gitignore
index ae81b840..93a77fa5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ pom.xml.asc
 /.nrepl-port
 /resources/public/js/compiled
 *.log
+examples/
diff --git a/project.clj b/project.clj
index 44c377f9..56586ef9 100644
--- a/project.clj
+++ b/project.clj
@@ -13,6 +13,7 @@
                  [ring/ring-json "0.4.0"]
                  [ring "1.4.0"]
                  [yogthos/config "0.8"]
+                 [dk.ative/docjure "1.12.0"]
                  [org.clojure/java.jdbc "0.7.3"]
                  [cljsjs/dropzone "4.3.0-0"]
                  ;; https://mvnrepository.com/artifact/postgresql/postgresql
diff --git a/src/clj/auto_ap/handler.clj b/src/clj/auto_ap/handler.clj
index fd82a493..43e7d9c8 100644
--- a/src/clj/auto_ap/handler.clj
+++ b/src/clj/auto_ap/handler.clj
@@ -54,7 +54,7 @@
           (println existing-invoices)
           (invoices/insert-multi!
            (for [{:keys [total date invoice-number customer-identifier vendor] :as row}
-                 (parse/parse-file (.getPath tempfile))]
+                 (parse/parse-file (.getPath tempfile) filename)]
              (assoc row :imported false
                     :potential-duplicate
                     (boolean (seq (filter #(and (= vendor (:vendor %))
diff --git a/src/clj/auto_ap/parse.clj b/src/clj/auto_ap/parse.clj
index 503affe9..6fe8655f 100644
--- a/src/clj/auto_ap/parse.clj
+++ b/src/clj/auto_ap/parse.clj
@@ -1,7 +1,8 @@
 (ns auto-ap.parse
   (:require [clojure.java.io :as io]
             [clojure.string :as str]
-            [clojure.java.shell :as sh]))
+            [clojure.java.shell :as sh]
+            [auto-ap.parse.excel :as excel]))
 
 (def templates
   [{:vendor "CHFW"
@@ -49,8 +50,28 @@
        first
        (extract-template text)))
 
-(defn parse-file
-  [file]
+
+;; Dispatches on the lower-cased text after the last "." in `filename`, so
+;; "invoice.PDF" and "invoice.pdf" select the same method. `file` is the path
+;; handed to the parser; `filename` is only used to choose the parser.
+(defmulti parse-file
+  (fn [file filename]
+    (str/lower-case (last (str/split filename #"\.")))))
+
+(defmethod parse-file
+  "pdf"
+  [file filename]
   (-> (sh/sh "pdftotext" "-layout" file "-")
       :out
       parse))
+
+(defmethod parse-file
+  "xls"
+  [file filename]
+  (excel/parse-file file filename))
+
+
+(defmethod parse-file
+  "xlsx"
+  [file filename]
+  (excel/parse-file file filename))
diff --git a/src/clj/auto_ap/parse/excel.clj b/src/clj/auto_ap/parse/excel.clj
new file mode 100644
index 00000000..064a380a
--- /dev/null
+++ b/src/clj/auto_ap/parse/excel.clj
@@ -0,0 +1,75 @@
+(ns auto-ap.parse.excel
+  (:import [org.apache.poi.ss.util CellAddress])
+  (:require [dk.ative.docjure.spreadsheet :as d]
+            [clojure.string :as str]))
+
+;; One entry per supported vendor spreadsheet layout.
+;;   :keywords  regexes that must all be found in the sheet text
+;;   :extract   field -> [label-regex row-offset column-offset value-regex]
+;;              The value is read from the cell at that offset from the first
+;;              cell matching label-regex. value-regex is optional; when given,
+;;              its first capture group replaces the whole cell text.
+(def templates
+  [{:vendor "Isp Productions"
+    :keywords [#"ISP PRODUCTIONS"]
+    :extract {:customer-identifier [#"SERVICES PROVIDED TO" 1 0]
+              :total [#"PAY THIS" -1 0]
+              :date [#"INVOICE DATE" 0 1]
+              :invoice-number [#"INVOICE NUMBER" 0 1]}}
+   {:vendor "Southern Wine Online"
+    :keywords [#"Please note that the total invoice amount may"]
+    :extract {:customer-identifier [#"Customer #" 1 0]
+              :total [#"Total Invoice" 0 5]
+              :date [#"Date" 0 0 #"Date: (.*)"]
+              :invoice-number [#"Invoice #" 0 0 #"Invoice #: (.*)"]}}])
+
+(defn template-applies?
+  "True when every keyword regex of the template is found in text."
+  [text {:keys [keywords]}]
+  (every? #(re-find % text) keywords))
+
+(defn extract
+  "Builds an invoice map from the first sheet of wb. Starts from
+  {:vendor vendor} and adds one key per entry of the template's :extract map;
+  a field is nil when no cell matches its label regex."
+  [wb {:keys [extract vendor]}]
+  ;; The sheet and its cells do not depend on the field, so read them once.
+  (let [sheet (first (d/sheet-seq wb))
+        cells (d/cell-seq sheet)]
+    (reduce-kv
+     (fn [invoice k [regex offset-row offset-column extract-regex]]
+       (assoc invoice k
+              (->> cells
+                   (filter (fn [cell]
+                             (re-find regex (str (d/read-cell cell)))))
+                   (map (fn [cell]
+                          (let [address (.getAddress cell)
+                                target (CellAddress. (+ offset-row (.getRow address))
+                                                     (+ offset-column (.getColumn address)))
+                                cell-value (str (d/read-cell (d/select-cell (.toString target) sheet)))]
+                            (if extract-regex
+                              (second (re-find extract-regex cell-value))
+                              cell-value))))
+                   first)))
+     {:vendor vendor}
+     extract)))
+
+(defn parse-file
+  "Loads the workbook at file and returns a vector holding the invoice map
+  extracted with the first template whose keywords all match the first
+  sheet's text. Returns an empty vector when no template applies, so callers
+  never receive a placeholder invoice with a nil vendor. filename is accepted
+  so the signature matches auto-ap.parse/parse-file; it is not read here."
+  [file filename]
+  (let [wb (d/load-workbook file)
+        text (->> wb
+                  (d/sheet-seq)
+                  first
+                  (d/cell-seq)
+                  (map d/read-cell)
+                  (str/join " "))]
+    (if-let [template (->> templates
+                           (filter (partial template-applies? text))
+                           first)]
+      [(extract wb template)]
+      [])))