diff --git a/project.clj b/project.clj index 1a5039ef..69a5f7a2 100644 --- a/project.clj +++ b/project.clj @@ -29,8 +29,6 @@ [ring/ring-jetty-adapter "1.9.6" :exclusions [org.eclipse.jetty/jetty-server]] [yogthos/config "1.1.7"] - [dk.ative/docjure "1.14.0"] - [clj-fuzzy "0.4.1"] [com.walmartlabs/lacinia "0.37.0"] [vincit/venia "0.2.5"] diff --git a/src/clj/auto_ap/parse/excel.clj b/src/clj/auto_ap/parse/excel.clj index 4130dfeb..1f3b21f7 100644 --- a/src/clj/auto_ap/parse/excel.clj +++ b/src/clj/auto_ap/parse/excel.clj @@ -2,18 +2,23 @@ (:require [auto-ap.parse.templates :as t] [auto-ap.parse.util :as u] [clojure.string :as str] - [dk.ative.docjure.spreadsheet :as d]) - (:import (org.apache.poi.ss.util CellAddress))) + [amazonica.aws.lambda :as lambda] + [clojure.data.json :as json] + [config.core :refer [env]] + [clojure.java.io :as io] + [amazonica.aws.s3 :as s3]) + ) (defn template-applies? [text {:keys [keywords]}] + (every? #(re-find % text) keywords)) (defn extract [wb {:keys [extract vendor parser]}] (if (fn? extract) (extract wb vendor) - [(reduce-kv + #_[(reduce-kv (fn [invoice k [regex offset-row offset-column extract-regex]] (assoc invoice k (->> wb @@ -38,17 +43,34 @@ {:vendor-code vendor} extract)])) +(defn extract-sheet-details [bucket object] + (-> (lambda/invoke {:function-name "xls-extractor" :payload + (json/write-str + {"s3_url" object "s3_bucket" bucket})}) + :payload + slurp + json/read-str)) + (defn parse-file [file _] - (let [wb (d/load-workbook file) - text (->> wb - (d/sheet-seq) - first - (d/cell-seq) - (map d/read-cell) - (str/join " "))] + (let [tmp-key (str "xls-invoice/import/" (java.util.UUID/randomUUID)) + _ (with-open [f (io/input-stream file)] + (s3/put-object {:bucket-name (:data-bucket env) + :key tmp-key + :input-stream f})) + sheet (extract-sheet-details (:data-bucket env) tmp-key) + text (str/join " " (mapcat seq sheet))] (->> t/excel-templates (filter (partial template-applies? text)) first - (extract wb) - ))) + (extract sheet)))) + + + + +(defn xls-date->date [f] + (when (not-empty f) + (let [f (Double/parseDouble f) + unix-days (- f 25569.0) + unix-secs (* unix-days 86400.0)] + (java.util.Date. (long (Math/round (* 1000.0 unix-secs))))))) diff --git a/src/clj/auto_ap/parse/templates.clj b/src/clj/auto_ap/parse/templates.clj index e05614c4..d3b270cc 100644 --- a/src/clj/auto_ap/parse/templates.clj +++ b/src/clj/auto_ap/parse/templates.clj @@ -1,8 +1,6 @@ (ns auto-ap.parse.templates - (:require [dk.ative.docjure.spreadsheet :as d] - [auto-ap.parse.util :as u] - [clojure.string :as str]) - (:import (org.apache.poi.ss.util CellAddress))) + (:require [auto-ap.parse.util :as u] + [clojure.string :as str])) (def pdf-templates @@ -614,97 +612,27 @@ :parser {:date [:clj-time "MM/dd/yy"] :total [:trim-commas-and-negate nil]}}]) -(defn offset [c x y] - (.toString (CellAddress. (+ y (.getRow (.getAddress c))) (+ x (.getColumn (.getAddress c))) ))) - (def excel-templates - [{:vendor "Isp Productions" - :keywords [#"ISP PRODUCTIONS"] - :extract {:customer-identifier [#"SERVICES PROVIDED TO" 1 0] - :total [#"PAY THIS" -1 0] - :date [#"INVOICE DATE" 0 1] - :invoice-number [#"INVOICE NUMBER" 0 1]}} - {:vendor "Southern Glazers" - :keywords [#"Please note that the total invoice amount may"] - :extract {:customer-identifier [#"Customer #" 1 0] - :total [#"Subtotal" 0 16 ] - :date [#"Date" 0 0 #"Date: (.*)"] - :invoice-number [#"Invoice #" 0 0 #"Invoice #: (.*)"] - :account-number [#"Customer #" 0 0 #"Customer #: (.*)"]} - :parser { :total [:trim-commas-and-remove-dollars-and-invert-parentheses nil] - :date [:clj-time "MM/dd/yyyy"]}} - {:vendor "Mama Lu's Foods" + [{:vendor "Mama Lu's Foods" :keywords [#"Mama Lu's Foods"] - :extract (fn [wb vendor] - (let [[sheet] (d/sheet-seq wb)] - (transduce (comp - (drop 5) - (filter + :extract (fn [sheet vendor] + (transduce (comp + (drop 5) + (filter (fn [r] (and - r - (->> r d/cell-seq second d/read-cell)))) - (map + (seq r) + (->> r second not-empty)))) + (map (fn [r] - (let [[_ customer-order-number num date name amount] (map d/read-cell (d/cell-seq r))] + (let [[_ customer-order-number num date name amount] r] {:customer-identifier (second (re-find #"([^:]*):" name)) - :text name - :full-text name - :date (u/parse-value :clj-time "MM/dd/yyyy" (str/trim date)) - :invoice-number (str customer-order-number "-" (int num)) - :total (str amount) - :vendor-code vendor})))) - conj - [] - (d/row-seq sheet))))} - {:vendor "DVW Commercial" - :keywords [#"Total for" #"Num"] - :extract (fn [wb vendor] - (let [[sheet] (d/sheet-seq wb)] - (transduce (comp (filter (fn [c] - (re-find #"Invoice" (str (d/read-cell c))))) - (map (fn [c] - (let [customer-identifier (d/read-cell (->> (d/select-cell (offset c -3 0) sheet) - (iterate (fn [c] - (d/select-cell (offset c 0 -1) sheet))) - (filter (fn [c] - (not (str/blank? (d/read-cell c))))) - first))] - {:customer-identifier customer-identifier - :text customer-identifier - :full-text customer-identifier - :date (d/read-cell (d/select-cell (offset c 2 0) sheet)) - :invoice-number (d/read-cell (d/select-cell (offset c 4 0) sheet)) - :total (str (d/read-cell (d/select-cell (offset c 8 0) sheet))) - :vendor-code vendor})))) - conj - [] - (d/cell-seq sheet))))} - {:vendor "Chef's Choice Produce Co" - :keywords [#"Alt_invoice_number"] - :extract (fn [wb vendor] - (let [[sheet] (d/sheet-seq wb)] - (transduce (comp - (drop-while (fn [c] - (not (re-find #"Customer_id" (str (d/read-cell c)))))) - (drop 9) - (filter (fn [c] - (= 0 (.getColumnIndex c)))) - (filter (fn [c] - (not (str/blank? (str/trim (or (d/read-cell (d/select-cell (offset c 1 0) sheet)) "")))))) - (map (fn [c] - {:customer-identifier (str/trim (d/read-cell (d/select-cell (offset c 1 0) sheet))) - :text (d/read-cell (d/select-cell (offset c 1 0) sheet)) - :full-text (d/read-cell (d/select-cell (offset c 1 0) sheet)) - :date (u/parse-value :clj-time "MM/dd/yyyy" (str/trim (d/read-cell (d/select-cell (offset c 5 0) sheet)))) - :invoice-number (->> - (re-find #"^(?:0+([A-Z0-9]+))|([A-Z]+[A-Z0-9]+)" (str/trim (d/read-cell (d/select-cell (offset c 2 0) sheet)))) - (drop 1 ) - (filter identity) - first) - :total (str (d/read-cell (d/select-cell (offset c 7 0) sheet))) - :vendor-code vendor})) - (filter :customer-identifier)) - conj - [] - (d/cell-seq sheet))))}]) + :text name + :full-text name + :date (u/parse-value :clj-time "MM/dd/yyyy" (str/trim date)) + :invoice-number (str customer-order-number "-" (Integer/parseInt num)) + :total (str amount) + :vendor-code vendor})))) + conj + [] + sheet))}]) diff --git a/src/clj/auto_ap/routes/ezcater_xls.clj b/src/clj/auto_ap/routes/ezcater_xls.clj index 8bd53bc4..dc45f3cf 100644 --- a/src/clj/auto_ap/routes/ezcater_xls.clj +++ b/src/clj/auto_ap/routes/ezcater_xls.clj @@ -3,6 +3,7 @@ [auto-ap.datomic :refer [audit-transact conn]] [auto-ap.logging :as alog] [clojure.data.json :as json] + [auto-ap.parse.excel :as excel] [auto-ap.parse :as parse] [amazonica.aws.lambda :as lambda] [config.core :refer [env]] @@ -16,7 +17,6 @@ [clojure.java.io :as io] [com.brunobonacci.mulog :as mu] [datomic.api :as dc] - [dk.ative.docjure.spreadsheet :as doc] [hiccup2.core :as hiccup] [amazonica.aws.s3 :as s3])) @@ -27,14 +27,7 @@ (.setScale 2 java.math.RoundingMode/HALF_UP) (double)))) -(defn extract-sheet-details [bucket object] - (-> (lambda/invoke {:function-name "xls-extractor" :payload - (json/write-str - {"s3_url" object "s3_bucket" bucket})} - ) - :payload - slurp - json/read-str)) + (defn rows->maps [rows] @@ -43,12 +36,7 @@ (into {} (map vector headers r))))) -(defn xls-date->date [f] - (when (not-empty f) - (let [f (Double/parseDouble f) - unix-days (- f 25569.0) - unix-secs (* unix-days 86400.0)] - (java.util.Date. (long (Math/round (* 1000.0 unix-secs))))))) + (defn map->sales-order [r clients] @@ -67,7 +55,7 @@ (parse/exact-match clients)) client-id (:db/id client) location (first (:client/locations client)) - event-date (some-> (xls-date->date event-date) + event-date (some-> (excel/xls-date->date event-date) coerce/to-date-time atime/as-local-time coerce/to-date )] @@ -137,7 +125,7 @@ :key object :input-stream s}) (into [] - (->> (extract-sheet-details (:data-bucket env) object) + (->> (excel/extract-sheet-details (:data-bucket env) object) rows->maps (map #(map->sales-order % clients)) (filter identity))))) diff --git a/test/clj/auto_ap/integration/routes/ezcater_xls.clj b/test/clj/auto_ap/integration/routes/ezcater_xls.clj index 4559a217..366c6208 100644 --- a/test/clj/auto_ap/integration/routes/ezcater_xls.clj +++ b/test/clj/auto_ap/integration/routes/ezcater_xls.clj @@ -20,8 +20,7 @@ :client/name "The client" :client/matches ["Nick the Greek (Elk Grove)"])])] (with-open [s (io/input-stream (io/resource "sample-ezcater.xlsx"))] - (is (seq (sut/stream->sales-orders s))) - ) + (is (seq (sut/stream->sales-orders s)))) (with-open [s (io/input-stream (io/resource "sample-ezcater.xlsx"))] (is (= #:sales-order {:vendor :vendor/ccp-ezcater