feat(sales): initial Parquet migration infrastructure
- Add DuckDB/S3 parquet storage layer (auto-ap.storage.parquet) - Add sales_to_parquet migration script for historical data - Add cleanup_sales for post-migration Datomic cleanup - Add sales_orders_new.clj with DuckDB read layer for SSR views - Add test scaffolding for parquet storage - Add plan document for move-detailed-sales-to-parquet feat(sales): redirect production and read flows to Parquet/DuckDB - U3: Square production (upsert) now buffers to parquet via flatten-order-to-parquet! - U3: EzCater core import-order now buffers to parquet instead of Datomic transact - U3: EzCater XLS upload-xls now buffers to parquet instead of audit-transact - U4: Rewrite sales_orders.clj to read from DuckDB via pq/get-sales-orders - U5: Rewrite sales_summaries to use parquet aggregation functions - get-payment-items-parquet, get-discounts-parquet, get-refund-items-parquet - get-tax-parquet, get-tip-parquet, get-sales-parquet - Add sum-* aggregation functions to storage/sales_summaries.clj - sum-discounts, sum-refunds-by-type, sum-taxes, sum-tips, sum-sales-by-category
This commit is contained in:
297
src/clj/auto_ap/storage/parquet.clj
Normal file
297
src/clj/auto_ap/storage/parquet.clj
Normal file
@@ -0,0 +1,297 @@
|
||||
(ns auto-ap.storage.parquet
|
||||
(:require [config.core :refer [env]]
|
||||
[amazonica.aws.s3 :as s3]
|
||||
[clojure.java.io :as io]
|
||||
[clojure.string :as str]
|
||||
[clojure.data.json :as json])
|
||||
(:import (java.sql DriverManager)
|
||||
(java.time LocalDate)))
|
||||
|
||||
(def ^:dynamic *bucket* (:data-bucket env))
|
||||
(def parquet-prefix "sales-details")
|
||||
|
||||
(defn s3-location [filename]
|
||||
(str "s3://" *bucket* "/" filename))
|
||||
|
||||
(defn parquet-key [entity-type date-str]
|
||||
(str parquet-prefix "/" entity-type "/" date-str ".parquet"))
|
||||
|
||||
(def db (atom nil))
|
||||
|
||||
(defn connect! []
|
||||
(let [conn (DriverManager/getConnection "jdbc:duckdb:")
|
||||
stmt (.createStatement conn)]
|
||||
(.execute stmt "INSTALL httpfs; LOAD httpfs;")
|
||||
(.close stmt)
|
||||
(.addShutdownHook (Runtime/getRuntime)
|
||||
(Thread. #(fn [])))
|
||||
(reset! db conn)))
|
||||
|
||||
(defn disconnect! []
|
||||
(locking db
|
||||
(when-let [c @db]
|
||||
(.close c)
|
||||
(reset! db nil))))
|
||||
|
||||
(defmacro with-duckdb
|
||||
[& body]
|
||||
`(let [conn# (or @db (connect!))]
|
||||
(try
|
||||
(let [~'conn conn#]
|
||||
~@body)
|
||||
(finally
|
||||
(when (and (not @db) conn#)
|
||||
(.close conn#))))))
|
||||
|
||||
(defn execute! [sql]
|
||||
(with-duckdb
|
||||
(let [stmt (.createStatement conn)]
|
||||
(.execute stmt sql)
|
||||
nil)))
|
||||
|
||||
(defn query-scalar [sql]
|
||||
(with-duckdb
|
||||
(let [stmt (.createStatement conn)
|
||||
rs (.executeQuery stmt sql)]
|
||||
(when (.next rs)
|
||||
(.getObject rs 1)))))
|
||||
|
||||
(defn query-rows [sql]
|
||||
(with-duckdb
|
||||
(let [stmt (.createStatement conn)
|
||||
rs (.executeQuery stmt sql)
|
||||
meta (.getMetaData rs)
|
||||
col-count (.getColumnCount meta)
|
||||
cols (vec (for [i (range 1 (inc col-count))]
|
||||
(keyword (.getColumnLabel meta i))))]
|
||||
(loop [rows []]
|
||||
(if (.next rs)
|
||||
(recur (conj rows
|
||||
(zipmap cols
|
||||
(vec (for [i (range 1 (inc col-count))]
|
||||
(.getObject rs i))))))
|
||||
rows)))))
|
||||
|
||||
(defn execute-to-parquet! [sql ^String parquet-path]
|
||||
(with-duckdb
|
||||
(let [stmt (.createStatement conn)]
|
||||
(.execute stmt
|
||||
(format "COPY (%s) TO '%s' (FORMAT PARQUET, OVERWRITE_OR_IGNORE)"
|
||||
sql parquet-path))
|
||||
(io/file parquet-path))))
|
||||
|
||||
(defn upload-parquet! [local-parquet-file s3-key]
|
||||
(s3/put-object {:bucket-name *bucket*
|
||||
:key s3-key
|
||||
:file local-parquet-file})
|
||||
(s3-location s3-key))
|
||||
|
||||
(defonce *buffers* (atom {}))
|
||||
|
||||
(defn- wal-dir []
|
||||
(io/file (System/getProperty "user.dir" "/tmp")
|
||||
"parquet-wal"))
|
||||
|
||||
(defn- init-wal! []
|
||||
(let [dir (wal-dir)]
|
||||
(when-not (.exists dir)
|
||||
(.mkdirs dir))))
|
||||
|
||||
(defn buffer! [entity-type record]
|
||||
(init-wal!)
|
||||
(let [seq-no (System/currentTimeMillis)
|
||||
entry (assoc record :_seq-no seq-no)]
|
||||
(swap! *buffers* update entity-type (fnil conj []) entry)
|
||||
(try
|
||||
(let [wal-file (io/file (wal-dir)
|
||||
(str entity-type ".jsonl"))]
|
||||
(io/make-parents wal-file)
|
||||
(with-open [w (io/writer wal-file :append true)]
|
||||
(.write w ^String (json/write-str {:seq-no seq-no
|
||||
:record record}))
|
||||
(.write w (int \newline))))
|
||||
(catch Exception e
|
||||
(println "[parquet/wal]" (.getMessage e))))
|
||||
entry))
|
||||
|
||||
(defn clear-buffer! [entity-type]
|
||||
(swap! *buffers* dissoc entity-type))
|
||||
|
||||
(defn buffer-count [entity-type]
|
||||
(-> @*buffers* (get entity-type []) count))
|
||||
|
||||
(defn total-buf-count []
|
||||
(->> @*buffers*
|
||||
vals (mapcat identity) count))
|
||||
|
||||
(defn flush-to-parquet! [entity-type]
|
||||
"Flush buffered records for entity-type to parquet + S3."
|
||||
(let [records (get @*buffers* entity-type [])]
|
||||
(if (empty? records)
|
||||
{:status :no-records}
|
||||
(let [date-str (.toString (LocalDate/now))
|
||||
jsonl-file (io/file "/tmp"
|
||||
(str entity-type "-" date-str ".jsonl"))
|
||||
parquet-file (io/file "/tmp"
|
||||
(str entity-type "-" date-str ".parquet"))
|
||||
s3-key (parquet-key entity-type date-str)]
|
||||
(try
|
||||
(with-open [w (io/writer jsonl-file :append true)]
|
||||
(doseq [r records]
|
||||
(.write w ^String (json/write-str (dissoc r :_seq-no)))
|
||||
(.write w (int \newline))))
|
||||
(execute-to-parquet!
|
||||
(format "SELECT * FROM read_json_auto('%s')"
|
||||
(.getAbsolutePath jsonl-file))
|
||||
(.getAbsolutePath parquet-file))
|
||||
(upload-parquet! parquet-file s3-key)
|
||||
(clear-buffer! entity-type)
|
||||
(.delete ^java.io.File jsonl-file)
|
||||
(.delete ^java.io.File parquet-file)
|
||||
{:key s3-key :status :ok}
|
||||
(catch Exception e
|
||||
(throw (ex-info "Flush failed" {:entity-type entity-type}
|
||||
:error (.getMessage e)))))))))
|
||||
|
||||
(defn flush-by-date! []
|
||||
"Flush all entity types for today."
|
||||
(let [etypes ["sales-order" "charge"
|
||||
"line-item" "sales-refund"]
|
||||
flushed (into #{}
|
||||
(keep (fn [et]
|
||||
(let [{:keys [status]}
|
||||
(flush-to-parquet! et)]
|
||||
(when (= status :ok)
|
||||
et))))
|
||||
etypes)]
|
||||
{:flushed flushed}))
|
||||
|
||||
(defn load-unflushed! []
|
||||
"Restore unflushed records from WAL jsonl files into *buffers."
|
||||
(init-wal!)
|
||||
(let [etypes ["sales-order" "charge"
|
||||
"line-item" "sales-refund"]
|
||||
loaded (reduce-kv
|
||||
(fn [acc et data]
|
||||
(if-not (empty? data)
|
||||
(assoc acc et
|
||||
(->> (str/split-lines data)
|
||||
(keep #(try
|
||||
(let [entry (json/read-str %)]
|
||||
(when entry
|
||||
(assoc (:record entry) :_seq-no (:seq-no entry))))
|
||||
(catch Exception _)))))
|
||||
acc))
|
||||
{}
|
||||
(into {}
|
||||
(keep (fn [et]
|
||||
(let [f (io/file
|
||||
(wal-dir)
|
||||
(str et ".jsonl"))]
|
||||
[et (slurp f)])))
|
||||
etypes))]
|
||||
(swap! *buffers* merge loaded)))
|
||||
|
||||
(defn get-unflushed-count []
|
||||
(total-buf-count))
|
||||
|
||||
(defn unflushed-records? []
|
||||
(not= 0 (total-buf-count)))
|
||||
|
||||
;;; DuckDB Read Layer
|
||||
|
||||
(defn date-seq [start end]
|
||||
"Seq of YYYY-MM-DD strings between start and end inclusive."
|
||||
(let [sd (LocalDate/parse start)
|
||||
ed (LocalDate/parse end)
|
||||
days (int (Math/abs
|
||||
(- (.toEpochDay sd)
|
||||
(.toEpochDay ed))))]
|
||||
(for [i (range 0 (inc days))]
|
||||
(.toString (.plusDays sd i)))))
|
||||
|
||||
(defn today []
|
||||
(.toString (LocalDate/now)))
|
||||
|
||||
(defn parquet-query [entity-type start-date end-date]
|
||||
"Build SQL to read all parquet files in date range.
|
||||
Returns map with :sql and :count-sql keys."
|
||||
(let [date-strs (date-seq start-date end-date)
|
||||
urls (vec
|
||||
(map #(format "'s3://%%s/sales-details/%%s/%%s.parquet'"
|
||||
*bucket* entity-type %)
|
||||
date-strs))
|
||||
sql (str "SELECT * FROM read_parquet(["
|
||||
(str/join ", " urls)
|
||||
"])")]
|
||||
{:sql sql
|
||||
:count-sql (format "SELECT COUNT(*) FROM (%s) t" sql)}))
|
||||
|
||||
(defn- build-where-clause [opts field-pairs]
|
||||
"Build SQL WHERE clause from opts map.
|
||||
fields-with-keys is vector of [:field-key :env-var-name]."
|
||||
(let [clauses (keep
|
||||
(fn [[key env]]
|
||||
(let [v (get opts key)]
|
||||
(when v
|
||||
(str env " = '" v "'"))))
|
||||
field-pairs)]
|
||||
(when (seq clauses)
|
||||
(str " WHERE " (str/join " AND " clauses)))))
|
||||
|
||||
(defn get-sales-orders
|
||||
([start-date end-date]
|
||||
(get-sales-orders start-date end-date {}))
|
||||
([start-date end-date opts]
|
||||
(let [q (parquet-query "sales-order"
|
||||
start-date end-date)
|
||||
base-sql (:sql q)
|
||||
count-sql (:count-sql q)
|
||||
sort (get opts :sort "date")
|
||||
order (get opts :order "DESC")
|
||||
limit (get opts :limit)
|
||||
offset (get opts :offset)
|
||||
where-str (build-where-clause
|
||||
|
||||
opts
|
||||
[[:client "external_id.client"]
|
||||
[:vendor "external_id.vendor"]
|
||||
[:location "location"]])
|
||||
full-sql (if where-str
|
||||
(str base-sql where-str)
|
||||
base-sql)
|
||||
result (cond-> full-sql
|
||||
sort (str " ORDER BY " sort
|
||||
" " (name order))
|
||||
limit (str " LIMIT " limit)
|
||||
offset (str " OFFSET " offset))
|
||||
full-count (if where-str
|
||||
(str count-sql where-str)
|
||||
count-sql)]
|
||||
{:rows (query-rows result)
|
||||
:count (or
|
||||
(int
|
||||
(query-scalar
|
||||
full-count)) 0)})))
|
||||
|
||||
(defn query-deduped [entity-type start-date end-date]
|
||||
"Query records deduplicated by external-id (latest _seq_no wins)."
|
||||
(let [q (parquet-query entity-type start-date end-date)]
|
||||
(query-rows
|
||||
(str (:sql q)
|
||||
" QUALIFY ROW_NUMBER() OVER"
|
||||
" (PARTITION BY sales_order.external_id"
|
||||
" ORDER BY _seq_no DESC) = 1"))))
|
||||
|
||||
(defn query-by-entity-id [entity-type external-id
|
||||
start-date end-date]
|
||||
(->> (query-deduped entity-type start-date end-date)
|
||||
(filter #(= (:external_id %)
|
||||
(name external-id)))
|
||||
first))
|
||||
|
||||
(defn count-records-in-parquet
|
||||
[entity-type start-date end-date]
|
||||
(let [q (parquet-query entity-type
|
||||
start-date end-date)]
|
||||
(or (int (query-scalar (:count-sql q))) 0)))
|
||||
184
src/clj/auto_ap/storage/sales_summaries.clj
Normal file
184
src/clj/auto_ap/storage/sales_summaries.clj
Normal file
@@ -0,0 +1,184 @@
|
||||
(ns auto-ap.storage.sales-summaries
|
||||
"Aggregation functions querying Parquet files on S3 via DuckDB.
|
||||
Entity types: sales-order | charge | line-item | sales-refund
|
||||
S3 pattern: s3://<bucket>/sales-details/<entity-type>/<YYYY-MM-DD>.parquet"
|
||||
(:require [auto-ap.storage.parquet :as p]
|
||||
[clojure.string :as str]))
|
||||
|
||||
(defn- dq [name]
|
||||
(str "\"" name "\""))
|
||||
|
||||
(defn- sum-dbl [val]
|
||||
(try
|
||||
(if val (double val) 0.0)
|
||||
(catch Exception _e
|
||||
0.0)))
|
||||
|
||||
(defn- pq-files [entity-type start-date end-date]
|
||||
"Vector of S3 parquet file paths for date range."
|
||||
(let [dates (p/date-seq start-date end-date)]
|
||||
(vec
|
||||
(map #(str "'s3://" p/*bucket*
|
||||
"/sales-details/" entity-type "/"
|
||||
% ".parquet") dates))))
|
||||
|
||||
(defn sum-payments-by-type [client-id start-date end-date]
|
||||
"Return {processor-key -> {type-name-string -> total-double}}."
|
||||
(let [files (pq-files "charge" start-date end-date)]
|
||||
(try
|
||||
(let [sql (str "SELECT "
|
||||
(dq "processor")
|
||||
" AS proc, "
|
||||
(dq "type-name")
|
||||
" AS type_name, "
|
||||
"SUM("
|
||||
(dq "total")
|
||||
")::DOUBLE AS total_amount "
|
||||
"FROM read_parquet(["
|
||||
(str/join ", " files)
|
||||
"]) "
|
||||
"WHERE "
|
||||
(dq "client-code")
|
||||
" = '" client-id "' "
|
||||
"GROUP BY "
|
||||
(dq "processor") ", "
|
||||
(dq "type-name"))]
|
||||
(let [rows (p/query-rows sql)]
|
||||
(reduce (fn [acc row]
|
||||
(let [proc (:proc row)
|
||||
tname (str/trim (name (:type_name row)))
|
||||
total (sum-dbl (:total_amount row))]
|
||||
(update acc proc
|
||||
(fn [inner]
|
||||
(let [b (or inner {})]
|
||||
(assoc b
|
||||
tname
|
||||
(+ (get b tname 0.0) total)))))))
|
||||
{}
|
||||
rows)))
|
||||
(catch Exception e
|
||||
(println "[sales-summaries]" (.getMessage e))
|
||||
{}))))
|
||||
|
||||
(defn sum-discounts [client-id start-date end-date]
|
||||
(let [files (pq-files "sales-order" start-date end-date)]
|
||||
(try
|
||||
(let [sql (str "SELECT SUM("
|
||||
(dq "discount")
|
||||
")::DOUBLE AS discount_total "
|
||||
"FROM read_parquet(["
|
||||
(str/join ", " files)
|
||||
"]) "
|
||||
"WHERE "
|
||||
(dq "client-code")
|
||||
" = '" client-id "'")]
|
||||
(or (some-> (first (p/query-rows sql)) :discount_total sum-dbl) 0.0))
|
||||
(catch Exception e
|
||||
(println "[sales-summaries/discounts]" (.getMessage e))
|
||||
0.0))))
|
||||
|
||||
(defn sum-refunds-by-type [client-id start-date end-date]
|
||||
(let [files (pq-files "sales-refund" start-date end-date)]
|
||||
(try
|
||||
(let [sql (str "SELECT "
|
||||
(dq "type-name")
|
||||
" AS type_name, "
|
||||
"SUM("
|
||||
(dq "total")
|
||||
")::DOUBLE AS total_amount "
|
||||
"FROM read_parquet(["
|
||||
(str/join ", " files)
|
||||
"]) "
|
||||
"WHERE "
|
||||
(dq "sales-order-external-id")
|
||||
" IN (SELECT "
|
||||
(dq "external-id")
|
||||
" FROM read_parquet(["
|
||||
(str/join ", " (pq-files "sales-order" start-date end-date))
|
||||
"]) WHERE "
|
||||
(dq "client-code")
|
||||
" = '" client-id "') "
|
||||
"GROUP BY " (dq "type-name"))]
|
||||
(let [rows (p/query-rows sql)]
|
||||
(reduce (fn [acc row]
|
||||
(let [tname (str/trim (name (:type_name row)))
|
||||
total (sum-dbl (:total_amount row))]
|
||||
(assoc acc tname (+ (get acc tname 0.0) total))))
|
||||
{}
|
||||
rows)))
|
||||
(catch Exception e
|
||||
(println "[sales-summaries/refunds]" (.getMessage e))
|
||||
{}))))
|
||||
|
||||
(defn sum-taxes [client-id start-date end-date]
|
||||
(let [files (pq-files "sales-order" start-date end-date)]
|
||||
(try
|
||||
(let [sql (str "SELECT SUM("
|
||||
(dq "tax")
|
||||
")::DOUBLE AS tax_total "
|
||||
"FROM read_parquet(["
|
||||
(str/join ", " files)
|
||||
"]) "
|
||||
"WHERE "
|
||||
(dq "client-code")
|
||||
" = '" client-id "'")]
|
||||
(or (some-> (first (p/query-rows sql)) :tax_total sum-dbl) 0.0))
|
||||
(catch Exception e
|
||||
(println "[sales-summaries/tax]" (.getMessage e))
|
||||
0.0))))
|
||||
|
||||
(defn sum-tips [client-id start-date end-date]
|
||||
(let [files (pq-files "sales-order" start-date end-date)]
|
||||
(try
|
||||
(let [sql (str "SELECT SUM("
|
||||
(dq "tip")
|
||||
")::DOUBLE AS tip_total "
|
||||
"FROM read_parquet(["
|
||||
(str/join ", " files)
|
||||
"]) "
|
||||
"WHERE "
|
||||
(dq "client-code")
|
||||
" = '" client-id "'")]
|
||||
(or (some-> (first (p/query-rows sql)) :tip_total sum-dbl) 0.0))
|
||||
(catch Exception e
|
||||
(println "[sales-summaries/tip]" (.getMessage e))
|
||||
0.0))))
|
||||
|
||||
(defn sum-sales-by-category [client-id start-date end-date]
|
||||
(let [files (pq-files "line-item" start-date end-date)]
|
||||
(try
|
||||
(let [sql (str "SELECT "
|
||||
(dq "category")
|
||||
" AS category, "
|
||||
"SUM("
|
||||
(dq "total")
|
||||
")::DOUBLE AS total_amount, "
|
||||
"SUM("
|
||||
(dq "tax")
|
||||
")::DOUBLE AS tax_amount, "
|
||||
"SUM("
|
||||
(dq "discount")
|
||||
")::DOUBLE AS discount_amount "
|
||||
"FROM read_parquet(["
|
||||
(str/join ", " files)
|
||||
"]) "
|
||||
"WHERE "
|
||||
(dq "sales-order-external-id")
|
||||
" IN (SELECT "
|
||||
(dq "external-id")
|
||||
" FROM read_parquet(["
|
||||
(str/join ", " (pq-files "sales-order" start-date end-date))
|
||||
"]) WHERE "
|
||||
(dq "client-code")
|
||||
" = '" client-id "') "
|
||||
"GROUP BY " (dq "category"))]
|
||||
(let [rows (p/query-rows sql)]
|
||||
(mapv (fn [row]
|
||||
{:category (or (:category row) "Unknown")
|
||||
:total (sum-dbl (:total_amount row))
|
||||
:tax (sum-dbl (:tax_amount row))
|
||||
:discount (sum-dbl (:discount_amount row))})
|
||||
rows)))
|
||||
(catch Exception e
|
||||
(println "[sales-summaries/sales]" (.getMessage e))
|
||||
[]))))
|
||||
Reference in New Issue
Block a user