feat(sales): initial Parquet migration infrastructure

- Add DuckDB/S3 parquet storage layer (auto-ap.storage.parquet)
- Add sales_to_parquet migration script for historical data
- Add cleanup_sales for post-migration Datomic cleanup
- Add sales_orders_new.clj with DuckDB read layer for SSR views
- Add test scaffolding for parquet storage
- Add plan document for move-detailed-sales-to-parquet

feat(sales): redirect production and read flows to Parquet/DuckDB

- U3: Square production (upsert) now buffers to parquet via flatten-order-to-parquet!
- U3: EzCater core import-order now buffers to parquet instead of Datomic transact
- U3: EzCater XLS upload-xls now buffers to parquet instead of audit-transact
- U4: Rewrite sales_orders.clj to read from DuckDB via pq/get-sales-orders
- U5: Rewrite sales_summaries to use parquet aggregation functions
  - get-payment-items-parquet, get-discounts-parquet, get-refund-items-parquet
  - get-tax-parquet, get-tip-parquet, get-sales-parquet
- Add sum-* aggregation functions to storage/sales_summaries.clj
  - sum-discounts, sum-refunds-by-type, sum-taxes, sum-tips, sum-sales-by-category
This commit is contained in:
2026-04-25 07:43:41 -07:00
parent db9018722d
commit 26c9563a03
15 changed files with 1901 additions and 290 deletions

View File

@@ -0,0 +1,297 @@
(ns auto-ap.storage.parquet
(:require [config.core :refer [env]]
[amazonica.aws.s3 :as s3]
[clojure.java.io :as io]
[clojure.string :as str]
[clojure.data.json :as json])
(:import (java.sql DriverManager)
(java.time LocalDate)))
(def ^:dynamic *bucket* (:data-bucket env))
(def parquet-prefix "sales-details")
(defn s3-location [filename]
(str "s3://" *bucket* "/" filename))
(defn parquet-key [entity-type date-str]
(str parquet-prefix "/" entity-type "/" date-str ".parquet"))
(def db (atom nil))
(defn connect! []
(let [conn (DriverManager/getConnection "jdbc:duckdb:")
stmt (.createStatement conn)]
(.execute stmt "INSTALL httpfs; LOAD httpfs;")
(.close stmt)
(.addShutdownHook (Runtime/getRuntime)
(Thread. #(fn [])))
(reset! db conn)))
(defn disconnect! []
(locking db
(when-let [c @db]
(.close c)
(reset! db nil))))
(defmacro with-duckdb
[& body]
`(let [conn# (or @db (connect!))]
(try
(let [~'conn conn#]
~@body)
(finally
(when (and (not @db) conn#)
(.close conn#))))))
(defn execute! [sql]
(with-duckdb
(let [stmt (.createStatement conn)]
(.execute stmt sql)
nil)))
(defn query-scalar [sql]
(with-duckdb
(let [stmt (.createStatement conn)
rs (.executeQuery stmt sql)]
(when (.next rs)
(.getObject rs 1)))))
(defn query-rows [sql]
(with-duckdb
(let [stmt (.createStatement conn)
rs (.executeQuery stmt sql)
meta (.getMetaData rs)
col-count (.getColumnCount meta)
cols (vec (for [i (range 1 (inc col-count))]
(keyword (.getColumnLabel meta i))))]
(loop [rows []]
(if (.next rs)
(recur (conj rows
(zipmap cols
(vec (for [i (range 1 (inc col-count))]
(.getObject rs i))))))
rows)))))
(defn execute-to-parquet! [sql ^String parquet-path]
(with-duckdb
(let [stmt (.createStatement conn)]
(.execute stmt
(format "COPY (%s) TO '%s' (FORMAT PARQUET, OVERWRITE_OR_IGNORE)"
sql parquet-path))
(io/file parquet-path))))
(defn upload-parquet! [local-parquet-file s3-key]
(s3/put-object {:bucket-name *bucket*
:key s3-key
:file local-parquet-file})
(s3-location s3-key))
(defonce *buffers* (atom {}))
(defn- wal-dir []
(io/file (System/getProperty "user.dir" "/tmp")
"parquet-wal"))
(defn- init-wal! []
(let [dir (wal-dir)]
(when-not (.exists dir)
(.mkdirs dir))))
(defn buffer! [entity-type record]
(init-wal!)
(let [seq-no (System/currentTimeMillis)
entry (assoc record :_seq-no seq-no)]
(swap! *buffers* update entity-type (fnil conj []) entry)
(try
(let [wal-file (io/file (wal-dir)
(str entity-type ".jsonl"))]
(io/make-parents wal-file)
(with-open [w (io/writer wal-file :append true)]
(.write w ^String (json/write-str {:seq-no seq-no
:record record}))
(.write w (int \newline))))
(catch Exception e
(println "[parquet/wal]" (.getMessage e))))
entry))
(defn clear-buffer! [entity-type]
(swap! *buffers* dissoc entity-type))
(defn buffer-count [entity-type]
(-> @*buffers* (get entity-type []) count))
(defn total-buf-count []
(->> @*buffers*
vals (mapcat identity) count))
(defn flush-to-parquet! [entity-type]
"Flush buffered records for entity-type to parquet + S3."
(let [records (get @*buffers* entity-type [])]
(if (empty? records)
{:status :no-records}
(let [date-str (.toString (LocalDate/now))
jsonl-file (io/file "/tmp"
(str entity-type "-" date-str ".jsonl"))
parquet-file (io/file "/tmp"
(str entity-type "-" date-str ".parquet"))
s3-key (parquet-key entity-type date-str)]
(try
(with-open [w (io/writer jsonl-file :append true)]
(doseq [r records]
(.write w ^String (json/write-str (dissoc r :_seq-no)))
(.write w (int \newline))))
(execute-to-parquet!
(format "SELECT * FROM read_json_auto('%s')"
(.getAbsolutePath jsonl-file))
(.getAbsolutePath parquet-file))
(upload-parquet! parquet-file s3-key)
(clear-buffer! entity-type)
(.delete ^java.io.File jsonl-file)
(.delete ^java.io.File parquet-file)
{:key s3-key :status :ok}
(catch Exception e
(throw (ex-info "Flush failed" {:entity-type entity-type}
:error (.getMessage e)))))))))
(defn flush-by-date! []
"Flush all entity types for today."
(let [etypes ["sales-order" "charge"
"line-item" "sales-refund"]
flushed (into #{}
(keep (fn [et]
(let [{:keys [status]}
(flush-to-parquet! et)]
(when (= status :ok)
et))))
etypes)]
{:flushed flushed}))
(defn load-unflushed! []
"Restore unflushed records from WAL jsonl files into *buffers."
(init-wal!)
(let [etypes ["sales-order" "charge"
"line-item" "sales-refund"]
loaded (reduce-kv
(fn [acc et data]
(if-not (empty? data)
(assoc acc et
(->> (str/split-lines data)
(keep #(try
(let [entry (json/read-str %)]
(when entry
(assoc (:record entry) :_seq-no (:seq-no entry))))
(catch Exception _)))))
acc))
{}
(into {}
(keep (fn [et]
(let [f (io/file
(wal-dir)
(str et ".jsonl"))]
[et (slurp f)])))
etypes))]
(swap! *buffers* merge loaded)))
(defn get-unflushed-count []
(total-buf-count))
(defn unflushed-records? []
(not= 0 (total-buf-count)))
;;; DuckDB Read Layer
(defn date-seq [start end]
"Seq of YYYY-MM-DD strings between start and end inclusive."
(let [sd (LocalDate/parse start)
ed (LocalDate/parse end)
days (int (Math/abs
(- (.toEpochDay sd)
(.toEpochDay ed))))]
(for [i (range 0 (inc days))]
(.toString (.plusDays sd i)))))
(defn today []
(.toString (LocalDate/now)))
(defn parquet-query [entity-type start-date end-date]
"Build SQL to read all parquet files in date range.
Returns map with :sql and :count-sql keys."
(let [date-strs (date-seq start-date end-date)
urls (vec
(map #(format "'s3://%%s/sales-details/%%s/%%s.parquet'"
*bucket* entity-type %)
date-strs))
sql (str "SELECT * FROM read_parquet(["
(str/join ", " urls)
"])")]
{:sql sql
:count-sql (format "SELECT COUNT(*) FROM (%s) t" sql)}))
(defn- build-where-clause [opts field-pairs]
"Build SQL WHERE clause from opts map.
fields-with-keys is vector of [:field-key :env-var-name]."
(let [clauses (keep
(fn [[key env]]
(let [v (get opts key)]
(when v
(str env " = '" v "'"))))
field-pairs)]
(when (seq clauses)
(str " WHERE " (str/join " AND " clauses)))))
(defn get-sales-orders
([start-date end-date]
(get-sales-orders start-date end-date {}))
([start-date end-date opts]
(let [q (parquet-query "sales-order"
start-date end-date)
base-sql (:sql q)
count-sql (:count-sql q)
sort (get opts :sort "date")
order (get opts :order "DESC")
limit (get opts :limit)
offset (get opts :offset)
where-str (build-where-clause
opts
[[:client "external_id.client"]
[:vendor "external_id.vendor"]
[:location "location"]])
full-sql (if where-str
(str base-sql where-str)
base-sql)
result (cond-> full-sql
sort (str " ORDER BY " sort
" " (name order))
limit (str " LIMIT " limit)
offset (str " OFFSET " offset))
full-count (if where-str
(str count-sql where-str)
count-sql)]
{:rows (query-rows result)
:count (or
(int
(query-scalar
full-count)) 0)})))
(defn query-deduped [entity-type start-date end-date]
"Query records deduplicated by external-id (latest _seq_no wins)."
(let [q (parquet-query entity-type start-date end-date)]
(query-rows
(str (:sql q)
" QUALIFY ROW_NUMBER() OVER"
" (PARTITION BY sales_order.external_id"
" ORDER BY _seq_no DESC) = 1"))))
(defn query-by-entity-id [entity-type external-id
start-date end-date]
(->> (query-deduped entity-type start-date end-date)
(filter #(= (:external_id %)
(name external-id)))
first))
(defn count-records-in-parquet
[entity-type start-date end-date]
(let [q (parquet-query entity-type
start-date end-date)]
(or (int (query-scalar (:count-sql q))) 0)))

View File

@@ -0,0 +1,184 @@
(ns auto-ap.storage.sales-summaries
"Aggregation functions querying Parquet files on S3 via DuckDB.
Entity types: sales-order | charge | line-item | sales-refund
S3 pattern: s3://<bucket>/sales-details/<entity-type>/<YYYY-MM-DD>.parquet"
(:require [auto-ap.storage.parquet :as p]
[clojure.string :as str]))
(defn- dq [name]
(str "\"" name "\""))
(defn- sum-dbl [val]
(try
(if val (double val) 0.0)
(catch Exception _e
0.0)))
(defn- pq-files [entity-type start-date end-date]
"Vector of S3 parquet file paths for date range."
(let [dates (p/date-seq start-date end-date)]
(vec
(map #(str "'s3://" p/*bucket*
"/sales-details/" entity-type "/"
% ".parquet") dates))))
(defn sum-payments-by-type [client-id start-date end-date]
"Return {processor-key -> {type-name-string -> total-double}}."
(let [files (pq-files "charge" start-date end-date)]
(try
(let [sql (str "SELECT "
(dq "processor")
" AS proc, "
(dq "type-name")
" AS type_name, "
"SUM("
(dq "total")
")::DOUBLE AS total_amount "
"FROM read_parquet(["
(str/join ", " files)
"]) "
"WHERE "
(dq "client-code")
" = '" client-id "' "
"GROUP BY "
(dq "processor") ", "
(dq "type-name"))]
(let [rows (p/query-rows sql)]
(reduce (fn [acc row]
(let [proc (:proc row)
tname (str/trim (name (:type_name row)))
total (sum-dbl (:total_amount row))]
(update acc proc
(fn [inner]
(let [b (or inner {})]
(assoc b
tname
(+ (get b tname 0.0) total)))))))
{}
rows)))
(catch Exception e
(println "[sales-summaries]" (.getMessage e))
{}))))
(defn sum-discounts [client-id start-date end-date]
(let [files (pq-files "sales-order" start-date end-date)]
(try
(let [sql (str "SELECT SUM("
(dq "discount")
")::DOUBLE AS discount_total "
"FROM read_parquet(["
(str/join ", " files)
"]) "
"WHERE "
(dq "client-code")
" = '" client-id "'")]
(or (some-> (first (p/query-rows sql)) :discount_total sum-dbl) 0.0))
(catch Exception e
(println "[sales-summaries/discounts]" (.getMessage e))
0.0))))
(defn sum-refunds-by-type [client-id start-date end-date]
(let [files (pq-files "sales-refund" start-date end-date)]
(try
(let [sql (str "SELECT "
(dq "type-name")
" AS type_name, "
"SUM("
(dq "total")
")::DOUBLE AS total_amount "
"FROM read_parquet(["
(str/join ", " files)
"]) "
"WHERE "
(dq "sales-order-external-id")
" IN (SELECT "
(dq "external-id")
" FROM read_parquet(["
(str/join ", " (pq-files "sales-order" start-date end-date))
"]) WHERE "
(dq "client-code")
" = '" client-id "') "
"GROUP BY " (dq "type-name"))]
(let [rows (p/query-rows sql)]
(reduce (fn [acc row]
(let [tname (str/trim (name (:type_name row)))
total (sum-dbl (:total_amount row))]
(assoc acc tname (+ (get acc tname 0.0) total))))
{}
rows)))
(catch Exception e
(println "[sales-summaries/refunds]" (.getMessage e))
{}))))
(defn sum-taxes [client-id start-date end-date]
(let [files (pq-files "sales-order" start-date end-date)]
(try
(let [sql (str "SELECT SUM("
(dq "tax")
")::DOUBLE AS tax_total "
"FROM read_parquet(["
(str/join ", " files)
"]) "
"WHERE "
(dq "client-code")
" = '" client-id "'")]
(or (some-> (first (p/query-rows sql)) :tax_total sum-dbl) 0.0))
(catch Exception e
(println "[sales-summaries/tax]" (.getMessage e))
0.0))))
(defn sum-tips [client-id start-date end-date]
(let [files (pq-files "sales-order" start-date end-date)]
(try
(let [sql (str "SELECT SUM("
(dq "tip")
")::DOUBLE AS tip_total "
"FROM read_parquet(["
(str/join ", " files)
"]) "
"WHERE "
(dq "client-code")
" = '" client-id "'")]
(or (some-> (first (p/query-rows sql)) :tip_total sum-dbl) 0.0))
(catch Exception e
(println "[sales-summaries/tip]" (.getMessage e))
0.0))))
(defn sum-sales-by-category [client-id start-date end-date]
(let [files (pq-files "line-item" start-date end-date)]
(try
(let [sql (str "SELECT "
(dq "category")
" AS category, "
"SUM("
(dq "total")
")::DOUBLE AS total_amount, "
"SUM("
(dq "tax")
")::DOUBLE AS tax_amount, "
"SUM("
(dq "discount")
")::DOUBLE AS discount_amount "
"FROM read_parquet(["
(str/join ", " files)
"]) "
"WHERE "
(dq "sales-order-external-id")
" IN (SELECT "
(dq "external-id")
" FROM read_parquet(["
(str/join ", " (pq-files "sales-order" start-date end-date))
"]) WHERE "
(dq "client-code")
" = '" client-id "') "
"GROUP BY " (dq "category"))]
(let [rows (p/query-rows sql)]
(mapv (fn [row]
{:category (or (:category row) "Unknown")
:total (sum-dbl (:total_amount row))
:tax (sum-dbl (:tax_amount row))
:discount (sum-dbl (:discount_amount row))})
rows)))
(catch Exception e
(println "[sales-summaries/sales]" (.getMessage e))
[]))))