Files
integreat/src/clj/auto_ap/storage/parquet.clj
Bryce 9153494ed7 feat(sales): wire SSR page to parquet/DuckDB layer with full 7.9M-record support
- Add fetch-page-ssr and summarize-page-ssr to read from parquet via DuckDB
- Add get-sales-orders-summary for cross-page totals (SUM across all rows)
- Optimize parquet-query for large ranges (>60 days) with year-level globs
- Add default-date-range with fallback to data's actual range
- Fix migration: flatten-order-to-pieces! vswap!, pull specs, date handling
- Add denormalized columns: payment-methods, processors, categories, source
- Handle schema-enforce middleware stripping dates via raw query-string parsing
- Add graceful fallback for missing parquet files (catch Exception)
- Fix load-unflushed! with .exists check on WAL files
2026-04-27 20:05:13 -07:00

374 lines
14 KiB
Clojure

(ns auto-ap.storage.parquet
(:require [config.core :refer [env]]
[amazonica.aws.s3 :as s3]
[clojure.java.io :as io]
[clojure.string :as str]
[clojure.data.json :as json])
(:import (java.sql DriverManager)
(java.time LocalDate)))
(def ^:dynamic *bucket* (:data-bucket env))
(def parquet-prefix "sales-details")
(defn s3-location [filename]
(str "s3://" *bucket* "/" filename))
(defn parquet-key [entity-type date-str]
(str parquet-prefix "/" entity-type "/" date-str ".parquet"))
(def db (atom nil))
(defn connect! []
(let [conn (DriverManager/getConnection "jdbc:duckdb:")
stmt (.createStatement conn)]
(.execute stmt "INSTALL httpfs; LOAD httpfs;")
(when-let [key (:aws-access-key-id env)]
(.execute stmt (str "SET s3_access_key_id='" key "'"))
(.execute stmt (str "SET s3_secret_access_key='" (:aws-secret-access-key env) "'"))
(.execute stmt (str "SET s3_region='" (or (:aws-region env) "us-east-1") "'")))
(.close stmt)
(.addShutdownHook (Runtime/getRuntime)
(Thread. #(fn [])))
(reset! db conn)))
(defn disconnect! []
(locking db
(when-let [c @db]
(.close c)
(reset! db nil))))
(defmacro with-duckdb
[& body]
`(let [conn# (or @db (connect!))]
(try
(let [~'conn conn#]
~@body)
(finally
(when (and (not @db) conn#)
(.close conn#))))))
(defn execute! [sql]
(with-duckdb
(let [stmt (.createStatement conn)]
(.execute stmt sql)
nil)))
(defn query-scalar [sql]
(with-duckdb
(let [stmt (.createStatement conn)
rs (.executeQuery stmt sql)]
(when (.next rs)
(.getObject rs 1)))))
(defn query-rows [sql]
(with-duckdb
(let [stmt (.createStatement conn)
rs (.executeQuery stmt sql)
meta (.getMetaData rs)
col-count (.getColumnCount meta)
cols (vec (for [i (range 1 (inc col-count))]
(keyword (.getColumnLabel meta i))))]
(loop [rows []]
(if (.next rs)
(recur (conj rows
(zipmap cols
(vec (for [i (range 1 (inc col-count))]
(.getObject rs i))))))
rows)))))
(defn execute-to-parquet! [sql ^String parquet-path]
(with-duckdb
(let [stmt (.createStatement conn)]
(.execute stmt
(format "COPY (%s) TO '%s' (FORMAT PARQUET, OVERWRITE_OR_IGNORE)"
sql parquet-path))
(io/file parquet-path))))
(defn upload-parquet! [local-parquet-file s3-key]
(s3/put-object {:bucket-name *bucket*
:key s3-key
:file local-parquet-file})
(s3-location s3-key))
(defonce *buffers* (atom {}))
(defn- wal-dir []
(io/file (System/getProperty "user.dir" "/tmp")
"parquet-wal"))
(defn- init-wal! []
(let [dir (wal-dir)]
(when-not (.exists dir)
(.mkdirs dir))))
(defn buffer! [entity-type record]
(init-wal!)
(let [seq-no (System/currentTimeMillis)
entry (assoc record :_seq-no seq-no)]
(swap! *buffers* update entity-type (fnil conj []) entry)
(try
(let [wal-file (io/file (wal-dir)
(str entity-type ".jsonl"))]
(io/make-parents wal-file)
(with-open [w (io/writer wal-file :append true)]
(.write w ^String (json/write-str {:seq-no seq-no
:record record}))
(.write w (int \newline))))
(catch Exception e
(println "[parquet/wal]" (.getMessage e))))
entry))
(defn clear-buffer! [entity-type]
(swap! *buffers* dissoc entity-type))
(defn buffer-count [entity-type]
(-> @*buffers* (get entity-type []) count))
(defn total-buf-count []
(->> @*buffers*
vals (mapcat identity) count))
(defn flush-to-parquet! [entity-type date-str]
"Flush buffered records for entity-type to parquet + S3."
(let [records (get @*buffers* entity-type [])]
(if (empty? records)
{:status :no-records}
(let [date-str (or date-str (.toString (LocalDate/now)))
jsonl-file (io/file "/tmp"
(str entity-type "-" date-str ".jsonl"))
parquet-file (io/file "/tmp"
(str entity-type "-" date-str ".parquet"))
s3-key (parquet-key entity-type date-str)]
(try
(with-open [w (io/writer jsonl-file :append true)]
(doseq [r records]
(.write w ^String (json/write-str (dissoc r :_seq-no)))
(.write w (int \newline))))
(execute-to-parquet!
(format "SELECT * FROM read_json_auto('%s')"
(.getAbsolutePath jsonl-file))
(.getAbsolutePath parquet-file))
(upload-parquet! parquet-file s3-key)
(clear-buffer! entity-type)
(.delete ^java.io.File jsonl-file)
(.delete ^java.io.File parquet-file)
{:key s3-key :status :ok}
(catch Exception e
(throw (ex-info "Flush failed"
{:entity-type entity-type
:error (.getMessage e)}))))))))
(defn flush-by-date! []
"Flush all entity types for today."
(let [etypes ["sales-order" "charge"
"line-item" "sales-refund"]
today (.toString (LocalDate/now))
flushed (into #{}
(keep (fn [et]
(let [{:keys [status]}
(flush-to-parquet! et today)]
(when (= status :ok)
et))))
etypes)]
{:flushed flushed}))
(defn load-unflushed! []
"Restore unflushed records from WAL jsonl files into *buffers."
(init-wal!)
(let [etypes ["sales-order" "charge"
"line-item" "sales-refund"]
loaded (reduce-kv
(fn [acc et data]
(if-not (empty? data)
(assoc acc et
(->> (str/split-lines data)
(keep #(try
(let [entry (json/read-str %)]
(when entry
(assoc (:record entry) :_seq-no (:seq-no entry))))
(catch Exception _)))))
acc))
{}
(into {}
(keep (fn [et]
(let [f (io/file
(wal-dir)
(str et ".jsonl"))]
(when (.exists f)
[et (slurp f)])))
etypes)))]
(swap! *buffers* merge loaded)))
(defn get-unflushed-count []
(total-buf-count))
(defn unflushed-records? []
(not= 0 (total-buf-count)))
;;; DuckDB Read Layer
(defn date-seq [start end]
"Seq of YYYY-MM-DD strings between start and end inclusive."
(let [sd (LocalDate/parse start)
ed (LocalDate/parse end)
days (int (Math/abs
(- (.toEpochDay sd)
(.toEpochDay ed))))]
(for [i (range 0 (inc days))]
(.toString (.plusDays sd i)))))
(defn today []
(.toString (LocalDate/now)))
(defn- parquet-glob [entity-type start-date end-date]
"Build a glob pattern or explicit file list for the date range.
Uses glob patterns for ranges > 60 days; explicit list otherwise."
(let [days (-> (LocalDate/parse end-date)
(.toEpochDay)
(- (.toEpochDay (LocalDate/parse start-date)))
inc)]
(if (> days 60)
(let [prefix (format "s3://%s/sales-details/%s/" *bucket* entity-type)
sy (-> (LocalDate/parse start-date) .getYear)
ey (-> (LocalDate/parse end-date) .getYear)]
(if (= sy ey)
[(format "%s%d-*.parquet" prefix sy)]
(vec
(for [y (range sy (inc ey))]
(format "%s%d-*.parquet" prefix y)))))
(vec
(map (fn [d]
(format "'s3://%s/sales-details/%s/%s.parquet'"
*bucket* entity-type d))
(date-seq start-date end-date))))))
(defn parquet-query [entity-type start-date end-date]
"Build SQL to read all parquet files in date range.
Returns map with :sql and :count-sql keys."
(let [globs (parquet-glob entity-type start-date end-date)
use-glob? (some #(.endsWith ^String % "*.parquet") globs)
base (if use-glob?
(format "SELECT * FROM read_parquet(%s, union_by_name=true)"
(if (= (count globs) 1)
(format "'%s'" (first globs))
(format "[%s]"
(str/join ", " (map #(format "'%s'" %) globs)))))
(format "SELECT * FROM read_parquet([%s])"
(str/join ", " globs)))
add-date-filter (fn [sql]
(if (> (-> (LocalDate/parse end-date)
(.toEpochDay)
(- (.toEpochDay (LocalDate/parse start-date)))
inc)
60)
(format "%s WHERE date >= '%s' AND date <= '%s'"
sql start-date end-date)
sql))
sql (add-date-filter base)]
{:sql sql
:count-sql (format "SELECT COUNT(*) FROM (%s) t" sql)}))
(defn- like-clause [col v]
(str "\"" col "\" LIKE '%" v "%'"))
(defn- build-sales-orders-where [opts]
(let [eq-clauses (keep
(fn [[key col]]
(let [v (get opts key)]
(when v
(str "\"" col "\" = '" v "'"))))
[[:client "client-code"]
[:vendor "vendor"]
[:location "location"]])
like-clauses (keep
(fn [[key col]]
(let [v (get opts key)]
(when v
(like-clause col v))))
[[:payment-method "payment-methods"]
[:processor "processors"]
[:category "categories"]])
range-clauses (keep
(fn [[key col op]]
(let [v (get opts key)]
(when v
(str "\"" col "\" " op " " v))))
[[:total-gte "total" ">="]
[:total-lte "total" "<="]])
all-clauses (concat eq-clauses like-clauses range-clauses)]
(when (seq all-clauses)
(str " WHERE " (str/join " AND " all-clauses)))))
(defn get-sales-orders
([start-date end-date]
(get-sales-orders start-date end-date {}))
([start-date end-date opts]
(try
(let [q (parquet-query "sales-order"
start-date end-date)
base-sql (:sql q)
count-sql (:count-sql q)
sort (get opts :sort "date")
order (get opts :order "DESC")
limit (get opts :limit)
offset (get opts :offset)
where-str (build-sales-orders-where opts)
full-sql (if where-str
(str base-sql where-str)
base-sql)
result (cond-> full-sql
sort (str " ORDER BY " sort
" " (name order))
limit (str " LIMIT " limit)
offset (str " OFFSET " offset))
full-count (if where-str
(str count-sql where-str)
count-sql)]
{:rows (query-rows result)
:count (or
(int
(query-scalar
full-count)) 0)})
(catch Exception _
{:rows [] :count 0}))))
(defn get-sales-orders-summary
([start-date end-date]
(get-sales-orders-summary start-date end-date {}))
([start-date end-date opts]
(try
(let [q (parquet-query "sales-order" start-date end-date)
base-sql (:sql q)
where-str (build-sales-orders-where opts)
full-sql (if where-str
(str base-sql where-str)
base-sql)
sum-sql (format "SELECT COALESCE(SUM(total), 0) as total, COALESCE(SUM(tax), 0) as tax FROM (%s) t" full-sql)
row (first (query-rows sum-sql))]
{:total (or (:total row) 0.0)
:tax (or (:tax row) 0.0)})
(catch Exception _
{:total 0.0 :tax 0.0}))))
(defn query-deduped [entity-type start-date end-date]
"Query records deduplicated by external-id (latest _seq_no wins)."
(let [q (parquet-query entity-type start-date end-date)]
(query-rows
(str (:sql q)
" QUALIFY ROW_NUMBER() OVER"
" (PARTITION BY sales_order.external_id"
" ORDER BY _seq_no DESC) = 1"))))
(defn query-by-entity-id [entity-type external-id
start-date end-date]
(->> (query-deduped entity-type start-date end-date)
(filter #(= (:external_id %)
(name external-id)))
first))
(defn count-records-in-parquet
[entity-type start-date end-date]
(let [q (parquet-query entity-type
start-date end-date)]
(or (int (query-scalar (:count-sql q))) 0)))