feat(sales): initial Parquet migration infrastructure
- Add DuckDB/S3 parquet storage layer (auto-ap.storage.parquet) - Add sales_to_parquet migration script for historical data - Add cleanup_sales for post-migration Datomic cleanup - Add sales_orders_new.clj with DuckDB read layer for SSR views - Add test scaffolding for parquet storage - Add plan document for move-detailed-sales-to-parquet feat(sales): redirect production and read flows to Parquet/DuckDB - U3: Square production (upsert) now buffers to parquet via flatten-order-to-parquet! - U3: EzCater core import-order now buffers to parquet instead of Datomic transact - U3: EzCater XLS upload-xls now buffers to parquet instead of audit-transact - U4: Rewrite sales_orders.clj to read from DuckDB via pq/get-sales-orders - U5: Rewrite sales_summaries to use parquet aggregation functions - get-payment-items-parquet, get-discounts-parquet, get-refund-items-parquet - get-tax-parquet, get-tip-parquet, get-sales-parquet - Add sum-* aggregation functions to storage/sales_summaries.clj - sum-discounts, sum-refunds-by-type, sum-taxes, sum-tips, sum-sales-by-category
This commit is contained in:
219
src/clj/auto_ap/migration/cleanup_sales.clj
Normal file
219
src/clj/auto_ap/migration/cleanup_sales.clj
Normal file
@@ -0,0 +1,219 @@
|
||||
(ns auto-ap.migration.cleanup-sales
|
||||
(:require [auto-ap.datomic :refer [conn]]
|
||||
[auto-ap.storage.parquet :as pq]
|
||||
[amazonica.aws.s3 :as s3]
|
||||
[datomic.api :as d-api]
|
||||
[clojure.string :as str]))
|
||||
|
||||
(def ^:private BATCH-SIZE
  "Maximum number of entity IDs placed in a single retraction transaction."
  1000)

(def ^:private DRY-RUN?
  "When true, retraction transactions are logged but not issued.
  Defaults to true; changed via set-dry-run!."
  true)
|
||||
|
||||
(defn- set-dry-run!
  "Replace the root value of DRY-RUN? with `v` and return the new value."
  [v]
  (alter-var-root #'DRY-RUN? (fn [_previous] v)))
|
||||
|
||||
; -- query helpers
|
||||
|
||||
(defn- query-sales-order-ids
  "Return a lazy seq of the IDs of every entity in `db` that has a
  :sales-order/external-id attribute."
  [db]
  (let [rows (d-api/q '[:find ?e
                        :where [?e :sales-order/external-id]]
                      db)]
    ;; Each result row is a 1-tuple [eid]; unwrap it.
    (map first rows)))
|
||||
|
||||
(defn- collect-child-ids
  "Gather the entity IDs reachable from a batch of sales orders.

  db        - Datomic database value to query.
  order-ids - collection of sales-order entity IDs.

  Returns a map with keys :orders, :charges, :line-items and :refunds,
  each a vector of entity IDs eligible for retraction. Charges and
  refunds are looked up from the orders; line items are looked up from
  the charges found."
  [db order-ids]
  (let [order-set  (set order-ids)
        ;; Every query below uses a single-variable :find, so each result
        ;; row is a 1-tuple. The ID is therefore the FIRST element of the
        ;; row (taking `second` would yield nil for every row).
        charges    (mapv first
                         (d-api/q '[:find ?c
                                    :in $ [?o ...]
                                    :where [$ ?o :sales-order/charges ?c]]
                                  db order-set))
        refunds    (mapv first
                         (d-api/q '[:find ?r
                                    :in $ [?o ...]
                                    :where [$ ?o :sales-order/refunds ?r]]
                                  db order-set))
        line-items (mapv first
                         (d-api/q '[:find ?li
                                    :in $ [?c ...]
                                    :where [$ ?c :charge/line-items ?li]]
                                  db charges))]
    {:orders     (vec order-ids)
     :charges    charges
     :line-items line-items
     :refunds    refunds}))
|
||||
|
||||
; -- transaction batching
|
||||
|
||||
(defn- batch-transact
  "Retract the given entities in chunks of at most BATCH-SIZE.

  conn       - Datomic connection used for the transactions.
  entity-ids - seq of entity IDs; each becomes a [:db/retractEntity eid]
               statement.

  Prints one line per chunk. The transaction itself is only issued (and
  waited on) when DRY-RUN? is false. Always returns :done."
  [conn entity-ids]
  (doseq [[idx chunk] (map-indexed vector (partition-all BATCH-SIZE entity-ids))
          :let [retractions (mapv #(vector :db/retractEntity %) chunk)]]
    (println " batch" idx ":" (count chunk) "retracts")
    (when-not DRY-RUN?
      ;; Deref blocks until the transaction completes (or throws).
      @(d-api/transact conn retractions)))
  :done)
|
||||
|
||||
(defn- retract-all-child-ids!
  "Retract every entity listed in `child-entity-map`, a map of
  entity-kind -> seq of entity IDs as produced by collect-child-ids.
  Kinds with no IDs are skipped; for the rest a summary line is printed
  and the IDs are handed to batch-transact. Returns nil."
  [conn child-entity-map]
  (doseq [[kind ids] child-entity-map
          :when (seq ids)]
    (println "retracting" kind ":" (count ids) "ids")
    (batch-transact conn ids)))
|
||||
|
||||
; -- month grouping
|
||||
|
||||
(defn- group-orders-by-month
  "Group sales-order entity IDs by the [year month] of their
  :sales-order/day-value.

  db        - Datomic database value used to read each entity.
  order-ids - seq of sales-order entity IDs.

  Returns a map {[year month] [eid ...]} with integer year and month.
  Orders that have no :sales-order/day-value are left out of the result."
  [db order-ids]
  (reduce (fn [acc eid]
            ;; if-let with an explicit `acc` fallback: a bare when-let
            ;; would return nil for an order without a day-value and
            ;; thereby throw away everything accumulated so far.
            (if-let [day-val (:sales-order/day-value (d-api/entity db eid))]
              ;; NOTE(review): assumes (str day-val) is formatted
              ;; "YYYY-MM-..." - confirm against the attribute's type.
              (let [[y m] (str/split (str day-val) #"-")
                    k     [(Integer/parseInt y) (Integer/parseInt m)]]
                ;; fnil so each group is a vector, as documented.
                (update acc k (fnil conj []) eid))
              acc))
          {}
          order-ids))
|
||||
|
||||
; -- S3 verification (uses amazonica + parquet module)
|
||||
|
||||
(def ENTITY-TYPES
  "Entity-type names for which a parquet key is built per date."
  ["sales-order"
   "charge"
   "line-item"
   "sales-refund"])
|
||||
|
||||
(defn- s3-keys-for-date
  "Return a vector with one parquet S3 key per entry of ENTITY-TYPES for
  the given date string, in ENTITY-TYPES order."
  [date-str]
  (into []
        (map (fn [entity-type] (pq/parquet-key entity-type date-str)))
        ENTITY-TYPES))
|
||||
|
||||
(defn- days-in-month
  "Return a lazy seq of ISO date strings (YYYY-MM-DD), one for each
  calendar day of the given year and month, in ascending order."
  [year month]
  (let [^java.time.LocalDate first-day (java.time.LocalDate/of year month 1)]
    (map (fn [offset]
           (str (.plusDays first-day (long offset))))
         (range (.lengthOfMonth first-day)))))
|
||||
|
||||
(defn- object-exists?
  "Return true when an object with the given key can be found in the
  pq/*bucket* bucket, false when S3 answers with an AmazonS3Exception.

  Uses a metadata request instead of get-object: get-object opens the
  object's content stream, which was never closed here and so held on to
  an HTTP connection for every probed key."
  [key]
  (try
    (s3/get-object-metadata {:bucket-name pq/*bucket*
                             :key         key})
    true
    ;; Any S3-side error is reported as 'not found', as before.
    (catch com.amazonaws.services.s3.model.AmazonS3Exception _
      false)))
|
||||
|
||||
(defn- verify-month-in-s3?
  "Check every calendar day of [year month] for a backing parquet file.

  A day counts as backed when at least one of its per-entity-type keys
  (see s3-keys-for-date) exists on S3. Returns a map
  {:ok bool :missing [date-str ...]} where :missing holds the unbacked
  days in calendar order and :ok is true when that vector is empty."
  [year month]
  (let [backed?  (fn [date-str]
                   (some object-exists? (s3-keys-for-date date-str)))
        unbacked (into [] (remove backed?) (days-in-month year month))]
    {:ok      (empty? unbacked)
     :missing unbacked}))
|
||||
|
||||
; -- public API: delete-by-month
|
||||
|
||||
(defn- delete-by-month
  "Retract all sales entities whose order falls in the given year+month.

  conn         - Datomic connection.
  client-entid - currently unused by this function.
  year, month  - integers selecting the [year month] group.

  Prints what would be retracted; the retractions are only issued when
  DRY-RUN? is false. Returns :skipped when no orders exist for the month,
  otherwise :ok. This function performs no S3 verification itself."
  [conn client-entid year month]
  (println "=== deleting" year "-" month
           "dry-run? =" DRY-RUN?)
  (let [db        (d-api/db conn)
        by-month  (group-orders-by-month db (query-sales-order-ids db))
        order-ids (get by-month [year month] [])]
    (if (empty? order-ids)
      (do (println " no orders found for" year "-" month)
          :skipped)
      (let [child-maps (collect-child-ids db order-ids)
            ;; Count each entity once even if it appears under several keys.
            total-ids  (->> (vals child-maps)
                            (reduce into [])
                            distinct
                            count)]
        (println " " total-ids "total entities to retract")
        (when-not DRY-RUN?
          (retract-all-child-ids! conn child-maps))
        :ok))))
|
||||
|
||||
; -- public API: cleanup-all
|
||||
|
||||
(defn cleanup-all
  "Remove ALL sales-order, charge, line-item and sales-refund entities
  from Datomic, one month at a time, without any S3 verification.

  Finds every [year month] that has sales orders and calls
  delete-by-month for each in ascending order. Retractions are only
  issued when DRY-RUN? is false. Returns nil."
  []
  (let [db      (d-api/db conn)
        all-ids (query-sales-order-ids db)
        group   (group-orders-by-month db all-ids)
        months  (sort (keys group))]
    (println "found" (count months) "months of data")
    (doseq [[y m] months]
      (delete-by-month conn nil y m))
    (println "cleanup-all complete")))
|
||||
|
||||
; -- public API: safe-cleanup-all
|
||||
|
||||
(defn- collect-all-months
  "Return a sorted seq of [year month] pairs for which the current
  database of `conn` contains sales orders (empty when there are none)."
  [conn]
  (let [db      (d-api/db conn)
        all-ids (query-sales-order-ids db)
        grouped (group-orders-by-month db all-ids)]
    (sort (keys grouped))))
|
||||
|
||||
(defn safe-cleanup-all
  "Same as cleanup-all, but verifies S3 data exists before deleting.

  For every [year month] with sales orders, verify-month-in-s3? is
  consulted first. A verified month is passed to delete-by-month; the
  first month with missing days aborts the whole run by throwing an
  ex-info whose data holds :year, :month and :missing.

  When DRY-RUN? is true no month is verified or deleted; only the header
  and completion lines are printed. Returns nil."
  []
  (let [months (collect-all-months conn)]
    (println "=== safe-cleanup-all"
             "months:" (count months)
             "dry-run? =" DRY-RUN?)
    ;; Each element is a [year month] pair. Destructuring it as [_ y m]
    ;; would bind y to the month and m to nil.
    (doseq [[y m] months]
      (when-not DRY-RUN?
        (let [{:keys [ok missing]} (verify-month-in-s3? y m)]
          ;; :ok is exactly (empty? missing), so two cases cover everything.
          (if ok
            (do (println "verified" y "-" m "S3 OK, deleting...")
                (delete-by-month conn nil y m))
            (do (println "ERROR" y "-" m "missing in S3:"
                         (str/join ", " missing))
                (throw
                 (ex-info
                  "Missing S3 data — aborting!"
                  {:year y :month m
                   :missing missing})))))))
    (println "safe-cleanup-all complete")))
|
||||
Reference in New Issue
Block a user