Files
integreat/src/clj/auto_ap/migration/cleanup_sales.clj
Bryce ea7f46ea8a fix(sales): fix parquet SQL generation and cleanup formatting
- Fix double ORDER BY in sales_orders raw-graphql-ids (was passing full
  ORDER BY clause from build-sort-clause into get-sales-orders which
  prepends its own ORDER BY, producing 'ORDER BY ORDER BY ... DESC DESC')
- Fix WHERE clause column names in parquet build-where-clause:
  external_id.client -> client-code, external_id.vendor -> vendor
- Fix parquet-query format string (%%s -> %s with proper format call)
- Fix ex-info call signature in flush! (was passing :error as third arg
  instead of inside the data map)
- Add S3 credentials to DuckDB connect! so httpfs can read from S3
- Fix parquet buffer indentation and alignment across square/core3,
  ezcater/core, ezcater_xls, payments, sales_summaries, migrations
- Fix broken Datomic query syntax in ezcater/core (upsert-used-subscriptions,
  upsert-recent find/where clauses mangled by paren-repair)
- Uncomment accidentally commented code block in square/core3
- Fix paren/indentation issues in ssr/payments, jobs/sales_summaries
2026-04-27 10:33:22 -07:00

220 lines
7.7 KiB
Clojure

(ns auto-ap.migration.cleanup-sales
(:require [auto-ap.datomic :refer [conn]]
[auto-ap.storage.parquet :as pq]
[amazonica.aws.s3 :as s3]
[datomic.api :as d-api]
[clojure.string :as str]))
(def ^:private BATCH-SIZE 1000)
(def ^:private DRY-RUN? true)
(defn- set-dry-run! [v]
(alter-var-root #'DRY-RUN? (constantly v)))
; -- query helpers
(defn- query-sales-order-ids
"Return all entity IDs that have :sales-order/external-id."
[db]
(->> (d-api/q '[:find ?e
:where [?e :sales-order/external-id]]
db)
(map first)))
(defn- collect-child-ids
"Gather child entity IDs for a batch of sales orders. Returns map with
keys :orders, :charges, :line-items, :refunds — each a vector of
entity IDs eligible for retraction."
[db order-ids]
(let [order-set (set order-ids)
charges (->> (d-api/q '[:find ?c
:in $ [?o ...]
:where [$ ?o :sales-order/charges ?c]]
db order-set)
(map second))
refunds (->> (d-api/q '[:find ?r
:in $ [?o ...]
:where [$ ?o :sales-order/refunds ?r]]
db order-set)
(map second))
line-items (->> (d-api/q '[:find ?li
:in $ [?c ...]
:where [$ ?c :charge/line-items ?li]]
db charges)
(map second))]
{:orders order-ids
:charges (vec charges)
:line-items (vec line-items)
:refunds (vec refunds)}))
; -- transaction batching
(defn- batch-transact
"Issue [:db/retractEntity ...] transactions in batches of BATCH-SIZE.
conn$ is a Datomic connection object.
entity-ids should be a seq of Long entity IDs."
[conn entity-ids]
(let [batches (partition-all BATCH-SIZE entity-ids)
_ (doseq [[idx batch] (map-indexed vector batches)]
(let [n (count batch)
txes (map (fn [eid]
[:db/retractEntity eid])
batch)]
(println " batch" idx ":" n "retracts")
(when-not DRY-RUN?
@(d-api/transact conn txes))))]
:done))
(defn- retract-all-child-ids!
"Retract orders, charges, line-items and refunds from all entity-ID
maps produced by collect-child-ids. Logs progress every batch."
[conn child-entity-map]
(doseq [[type id-seq] child-entity-map]
(when (seq id-seq)
(println "retracting" type ":" (count id-seq) "ids")
(batch-transact conn id-seq))))
; -- month grouping
(defn- group-orders-by-month
"Group sales order entity IDs by [year month] extracted from
:sales-order/day-value. Returns map {{y m} [eid ...]}."
[db order-ids]
(reduce (fn [acc eid]
(when-let [day-val (:sales-order/day-value
(d-api/entity db eid))]
(let [[y m _] (str/split (str day-val) #"-")
k [(Integer/parseInt y)
(Integer/parseInt m)]]
(update acc k conj eid))))
{}
order-ids))
; -- S3 verification (uses amazonica + parquet module)
(def ENTITY-TYPES ["sales-order" "charge"
"line-item" "sales-refund"])
(defn- s3-keys-for-date
"Build S3 parquet keys for all entity types on a given date."
[date-str]
(mapv #(pq/parquet-key % date-str) ENTITY-TYPES))
(defn- days-in-month
"Return seq of YYYY-MM-DD strings for all days in [year month]."
[year month]
(let [start (java.time.LocalDate/of year month 1)
first-of-next (.plusMonths start 1)
diff (.toEpochDay first-of-next)
start-day (.toEpochDay start)]
(for [d (range start-day diff)]
(.toString (java.time.LocalDate/ofEpochDay d)))))
(defn- object-exists?
"Check if an S3 object exists by attempting get-object."
[key]
(try
(s3/get-object {:bucket-name pq/*bucket*
:key key})
true
(catch com.amazonaws.services.s3.model.AmazonS3Exception _
false)))
(defn- verify-month-in-s3?
"Check that every day in [year month] has at least one backing
Parquet file on S3 across all entity types.
Returns a map {:ok bool :missing vec-of-dates}."
[year month]
(let [dates (days-in-month year month)]
(loop [[d & rest] dates
result []]
(if-not d
{:ok (empty? result)
:missing result}
(let [keys (s3-keys-for-date d)
found? (some object-exists? keys)]
(recur rest
(if found?
result
(conj result d))))))))
; -- public API: delete-by-month
(defn- delete-by-month [conn client-entid year month]
"Retract all sales entities for a specific year+month.
Returns :ok on success, :skipped if S3 verification failed."
(println "=== deleting" year "-" month
"dry-run? =" DRY-RUN?)
(let [db (d-api/db conn)
all-ids (query-sales-order-ids db)
group (group-orders-by-month db all-ids)
target-keys (get group [year month] [])]
(if (zero? (count target-keys))
(do (println " no orders found for" year "-" month)
:skipped)
(do
(let [child-maps (collect-child-ids db target-keys)
total-ids (->> child-maps vals
(reduce into [])
distinct
count)]
(println " " total-ids "total entities to retract")
(when-not DRY-RUN?
(retract-all-child-ids! conn child-maps)))
:ok))))
; -- public API: cleanup-all
(defn cleanup-all []
"Remove ALL sales-order, charge, line-item, sales-refund from
Datomic. Uses d-api/transact to issue [:db/retractEntity ...] for
each entity. Iterates over every month found in DB."
(let [db (d-api/db conn)
all-ids (query-sales-order-ids db)
group (group-orders-by-month db all-ids)
months (sort (keys group))]
(println "found" (count months) "months of data")
(doseq [[y m] months]
(delete-by-month conn nil y m))
(println "cleanup-all complete")))
; -- public API: safe-cleanup-all
(defn- collect-all-months [conn]
"Return sorted vec of [year month] pairs with sales orders in DB."
(let [db (d-api/db conn)
all-ids (query-sales-order-ids db)
grouped (group-orders-by-month db all-ids)]
(sort (keys grouped))))
(defn safe-cleanup-all []
"Same as cleanup-all but verifies S3 data exists first.
Before deleting a month's entities, checks that parquet files
exist in auto-ap.storage.parquet bucket under prefix 'sales-details'."
(let [conn$ conn
months (collect-all-months conn)]
(println "=== safe-cleanup-all"
"months:" (count months)
"dry-run? =" DRY-RUN?)
(doseq [[_ y m] months]
(when-not DRY-RUN?
(let [result (verify-month-in-s3? y m)
missing (:missing result)]
(cond
(:ok result)
(do (println "verified" y "-" m "S3 OK, deleting...")
(delete-by-month conn$ nil y m))
(> (count missing) 0)
(do (println "ERROR" y "-" m "missing in S3:"
(str/join ", " missing))
(throw
(ex-info
"Missing S3 data — aborting!"
{:year y :month m
:missing missing})))
:else
(println "SKIPPING" y "-" m "no parquet files")))))
(println "safe-cleanup-all complete")))