(ns auto-ap.migration.cleanup-sales
  "Migration helpers that retract sales-order data (orders, charges,
  line items, refunds) from Datomic, optionally after checking that Parquet
  files for the affected dates exist on S3.

  Nothing is transacted while DRY-RUN? is true (the default)."
  (:require [auto-ap.datomic :refer [conn]]
            [auto-ap.storage.parquet :as pq]
            [amazonica.aws.s3 :as s3]
            [datomic.api :as d-api]
            [clojure.string :as str]))

;; Number of [:db/retractEntity ...] forms sent per transaction.
(def ^:private BATCH-SIZE 1000)

;; When true, functions only log what they would do; flip with set-dry-run!.
(def ^:private DRY-RUN? true)

(defn- set-dry-run!
  "Replace the root value of DRY-RUN? with v."
  [v]
  (alter-var-root #'DRY-RUN? (constantly v)))

;; -- query helpers

(defn- query-sales-order-ids
  "Return a seq of all entity IDs that have :sales-order/external-id."
  [db]
  (->> (d-api/q '[:find ?e
                  :where [?e :sales-order/external-id]]
                db)
       (map first)))

(defn- collect-child-ids
  "Gather the entity IDs reachable from a batch of sales orders.

  Returns a map with keys :orders, :charges, :line-items and :refunds, each a
  vector of entity IDs eligible for retraction."
  [db order-ids]
  (let [order-set  (set order-ids)
        ;; Each :find ?x query returns a set of 1-tuples, so the entity ID is
        ;; the FIRST element of every result row.
        charges    (->> (d-api/q '[:find ?c
                                   :in $ [?o ...]
                                   :where [?o :sales-order/charges ?c]]
                                 db order-set)
                        (mapv first))
        refunds    (->> (d-api/q '[:find ?r
                                   :in $ [?o ...]
                                   :where [?o :sales-order/refunds ?r]]
                                 db order-set)
                        (mapv first))
        line-items (->> (d-api/q '[:find ?li
                                   :in $ [?c ...]
                                   :where [?c :charge/line-items ?li]]
                                 db charges)
                        (mapv first))]
    {:orders     (vec order-ids)
     :charges    charges
     :line-items line-items
     :refunds    refunds}))

;; -- transaction batching

(defn- batch-transact
  "Issue [:db/retractEntity eid] transactions in batches of BATCH-SIZE.

  conn is a Datomic connection; entity-ids is a seq of entity IDs. Each batch
  is logged; it is only transacted (synchronously) when DRY-RUN? is false.
  Returns :done."
  [conn entity-ids]
  (doseq [[idx batch] (map-indexed vector (partition-all BATCH-SIZE entity-ids))]
    (println " batch" idx ":" (count batch) "retracts")
    (when-not DRY-RUN?
      ;; Deref so the next batch is not submitted before this one completes.
      @(d-api/transact conn (mapv (fn [eid] [:db/retractEntity eid]) batch))))
  :done)

(defn- retract-all-child-ids!
  "Retract every entity listed in child-entity-map (the map produced by
  collect-child-ids), one entity type at a time. Logs a summary per type and
  delegates batching to batch-transact."
  [conn child-entity-map]
  (doseq [[type id-seq] child-entity-map]
    (when (seq id-seq)
      (println "retracting" type ":" (count id-seq) "ids")
      (batch-transact conn id-seq))))

;; -- month grouping

(defn- group-orders-by-month
  "Group sales order entity IDs by [year month] taken from
  :sales-order/day-value. Returns a map {[y m] [eid ...]}.

  Orders without a day value are left out of the result.
  NOTE(review): assumes (str day-value) looks like YYYY-MM-DD; confirm
  against the schema."
  [db order-ids]
  (reduce (fn [acc eid]
            (if-let [day-val (:sales-order/day-value (d-api/entity db eid))]
              (let [[y m] (str/split (str day-val) #"-")
                    k     [(Integer/parseInt y) (Integer/parseInt m)]]
                (update acc k (fnil conj []) eid))
              ;; Keep what has been accumulated so far; returning nil here
              ;; would throw away every previously grouped order.
              acc))
          {}
          order-ids))

;; -- S3 verification (uses amazonica + parquet module)

(def ENTITY-TYPES ["sales-order" "charge" "line-item" "sales-refund"])

(defn- s3-keys-for-date
  "Build the S3 Parquet keys for every entry of ENTITY-TYPES on date-str."
  [date-str]
  (mapv #(pq/parquet-key % date-str) ENTITY-TYPES))

(defn- days-in-month
  "Return a seq of YYYY-MM-DD strings, one per day of [year month]."
  [year month]
  (let [first-day (java.time.LocalDate/of year month 1)
        from      (.toEpochDay first-day)
        ;; Exclusive upper bound: the first day of the following month.
        to        (.toEpochDay (.plusMonths first-day 1))]
    (for [d (range from to)]
      (.toString (java.time.LocalDate/ofEpochDay d)))))

(defn- object-exists?
  "Return true when an object with the given key exists in pq/*bucket*.

  Uses a metadata request so no object content stream is opened (and left
  unclosed). Only an HTTP 404 is reported as false; any other
  AmazonS3Exception is rethrown so that, for example, a permissions problem is
  not mistaken for missing data."
  [key]
  (try
    (s3/get-object-metadata {:bucket-name pq/*bucket* :key key})
    true
    (catch com.amazonaws.services.s3.model.AmazonS3Exception e
      (if (= 404 (.getStatusCode e))
        false
        (throw e)))))

(defn- verify-month-in-s3?
  "Check that every day in [year month] has at least one Parquet object on S3
  among the keys built for ENTITY-TYPES.

  Returns {:ok bool :missing [date-str ...]}, where :missing lists the days
  for which no object was found."
  [year month]
  (let [missing (->> (days-in-month year month)
                     (remove (fn [date-str]
                               (some object-exists? (s3-keys-for-date date-str))))
                     vec)]
    {:ok (empty? missing) :missing missing}))

;; -- delete-by-month

(defn- delete-by-month
  "Retract all sales entities for a specific year+month.

  client-entid is accepted for call-site compatibility but is not used.
  Returns :skipped when no orders exist for that month, :ok otherwise. While
  DRY-RUN? is true the entities are only counted and logged."
  [conn client-entid year month]
  (println "=== deleting" year "-" month "dry-run? =" DRY-RUN?)
  (let [db         (d-api/db conn)
        by-month   (group-orders-by-month db (query-sales-order-ids db))
        target-ids (get by-month [year month] [])]
    (if (empty? target-ids)
      (do (println " no orders found for" year "-" month)
          :skipped)
      (let [child-maps (collect-child-ids db target-ids)
            total-ids  (->> child-maps vals (reduce into []) distinct count)]
        (println " " total-ids "total entities to retract")
        (when-not DRY-RUN?
          (retract-all-child-ids! conn child-maps))
        :ok))))

(defn- collect-all-months
  "Return the sorted [year month] pairs that have sales orders in the DB."
  [conn]
  (let [db (d-api/db conn)]
    (sort (keys (group-orders-by-month db (query-sales-order-ids db))))))

;; -- public API: cleanup-all

(defn cleanup-all
  "Remove ALL sales-order, charge, line-item and sales-refund entities from
  Datomic by calling delete-by-month for every month found in the DB.

  Performs no S3 verification; see safe-cleanup-all for the checked variant."
  []
  (let [months (collect-all-months conn)]
    (println "found" (count months) "months of data")
    (doseq [[y m] months]
      (delete-by-month conn nil y m))
    (println "cleanup-all complete")))

;; -- public API: safe-cleanup-all

(defn safe-cleanup-all
  "Same as cleanup-all, but verifies for each month that Parquet objects exist
  on S3 (see verify-month-in-s3?) before deleting that month's entities.

  Throws ex-info carrying :year, :month and :missing as soon as a month has a
  day without any Parquet object; months already processed stay deleted.
  While DRY-RUN? is true neither verification nor deletion is performed."
  []
  (let [months (collect-all-months conn)]
    (println "=== safe-cleanup-all" "months:" (count months) "dry-run? =" DRY-RUN?)
    ;; months holds [year month] pairs, so destructure exactly two elements.
    (doseq [[y m] months]
      (when-not DRY-RUN?
        (let [{:keys [ok missing]} (verify-month-in-s3? y m)]
          (if ok
            (do (println "verified" y "-" m "S3 OK, deleting...")
                (delete-by-month conn nil y m))
            (do (println "ERROR" y "-" m "missing in S3:" (str/join ", " missing))
                (throw (ex-info "Missing S3 data — aborting!"
                                {:year y :month m :missing missing})))))))
    (println "safe-cleanup-all complete")))