refactor: overhaul sync script with CLI modes, batch writes, and archive tracking

Add argparse with full/last_n/oldest_percent/hybrid/single sync modes.

Implement batch Firestore writes to reduce API overhead.

Add is_archived flag based on phase_name during sync.

Track last_synced_at on each project for incremental sync.

Improve logging with structured prefixes and worker summaries.

Remove dead code (duplicate date function, sync_single helper).
This commit is contained in:
2026-05-12 23:40:25 -07:00
parent c62de705de
commit 3633923fa7

218
sync.py
View File

@@ -9,7 +9,7 @@ import os
import concurrent.futures import concurrent.futures
import threading import threading
from typing import List, Dict, Any, Optional from typing import List, Dict, Any, Optional
from datetime import datetime from datetime import datetime, timedelta
import pytz import pytz
from dotenv import load_dotenv from dotenv import load_dotenv
load_dotenv() load_dotenv()
@@ -17,6 +17,34 @@ load_dotenv()
# Add the current directory to the Python path so we can import app and models # Add the current directory to the Python path so we can import app and models
sys.path.append(os.path.dirname(os.path.abspath(__file__))) sys.path.append(os.path.dirname(os.path.abspath(__file__)))
def batch_write_to_firestore(db, collection_name: str, documents: List[tuple], batch_size: int = 500):
"""Write documents to Firestore in batches from the main thread.
Args:
db: Firestore client
collection_name: Name of the collection
documents: List of (doc_id, data) tuples
batch_size: Number of documents per batch
"""
collection = db.collection(collection_name)
total = len(documents)
written = 0
for i in range(0, total, batch_size):
batch = documents[i:i + batch_size]
try:
write_batch = db.batch()
for doc_id, data in batch:
ref = collection.document(str(doc_id))
write_batch.set(ref, data)
write_batch.commit()
written += len(batch)
print(f"[BATCH] Wrote {written}/{total} documents")
except Exception as e:
print(f"[ERROR] Batch write failed: {e}")
print(f"[BATCH] Completed writing {written} documents to Firestore")
def convert_to_pacific_time(date_str): def convert_to_pacific_time(date_str):
"""Convert UTC date string to Pacific Time and format as YYYY-MM-DD. """Convert UTC date string to Pacific Time and format as YYYY-MM-DD.
@@ -67,22 +95,6 @@ def extract_domains_from_emails(emails: List[str]) -> List[str]:
return sorted(list(domains)) return sorted(list(domains))
try:
# Parse the UTC datetime
utc_time = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
# Set timezone to UTC
utc_time = utc_time.replace(tzinfo=pytz.UTC)
# Convert to Pacific Time
pacific_time = utc_time.astimezone(pytz.timezone('America/Los_Angeles'))
# Format as YYYY-MM-DD
return pacific_time.strftime('%m/%d/%Y')
except (ValueError, AttributeError) as e:
print(f"[WARN] Date conversion failed for '{date_str}': {e}")
return ''
from models.project_model import ProjectModel from models.project_model import ProjectModel
from filevine_client import FilevineClient from filevine_client import FilevineClient
@@ -111,18 +123,19 @@ def process_project(index: int, total: int, project_data: dict, client: Filevine
p = project_data p = project_data
pid = (p.get("projectId") or {}).get("native") pid = (p.get("projectId") or {}).get("native")
print(f"Working on {pid} ({index}/{total})")
client = get_filevine_client() client = get_filevine_client()
if pid is None: if pid is None:
print(f"[SKIP] Missing projectId for item {index}")
return {} return {}
project_name = p.get("projectName", "")
try: try:
c = client.fetch_client((p.get("clientId") or {}).get("native")) c = client.fetch_client((p.get("clientId") or {}).get("native"))
cs = client.fetch_contacts(pid) cs = client.fetch_contacts(pid)
detail = client.fetch_project_detail(pid) detail = client.fetch_project_detail(pid)
except Exception as e: except Exception as e:
print(f"[WARN] Failed to fetch essential data for {pid}: {e}") print(f"[ERROR] Failed to fetch essential data for project {pid} '{project_name}': {e}")
return {} return {}
defendant_one = next((c.get('orgContact', {}) for c in cs if "Defendant" in c.get('orgContact', {}).get('personTypes', [])), {}) defendant_one = next((c.get('orgContact', {}) for c in cs if "Defendant" in c.get('orgContact', {}).get('personTypes', [])), {})
@@ -171,9 +184,6 @@ def process_project(index: int, total: int, project_data: dict, client: Filevine
# Extract default date # Extract default date
default_date = convert_to_pacific_time(dates_and_deadlines.get("defaultDate")) or '' default_date = convert_to_pacific_time(dates_and_deadlines.get("defaultDate")) or ''
case_filed_date = convert_to_pacific_time(dates_and_deadlines.get("dateCaseFiled")) or '' case_filed_date = convert_to_pacific_time(dates_and_deadlines.get("dateCaseFiled")) or ''
cf = dates_and_deadlines.get("dateCaseFiled")
from pprint import pprint
print(f"CASE FILED {case_filed_date} {cf}")
# Extract motion hearing dates # Extract motion hearing dates
demurrer_hearing_date = convert_to_pacific_time(dates_and_deadlines.get("demurrerHearingDate")) or '' demurrer_hearing_date = convert_to_pacific_time(dates_and_deadlines.get("demurrerHearingDate")) or ''
@@ -219,12 +229,8 @@ def process_project(index: int, total: int, project_data: dict, client: Filevine
# Extract attorney fees and costs # Extract attorney fees and costs
attorney_fees = fees_and_costs.get("totalAttorneysFees") or '' attorney_fees = fees_and_costs.get("totalAttorneysFees") or ''
costs = fees_and_costs.get("totalCosts") or '' costs = fees_and_costs.get("totalCosts") or ''
from pprint import pprint
property_managers = [property_contacts.get('propertyManager1'), property_contacts.get('propertyManager2'), property_contacts.get('propertyManager3'), property_contacts.get('propertyManager4')] property_managers = [property_contacts.get('propertyManager1'), property_contacts.get('propertyManager2'), property_contacts.get('propertyManager3'), property_contacts.get('propertyManager4')]
import itertools
# valid_property_managers = list(itertools.chain(*))
valid_property_managers = [e.get('address').lower() for pm in property_managers if pm and pm.get('emails') for e in pm.get('emails') if e and e.get('address')] valid_property_managers = [e.get('address').lower() for pm in property_managers if pm and pm.get('emails') for e in pm.get('emails') if e and e.get('address')]
print(valid_property_managers)
row = ProjectModel( row = ProjectModel(
@@ -284,79 +290,165 @@ def process_project(index: int, total: int, project_data: dict, client: Filevine
project_url=p.get("projectUrl") or detail.get("projectUrl"), project_url=p.get("projectUrl") or detail.get("projectUrl"),
#property_contacts=property_contacts #property_contacts=property_contacts
viewing_emails = valid_property_managers, viewing_emails = valid_property_managers,
viewing_domains = extract_domains_from_emails(valid_property_managers) viewing_domains = extract_domains_from_emails(valid_property_managers),
last_synced_at=datetime.now(pytz.UTC).isoformat()
) )
# Store the results in Firestore print(f"[{index}/{total}] Saved: {pid} | Matter {row.number} | {project_name}")
from app import db # Import db from app
projects_ref = db.collection("projects")
from pprint import pprint
# pprint([p.get("number"), property_info, new_file_review])
# Add new projects
project_id = row.project_id
if project_id:
projects_ref.document(str(project_id)).set(row.to_dict())
print(f"Finished on {pid} Matter {row.number} ({index}/{total})")
return row.to_dict() return row.to_dict()
except Exception as e: except Exception as e:
print(f"[ERROR] Processing failed for {pid}: {e}") print(f"[ERROR] Failed to process project {pid} '{project_name}': {e}")
import traceback
traceback.print_exc()
return {} return {}
def process_projects_parallel(projects: List[dict], client: FilevineClient, max_workers: int = 9) -> List[Dict[str, Any]]: def process_projects_parallel(projects: List[dict], client: FilevineClient, max_workers: int = 10) -> List[Dict[str, Any]]:
""" """
Process projects in parallel using a worker pool. Process projects in parallel using a worker pool.
Args: Args:
projects: List of project data dictionaries projects: List of project data dictionaries
client: FilevineClient instance client: FilevineClient instance
max_workers: Number of concurrent workers (default 9) max_workers: Number of concurrent workers (default 10)
Returns: Returns:
List of processed project dictionaries List of processed project dictionaries
""" """
# Create a thread pool with specified number of workers
total = len(projects) total = len(projects)
success_count = 0
fail_count = 0
print(f"[WORKERS] Starting parallel processing of {total} projects with {max_workers} workers...")
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers, initializer=worker_init, initargs=(client,)) as executor: with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers, initializer=worker_init, initargs=(client,)) as executor:
# Submit all tasks to the executor
future_to_project = {executor.submit(process_project, indx, total, project, client): project for indx, project in enumerate(projects)} future_to_project = {executor.submit(process_project, indx, total, project, client): project for indx, project in enumerate(projects)}
# Collect results as they complete
results = [] results = []
for future in concurrent.futures.as_completed(future_to_project): for future in concurrent.futures.as_completed(future_to_project):
try: try:
result = future.result() result = future.result()
if result and result.get('ProjectId'):
success_count += 1
else:
fail_count += 1
results.append(result) results.append(result)
except Exception as e: except Exception as e:
print(f"[ERROR] Processing failed: {e}") fail_count += 1
# Add empty dict or handle error appropriately print(f"[ERROR] Worker thread failed: {e}")
results.append({}) results.append({})
print(f"[WORKERS] Completed: {success_count} succeeded, {fail_count} failed, {total} total")
return results return results
def get_oldest_unsynced_projects(db, fraction: float = 0.2) -> List[int]:
"""Get the oldest fraction of projects by last_synced_at from Firestore.
Args:
db: Firestore client
fraction: Fraction of projects to return (default 0.2 = 1/5th)
Returns:
List of project IDs (native) that need syncing
"""
try:
projects_ref = db.collection("projects")
all_docs = list(projects_ref.stream())
total = len(all_docs)
count_to_sync = max(1, int(total * fraction))
# Sort by last_synced_at ascending (empty strings first, then oldest timestamps)
sorted_docs = sorted(all_docs, key=lambda doc: doc.to_dict().get("last_synced_at", ""))
selected_docs = sorted_docs[:count_to_sync]
result_ids = [int(doc.id) for doc in selected_docs if doc.id and doc.id != "None"]
print(f"[SYNC STRATEGY] {total} projects in Firestore, will sync oldest {len(result_ids)} ({fraction*100:.0f}%)")
if selected_docs:
sample = selected_docs[0].to_dict()
print(f"[SYNC STRATEGY] Oldest: ID={result_ids[0]}, last_synced_at='{sample.get('last_synced_at', 'N/A')}'")
if len(selected_docs) > 1:
sample = selected_docs[-1].to_dict()
print(f"[SYNC STRATEGY] Cutoff: ID={result_ids[-1]}, last_synced_at='{sample.get('last_synced_at', 'N/A')}'")
return result_ids
except Exception as e:
print(f"[ERROR] Failed to get oldest unsynced projects: {e}")
import traceback
traceback.print_exc()
return []
def main(): def main():
"""Main function to fetch and sync projects""" """Main function to fetch and sync projects"""
print("Starting project sync...") import argparse
parser = argparse.ArgumentParser(description='Sync Filevine projects to Firestore')
parser.add_argument('--mode', choices=['full', 'last_n', 'oldest_percent', 'hybrid', 'single'],
default='hybrid', help='Sync mode: full=all projects, last_n=recently active, oldest_percent=oldest by last_synced_at, hybrid=last_n+oldest_percent, single=one project')
parser.add_argument('--days', type=int, default=14, help='Number of days for last_n mode (default: 14)')
parser.add_argument('--percent', type=float, default=20.0, help='Percentage for oldest_percent mode (default: 20)')
parser.add_argument('--project-id', type=int, help='Project ID for single mode (required when mode=single)')
args = parser.parse_args()
if args.mode == 'single' and not args.project_id:
parser.error("--project-id is required when mode is 'single'")
print(f"[SYNC] Starting sync - mode={args.mode}, workers=10")
try: try:
# Initialize Filevine client
client = FilevineClient() client = FilevineClient()
bearer = client.get_bearer_token() client.get_bearer_token()
from app import db
# List projects (all pages) with filter for projects updated in the last 7 days if args.mode == 'full':
from datetime import datetime, timedelta print("[MODE] Full sync - fetching all projects")
seven_days_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d') projects = client.list_all_projects()
projects = client.list_all_projects(latest_activity_since=seven_days_ago)
elif args.mode == 'last_n':
days_ago = (datetime.now() - timedelta(days=args.days)).strftime('%Y-%m-%d')
print(f"[MODE] Last {args.days} days - fetching active since {days_ago}")
projects = client.list_all_projects(latest_activity_since=days_ago)
elif args.mode == 'oldest_percent':
fraction = args.percent / 100.0
oldest_ids = get_oldest_unsynced_projects(db, fraction=fraction)
print(f"[MODE] Oldest {args.percent}% - fetching {len(oldest_ids)} projects")
all_projects = client.list_all_projects()
projects = [p for p in all_projects if p.get("projectId", {}).get("native") in set(oldest_ids)]
elif args.mode == 'single':
print(f"[MODE] Single project - fetching project {args.project_id}")
project_detail = client.fetch_project_detail(args.project_id)
projects = [project_detail] if project_detail else []
elif args.mode == 'hybrid':
print("[MODE] Hybrid - active + oldest")
#projects = [p for p in projects if (p.get("projectId") or {}).get("native") == 15914808] days_ago = (datetime.now() - timedelta(days=args.days)).strftime('%Y-%m-%d')
#projects = projects[:10] active_projects = client.list_all_projects(latest_activity_since=days_ago)
active_ids = {p.get("projectId", {}).get("native") for p in active_projects}
print(f"[SYNC] {len(active_projects)} active since {days_ago}")
fraction = args.percent / 100.0
oldest_ids = get_oldest_unsynced_projects(db, fraction=fraction)
all_ids_to_sync = active_ids.union(set(oldest_ids))
print(f"[SYNC] {len(all_ids_to_sync)} total unique to sync")
all_projects = client.list_all_projects()
projects = [p for p in all_projects if p.get("projectId", {}).get("native") in all_ids_to_sync]
# Process projects in parallel # Process projects in parallel
detailed_rows = process_projects_parallel(projects, client, 9) detailed_rows = process_projects_parallel(projects, client, max_workers=10)
# Batch write all results to Firestore
documents = []
for row in detailed_rows:
if row.get('ProjectId'):
row['is_archived'] = (row.get('phase_name') == 'Archived')
documents.append((row.get('ProjectId'), row))
batch_write_to_firestore(db, "projects", documents)
print(f"Successfully synced {len(detailed_rows)} projects to Firestore") print(f"[SYNC] Complete - {len(documents)} projects saved to Firestore")
except Exception as e: except Exception as e:
print(f"Error during sync: {e}") print(f"Error during sync: {e}")
@@ -364,13 +456,5 @@ def main():
traceback.print_exc() traceback.print_exc()
sys.exit(1) sys.exit(1)
def sync_single(x):
client = FilevineClient()
z = process_project(0, 1, client.fetch_project_detail(x), client)
from pprint import pprint
#pprint(z)
if __name__ == "__main__": if __name__ == "__main__":
main() main()