# claudetools/clients/valleywide/app-modernization/source-analysis/size_candidates.py

"""Size up candidate source folders we might want to copy off D:.
Match by exact leaf name (case-insensitive); print the matches per leaf
sorted by size, largest first (at most 10 shown per leaf).
"""
import csv, sys, os
from collections import defaultdict

# Input CSV path: the first command-line argument when one is supplied,
# otherwise the default export path (resolved against the working directory).
if len(sys.argv) > 1:
    CSV = sys.argv[1]
else:
    CSV = 'clients/valleywide/app-modernization/WizTree_20260516172207.csv'

# Folder leaf names (the final path component) that we want sized.
# Entries are lowercase because each candidate leaf is lowercased before the
# membership test.
LEAVES_OF_INTEREST = {
    'vwp_current',
    'vwp_update',
    'vwp_inv',
    'vwp_current_0317',
    'project',
    'source',
    'source hold',
    'kingston',
    'full',
    'recovery',
    'virtualbox',
    'vm_vdi',
    'virtual box',
    'virtual box copy',
    'xp box',
    'darv',
}

# Collect every row of interest from the CSV, grouped by lowercased leaf name.
groups = defaultdict(list)  # leaf -> [(size, files, folders, modified, path)]
skipped_rows = 0  # matching rows dropped because a numeric column failed to parse

# newline='' leaves line-ending handling to the csv module, as the csv
# documentation recommends for file objects passed to csv.reader.
with open(CSV, encoding='utf-8-sig', errors='replace', newline='') as f:
    r = csv.reader(f)
    # Skip the two leading non-data lines. The default argument keeps an empty
    # or truncated file from raising StopIteration here.
    next(r, None)  # banner
    next(r, None)  # header
    for row in r:
        # Columns read below: 0=path, 1=size, 3=modified, 5=files, 6=folders.
        if len(row) < 7:
            continue
        p = row[0]
        # Only paths ending in a backslash are considered (presumably these are
        # the folder rows of the export; file rows have no trailing separator).
        if not p.endswith('\\'):
            continue
        leaf = p.rstrip('\\').rsplit('\\', 1)[-1].lower()
        if leaf not in LEAVES_OF_INTEREST:
            continue
        try:
            sz = int(row[1])
            # Blank file/folder counts are treated as zero.
            files = int(row[5]) if row[5].strip() else 0
            folders = int(row[6]) if row[6].strip() else 0
        except ValueError:
            # Best effort: drop the row, but keep count so it is not silent.
            skipped_rows += 1
            continue
        groups[leaf].append((sz, files, folders, row[3], p))

if skipped_rows:
    print(f'warning: skipped {skipped_rows} matching row(s) with non-numeric '
          f'size/count columns', file=sys.stderr)

# Report: a column header, then one section per leaf with its largest matches.
print(f'{"GB":>8} {"Files":>8} {"Folders":>7}  Modified            Path')
print('-' * 140)
# Order: source code targets first, then VMs
order = ['vwp_current', 'vwp_update', 'vwp_inv', 'vwp_current_0317',
         'project', 'source', 'source hold', 'kingston', 'full', 'recovery',
         'virtualbox', 'vm_vdi', 'virtual box', 'virtual box copy', 'xp box',
         'darv']
# Safety net: a leaf that was matched but is missing from the hand-written
# list above would otherwise never be printed. Append such leaves
# alphabetically after the explicitly ordered ones.
order += sorted(set(groups) - set(order))

max_rows_per_leaf = 10       # rows printed per leaf section
bytes_per_gb = 1024 ** 3     # sizes are shown in units of 1024**3 bytes

for leaf in order:
    # Tuples start with the size, so a reverse sort puts the largest first.
    items = sorted(groups.get(leaf, []), reverse=True)
    if not items:
        continue
    print(f'\n--- leaf "{leaf}" ({len(items)} match{"es" if len(items)!=1 else ""}) ---')
    for sz, files, folders, mod, p in items[:max_rows_per_leaf]:
        print(f'{sz / bytes_per_gb:>8.2f} {files:>8} {folders:>7}  {mod:<19} {p}')
    hidden = len(items) - max_rows_per_leaf
    if hidden > 0:
        # Make the truncation visible; the section header counts all matches.
        print(f'{"":>8} ... {hidden} more not shown')