Files
claudetools/clients/valleywide/app-modernization/source-analysis/analyze_wiztree.py
Mike Swanson 5359e7c49e feat(valleywide): recover VWP Orders VB6 source from D: backup drive
Recovered Darv's VB6 source for the Valley Wide Plastering Orders
application from the D: backup drive (label "Backup", 8 TB, 5.3 TB used).
This is the first time we've had the actual source — prior session only
had a single frmPayroll.frm from the AD server.

Three project variants identified across two snapshots:
- Full-Project/   (2,129 files, 124 MB) — D:\Office-Estimates\Darv\Full\Project\
- Kingston-Project/ (2,189 files, 130 MB) — D:\Office-Estimates\Darv\Kingston\Project\
- Source/         (170 files, 559 MB)   — D:\Office-Estimates\Darv\Source\ wholesale
- SOURCE-HOLD/    (3 files, 1 MB)       — D:\Office-Estimates\Darv\SOURCE HOLD\

Latest ORDERS_C.vbp date is 2020-06-09 (Kingston snapshot). Production
Orders_10A.exe was live as of April 2024 — open question whether newer
source exists on other backup drives Mike will scan next.

Also includes per-category and per-keyword analysis CSVs from a WizTree
file-list export, plus the analyzer script that produced them
(re-runnable for the next drive's CSV).

VMs (VWIN7-DW.vdi 8.3 GB + XP-for-ORDERS_copy.vdi 2.8 GB), the live
VWP.mdb, and the 393 MB raw WizTree CSV stay on disk only — gitignored.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 17:36:27 -07:00

183 lines
7.1 KiB
Python

"""Analyze a WizTree CSV for VWP app-modernization source-code hunting.
Single streaming pass. Emits:
- per-category file lists (vb6-source, access-db, crystal-reports, installshield,
vm-images, source-archives, rar/zip near Darv)
- all folder paths whose terminal segment contains darv/source/orders/vwp/denali (case-insensitive substring)
- a summary table
Run:
py analyze_wiztree.py <wiztree.csv> <output-dir>
"""
from __future__ import annotations
import csv, os, re, sys
from collections import defaultdict
from pathlib import Path
# Lookup table: lower-case file extension (leading dot included) -> category label.
# Every label except 'archive' is used as-is; 'archive' is only a provisional
# tag that categorize() either upgrades to 'archive-near-darv' or drops.
EXT_CATEGORIES: dict[str, str] = {
    # Visual Basic 6 projects, code files and their binary companions
    '.vbp': 'vb6-project',
    '.vbg': 'vb6-project-group',
    '.vbw': 'vb6-workspace',
    '.frm': 'vb6-form',
    '.bas': 'vb6-module',
    '.cls': 'vb6-class',
    '.ctl': 'vb6-usercontrol',
    '.frx': 'vb6-form-resource',
    '.ctx': 'vb6-usercontrol-resource',
    # Microsoft Access databases
    '.mdb': 'access-mdb',
    '.accdb': 'access-accdb',
    '.mde': 'access-mde',
    '.accde': 'access-accde',
    # Crystal Reports definitions
    '.rpt': 'crystal-report',
    # InstallShield projects and other installer artifacts
    '.ism': 'installshield-project',
    '.isproj': 'installshield-project',
    '.msi': 'installer',
    '.iss': 'inno-setup-script',
    # Virtual machine disk images and configuration
    '.vdi': 'vm-image-vbox',
    '.vmdk': 'vm-image-vmware',
    '.vhd': 'vm-image-vhd',
    '.vhdx': 'vm-image-vhdx',
    '.ova': 'vm-image-ova',
    '.vbox': 'vm-config-vbox',
    # Archives: kept only when ARCHIVE_INTEREST_RE matches the full path
    '.zip': 'archive',
    '.rar': 'archive',
    '.7z': 'archive',
    '.tar': 'archive',
    '.gz': 'archive',
    '.cab': 'archive',
}

# Keywords searched for (as lower-case substrings) in a folder's leaf name.
FOLDER_KEYWORDS: tuple[str, ...] = ('darv', 'source', 'orders', 'vwp', 'denali')

# An archive is reported only when one of its directory components is exactly
# one of the keywords (case-insensitive, delimited by backslashes on both
# sides); everything else is treated as noise.
ARCHIVE_INTEREST_RE = re.compile(r'\\(darv|source|orders|vwp|denali)\\', re.IGNORECASE)
def categorize(path: str) -> str | None:
    """Return the category label for a file path, or None if it is not of interest.

    The extension is taken from the lower-cased path and looked up in
    EXT_CATEGORIES. Paths ending in a backslash are folder rows and are
    never categorized here. Archive extensions are reported as
    'archive-near-darv', and only when ARCHIVE_INTEREST_RE matches the
    path; all other archives yield None.
    """
    lowered = path.lower()
    if lowered.endswith('\\'):
        # Folder row: the caller handles these separately.
        return None

    _, suffix = os.path.splitext(lowered)
    label = EXT_CATEGORIES.get(suffix)
    if label != 'archive':
        # Either an unknown extension (None) or a directly reportable category.
        return label

    # The regex is case-insensitive, so it is applied to the original path.
    return 'archive-near-darv' if ARCHIVE_INTEREST_RE.search(path) else None
def is_folder_row(path: str) -> bool:
    """Tell whether a path denotes a folder, i.e. its last character is a backslash."""
    # Slicing (rather than indexing) keeps the empty string safe: '' != '\\'.
    return path[-1:] == '\\'
def folder_keyword_match(path: str) -> str | None:
    """Return the first FOLDER_KEYWORDS entry contained in the path's last segment.

    Trailing backslashes are removed, the final backslash-separated
    component is lower-cased, and the keywords are tried in the order they
    appear in FOLDER_KEYWORDS. Returns None when no keyword is a substring
    of that component.
    """
    trimmed = path.rstrip('\\')
    # rpartition yields the whole string as the tail when there is no backslash.
    _, _, tail = trimmed.rpartition('\\')
    leaf = tail.lower()
    return next((keyword for keyword in FOLDER_KEYWORDS if keyword in leaf), None)
def _int_field(row: list[str], index: int) -> int:
    """Return row[index] parsed as an int, or 0 when missing, blank or malformed."""
    if index >= len(row):
        return 0
    text = row[index].strip()
    if not text:
        return 0
    try:
        return int(text)
    except ValueError:
        return 0


def _write_csv(csv_out: Path, header: list[str], rows) -> None:
    """Write one report CSV: the header row followed by every row in `rows`.

    `rows` may be any iterable of row sequences (a generator is fine).
    """
    with open(csv_out, 'w', encoding='utf-8', newline='') as fout:
        writer = csv.writer(fout)
        writer.writerow(header)
        writer.writerows(rows)


def main(csv_path: str, out_dir: str) -> int:
    """Scan a WizTree CSV export in one pass and write the analysis reports.

    Each data row is read positionally: column 0 is the path, 1 the size in
    bytes, 3 the modified timestamp, and (for folder rows) 5 and 6 the
    contained file and folder counts.
    NOTE(review): these indices presumably follow WizTree's export order
    (File Name, Size, Allocated, Modified, Attributes, Files, Folders) —
    confirm against the actual CSV header.

    Outputs, all written into `out_dir` (created if necessary):
      - cat_<category>.csv   one per category returned by categorize()
      - folder_<keyword>.csv one per keyword returned by folder_keyword_match()
      - SUMMARY.md           Markdown tables of counts and totals, also
                             echoed to stdout

    Args:
        csv_path: Path of the WizTree CSV export to read.
        out_dir: Directory that receives the report files.

    Returns:
        Always 0 (used as the process exit status).

    Raises:
        OSError: If the input cannot be opened or an output cannot be written.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    # cat -> [(size, modified, path)]
    cat_files: dict[str, list[tuple[int, str, str]]] = defaultdict(list)
    # keyword -> [(size_bytes, files, folders, modified, path)]
    folder_matches: dict[str, list[tuple[int, int, int, str, str]]] = defaultdict(list)
    seen_rows = 0
    seen_files = 0
    seen_folders = 0

    # utf-8-sig drops a leading BOM; undecodable bytes are replaced rather
    # than aborting the scan.
    with open(csv_path, 'r', encoding='utf-8-sig', errors='replace', newline='') as fh:
        header_seen = False
        for row in csv.reader(fh):
            seen_rows += 1
            # Blank lines and single-field lines (e.g. a banner) carry no data.
            if len(row) < 2:
                continue
            if not header_seen:
                if row[0].strip().lower().startswith('file name'):
                    header_seen = True
                    continue
                if row[0].startswith('Generated by'):
                    continue
                # Defensive: no header found, so treat this row as data.
                header_seen = True

            path = row[0]
            size = _int_field(row, 1)
            modified = row[3] if len(row) > 3 else ''

            if is_folder_row(path):
                seen_folders += 1
                kw = folder_keyword_match(path)
                if kw:
                    # Parsed independently so that one malformed count does
                    # not discard the other.
                    files = _int_field(row, 5)
                    folders = _int_field(row, 6)
                    folder_matches[kw].append((size, files, folders, modified, path))
            else:
                seen_files += 1
                cat = categorize(path)
                if cat:
                    cat_files[cat].append((size, modified, path))

    summary_lines = [
        f"# WizTree analysis — {csv_path}",
        "",
        f"Rows: {seen_rows:,} Files: {seen_files:,} Folders: {seen_folders:,}",
        "",
        "## File matches by category",
        "",
        "| Category | Count | Total MB |",
        "|---|---|---|",
    ]
    for cat in sorted(cat_files):
        rows = cat_files[cat]
        total_mb = sum(r[0] for r in rows) / 1024 / 1024
        summary_lines.append(f"| `{cat}` | {len(rows):,} | {total_mb:,.1f} |")
        # Largest first; the sort is stable, so ties keep scan order.
        rows.sort(key=lambda r: r[0], reverse=True)
        _write_csv(
            out / f"cat_{cat}.csv",
            ['size_bytes', 'size_mb', 'modified', 'path'],
            ([size, f"{size/1024/1024:.2f}", mod, path] for size, mod, path in rows),
        )

    summary_lines.extend([
        "",
        "## Folder name matches",
        "",
        "| Keyword | Matching folders | Total GB | Total files inside |",
        "|---|---|---|---|",
    ])
    for kw in sorted(folder_matches):
        rows = folder_matches[kw]
        total_gb = sum(r[0] for r in rows) / 1024 / 1024 / 1024
        total_files = sum(r[1] for r in rows)
        summary_lines.append(f"| `{kw}` | {len(rows):,} | {total_gb:,.1f} | {total_files:,} |")
        # Largest first; the sort is stable, so ties keep scan order.
        rows.sort(key=lambda r: r[0], reverse=True)
        _write_csv(
            out / f"folder_{kw}.csv",
            ['size_bytes', 'size_gb', 'files_inside', 'folders_inside', 'modified', 'path'],
            (
                [size, f"{size/1024/1024/1024:.2f}", files, folders, mod, path]
                for size, files, folders, mod, path in rows
            ),
        )

    summary_text = '\n'.join(summary_lines)
    summary_path = out / 'SUMMARY.md'
    summary_path.write_text(summary_text, encoding='utf-8')
    # Echo the text already in memory instead of re-reading the file.
    print(summary_text)
    print(f"\n[OK] Per-category and per-keyword CSVs written to: {out}")
    return 0
if __name__ == '__main__':
    # Script entry point: requires the input CSV and the output directory as
    # the first two arguments; any further arguments are ignored.
    argv = sys.argv
    if len(argv) >= 3:
        sys.exit(main(argv[1], argv[2]))
    # Too few arguments: show usage on stderr and exit with status 2.
    print("Usage: py analyze_wiztree.py <wiztree.csv> <output-dir>", file=sys.stderr)
    sys.exit(2)