"""Analyze a WizTree CSV for VWP app-modernization source-code hunting. Single streaming pass. Emits: - per-category file lists (vb6-source, access-db, crystal-reports, installshield, vm-images, source-archives, rar/zip near Darv) - all paths whose terminal segment matches Darv/Source/Orders/VWP/Denali - a summary table Run: py analyze_wiztree.py """ from __future__ import annotations import csv, os, re, sys from collections import defaultdict from pathlib import Path EXT_CATEGORIES = { # VB6 source '.vbp': 'vb6-project', '.vbg': 'vb6-project-group', '.vbw': 'vb6-workspace', '.frm': 'vb6-form', '.bas': 'vb6-module', '.cls': 'vb6-class', '.ctl': 'vb6-usercontrol', '.frx': 'vb6-form-resource', '.ctx': 'vb6-usercontrol-resource', # Access '.mdb': 'access-mdb', '.accdb': 'access-accdb', '.mde': 'access-mde', '.accde': 'access-accde', # Crystal Reports '.rpt': 'crystal-report', # InstallShield / installers '.ism': 'installshield-project', '.isproj': 'installshield-project', '.msi': 'installer', '.iss': 'inno-setup-script', # VMs '.vdi': 'vm-image-vbox', '.vmdk': 'vm-image-vmware', '.vhd': 'vm-image-vhd', '.vhdx': 'vm-image-vhdx', '.ova': 'vm-image-ova', '.vbox': 'vm-config-vbox', # Archives (interesting near Darv only — handled separately) '.zip': 'archive', '.rar': 'archive', '.7z': 'archive', '.tar': 'archive', '.gz': 'archive', '.cab': 'archive', } # Folder leaf name keywords (lower-case substring match) FOLDER_KEYWORDS = ('darv', 'source', 'orders', 'vwp', 'denali') # Archives outside Darv/Source/Orders/VWP context are noise — filter ARCHIVE_INTEREST_RE = re.compile(r'\\(darv|source|orders|vwp|denali)\\', re.IGNORECASE) def categorize(path: str) -> str | None: p = path.lower() # Last char '\' means folder row — handle separately if p.endswith('\\'): return None ext = os.path.splitext(p)[1] cat = EXT_CATEGORIES.get(ext) if cat is None: return None if cat == 'archive': # Archives only interesting if path contains Darv/Source/Orders/VWP/Denali if not ARCHIVE_INTEREST_RE.search(path): return None return 'archive-near-darv' return cat def is_folder_row(path: str) -> bool: return path.endswith('\\') def folder_keyword_match(path: str) -> str | None: # Get terminal segment (strip trailing \, then take last component) p = path.rstrip('\\') leaf = p.rsplit('\\', 1)[-1].lower() for kw in FOLDER_KEYWORDS: if kw in leaf: return kw return None def main(csv_path: str, out_dir: str) -> int: out = Path(out_dir) out.mkdir(parents=True, exist_ok=True) cat_files: dict[str, list[tuple[int, str, str]]] = defaultdict(list) # cat -> [(size, modified, path)] folder_matches: dict[str, list[tuple[int, int, int, str, str]]] = defaultdict(list) # ^ keyword -> [(size_bytes, files, folders, modified, path)] seen_rows = 0 seen_files = 0 seen_folders = 0 with open(csv_path, 'r', encoding='utf-8-sig', errors='replace', newline='') as fh: reader = csv.reader(fh) header_seen = False for row in reader: seen_rows += 1 if not row: continue # Skip donation/banner line if it's not a CSV row if len(row) < 2: continue if not header_seen: if row[0].strip().lower().startswith('file name') or row[0].strip().lower() == 'file name': header_seen = True continue if row[0].startswith('Generated by'): continue # Defensive: maybe first row IS data header_seen = True path = row[0] try: size = int(row[1]) if row[1].strip() else 0 except (ValueError, IndexError): size = 0 modified = row[3] if len(row) > 3 else '' if is_folder_row(path): seen_folders += 1 kw = folder_keyword_match(path) if kw: try: files = int(row[5]) if len(row) > 5 and row[5].strip() else 0 folders = int(row[6]) if len(row) > 6 and row[6].strip() else 0 except (ValueError, IndexError): files, folders = 0, 0 folder_matches[kw].append((size, files, folders, modified, path)) else: seen_files += 1 cat = categorize(path) if cat: cat_files[cat].append((size, modified, path)) # Write per-category CSVs summary_lines = [] summary_lines.append(f"# WizTree analysis — {csv_path}") summary_lines.append(f"") summary_lines.append(f"Rows: {seen_rows:,} Files: {seen_files:,} Folders: {seen_folders:,}") summary_lines.append(f"") summary_lines.append(f"## File matches by category") summary_lines.append(f"") summary_lines.append(f"| Category | Count | Total MB |") summary_lines.append(f"|---|---|---|") for cat in sorted(cat_files.keys()): rows = cat_files[cat] total_mb = sum(r[0] for r in rows) / 1024 / 1024 summary_lines.append(f"| `{cat}` | {len(rows):,} | {total_mb:,.1f} |") # Sort by size descending, write CSV rows.sort(key=lambda r: -r[0]) csv_out = out / f"cat_{cat}.csv" with open(csv_out, 'w', encoding='utf-8', newline='') as fout: w = csv.writer(fout) w.writerow(['size_bytes', 'size_mb', 'modified', 'path']) for size, mod, path in rows: w.writerow([size, f"{size/1024/1024:.2f}", mod, path]) summary_lines.append("") summary_lines.append(f"## Folder name matches") summary_lines.append(f"") summary_lines.append(f"| Keyword | Matching folders | Total GB | Total files inside |") summary_lines.append(f"|---|---|---|---|") for kw in sorted(folder_matches.keys()): rows = folder_matches[kw] total_gb = sum(r[0] for r in rows) / 1024 / 1024 / 1024 total_files = sum(r[1] for r in rows) summary_lines.append(f"| `{kw}` | {len(rows):,} | {total_gb:,.1f} | {total_files:,} |") # Sort by size descending, write CSV rows.sort(key=lambda r: -r[0]) csv_out = out / f"folder_{kw}.csv" with open(csv_out, 'w', encoding='utf-8', newline='') as fout: w = csv.writer(fout) w.writerow(['size_bytes', 'size_gb', 'files_inside', 'folders_inside', 'modified', 'path']) for size, files, folders, mod, path in rows: w.writerow([size, f"{size/1024/1024/1024:.2f}", files, folders, mod, path]) summary_path = out / 'SUMMARY.md' summary_path.write_text('\n'.join(summary_lines), encoding='utf-8') print(summary_path.read_text(encoding='utf-8')) print(f"\n[OK] Per-category and per-keyword CSVs written to: {out}") return 0 if __name__ == '__main__': if len(sys.argv) < 3: print("Usage: py analyze_wiztree.py ", file=sys.stderr) sys.exit(2) sys.exit(main(sys.argv[1], sys.argv[2]))