Recovered Darv's VB6 source for the Valley Wide Plastering Orders application from the D: backup drive (label "Backup", 8 TB, 5.3 TB used). This is the first time we've had the actual source — the prior session only had a single frmPayroll.frm from the AD server.

Three project variants identified across two snapshots:
- Full-Project/ (2,129 files, 124 MB) — D:\Office-Estimates\Darv\Full\Project\
- Kingston-Project/ (2,189 files, 130 MB) — D:\Office-Estimates\Darv\Kingston\Project\
- Source/ (170 files, 559 MB) — D:\Office-Estimates\Darv\Source\ wholesale
- SOURCE-HOLD/ (3 files, 1 MB) — D:\Office-Estimates\Darv\SOURCE HOLD\

Latest ORDERS_C.vbp date is 2020-06-09 (Kingston snapshot). Production Orders_10A.exe was live as of April 2024 — it is an open question whether newer source exists on the other backup drives Mike will scan next.

Also includes per-category and per-keyword analysis CSVs from a WizTree file-list export, plus the analyzer script that produced them (re-runnable for the next drive's CSV).

VMs (VWIN7-DW.vdi 8.3 GB + XP-for-ORDERS_copy.vdi 2.8 GB), the live VWP.mdb, and the 393 MB raw WizTree CSV stay on disk only — gitignored.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
183 lines
7.1 KiB
Python
183 lines
7.1 KiB
Python
"""Analyze a WizTree CSV for VWP app-modernization source-code hunting.
|
|
|
|
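Single streaming pass. Emits:
- per-category file lists, one cat_<category>.csv per matched category (VB6 source,
  Access databases, Crystal Reports, InstallShield/installers, VM images, and
  archives that sit under a folder named Darv/Source/Orders/VWP/Denali)
- folder_<keyword>.csv for every folder whose leaf name contains
  darv/source/orders/vwp/denali (case-insensitive substring match)
- SUMMARY.md with a summary table for both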
|
|
|
|
Run:
|
|
py analyze_wiztree.py <wiztree.csv> <output-dir>
|
|
"""
|
|
from __future__ import annotations
|
|
import csv, os, re, sys
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
# Lookup table: lower-case file extension (including the leading dot) -> the
# category label that names the per-category output file.
EXT_CATEGORIES = {
    # --- Visual Basic 6 projects and source files ---
    '.vbp': 'vb6-project',
    '.vbg': 'vb6-project-group',
    '.vbw': 'vb6-workspace',
    '.frm': 'vb6-form',
    '.bas': 'vb6-module',
    '.cls': 'vb6-class',
    '.ctl': 'vb6-usercontrol',
    '.frx': 'vb6-form-resource',
    '.ctx': 'vb6-usercontrol-resource',
    # --- Microsoft Access databases ---
    '.mdb': 'access-mdb',
    '.accdb': 'access-accdb',
    '.mde': 'access-mde',
    '.accde': 'access-accde',
    # --- Crystal Reports ---
    '.rpt': 'crystal-report',
    # --- InstallShield projects and other installers ---
    '.ism': 'installshield-project',
    '.isproj': 'installshield-project',
    '.msi': 'installer',
    '.iss': 'inno-setup-script',
    # --- Virtual machine images and configs ---
    '.vdi': 'vm-image-vbox',
    '.vmdk': 'vm-image-vmware',
    '.vhd': 'vm-image-vhd',
    '.vhdx': 'vm-image-vhdx',
    '.ova': 'vm-image-ova',
    '.vbox': 'vm-config-vbox',
    # --- Archives: all share one label; whether a given archive is kept
    # --- is decided separately, based on where it lives.
    '.zip': 'archive',
    '.rar': 'archive',
    '.7z': 'archive',
    '.tar': 'archive',
    '.gz': 'archive',
    '.cab': 'archive',
}
|
|
|
|
# Keywords searched for, as lower-case substrings, in a folder's leaf name.
FOLDER_KEYWORDS = ('darv', 'source', 'orders', 'vwp', 'denali')

# An archive is only worth reporting when one of the folders in its path is
# named exactly Darv, Source, Orders, VWP or Denali (any letter case), i.e.
# the name appears between two backslashes. Archives elsewhere are noise.
ARCHIVE_INTEREST_RE = re.compile(
    r'\\(darv|source|orders|vwp|denali)\\',
    flags=re.IGNORECASE,
)
|
|
|
|
|
|
def categorize(path: str) -> str | None:
    """Classify a file path by its extension.

    Args:
        path: Backslash-separated path taken from the input CSV.

    Returns:
        The label from ``EXT_CATEGORIES`` for the path's (case-insensitive)
        extension; ``'archive-near-darv'`` for an archive whose path matches
        ``ARCHIVE_INTEREST_RE``; ``None`` for folder rows (trailing
        backslash), unknown extensions, and archives that do not match.
    """
    lowered = path.lower()
    # A trailing backslash marks a folder row; those are not categorised here.
    if lowered[-1:] == '\\':
        return None

    _, suffix = os.path.splitext(lowered)
    label = EXT_CATEGORIES.get(suffix)
    if label != 'archive':
        # Either a direct category hit or None for an unknown extension.
        return label

    # Archives are reported only when they sit under a folder of interest.
    return 'archive-near-darv' if ARCHIVE_INTEREST_RE.search(path) else None
|
|
|
|
|
|
def is_folder_row(path: str) -> bool:
    """Return True when *path* ends with a backslash, which marks a folder row."""
    # Slicing (rather than indexing) keeps the empty string safe: '' -> False.
    return path[-1:] == '\\'
|
|
|
|
|
|
def folder_keyword_match(path: str) -> str | None:
    """Find the first keyword contained in the last segment of *path*.

    Trailing backslashes are removed, the final backslash-separated component
    is lower-cased, and ``FOLDER_KEYWORDS`` is scanned in order.

    Returns:
        The first keyword that occurs as a substring of that component, or
        ``None`` when no keyword matches.
    """
    trimmed = path.rstrip('\\')
    leaf_name = trimmed.rpartition('\\')[2].lower()
    return next((word for word in FOLDER_KEYWORDS if word in leaf_name), None)
|
|
|
|
|
|
def _int_field(row: list[str], index: int) -> int:
    """Parse ``row[index]`` as an int; a missing, blank or non-numeric cell gives 0."""
    if index >= len(row):
        return 0
    try:
        return int(row[index])
    except ValueError:
        return 0


def _write_csv(csv_out: Path, header: list[str], rows) -> None:
    """Write *header* followed by *rows* to *csv_out* as UTF-8 CSV."""
    with open(csv_out, 'w', encoding='utf-8', newline='') as fout:
        writer = csv.writer(fout)
        writer.writerow(header)
        writer.writerows(rows)


def main(csv_path: str, out_dir: str) -> int:
    """Scan a WizTree CSV export once and write the analysis files.

    Column positions used per row: 0 = path, 1 = size in bytes, 3 = modified
    timestamp, 5 = file count, 6 = folder count (the last two are read for
    folder rows only).
    NOTE(review): presumably this is WizTree's standard export column layout;
    confirm against a real export.

    Outputs written into *out_dir* (created if needed):
      - ``cat_<category>.csv`` for each file category that had matches,
      - ``folder_<keyword>.csv`` for each keyword that matched a folder name,
      - ``SUMMARY.md``, which is also echoed to stdout.

    Args:
        csv_path: Path of the CSV file to read.
        out_dir: Directory that receives the output files.

    Returns:
        Always 0 (used as the process exit code).
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    # cat -> [(size, modified, path)]
    cat_files: dict[str, list[tuple[int, str, str]]] = defaultdict(list)
    # keyword -> [(size_bytes, files, folders, modified, path)]
    folder_matches: dict[str, list[tuple[int, int, int, str, str]]] = defaultdict(list)

    seen_rows = 0
    seen_files = 0
    seen_folders = 0

    with open(csv_path, 'r', encoding='utf-8-sig', errors='replace', newline='') as fh:
        header_seen = False
        for row in csv.reader(fh):
            seen_rows += 1
            # Blank lines and single-field lines (e.g. a banner) are not data.
            if len(row) < 2:
                continue
            if not header_seen:
                first_cell = row[0]
                if first_cell.strip().lower().startswith('file name'):
                    header_seen = True
                    continue
                if first_cell.startswith('Generated by'):
                    continue
                # Defensive: no header found, so treat this row as data.
                header_seen = True

            path = row[0]
            size = _int_field(row, 1)
            modified = row[3] if len(row) > 3 else ''

            if is_folder_row(path):
                seen_folders += 1
                kw = folder_keyword_match(path)
                if kw:
                    # Each count is parsed on its own so that one malformed
                    # cell does not discard the other, valid one.
                    files = _int_field(row, 5)
                    folders = _int_field(row, 6)
                    folder_matches[kw].append((size, files, folders, modified, path))
            else:
                seen_files += 1
                cat = categorize(path)
                if cat:
                    cat_files[cat].append((size, modified, path))

    summary_lines = [
        f"# WizTree analysis — {csv_path}",
        "",
        f"Rows: {seen_rows:,} Files: {seen_files:,} Folders: {seen_folders:,}",
        "",
        "## File matches by category",
        "",
        "| Category | Count | Total MB |",
        "|---|---|---|",
    ]

    # One CSV per category, largest files first.
    for cat in sorted(cat_files):
        rows = cat_files[cat]
        total_mb = sum(r[0] for r in rows) / 1024 / 1024
        summary_lines.append(f"| `{cat}` | {len(rows):,} | {total_mb:,.1f} |")
        rows.sort(key=lambda r: -r[0])
        _write_csv(
            out / f"cat_{cat}.csv",
            ['size_bytes', 'size_mb', 'modified', 'path'],
            ([size, f"{size/1024/1024:.2f}", mod, path] for size, mod, path in rows),
        )

    summary_lines.extend([
        "",
        "## Folder name matches",
        "",
        "| Keyword | Matching folders | Total GB | Total files inside |",
        "|---|---|---|---|",
    ])

    # One CSV per keyword, largest folders first.
    for kw in sorted(folder_matches):
        rows = folder_matches[kw]
        total_gb = sum(r[0] for r in rows) / 1024 / 1024 / 1024
        total_files = sum(r[1] for r in rows)
        summary_lines.append(f"| `{kw}` | {len(rows):,} | {total_gb:,.1f} | {total_files:,} |")
        rows.sort(key=lambda r: -r[0])
        _write_csv(
            out / f"folder_{kw}.csv",
            ['size_bytes', 'size_gb', 'files_inside', 'folders_inside', 'modified', 'path'],
            (
                [size, f"{size/1024/1024/1024:.2f}", files, folders, mod, path]
                for size, files, folders, mod, path in rows
            ),
        )

    summary_text = '\n'.join(summary_lines)
    summary_path = out / 'SUMMARY.md'
    summary_path.write_text(summary_text, encoding='utf-8')
    # Print the text already in memory rather than re-reading the file.
    print(summary_text)
    print(f"\n[OK] Per-category and per-keyword CSVs written to: {out}")
    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: two positional arguments are required (input CSV and
    # output directory); any further arguments are ignored.
    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print("Usage: py analyze_wiztree.py <wiztree.csv> <output-dir>", file=sys.stderr)
        sys.exit(2)
    # main()'s return value becomes the process exit status.
    sys.exit(main(cli_args[0], cli_args[1]))
|