"""Categorize PASS/FAIL lines across the 14 local VASLOG .DAT samples.

Goal: understand whether the use of plain-decimal vs. E-notation correlates
with file (model) or date, or is randomly distributed.
"""
import os, re

DAT_DIR = r'D:\claudetools\projects\dataforth-dos\datasheet-pipeline\scmvas-hvas-research\samples\vaslog-dat'

# Quoted status field whose value is in E-notation, e.g. "PASS 1.234E+01".
# An optional extra digit after the two exponent digits is captured separately.
RE_PASS_SCI   = re.compile(r'"(PASS|FAIL)\s*(-?\d+\.?\d*E[+-]?\d{2})(\d?)"', re.I)
# Quoted status field whose value is a plain decimal, e.g. "FAIL -0.5".
RE_PASS_PLAIN = re.compile(r'"(PASS|FAIL)\s*(-?\.?\d+\.?\d*)"', re.I)
# Line that starts with two quoted fields, the second being NN-NN-YYYY.
RE_SNDATE     = re.compile(r'^"([^"]+)","(\d{2}-\d{2}-\d{4})"')


def categorize_lines(lines):
    """Classify the stripped, non-empty lines of one .DAT file.

    Args:
        lines: iterable of already-stripped, non-empty text lines.

    Returns:
        A tuple ``(model, sci, plain, other, dates)`` where ``model`` is the
        first line that passes the model heuristic (quotes removed) or
        ``None``, ``sci``/``plain``/``other`` count PASS/FAIL lines by number
        format, and ``dates`` lists the date strings from serial/date lines
        in file order.
    """
    sci = plain = other = 0
    dates = []
    model = None
    for line in lines:
        # Heuristic for the model line: a lone quoted field that is not a
        # status field and has no '0' in its first two characters.
        if (not model
                and line.startswith('"')
                and not line.startswith(('"PASS', '"FAIL'))
                and ',' not in line
                and '0' not in line[1:3]):
            model = line.replace('"', '').strip()
        m_snd = RE_SNDATE.match(line)
        if m_snd:
            dates.append(m_snd.group(2))
            continue
        # Only interested in lines that contain a PASS/FAIL status field
        # (not the SN line).
        if '"PASS' in line or '"FAIL' in line:
            # E-notation is checked first; a line counts once.
            if RE_PASS_SCI.search(line):
                sci += 1
            elif RE_PASS_PLAIN.search(line):
                plain += 1
            else:
                other += 1
    return model, sci, plain, other, dates


def date_range(dates):
    """Return ``'earliest .. latest'`` for NN-NN-YYYY strings, or ``'-'`` if empty.

    Dates are ordered by year first. A plain string sort would compare the
    leading two-digit field first and mis-order dates from different years.
    """
    if not dates:
        return '-'
    # NOTE(review): within a year the remaining "NN-NN" part is compared as
    # written, which is chronological only if the format is MM-DD-YYYY - confirm.
    ordered = sorted(dates, key=lambda d: (d[6:], d[:5]))
    return f'{ordered[0]} .. {ordered[-1]}'


def main():
    """Print one summary row per file found in DAT_DIR."""
    for fn in sorted(os.listdir(DAT_DIR)):
        path = os.path.join(DAT_DIR, fn)
        # os.listdir also yields subdirectories; open() on one would raise.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='latin-1') as f:
            lines = [l.strip() for l in f if l.strip()]
        model, sci, plain, other, dates = categorize_lines(lines)
        total = sci + plain + other
        print(f'{fn:20s}  model={model!r:18s}  total={total:4d}  sci={sci:4d}  plain={plain:4d}  other={other:4d}  dates={date_range(dates)}')


if __name__ == '__main__':
    main()