From d58d1dd76cdc2ec1317efe76c6b3ad0950102a82 Mon Sep 17 00:00:00 2001
From: Mike Swanson <mike@azcomputerguru.com>
Date: Wed, 17 Jun 2026 14:42:46 -0700
Subject: [PATCH] =?UTF-8?q?dataforth(datasheet):=20same-day=20retest=20fai?=
 =?UTF-8?q?thfulness=20=E2=80=94=20exposure=20sweep=20+=20fix=20proposal?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Whole-source sweep (981,716 records / 406,549 serials): 6,515 same-day multi-run
events; DB holds a NON-latest run for 311 (the strictly-greater-date conflict rule
freezes on an arbitrary same-day run). Corrects the verdict doc to flag same-day
retests as a latest-wins faithfulness violation (not benign). Adds the proposed
same-date, data-differs conflict-rule fix (diagnose-only) and the sweep tool.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../CONFLICT-RULE-FIX-PROPOSAL-2026-06-17.md  |  83 ++++++++++++++
 .../PARSING-FIDELITY-VERDICT-2026-06-17.md    |  11 +-
 .../SAMEDAY-RETEST-EXPOSURE-2026-06-17.txt    |  29 +++++
 .../tools/sweep-sameday-retests.js            | 104 ++++++++++++++++++
 4 files changed, 224 insertions(+), 3 deletions(-)
 create mode 100644 projects/dataforth-dos/CONFLICT-RULE-FIX-PROPOSAL-2026-06-17.md
 create mode 100644 projects/dataforth-dos/SAMEDAY-RETEST-EXPOSURE-2026-06-17.txt
 create mode 100644 projects/dataforth-dos/datasheet-pipeline/implementation/tools/sweep-sameday-retests.js

diff --git a/projects/dataforth-dos/CONFLICT-RULE-FIX-PROPOSAL-2026-06-17.md b/projects/dataforth-dos/CONFLICT-RULE-FIX-PROPOSAL-2026-06-17.md
new file mode 100644
index 00000000..0270dffd
--- /dev/null
+++ b/projects/dataforth-dos/CONFLICT-RULE-FIX-PROPOSAL-2026-06-17.md
@@ -0,0 +1,83 @@
+# Proposal — make the DB hold the LATEST test run (same-day retest fix)
+
+**Date:** 2026-06-17 · **Host:** AD2 · **Status:** PROPOSAL — diagnose-only, review before deploying
+**File to change:** `C:\Shares\testdatadb\database\import.js` (repo: `projects/dataforth-dos/database/import.js`)
+**Evidence:** `PARSING-FIDELITY-VERDICT-2026-06-17.md`, `SAMEDAY-RETEST-EXPOSURE-2026-06-17.txt`
+
+## Problem
+
+`test_records` is one row per serial number. On re-import, the `INSERT ... ON CONFLICT (serial_number)` updates only when:
+
+```sql
+WHERE test_records.overall_result = 'FAIL'
+   OR (EXCLUDED.overall_result = 'PASS' AND EXCLUDED.test_date > test_records.test_date)
+```
+
+The date comparison is **strictly greater**, and the `.DAT` serial/date line carries **date only** (no time). So when a unit is tested two or more times on the **same date**, the first same-day run to be imported wins and no later same-day run can replace it. The DB — and therefore the website datasheet — can show a **non-final** run.
+
+This is the documented audit failure mode: same-day runs are usually trim / re-test iterations, and the **last** run is the accepted certificate result.
+
+## Exposure (whole-source sweep, 2026-06-17)
+
+981,716 records parsed across 26,815 `.DAT` files (406,549 serials):
+
+- Same-day multi-run events (distinct values): **6,515** across **5,977 serials**
+- DB already on the latest same-day run: 3,803
+- Superseded by a later-date retest (fine): 984
+- **DB on a non-latest run (the defect): 311**
+- Serial absent from DB (collisions/completeness): 1,417
+
+## Root cause
+
+1. **Strictly-greater date** (`>`) in the conflict `WHERE` — rejects all same-date updates.
+2. **Date-only granularity** — no intra-day timestamp in the `.DAT` to order same-day runs.
+
+## Proposed fix (minimal, guarded)
+
+Allow a same-date PASS to overwrite **only when the data actually differs**, so the last differing same-day run processed wins (imports run in chronological append order, and the live station logs are scanned last — so the last-processed run is the latest):
+
+```sql
+ON CONFLICT (serial_number) DO UPDATE SET
+    log_type = EXCLUDED.log_type,
+    model_number = EXCLUDED.model_number,
+    test_date = EXCLUDED.test_date,
+    test_station = EXCLUDED.test_station,
+    overall_result = EXCLUDED.overall_result,
+    raw_data = EXCLUDED.raw_data,
+    source_file = EXCLUDED.source_file,
+    api_uploaded_at = NULL,
+    forweb_exported_at = NULL
+WHERE test_records.overall_result = 'FAIL'
+   OR (EXCLUDED.overall_result = 'PASS' AND EXCLUDED.test_date > test_records.test_date)
+   OR (EXCLUDED.overall_result = 'PASS' AND EXCLUDED.test_date = test_records.test_date
+       AND EXCLUDED.raw_data IS DISTINCT FROM test_records.raw_data)   -- NEW: latest same-day run wins
+```
+
+The added clause only fires on a genuine same-date data change, so identical re-imports do **not** needlessly clear `api_uploaded_at` (avoids re-push churn).
+
+### Behavior after fix
+
+| Existing | Incoming | Before | After |
+|---|---|---|---|
+| PASS date D | PASS date D, different data | ignored (stale) | **updated → latest run** |
+| PASS date D | PASS date D, identical | ignored | ignored (no churn) |
+| PASS date D | PASS date D+1 | updated | updated (unchanged) |
+| PASS date D+1 | PASS date D | ignored | ignored (unchanged) |
+| FAIL | PASS (any date) | updated | updated (unchanged) |
+
+## Caveats / assumptions
+
+- **Relies on chronological append order** within a `.DAT` and on the live station logs being scanned **last** (they are: `runImport` does HISTLOGS → Recovery → station `TEST_PATH`). If a serial's latest run existed only in HISTLOGS (scanned first) and an older copy in a station log (scanned last), the older copy would win. Rare, but possible. For a hard guarantee, add a monotonic tiebreaker (ingest sequence, or a per-run timestamp if the test program can emit one) — a larger change.
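+- **Re-push impact:** the corrected rows (up to 311, plus any future same-day retests) will clear `api_uploaded_at` and re-upload to Hoffman on the next run. Expected and desired (the website gets the final result), but it is outbound API traffic — run deliberately. Note that the sweep's 311 also counts groups where the DB row carries an *older* date than the multi-run date (e.g. reused serial `1-2` in the exposure report); the new same-date clause does not touch those, so fewer than 311 rows may actually change.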
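+- **Does NOT fix** generic reused serials (`1-1`, `1-2`, …) that collide across different products, nor the 608 staged originals that have no DB record (see `PARSING-FIDELITY-VERDICT-2026-06-17.md`; distinct from the sweep's 1,417 "serial absent" groups). Those are separate items (serial-uniqueness model / ingestion completeness).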
+
+## Stronger alternative (larger migration)
+
+If full per-run archival is required (every test sheet reproducible), replace the `UNIQUE (serial_number)` model with a composite key **`(serial_number, test_date, run_sequence)`** (or store all runs and select the latest at render time). This preserves every run and removes the same-day ambiguity entirely, but is a schema migration + dedupe + render/upload changes — propose separately if desired.
+
+## Rollout (after approval)
+
+1. Apply the `WHERE`-clause change to `database/import.js` (repo copy first, review, then deploy).
+2. Re-run the import so the 311 same-day cases settle on the latest run.
+3. Let the upload path re-push the cleared rows; confirm counts.
+4. Re-run `tools/validate-parsing.js` and `tools/sweep-sameday-retests.js` to confirm same-day violations drop to ~0 (the sweep tool produced the 311 baseline).
diff --git a/projects/dataforth-dos/PARSING-FIDELITY-VERDICT-2026-06-17.md b/projects/dataforth-dos/PARSING-FIDELITY-VERDICT-2026-06-17.md
index 52b16b20..d47d5d07 100644
--- a/projects/dataforth-dos/PARSING-FIDELITY-VERDICT-2026-06-17.md
+++ b/projects/dataforth-dos/PARSING-FIDELITY-VERDICT-2026-06-17.md
@@ -5,7 +5,12 @@
 
 ## Verdict
 
-**Ingestion/parsing is faithful — 0 genuine parse faults across 11,239 comparable records.** Every staged datasheet that has a corresponding DB record and a comparable test run matches on serial, model, date, and the 5 accuracy-test results. The earlier "mismatches" were all explained by retests, reused serials, format variants, or legacy out-of-scope units — not by the parser misreading or mis-segmenting data.
+**Two distinct questions, two answers:**
+
+1. **Is the parser faithful to the `.DAT` record it reads?** YES — 0 genuine parse faults across 11,239 comparable records. Every compared field (serial, model, date, and the 5 accuracy-test results) matches the staged original; no misreads, no mis-segmentation.
+2. **Does each DB row faithfully reproduce the unit's *final* test sheet?** NOT always. The DB is one-row-per-serial, and for units re-tested **on the same calendar date** the conflict rule (strictly-greater date) freezes on an arbitrary same-day run instead of the latest. Whole-source sweep: **311 (serial,date) groups where the DB holds a non-latest same-day run** (see `SAMEDAY-RETEST-EXPOSURE-2026-06-17.txt`). This is a data-model / conflict-rule defect, not a parser fault — fix proposed in `CONFLICT-RULE-FIX-PROPOSAL-2026-06-17.md`.
+
+The remaining staged-sample "mismatches" were explained by legitimate later-date retests (latest-wins working), reused generic serials, VAS format, or legacy out-of-scope units.
 
 ## Method
 
@@ -20,7 +25,7 @@ Compared each staged original `.TXT` (the DOS-station ground truth, written *bef
 |---|---:|---|
 | **Consistent** (SN+model+date+5×error%) | **11,226** | Faithful parse, confirmed |
 | Retest — DB date newer than `.TXT` | 35 | ON-CONFLICT updated DB to a later test (expected) |
-| Retest — same date, stim matches, run differs | 42 | Unit tested twice same day; DB keeps first run (strictly-greater-date rule) |
+| Retest — same date, stim matches, run differs | 42 | **FAITHFULNESS VIOLATION** — unit tested 2+ times same day; DB froze on a non-latest run (strictly-greater-date rule). Staged subset of the 311 whole-source cases. |
 | VAS/single-point format | 5 | No 5-row accuracy block (SCMVAS) — not comparable by this method |
 | Serial collision (generic SN, diff family) | 2 | `1-1`/`1-2` reused across products; unique-on-serial keeps one |
 | **Genuine parse fault** | **0** | — |
@@ -34,7 +39,7 @@ The last 16 suspects were all `SCM5B37K-1530` (K-thermocouple). Their stim value
 ## Two follow-up items (NOT parsing-correctness bugs)
 
 1. **608 staged originals have no DB record** (mostly A-prefix `10xxx` serials, e.g. `A243-1` = `10243-1`, model `5B45-25D`). These exist as staged `.TXT` but are absent from the DB under both decoded and encoded serial. This is an **ingestion-completeness** question (the source `.DAT` for these units appears to be out of the import scan scope, or these are custom `-NND` variants), separate from parsing fidelity. Worth a completeness pass: confirm which `.DAT` paths the importer scans and whether these models' `.DAT` files are present.
-2. **Same-day retests keep the first run.** The `ON CONFLICT` rule updates only when `EXCLUDED.test_date > test_records.test_date` (strictly greater). Two runs on the *same date* leave the DB on the first-imported run, which may differ from the latest staged datasheet. If "latest run wins" is desired, the rule needs a tiebreaker (e.g. `>=` with an import-time or sequence guard). 42 records currently sit on a same-day earlier run.
+2. **Same-day retests don't apply "latest wins" (PRIMARY DEFECT).** The `ON CONFLICT` rule updates only when `EXCLUDED.test_date > test_records.test_date` (strictly greater). For a unit tested 2+ times on one date, the rule freezes on whichever same-day run the import processed first and never advances to the latest — so the DB (and the website cert) can show non-final measured values. Whole-source exposure (981,716 records / 406,549 serials): **6,515 same-day multi-run events across 5,977 serials; the DB holds a non-latest run for 311 of them** (3,803 already on latest, 984 superseded by a later-date retest, 1,417 serial absent). Fix proposed in `CONFLICT-RULE-FIX-PROPOSAL-2026-06-17.md`. Directly audit-relevant: same-day runs are typically trim/re-test iterations and the **last** run is the accepted cert result.
 
 ## How to re-run
 
diff --git a/projects/dataforth-dos/SAMEDAY-RETEST-EXPOSURE-2026-06-17.txt b/projects/dataforth-dos/SAMEDAY-RETEST-EXPOSURE-2026-06-17.txt
new file mode 100644
index 00000000..db12b866
--- /dev/null
+++ b/projects/dataforth-dos/SAMEDAY-RETEST-EXPOSURE-2026-06-17.txt
@@ -0,0 +1,29 @@
+
+========== SAME-DAY RETEST EXPOSURE (whole source) ==========
+Records parsed                       : 981716
+Distinct serials in source           : 406549
+Serial+date with same-day multi-runs : 6515
+Distinct serials affected            : 5977
+
+Of those same-day multi-run (serial,date) groups, the DB row:
+  matches the LATEST same-day run    : 3803
+  does NOT hold the latest run       : 311   <-- faithfulness violations
+  holds an even newer-date test (ok) : 984
+  serial absent from DB              : 1417
+
+Examples (not-latest):
+  4321-1 (2020-07-16, 2 runs): DB sig != latest
+  82001-1 (2012-09-05, 6 runs): DB sig != latest
+  608-55 (2018-01-28, 2 runs): DB sig != latest
+  610-7 (2020-03-05, 2 runs): DB sig != latest
+  1-2: DB date 2017-02-06 < multirun 2021-07-07
+  1-2: DB date 2017-02-06 < multirun 2021-08-19
+  1-2: DB date 2017-02-06 < multirun 2021-08-23
+  1-2: DB date 2017-02-06 < multirun 2021-08-16
+  1-2: DB date 2017-02-06 < multirun 2021-08-22
+  1-2: DB date 2017-02-06 < multirun 2021-08-26
+  1-2: DB date 2017-02-06 < multirun 2017-08-31
+  1-2: DB date 2017-02-06 < multirun 2021-06-23
+  1-2: DB date 2017-02-06 < multirun 2021-08-02
+  1-2: DB date 2017-02-06 < multirun 2021-08-03
+  1-2: DB date 2017-02-06 < multirun 2022-06-22
diff --git a/projects/dataforth-dos/datasheet-pipeline/implementation/tools/sweep-sameday-retests.js b/projects/dataforth-dos/datasheet-pipeline/implementation/tools/sweep-sameday-retests.js
new file mode 100644
index 00000000..cb1f21a9
--- /dev/null
+++ b/projects/dataforth-dos/datasheet-pipeline/implementation/tools/sweep-sameday-retests.js
@@ -0,0 +1,104 @@
+// Whole-source sweep (READ-ONLY): find serials with same-day multi-runs (distinct values)
+// and measure how many the DB does NOT hold the latest run for. Scans the import's .DAT sources.
+const fs = require('fs');
+const path = require('path');
+const db = require('./database/db');
+
+const ROOTS = ['C:/Shares/test/Ate/HISTLOGS'];           // central combined logs; scanned first, before any station LOGS folder
+const STATION_BASE = 'C:/Shares/test';                   // parent directory searched for TS-<n>[L|R] station folders
+
+function datFiles(dir, out) { // Recursively append the path of every file under `dir` whose name ends in .dat (case-insensitive) to `out`; returns `out`.
+  let it = []; try { it = fs.readdirSync(dir, { withFileTypes: true }); } catch { return out; } // unreadable or missing directory: leave `out` unchanged, no error reported
+  for (const e of it) { const p = path.join(dir, e.name); // p = full path of this directory entry
+    if (e.isDirectory()) datFiles(p, out); // descend into subdirectories, sharing the same accumulator
+    else if (/\.dat$/i.test(e.name)) out.push(p); // keep .dat / .DAT files only
+  }
+  return out;
+}
+
+// Signature of a record = its 5 Error(%) values joined with '|' (distinguishes runs); returns null when fewer than 5 result rows are found.
+function recSig(block) { // block: array of record lines (both call sites pass trimmed lines)
+  const errs = [];
+  for (const l of block) {
+    if (/,"(PASS|FAIL)"/.test(l)) { const f = l.split(','); if (f.length >= 5) { errs.push(f[3].trim()); if (errs.length === 5) break; } } // result row: take the 4th comma-separated field, stop after 5 rows. NOTE(review): plain split(',') — a quoted field containing a comma would shift the index
+  }
+  return errs.length === 5 ? errs.join('|') : null; // null = not a comparable 5-row record
+}
+
+(async () => { // main: scan .DAT sources, group run signatures by (serial, date), compare with the DB rows, print (and optionally write) the report
+  // gather files: HISTLOGS, then station LOGS (intended to mirror import order; station = latest). NOTE(review): no Recovery directory is scanned here — confirm this matches the importer's scan order
+  let files = [];
+  for (const r of ROOTS) datFiles(r, files);
+  let stations = [];
+  try { stations = fs.readdirSync(STATION_BASE, { withFileTypes: true }).filter(d => d.isDirectory() && /^TS-\d+[LR]?$/i.test(d.name)).map(d => d.name); } catch {} // station folders named TS-<digits> with optional L/R suffix; a read error leaves the list empty
+  for (const s of stations) datFiles(path.join(STATION_BASE, s, 'LOGS'), files); // only each station's LOGS subfolder is scanned
+  console.log('Scanning ' + files.length + ' .DAT files (' + stations.length + ' stations + HISTLOGS)...');
+
+  // serial -> date -> { sigs:Set of distinct signatures seen, last: signature seen last in scan order }
+  const map = new Map();
+  let recCount = 0, fi = 0;
+  for (const f of files) {
+    fi++; if (fi % 3000 === 0) console.log('  ...' + fi + '/' + files.length + ' files, ' + recCount + ' records'); // progress line every 3000 files
+    let lines; try { lines = fs.readFileSync(f, 'utf8').split('\n'); } catch { continue; } // unreadable file: skipped silently
+    let block = []; // lines accumulated since the previous serial/date line; reset for every file
+    for (let i = 0; i < lines.length; i++) {
+      const t = lines[i].trim();
+      const sd = t.match(/^"(\d+-\d+[A-Za-z]?)","(\d{2}-\d{2}-\d{4})"$/); // serial/date line: "<digits>-<digits>[letter]","NN-NN-NNNN" and nothing else
+      if (sd) {
+        const sig = recSig(block); // signature of the lines that PRECEDE this serial/date line. NOTE(review): correct only if the serial/date line trails its data rows in the .DAT layout — confirm; if it is a header, each signature is attributed to the next record's serial
+        if (sig) {
+          recCount++; // only blocks with a full 5-row signature are counted as records
+          const sn = sd[1]; const [mm,dd,yy] = sd[2].split('-'); const date = `${yy}-${mm}-${dd}`; // date parts are read as MM-DD-YYYY and rebuilt as YYYY-MM-DD so string comparison orders dates
+          let dm = map.get(sn); if (!dm) { dm = new Map(); map.set(sn, dm); }
+          let e = dm.get(date); if (!e) { e = { sigs: new Set(), last: null }; dm.set(date, e); }
+          e.sigs.add(sig); e.last = sig; // "latest" = last signature encountered in scan order (file order, then line order); there is no timestamp to order by
+        }
+        block = [];
+      } else if (t) block.push(t); // non-blank lines accumulate; lines after a file's final serial/date line are never signed
+    }
+  }
+  console.log('Parsed ' + recCount + ' records, ' + map.size + ' distinct serials.');
+
+  // find (serial, date) groups with same-day multi-runs (>=2 distinct sigs on one date); identical repeats do not count
+  const multi = [];   // { sn, date, runs, lastSig }
+  for (const [sn, dm] of map) for (const [date, e] of dm) if (e.sigs.size >= 2) multi.push({ sn, date, runs: e.sigs.size, lastSig: e.last }); // runs = number of DISTINCT signatures, not raw run count
+  console.log('Serials*date with same-day multi-runs (distinct values): ' + multi.length);
+  const multiSerials = new Set(multi.map(m => m.sn));
+  console.log('Distinct serials affected: ' + multiSerials.size);
+
+  // For each, check what the DB holds vs the latest same-day run (read-only SELECTs, 1000 serials per query)
+  const sns = [...multiSerials];
+  const dbMap = new Map();
+  for (let i = 0; i < sns.length; i += 1000) {
+    const rows = await db.query('SELECT serial_number, test_date, raw_data FROM test_records WHERE serial_number = ANY($1)', [sns.slice(i, i+1000)]); // assumes the project db helper resolves to an iterable of row objects — TODO confirm
+    for (const r of rows) dbMap.set(r.serial_number, r); // keyed by serial; a later row with the same serial would replace an earlier one
+  }
+  let notLatest = 0, dbNewer = 0, dbAbsent = 0, dbMatches = 0, examples = []; // examples is capped at 15 entries below
+  for (const m of multi) {
+    const d = dbMap.get(m.sn);
+    if (!d) { dbAbsent++; continue; }
+    const dbDate = d.test_date && d.test_date.toISOString ? d.test_date.toISOString().slice(0,10) : String(d.test_date); // Date -> UTC YYYY-MM-DD, otherwise stringified as-is. NOTE(review): if the driver returns local-midnight Dates, the UTC conversion can shift the day in zones ahead of UTC — confirm
+    if (dbDate > m.date) { dbNewer++; continue; }            // DB has an even later test -> fine
+    if (dbDate < m.date) { notLatest++; if (examples.length<15) examples.push(`${m.sn}: DB date ${dbDate} < multirun ${m.date}`); continue; } // NOTE(review): DB row is from an OLDER date, yet it is counted in the same bucket as same-day mismatches — this is not a same-date tie
+    const dbSig = recSig((d.raw_data||'').split('\n').map(s=>s.trim())); // same date: derive the DB row's signature from raw_data with the same function used for source blocks
+    if (dbSig === m.lastSig) dbMatches++;
+    else { notLatest++; if (examples.length<15) examples.push(`${m.sn} (${m.date}, ${m.runs} runs): DB sig != latest`); } // also reached when dbSig is null (raw_data without 5 result rows)
+  }
+
+  const out = [];
+  const L = s => { out.push(s); console.log(s); }; // print a report line and buffer it for the optional output file
+  L('\n========== SAME-DAY RETEST EXPOSURE (whole source) ==========');
+  L('Records parsed                       : ' + recCount);
+  L('Distinct serials in source           : ' + map.size);
+  L('Serial+date with same-day multi-runs : ' + multi.length);
+  L('Distinct serials affected            : ' + multiSerials.size);
+  L('');
+  L('Of those same-day multi-run (serial,date) groups, the DB row:');
+  L('  matches the LATEST same-day run    : ' + dbMatches);
+  L('  does NOT hold the latest run       : ' + notLatest + '   <-- faithfulness violations');
+  L('  holds an even newer-date test (ok) : ' + dbNewer);
+  L('  serial absent from DB              : ' + dbAbsent);
+  if (examples.length) { L(''); L('Examples (not-latest):'); examples.forEach(x=>L('  '+x)); }
+  if (process.argv[2]) fs.writeFileSync(process.argv[2], out.join('\n')+'\n'); // optional first CLI argument = path to write the report to
+  await db.close(); // not reached if an earlier await throws; the catch below exits the process instead
+})().catch(e => { console.error(e); process.exit(1); }); // any unhandled error: log it and exit with status 1