sync: auto-sync from GURU-5070 at 2026-06-11 08:29:58

Author: Mike Swanson
Machine: GURU-5070
Timestamp: 2026-06-11 08:29:58
This commit is contained in:
2026-06-11 08:30:10 -07:00
parent 65ad20ae0f
commit d0f90d4023
2 changed files with 35 additions and 0 deletions

View File

@@ -432,6 +432,20 @@ def jaccard(a: set[str], b: set[str]) -> float:
return inter / union if union else 0.0
# Suffixes that denote an INTENTIONAL current/archive split (e.g. project_cascades
# + project_cascades_history). These are deliberately separate files — current
# state vs on-demand detail, cross-linked in frontmatter — NOT duplicates. They
# must not be flagged as merge candidates.
ARCHIVE_SUFFIXES = ("_history", "_archive", "_detail", "_log", "_rationale")


def strip_archive_suffix(slug: str) -> str:
    """Return *slug* with a single trailing archive suffix removed.

    The suffixes in ``ARCHIVE_SUFFIXES`` are tried in order and only the
    first match is stripped, so ``"a_history_log"`` becomes ``"a_history"``.

    A suffix is stripped only when a non-empty base name remains. A slug
    that consists of nothing but a suffix (e.g. ``"_history"``) is returned
    unchanged; otherwise every suffix-only slug would collapse to ``""`` and
    unrelated slugs would compare as siblings of the same base.

    Args:
        slug: Slug to normalise.

    Returns:
        The slug without its archive suffix, or *slug* itself when no
        suffix applies.
    """
    for suf in ARCHIVE_SUFFIXES:
        # len check: never strip down to an empty base name.
        if len(slug) > len(suf) and slug.endswith(suf):
            return slug[: -len(suf)]
    return slug
def cluster_overlaps(mems: list[Memory], threshold: float = 0.34):
"""
Within each type, find pairs with token-overlap >= threshold, then union
@@ -466,6 +480,7 @@ def cluster_overlaps(mems: list[Memory], threshold: float = 0.34):
parent[rx] = ry
files = [m.filename for m in group]
slug_of = {m.filename: m.slug for m in group}
slug_prefix = {}
for m in group:
parts = m.slug.split("_")
@@ -480,6 +495,11 @@ def cluster_overlaps(mems: list[Memory], threshold: float = 0.34):
and len(slug_prefix[fi].split("_")) >= 2
)
if sim >= threshold or same_prefix:
# Don't flag intentional current/archive splits (X + X_history):
# deliberately separate files, cross-linked in frontmatter, not dupes.
si, sj = slug_of[fi], slug_of[fj]
if si != sj and strip_archive_suffix(si) == strip_archive_suffix(sj):
continue
union(fi, fj)
groups: dict[str, list[str]] = {}