sync: auto-sync from GURU-5070 at 2026-06-11 08:29:58
Author: Mike Swanson Machine: GURU-5070 Timestamp: 2026-06-11 08:29:58
This commit is contained in:
@@ -432,6 +432,20 @@ def jaccard(a: set[str], b: set[str]) -> float:
|
||||
return inter / union if union else 0.0
|
||||
|
||||
|
||||
# Suffixes that denote an INTENTIONAL current/archive split (e.g. project_cascades
|
||||
# + project_cascades_history). These are deliberately separate files — current
|
||||
# state vs on-demand detail, cross-linked in frontmatter — NOT duplicates. They
|
||||
# must not be flagged as merge candidates.
|
||||
# Consumed by strip_archive_suffix(), which tests each entry with str.endswith
# in tuple order and removes only the first one that matches.
ARCHIVE_SUFFIXES = ("_history", "_archive", "_detail", "_log", "_rationale")
|
||||
|
||||
|
||||
def strip_archive_suffix(slug: str) -> str:
    """Return ``slug`` with its trailing archive suffix removed, if present.

    The entries of ``ARCHIVE_SUFFIXES`` are tried in order and only the first
    one that ``slug`` ends with is cut off. A slug that ends with none of them
    is returned unchanged.
    """
    # First suffix (in declaration order) that the slug ends with, else None.
    matched = next(
        (tail for tail in ARCHIVE_SUFFIXES if slug.endswith(tail)),
        None,
    )
    if matched is None:
        return slug
    return slug[: -len(matched)]
|
||||
|
||||
|
||||
def cluster_overlaps(mems: list[Memory], threshold: float = 0.34):
|
||||
"""
|
||||
Within each type, find pairs with token-overlap >= threshold, then union
|
||||
@@ -466,6 +480,7 @@ def cluster_overlaps(mems: list[Memory], threshold: float = 0.34):
|
||||
parent[rx] = ry
|
||||
|
||||
files = [m.filename for m in group]
|
||||
slug_of = {m.filename: m.slug for m in group}
|
||||
slug_prefix = {}
|
||||
for m in group:
|
||||
parts = m.slug.split("_")
|
||||
@@ -480,6 +495,11 @@ def cluster_overlaps(mems: list[Memory], threshold: float = 0.34):
|
||||
and len(slug_prefix[fi].split("_")) >= 2
|
||||
)
|
||||
if sim >= threshold or same_prefix:
|
||||
# Don't flag intentional current/archive splits (X + X_history):
|
||||
# deliberately separate files, cross-linked in frontmatter, not dupes.
|
||||
si, sj = slug_of[fi], slug_of[fj]
|
||||
if si != sj and strip_archive_suffix(si) == strip_archive_suffix(sj):
|
||||
continue
|
||||
union(fi, fj)
|
||||
|
||||
groups: dict[str, list[str]] = {}
|
||||
|
||||
Reference in New Issue
Block a user