memory-dream: read-only memory lint/consolidation analyzer (index, backlinks, stale refs, dup clusters, profile drift); additive-only --apply-safe, all merges/deletes are proposals. sync-memory.sh: additive repo<->harness-profile union (no delete/overwrite, conflicts surfaced), wired to a SessionStart hook. Migrates the useful profile-only memories into the synced repo store. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
904 lines
33 KiB
Python
904 lines
33 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
memory_dream.py -- memory lint + consolidation analyzer for the ClaudeTools REPO
|
|
memory store (.claude/memory/).
|
|
|
|
ADDITIVE-ONLY by design. The default run is READ-ONLY and mutates nothing.
|
|
The only mutating mode is --apply-safe, which performs ONLY additive,
|
|
non-destructive actions:
|
|
* append missing index lines to MEMORY.md for orphan memory files
|
|
* copy profile-only memory files INTO the repo store (never overwriting)
|
|
It NEVER deletes a file, NEVER removes an index line, NEVER overwrites differing
|
|
content, and NEVER performs a proposed merge. Every destructive idea stays in
|
|
the report as a PROPOSED action for a human to approve.
|
|
|
|
Stdlib only. Python launcher on Windows fleet is `py`; also runs under
|
|
python3/python.
|
|
|
|
Usage:
|
|
py memory_dream.py # REPORT ONLY (default)
|
|
py memory_dream.py --apply-safe # additive-only fixes + report
|
|
py memory_dream.py --no-file # report to stdout only, skip _reports/ file
|
|
py memory_dream.py --report-file X # write report to an explicit path
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import datetime
|
|
import os
|
|
import re
|
|
import shutil
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Windows consoles default to cp1252; memory bodies contain Unicode (arrows,
# em dashes). Force UTF-8 stdout/stderr with replacement so printing never
# crashes regardless of the active code page.
#
# This runs at import time. It is best-effort by design: if a stream cannot
# be reconfigured (for example, it has been replaced by an object without a
# ``reconfigure`` method), the error is ignored and the stream is left as-is.
for _stream in (sys.stdout, sys.stderr):
    try:
        # errors="replace" substitutes unencodable characters instead of raising.
        _stream.reconfigure(encoding="utf-8", errors="replace")
    except Exception:
        pass
|
|
|
|
# --------------------------------------------------------------------------
|
|
# Path resolution -- no hardcoded drive letters.
|
|
# --------------------------------------------------------------------------
|
|
|
|
# Age threshold, in months, for dated claims found in project memories.
STALE_MONTHS = 6  # project facts older than this (in "as of <date>") -> re-verify
|
|
|
|
|
|
def _read_identity_root(repo_guess: Path) -> str | None:
    """
    Best-effort lookup of ``claudetools_root`` in ``.claude/identity.json``.

    Args:
        repo_guess: directory expected to contain a ``.claude`` folder.

    Returns:
        The ``claudetools_root`` value exactly as stored, but only when it is
        truthy and names an existing directory. ``None`` in every other case:
        file missing, unreadable, not valid JSON, not a JSON object, key absent,
        or the directory does not exist.
    """
    import json  # only needed here, so kept function-scoped

    identity_file = repo_guess / ".claude" / "identity.json"
    if identity_file.is_file():
        try:
            payload = json.loads(identity_file.read_text(encoding="utf-8"))
            candidate = payload.get("claudetools_root")
            if candidate and Path(candidate).is_dir():
                return candidate
        except Exception:
            # Deliberately tolerant: any read/parse/shape problem means
            # "no usable identity", never a crash.
            pass
    return None
|
|
|
|
|
|
def resolve_claudetools_root() -> Path:
    """
    Resolve CLAUDETOOLS_ROOT.

    Resolution order:
      1. env CLAUDETOOLS_ROOT, when it names an existing directory
      2. claudetools_root from .claude/identity.json, looked up under the
         nearest ancestor of this script that contains a .claude directory
      3. that ancestor itself
      4. last resort: the fifth ancestor of this file, i.e. ROOT for the layout
         ROOT/.claude/skills/memory-dream/scripts/memory_dream.py

    Returns:
        The resolved absolute path. In case 4 the path is only a guess; when
        the script sits fewer than five levels deep the topmost available
        ancestor is returned instead of raising IndexError.
    """
    env_root = os.environ.get("CLAUDETOOLS_ROOT")
    if env_root and Path(env_root).is_dir():
        return Path(env_root).resolve()

    # Walk up from this file looking for the nearest directory holding .claude.
    here = Path(__file__).resolve()
    derived = next((p for p in here.parents if (p / ".claude").is_dir()), None)

    if derived is not None:
        ident_root = _read_identity_root(derived)
        if ident_root:
            return Path(ident_root).resolve()
        return derived.resolve()

    # Last resort: assume scripts/ -> memory-dream/ -> skills/ -> .claude/ -> ROOT.
    # Clamp the index so a shallow install location cannot raise IndexError.
    ancestors = here.parents
    if len(ancestors) == 0:
        return here
    return ancestors[min(4, len(ancestors) - 1)].resolve()
|
|
|
|
|
|
def profile_memory_dir(repo_root: Path) -> Path | None:
    """
    Derive the harness profile memory dir for this project.

    The project path is CLAUDE_PROJECT_DIR if set, else repo_root. Its resolved
    absolute form is turned into candidate slugs that are looked up under
    $HOME/.claude/projects/<slug>/ (HOME env var, else the expanded "~").

    Exact candidates, tried in order:
      1. run-collapsed slug with a leading "<drive>-" doubled to "<drive>--"
      2. run-collapsed slug (every run of non-alphanumerics -> one '-')
      3. per-character slug (every non-alphanumeric -> its own '-'), which
         also covers adjacent separators away from the drive prefix

    Only when no exact candidate directory exists does a case-insensitive
    tail-scan run: project dirs whose name ends with the slug of
    repo_root.name, at a '-' boundary, and which contain a memory/ subdir.

    Returns:
        <slug dir>/memory for the first exact candidate whose slug dir or
        memory subdir exists. NOTE: if only the slug dir exists, the returned
        memory path may not exist yet. Otherwise the single tail-scan match.
        None when nothing matches or the tail-scan is ambiguous (a warning is
        printed in that case).
    """
    home = Path(os.environ.get("HOME") or os.path.expanduser("~"))
    project_dir = os.environ.get("CLAUDE_PROJECT_DIR") or str(repo_root)
    abspath = str(Path(project_dir).resolve())
    projects_root = home / ".claude" / "projects"

    # The single-dash collapse: replace every run of non-alphanumeric chars with
    # a single '-'. This is the historical/POSIX-style derivation.
    slug_single = re.sub(r"[^A-Za-z0-9]+", "-", abspath)

    # "D:\\claudetools" must become "D--claudetools", but the collapse above
    # yields "D-claudetools"; double a leading "<drive>-" into "<drive>--".
    slug_double = re.sub(r"^([A-Za-z])-", r"\1--", slug_single)

    # One dash per non-alphanumeric character. Identical to slug_double for a
    # plain "D:\\name" path, but also right when separators are adjacent
    # elsewhere in the path (e.g. "/.config/" -> "--config-").
    slug_per_char = re.sub(r"[^A-Za-z0-9]", "-", abspath)

    # dict.fromkeys de-duplicates while keeping priority order. The new
    # per-character slug goes last so previously matching setups are unaffected.
    for slug in dict.fromkeys((slug_double, slug_single, slug_per_char)):
        base = projects_root / slug
        if (base / "memory").is_dir() or base.is_dir():
            # Whether memory/ or only the slug dir matched, the conventional
            # memory subdir is what gets returned.
            return base / "memory"

    # ONLY if none of the exact candidates exist, fall back to the tail-scan.
    # If MORE THAN ONE dir matches, do NOT guess -- report and skip.
    if not projects_root.is_dir():
        return None

    tail = re.sub(r"[^A-Za-z0-9]+", "-", repo_root.name).lower()
    if not tail.strip("-"):
        # An empty/dash-only tail would "match" every project dir.
        return None

    matches: list[Path] = []
    for child in sorted(projects_root.iterdir()):
        if not child.is_dir():
            continue
        name = child.name.lower()
        if not name.endswith(tail):
            continue
        # Require a slug boundary before the tail so that repo "tools" does
        # not match an unrelated project dir such as "x-mytools".
        lead = name[: len(name) - len(tail)]
        if lead and not lead.endswith("-") and not tail.startswith("-"):
            continue
        mem = child / "memory"
        if mem.is_dir():
            matches.append(mem)

    if len(matches) > 1:
        names = ", ".join(str(m.parent.name) for m in matches)
        print(
            f"[WARNING] multiple profile dirs matched ({names}); "
            "skipping profile drift analysis to avoid cross-project contamination"
        )
        return None
    if len(matches) == 1:
        return matches[0]
    return None
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# Frontmatter / memory file parsing
|
|
# --------------------------------------------------------------------------
|
|
|
|
|
|
class Memory:
    """
    One memory markdown file: identity derived from its path plus the fields
    parsed out of an optional '---' fenced frontmatter block.

    Attributes set by the constructor:
        path, filename, slug   -- the Path, its name, and its stem
        name, description,
        type                   -- frontmatter values, or None when absent
                                  (type is lower-cased)
        body                   -- text after the closing fence, or the whole
                                  file when there is no complete frontmatter
    """

    def __init__(self, path: Path):
        self.path = path
        self.filename = path.name
        self.slug = path.stem
        self.name: str | None = None
        self.description: str | None = None
        self.type: str | None = None
        self.body: str = ""
        self._parse()

    def _parse(self) -> None:
        """Read the file and split it into frontmatter fields and body."""
        # utf-8-sig drops a leading byte-order mark. With plain utf-8 the BOM
        # stays glued to the first line, "\ufeff---" != "---", and the whole
        # frontmatter would be silently treated as body.
        text = self.path.read_text(encoding="utf-8-sig", errors="replace")
        lines = text.splitlines()
        if not lines or lines[0].strip() != "---":
            # No frontmatter; whole file is body.
            self.body = text
            return

        # Find the closing fence (first '---' line after the opening one).
        end = next(
            (i for i in range(1, len(lines)) if lines[i].strip() == "---"),
            None,
        )
        if end is None:
            # Unterminated frontmatter: treat everything as body.
            self.body = text
            return

        self.body = "\n".join(lines[end + 1:])
        self._parse_frontmatter(lines[1:end])

    def _parse_frontmatter(self, fm_lines: list[str]) -> None:
        """
        Tolerant YAML-ish parse. Handles:
            name: X
            description: X        (or a folded/literal block following)
            type: X               (top-level)
            metadata:
              type: X             (nested)

        Only name, description and type are stored; other keys are ignored.
        A name key inside the metadata block is ignored as well. Lines that
        do not look like 'key: value' are skipped.
        """
        block_indicators = (">-", ">", ">+", "|", "|-", "|+")
        i = 0
        in_metadata = False
        while i < len(fm_lines):
            line = fm_lines[i].rstrip("\n")
            stripped = line.strip()
            indent = len(line) - len(line.lstrip())

            if not stripped:
                i += 1
                continue

            if stripped == "metadata:":
                in_metadata = True
                i += 1
                continue

            # Detect leaving the metadata block (a top-level key reappears).
            if in_metadata and indent == 0 and ":" in stripped:
                in_metadata = False

            m = re.match(r"^([A-Za-z_][\w\-]*):\s*(.*)$", stripped)
            if not m:
                i += 1
                continue
            key, val = m.group(1), m.group(2)

            if val in block_indicators:
                # Block scalar: gather the following blank or more-indented
                # lines and fold them into one space-joined string.
                block_lines = []
                j = i + 1
                while j < len(fm_lines):
                    nxt = fm_lines[j]
                    nxt_indent = len(nxt) - len(nxt.lstrip())
                    if nxt.strip() == "" or nxt_indent > indent:
                        block_lines.append(nxt.strip())
                        j += 1
                    else:
                        break
                val = " ".join(x for x in block_lines if x)
                i = j
            else:
                # Plain scalar: drop surrounding whitespace and quote chars.
                val = val.strip().strip('"').strip("'")
                i += 1

            if key == "name" and not in_metadata:
                self.name = val
            elif key == "description":
                self.description = val
            elif key == "type":
                # Both top-level and metadata.type land here.
                self.type = (val or "").lower() or None
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# Index (MEMORY.md) parsing
|
|
# --------------------------------------------------------------------------
|
|
|
|
# Markdown inline link "[title](target)": group 1 is the title, group 2 the
# target.
INDEX_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
# Body backlinks like [[some-name]]
BACKLINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
# "as of <date>" style dated claims: one of the listed keywords followed by
# whitespace and an ISO date (group 1). Case-insensitive.
DATE_RE = re.compile(
    r"(?:as of|updated|corrected|lesson|fixed|live)\s+"
    r"(\d{4}-\d{2}-\d{2})",
    re.IGNORECASE,
)
# Any standalone YYYY-MM-DD token (group 1), with or without a keyword.
ISO_DATE_RE = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")

# Type -> index header. Index uses singular headers.
# NOTE(review): "Users" is plural, unlike the other three -- confirm against
# the real MEMORY.md section names.
TYPE_HEADER = {
    "reference": "Reference",
    "feedback": "Feedback",
    "project": "Project",
    "user": "Users",
}
|
|
|
|
|
|
def parse_index(index_path: Path):
    """
    Scan the index file for section headers and bullet links.

    Returns a 3-tuple:
        links:   list of (title, target, lineno, raw_line), one per bullet line
                 ("- ..." after optional indentation); only the first markdown
                 link on the line is taken
        headers: dict mapping each "## " header text to its line number (a
                 repeated header keeps the last occurrence)
        lines:   the file split on "\\n" (no newline characters)

    Line numbers are 0-based. A missing file yields ([], {}, []).
    """
    links = []
    headers = {}
    if not index_path.is_file():
        return links, headers, []

    lines = index_path.read_text(encoding="utf-8", errors="replace").split("\n")
    for lineno, content in enumerate(lines):
        header = re.match(r"^##\s+(.+?)\s*$", content)
        if header is not None:
            headers[header.group(1).strip()] = lineno
        elif content.lstrip().startswith("- "):
            link = INDEX_LINK_RE.search(content)
            if link is not None:
                links.append((link.group(1), link.group(2), lineno, content))
    return links, headers, lines
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# Referenced-artifact extraction (conservative)
|
|
# --------------------------------------------------------------------------
|
|
|
|
# Referenced-artifact extraction is intentionally CONSERVATIVE: it only inspects
# backtick-wrapped spans (`...`) and only treats a span as a repo path when the
# whole span is a single path-like token. We do NOT scan bare prose -- too many
# false positives.
# Group 1 is the text between a pair of backticks on a single line.
PATHISH_RE = re.compile(r"`([^`\n]+?)`")

# Recognized file extensions. EXT_RE anchors the alternation to end-of-token
# with `$`, and that anchor is what keeps `identity.json` from being read as
# `identity.js` (and `.yaml` as `.yml`); the order of the entries below does
# not affect matching.
KNOWN_EXTS = (
    "tsx", "json", "yaml", "toml", "service",
    "py", "sh", "rs", "ts", "js", "md", "yml", "sql", "ps1",
)
EXT_RE = re.compile(r"\.(?:" + "|".join(KNOWN_EXTS) + r")$", re.IGNORECASE)

# Vault-style secret paths live in the SEPARATE vault repo, not claudetools.
# Matches tokens ending in .sops.yaml / .sops.yml (case-insensitive).
VAULT_HINT_RE = re.compile(r"\.sops\.ya?ml$", re.IGNORECASE)

# Tokens we never treat as repo paths: prefixes of server-side absolute paths.
ABS_PREFIXES = ("/api/", "/home/", "/var/", "/opt/", "/etc/", "/tmp/",
                "/proc/", "/dev/", "/data/", "/usr/")
|
|
|
|
|
|
def looks_like_repo_path(token: str) -> bool:
    """
    Decide whether a single token is plausibly a concrete repo file path.

    A token qualifies when, after trimming surrounding whitespace, it:
      * is non-empty,
      * contains none of: space < > * ? : | backslash,
      * does not start with a URL / git / vault style prefix,
      * does not start with one of ABS_PREFIXES,
      * does not match VAULT_HINT_RE,
      * ends in one of the recognized extensions (EXT_RE).

    Bare filenames and slash-separated paths are treated alike. (The original
    ended in a has-slash branch whose two arms both returned True; it is
    removed here without changing any result.)
    """
    token = token.strip()
    if not token:
        return False
    # Reject anything with whitespace, glob/placeholder/url/colon characters --
    # those are descriptions or templates, not concrete repo paths.
    if any(c in token for c in (" ", "<", ">", "*", "?", ":", "|", "\\")):
        return False
    if token.startswith(("http://", "https://", "//", "git@", "vault:")):
        return False
    if token.startswith(ABS_PREFIXES):
        return False  # server absolute paths, not repo-relative
    # Vault secret refs belong to the vault repo -- not a staleness signal here.
    if VAULT_HINT_RE.search(token):
        return False
    # Must end in a recognized extension (anchored to end of token).
    return bool(EXT_RE.search(token))
|
|
|
|
|
|
def extract_referenced_paths(body: str) -> list[str]:
    """
    Collect repo-path-looking tokens from backtick spans in ``body``.

    Only spans matched by PATHISH_RE are inspected, and a span counts only if
    it is a single token with no spaces. Leading "./" and "../" segments are
    dropped before the token is validated with looks_like_repo_path.

    The original used ``span.lstrip("./")``, which strips a character SET
    rather than a prefix. That had two bad effects:
      * the leading "/" of server paths was removed before validation, so the
        absolute-path rejection could never trigger for them;
      * dot-directories were mangled (".claude/x.py" became "claude/x.py").
    Prefix segments are now removed explicitly. A leading "/" is dropped only
    AFTER validation, so reported tokens stay repo-relative as before.

    Returns:
        Sorted list of unique tokens.
    """
    found = set()
    for m in PATHISH_RE.finditer(body):
        span = m.group(1).strip()
        # A backtick span counts only if the ENTIRE span is one token (a path).
        # Spans with spaces are commands/prose -> skip (avoids `cmd args` noise).
        if not span or " " in span:
            continue
        token = span
        while token.startswith(("./", "../")):
            token = token.split("/", 1)[1]
        if looks_like_repo_path(token):
            found.add(token.lstrip("/"))
    return sorted(found)
|
|
|
|
|
|
def repo_path_exists(repo_root: Path, token: str) -> bool:
    """
    Report whether ``token`` plausibly refers to something inside the repo.

    Checks, in order:
      1. the token as a path relative to ``repo_root``;
      2. for a bare filename: any entry of that name anywhere under the repo;
      3. for a path with a slash: any entry named like the last segment whose
         full path ends with the token's last two segments (memories often
         cite paths relative to a subproject root).

    Leading "./", "../" and "/" prefixes are removed first. The original used
    ``token.lstrip("./")``, which strips a character SET and therefore also
    ate the leading dot of dotfiles, so ".eslintrc.json" was searched for as
    "eslintrc.json" and never found.

    Returns:
        True when a match is found. False otherwise, including when the
        recursive scan raises OSError.
    """
    while token.startswith(("./", "../", "/")):
        token = token.split("/", 1)[1]

    # Try repo-relative.
    if (repo_root / token).exists():
        return True

    # Bare filename -> search anywhere in repo.
    if "/" not in token:
        try:
            return any(True for _ in repo_root.rglob(token))
        except OSError:
            return False

    # Also try matching just the tail (last 2 segments) anywhere.
    parts = token.split("/")
    tail = "/".join(parts[-2:])
    try:
        for p in repo_root.rglob(parts[-1]):
            # Normalise Windows separators before the suffix comparison.
            if str(p).replace("\\", "/").endswith(tail):
                return True
    except OSError:
        return False
    return False
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# Similarity / duplicate clustering (token-overlap heuristic)
|
|
# --------------------------------------------------------------------------
|
|
|
|
# Words ignored when building similarity signatures.
# NOTE(review): tokenize() in this file extracts [a-z0-9]+ runs and keeps only
# tokens longer than two characters. The entry "don't" therefore can never
# match a token, and the one/two-letter entries are already excluded by the
# length filter. "via" is listed twice, which is harmless in a set literal.
STOPWORDS = {
    "the", "a", "an", "and", "or", "to", "of", "in", "on", "for", "with",
    "is", "are", "be", "not", "via", "use", "used", "uses", "no", "never",
    "always", "only", "via", "from", "by", "at", "as", "it", "this", "that",
    "when", "if", "then", "do", "don't", "we", "our", "you", "your",
}
|
|
|
|
|
|
def tokenize(text: str) -> set[str]:
    """
    Turn free text into a set of comparison tokens.

    The text (None is treated as empty) is lower-cased and split into runs of
    [a-z0-9]. Stopwords and tokens of two characters or fewer are dropped.
    """
    kept = set()
    for word in re.findall(r"[a-z0-9]+", (text or "").lower()):
        if word in STOPWORDS or len(word) <= 2:
            continue
        kept.add(word)
    return kept
|
|
|
|
|
|
def jaccard(a: set[str], b: set[str]) -> float:
    """
    Jaccard similarity |a & b| / |a | b| of two token sets.

    Returns 0.0 when either set is empty.
    """
    if a and b:
        combined = a | b
        if combined:
            return len(a & b) / len(combined)
    return 0.0
|
|
|
|
|
|
def cluster_overlaps(mems: list[Memory], threshold: float = 0.34):
    """
    Group memories that look related, separately for each type.

    Two memories of the same type (missing type -> "untyped") are linked when
    either
      * the Jaccard overlap of their token signatures (name + description +
        slug words) is >= threshold, or
      * their slugs share the same first two underscore-separated parts
        (e.g. feedback_syncro_*).
    Links are merged transitively with a small union-find.

    Returns:
        list of (type, sorted filenames), one per cluster with more than one
        member.
    """
    grouped: dict[str, list[Memory]] = {}
    for mem in mems:
        grouped.setdefault(mem.type or "untyped", []).append(mem)

    results = []
    for kind, members in grouped.items():
        names = [mem.filename for mem in members]

        # Token signature and two-part slug prefix per file.
        signature = {}
        prefix = {}
        for mem in members:
            words = " ".join(
                filter(None, [mem.name, mem.description, mem.slug.replace("_", " ")])
            )
            signature[mem.filename] = tokenize(words)
            pieces = mem.slug.split("_")
            prefix[mem.filename] = "_".join(pieces[:2]) if len(pieces) >= 2 else mem.slug

        # Union-find over filenames, with path halving in find().
        root_of = {name: name for name in names}

        def find(node):
            while root_of[node] != node:
                root_of[node] = root_of[root_of[node]]
                node = root_of[node]
            return node

        def union(left, right):
            left_root, right_root = find(left), find(right)
            if left_root != right_root:
                root_of[left_root] = right_root

        for pos, first in enumerate(names):
            for second in names[pos + 1:]:
                overlap = jaccard(signature[first], signature[second])
                shares_prefix = (
                    prefix[first] == prefix[second]
                    and len(prefix[first].split("_")) >= 2
                )
                if overlap >= threshold or shares_prefix:
                    union(first, second)

        # Collect components in first-seen order; report only real clusters.
        components: dict[str, list[str]] = {}
        for name in names:
            components.setdefault(find(name), []).append(name)
        results.extend(
            (kind, sorted(component))
            for component in components.values()
            if len(component) > 1
        )
    return results
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# Stale dated facts
|
|
# --------------------------------------------------------------------------
|
|
|
|
|
|
def find_stale_dates(mem: Memory, today: datetime.date):
    """
    Return (date_str, age_days) for each distinct date in the memory body that
    is more than STALE_MONTHS * 30 days before ``today``.

    Keyword-prefixed dates (DATE_RE) are scanned first, then any remaining ISO
    dates (ISO_DATE_RE). Each date string is considered once; strings that are
    not valid calendar dates are skipped.
    """
    cutoff_days = STALE_MONTHS * 30
    stale = []
    handled = set()
    for pattern in (DATE_RE, ISO_DATE_RE):
        for match in pattern.finditer(mem.body):
            stamp = match.group(1)
            if stamp in handled:
                continue
            handled.add(stamp)
            try:
                parsed = datetime.date.fromisoformat(stamp)
            except ValueError:
                # Shaped like a date but not a real one (e.g. month 13).
                continue
            days_old = (today - parsed).days
            if days_old > cutoff_days:
                stale.append((stamp, days_old))
    return stale
|
|
|
|
|
|
# --------------------------------------------------------------------------
|
|
# Report
|
|
# --------------------------------------------------------------------------
|
|
|
|
|
|
class Report:
    """Line-oriented text accumulator; str() joins the lines with newlines."""

    def __init__(self):
        # Report lines in insertion order, without trailing newlines.
        self.lines: list[str] = []

    def add(self, s: str = ""):
        """Append one line; with no argument, append an empty line."""
        self.lines += [s]

    def __str__(self):
        separator = "\n"
        return separator.join(self.lines)
|
|
|
|
|
|
def slugify_link_target(target: str) -> str:
    """Return the final path component of ``target`` without its last suffix."""
    as_path = Path(target)
    return as_path.stem
|
|
|
|
|
|
def run(args) -> int:
    """
    Analyze the repo memory store, print the report, and optionally write it.

    Report sections, in order: index reconcile, backlinks, referenced-artifact
    validity, duplicate/overlap clusters, stale dated facts, drift against the
    harness profile store, apply-safe actions (only with --apply-safe),
    summary, and proposed actions.

    With ``args.apply_safe`` two additive actions are performed: index lines
    are appended for orphan files, and profile-only files are copied into the
    repo store when no file of that name exists there.

    Unless ``args.no_file`` is set, the report is written to
    ``args.report_file`` (its parent directory is created if needed) or, by
    default, to a timestamped file under <memory dir>/_reports/.

    Args:
        args: parsed CLI namespace with apply_safe, no_file and report_file.

    Returns:
        0 on success, 2 when the memory directory does not exist.
    """
    repo_root = resolve_claudetools_root()
    mem_dir = repo_root / ".claude" / "memory"
    index_path = mem_dir / "MEMORY.md"

    if not mem_dir.is_dir():
        print(f"[ERROR] memory dir not found: {mem_dir}")
        return 2

    today = datetime.date.today()
    rpt = Report()
    rpt.add("# Memory Dream Report")
    rpt.add(f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M')}")
    rpt.add(f"Repo root: {repo_root}")
    rpt.add(f"Memory store: {mem_dir}")
    rpt.add(f"Mode: {'APPLY-SAFE (additive)' if args.apply_safe else 'REPORT-ONLY'}")
    rpt.add("")

    # Load memories.
    mem_files = sorted(p for p in mem_dir.glob("*.md") if p.name != "MEMORY.md")
    mems = [Memory(p) for p in mem_files]
    mem_by_file = {m.filename: m for m in mems}
    rpt.add(f"Loaded {len(mems)} memory files (excluding MEMORY.md).")
    rpt.add("")

    # ----- 1. INDEX RECONCILE -----
    links, headers, index_lines = parse_index(index_path)
    indexed_targets = {slugify_link_target(t): (title, t, ln)
                       for (title, t, ln, _raw) in links}
    rpt.add("## 1. INDEX RECONCILE")
    rpt.add("")

    # Files with no index line (matched by filename stem).
    orphans = [m for m in mems if m.slug not in indexed_targets]
    rpt.add(f"### Orphan files (no index line): {len(orphans)}")
    for m in orphans:
        rpt.add(f"- [INFO] {m.filename} (type={m.type or '?'})")
    rpt.add("")

    missing_targets = []  # index lines whose file is missing
    for title, target, ln, _raw in links:
        # Only consider links that look like local memory files.
        tgt = target.strip()
        if tgt.startswith(("http://", "https://")):
            continue
        resolved = (mem_dir / tgt).resolve()
        if not resolved.is_file():
            missing_targets.append((title, target, ln))
    rpt.add(f"### Index lines pointing at missing files: {len(missing_targets)}")
    for title, target, ln in missing_targets:
        rpt.add(f"- [WARNING] line {ln + 1}: [{title}]({target}) -> file not found")
    rpt.add("")

    name_mismatches = []  # frontmatter name vs filename slug
    for m in mems:
        if m.name is None:
            name_mismatches.append((m.filename, "(no name in frontmatter)"))
            continue
        # The convention is loose: name may be a title, not the slug. Only flag
        # when name itself looks like a slug AND differs from the filename slug.
        if re.fullmatch(r"[a-z0-9_]+", m.name.strip()) and m.name.strip() != m.slug:
            name_mismatches.append((m.filename, f"name='{m.name}' != slug='{m.slug}'"))
    rpt.add(f"### Frontmatter name vs filename signals: {len(name_mismatches)}")
    for fn, note in name_mismatches:
        rpt.add(f"- [INFO] {fn}: {note}")
    rpt.add("")

    # ----- 2. BACKLINKS -----
    rpt.add("## 2. BACKLINKS ([[name]] references)")
    rpt.add("")
    known_slugs = {m.slug for m in mems}
    broken_backlinks = []
    for m in mems:
        for bm in BACKLINK_RE.finditer(m.body):
            ref = bm.group(1).strip()
            ref_slug = slugify_link_target(ref)
            if ref_slug not in known_slugs and ref not in known_slugs:
                broken_backlinks.append((m.filename, ref))
    rpt.add(f"### Broken backlinks: {len(broken_backlinks)}")
    for fn, ref in broken_backlinks:
        rpt.add(f"- [WARNING] {fn}: [[{ref}]] has no matching memory file")
    if not broken_backlinks:
        rpt.add("- [OK] no broken backlinks found")
    rpt.add("")

    # ----- 3. REFERENCED-ARTIFACT VALIDITY -----
    rpt.add("## 3. REFERENCED-ARTIFACT VALIDITY (conservative; 'verify', not 'delete')")
    rpt.add("")
    artifact_flags = []
    for m in mems:
        for tok in extract_referenced_paths(m.body):
            if not repo_path_exists(repo_root, tok):
                artifact_flags.append((m.filename, tok))
    rpt.add(f"### Referenced paths not found in repo: {len(artifact_flags)}")
    for fn, tok in artifact_flags:
        rpt.add(f"- [VERIFY] {fn}: `{tok}` not found under repo (may be server-side "
                f"or renamed -- verify, do not auto-delete)")
    if not artifact_flags:
        rpt.add("- [OK] no clearly-stale repo paths detected")
    rpt.add("")

    # ----- 4. DUPLICATE / OVERLAP CLUSTERS -----
    rpt.add("## 4. DUPLICATE / OVERLAP CLUSTERS (PROPOSED merges -- never auto-applied)")
    rpt.add("")
    clusters = cluster_overlaps(mems)
    # Largest clusters first, then by type name.
    clusters.sort(key=lambda c: (-len(c[1]), c[0]))
    rpt.add(f"### Candidate clusters: {len(clusters)}")
    for typ, members in clusters:
        rpt.add(f"- [{typ}] {len(members)} related memories:")
        for f in members:
            mm = mem_by_file.get(f)
            desc = (mm.description or mm.name or "") if mm else ""
            desc = desc[:90]
            rpt.add(f" - {f} -- {desc}")
    if not clusters:
        rpt.add("- [OK] no overlap clusters above threshold")
    rpt.add("")

    # ----- 5. STALE DATED FACTS -----
    rpt.add(f"## 5. STALE DATED FACTS (project-type, dated > {STALE_MONTHS} months)")
    rpt.add("")
    stale_hits = []
    for m in mems:
        if (m.type or "") != "project":
            continue
        hits = find_stale_dates(m, today)
        if hits:
            stale_hits.append((m.filename, hits))
    rpt.add(f"### Project memories with stale dated claims: {len(stale_hits)}")
    for fn, hits in stale_hits:
        for ds, age in hits:
            rpt.add(f"- [VERIFY] {fn}: dated {ds} (~{age} days old) -- re-verify")
    if not stale_hits:
        rpt.add("- [OK] no stale dated project facts")
    rpt.add("")

    # ----- 6. DRIFT vs PROFILE STORE -----
    rpt.add("## 6. DRIFT vs HARNESS PROFILE STORE")
    rpt.add("")
    prof_dir = profile_memory_dir(repo_root)
    profile_only = []
    repo_only = []
    conflicts = []
    if prof_dir is None:
        rpt.add("- [INFO] profile memory dir not found; skipping drift check.")
    else:
        rpt.add(f"Profile store: {prof_dir}")
        rpt.add("")
        prof_files = {p.name for p in prof_dir.glob("*.md") if p.name != "MEMORY.md"}
        repo_files = {m.filename for m in mems}

        profile_only.extend(sorted(prof_files - repo_files))
        repo_only.extend(sorted(repo_files - prof_files))
        for both in sorted(prof_files & repo_files):
            # Compared as decoded text, exactly as read_text returns it.
            a = (prof_dir / both).read_text(encoding="utf-8", errors="replace")
            b = (mem_dir / both).read_text(encoding="utf-8", errors="replace")
            if a != b:
                conflicts.append(both)

        rpt.add(f"### Profile-only (candidates to MIGRATE INTO repo): {len(profile_only)}")
        for f in profile_only:
            rpt.add(f"- [INFO] {f}")
        rpt.add("")
        rpt.add(f"### Repo-only (candidates to PUSH OUT to profile): {len(repo_only)}")
        for f in repo_only:
            rpt.add(f"- [INFO] {f}")
        rpt.add("")
        rpt.add(f"### Present in BOTH but differing (CONFLICT -- human review): "
                f"{len(conflicts)}")
        for f in conflicts:
            rpt.add(f"- [WARNING] {f}: content differs between repo and profile")
        rpt.add("")

    # ----- APPLY-SAFE ACTIONS (additive-only) -----
    actions_taken = []
    if args.apply_safe:
        rpt.add("## APPLY-SAFE ACTIONS PERFORMED (additive-only)")
        rpt.add("")

        # (a) Append missing index lines for orphan files.
        if orphans and index_path.is_file():
            appended = append_index_lines(index_path, orphans, index_lines, headers)
            for line, hdr in appended:
                actions_taken.append(f"INDEX += [{hdr}] {line}")
                rpt.add(f"- [OK] appended index line under ## {hdr}: {line}")
        elif orphans:
            rpt.add("- [WARNING] orphans exist but MEMORY.md missing; nothing appended")

        # (b) Copy profile-only files INTO repo (never overwrite).
        if prof_dir is not None:
            for f in profile_only:
                src = prof_dir / f
                dst = mem_dir / f
                if dst.exists():
                    rpt.add(f"- [SKIP] {f}: already exists in repo (not overwriting)")
                    continue
                shutil.copy2(src, dst)
                actions_taken.append(f"COPIED profile->repo: {f}")
                rpt.add(f"- [OK] copied profile-only file into repo: {f}")
        if not actions_taken:
            rpt.add("- [INFO] no additive actions were necessary")
        rpt.add("")

    # ----- SUMMARY -----
    rpt.add("## SUMMARY")
    rpt.add("")
    rpt.add(f"- memory files: {len(mems)}")
    rpt.add(f"- orphan files (no index): {len(orphans)}")
    rpt.add(f"- index -> missing file: {len(missing_targets)}")
    rpt.add(f"- name/filename signals: {len(name_mismatches)}")
    rpt.add(f"- broken backlinks: {len(broken_backlinks)}")
    rpt.add(f"- stale referenced paths: {len(artifact_flags)}")
    rpt.add(f"- overlap clusters: {len(clusters)}")
    rpt.add(f"- stale dated project facts: {len(stale_hits)}")
    rpt.add(f"- profile-only files: {len(profile_only)}")
    rpt.add(f"- repo-only files: {len(repo_only)}")
    rpt.add(f"- repo<->profile conflicts: {len(conflicts)}")
    if args.apply_safe:
        rpt.add(f"- additive actions performed: {len(actions_taken)}")
    rpt.add("")
    rpt.add("## PROPOSED (needs human approval -- NEVER auto-applied)")
    rpt.add("")
    n_prop = 0
    for typ, members in clusters:
        n_prop += 1
        rpt.add(f"- [MERGE?] consolidate {len(members)} '{typ}' memories: "
                f"{', '.join(members)}")
    for fn, _hits in stale_hits:
        n_prop += 1
        rpt.add(f"- [REVERIFY?] {fn} (dated facts) -- confirm still true, then update")
    for fn, tok in artifact_flags:
        n_prop += 1
        rpt.add(f"- [STALE-REF?] {fn} references `{tok}` -- confirm/repoint or note moved")
    for title, target, ln in missing_targets:
        n_prop += 1
        rpt.add(f"- [INDEX-CLEANUP?] MEMORY.md line {ln + 1} points at missing "
                f"{target} -- human decides keep/remove")
    if prof_dir is not None:
        for f in conflicts:
            n_prop += 1
            rpt.add(f"- [DRIFT-RESOLVE?] {f} differs repo vs profile -- human picks "
                    f"winner (sync-memory.sh leaves both untouched)")
    if n_prop == 0:
        rpt.add("- [OK] nothing proposed; memory store is clean")
    rpt.add("")

    out = str(rpt)
    print(out)

    # Write report file unless suppressed.
    if not args.no_file:
        if args.report_file:
            rpath = Path(args.report_file)
            # Create the explicit destination's own directory. Previously a
            # missing parent raised only after the whole analysis had run,
            # while an unused _reports/ dir was created regardless.
            rpath.parent.mkdir(parents=True, exist_ok=True)
        else:
            reports_dir = mem_dir / "_reports"
            reports_dir.mkdir(parents=True, exist_ok=True)
            stamp = datetime.datetime.now().strftime("%Y-%m-%d-%H%M")
            rpath = reports_dir / f"{stamp}-dream.md"
        rpath.write_text(out + "\n", encoding="utf-8")
        print(f"\n[INFO] report written: {rpath}")

    return 0
|
|
|
|
|
|
def append_index_lines(index_path: Path, orphans, index_lines, headers):
    """
    Additive only: append a '- [Name](file.md) -- description' line for each
    orphan under the correct '## <Header>' section. Never reorders or removes
    existing lines. If a header doesn't exist, a new section is appended at
    end of file.

    The header is chosen from the orphan's type via TYPE_HEADER; unknown or
    missing types go under "Project". Inside an existing section the new lines
    go after its last non-blank line.

    Args:
        index_path: the index file; it is re-read here and rewritten in place.
        orphans: Memory objects that lack an index line.
        index_lines, headers: accepted for call compatibility; not used,
            because the file is re-read.

    Returns:
        list of (line_text, header_used). With no orphans the file is left
        untouched and [] is returned.

    Fixes over the original: a file that ended with a newline keeps it when a
    new section is appended, and nothing is rewritten when there is nothing
    to add.
    """
    if not orphans:
        return []

    text = index_path.read_text(encoding="utf-8", errors="replace")
    lines = text.split("\n")
    appended = []

    # Group orphans by target header.
    by_header: dict[str, list[Memory]] = {}
    for m in orphans:
        hdr = TYPE_HEADER.get(m.type or "", None)
        if hdr is None:
            hdr = "Project"  # safe default bucket; human can recategorize
        by_header.setdefault(hdr, []).append(m)

    def build_line(m: Memory) -> str:
        title = m.name or m.slug
        hook = (m.description or "").strip()
        if hook:
            return f"- [{title}]({m.filename}) -- {hook}"
        return f"- [{title}]({m.filename})"

    for hdr, members in by_header.items():
        # Find header line index (looked up again for every header because
        # earlier insertions shift line positions).
        hidx = None
        for i, ln in enumerate(lines):
            hm = re.match(r"^##\s+(.+?)\s*$", ln)
            if hm and hm.group(1).strip() == hdr:
                hidx = i
                break

        new_lines = [build_line(m) for m in members]

        if hidx is None:
            # Append a fresh section at end of file, after a blank separator.
            if lines and lines[-1].strip() != "":
                lines.append("")
            lines.append(f"## {hdr}")
            lines.extend(new_lines)
            appended.extend((nl, hdr) for nl in new_lines)
            continue

        # Find end of this section: next '## ' or EOF.
        end = len(lines)
        for j in range(hidx + 1, len(lines)):
            if re.match(r"^##\s+", lines[j]):
                end = j
                break
        # Insert after the last non-blank line of the section.
        insert_at = end
        while insert_at - 1 > hidx and lines[insert_at - 1].strip() == "":
            insert_at -= 1
        lines[insert_at:insert_at] = new_lines
        appended.extend((nl, hdr) for nl in new_lines)

    result = "\n".join(lines)
    # Appending a section after a trailing blank entry consumes the file's
    # final newline; put it back so the file still ends with one.
    if text.endswith("\n") and not result.endswith("\n"):
        result += "\n"
    index_path.write_text(result, encoding="utf-8")
    return appended
|
|
|
|
|
|
def main() -> int:
    """
    Command-line entry point: parse the flags and run the analyzer.

    Returns:
        The exit code from run(), or 130 when interrupted with Ctrl-C.
    """
    parser = argparse.ArgumentParser(
        description="Memory lint + consolidation analyzer (additive-only)."
    )
    switches = (
        (
            "--apply-safe",
            "Perform ONLY additive fixes (append index lines, copy profile-only "
            "files into repo). Never deletes/overwrites/merges.",
        ),
        (
            "--no-file",
            "Print report to stdout only; do not write a _reports/ file.",
        ),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)
    parser.add_argument(
        "--report-file",
        default=None,
        help="Explicit path for the report file (overrides _reports/ default).",
    )
    options = parser.parse_args()

    try:
        exit_code = run(options)
    except KeyboardInterrupt:
        print("[ERROR] interrupted")
        exit_code = 130
    return exit_code
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|