claudetools/.claude/skills/memory-dream/scripts/memory_dream.py

#!/usr/bin/env python3
"""
memory_dream.py -- memory lint + consolidation analyzer for the ClaudeTools REPO
memory store (.claude/memory/).

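ADDITIVE-ONLY by design. The default run is READ-ONLY with respect to memory
content: it changes no memory file and no index line. It only writes its own
report file (under _reports/ or at --report-file; suppress with --no-file).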
The only mutating mode is --apply-safe, which performs ONLY additive,
non-destructive actions:
  * append missing index lines to MEMORY.md for orphan memory files
  * copy profile-only memory files INTO the repo store (never overwriting)
It NEVER deletes a file, NEVER removes an index line, NEVER overwrites differing
content, and NEVER performs a proposed merge. Every destructive idea stays in
the report as a PROPOSED action for a human to approve.

Stdlib only. Python launcher on Windows fleet is `py`; also runs under
python3/python.

Usage:
  py memory_dream.py                 # REPORT ONLY (default)
  py memory_dream.py --apply-safe    # additive-only fixes + report
  py memory_dream.py --no-file       # report to stdout only, skip _reports/ file
  py memory_dream.py --report-file X # write report to an explicit path
"""

from __future__ import annotations

import argparse
import datetime
import os
import re
import shutil
import sys
from pathlib import Path

# Windows consoles default to cp1252; memory bodies contain Unicode (arrows,
# em dashes). Force UTF-8 stdout/stderr with replacement so printing never
# crashes regardless of the active code page.
for _stream in (sys.stdout, sys.stderr):
    try:
        _stream.reconfigure(encoding="utf-8", errors="replace")
    except Exception:
        # Deliberately best-effort: if a stream cannot be reconfigured for any
        # reason it is left as-is rather than aborting at import time.
        pass

# --------------------------------------------------------------------------
# Path resolution -- no hardcoded drive letters.
# --------------------------------------------------------------------------

# Age threshold for dated facts in project memories. find_stale_dates() compares
# ages in days against STALE_MONTHS * 30, i.e. a month is approximated as 30 days.
STALE_MONTHS = 6  # project facts older than this (in "as of <date>") -> re-verify


def _read_identity_root(repo_guess: Path) -> str | None:
    """
    Best-effort lookup of `claudetools_root` in <repo_guess>/.claude/identity.json.

    Returns the configured value only when it is truthy and names an existing
    directory. Every failure mode (file absent, unreadable, invalid JSON,
    unexpected JSON shape) yields None instead of raising.
    """
    identity_file = repo_guess / ".claude" / "identity.json"
    if identity_file.is_file():
        try:
            import json

            payload = json.loads(identity_file.read_text(encoding="utf-8"))
            candidate = payload.get("claudetools_root")
            usable = bool(candidate) and Path(candidate).is_dir()
        except Exception:
            # Any read/parse problem simply means "no usable identity root".
            usable = False
        if usable:
            return candidate
    return None


def resolve_claudetools_root() -> Path:
    """
    Resolve CLAUDETOOLS_ROOT, trying in order:
      1. env CLAUDETOOLS_ROOT (used only when it names an existing directory)
      2. the nearest ancestor of this script that contains a .claude directory:
         its .claude/identity.json claudetools_root if usable, else the
         ancestor itself
      3. derive from this script's location (.../.claude/skills/memory-dream/scripts/)

    Returns the resolved (absolute) path. The result is not guaranteed to hold
    a .claude/memory directory; callers must check that themselves.
    """
    env_root = os.environ.get("CLAUDETOOLS_ROOT")
    if env_root and Path(env_root).is_dir():
        return Path(env_root).resolve()

    # Walk up from this file looking for the first ancestor with a .claude dir.
    here = Path(__file__).resolve()
    derived = next((p for p in here.parents if (p / ".claude").is_dir()), None)

    if derived is not None:
        ident_root = _read_identity_root(derived)
        if ident_root:
            return Path(ident_root).resolve()
        return derived.resolve()

    # Last resort: assume scripts/ -> memory-dream/ -> skills/ -> .claude/ -> ROOT
    # (script is at ROOT/.claude/skills/memory-dream/scripts/memory_dream.py).
    # When the script sits fewer than five levels deep, parents[4] does not
    # exist; fall back to the top-most ancestor instead of raising IndexError.
    ancestors = list(here.parents)
    if not ancestors:
        return here
    fallback = ancestors[4] if len(ancestors) > 4 else ancestors[-1]
    return fallback.resolve()


def profile_memory_dir(repo_root: Path) -> Path | None:
    """
    Derive the harness profile memory dir for this project.

    Slug: take the absolute project path, replace every run of non-alphanumeric
    chars with '-', then look under $HOME/.claude/projects/<slug>/memory/.

    Prefers CLAUDE_PROJECT_DIR if set; falls back to repo_root.

    Returns:
      * an existing <slug>/memory dir for one of the exact candidate slugs, or
      * <slug>/memory for an exact candidate whose slug dir exists but has no
        memory/ subdir yet (the returned path does NOT exist in that case), or
      * the single tail-scan match, or
      * None when nothing matches or the tail-scan is ambiguous.
    """
    home = Path(os.environ.get("HOME") or os.path.expanduser("~"))
    project_dir = os.environ.get("CLAUDE_PROJECT_DIR") or str(repo_root)
    abspath = str(Path(project_dir).resolve())
    projects_root = home / ".claude" / "projects"

    # The single-dash collapse: replace every run of non-alphanumeric chars with
    # a single '-'. This is the historical/POSIX-style derivation.
    slug_single = re.sub(r"[^A-Za-z0-9]+", "-", abspath)

    # The Claude Code harness maps a Windows drive colon to '--' (so
    # "D:\\claudetools" -> "D--claudetools"), but the single-dash collapse above
    # produces "D-claudetools". Reproduce the harness rule by doubling a leading
    # "<drive>-" into "<drive>--".
    slug_double = re.sub(r"^([A-Za-z])-", r"\1--", slug_single)

    # EXACT candidate slugs in priority order, de-duplicated (on POSIX paths the
    # two slugs are identical). The double-dash (harness) variant is primary.
    candidates = list(dict.fromkeys((slug_double, slug_single)))

    # Pass 1: a candidate whose memory/ subdir really exists always wins, so a
    # bare slug dir cannot shadow a later candidate that actually holds memories.
    for slug in candidates:
        mem = projects_root / slug / "memory"
        if mem.is_dir():
            return mem

    # Pass 2: the slug dir itself exists but has no nested memory/ yet -> use
    # the conventional memory subdir under it.
    for slug in candidates:
        base = projects_root / slug
        if base.is_dir():
            return base / "memory"

    # ONLY if none of the exact candidates exist, fall back to a case-insensitive
    # tail-scan of $HOME/.claude/projects/*/memory for a dir whose slug "looks
    # like" this repo (tail match on the last path component). If MORE THAN ONE
    # dir matches, do NOT guess -- report the ambiguity and skip.
    if projects_root.is_dir():
        tail = re.sub(r"[^A-Za-z0-9]+", "-", repo_root.name).lower()
        matches: list[Path] = []
        for child in sorted(projects_root.iterdir()):
            if not child.is_dir():
                continue
            if child.name.lower().endswith(tail):
                mem = child / "memory"
                if mem.is_dir():
                    matches.append(mem)
        if len(matches) > 1:
            names = ", ".join(str(m.parent.name) for m in matches)
            print(
                f"[WARNING] multiple profile dirs matched ({names}); "
                "skipping profile drift analysis to avoid cross-project contamination"
            )
            return None
        if len(matches) == 1:
            return matches[0]
    return None


# --------------------------------------------------------------------------
# Frontmatter / memory file parsing
# --------------------------------------------------------------------------


class Memory:
    """
    One memory file: its path/filename/slug plus the parsed frontmatter fields
    (name, description, type) and the body text after the frontmatter.

    The file is read and parsed eagerly in the constructor. Fields that are
    absent from the frontmatter stay None; a file without a complete
    '---' ... '---' frontmatter block is treated as body only.
    """

    def __init__(self, path: Path):
        self.path = path
        self.filename = path.name
        self.slug = path.stem  # filename without the final extension
        self.name: str | None = None
        self.description: str | None = None
        self.type: str | None = None  # stored lower-cased when present
        self.body: str = ""
        self._parse()

    def _parse(self) -> None:
        """Read the file and split it into frontmatter fields and body."""
        # utf-8-sig drops a leading byte-order mark when there is one (and is
        # otherwise identical to utf-8). Without this, a BOM written by a
        # Windows editor sits in front of the opening '---' fence and the whole
        # frontmatter would be silently treated as body text.
        text = self.path.read_text(encoding="utf-8-sig", errors="replace")
        lines = text.splitlines()
        if not lines or lines[0].strip() != "---":
            # No frontmatter; whole file is body.
            self.body = text
            return
        # Find closing fence.
        end = None
        for i in range(1, len(lines)):
            if lines[i].strip() == "---":
                end = i
                break
        if end is None:
            # Opening fence without a closing one: treat everything as body.
            self.body = text
            return
        fm = lines[1:end]
        self.body = "\n".join(lines[end + 1 :])
        self._parse_frontmatter(fm)

    def _parse_frontmatter(self, fm_lines: list[str]) -> None:
        """
        Tolerant YAML-ish parse. Handles:
          name: X
          description: X   (or '>-' folded block following)
          type: X          (top-level)
          metadata:
            type: X        (nested)

        Unknown keys are ignored. A `name` nested under `metadata:` is ignored;
        `description` and `type` are accepted at either level, and a later
        occurrence overwrites an earlier one.
        """
        i = 0
        in_metadata = False
        while i < len(fm_lines):
            line = fm_lines[i]
            stripped = line.strip()
            indent = len(line) - len(line.lstrip())

            if not stripped:
                i += 1
                continue

            if stripped == "metadata:":
                in_metadata = True
                i += 1
                continue

            # Detect leaving the metadata block (a top-level key reappears).
            if in_metadata and indent == 0 and ":" in stripped:
                in_metadata = False

            m = re.match(r"^([A-Za-z_][\w\-]*):\s*(.*)$", stripped)
            if not m:
                i += 1
                continue
            key, val = m.group(1), m.group(2)

            # Folded/literal block scalar -> capture following more-indented lines.
            if val in (">-", ">", "|", "|-", "|+"):
                block_lines = []
                j = i + 1
                base_indent = indent
                while j < len(fm_lines):
                    nxt = fm_lines[j]
                    nxt_indent = len(nxt) - len(nxt.lstrip())
                    if nxt.strip() == "" or nxt_indent > base_indent:
                        block_lines.append(nxt.strip())
                        j += 1
                    else:
                        break
                # Both folded and literal blocks are flattened to one line.
                val = " ".join(x for x in block_lines if x)
                i = j
            else:
                val = val.strip().strip('"').strip("'")
                i += 1

            if key == "name" and not in_metadata:
                self.name = val
            elif key == "description":
                self.description = val
            elif key == "type":
                # Both top-level and metadata.type land here.
                self.type = (val or "").lower() or None


# --------------------------------------------------------------------------
# Index (MEMORY.md) parsing
# --------------------------------------------------------------------------

# Markdown inline link: [title](target). Group 1 is the title, group 2 the target.
INDEX_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
# Body backlinks like [[some-name]]
BACKLINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
# Keyword-introduced dated claims, e.g. "as of 2024-01-31" or "updated 2024-01-31".
# Group 1 is the YYYY-MM-DD date.
DATE_RE = re.compile(
    r"(?:as of|updated|corrected|lesson|fixed|live)\s+"
    r"(\d{4}-\d{2}-\d{2})",
    re.IGNORECASE,
)
# Any standalone YYYY-MM-DD token, with or without a keyword in front of it.
ISO_DATE_RE = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")

# Memory type -> '## <Header>' section name used in MEMORY.md.
# NOTE(review): three of these headers are singular but "user" maps to the
# plural "Users" -- confirm this matches the header actually used in the index.
TYPE_HEADER = {
    "reference": "Reference",
    "feedback": "Feedback",
    "project": "Project",
    "user": "Users",
}


def parse_index(index_path: Path):
    """
    Scan MEMORY.md for '## ' section headers and bulleted markdown links.

    Returns a 3-tuple:
      links:   list of (title, target, lineno, raw_line), one per bullet line
               ('- ' after optional indentation) that contains a link; only
               the first link on a line is recorded
      headers: dict header-name -> lineno (a repeated header keeps the last)
      lines:   the file split on newline characters

    Line numbers are 0-based. A missing index file yields ([], {}, []).
    """
    if not index_path.is_file():
        return [], {}, []

    raw_lines = index_path.read_text(encoding="utf-8", errors="replace").split("\n")
    header_pattern = re.compile(r"^##\s+(.+?)\s*$")
    link_rows = []
    header_rows = {}
    for lineno, content in enumerate(raw_lines):
        header = header_pattern.match(content)
        if header is not None:
            header_rows[header.group(1).strip()] = lineno
        elif content.lstrip().startswith("- "):
            link = INDEX_LINK_RE.search(content)
            if link is not None:
                link_rows.append((link.group(1), link.group(2), lineno, content))
    return link_rows, header_rows, raw_lines


# --------------------------------------------------------------------------
# Referenced-artifact extraction (conservative)
# --------------------------------------------------------------------------

# Referenced-artifact extraction is intentionally CONSERVATIVE: it only inspects
# backtick-wrapped spans (`...`) and only treats a span as a repo path when the
# whole span is a single path-like token. We do NOT scan bare prose -- too many
# false positives.
PATHISH_RE = re.compile(r"`([^`\n]+?)`")

# Recognized file extensions. EXT_RE anchors the alternation to end-of-token
# (`$`), so a shorter alternative can never win over a longer one that reaches
# the end: `identity.json` matches as .json and is never truncated to .js.
KNOWN_EXTS = (
    "tsx", "json", "yaml", "toml", "service",
    "py", "sh", "rs", "ts", "js", "md", "yml", "sql", "ps1",
)
EXT_RE = re.compile(r"\.(?:" + "|".join(KNOWN_EXTS) + r")$", re.IGNORECASE)

# Vault-style secret paths live in the SEPARATE vault repo, not claudetools.
VAULT_HINT_RE = re.compile(r"\.sops\.ya?ml$", re.IGNORECASE)

# Tokens we never treat as repo paths.
ABS_PREFIXES = ("/api/", "/home/", "/var/", "/opt/", "/etc/", "/tmp/",
                "/proc/", "/dev/", "/data/", "/usr/")


def looks_like_repo_path(token: str) -> bool:
    """
    Return True when `token` looks like a concrete repo file reference.

    The token must be passed as written (leading '/' intact), otherwise the
    absolute-path and '//' filters below cannot fire. Both slash-separated
    paths and bare filenames are accepted as long as they end in one of
    KNOWN_EXTS.
    """
    token = token.strip()
    if not token:
        return False
    # Reject anything with spaces, glob/placeholder/url/colon characters --
    # those are descriptions or templates, not concrete repo paths.
    if any(c in token for c in (" ", "<", ">", "*", "?", ":", "|", "\\")):
        return False
    if token.startswith(("http://", "https://", "//", "git@", "vault:")):
        return False
    if token.startswith(ABS_PREFIXES):
        return False  # server absolute paths, not repo-relative
    # Vault secret refs belong to the vault repo -- not a staleness signal here.
    if VAULT_HINT_RE.search(token):
        return False
    # Must end in a recognized extension (anchored to end-of-token).
    return bool(EXT_RE.search(token))


def extract_referenced_paths(body: str) -> list[str]:
    """
    Return the sorted, de-duplicated repo-path tokens cited in backticks.

    Each returned token has leading '', '.' and '..' path segments removed
    (so './a/b.py' and '/a/b.py' both become 'a/b.py'), while the leading dot
    of a hidden directory or file such as '.claude/x.json' is preserved.
    """
    found: set[str] = set()
    for m in PATHISH_RE.finditer(body):
        span = m.group(1).strip()
        # A backtick span counts only if the ENTIRE span is one token (a path).
        # Spans with spaces are commands/prose -> skip (avoids `cmd args` noise).
        if not span or " " in span:
            continue
        # Classify the span BEFORE normalizing it: stripping the leading '/'
        # first would hide server-absolute paths from the ABS_PREFIXES filter.
        if not looks_like_repo_path(span):
            continue
        # Drop only whole leading '', '.', '..' segments. str.lstrip("./") would
        # strip a character set and eat the dot of hidden names like '.claude'.
        segments = span.split("/")
        while segments and segments[0] in ("", ".", ".."):
            segments.pop(0)
        token = "/".join(segments)
        if token:
            found.add(token)
    return sorted(found)


def repo_path_exists(repo_root: Path, token: str) -> bool:
    """
    Return True when `token` can be located somewhere under `repo_root`.

    Lookup order:
      1. the token as a repo-relative path;
      2. bare filename (no '/'): any file of that name anywhere in the repo;
      3. otherwise: any file with the token's basename whose full path ends
         with the token's last two segments (memories often cite paths
         relative to a subproject root).

    Filesystem errors during the recursive search count as "not found".
    """
    # Drop whole leading '', '.' and '..' segments only. str.lstrip("./") strips
    # a character SET and would turn '.eslintrc.json' into 'eslintrc.json' and
    # '.claude/x' into 'claude/x'.
    segments = token.split("/")
    while segments and segments[0] in ("", ".", ".."):
        segments.pop(0)
    if not segments:
        # Nothing left but the root itself.
        return repo_root.exists()
    token = "/".join(segments)

    # Try repo-relative.
    if (repo_root / token).exists():
        return True

    try:
        if len(segments) == 1:
            # Bare filename -> search anywhere in repo (stops at the first hit).
            return next(repo_root.rglob(token), None) is not None
        # Tail match on the last two segments. This is a plain string suffix
        # test, so it is intentionally loose about the directory name's start.
        tail = "/".join(segments[-2:])
        for hit in repo_root.rglob(segments[-1]):
            if str(hit).replace("\\", "/").endswith(tail):
                return True
    except (OSError, ValueError):
        # ValueError: rglob rejects degenerate patterns (e.g. an empty basename).
        return False
    return False


# --------------------------------------------------------------------------
# Similarity / duplicate clustering (token-overlap heuristic)
# --------------------------------------------------------------------------

# Words ignored when building similarity signatures. tokenize() splits on every
# non-alphanumeric character, so "don't" arrives as "don" + "t": the "don't"
# entry can never match by itself and "don" is listed to cover it. Entries of
# one or two letters are also dropped by tokenize()'s length filter.
STOPWORDS = {
    "the", "a", "an", "and", "or", "to", "of", "in", "on", "for", "with",
    "is", "are", "be", "not", "via", "use", "used", "uses", "no", "never",
    "always", "only", "from", "by", "at", "as", "it", "this", "that",
    "when", "if", "then", "do", "don", "don't", "we", "our", "you", "your",
}


def tokenize(text: str) -> set[str]:
    """
    Return the set of significant lower-cased alphanumeric words in `text`.

    Words of one or two characters and STOPWORDS are discarded. A falsy
    `text` (None or "") yields an empty set.
    """
    toks = re.findall(r"[a-z0-9]+", (text or "").lower())
    return {t for t in toks if len(t) > 2 and t not in STOPWORDS}


def jaccard(a: set[str], b: set[str]) -> float:
    """
    Jaccard similarity |a & b| / |a | b| of two token sets.

    Returns 0.0 when either set is empty.
    """
    if a and b:
        shared = len(a.intersection(b))
        total = len(a.union(b))
        if total:
            return shared / total
    return 0.0


def cluster_overlaps(mems: list[Memory], threshold: float = 0.34):
    """
    Group related memories of the same type into candidate merge clusters.

    Two memories of one type are linked when the Jaccard overlap of their
    token signatures (name + description + slug words) is >= threshold, or
    when their slugs share the same first two '_'-separated parts
    (e.g. feedback_syncro_*). Links are transitive. Memories without a type
    are grouped under "untyped".

    Returns a list of (type, sorted filenames), one per cluster with more
    than one member.
    """
    grouped: dict[str, list[Memory]] = {}
    for mem in mems:
        grouped.setdefault(mem.type or "untyped", []).append(mem)

    result = []
    for kind, bucket in grouped.items():
        names = [mem.filename for mem in bucket]

        # Per-file token signature and two-part slug prefix.
        signatures = {}
        prefixes = {}
        for mem in bucket:
            words = [mem.name, mem.description, mem.slug.replace("_", " ")]
            signatures[mem.filename] = tokenize(" ".join(w for w in words if w))
            pieces = mem.slug.split("_")
            prefixes[mem.filename] = (
                "_".join(pieces[:2]) if len(pieces) >= 2 else mem.slug
            )

        # Disjoint-set forest keyed by filename; every file starts alone.
        leader = {name: name for name in names}

        def root_of(name):
            while leader[name] != name:
                leader[name] = leader[leader[name]]
                name = leader[name]
            return name

        for pos, left in enumerate(names):
            for right in names[pos + 1:]:
                overlap = jaccard(signatures[left], signatures[right])
                # A one-part slug is stored as its own prefix and must not
                # count as a shared prefix.
                twins = (
                    prefixes[left] == prefixes[right]
                    and len(prefixes[left].split("_")) >= 2
                )
                if not (overlap >= threshold or twins):
                    continue
                left_root, right_root = root_of(left), root_of(right)
                if left_root != right_root:
                    leader[left_root] = right_root

        families: dict[str, list[str]] = {}
        for name in names:
            families.setdefault(root_of(name), []).append(name)
        result.extend(
            (kind, sorted(members))
            for members in families.values()
            if len(members) > 1
        )
    return result


# --------------------------------------------------------------------------
# Stale dated facts
# --------------------------------------------------------------------------


def find_stale_dates(mem: Memory, today: datetime.date):
    """
    Return (date_str, age_days) for each distinct date in mem.body that is
    more than STALE_MONTHS * 30 days before `today`.

    Keyword-introduced dates (DATE_RE) are collected first, then any remaining
    bare ISO dates (ISO_DATE_RE). Each date string is reported at most once,
    and strings that are not valid calendar dates are skipped.
    """
    limit_days = STALE_MONTHS * 30
    visited = set()
    stale = []
    for pattern in (DATE_RE, ISO_DATE_RE):
        for stamp in (match.group(1) for match in pattern.finditer(mem.body)):
            if stamp in visited:
                continue
            visited.add(stamp)
            try:
                parsed = datetime.date.fromisoformat(stamp)
            except ValueError:
                # e.g. 2024-13-45: shaped like a date but not a real one.
                continue
            days_old = (today - parsed).days
            if days_old > limit_days:
                stale.append((stamp, days_old))
    return stale


# --------------------------------------------------------------------------
# Report
# --------------------------------------------------------------------------


class Report:
    """Line buffer for the report; str() joins the lines with newlines."""

    def __init__(self):
        self.lines: list[str] = []

    def add(self, s: str = ""):
        """Append one line; with no argument, append a blank line."""
        self.lines.extend((s,))

    def __str__(self):
        separator = "\n"
        return separator.join(self.lines)


def slugify_link_target(target: str) -> str:
    """Return the link target's final path component without its last suffix."""
    link_path = Path(target)
    return link_path.stem


def run(args) -> int:
    """
    Analyze the repo memory store, print the report and optionally persist it.

    `args` must provide `apply_safe`, `no_file` and `report_file` (as produced
    by main()'s argument parser). Memory files and MEMORY.md are modified only
    when `args.apply_safe` is set, and then only additively (index lines are
    appended, profile-only files are copied in without overwriting).

    Returns 2 when the memory directory does not exist, otherwise 0.
    """
    repo_root = resolve_claudetools_root()
    mem_dir = repo_root / ".claude" / "memory"
    index_path = mem_dir / "MEMORY.md"

    if not mem_dir.is_dir():
        print(f"[ERROR] memory dir not found: {mem_dir}")
        return 2

    today = datetime.date.today()
    rpt = Report()
    rpt.add("# Memory Dream Report")
    rpt.add(f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M')}")
    rpt.add(f"Repo root: {repo_root}")
    rpt.add(f"Memory store: {mem_dir}")
    rpt.add(f"Mode: {'APPLY-SAFE (additive)' if args.apply_safe else 'REPORT-ONLY'}")
    rpt.add("")

    # Load memories (top-level *.md only; the index itself is not a memory).
    mem_files = sorted(p for p in mem_dir.glob("*.md") if p.name != "MEMORY.md")
    mems = [Memory(p) for p in mem_files]
    mem_by_file = {m.filename: m for m in mems}
    rpt.add(f"Loaded {len(mems)} memory files (excluding MEMORY.md).")
    rpt.add("")

    # ----- 1. INDEX RECONCILE -----
    links, headers, index_lines = parse_index(index_path)
    indexed_targets = {slugify_link_target(t): (title, t, ln)
                       for (title, t, ln, _raw) in links}
    rpt.add("## 1. INDEX RECONCILE")
    rpt.add("")

    orphans = []  # files with no index line
    for m in mems:
        if m.slug not in indexed_targets:
            orphans.append(m)
    rpt.add(f"### Orphan files (no index line): {len(orphans)}")
    for m in orphans:
        rpt.add(f"- [INFO] {m.filename}  (type={m.type or '?'})")
    rpt.add("")

    missing_targets = []  # index lines whose file is missing
    for title, target, ln, _raw in links:
        # Only consider links that look like local memory files.
        tgt = target.strip()
        if tgt.startswith(("http://", "https://")):
            continue
        resolved = (mem_dir / tgt).resolve()
        if not resolved.is_file():
            missing_targets.append((title, target, ln))
    rpt.add(f"### Index lines pointing at missing files: {len(missing_targets)}")
    for title, target, ln in missing_targets:
        rpt.add(f"- [WARNING] line {ln + 1}: [{title}]({target}) -> file not found")
    rpt.add("")

    name_mismatches = []  # frontmatter name vs filename slug
    for m in mems:
        if m.name is None:
            name_mismatches.append((m.filename, "(no name in frontmatter)"))
            continue
        # The convention is loose: name may be a title, not the slug. Only flag
        # when name itself looks like a slug AND differs from the filename slug.
        if re.fullmatch(r"[a-z0-9_]+", m.name.strip()) and m.name.strip() != m.slug:
            name_mismatches.append((m.filename, f"name='{m.name}' != slug='{m.slug}'"))
    rpt.add(f"### Frontmatter name vs filename signals: {len(name_mismatches)}")
    for fn, note in name_mismatches:
        rpt.add(f"- [INFO] {fn}: {note}")
    rpt.add("")

    # ----- 2. BACKLINKS -----
    rpt.add("## 2. BACKLINKS ([[name]] references)")
    rpt.add("")
    known_slugs = {m.slug for m in mems}
    broken_backlinks = []
    for m in mems:
        for bm in BACKLINK_RE.finditer(m.body):
            ref = bm.group(1).strip()
            ref_slug = slugify_link_target(ref)
            if ref_slug not in known_slugs and ref not in known_slugs:
                broken_backlinks.append((m.filename, ref))
    rpt.add(f"### Broken backlinks: {len(broken_backlinks)}")
    for fn, ref in broken_backlinks:
        rpt.add(f"- [WARNING] {fn}: [[{ref}]] has no matching memory file")
    if not broken_backlinks:
        rpt.add("- [OK] no broken backlinks found")
    rpt.add("")

    # ----- 3. REFERENCED-ARTIFACT VALIDITY -----
    rpt.add("## 3. REFERENCED-ARTIFACT VALIDITY (conservative; 'verify', not 'delete')")
    rpt.add("")
    artifact_flags = []
    for m in mems:
        for tok in extract_referenced_paths(m.body):
            if not repo_path_exists(repo_root, tok):
                artifact_flags.append((m.filename, tok))
    rpt.add(f"### Referenced paths not found in repo: {len(artifact_flags)}")
    for fn, tok in artifact_flags:
        rpt.add(f"- [VERIFY] {fn}: `{tok}` not found under repo (may be server-side "
                f"or renamed -- verify, do not auto-delete)")
    if not artifact_flags:
        rpt.add("- [OK] no clearly-stale repo paths detected")
    rpt.add("")

    # ----- 4. DUPLICATE / OVERLAP CLUSTERS -----
    rpt.add("## 4. DUPLICATE / OVERLAP CLUSTERS (PROPOSED merges -- never auto-applied)")
    rpt.add("")
    clusters = cluster_overlaps(mems)
    # Largest clusters first, ties broken by type name.
    clusters.sort(key=lambda c: (-len(c[1]), c[0]))
    rpt.add(f"### Candidate clusters: {len(clusters)}")
    for typ, members in clusters:
        rpt.add(f"- [{typ}] {len(members)} related memories:")
        for f in members:
            mm = mem_by_file.get(f)
            desc = (mm.description or mm.name or "") if mm else ""
            desc = desc[:90]
            rpt.add(f"    - {f}  -- {desc}")
    if not clusters:
        rpt.add("- [OK] no overlap clusters above threshold")
    rpt.add("")

    # ----- 5. STALE DATED FACTS -----
    rpt.add(f"## 5. STALE DATED FACTS (project-type, dated > {STALE_MONTHS} months)")
    rpt.add("")
    stale_hits = []
    for m in mems:
        if (m.type or "") != "project":
            continue
        hits = find_stale_dates(m, today)
        if hits:
            stale_hits.append((m.filename, hits))
    rpt.add(f"### Project memories with stale dated claims: {len(stale_hits)}")
    for fn, hits in stale_hits:
        for ds, age in hits:
            rpt.add(f"- [VERIFY] {fn}: dated {ds} (~{age} days old) -- re-verify")
    if not stale_hits:
        rpt.add("- [OK] no stale dated project facts")
    rpt.add("")

    # ----- 6. DRIFT vs PROFILE STORE -----
    rpt.add("## 6. DRIFT vs HARNESS PROFILE STORE")
    rpt.add("")
    prof_dir = profile_memory_dir(repo_root)
    profile_only = []
    repo_only = []
    conflicts = []
    if prof_dir is None:
        rpt.add("- [INFO] profile memory dir not found; skipping drift check.")
    else:
        rpt.add(f"Profile store: {prof_dir}")
        rpt.add("")
        prof_files = {p.name for p in prof_dir.glob("*.md") if p.name != "MEMORY.md"}
        repo_files = {m.filename for m in mems}

        for pf in sorted(prof_files - repo_files):
            profile_only.append(pf)
        for rf in sorted(repo_files - prof_files):
            repo_only.append(rf)
        for both in sorted(prof_files & repo_files):
            a = (prof_dir / both).read_text(encoding="utf-8", errors="replace")
            b = (mem_dir / both).read_text(encoding="utf-8", errors="replace")
            if a != b:
                conflicts.append(both)

        rpt.add(f"### Profile-only (candidates to MIGRATE INTO repo): {len(profile_only)}")
        for f in profile_only:
            rpt.add(f"- [INFO] {f}")
        rpt.add("")
        rpt.add(f"### Repo-only (candidates to PUSH OUT to profile): {len(repo_only)}")
        for f in repo_only:
            rpt.add(f"- [INFO] {f}")
        rpt.add("")
        rpt.add(f"### Present in BOTH but differing (CONFLICT -- human review): "
                f"{len(conflicts)}")
        for f in conflicts:
            rpt.add(f"- [WARNING] {f}: content differs between repo and profile")
        rpt.add("")

    # ----- APPLY-SAFE ACTIONS (additive-only) -----
    actions_taken = []
    if args.apply_safe:
        rpt.add("## APPLY-SAFE ACTIONS PERFORMED (additive-only)")
        rpt.add("")

        # (a) Append missing index lines for orphan files.
        if orphans and index_path.is_file():
            appended = append_index_lines(index_path, orphans, index_lines, headers)
            for line, hdr in appended:
                actions_taken.append(f"INDEX += [{hdr}] {line}")
                rpt.add(f"- [OK] appended index line under ## {hdr}: {line}")
        elif orphans:
            rpt.add("- [WARNING] orphans exist but MEMORY.md missing; nothing appended")

        # (b) Copy profile-only files INTO repo (never overwrite).
        # NOTE: files copied here were not part of `orphans` above, so they get
        # no index line in this run.
        if prof_dir is not None:
            for f in profile_only:
                src = prof_dir / f
                dst = mem_dir / f
                if dst.exists():
                    rpt.add(f"- [SKIP] {f}: already exists in repo (not overwriting)")
                    continue
                shutil.copy2(src, dst)
                actions_taken.append(f"COPIED profile->repo: {f}")
                rpt.add(f"- [OK] copied profile-only file into repo: {f}")
        if not actions_taken:
            rpt.add("- [INFO] no additive actions were necessary")
        rpt.add("")

    # ----- SUMMARY -----
    rpt.add("## SUMMARY")
    rpt.add("")
    rpt.add(f"- memory files:                 {len(mems)}")
    rpt.add(f"- orphan files (no index):      {len(orphans)}")
    rpt.add(f"- index -> missing file:        {len(missing_targets)}")
    rpt.add(f"- name/filename signals:        {len(name_mismatches)}")
    rpt.add(f"- broken backlinks:             {len(broken_backlinks)}")
    rpt.add(f"- stale referenced paths:       {len(artifact_flags)}")
    rpt.add(f"- overlap clusters:             {len(clusters)}")
    rpt.add(f"- stale dated project facts:    {len(stale_hits)}")
    rpt.add(f"- profile-only files:           {len(profile_only)}")
    rpt.add(f"- repo-only files:              {len(repo_only)}")
    rpt.add(f"- repo<->profile conflicts:     {len(conflicts)}")
    if args.apply_safe:
        rpt.add(f"- additive actions performed:   {len(actions_taken)}")
    rpt.add("")
    rpt.add("## PROPOSED (needs human approval -- NEVER auto-applied)")
    rpt.add("")
    n_prop = 0
    for typ, members in clusters:
        n_prop += 1
        rpt.add(f"- [MERGE?] consolidate {len(members)} '{typ}' memories: "
                f"{', '.join(members)}")
    for fn, hits in stale_hits:
        n_prop += 1
        rpt.add(f"- [REVERIFY?] {fn} (dated facts) -- confirm still true, then update")
    for fn, tok in artifact_flags:
        n_prop += 1
        rpt.add(f"- [STALE-REF?] {fn} references `{tok}` -- confirm/repoint or note moved")
    for title, target, ln in missing_targets:
        n_prop += 1
        rpt.add(f"- [INDEX-CLEANUP?] MEMORY.md line {ln + 1} points at missing "
                f"{target} -- human decides keep/remove")
    if prof_dir is not None:
        for f in conflicts:
            n_prop += 1
            rpt.add(f"- [DRIFT-RESOLVE?] {f} differs repo vs profile -- human picks "
                    f"winner (sync-memory.sh leaves both untouched)")
    if n_prop == 0:
        rpt.add("- [OK] nothing proposed; memory store is clean")
    rpt.add("")

    out = str(rpt)
    print(out)

    # Write report file unless suppressed.
    if not args.no_file:
        if args.report_file:
            rpath = Path(args.report_file)
        else:
            stamp = datetime.datetime.now().strftime("%Y-%m-%d-%H%M")
            rpath = mem_dir / "_reports" / f"{stamp}-dream.md"
        # Create the directory that will actually receive the report: _reports/
        # by default, or the parent of an explicit --report-file path (so an
        # explicit path no longer leaves an unused _reports/ dir behind).
        rpath.parent.mkdir(parents=True, exist_ok=True)
        rpath.write_text(out + "\n", encoding="utf-8")
        print(f"\n[INFO] report written: {rpath}")

    return 0


def append_index_lines(index_path: Path, orphans, index_lines, headers):
    """
    Additive only: append a '- [Name](file.md) -- description' line for each
    orphan under the correct '## <Header>' section. Never reorders or removes
    existing lines. If a header doesn't exist, append it at end of file.

    The target header comes from TYPE_HEADER via the memory's type; memories
    with a missing or unknown type go under "Project". The index is re-read
    from `index_path`; `index_lines` and `headers` are accepted for signature
    compatibility but are not used.

    Returns list of (line_text, header_used).
    """
    text = index_path.read_text(encoding="utf-8", errors="replace")
    # Remember whether the file ended with a newline: appending a brand-new
    # section at EOF consumes the final empty split element, which would
    # otherwise leave the rewritten file without its trailing newline.
    had_trailing_newline = text.endswith("\n")
    lines = text.split("\n")
    appended = []

    # Group orphans by target header.
    by_header: dict[str, list[Memory]] = {}
    for m in orphans:
        hdr = TYPE_HEADER.get(m.type or "", None)
        if hdr is None:
            hdr = "Project"  # safe default bucket; human can recategorize
        by_header.setdefault(hdr, []).append(m)

    def build_line(m: Memory) -> str:
        """Format one index bullet; the description hook is optional."""
        title = m.name or m.slug
        hook = (m.description or "").strip()
        if hook:
            return f"- [{title}]({m.filename}) -- {hook}"
        return f"- [{title}]({m.filename})"

    for hdr, members in by_header.items():
        # Find header line index (re-scanned each time: `lines` grows as we go).
        hidx = None
        for i, ln in enumerate(lines):
            hm = re.match(r"^##\s+(.+?)\s*$", ln)
            if hm and hm.group(1).strip() == hdr:
                hidx = i
                break

        new_lines = [build_line(m) for m in members]

        if hidx is None:
            # Append a fresh section at end of file, separated by a blank line.
            if lines and lines[-1].strip() != "":
                lines.append("")
            lines.append(f"## {hdr}")
            lines.extend(new_lines)
            appended.extend((nl, hdr) for nl in new_lines)
            continue

        # Find end of this section: next '## ' or EOF.
        end = len(lines)
        for j in range(hidx + 1, len(lines)):
            if re.match(r"^##\s+", lines[j]):
                end = j
                break
        # Insert after the last non-blank line of the section.
        insert_at = end
        while insert_at - 1 > hidx and lines[insert_at - 1].strip() == "":
            insert_at -= 1
        for off, nl in enumerate(new_lines):
            lines.insert(insert_at + off, nl)
            appended.append((nl, hdr))

    out = "\n".join(lines)
    if had_trailing_newline and not out.endswith("\n"):
        out += "\n"
    index_path.write_text(out, encoding="utf-8")
    return appended


def main() -> int:
    """
    Command-line entry point: parse the flags and delegate to run().

    Returns run()'s exit code, or 130 when the run is interrupted with Ctrl-C.
    """
    parser = argparse.ArgumentParser(
        description="Memory lint + consolidation analyzer (additive-only)."
    )
    apply_safe_help = (
        "Perform ONLY additive fixes (append index lines, copy profile-only "
        "files into repo). Never deletes/overwrites/merges."
    )
    parser.add_argument("--apply-safe", action="store_true", help=apply_safe_help)
    parser.add_argument(
        "--no-file",
        action="store_true",
        help="Print report to stdout only; do not write a _reports/ file.",
    )
    parser.add_argument(
        "--report-file",
        default=None,
        help="Explicit path for the report file (overrides _reports/ default).",
    )
    options = parser.parse_args()

    try:
        exit_code = run(options)
    except KeyboardInterrupt:
        print("[ERROR] interrupted")
        exit_code = 130
    return exit_code


# Script entry point: run main() and use its return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())