# claudetools/temp/vwp_resolve_victims.py

"""
Valley Wide Plastering - Resolve victim email addresses from display names.

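Strategy:
1. Load victim names from vwp_victim_emails.json
2. Pull ALL contacts from JR's mailbox via Graph API
3. Search JR's sent items for Box.com invitation emails
4. Search JR's messages for emails from box.com containing "invited",
   plus Box collaboration/sharing emails
5. Look names up in the tenant directory (GAL) and the People API,
   and search JR's messages for mail sent from each victim name
6. Match victim names against contacts + email extractions + lookups
7. Output resolved and unresolved lists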
"""

import json
import os
import re
import sys
import time
from collections import defaultdict

import requests

# --- Configuration ---
# Every value below can be overridden with an environment variable so the
# script can be pointed at another tenant, app registration, mailbox or
# working directory without editing the source. The literals are the
# fallbacks used when the variable is not set.
TENANT_ID = os.environ.get("VWP_TENANT_ID", "5c53ae9f-7071-4248-b834-8685b646450f")
APP_ID = os.environ.get("VWP_APP_ID", "fabb3421-8b34-484b-bc17-e46de9703418")
# SECURITY: this fallback is a client secret committed in plain text. Anyone
# who can read the file can request tokens as this app. Rotate the secret and
# supply it through VWP_APP_SECRET (or a secret store) instead.
APP_SECRET = os.environ.get("VWP_APP_SECRET", "~QJ8Q~NyQSs4OcGqHZyPrA2CVnq9KBfKiimntbMO")
# Graph user id of the mailbox ("JR") that every /users/{id}/... call targets.
JR_USER_ID = os.environ.get("VWP_JR_USER_ID", "0af923d0-48c5-4cc1-8553-c60625802815")

# JSON file read by main() and JSON file main() writes its results to.
INPUT_FILE = os.environ.get("VWP_INPUT_FILE", r"D:\ClaudeTools\temp\vwp_victim_emails.json")
OUTPUT_FILE = os.environ.get("VWP_OUTPUT_FILE", r"D:\ClaudeTools\temp\vwp_resolved_victims.json")

# Root of the Microsoft Graph v1.0 REST API; endpoint paths are appended to it.
GRAPH_BASE = "https://graph.microsoft.com/v1.0"


def get_token(timeout=30):
    """Request a Microsoft Graph access token with the client-credentials grant.

    Posts APP_ID / APP_SECRET to the TENANT_ID token endpoint, asking for the
    ``https://graph.microsoft.com/.default`` scope.

    Args:
        timeout: Seconds to wait for the token endpoint. Without a timeout
            ``requests`` waits indefinitely, which would hang the whole run.

    Returns:
        The ``access_token`` string from the JSON response.

    Raises:
        requests.HTTPError: The endpoint answered with an error status.
        requests.Timeout: No response arrived within ``timeout`` seconds.
        KeyError: The response JSON carried no ``access_token`` field.
    """
    url = f"https://login.microsoftonline.com/{TENANT_ID}/oauth2/v2.0/token"
    data = {
        "client_id": APP_ID,
        "client_secret": APP_SECRET,
        "scope": "https://graph.microsoft.com/.default",
        "grant_type": "client_credentials",
    }
    r = requests.post(url, data=data, timeout=timeout)
    r.raise_for_status()
    return r.json()["access_token"]


def graph_get_all(token, url, params=None, timeout=30, max_retries=5):
    """Page through all results from a Graph API endpoint.

    Follows ``@odata.nextLink`` until the service stops returning one and
    concatenates the ``value`` arrays of every page.

    Args:
        token: Bearer token sent in the Authorization header.
        url: First page URL. A falsy value yields an empty list.
        params: Query parameters for the first request only; a nextLink
            already embeds them, so later pages are requested without params.
        timeout: Seconds to wait for each HTTP response.
        max_retries: How many consecutive HTTP 429 answers are retried for a
            single page before the 429 is raised as an error.

    Returns:
        A list with the items of all pages, in the order received.

    Raises:
        requests.HTTPError: A page answered with an error status, or stayed
            throttled after ``max_retries`` retries.
        requests.Timeout: A response did not arrive within ``timeout``.
    """
    headers = {"Authorization": f"Bearer {token}"}
    results = []
    next_url = url
    throttled = 0  # consecutive 429 answers for the page being fetched
    while next_url:
        r = requests.get(next_url, headers=headers, params=params, timeout=timeout)
        if r.status_code == 429 and throttled < max_retries:
            throttled += 1
            try:
                retry = max(0, int(r.headers.get("Retry-After", 5)))
            except (TypeError, ValueError):
                # Header present but not a plain number of seconds.
                retry = 5
            print(f"  [THROTTLED] Waiting {retry}s...")
            time.sleep(retry)
            continue
        # Also raises for a 429 once the retry budget is exhausted, so a
        # permanently throttled endpoint cannot loop forever.
        r.raise_for_status()
        throttled = 0
        data = r.json()
        results.extend(data.get("value", []))
        next_url = data.get("@odata.nextLink")
        params = None  # nextLink already has params
    return results


def normalize(name):
    """Reduce a display name to a canonical form for comparison.

    Parenthesised segments such as "(Contractor)" are replaced by a space,
    runs of digits are deleted, the text is lowercased and every run of
    whitespace collapses to one space. A falsy name gives the empty string.
    """
    if not name:
        return ""
    cleaned = name
    # Order matters: parentheticals first, then digits.
    for pattern, replacement in ((r'\s*\(.*?\)\s*', ' '), (r'\d+', '')):
        cleaned = re.sub(pattern, replacement, cleaned)
    tokens = cleaned.lower().split()
    return ' '.join(tokens)


def name_variants(name):
    """Generate matching variants for a name.

    All variants are derived from ``normalize(name)``, so parentheticals and
    digits never leak into them. Commas are treated as separators when the
    name is split into tokens, which means "Smith,John" (no space) is handled
    like "Smith, John" and no variant ends with a stray comma.

    Args:
        name: Display name in "First Last" or "Last, First" form; may be
            empty or None.

    Returns:
        A set of lowercase strings: the normalized name itself and, for names
        with at least two tokens, "first last" / "last first" built from the
        outer tokens; for comma names also both orders of the first two
        tokens and "<given names> <surname>". The set is empty when the name
        normalizes to nothing, so nameless entries do not all share the key
        "".
    """
    n = normalize(name)
    variants = set()
    if not n:
        return variants
    variants.add(n)

    tokens = n.replace(',', ' ').split()
    if len(tokens) < 2:
        return variants

    if ',' in n:
        # "Last, First" -> "first last" (and the unswapped order)
        variants.add(f"{tokens[1]} {tokens[0]}")
        variants.add(f"{tokens[0]} {tokens[1]}")
        # Multi-word parts: "Van Buren, Martin" -> "martin van buren"
        surname, _, given = n.partition(',')
        surname = ' '.join(surname.split())
        given = ' '.join(given.replace(',', ' ').split())
        if surname and given:
            variants.add(f"{given} {surname}")

    # first last
    variants.add(f"{tokens[0]} {tokens[-1]}")
    # last first
    variants.add(f"{tokens[-1]} {tokens[0]}")
    return variants


def extract_emails_from_text(text):
    """Return the distinct email addresses found in *text*.

    Addresses are matched case-sensitively and returned as a list in no
    particular order. A falsy *text* gives an empty list.
    """
    if not text:
        return []
    email_re = r'[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}'
    distinct = {hit.group(0) for hit in re.finditer(email_re, text)}
    return list(distinct)


# Seconds to wait for each direct Graph request made by the helpers below.
_HTTP_TIMEOUT = 30
# How many times a single per-name lookup is retried after an HTTP 429.
_MAX_THROTTLE_RETRIES = 3
# Substrings that mark an address as obviously not a victim address.
_EXCLUDE_PATTERNS = ['box.com', 'noreply', 'valleywideplastering', 'buildingconnected.com', 'team@', 'no-reply', 'donotreply']


def _is_candidate_email(addr_lower):
    """Return True if a lowercased address is not a Box, noreply or valleywide one."""
    return "box.com" not in addr_lower and "noreply" not in addr_lower and "valleywide" not in addr_lower


def _drop_excluded(emails):
    """Return the addresses in *emails* that contain '@' and no excluded pattern."""
    return {e for e in emails if e and '@' in e and not any(p in e for p in _EXCLUDE_PATTERNS)}


def _body_text(message):
    """Return the body content of a Graph message, or "" when body/content is null."""
    return (message.get("body") or {}).get("content") or ""


def _candidate_body_emails(message):
    """Return lowercased candidate addresses found in a message body."""
    lowered = (em.lower() for em in extract_emails_from_text(_body_text(message)))
    return {em for em in lowered if _is_candidate_email(em)}


def _dedupe_by_id(messages):
    """Return *messages* with repeated message ids removed, keeping first occurrences."""
    seen_ids = set()
    unique = []
    for m in messages:
        mid = m.get("id", "")
        if mid not in seen_ids:
            seen_ids.add(mid)
            unique.append(m)
    return unique


def _search_messages(token, url, queries, select, label):
    """Run each quoted $search query against *url* and return all hits (not deduplicated)."""
    found = []
    for search_q in queries:
        params = {"$search": f'"{search_q}"', "$top": "200", "$select": select}
        try:
            results = graph_get_all(token, url, params)
            found.extend(results)
            print(f"  Found {len(results)} {label} matching '{search_q}'")
        except Exception as e:
            # Best effort: one failing query must not abort the other strategies.
            print(f"  [WARNING] Search for '{search_q}' failed: {e}")
    return found


def _index_recipients(message, fields, target):
    """Add every named recipient in *fields* to *target* under its name variants."""
    for field in fields:
        for recip in message.get(field, []) or []:
            ea = recip.get("emailAddress") or {}
            name = ea.get("name", "")
            addr = ea.get("address", "")
            if name and addr:
                for v in name_variants(name):
                    if v:
                        target[v].add(addr.lower())


def _index_person(target, display_name, given, surname, emails):
    """Index lowercased *emails* under display-name variants and "given surname"."""
    for v in name_variants(display_name):
        if v:
            target[v].update(emails)
    if given and surname:
        target[f"{given} {surname}".lower().strip()].update(emails)


def _search_phrase(name):
    """Return *name* without double quotes so it cannot end the quoted $search phrase early."""
    return name.replace('"', '')


def _get_with_throttle_retry(url, token, params):
    """GET *url* once, retrying a bounded number of times while Graph answers 429."""
    headers = {"Authorization": f"Bearer {token}"}
    attempt = 0
    while True:
        r = requests.get(url, headers=headers, params=params, timeout=_HTTP_TIMEOUT)
        if r.status_code != 429 or attempt >= _MAX_THROTTLE_RETRIES:
            return r
        attempt += 1
        try:
            retry = max(0, int(r.headers.get("Retry-After", 5)))
        except (TypeError, ValueError):
            retry = 5
        print(f"  [THROTTLED] Waiting {retry}s...")
        time.sleep(retry)


def main():
    """Resolve victim display names to email addresses and write a JSON report.

    Reads INPUT_FILE, authenticates against Graph, builds name -> email maps
    from contacts, sent items, Box notification/collaboration mail, the
    tenant directory, the People API and per-name "from:" mail searches,
    matches every victim name against those maps, writes the result to
    OUTPUT_FILE and prints a summary.

    Each strategy is best effort: a failing Graph call is reported and the
    remaining strategies still run.

    Raises:
        OSError: INPUT_FILE cannot be read or OUTPUT_FILE cannot be written.
        KeyError: INPUT_FILE lacks one of the two expected top-level keys.
        Exception: Whatever get_token() raises when authentication fails.
    """
    # Load victim data. utf-8-sig reads UTF-8 with or without a BOM and does
    # not depend on the platform's default code page.
    with open(INPUT_FILE, 'r', encoding='utf-8-sig') as f:
        victim_data = json.load(f)

    name_only_victims = victim_data["victims_identified_by_name_only"]
    already_resolved = victim_data["confirmed_victim_emails_from_box_acceptance"]
    print(f"[INFO] {len(name_only_victims)} victims to resolve by name")
    print(f"[INFO] {len(already_resolved)} already resolved")

    # Get token
    print("[INFO] Authenticating...")
    token = get_token()
    print("[OK] Token acquired")

    messages_url = f"{GRAPH_BASE}/users/{JR_USER_ID}/messages"
    notification_select = "subject,body,from,toRecipients,ccRecipients,sentDateTime"

    # --- Strategy 1: Pull JR's contacts ---
    print("\n[INFO] Pulling JR's contacts...")
    contacts = []
    try:
        contacts_url = f"{GRAPH_BASE}/users/{JR_USER_ID}/contacts"
        contacts = graph_get_all(token, contacts_url, {"$top": "999", "$select": "displayName,emailAddresses,givenName,surname"})
        print(f"[OK] Got {len(contacts)} contacts")
    except Exception as e:
        print(f"[WARNING] Contacts API failed (likely missing Contacts.Read permission): {e}")
        print("[INFO] Will rely on mail search and GAL lookup instead")

    # Build contact lookup: normalized name -> set of emails
    contact_map = defaultdict(set)
    for c in contacts:
        emails = [e["address"].lower() for e in (c.get("emailAddresses") or []) if e.get("address")]
        if not emails:
            continue
        _index_person(contact_map, c.get("displayName", ""), c.get("givenName", ""), c.get("surname", ""), emails)

    # --- Strategy 2: Search JR's sent items for Box invitation emails ---
    print("\n[INFO] Searching JR's sent items for Box.com invitations...")
    unique_sent = _dedupe_by_id(_search_messages(
        token,
        f"{GRAPH_BASE}/users/{JR_USER_ID}/mailFolders/sentitems/messages",
        ["box.com invitation", "box.com invited", "has been invited to"],
        "subject,body,toRecipients,ccRecipients,bccRecipients,sentDateTime",
        "sent messages",
    ))
    print(f"[OK] {len(unique_sent)} unique sent messages found")

    # Extract name->email mappings from sent items
    sent_map = defaultdict(set)
    sent_body_emails = set()
    for m in unique_sent:
        _index_recipients(m, ("toRecipients", "ccRecipients", "bccRecipients"), sent_map)
        # Body addresses carry no name, so they cannot be matched to a victim
        # here; keep them so they show up in the unmatched report below
        # instead of being dropped.
        sent_body_emails.update(_candidate_body_emails(m))

    # --- Strategy 3: Search JR's messages for emails FROM box.com ---
    print("\n[INFO] Searching JR's inbox for Box.com notification emails...")
    unique_inbox = _dedupe_by_id(_search_messages(
        token,
        messages_url,
        ["from:box.com invited", "from:box.com invitation", "from:noreply@box.com"],
        notification_select,
        "inbox messages",
    ))
    print(f"[OK] {len(unique_inbox)} unique inbox messages found")

    # Extract from inbox - look for victim names and emails in body
    inbox_map = defaultdict(set)
    all_body_emails = set()
    for m in unique_inbox:
        body_emails = extract_emails_from_text(_body_text(m))
        all_body_emails.update(em.lower() for em in body_emails if _is_candidate_email(em.lower()))

        # Check recipients
        _index_recipients(m, ("toRecipients", "ccRecipients"), inbox_map)

        # Use the local part of each body address as a potential name hint,
        # e.g. "john.smith@..." is indexed under "john smith".
        for em in body_emails:
            em_lower = em.lower()
            if "box.com" in em_lower or "noreply" in em_lower:
                continue
            local_part = em.split('@')[0]
            local_clean = re.sub(r'[._\-\d]+', ' ', local_part).strip().lower()
            if len(local_clean) > 2:
                inbox_map[local_clean].add(em_lower)

    print(f"[INFO] Extracted {len(all_body_emails)} unique non-Box emails from inbox bodies")
    # Merged only after the inbox count is printed so that message stays accurate.
    all_body_emails |= sent_body_emails

    # --- Strategy 4: Search for Box collaboration/sharing emails specifically ---
    print("\n[INFO] Searching for Box collaboration emails...")
    collab_emails = _search_messages(
        token,
        messages_url,
        ["box.com collaborate", "shared a file with you", "shared a folder with you"],
        notification_select,
        "messages",
    )
    for m in collab_emails:
        all_body_emails.update(_candidate_body_emails(m))

    # --- Strategy 5: Search tenant directory (GAL) for victim names ---
    print("\n[INFO] Searching tenant directory (GAL) for victim names...")
    gal_map = defaultdict(set)
    try:
        users_url = f"{GRAPH_BASE}/users"
        all_users = graph_get_all(token, users_url, {"$top": "999", "$select": "displayName,mail,userPrincipalName,givenName,surname"})
        print(f"[OK] Got {len(all_users)} directory users")
        for u in all_users:
            mail = u.get("mail", "") or u.get("userPrincipalName", "")
            if not mail:
                continue
            _index_person(gal_map, u.get("displayName", ""), u.get("givenName", ""), u.get("surname", ""), [mail.lower()])
    except Exception as e:
        print(f"[WARNING] Directory users lookup failed: {e}")

    # --- Strategy 6: Try People API for broader name resolution ---
    print("\n[INFO] Searching People API for victim names...")
    people_map = defaultdict(set)
    # Only search for names that are specific enough (2+ words, not generic)
    specific_names = [n for n in name_only_victims if len(n.split()) >= 2 and len(n) > 5]
    searched = 0
    people_url = f"{GRAPH_BASE}/users/{JR_USER_ID}/people"
    for victim_name in specific_names:
        params = {
            "$search": f'"{_search_phrase(victim_name)}"',
            "$top": "5",
            "$select": "displayName,scoredEmailAddresses,givenName,surname",
        }
        try:
            r = _get_with_throttle_retry(people_url, token, params)
            if r.status_code == 403:
                # No permission: every further lookup would fail the same way.
                print("  [WARNING] People API returned 403 - skipping")
                break
            if r.status_code == 200:
                for p in r.json().get("value", []):
                    pemails = [e["address"].lower() for e in (p.get("scoredEmailAddresses") or []) if e.get("address")]
                    if pemails:
                        for v in name_variants(p.get("displayName", "")):
                            if v:
                                people_map[v].update(pemails)
            searched += 1
            if searched % 50 == 0:
                print(f"  Searched {searched}/{len(specific_names)} names...")
        except Exception as e:
            # Keep going on individual failures, but leave a trace of them.
            print(f"  [WARNING] People API lookup for '{victim_name}' failed: {e}")

    print(f"[OK] People API searched for {searched} names, found {len(people_map)} name entries")

    # --- Strategy 7: Search JR's mail for each unresolved name directly ---
    # This catches cases where someone emailed JR and their display name matches
    print("\n[INFO] Searching JR's mailbox for unresolved victim names...")
    mail_search_map = defaultdict(set)
    mail_searched = 0
    for victim_name in name_only_victims:
        # Skip single-word or very short names - too many false positives
        if len(victim_name.split()) < 2 or len(victim_name) < 5:
            continue
        victim_norm = normalize(victim_name)
        # Commas are separators here so "smith, john" equals "john smith".
        victim_tokens = set(victim_norm.replace(',', ' ').split())
        params = {
            "$search": f'"from:{_search_phrase(victim_name)}"',
            "$top": "5",
            "$select": "from,subject",
        }
        try:
            r = _get_with_throttle_retry(messages_url, token, params)
            if r.status_code == 200:
                for msg in r.json().get("value", []):
                    fr = (msg.get("from") or {}).get("emailAddress") or {}
                    fname = fr.get("name", "")
                    faddr = fr.get("address", "")
                    if not (fname and faddr):
                        continue
                    # Require a strong match: same normalized name or the
                    # same set of name tokens in any order.
                    fname_norm = normalize(fname)
                    if not fname_norm:
                        continue
                    if fname_norm == victim_norm or set(fname_norm.replace(',', ' ').split()) == victim_tokens:
                        mail_search_map[victim_norm].add(faddr.lower())
            mail_searched += 1
            if mail_searched % 50 == 0:
                print(f"  Searched {mail_searched} names...")
        except Exception as e:
            print(f"  [WARNING] Mail search for '{victim_name}' failed: {e}")

    print(f"[OK] Mail search completed for {mail_searched} names, found {len(mail_search_map)} matches")

    # --- Now resolve victims ---
    print("\n[INFO] Resolving victim names to email addresses...")
    resolved = {}
    unresolved = []
    resolution_source = {}
    variant_lookups = (
        ("contacts", contact_map),
        ("sent_items", sent_map),
        ("inbox", inbox_map),
        ("directory", gal_map),
        ("people_api", people_map),
    )

    for victim_name in name_only_victims:
        found_emails = set()
        sources = set()

        victim_variants = [v for v in name_variants(victim_name) if v]
        for label, mapping in variant_lookups:
            for v in victim_variants:
                if v in mapping:
                    # Filter before crediting the source, so a source is only
                    # listed when it contributed a usable address.
                    hits = _drop_excluded(mapping[v])
                    if hits:
                        found_emails.update(hits)
                        sources.add(label)

        # Check direct mail search (keyed by the normalized name only)
        vn = normalize(victim_name)
        if vn in mail_search_map:
            hits = _drop_excluded(mail_search_map[vn])
            if hits:
                found_emails.update(hits)
                sources.add("mail_from_search")

        if found_emails:
            resolved[victim_name] = sorted(found_emails)
            # Sorted so repeated runs produce identical output.
            resolution_source[victim_name] = sorted(sources)
        else:
            unresolved.append(victim_name)

    # --- Build output ---
    all_resolved_emails = set()
    for emails in resolved.values():
        all_resolved_emails.update(emails)

    # Combine with already-known emails
    all_victim_emails = set(e.lower() for e in already_resolved) | all_resolved_emails
    unmatched_body_emails = sorted(all_body_emails - all_victim_emails)

    output = {
        "investigation": "Valley Wide Plastering BEC - Victim Email Resolution",
        "run_date": time.strftime("%Y-%m-%d %H:%M:%S"),
        "summary": {
            "previously_resolved": len(already_resolved),
            "newly_resolved_by_name": len(resolved),
            "still_unresolved": len(unresolved),
            "total_unique_victim_emails": len(all_victim_emails),
            "total_victims_identified": len(already_resolved) + len(resolved) + len(unresolved),
        },
        "all_victim_emails_combined": sorted(all_victim_emails),
        "newly_resolved": {
            name: {
                "emails": emails,
                "source": resolution_source.get(name, [])
            }
            for name, emails in sorted(resolved.items())
        },
        "previously_confirmed_emails": sorted(already_resolved, key=str.lower),
        "unresolved_names": sorted(unresolved, key=lambda x: x.lower()),
        "body_emails_found_but_unmatched": unmatched_body_emails,
    }

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2)

    # --- Print summary ---
    print("\n" + "=" * 60)
    print("RESOLUTION RESULTS")
    print("=" * 60)
    print(f"Previously resolved emails:     {len(already_resolved)}")
    print(f"Newly resolved by name:         {len(resolved)}")
    print(f"Still unresolved:               {len(unresolved)}")
    print(f"Total unique victim emails:     {len(all_victim_emails)}")
    print(f"Unmatched body emails found:    {len(unmatched_body_emails)}")
    print()

    if resolved:
        print("--- Newly Resolved ---")
        for name, emails in sorted(resolved.items()):
            src = ", ".join(resolution_source.get(name, []))
            print(f"  {name}: {', '.join(emails)}  [{src}]")
        print()

    if unresolved:
        print(f"--- Unresolved ({len(unresolved)} names) ---")
        for name in sorted(unresolved, key=lambda x: x.lower()):
            print(f"  {name}")

    print(f"\n[OK] Results saved to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()