# claudetools/temp/vwp_extract_victim_emails.py

#!/usr/bin/env python3
"""
Extract victim email addresses from Box.com acceptance notifications
in JR's compromised mailbox (Valley Wide Plastering BEC investigation).

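Strategy:
1. Search acceptance notifications for email addresses in body/subject
2. Extract display names from subjects where no email found
3. Search JR's Sent Items and Box invitation notifications for recipients
4. Search Box "shared" notifications for further addresses
Names that never yield an address are listed separately so they can be
cross-referenced to emails by hand.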
"""

import json
import os
import re
import subprocess
import sys
import time
import urllib.parse

# Credentials and identifiers for the Microsoft Graph client-credentials flow
# (see get_token) and the mailbox whose messages are queried. Each value can
# be supplied through the environment; the literals are kept only as a
# backward-compatible fallback.
#
# SECURITY: the CLIENT_SECRET fallback below is a credential committed in
# plaintext. Treat it as exposed: rotate it, provide the new value through
# VWP_CLIENT_SECRET, and then delete the literal from this file.
TENANT_ID = os.environ.get("VWP_TENANT_ID", "5c53ae9f-7071-4248-b834-8685b646450f")
CLIENT_ID = os.environ.get("VWP_CLIENT_ID", "fabb3421-8b34-484b-bc17-e46de9703418")
CLIENT_SECRET = os.environ.get("VWP_CLIENT_SECRET", "~QJ8Q~NyQSs4OcGqHZyPrA2CVnq9KBfKiimntbMO")
USER_ID = os.environ.get("VWP_USER_ID", "0af923d0-48c5-4cc1-8553-c60625802815")
GRAPH_BASE = "https://graph.microsoft.com/v1.0"

def get_token():
    """Obtain a Microsoft Graph access token via the client-credentials grant.

    Shells out to ``curl`` to POST the form to the tenant's OAuth 2.0 token
    endpoint and parses the JSON reply.

    Returns:
        The string found under ``access_token`` in the response.

    Raises:
        SystemExit: with status 1, after printing an ``[ERROR]`` line, when
            curl cannot be started, its output is not a JSON object, or the
            response carries no ``access_token``.
    """
    url = f"https://login.microsoftonline.com/{TENANT_ID}/oauth2/v2.0/token"
    # urlencode() escapes every field, so a secret containing '&', '+', '='
    # or '%' cannot corrupt the form body.
    form = urllib.parse.urlencode({
        "client_id": CLIENT_ID,
        "scope": "https://graph.microsoft.com/.default",
        "client_secret": CLIENT_SECRET,
        "grant_type": "client_credentials",
    })
    try:
        # "-d @-" makes curl read the body from stdin, which keeps the client
        # secret out of the process argument list. "-S" keeps curl's own
        # error text on stderr despite "-s".
        result = subprocess.run(
            [
                "curl", "-sS", "-X", "POST", url,
                "-H", "Content-Type: application/x-www-form-urlencoded",
                "-d", "@-",
            ],
            input=form,
            capture_output=True,
            text=True,
            # Decode explicitly: the locale default (e.g. cp1252 on Windows)
            # is not what the endpoint sends.
            encoding="utf-8",
            errors="replace",
        )
    except OSError as exc:
        print(f"[ERROR] Failed to get token: could not run curl ({exc})")
        sys.exit(1)

    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError:
        data = None

    if not isinstance(data, dict):
        detail = (
            result.stderr.strip()
            or result.stdout.strip()[:300]
            or f"curl exited with code {result.returncode}"
        )
        print(f"[ERROR] Failed to get token: {detail}")
        sys.exit(1)
    if "access_token" not in data:
        print(f"[ERROR] Failed to get token: {json.dumps(data, indent=2)}")
        sys.exit(1)
    print("[OK] Got access token")
    return data["access_token"]


def graph_get(token, url):
    """Issue a GET request with ``curl`` and return the decoded JSON object.

    The request carries the bearer token and asks for plain-text message
    bodies through the ``Prefer: outlook.body-content-type=text`` header.

    Args:
        token: Bearer token placed in the ``Authorization`` header.
        url: Fully formed request URL, query string included.

    Returns:
        The response as a dict. Transport-level problems never raise here;
        they come back as a dict with an ``"error"`` key (and, when there was
        output, a truncated ``"raw"`` copy of it) so that callers testing for
        ``"value"`` keep working.
    """
    result = subprocess.run(
        [
            "curl", "-s", "-X", "GET", url,
            "-H", f"Authorization: Bearer {token}",
            "-H", "Content-Type: application/json",
            "-H", "Prefer: outlook.body-content-type=text",
        ],
        capture_output=True,
        text=True,
        # Decode as UTF-8 rather than the locale code page, which would
        # mangle or reject non-ASCII names on Windows.
        encoding="utf-8",
        errors="replace",
    )
    payload = result.stdout.strip()
    if not payload:
        return {"error": "empty response"}
    try:
        data = json.loads(payload)
    except json.JSONDecodeError as exc:
        # e.g. an HTML error page from a proxy or gateway.
        return {"error": f"invalid JSON response: {exc}", "raw": payload[:300]}
    if not isinstance(data, dict):
        return {"error": "unexpected JSON response shape", "raw": payload[:300]}
    return data


# Permissive address matcher: local part, "@", dotted domain, alphabetic TLD
# of at least two letters.
_EMAIL_PATTERN = re.compile(r'[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}')


def extract_emails_from_text(text):
    """Return every email-like substring of *text*, in order of appearance.

    Matches are returned exactly as written: case is preserved and repeated
    addresses appear once per occurrence. An input without matches gives an
    empty list.
    """
    return [hit.group(0) for hit in _EMAIL_PATTERN.finditer(text)]


def extract_name_from_subject(subject):
    """Return who accepted, taken from the start of an acceptance subject.

    Expected shape: "NAME has accepted the invitation to your 'Valley Wide...".
    Everything before " has accepted the invitation to your" is returned with
    surrounding whitespace removed; a subject of any other shape gives None.
    """
    hit = re.match(r"^(.+?)\s+has accepted the invitation to your", subject)
    return hit.group(1).strip() if hit else None


# Page caps for the Graph listing loops. Reaching a cap is reported on the
# console so a truncated collection is never mistaken for a complete one.
_ACCEPTANCE_MAX_PAGES = 20
_FOLLOW_UP_MAX_PAGES = 10


def _is_box_address(address):
    """Return True when *address* is at box.com or one of its subdomains."""
    # Compare the domain itself: a substring test on "box.com" would also
    # discard dropbox.com, mailbox.com, xbox.com and similar real domains.
    domain = address.rpartition("@")[2]
    return domain == "box.com" or domain.endswith(".box.com")


def _reportable(addresses, own_address):
    """Return the normalized set of *addresses* minus the owner's and Box's own."""
    kept = set()
    for raw in addresses:
        addr = raw.lower().strip()
        if addr and addr != own_address and not _is_box_address(addr):
            kept.add(addr)
    return kept


def _recipient_addresses(message, fields):
    """Return the raw addresses stored under the recipient *fields* of *message*."""
    found = []
    for field in fields:
        for recipient in message.get(field) or []:
            address = (recipient.get("emailAddress") or {}).get("address")
            if address:
                found.append(address)
    return found


def _body_text(message):
    """Return the body content of a Graph message dict, or "" when absent."""
    return (message.get("body") or {}).get("content") or ""


def _fetch_message(token, message_id):
    """Fetch one message of the investigated mailbox with body and recipients."""
    url = (
        f"{GRAPH_BASE}/users/{USER_ID}/messages/{message_id}"
        f"?$select=id,subject,body,toRecipients,ccRecipients"
    )
    return graph_get(token, url)


def _iter_pages(token, url, max_pages):
    """Yield ``(page_number, messages)`` for a listing, following nextLink.

    Stops with a ``[WARNING]`` line when a response has no ``value`` list or
    when *max_pages* pages were read while more results remain.
    """
    page = 1
    while url:
        print(f"  Fetching page {page}...")
        data = graph_get(token, url)
        if not isinstance(data, dict) or "value" not in data:
            print(f"  [WARNING] Error: {json.dumps(data, indent=2)[:300]}")
            return
        yield page, data["value"]
        url = data.get("@odata.nextLink")
        if url and page >= max_pages:
            print(f"  [WARNING] Stopped after {max_pages} pages; further results were NOT fetched")
            return
        page += 1
        if url:
            time.sleep(0.3)


def _record(victims, addresses, tag, detail=""):
    """Add *addresses* to *victims*, printing each one not seen before."""
    for addr in sorted(addresses):
        if addr not in victims:
            print(f"  [{tag}] {addr}{detail}")
            victims.add(addr)


def _scan_box_notifications(token, keyword, tag, noun, victims, own_address):
    """Collect addresses from Box notifications whose subject matches *keyword*."""
    url = (
        f"{GRAPH_BASE}/users/{USER_ID}/messages"
        f"?$search=%22from%3Anoreply%40box.com%20subject%3A{keyword}%22"
        f"&$top=50"
        f"&$select=id,subject,body,receivedDateTime"
    )
    for page, messages in _iter_pages(token, url, _FOLLOW_UP_MAX_PAGES):
        print(f"  Page {page}: {len(messages)} {noun} emails")
        for message in messages:
            if isinstance(message.get("body"), dict):
                # The listing already selected the body; no second request.
                body = _body_text(message)
            else:
                body = _body_text(_fetch_message(token, message.get("id", "")))
                time.sleep(0.15)
            subject = message.get("subject") or ""
            _record(
                victims,
                _reportable(extract_emails_from_text(body), own_address),
                tag,
                f" (subject: {subject[:60]})",
            )


def main():
    """Collect victim addresses from the mailbox and write them to a JSON file.

    Phases:
        1. List every Box acceptance notification.
        2. Fetch each one and pull addresses from its body, its To recipients
           and its subject; remember the name when no address is found.
        3. Add recipients of Sent Items messages matching "Valley Wide
           Plastering", then addresses found in Box "invited" notifications.
        4. Add addresses found in Box "shared" notifications.

    The mailbox owner's address and box.com addresses are excluded throughout.
    Results are printed and saved to ``output_path``; if the file cannot be
    written the JSON is printed instead and the script exits with status 1.
    """
    print("=" * 70)
    print("VWP BEC Investigation - Box.com Victim Email Extraction")
    print("=" * 70)

    token = get_token()
    jr_email = "j-r@valleywideplastering.com"
    victim_emails = set()

    # ================================================================
    # PHASE 1: Get ALL acceptance notification emails
    # ================================================================
    print("\n[INFO] Phase 1: Fetching ALL Box acceptance emails...")
    url = (
        f"{GRAPH_BASE}/users/{USER_ID}/messages"
        f"?$search=%22from%3Anoreply%40box.com%20subject%3Aaccepted%22"
        f"&$top=50"
        f"&$select=id,subject,bodyPreview,from,receivedDateTime"
    )
    all_acceptance_emails = []
    for page, messages in _iter_pages(token, url, _ACCEPTANCE_MAX_PAGES):
        all_acceptance_emails.extend(messages)
        print(f"  Page {page}: {len(messages)} emails (total: {len(all_acceptance_emails)})")

    print(f"\n[INFO] Total acceptance emails: {len(all_acceptance_emails)}")

    # ================================================================
    # PHASE 2: Extract emails from body + names from subjects
    # ================================================================
    print("\n[INFO] Phase 2: Extracting emails from message bodies...")
    names_without_emails = []

    for index, message in enumerate(all_acceptance_emails, start=1):
        subject = message.get("subject") or ""
        msg_id = message.get("id")
        if not msg_id:
            print(f"  [WARNING] Acceptance message {index} has no id; skipped")
            continue

        full = _fetch_message(token, msg_id)
        if "error" in full:
            print(f"  [WARNING] Could not fetch acceptance message {index}: {json.dumps(full)[:200]}")

        candidates = extract_emails_from_text(_body_text(full))
        candidates += _recipient_addresses(full, ("toRecipients",))

        # The accepting party is sometimes shown as an address in the subject.
        name = extract_name_from_subject(subject)
        if name:
            candidates += extract_emails_from_text(name)

        # One filter for every source, so the owner's own address cannot slip
        # in through the subject.
        found_emails = _reportable(candidates, jr_email)
        if found_emails:
            _record(victim_emails, found_emails, "FOUND")
        elif name and name.lower() != jr_email:
            # We only got the name, not the email
            names_without_emails.append(name)

        if index % 20 == 0:
            print(f"  Processed {index}/{len(all_acceptance_emails)} emails... ({len(victim_emails)} emails found so far)")
        time.sleep(0.15)

    print("\n[INFO] Phase 2 complete:")
    print(f"  Emails found directly: {len(victim_emails)}")
    print(f"  Names without emails: {len(names_without_emails)}")

    unique_names = sorted(set(names_without_emails))
    if unique_names:
        print(f"\n[INFO] Names without email addresses ({len(unique_names)}):")
        for n in unique_names:
            print(f"    {n}")

    # ================================================================
    # PHASE 3: Search Sent Items for original Box invitations
    # ================================================================
    print("\n[INFO] Phase 3: Searching Sent Items for Box invitation emails...")
    # NOTE(review): every recipient of every Sent Items message matching the
    # phrase is added, which is a broad heuristic - confirm these hits by
    # subject before treating them as victims.
    url = (
        f"{GRAPH_BASE}/users/{USER_ID}/mailFolders/sentitems/messages"
        f"?$search=%22Valley%20Wide%20Plastering%22"
        f"&$top=50"
        f"&$select=id,subject,toRecipients,ccRecipients,bccRecipients,bodyPreview,receivedDateTime"
    )
    recipient_fields = ("toRecipients", "ccRecipients", "bccRecipients")
    for page, messages in _iter_pages(token, url, _FOLLOW_UP_MAX_PAGES):
        print(f"  Page {page}: {len(messages)} sent emails mentioning Valley Wide Plastering")
        for message in messages:
            subject = message.get("subject") or ""
            _record(
                victim_emails,
                _reportable(_recipient_addresses(message, recipient_fields), jr_email),
                "NEW from sent",
                f" (subject: {subject[:60]})",
            )

    # Also search for Box invitation emails (sent by Box on behalf of JR)
    print("\n[INFO] Phase 3b: Searching for Box invitation sent notifications...")
    _scan_box_notifications(
        token, "invited", "NEW from invitation", "invitation", victim_emails, jr_email
    )

    # ================================================================
    # PHASE 4: Search for Box "shared" notifications
    # ================================================================
    print("\n[INFO] Phase 4: Searching for Box 'shared' notifications...")
    _scan_box_notifications(
        token, "shared", "NEW from shared", "'shared'", victim_emails, jr_email
    )

    # ================================================================
    # RESULTS
    # ================================================================
    victim_list = sorted(victim_emails)
    print("\n" + "=" * 70)
    print(f"FINAL RESULTS: {len(victim_list)} unique victim email addresses")
    print("=" * 70)
    for addr in victim_list:
        print(f"  {addr}")

    if unique_names:
        print(f"\n[WARNING] {len(unique_names)} victims identified by NAME only (no email extracted):")
        for n in unique_names:
            print(f"  {n}")

    output = {
        "investigation": "Valley Wide Plastering BEC",
        "source": "Box.com notifications in JR mailbox",
        "total_acceptance_emails": len(all_acceptance_emails),
        "unique_victim_emails": len(victim_list),
        "victim_emails": victim_list,
        "names_without_emails": unique_names,
    }
    output_path = r"D:\ClaudeTools\temp\vwp_victim_emails.json"
    try:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(output, f, indent=2)
    except OSError as exc:
        # Do not lose a completed collection over an unwritable path.
        print(f"\n[ERROR] Could not write {output_path}: {exc}")
        print(json.dumps(output, indent=2))
        sys.exit(1)
    print(f"\n[OK] Results saved to {output_path}")


# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()