# claudetools/temp/bardach_missing_real_contacts.py

#!/usr/bin/env python3
"""Find real two-way correspondents missing from Barbara's contacts and extract phone numbers from signatures."""

import html
import json
import os
import re
import subprocess
import time
import urllib.parse
from datetime import datetime

# ── Config ──
# Input produced by an earlier step and the file this script writes.
INPUT_FILE = r"D:\ClaudeTools\temp\bardach_missing_contacts.json"
OUTPUT_FILE = r"D:\ClaudeTools\temp\bardach_missing_real_contacts.json"

# Identifiers used for the OAuth2 client-credentials request.
TENANT = "dd4a82e8-85a3-44ac-8800-07945ab4d95f"
CLIENT_ID = "fabb3421-8b34-484b-bc17-e46de9703418"
# SECURITY: a client secret stored in source should be treated as exposed.
# Prefer supplying it through the BARDACH_GRAPH_CLIENT_SECRET environment
# variable. The literal fallback is kept only so existing runs keep working;
# rotate the credential and delete the fallback.
CLIENT_SECRET = (
    os.environ.get("BARDACH_GRAPH_CLIENT_SECRET")
    or "~QJ8Q~NyQSs4OcGqHZyPrA2CVnq9KBfKiimntbMO"
)
# Mailbox whose messages are searched.
USER_EMAIL = "barbara@bardach.net"

TOKEN_URL = f"https://login.microsoftonline.com/{TENANT}/oauth2/v2.0/token"
GRAPH_BASE = f"https://graph.microsoft.com/v1.0/users/{USER_EMAIL}"

# ── Junk filters ──
# Substrings that mark an address as automated or bulk mail. They are matched
# against the whole lower-cased address, so entries ending in "@" only hit the
# local part.
JUNK_KEYWORDS = [
    # automated senders
    "noreply",
    "no-reply",
    "donotreply",
    "notification",
    "alert",
    # delivery / list plumbing
    "mailer-daemon",
    "postmaster",
    "unsubscribe",
    "bounce",
    # role mailboxes
    "support@",
    "info@",
    "help@",
    "service@",
    "billing@",
    # bulk mail
    "news@",
    "newsletter",
    "marketing",
    "promo",
]

# Domains (and their subdomains) treated as commercial senders rather than
# personal correspondents.
COMMERCIAL_DOMAINS = [
    # retail / tech platforms
    "amazon.com",
    "google.com",
    "facebook.com",
    "apple.com",
    "microsoft.com",
    "paypal.com",
    "ebay.com",
    # social networks
    "nextdoor.com",
    "linkedin.com",
    "twitter.com",
    "instagram.com",
    # financial services
    "fidelity.com",
    "schwab.com",
    "vanguard.com",
    "intuit.com",
    "turbotax.com",
]

# ── Token management ──
# Cached bearer token shared by the Graph helpers; None until one is fetched.
_token = None
# Count of Graph helper calls made since the counter was last reset; the
# helpers increment it and refresh_token_if_needed() compares it to its limit.
_api_call_count = 0

def get_token():
    """Request a new OAuth2 access token using the client-credentials grant.

    The form body is URL-encoded here and piped to curl on stdin
    (``--data @-``). That keeps the client secret out of the process argument
    list and stops reserved characters in any field from corrupting the
    request.

    Returns:
        The ``access_token`` string from the token endpoint's JSON response.

    Raises:
        RuntimeError: If curl's output is not valid JSON, or the response
            does not contain an ``access_token``.
    """
    form = urllib.parse.urlencode({
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "scope": "https://graph.microsoft.com/.default",
        "grant_type": "client_credentials",
    })
    result = subprocess.run(
        ["curl", "-s", "-X", "POST", TOKEN_URL, "--data", "@-"],
        input=form, capture_output=True, text=True
    )
    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError as err:
        # Empty or non-JSON output usually means curl itself failed.
        raise RuntimeError(
            f"Failed to get token: curl exit code {result.returncode}, "
            "response was not valid JSON"
        ) from err
    if not isinstance(data, dict) or "access_token" not in data:
        print(f"[ERROR] Token request failed: {data}")
        raise RuntimeError("Failed to get token")
    return data["access_token"]

def refresh_token_if_needed():
    """Return the cached token, fetching a new one first when it is stale.

    The token is considered stale when none has been fetched yet or when the
    call counter has reached 30. Fetching resets the counter to zero.
    """
    global _token, _api_call_count
    still_usable = _token is not None and _api_call_count < 30
    if not still_usable:
        _token = get_token()
        _api_call_count = 0
        print("  [Token refreshed]")
    return _token

# Graph error codes treated as transient and therefore retried.
_TRANSIENT_GRAPH_CODES = ("TooManyRequests", "ServiceUnavailable", "GatewayTimeout")


def _graph_request(curl_args, retries=3):
    """Run an authenticated curl request and return the decoded JSON object.

    Args:
        curl_args: curl arguments that select the target (URL and any query
            options). The auth and content headers are appended here.
        retries: Maximum number of attempts.

    Returns:
        The parsed response dict, or None when every attempt produced empty
        or non-JSON output, a non-retryable error, or throttling until the
        attempts ran out.
    """
    global _token, _api_call_count
    token = refresh_token_if_needed()
    _api_call_count += 1

    for attempt in range(retries):
        result = subprocess.run(
            ["curl", "-s", *curl_args,
             "-H", f"Authorization: Bearer {token}",
             "-H", "Content-Type: application/json",
             "-H", "ConsistencyLevel: eventual"],
            # Decode as UTF-8 explicitly instead of relying on the locale
            # code page, which can fail on non-ASCII message text.
            capture_output=True, encoding="utf-8", errors="replace"
        )

        data = None
        if result.stdout:
            try:
                data = json.loads(result.stdout)
            except json.JSONDecodeError:
                data = None

        if not isinstance(data, dict):
            # Empty or unparseable output: pause briefly and try again.
            if attempt < retries - 1:
                time.sleep(2)
                continue
            return None

        if "error" not in data:
            return data

        error = data["error"]
        code = error.get("code", "") if isinstance(error, dict) else ""
        if code in _TRANSIENT_GRAPH_CODES or "429" in str(code):
            wait = 5 * (attempt + 1)
            print(f"    [Throttled, waiting {wait}s...]")
            time.sleep(wait)
            # Store the new token in the module cache as well, so the reset
            # counter describes the token that later calls will use.
            _token = token = get_token()
            _api_call_count = 0
            continue

        # Report non-retryable errors instead of failing silently.
        print(f"    [Graph error: {code or 'unknown'}]")
        return None

    return None


def graph_get(url, retries=3):
    """GET a complete Graph URL with curl and return the decoded JSON.

    The URL is handed to curl unchanged, so any query string must already be
    encoded by the caller.

    Args:
        url: Full request URL.
        retries: Maximum number of attempts.

    Returns:
        The parsed response dict, or None on failure.
    """
    return _graph_request(["--url", url], retries=retries)


def graph_search(email, top=3):
    """Search the mailbox for messages from *email* using ``$search``.

    Query options are passed through ``curl -G --data-urlencode`` so curl
    performs the URL encoding. The original author notes that ``$search``
    worked for this where ``$filter`` on ``from`` did not.

    Args:
        email: Sender address to search for.
        top: Maximum number of messages to request.

    Returns:
        The parsed response dict, or None on failure.
    """
    base_url = f"{GRAPH_BASE}/messages"
    return _graph_request(
        ["-G", base_url,
         "--data-urlencode", f"$search=\"from:{email}\"",
         "--data-urlencode", "$select=subject,from,body",
         "--data-urlencode", f"$top={top}"],
        retries=3,
    )

# ── Phone extraction ──
# Optional North American country code ("1" or "+1") with one separator.
_COUNTRY_PREFIX = r'(?:\+?1[\s.\-]?)?'

# A 10-digit number with common separators, optionally preceded by a country
# code. The digit lookarounds stop the pattern from matching a slice of a
# longer digit run (order numbers, 11-digit numbers written without
# separators), which would otherwise yield a wrong number.
PHONE_RE = re.compile(
    r'(?<!\d)' + _COUNTRY_PREFIX
    + r'\(?\d{3}\)?[\s.\-]?\s?\d{3}[\s.\-]?\d{4}(?!\d)'
)
# The same number preceded by a label such as "Tel:" or "Cell".
LABELED_PHONE_RE = re.compile(
    r'(?:Tel|Phone|Cell|Mobile|Office|Direct|Fax)[:\s]*' + _COUNTRY_PREFIX
    + r'\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}(?!\d)',
    re.IGNORECASE
)
# Captures just the label word from a labeled match.
LABEL_RE = re.compile(r'(Tel|Phone|Cell|Mobile|Office|Direct|Fax)', re.IGNORECASE)
# Substrings whose presence on a line is taken as the start of a signature.
SIGNATURE_MARKERS = [
    '--', '---', '____', '====', 'Best regards', 'Kind regards', 'Regards',
    'Sincerely', 'Thank you', 'Thanks', 'Sent from', 'Get Outlook',
    'Best,', 'Cheers', 'Warm regards', 'All the best'
]

# Markers that indicate the start of a quoted/forwarded reply (stop searching past these)
# NOTE(review): not referenced by the code visible in this file; confirm it is
# still needed before removing.
REPLY_MARKERS = [
    'From:', 'Sent:', '-----Original Message', '________________________________',
    'On ', '> On ', 'Begin forwarded message', 'wrote:'
]

def strip_html(text):
    """Convert an HTML fragment to plain text.

    Contents of ``<style>`` and ``<script>`` elements and HTML comments are
    removed entirely, since they are not visible message text. ``<br>`` and
    common block-level tags become newlines, remaining tags are dropped,
    entities are decoded, and runs of three or more newlines collapse to two.

    Args:
        text: HTML (or plain text) to clean. ``None`` and ``""`` are accepted.

    Returns:
        The plain-text rendering, or ``""`` for empty input.
    """
    if not text:
        return ''
    # Normalise line endings so the blank-line collapse below sees only "\n".
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Drop non-visible content before stripping tags; otherwise CSS, script
    # source and comment bodies would survive as text.
    text = re.sub(r'<(style|script)\b[^>]*>.*?</\1\s*>', '', text,
                  flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</?(?:p|div|tr|td|li|blockquote|table|tbody|thead|th|hr)[^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<[^>]+>', '', text)
    text = html.unescape(text)
    # Collapse multiple blank lines
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text

def extract_first_message_body(body_html):
    """Return the plain text of a message up to the first quoted reply.

    The body is converted with strip_html() and scanned line by line,
    starting at line index 5. The text is cut at the first line that looks
    like the header of a quoted or forwarded message; if no such line is
    found the whole text is returned.
    """
    plain_lines = strip_html(body_html).split('\n')

    def opens_quoted_section(index, stripped):
        """Tell whether the stripped line at *index* begins a quoted message."""
        # A "From: ..." header only counts beyond line index 10.
        if index > 10 and re.match(r'^From:\s+.+', stripped):
            return True
        # "On <date>, <name> wrote:" on a single line.
        if re.match(r'^On .+wrote:\s*$', stripped):
            return True
        return (
            '-----Original Message' in stripped
            or stripped.startswith('________________________________')
        )

    end = next(
        (
            index
            for index, raw in enumerate(plain_lines[5:], start=5)
            if opens_quoted_section(index, raw.strip())
        ),
        len(plain_lines),
    )
    return '\n'.join(plain_lines[:end])

def extract_phone_from_body(body_html, sender_email):
    """Find a phone number in the newest message of an email body.

    Only the text before any quoted reply is examined. The signature area is
    searched first, then the whole of that first message; within each, a
    labeled number ("Tel:", "Cell:", ...) is preferred over an unlabeled one.

    Args:
        body_html: Message body as returned by the mail API.
        sender_email: Accepted for the caller's convenience; not used here.

    Returns:
        ``(phone, label)`` where ``phone`` is normalized by normalize_phone()
        and ``label`` is the capitalized label word or None. ``(None, None)``
        when the body is empty or no number is found.
    """
    if not body_html:
        return None, None

    message_text = extract_first_message_body(body_html)
    msg_lines = message_text.split('\n')
    total = len(msg_lines)

    def search_for_phone(text):
        """Return (phone, label) for the best match in *text*, else None."""
        labeled_hit = LABELED_PHONE_RE.search(text)
        if labeled_hit:
            snippet = labeled_hit.group(0)
            label_hit = LABEL_RE.search(snippet)
            number_hit = PHONE_RE.search(snippet)
            if number_hit:
                label = label_hit.group(1).capitalize() if label_hit else None
                return normalize_phone(number_hit.group(0)), label
        plain_hit = PHONE_RE.search(text)
        if plain_hit:
            return normalize_phone(plain_hit.group(0)), None
        return None

    # Walk upward through at most the last 40 lines looking for the lowest
    # line that contains a signature marker.
    lowered_markers = [marker.lower() for marker in SIGNATURE_MARKERS]
    sig_start = next(
        (
            i
            for i in range(total - 1, max(total - 40, -1), -1)
            if any(marker in msg_lines[i].strip().lower() for marker in lowered_markers)
        ),
        None,
    )
    # Without a marker, treat the last 25 lines as the signature area.
    if sig_start is None:
        sig_start = max(0, total - 25)

    signature_text = '\n'.join(msg_lines[sig_start:])
    for candidate in (signature_text, message_text):
        found = search_for_phone(candidate)
        if found is not None:
            return found

    return None, None

def normalize_phone(raw):
    """Format a phone string as ``(xxx) xxx-xxxx`` when it has ten digits.

    A leading ``1`` on an 11-digit number is dropped first. Anything that
    does not come out at exactly ten digits is returned as the input with
    surrounding whitespace stripped.
    """
    only_digits = ''.join(re.findall(r'\d', raw))
    if len(only_digits) == 11 and only_digits.startswith('1'):
        only_digits = only_digits[1:]
    if len(only_digits) != 10:
        return raw.strip()
    area, exchange, subscriber = only_digits[:3], only_digits[3:6], only_digits[6:]
    return f"({area}) {exchange}-{subscriber}"

# ── Main ──
def _is_junk_address(email):
    """Return True if *email* matches a junk keyword or a commercial domain."""
    email_lower = email.lower()
    if any(kw in email_lower for kw in JUNK_KEYWORDS):
        return True
    domain = email_lower.split('@')[-1] if '@' in email_lower else ''
    return any(domain == cd or domain.endswith('.' + cd) for cd in COMMERCIAL_DOMAINS)


def _lookup_phone(email):
    """Search up to 3 messages from *email* and return ``(phone, label)``.

    Returns ``(None, None)`` when the search fails or no message from that
    exact address yields a number.
    """
    resp = graph_search(email, top=3)
    if not resp or "value" not in resp:
        return None, None

    target = email.lower()
    for msg in resp["value"] or []:
        # Each level may be missing or null in the response, so fall back to
        # an empty value instead of chaining .get() on None.
        sender = (msg.get("from") or {}).get("emailAddress") or {}
        # $search can return messages from other senders; keep exact matches.
        if (sender.get("address") or "").lower() != target:
            continue
        body_content = (msg.get("body") or {}).get("content") or ""
        phone, phone_label = extract_phone_from_body(body_content, email)
        if phone:
            return phone, phone_label
    return None, None


def _result_row(contact, phone=None, phone_label=None):
    """Build one output record from an input contact plus lookup results."""
    return {
        "email": contact["email"],
        "display_name": contact["display_name"],
        "sent_count": contact["sent_count"],
        "received_count": contact["received_count"],
        "total": contact["total"],
        "phone": phone,
        "phone_label": phone_label
    }


def main(max_lookups=60):
    """Filter the missing-contacts list, look up phone numbers, write results.

    Reads INPUT_FILE, keeps entries with ``sent_count > 0`` that pass the
    junk filter, sorts them by ``total`` descending, searches the mailbox for
    a phone number for the first *max_lookups* of them, writes everything to
    OUTPUT_FILE as JSON, and prints a progress log, a table and a summary.

    Args:
        max_lookups: Maximum number of contacts to run a phone lookup for.
            The rest are written with ``phone`` set to None.
    """
    print("=" * 80)
    print("  Bardach Missing Real Contacts - Phone Number Finder")
    print("=" * 80)

    # 1. Load input
    with open(INPUT_FILE, encoding='utf-8') as f:
        data = json.load(f)

    missing = data["missing"]
    print(f"\n[INFO] Total missing contacts loaded: {len(missing)}")

    # 2. Filter sent_count > 0
    two_way = [c for c in missing if c["sent_count"] > 0]
    print(f"[INFO] Two-way correspondents (sent_count > 0): {len(two_way)}")

    # 3. Filter junk
    real = [c for c in two_way if not _is_junk_address(c["email"])]
    print(f"[INFO] After junk filter: {len(real)}")

    # 4. Sort by total descending
    real.sort(key=lambda c: c["total"], reverse=True)

    print(f"\n[SUCCESS] {len(real)} real two-way correspondents are missing from contacts\n")

    # 5. Phone lookup for the first max_lookups contacts
    top_n = max(0, min(max_lookups, len(real)))
    print(f"[INFO] Searching for phone numbers in top {top_n} contacts...")
    print("-" * 80)

    results = []
    phones_found = 0

    for position, contact in enumerate(real[:top_n], start=1):
        email = contact["email"]
        name = contact["display_name"] or email.split('@')[0]
        print(f"  [{position:2d}/{top_n}] {name[:35]:35s} <{email[:40]}>", end="", flush=True)

        phone, phone_label = _lookup_phone(email)

        if phone:
            phones_found += 1
            label_str = f" ({phone_label})" if phone_label else ""
            print(f"  -> {phone}{label_str}")
        else:
            print("  -> --")

        results.append(_result_row(contact, phone, phone_label))

    # 6. Add the remaining contacts without a phone lookup
    results.extend(_result_row(contact) for contact in real[top_n:])

    # 7. Save output
    output = {
        "generated": datetime.now().isoformat(),
        "total_two_way": len(real),
        "with_phone": phones_found,
        "without_phone": len(real) - phones_found,
        "contacts": results
    }

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"\n[SUCCESS] Saved to {OUTPUT_FILE}")

    # 8. Print table
    print(f"\n{'='*110}")
    print(f"  MISSING REAL CONTACTS - TOP {top_n} (sorted by total exchanges)")
    print(f"{'='*110}")
    print(f"  {'#':>3}  {'Name':<30} {'Email':<40} {'Total':>6}  {'Phone':<25}")
    print(f"  {'-'*3}  {'-'*30} {'-'*40} {'-'*6}  {'-'*25}")

    for row_number, c in enumerate(results[:top_n], start=1):
        name = (c["display_name"] or c["email"].split('@')[0])[:30]
        email_short = c["email"][:40]
        phone_str = c["phone"] or "--"
        if c["phone_label"]:
            phone_str = f"{c['phone']} ({c['phone_label']})"
        print(f"  {row_number:3d}  {name:<30} {email_short:<40} {c['total']:6d}  {phone_str}")

    print(f"\n{'='*110}")
    print("  SUMMARY")
    print(f"{'='*110}")
    print(f"  Total two-way correspondents missing: {len(real)}")
    print(f"  Phone numbers found (top {top_n}):      {phones_found}")
    print(f"  Without phone (top {top_n}):             {top_n - phones_found}")
    print(f"{'='*110}")


if __name__ == "__main__":
    main()