#!/usr/bin/env python3
"""Find real two-way correspondents missing from Barbara's contacts and extract phone numbers from signatures."""

import html
import json
import os
import re
import subprocess
import time
import urllib.parse
from datetime import datetime

# ── Config ──
INPUT_FILE = r"D:\ClaudeTools\temp\bardach_missing_contacts.json"
OUTPUT_FILE = r"D:\ClaudeTools\temp\bardach_missing_real_contacts.json"

TENANT = "dd4a82e8-85a3-44ac-8800-07945ab4d95f"
CLIENT_ID = "fabb3421-8b34-484b-bc17-e46de9703418"
# SECURITY: the client secret must never live in source control. It is read
# from the GRAPH_CLIENT_SECRET environment variable instead; any secret that
# was previously committed to this file should be treated as compromised and
# rotated in the Azure app registration.
CLIENT_SECRET = os.environ.get("GRAPH_CLIENT_SECRET", "")
USER_EMAIL = "barbara@bardach.net"

TOKEN_URL = f"https://login.microsoftonline.com/{TENANT}/oauth2/v2.0/token"
GRAPH_BASE = f"https://graph.microsoft.com/v1.0/users/{USER_EMAIL}"

# ── Junk filters ──
# Substrings that mark an address as automated / non-personal.
JUNK_KEYWORDS = [
    "noreply", "no-reply", "donotreply", "notification", "alert",
    "mailer-daemon", "postmaster", "unsubscribe", "bounce",
    "support@", "info@", "help@", "service@", "billing@",
    "news@", "newsletter", "marketing", "promo"
]

# Domains (and their subdomains) treated as commercial senders.
COMMERCIAL_DOMAINS = [
    "amazon.com", "google.com", "facebook.com", "apple.com", "microsoft.com",
    "paypal.com", "ebay.com", "nextdoor.com", "linkedin.com", "twitter.com",
    "instagram.com", "fidelity.com", "schwab.com", "vanguard.com",
    "intuit.com", "turbotax.com"
]

# ── Token management ──
_token = None          # cached bearer token, replaced by refresh_token_if_needed()
_api_call_count = 0    # Graph calls made since the cached token was obtained

# Graph error codes that are worth retrying after a pause.
_RETRYABLE_CODES = ("TooManyRequests", "ServiceUnavailable", "GatewayTimeout")


def get_token():
    """Get a fresh OAuth2 token via the client-credentials flow.

    Returns:
        The access token string.

    Raises:
        RuntimeError: if the secret is not configured, curl returns nothing
            parseable, or the response carries no ``access_token``.
    """
    if not CLIENT_SECRET:
        print("[ERROR] GRAPH_CLIENT_SECRET environment variable is not set")
        raise RuntimeError("Failed to get token")

    result = subprocess.run(
        ["curl", "-s", "-X", "POST", TOKEN_URL,
         "-d", f"client_id={CLIENT_ID}",
         # URL-encode the secret: it may contain '+', '&' or '=' characters
         # that a plain -d would send unescaped.
         "--data-urlencode", f"client_secret={CLIENT_SECRET}",
         "-d", "scope=https://graph.microsoft.com/.default",
         "-d", "grant_type=client_credentials"],
        capture_output=True, text=True
    )
    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError as err:
        # Empty or non-JSON output means curl itself failed (DNS, TLS, ...).
        print("[ERROR] Token request failed: no JSON in response")
        raise RuntimeError("Failed to get token") from err

    if not isinstance(data, dict) or "access_token" not in data:
        print(f"[ERROR] Token request failed: {data}")
        raise RuntimeError("Failed to get token")
    return data["access_token"]


def refresh_token_if_needed():
    """Return the cached token, fetching a new one every 30 API calls."""
    global _token, _api_call_count
    if _token is None or _api_call_count >= 30:
        _token = get_token()
        _api_call_count = 0
        print(f" [Token refreshed]")
    return _token


def _curl_json(curl_args, retries):
    """Run an authenticated curl request and return the parsed JSON reply.

    Shared retry loop for graph_get() and graph_search().

    Args:
        curl_args: curl arguments describing the request (URL, query data);
            the auth and content headers are appended here.
        retries: maximum number of attempts.

    Returns:
        The decoded JSON document, or None when every attempt failed or the
        API reported a non-retryable error.
    """
    global _token, _api_call_count
    token = refresh_token_if_needed()
    _api_call_count += 1

    for attempt in range(retries):
        is_last = attempt >= retries - 1
        result = subprocess.run(
            ["curl", "-s", *curl_args,
             "-H", f"Authorization: Bearer {token}",
             "-H", "Content-Type: application/json",
             "-H", "ConsistencyLevel: eventual"],
            capture_output=True, text=True
        )

        try:
            data = json.loads(result.stdout) if result.stdout else None
        except json.JSONDecodeError:
            data = None
        if data is None:
            # Empty or malformed reply: brief pause, then try again.
            if is_last:
                return None
            time.sleep(2)
            continue

        if not isinstance(data, dict) or "error" not in data:
            return data

        error = data["error"]
        code = error.get("code", "") if isinstance(error, dict) else ""
        if code in _RETRYABLE_CODES or "429" in str(code):
            if is_last:
                return None
            wait = 5 * (attempt + 1)
            print(f" [Throttled, waiting {wait}s...]")
            time.sleep(wait)
            # Store the new token globally so the reset counter really does
            # describe the token later calls will use.
            token = _token = get_token()
            _api_call_count = 0
            continue
        return None
    return None


def graph_get(url, retries=3):
    """Make a GET request to the Graph API with curl.

    Args:
        url: fully formed (already encoded) request URL.
        retries: maximum number of attempts.

    Returns:
        The parsed JSON response, or None on failure.
    """
    return _curl_json(["--url", url], retries)


def graph_search(email, top=3):
    """Search messages from a specific email using $search (which works, unlike $filter on from).

    Uses ``curl -G`` with ``--data-urlencode`` so the query parameters are
    encoded properly.

    Args:
        email: sender address to search for.
        top: maximum number of messages to request.

    Returns:
        The parsed JSON response, or None on failure.
    """
    return _curl_json(
        ["-G", f"{GRAPH_BASE}/messages",
         "--data-urlencode", f"$search=\"from:{email}\"",
         "--data-urlencode", "$select=subject,from,body",
         "--data-urlencode", f"$top={top}"],
        3,
    )


# ── Phone extraction ──
# 10-digit North American number with an optional leading "1"/"+1".
# The digit guards on both ends stop the pattern from matching a slice of a
# longer digit run (order numbers, ZIP+4 codes, ...).
PHONE_RE = re.compile(
    r'(?<!\d)(?:\+?1[\s.\-]?)?\(?\d{3}[\)\s.\-]?\s?\d{3}[\s.\-]?\d{4}(?!\d)'
)
LABELED_PHONE_RE = re.compile(
    r'(?:Tel|Phone|Cell|Mobile|Office|Direct|Fax)[:\s]*\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}',
    re.IGNORECASE
)
LABEL_RE = re.compile(r'(Tel|Phone|Cell|Mobile|Office|Direct|Fax)', re.IGNORECASE)

SIGNATURE_MARKERS = [
    '--', '---', '____', '====',
    'Best regards', 'Kind regards', 'Regards', 'Sincerely',
    'Thank you', 'Thanks', 'Sent from', 'Get Outlook',
    'Best,', 'Cheers', 'Warm regards', 'All the best'
]

# Markers that indicate the start of a quoted/forwarded reply (stop searching past these)
REPLY_MARKERS = [
    'From:', 'Sent:', '-----Original Message', '________________________________',
    'On ', '> On ', 'Begin forwarded message', 'wrote:'
]

_BR_TAG_RE = re.compile(r'<br\s*/?>', re.IGNORECASE)
_BLOCK_CLOSE_TAG_RE = re.compile(r'</(?:p|div|tr|li|h[1-6])[^>]*>', re.IGNORECASE)
_ANY_TAG_RE = re.compile(r'<[^>]+>')
_BLANK_RUN_RE = re.compile(r'\n{3,}')

_QUOTED_FROM_RE = re.compile(r'^From:\s+.+')
_ON_WROTE_RE = re.compile(r'^On .+wrote:\s*$')


def strip_html(text):
    """Remove HTML tags and decode entities.

    Line breaks and closing block-level tags become newlines so that the
    visual line structure of the message survives; every other tag is dropped.

    Args:
        text: HTML source (None or empty yields an empty string).

    Returns:
        Plain text with entities decoded and runs of blank lines collapsed.
    """
    if not text:
        return ""
    text = text.replace('\r\n', '\n')
    text = _BR_TAG_RE.sub('\n', text)
    text = _BLOCK_CLOSE_TAG_RE.sub('\n', text)
    text = _ANY_TAG_RE.sub('', text)
    text = html.unescape(text)
    # Collapse multiple blank lines
    text = _BLANK_RUN_RE.sub('\n\n', text)
    return text


def extract_first_message_body(body_html):
    """Extract just the first (most recent) message from a thread, cutting off quoted replies.

    Args:
        body_html: HTML body of the message.

    Returns:
        The plain-text lines that precede the first reply marker, joined
        with newlines (the whole text when no marker is found).
    """
    lines = strip_html(body_html).split('\n')

    # Reply markers are only honoured from line 5 onward (skip subject/header area).
    cutoff = len(lines)
    for i in range(5, len(lines)):
        line = lines[i].strip()
        is_reply_start = (
            # "From: Name <email>" pattern indicating a quoted message; only
            # trusted past line 10 so a header near the top is not mistaken.
            (i > 10 and _QUOTED_FROM_RE.match(line))
            # "On <date>, <person> wrote:" pattern
            or _ON_WROTE_RE.match(line)
            or '-----Original Message' in line
            or line.startswith('________________________________')
        )
        if is_reply_start:
            cutoff = i
            break

    return '\n'.join(lines[:cutoff])
def _find_phone(text):
    """Return ``(phone, label)`` for the best phone number in *text*.

    Labeled numbers win over unlabeled ones, and a fax number is only
    returned when the text holds no other number. ``(None, None)`` means the
    text contains no phone number at all.
    """
    fax_numbers = set()
    fax_hit = None

    for labeled in LABELED_PHONE_RE.finditer(text):
        snippet = labeled.group(0)
        number = PHONE_RE.search(snippet)
        if not number:
            continue
        label_match = LABEL_RE.search(snippet)
        label = label_match.group(1).capitalize() if label_match else None
        phone = normalize_phone(number.group(0))
        if label == "Fax":
            # Remember it, but keep looking for a voice number first.
            fax_numbers.add(phone)
            if fax_hit is None:
                fax_hit = (phone, label)
            continue
        return phone, label

    for number in PHONE_RE.finditer(text):
        phone = normalize_phone(number.group(0))
        if phone not in fax_numbers:
            return phone, None

    return fax_hit if fax_hit is not None else (None, None)


def extract_phone_from_body(body_html, sender_email):
    """Extract phone number from email signature area of the FIRST message only.

    Args:
        body_html: HTML body of the message.
        sender_email: address of the sender (currently unused; kept for
            interface compatibility).

    Returns:
        ``(phone, label)`` where *phone* is normalized and *label* is the
        capitalized label found next to it (or None); ``(None, None)`` when
        nothing was found.
    """
    if not body_html:
        return None, None

    # Get just the first message (not quoted replies) to avoid picking up OTHER people's numbers
    first_msg = extract_first_message_body(body_html)
    lines = first_msg.split('\n')

    # Find signature start - search from bottom up (last 39 lines) for signature markers
    sig_start = None
    for i in range(len(lines) - 1, max(len(lines) - 40, -1), -1):
        lowered = lines[i].strip().lower()
        if any(marker.lower() in lowered for marker in SIGNATURE_MARKERS):
            sig_start = i
            break

    # If no signature marker found, use last 25 lines of first message
    if sig_start is None:
        sig_start = max(0, len(lines) - 25)
    sig_text = '\n'.join(lines[sig_start:])

    # Signature first, then the whole first message. A fax number is held
    # back so that a voice number found in the wider search can win.
    fallback = (None, None)
    for region in (sig_text, first_msg):
        phone, label = _find_phone(region)
        if phone and label != "Fax":
            return phone, label
        if phone and fallback[0] is None:
            fallback = (phone, label)
    return fallback


def normalize_phone(raw):
    """Normalize phone to (xxx) xxx-xxxx format.

    A leading country code "1" is dropped. Input that does not reduce to
    exactly ten digits is returned stripped but otherwise unchanged.
    """
    digits = re.sub(r'\D', '', raw)
    if len(digits) == 11 and digits[0] == '1':
        digits = digits[1:]
    if len(digits) == 10:
        return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
    return raw.strip()


def _is_junk(email):
    """Return True for automated senders and addresses on commercial domains."""
    email_lower = email.lower()
    if any(kw in email_lower for kw in JUNK_KEYWORDS):
        return True
    domain = email_lower.split('@')[-1] if '@' in email_lower else ''
    return any(domain == cd or domain.endswith('.' + cd) for cd in COMMERCIAL_DOMAINS)


def _result_row(contact, phone=None, phone_label=None):
    """Build one output record for *contact*."""
    return {
        "email": contact["email"],
        "display_name": contact["display_name"],
        "sent_count": contact["sent_count"],
        "received_count": contact["received_count"],
        "total": contact["total"],
        "phone": phone,
        "phone_label": phone_label
    }


def _lookup_phone(email):
    """Search recent mail from *email* and return ``(phone, label)`` or ``(None, None)``."""
    # Search for 3 most recent emails FROM this address using $search
    resp = graph_search(email, top=3)
    if not resp or "value" not in resp:
        return None, None

    for msg in resp["value"]:
        # Verify this message is actually FROM the target email. Graph may
        # return explicit nulls, so every level is guarded.
        sender = msg.get("from") or {}
        address = (sender.get("emailAddress") or {}).get("address") or ""
        if address.lower() != email.lower():
            continue
        body_content = (msg.get("body") or {}).get("content") or ""
        phone, phone_label = extract_phone_from_body(body_content, email)
        if phone:
            return phone, phone_label
    return None, None


# ── Main ──
def main(max_lookups=60):
    """Filter the missing-contact list, look up phone numbers and write the report.

    Args:
        max_lookups: number of top correspondents (by total exchanges) for
            which a phone lookup is attempted.
    """
    print("=" * 80)
    print(" Bardach Missing Real Contacts - Phone Number Finder")
    print("=" * 80)

    # 1. Load input
    with open(INPUT_FILE, encoding='utf-8') as f:
        data = json.load(f)
    missing = data["missing"]
    print(f"\n[INFO] Total missing contacts loaded: {len(missing)}")

    # 2. Filter sent_count > 0
    two_way = [c for c in missing if c["sent_count"] > 0]
    print(f"[INFO] Two-way correspondents (sent_count > 0): {len(two_way)}")

    # 3. Filter junk
    real = [c for c in two_way if not _is_junk(c["email"])]
    print(f"[INFO] After junk filter: {len(real)}")

    # 4. Sort by total descending
    real.sort(key=lambda c: c["total"], reverse=True)
    print(f"\n[SUCCESS] {len(real)} real two-way correspondents are missing from contacts\n")

    # 5. Phone lookup for the top contacts
    top_n = max(0, min(max_lookups, len(real)))
    print(f"[INFO] Searching for phone numbers in top {top_n} contacts...")
    print("-" * 80)

    results = []
    phones_found = 0
    for idx, contact in enumerate(real[:top_n]):
        email = contact["email"]
        name = contact["display_name"] or email.split('@')[0]
        print(f" [{idx+1:2d}/{top_n}] {name[:35]:35s} <{email[:40]}>", end="", flush=True)

        phone, phone_label = _lookup_phone(email)
        if phone:
            phones_found += 1
            label_str = f" ({phone_label})" if phone_label else ""
            print(f" -> {phone}{label_str}")
        else:
            print(" -> --")

        results.append(_result_row(contact, phone, phone_label))

    # 6. Add remaining contacts (beyond the lookup limit) without phone lookup
    results.extend(_result_row(contact) for contact in real[top_n:])

    # 7. Save output
    output = {
        "generated": datetime.now().isoformat(),
        "total_two_way": len(real),
        "with_phone": phones_found,
        "without_phone": len(real) - phones_found,
        "contacts": results
    }
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\n[SUCCESS] Saved to {OUTPUT_FILE}")

    # 8. Print table
    print(f"\n{'='*110}")
    print(f" MISSING REAL CONTACTS - TOP {top_n} (sorted by total exchanges)")
    print(f"{'='*110}")
    print(f" {'#':>3} {'Name':<30} {'Email':<40} {'Total':>6} {'Phone':<25}")
    print(f" {'-'*3} {'-'*30} {'-'*40} {'-'*6} {'-'*25}")
    for i, c in enumerate(results[:top_n]):
        name = (c["display_name"] or c["email"].split('@')[0])[:30]
        email_short = c["email"][:40]
        phone_str = c["phone"] or "--"
        if c["phone_label"]:
            phone_str = f"{c['phone']} ({c['phone_label']})"
        print(f" {i+1:3d} {name:<30} {email_short:<40} {c['total']:6d} {phone_str}")

    print(f"\n{'='*110}")
    print(f" SUMMARY")
    print(f"{'='*110}")
    print(f" Total two-way correspondents missing: {len(real)}")
    print(f" Phone numbers found (top {top_n}): {phones_found}")
    print(f" Without phone (top {top_n}): {top_n - phones_found}")
    print(f"{'='*110}")


if __name__ == "__main__":
    main()