Files
claudetools/temp/bardach_missing_real_contacts.py
Mike Swanson fa15b03180 sync: Auto-sync from ACG-M-L5090 at 2026-03-10 19:11:00
Synced files:
- Quote wizard frontend (all components, hooks, types, config)
- API updates (config, models, routers, schemas, services)
- Client work (bg-builders, gurushow)
- Scripts (BGB Lesley termination, CIPP, Datto, migration)
- Temp files (Bardach contacts, VWP investigation, misc)
- Credentials and session logs
- Email service, PHP API, session logs

Machine: ACG-M-L5090
Timestamp: 2026-03-10 19:11:00

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 19:59:08 -07:00

415 lines
14 KiB
Python

#!/usr/bin/env python3
"""Find real two-way correspondents missing from Barbara's contacts and extract phone numbers from signatures."""
import html
import json
import os
import re
import subprocess
import time
import urllib.parse
from datetime import datetime
# ── Config ──
# Input produced by an earlier step; output written by main().
INPUT_FILE = r"D:\ClaudeTools\temp\bardach_missing_contacts.json"
OUTPUT_FILE = r"D:\ClaudeTools\temp\bardach_missing_real_contacts.json"
TENANT = "dd4a82e8-85a3-44ac-8800-07945ab4d95f"
CLIENT_ID = "fabb3421-8b34-484b-bc17-e46de9703418"
# SECURITY: the client secret must not be committed to source control. It is
# read from the environment instead; any secret that was previously stored in
# this file should be treated as leaked and rotated in the app registration.
# An unset variable yields "" so the module still imports; the token request
# will then fail with an explicit error.
CLIENT_SECRET = os.environ.get("BARDACH_GRAPH_CLIENT_SECRET", "")
USER_EMAIL = "barbara@bardach.net"
TOKEN_URL = f"https://login.microsoftonline.com/{TENANT}/oauth2/v2.0/token"
GRAPH_BASE = f"https://graph.microsoft.com/v1.0/users/{USER_EMAIL}"
# ── Junk filters ──
# Substrings that, when found anywhere in an address, mark it as junk.
JUNK_KEYWORDS = [
    # automated senders
    "noreply", "no-reply", "donotreply", "notification", "alert",
    # mail-system and list plumbing
    "mailer-daemon", "postmaster", "unsubscribe", "bounce",
    # role mailboxes
    "support@", "info@", "help@", "service@", "billing@",
    # bulk mail
    "news@", "newsletter", "marketing", "promo",
]

# Domains (and their subdomains) whose senders are treated as junk.
COMMERCIAL_DOMAINS = [
    "amazon.com", "google.com", "facebook.com", "apple.com", "microsoft.com",
    "paypal.com", "ebay.com", "nextdoor.com", "linkedin.com", "twitter.com",
    "instagram.com", "fidelity.com", "schwab.com", "vanguard.com",
    "intuit.com", "turbotax.com",
]
# ── Token management ──
# Module-level state shared by refresh_token_if_needed(), graph_get() and
# graph_search().
# Cached access token; None until the first refresh.
_token = None
# Number of Graph calls made since the token was last fetched.
_api_call_count = 0
def get_token():
    """Request a fresh OAuth2 access token (client-credentials grant) via curl.

    Returns:
        The ``access_token`` string from the token endpoint's JSON response.

    Raises:
        RuntimeError: if no client secret is configured, if curl returns
            nothing parseable as JSON, or if the response carries no
            ``access_token``.
    """
    if not CLIENT_SECRET:
        raise RuntimeError("Failed to get token: CLIENT_SECRET is empty")

    # Percent-encode the form ourselves and feed it to curl on stdin
    # ("--data @-") so that (a) reserved characters in the secret cannot
    # corrupt the body and (b) the secret never appears in the process argv.
    form_body = urllib.parse.urlencode({
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "scope": "https://graph.microsoft.com/.default",
        "grant_type": "client_credentials",
    })
    result = subprocess.run(
        ["curl", "-s", "-X", "POST", TOKEN_URL, "--data", "@-"],
        input=form_body, capture_output=True, text=True
    )

    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError as err:
        # Empty or non-JSON output (curl missing, network failure, proxy page).
        raise RuntimeError(
            f"Failed to get token: no JSON in response (curl exit code {result.returncode})"
        ) from err

    if not isinstance(data, dict) or "access_token" not in data:
        print(f"[ERROR] Token request failed: {data}")
        raise RuntimeError("Failed to get token")
    return data["access_token"]
def refresh_token_if_needed():
    """Return the cached token, fetching a new one first when required.

    A new token is fetched when none is cached yet or when the call counter
    has reached 30; fetching resets the counter to zero.
    """
    global _token, _api_call_count
    cache_is_usable = _token is not None and _api_call_count < 30
    if cache_is_usable:
        return _token
    _token = get_token()
    _api_call_count = 0
    print(" [Token refreshed]")
    return _token
def graph_get(url, retries=3):
    """GET a Graph API URL with curl and return the decoded JSON body.

    The URL is handed to curl verbatim (``--url``); any query string must
    already be percent-encoded by the caller.

    Args:
        url: Absolute request URL.
        retries: Maximum number of attempts.

    Returns:
        The parsed JSON response, or None when every attempt produced empty or
        unparseable output, when Graph returned a non-retryable error, or when
        throttling persisted through the last attempt.
    """
    global _api_call_count
    token = refresh_token_if_needed()
    _api_call_count += 1

    for attempt in range(retries):
        is_last_attempt = attempt >= retries - 1
        result = subprocess.run(
            ["curl", "-s", "--url", url,
             "-H", f"Authorization: Bearer {token}",
             "-H", "Content-Type: application/json",
             "-H", "ConsistencyLevel: eventual"],
            capture_output=True, text=True
        )

        try:
            data = json.loads(result.stdout) if result.stdout else None
        except json.JSONDecodeError:
            data = None
        if data is None:
            # Empty or garbled output: brief pause, then try again.
            if is_last_attempt:
                return None
            time.sleep(2)
            continue

        if not isinstance(data, dict) or "error" not in data:
            return data

        error = data["error"]
        code = error.get("code", "") if isinstance(error, dict) else ""
        throttled = (
            code in ("TooManyRequests", "ServiceUnavailable", "GatewayTimeout")
            or "429" in str(code)
        )
        # Non-retryable error, or nothing left to retry: give up right away
        # instead of sleeping and fetching a token that would never be used.
        if not throttled or is_last_attempt:
            return None

        wait = 5 * (attempt + 1)
        print(f" [Throttled, waiting {wait}s...]")
        time.sleep(wait)
        token = get_token()
        _api_call_count = 0
    return None
def graph_search(email, top=3, retries=3):
    """Search the mailbox for messages from *email* using ``$search``.

    curl's ``-G`` with ``--data-urlencode`` is used so each query parameter is
    percent-encoded and appended to the URL.

    Args:
        email: Sender address to search for.
        top: Maximum number of messages to request.
        retries: Maximum number of attempts.

    Returns:
        The parsed JSON response, or None when every attempt produced empty or
        unparseable output, when Graph returned a non-retryable error, or when
        throttling persisted through the last attempt.
    """
    global _api_call_count
    token = refresh_token_if_needed()
    _api_call_count += 1
    base_url = f"{GRAPH_BASE}/messages"

    # The address is embedded in a double-quoted search phrase, so backslashes
    # and double quotes are escaped to keep it from terminating the phrase.
    quoted_email = email.replace('\\', '\\\\').replace('"', '\\"')

    for attempt in range(retries):
        is_last_attempt = attempt >= retries - 1
        result = subprocess.run(
            ["curl", "-s", "-G", base_url,
             "--data-urlencode", f"$search=\"from:{quoted_email}\"",
             "--data-urlencode", "$select=subject,from,body",
             "--data-urlencode", f"$top={top}",
             "-H", f"Authorization: Bearer {token}",
             "-H", "Content-Type: application/json",
             "-H", "ConsistencyLevel: eventual"],
            capture_output=True, text=True
        )

        try:
            data = json.loads(result.stdout) if result.stdout else None
        except json.JSONDecodeError:
            data = None
        if data is None:
            # Empty or garbled output: brief pause, then try again.
            if is_last_attempt:
                return None
            time.sleep(2)
            continue

        if not isinstance(data, dict) or "error" not in data:
            return data

        error = data["error"]
        code = error.get("code", "") if isinstance(error, dict) else ""
        throttled = (
            code in ("TooManyRequests", "ServiceUnavailable", "GatewayTimeout")
            or "429" in str(code)
        )
        # Non-retryable error, or nothing left to retry: give up right away
        # instead of sleeping and fetching a token that would never be used.
        if not throttled or is_last_attempt:
            return None

        wait = 5 * (attempt + 1)
        print(f" [Throttled, waiting {wait}s...]")
        time.sleep(wait)
        token = get_token()
        _api_call_count = 0
    return None
# ── Phone extraction ──
# Shared number body used by both patterns so that anything the labeled
# pattern accepts can also be re-parsed by PHONE_RE:
#   * optional US country code ("1", "+1", "1-", "+1 ")
#   * area code with optional parentheses, then an optional separator
#   * 3 + 4 digits with optional separators
#   * (?!\d) so a match never stops in the middle of a longer digit run
_PHONE_BODY = r'(?:\+?1[\s.\-]?)?\(?\d{3}\)?[\s.\-]?\s?\d{3}[\s.\-]?\d{4}(?!\d)'
# (?<!\d) keeps the match from starting inside a longer number (IDs, tracking
# numbers), which previously produced bogus 10-digit "phones".
PHONE_RE = re.compile(r'(?<!\d)' + _PHONE_BODY)
# A number immediately preceded by a label such as "Tel:" or "Cell ".
LABELED_PHONE_RE = re.compile(
    r'(?:Tel|Phone|Cell|Mobile|Office|Direct|Fax)[:\s]*' + _PHONE_BODY,
    re.IGNORECASE
)
# Captures just the label word from a LABELED_PHONE_RE match.
LABEL_RE = re.compile(r'(Tel|Phone|Cell|Mobile|Office|Direct|Fax)', re.IGNORECASE)
# Text fragments that suggest a line begins (or belongs to) a signature block.
SIGNATURE_MARKERS = [
    # separator rules
    '--', '---', '____', '====',
    # sign-offs and client footers
    'Best regards', 'Kind regards', 'Regards',
    'Sincerely', 'Thank you', 'Thanks', 'Sent from', 'Get Outlook',
    'Best,', 'Cheers', 'Warm regards', 'All the best',
]

# Fragments that typically introduce a quoted or forwarded message.
# NOTE(review): no function in the visible part of this file reads this list.
REPLY_MARKERS = [
    'From:', 'Sent:', '-----Original Message', '________________________________',
    'On ', '> On ', 'Begin forwarded message', 'wrote:',
]
def strip_html(text):
    """Convert an HTML fragment to plain text.

    Line-break and block-level tags become newlines, every other tag is
    dropped, character references are decoded, and runs of three or more
    newlines are squeezed down to two.
    """
    # Applied in order: <br> first, then block-level tags, then everything else.
    tag_rules = (
        (r'<br\s*/?>', '\n', re.IGNORECASE),
        (r'</?(?:p|div|tr|td|li|blockquote|table|tbody|thead|th|hr)[^>]*>', '\n', re.IGNORECASE),
        (r'<[^>]+>', '', 0),
    )
    for pattern, replacement, flags in tag_rules:
        text = re.sub(pattern, replacement, text, flags=flags)
    decoded = html.unescape(text)
    return re.sub(r'\n{3,}', '\n\n', decoded)
def extract_first_message_body(body_html):
    """Return the plain text of the newest message in a thread.

    The HTML is flattened with strip_html() and everything from the first line
    that looks like the start of a quoted reply onward is discarded. Lines 0-4
    are never examined, and a "From:" header only counts past line 10.
    """
    lines = strip_html(body_html).split('\n')

    def begins_quoted_reply(index, raw_line):
        stripped = raw_line.strip()
        # "From: Name <email>" header of a quoted message
        if index > 10 and re.match(r'^From:\s+.+', stripped):
            return True
        # "On <date>, <name> wrote:" attribution line
        if re.match(r'^On .+wrote:\s*$', stripped):
            return True
        if '-----Original Message' in stripped:
            return True
        return stripped.startswith('________________________________')

    cutoff = next(
        (i for i, raw_line in enumerate(lines) if i >= 5 and begins_quoted_reply(i, raw_line)),
        len(lines),
    )
    return '\n'.join(lines[:cutoff])
def _find_labeled_phone(text):
    """Return (phone, label) for the best labeled number in *text*, else (None, None).

    The first labeled number whose label is not "Fax" wins; a fax number is
    only returned when no other labeled number is present.
    """
    fax_fallback = None
    for match in LABELED_PHONE_RE.finditer(text):
        snippet = match.group(0)
        number = PHONE_RE.search(snippet)
        if not number:
            continue
        label_match = LABEL_RE.search(snippet)
        label = label_match.group(1).capitalize() if label_match else None
        candidate = (normalize_phone(number.group(0)), label)
        if label != 'Fax':
            return candidate
        if fax_fallback is None:
            fax_fallback = candidate
    return fax_fallback or (None, None)


def _signature_text(lines):
    """Return the trailing part of *lines* that is treated as the signature.

    Scans upward through at most the last 40 lines for a signature marker and
    starts there; with no marker, the last 25 lines are used.
    """
    markers = [marker.lower() for marker in SIGNATURE_MARKERS]
    stop = max(len(lines) - 40, -1)
    for i in range(len(lines) - 1, stop, -1):
        line = lines[i].strip().lower()
        if any(marker in line for marker in markers):
            return '\n'.join(lines[i:])
    return '\n'.join(lines[max(0, len(lines) - 25):])


def extract_phone_from_body(body_html, sender_email):
    """Extract a phone number from the newest message of an HTML email body.

    Only the first (most recent) message is inspected so numbers belonging to
    other people in quoted replies are not picked up. Search order: labeled
    number in the signature, any number in the signature, labeled number in
    the whole first message, any number in the whole first message.

    Args:
        body_html: HTML body of the message; falsy values yield no result.
        sender_email: Accepted for interface compatibility; not used.

    Returns:
        ``(phone, label)`` where *phone* is normalized by normalize_phone()
        and *label* is the capitalized label word or None; ``(None, None)``
        when nothing is found.
    """
    if not body_html:
        return None, None

    first_msg = extract_first_message_body(body_html)
    sig_text = _signature_text(first_msg.split('\n'))

    # Signature first, then widen to the whole first message.
    for scope in (sig_text, first_msg):
        phone, label = _find_labeled_phone(scope)
        if phone:
            return phone, label
        unlabeled = PHONE_RE.search(scope)
        if unlabeled:
            return normalize_phone(unlabeled.group(0)), None
    return None, None
def normalize_phone(raw):
    """Format a phone string as ``(xxx) xxx-xxxx``.

    An 11-digit number starting with "1" has that leading digit dropped.
    Anything that does not come out as exactly 10 digits is returned
    unchanged apart from surrounding whitespace.
    """
    digits = re.sub(r'\D', '', raw)
    has_leading_one = len(digits) == 11 and digits.startswith('1')
    national = digits[1:] if has_leading_one else digits
    if len(national) != 10:
        return raw.strip()
    area, exchange, subscriber = national[:3], national[3:6], national[6:]
    return f"({area}) {exchange}-{subscriber}"
# ── Main ──
def _is_junk(email):
    """Return True when *email* matches a junk keyword or a commercial domain."""
    email_lower = email.lower()
    if any(kw in email_lower for kw in JUNK_KEYWORDS):
        return True
    domain = email_lower.split('@')[-1] if '@' in email_lower else ''
    return any(domain == cd or domain.endswith('.' + cd) for cd in COMMERCIAL_DOMAINS)


def _contact_record(contact, phone=None, phone_label=None):
    """Build one output row from an input contact plus any phone that was found."""
    return {
        "email": contact["email"],
        "display_name": contact["display_name"],
        "sent_count": contact["sent_count"],
        "received_count": contact["received_count"],
        "total": contact["total"],
        "phone": phone,
        "phone_label": phone_label
    }


def _lookup_phone(email):
    """Search recent mail from *email* and return (phone, label) or (None, None)."""
    resp = graph_search(email, top=3)
    if not resp or "value" not in resp:
        return None, None
    for msg in resp["value"]:
        # "from" / "body" may be present but null in the JSON, so guard every
        # level with "or {}" rather than relying on dict.get defaults.
        sender = (msg.get("from") or {}).get("emailAddress") or {}
        msg_from = (sender.get("address") or "").lower()
        # Verify this message is actually FROM the target email
        if msg_from != email.lower():
            continue
        body_content = (msg.get("body") or {}).get("content") or ""
        phone, phone_label = extract_phone_from_body(body_content, email)
        if phone:
            return phone, phone_label
    return None, None


def main(lookup_limit=60):
    """Find two-way correspondents missing from contacts and look up phones.

    Loads INPUT_FILE, keeps entries with ``sent_count > 0`` that are not junk,
    sorts them by ``total`` descending, searches the mailbox for a phone
    number for the first *lookup_limit* of them, writes all rows to
    OUTPUT_FILE and prints a table and summary.

    Args:
        lookup_limit: How many of the top contacts get a phone lookup.
    """
    print("=" * 80)
    print(" Bardach Missing Real Contacts - Phone Number Finder")
    print("=" * 80)

    # 1. Load input
    with open(INPUT_FILE, encoding='utf-8') as f:
        data = json.load(f)
    missing = data["missing"]
    print(f"\n[INFO] Total missing contacts loaded: {len(missing)}")

    # 2. Filter sent_count > 0
    two_way = [c for c in missing if c["sent_count"] > 0]
    print(f"[INFO] Two-way correspondents (sent_count > 0): {len(two_way)}")

    # 3. Filter junk
    real = [c for c in two_way if not _is_junk(c["email"])]
    print(f"[INFO] After junk filter: {len(real)}")

    # 4. Sort by total descending
    real.sort(key=lambda c: c["total"], reverse=True)
    print(f"\n[SUCCESS] {len(real)} real two-way correspondents are missing from contacts\n")

    # 5. Phone lookup for the top contacts
    top_n = max(0, min(lookup_limit, len(real)))
    print(f"[INFO] Searching for phone numbers in top {top_n} contacts...")
    print("-" * 80)
    results = []
    phones_found = 0
    for idx, contact in enumerate(real[:top_n]):
        email = contact["email"]
        name = contact["display_name"] or email.split('@')[0]
        print(f" [{idx+1:2d}/{top_n}] {name[:35]:35s} <{email[:40]}>", end="", flush=True)
        phone, phone_label = _lookup_phone(email)
        if phone:
            phones_found += 1
            label_str = f" ({phone_label})" if phone_label else ""
            print(f" -> {phone}{label_str}")
        else:
            print(f" -> --")
        results.append(_contact_record(contact, phone, phone_label))

    # 6. Add remaining contacts (beyond the lookup limit) without phone lookup
    results.extend(_contact_record(contact) for contact in real[top_n:])

    # 7. Save output
    output = {
        "generated": datetime.now().isoformat(),
        "total_two_way": len(real),
        "with_phone": phones_found,
        "without_phone": len(real) - phones_found,
        "contacts": results
    }
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\n[SUCCESS] Saved to {OUTPUT_FILE}")

    # 8. Print table
    print(f"\n{'='*110}")
    print(f" MISSING REAL CONTACTS - TOP {top_n} (sorted by total exchanges)")
    print(f"{'='*110}")
    print(f" {'#':>3} {'Name':<30} {'Email':<40} {'Total':>6} {'Phone':<25}")
    print(f" {'-'*3} {'-'*30} {'-'*40} {'-'*6} {'-'*25}")
    for i, c in enumerate(results[:top_n]):
        name = (c["display_name"] or c["email"].split('@')[0])[:30]
        email_short = c["email"][:40]
        phone_str = c["phone"] or "--"
        if c["phone_label"]:
            phone_str = f"{c['phone']} ({c['phone_label']})"
        print(f" {i+1:3d} {name:<30} {email_short:<40} {c['total']:6d} {phone_str}")
    print(f"\n{'='*110}")
    print(f" SUMMARY")
    print(f"{'='*110}")
    print(f" Total two-way correspondents missing: {len(real)}")
    print(f" Phone numbers found (top {top_n}): {phones_found}")
    print(f" Without phone (top {top_n}): {top_n - phones_found}")
    print(f"{'='*110}")


if __name__ == "__main__":
    main()