#!/usr/bin/env python3
"""Analyze website/URL fields for all Bardach contacts in Microsoft 365.

The script acquires an app-only Microsoft Graph token (client-credentials
grant, via ``curl``), pages through every contact of ``USER``, classifies each
contact's ``businessHomePage`` value as junk / legitimate / suspicious, prints
a report, and writes the full result set to ``OUTPUT_PATH`` as JSON.
"""

import json
import os
import subprocess
import sys
import time
from collections import defaultdict
from urllib.parse import urlencode, urlparse

TENANT_ID = "dd4a82e8-85a3-44ac-8800-07945ab4d95f"
CLIENT_ID = "fabb3421-8b34-484b-bc17-e46de9703418"
# SECURITY: a client secret is hard-coded in this file. It should be rotated
# and removed from source control. The BARDACH_CLIENT_SECRET environment
# variable takes precedence; the literal is kept only as a fallback so that
# existing invocations keep working.
CLIENT_SECRET = os.environ.get(
    "BARDACH_CLIENT_SECRET",
    "~QJ8Q~NyQSs4OcGqHZyPrA2CVnq9KBfKiimntbMO",
)
SCOPE = "https://graph.microsoft.com/.default"
USER = "barbara@bardach.net"
TOKEN_URL = f"https://login.microsoftonline.com/{TENANT_ID}/oauth2/v2.0/token"
OUTPUT_PATH = "D:/ClaudeTools/temp/bardach_url_analysis.json"

# Patterns that mark a URL as junk. Entries that look like a bare domain
# ("x.com") are matched against the URL's host on a label boundary; every
# other entry (schemes, path fragments, "cid-") is matched as a substring.
JUNK_PATTERNS = [
    "ms-outlook://",
    "linkedin.com/in/",
    "linkedin.com/company/",
    "outlook.live.com",
    "profile.live.com",
    "people.live.com",
    "social.microsoft.com",
    "contact.skype.com",
    "d.docs.live.net",
    "storage.live.com",
    "onedrive.live.com",
    "1drv.ms",
    "facebook.com",
    "twitter.com",
    "x.com",
    "plus.google.com",
    "instagram.com",
    "myspace.com",
    "flickr.com",
    "foursquare.com",
    "about.me",
    "gravatar.com",
    "apis.live.net",
    "cid-",
    "skype:",
]

# Social media domains that M365 auto-links
SOCIAL_DOMAINS = {
    "linkedin.com", "www.linkedin.com",
    "facebook.com", "www.facebook.com", "m.facebook.com",
    "twitter.com", "www.twitter.com",
    "x.com", "www.x.com",
    "instagram.com", "www.instagram.com",
    "plus.google.com",
    "myspace.com", "www.myspace.com",
    "flickr.com", "www.flickr.com",
    "foursquare.com", "www.foursquare.com",
    "about.me",
    "gravatar.com", "www.gravatar.com",
}

# Microsoft internal domains
MS_DOMAINS = {
    "outlook.live.com",
    "profile.live.com",
    "people.live.com",
    "social.microsoft.com",
    "contact.skype.com",
    "d.docs.live.net",
    "storage.live.com",
    "onedrive.live.com",
    "1drv.ms",
    "apis.live.net",
}


def _run_curl_json(args, stdin_text=None):
    """Run ``curl`` with *args* and return the decoded JSON object.

    Args:
        args: curl arguments following the executable name.
        stdin_text: optional text piped to curl's stdin (used with
            ``--data @-`` so the payload never appears in the process list).

    Returns:
        The parsed JSON object as a dict, or ``None`` when curl could not be
        run, exited non-zero, or produced output that is not a JSON object.
        An ``[ERROR]`` line describing the failure is printed in that case.
    """
    try:
        result = subprocess.run(
            # -S keeps curl's own error message available on stderr despite -s.
            ["curl", "-sS", *args],
            input=stdin_text,
            capture_output=True,
            text=True,
            # Decode explicitly as UTF-8 instead of the locale codec, which is
            # not UTF-8 on a default Windows setup.
            encoding="utf-8",
        )
    except OSError as exc:
        print(f"[ERROR] Could not run curl: {exc}")
        return None

    if result.returncode != 0:
        print(f"[ERROR] curl exited with code {result.returncode}: {result.stderr.strip()}")
        return None

    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError:
        print(f"[ERROR] Response was not valid JSON: {result.stdout[:300]!r}")
        return None

    if not isinstance(data, dict):
        print(f"[ERROR] Unexpected response: {json.dumps(data)[:300]}")
        return None
    return data


def get_token():
    """Acquire a token via the client-credentials grant.

    Returns:
        The ``access_token`` string from the token endpoint response.

    Exits the process with status 1 when no token could be obtained.
    """
    # urlencode escapes reserved characters in the values; the body is sent
    # over stdin so the client secret is not exposed in curl's argv.
    body = urlencode({
        "client_id": CLIENT_ID,
        "scope": SCOPE,
        "client_secret": CLIENT_SECRET,
        "grant_type": "client_credentials",
    })
    data = _run_curl_json(
        [
            "-X", "POST", TOKEN_URL,
            "-H", "Content-Type: application/x-www-form-urlencoded",
            "--data", "@-",
        ],
        stdin_text=body,
    )
    if data is None or "access_token" not in data:
        print(f"[ERROR] Token acquisition failed: {data}")
        sys.exit(1)
    return data["access_token"]


def fetch_all_contacts(token):
    """Fetch all contacts with pagination.

    Follows ``@odata.nextLink`` until it is absent, re-acquiring the token
    every 500 requests. On a failed or unexpected response the loop stops and
    whatever was collected so far is returned.

    Args:
        token: bearer token used for the first requests.

    Returns:
        ``(contacts, token)`` - the list of contact dicts and the token that
        was in use when fetching ended (it may have been refreshed).
    """
    contacts = []
    select = "id,displayName,businessHomePage,emailAddresses,personalNotes,companyName"
    url = f"https://graph.microsoft.com/v1.0/users/{USER}/contacts?$select={select}&$top=100"
    call_count = 0

    while url:
        call_count += 1

        # Re-acquire token every 500 calls
        if call_count > 1 and (call_count - 1) % 500 == 0:
            print(f"[INFO] Re-acquiring token at call {call_count}...")
            token = get_token()
            print("[OK] Token re-acquired")

        # NOTE: the bearer token is still passed in curl's argv here.
        data = _run_curl_json([
            "-X", "GET", url,
            "-H", f"Authorization: Bearer {token}",
            "-H", "Content-Type: application/json",
        ])
        if data is None:
            break
        if "value" not in data:
            print(f"[ERROR] Unexpected response: {json.dumps(data)[:300]}")
            break

        batch = data["value"]
        contacts.extend(batch)

        # Progress message, throttled by the running total.
        fetched = len(contacts)
        crossed = fetched % 500 == 0 or fetched % 100 < len(batch)
        if crossed and fetched % 500 < 100:
            print(f"[INFO] Fetched {fetched} contacts so far...")

        url = data.get("@odata.nextLink")
        if url:
            time.sleep(0.05)

    return contacts, token


def _extract_host(url_lower):
    """Return the host part of an already lower-cased, stripped URL.

    ``https://`` is prepended unless the value starts with ``http://`` or
    ``https://``. Port, credentials and a trailing dot are not part of the
    result. Returns ``""`` when there is no host; ``urlparse`` may raise
    ``ValueError`` for malformed input.
    """
    # Test for the full scheme prefix: a bare domain such as "httpbin.org"
    # starts with "http" but still needs a scheme to be parsed as a host.
    if not url_lower.startswith(("http://", "https://")):
        url_lower = "https://" + url_lower
    return (urlparse(url_lower).hostname or "").rstrip(".")


def _is_domain_pattern(pattern):
    """Return True when a junk pattern is a bare domain name (no scheme/path)."""
    return "." in pattern and "/" not in pattern and ":" not in pattern


def _host_matches(host, domain):
    """Return True when *host* is *domain* or one of its subdomains."""
    return host == domain or host.endswith("." + domain)


def categorize_url(url_str):
    """Categorize a URL as junk, legitimate, or suspicious.

    Args:
        url_str: raw URL text (may be empty or ``None``).

    Returns:
        ``(category, detail)`` where *category* is ``"junk"``,
        ``"legitimate"`` or ``"suspicious"``. *detail* is the matching junk
        pattern or host for junk, the host for legitimate URLs, and a reason
        code (``"empty"``, ``"too_short"``, ``"no_valid_domain"``,
        ``"parse_error"``) for suspicious ones.
    """
    if not url_str or not url_str.strip():
        return "suspicious", "empty"

    url_lower = url_str.lower().strip()

    # Check for obviously malformed
    if len(url_lower) < 4:
        return "suspicious", "too_short"

    try:
        host = _extract_host(url_lower)
    except ValueError:
        # Substring patterns can still be checked without a parsed host.
        host = None

    # Check junk patterns in list order so the reported pattern is stable.
    for pattern in JUNK_PATTERNS:
        if _is_domain_pattern(pattern):
            # Compare on a label boundary: "x.com" must not match
            # "dropbox.com" or "netflix.com".
            if host and _host_matches(host, pattern):
                return "junk", pattern
        elif pattern in url_lower:
            return "junk", pattern

    if host is None:
        return "suspicious", "parse_error"

    # Check social/MS domains
    if host in SOCIAL_DOMAINS:
        return "junk", host
    if host in MS_DOMAINS:
        return "junk", host

    # Check for no real domain
    if not host or "." not in host:
        return "suspicious", "no_valid_domain"

    return "legitimate", host


def extract_domain(url_str):
    """Extract domain from URL.

    Returns the lower-cased host, or ``"unknown"`` when *url_str* is empty,
    has no host, or cannot be parsed.
    """
    if not url_str:
        return "unknown"
    try:
        host = _extract_host(url_str.lower().strip())
    except ValueError:
        return "unknown"
    return host or "unknown"


def _print_banner(title):
    """Print *title* framed by two 70-character rule lines."""
    print("=" * 70)
    print(title)
    print("=" * 70)


def main():
    """Fetch all contacts, classify their website field, report and save."""
    _print_banner("BARDACH CONTACTS - WEBSITE/URL FIELD ANALYSIS")
    print()

    token = get_token()
    print("[OK] Token acquired")

    print("[INFO] Fetching all contacts...")
    contacts, token = fetch_all_contacts(token)
    total = len(contacts)
    print(f"[OK] Fetched {total} total contacts")
    print()

    # Analyze businessHomePage
    contacts_with_url = []
    contacts_without_url = []
    for c in contacts:
        bhp = c.get("businessHomePage")
        if bhp and bhp.strip():
            contacts_with_url.append(c)
        else:
            contacts_without_url.append(c)

    print(f"[INFO] Contacts with businessHomePage: {len(contacts_with_url)}")
    print(f"[INFO] Contacts without businessHomePage: {len(contacts_without_url)}")
    print()

    # Categorize
    junk_contacts = []
    legitimate_contacts = []
    suspicious_contacts = []
    linkedin_profiles = []
    facebook_profiles = []
    domain_counts = defaultdict(int)
    junk_by_pattern = defaultdict(list)

    for c in contacts_with_url:
        url = c.get("businessHomePage", "").strip()
        category, detail = categorize_url(url)
        domain = extract_domain(url)
        domain_counts[domain] += 1

        entry = {
            "id": c["id"],
            "displayName": c.get("displayName", ""),
            "url": url,
            "companyName": c.get("companyName", ""),
            "category": category,
            "detail": detail,
            "domain": domain,
        }

        if category == "junk":
            junk_contacts.append(entry)
            junk_by_pattern[detail].append(entry)
            # Cross-reference LinkedIn
            url_lower = url.lower()
            if "linkedin.com" in url_lower:
                linkedin_profiles.append(entry)
            elif "facebook.com" in url_lower:
                facebook_profiles.append(entry)
        elif category == "legitimate":
            legitimate_contacts.append(entry)
        else:
            suspicious_contacts.append(entry)

    # Largest groups first; ties keep first-seen order.
    junk_groups = sorted(junk_by_pattern.items(), key=lambda x: -len(x[1]))
    domains_by_count = sorted(domain_counts.items(), key=lambda x: -x[1])

    # Print report
    _print_banner("RESULTS SUMMARY")
    print(f"Total contacts: {total}")
    print(f"Contacts with businessHomePage: {len(contacts_with_url)}")
    print(f" - Junk (auto-inserted): {len(junk_contacts)}")
    print(f" - Legitimate websites: {len(legitimate_contacts)}")
    print(f" - Suspicious/broken: {len(suspicious_contacts)}")
    print()

    # Junk URLs grouped by pattern
    _print_banner("JUNK URLs BY PATTERN")
    for pattern, entries in junk_groups:
        print(f"\n Pattern: {pattern} ({len(entries)} contacts)")
        for e in entries[:5]:
            print(f" - {e['displayName']}: {e['url']}")
        if len(entries) > 5:
            print(f" ... and {len(entries) - 5} more")

    # Suspicious URLs
    print()
    _print_banner("SUSPICIOUS/BROKEN URLs")
    if suspicious_contacts:
        for e in suspicious_contacts:
            print(f" - {e['displayName']}: \"{e['url']}\" (reason: {e['detail']})")
    else:
        print(" None found")

    # Legitimate URLs (first 30)
    print()
    _print_banner("LEGITIMATE URLs (first 30)")
    for e in legitimate_contacts[:30]:
        company = f" [{e['companyName']}]" if e['companyName'] else ""
        print(f" - {e['displayName']}{company}: {e['url']}")
    if len(legitimate_contacts) > 30:
        print(f" ... and {len(legitimate_contacts) - 30} more")

    # Domain distribution
    print()
    _print_banner("DOMAIN DISTRIBUTION")
    for domain, count in domains_by_count:
        print(f" {domain}: {count}")

    # LinkedIn cross-reference
    print()
    _print_banner(f"LINKEDIN PROFILES ({len(linkedin_profiles)})")
    for e in linkedin_profiles:
        print(f" - {e['displayName']}: {e['url']}")

    # Facebook cross-reference
    print()
    _print_banner(f"FACEBOOK PROFILES ({len(facebook_profiles)})")
    for e in facebook_profiles:
        print(f" - {e['displayName']}: {e['url']}")

    # Save results
    results = {
        "summary": {
            "total_contacts": total,
            "contacts_with_url": len(contacts_with_url),
            "contacts_without_url": len(contacts_without_url),
            "junk_count": len(junk_contacts),
            "legitimate_count": len(legitimate_contacts),
            "suspicious_count": len(suspicious_contacts),
            "linkedin_count": len(linkedin_profiles),
            "facebook_count": len(facebook_profiles),
        },
        "junk_contacts": junk_contacts,
        "legitimate_contacts": legitimate_contacts,
        "suspicious_contacts": suspicious_contacts,
        "linkedin_profiles": linkedin_profiles,
        "facebook_profiles": facebook_profiles,
        "domain_distribution": dict(domains_by_count),
        "junk_by_pattern": {
            k: [{"displayName": e["displayName"], "url": e["url"]} for e in v]
            for k, v in junk_groups
        },
    }

    out_path = OUTPUT_PATH
    # Create the target directory first so a missing folder does not discard
    # the results after all contacts have already been fetched.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n[OK] Results saved to {out_path}")


if __name__ == "__main__":
    main()