Files
claudetools/temp/bardach_url_analysis.py
Mike Swanson fa15b03180 sync: Auto-sync from ACG-M-L5090 at 2026-03-10 19:11:00
Synced files:
- Quote wizard frontend (all components, hooks, types, config)
- API updates (config, models, routers, schemas, services)
- Client work (bg-builders, gurushow)
- Scripts (BGB Lesley termination, CIPP, Datto, migration)
- Temp files (Bardach contacts, VWP investigation, misc)
- Credentials and session logs
- Email service, PHP API, session logs

Machine: ACG-M-L5090
Timestamp: 2026-03-10 19:11:00

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 19:59:08 -07:00

350 lines
10 KiB
Python

#!/usr/bin/env python3
"""Analyze website/URL fields for all Bardach contacts in Microsoft 365."""
import json
import os
import subprocess
import sys
import time
from collections import defaultdict
from urllib.parse import urlparse
# --- Microsoft Graph / OAuth2 client-credentials configuration -------------
# Directory (tenant) and application (client) identifiers of the app
# registration used to obtain the token.
TENANT_ID = "dd4a82e8-85a3-44ac-8800-07945ab4d95f"
CLIENT_ID = "fabb3421-8b34-484b-bc17-e46de9703418"
# The client secret is a credential and must not live in source control.
# It is read from the BARDACH_GRAPH_CLIENT_SECRET environment variable; when
# the variable is unset this is an empty string and the token request made by
# get_token() will be rejected by the token endpoint.
# NOTE(review): a secret value was previously committed in this file; treat it
# as compromised and rotate it in the app registration.
CLIENT_SECRET = os.environ.get("BARDACH_GRAPH_CLIENT_SECRET", "")
# ".default" requests the permissions already granted to the application.
SCOPE = "https://graph.microsoft.com/.default"
# Mailbox whose contacts are analyzed.
USER = "barbara@bardach.net"
TOKEN_URL = f"https://login.microsoftonline.com/{TENANT_ID}/oauth2/v2.0/token"
# Substrings that mark a website value as junk. categorize_url() walks this
# list in order and reports the first pattern that matches, so the order of
# the entries is significant.
JUNK_PATTERNS = [
    "ms-outlook://",
    "linkedin.com/in/", "linkedin.com/company/",
    "outlook.live.com", "profile.live.com", "people.live.com",
    "social.microsoft.com", "contact.skype.com",
    "d.docs.live.net", "storage.live.com", "onedrive.live.com",
    "1drv.ms",
    "facebook.com", "twitter.com", "x.com",
    "plus.google.com", "instagram.com", "myspace.com",
    "flickr.com", "foursquare.com", "about.me", "gravatar.com",
    "apis.live.net",
    "cid-",
    "skype:",
]

# Social media domains that M365 auto-links
SOCIAL_DOMAINS = {
    "linkedin.com",
    "www.linkedin.com",
    "facebook.com",
    "www.facebook.com",
    "m.facebook.com",
    "twitter.com",
    "www.twitter.com",
    "x.com",
    "www.x.com",
    "instagram.com",
    "www.instagram.com",
    "plus.google.com",
    "myspace.com",
    "www.myspace.com",
    "flickr.com",
    "www.flickr.com",
    "foursquare.com",
    "www.foursquare.com",
    "about.me",
    "gravatar.com",
    "www.gravatar.com",
}

# Microsoft internal domains
MS_DOMAINS = {
    "outlook.live.com",
    "profile.live.com",
    "people.live.com",
    "social.microsoft.com",
    "contact.skype.com",
    "d.docs.live.net",
    "storage.live.com",
    "onedrive.live.com",
    "1drv.ms",
    "apis.live.net",
}
def get_token():
    """Request an access token with the OAuth2 client-credentials grant.

    POSTs CLIENT_ID, SCOPE and CLIENT_SECRET to TOKEN_URL through ``curl`` and
    returns the ``access_token`` field of the JSON response.

    Returns:
        The access token string.

    The process exits with status 1 (after printing an ``[ERROR]`` line) when
    curl fails, the response is not a JSON object, or it contains no
    ``access_token``.
    """
    from urllib.parse import urlencode

    # urlencode() percent-escapes every value, so characters such as "&",
    # "+" or "=" inside the secret cannot corrupt the form body.
    form_body = urlencode({
        "client_id": CLIENT_ID,
        "scope": SCOPE,
        "client_secret": CLIENT_SECRET,
        "grant_type": "client_credentials",
    })
    # "-d @-" makes curl read the request body from stdin, which keeps the
    # client secret out of the process argument list.
    result = subprocess.run(
        ["curl", "-s", "-X", "POST", TOKEN_URL,
         "-H", "Content-Type: application/x-www-form-urlencoded",
         "-d", "@-"],
        input=form_body, capture_output=True, text=True,
    )
    if result.returncode != 0:
        print(f"[ERROR] Token acquisition failed: curl exited with code {result.returncode}")
        sys.exit(1)
    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError:
        print(f"[ERROR] Token acquisition failed: non-JSON response {result.stdout[:300]!r}")
        sys.exit(1)
    if not isinstance(data, dict) or "access_token" not in data:
        print(f"[ERROR] Token acquisition failed: {data}")
        sys.exit(1)
    return data["access_token"]
def fetch_all_contacts(token):
    """Fetch all contacts with pagination.

    Requests ``/users/{USER}/contacts`` from Microsoft Graph via ``curl`` in
    pages of up to 100 and follows ``@odata.nextLink`` until no further page
    is advertised.

    Args:
        token: Bearer token used for the first requests.

    Returns:
        A ``(contacts, token)`` tuple: the list of contact dicts collected so
        far and the token that was in use last (it is replaced every 500
        requests).

    A failed curl call or an unexpected response is reported with an
    ``[ERROR]`` line and ends the loop; the contacts gathered up to that point
    are still returned.
    """
    contacts = []
    select = "id,displayName,businessHomePage,emailAddresses,personalNotes,companyName"
    url = f"https://graph.microsoft.com/v1.0/users/{USER}/contacts?$select={select}&$top=100"
    call_count = 0
    while url:
        call_count += 1
        # Re-acquire token every 500 calls
        if call_count > 1 and (call_count - 1) % 500 == 0:
            print(f"[INFO] Re-acquiring token at call {call_count}...")
            token = get_token()
            print("[OK] Token re-acquired")
        result = subprocess.run(
            ["curl", "-s", "-X", "GET", url,
             "-H", f"Authorization: Bearer {token}",
             "-H", "Content-Type: application/json"],
            capture_output=True, text=True,
        )
        if result.returncode != 0:
            print(f"[ERROR] curl exited with code {result.returncode} on call {call_count}")
            break
        try:
            data = json.loads(result.stdout)
        except json.JSONDecodeError:
            # Keep what was fetched so far instead of crashing on a bad page.
            print(f"[ERROR] Unexpected response: {result.stdout[:300]}")
            break
        if not isinstance(data, dict) or "value" not in data:
            print(f"[ERROR] Unexpected response: {json.dumps(data)[:300]}")
            break
        batch = data["value"]
        fetched_before = len(contacts)
        contacts.extend(batch)
        # Report progress each time the running total crosses a multiple of 500.
        if len(contacts) // 500 > fetched_before // 500:
            print(f"[INFO] Fetched {len(contacts)} contacts so far...")
        url = data.get("@odata.nextLink")
        if url:
            # Short pause between pages.
            time.sleep(0.05)
    return contacts, token
def _matches_junk_pattern(url_lower, pattern):
    """Return True when *pattern* marks *url_lower* as junk.

    Patterns without a "." (such as "cid-", "skype:" or "ms-outlook://") are
    plain substring tests. Patterns containing a "." are treated as domain
    names and must sit on a host-label boundary, so "x.com" matches
    "https://x.com/abc" and "www.x.com" but not "dropbox.com" or "x.com.au".
    """
    if "." not in pattern:
        return pattern in url_lower

    # Only guard an edge of the pattern that is itself part of a host label.
    guard_left = pattern[0].isalnum()
    guard_right = pattern[-1].isalnum()
    start = url_lower.find(pattern)
    while start != -1:
        end = start + len(pattern)
        prev_char = url_lower[start - 1] if start > 0 else ""
        tail = url_lower[end:end + 2]
        label_continues_left = prev_char.isalnum() or prev_char == "-"
        label_continues_right = (
            tail[:1].isalnum()
            or tail[:1] == "-"
            or (tail[:1] == "." and tail[1:2].isalnum())
        )
        if not (guard_left and label_continues_left) and not (guard_right and label_continues_right):
            return True
        start = url_lower.find(pattern, start + 1)
    return False


def categorize_url(url_str):
    """Categorize a URL as junk, legitimate, or suspicious.

    Args:
        url_str: Raw website value; may be None, empty or whitespace.

    Returns:
        A ``(category, detail)`` tuple where category is one of:

        * ``"junk"`` - the value matches an entry of JUNK_PATTERNS (detail is
          that pattern) or its host is in SOCIAL_DOMAINS / MS_DOMAINS (detail
          is the host).
        * ``"legitimate"`` - the value parses to a host containing a dot
          (detail is the host, without port or credentials).
        * ``"suspicious"`` - detail is ``"empty"``, ``"too_short"``,
          ``"no_valid_domain"`` or ``"parse_error"``.
    """
    if not url_str or not url_str.strip():
        return "suspicious", "empty"
    url_lower = url_str.lower().strip()
    # Check for obviously malformed
    if len(url_lower) < 4:
        return "suspicious", "too_short"
    # Check junk patterns; the first matching pattern wins.
    for pattern in JUNK_PATTERNS:
        if _matches_junk_pattern(url_lower, pattern):
            return "junk", pattern
    # Add scheme if missing. Only a real http(s) prefix counts: a bare
    # startswith("http") would treat hosts such as "httpbin.org" as already
    # having a scheme and leave them without a parsable host.
    parse_url = url_lower
    if not parse_url.startswith(("http://", "https://")):
        parse_url = "https://" + parse_url
    try:
        # hostname (unlike netloc) drops any port and user-info, so
        # "facebook.com:443" still compares equal to "facebook.com".
        domain = urlparse(parse_url).hostname or ""
    except ValueError:
        return "suspicious", "parse_error"
    # Check social/MS domains
    if domain in SOCIAL_DOMAINS or domain in MS_DOMAINS:
        return "junk", domain
    # Check for no real domain
    if "." not in domain:
        return "suspicious", "no_valid_domain"
    return "legitimate", domain
def extract_domain(url_str):
    """Extract domain from URL.

    Args:
        url_str: Raw URL or bare host; may be None or empty.

    Returns:
        The lower-cased host name without port or credentials, or
        ``"unknown"`` when the input is empty or no host can be parsed.
    """
    if not url_str:
        return "unknown"
    candidate = url_str.lower().strip()
    # Only a real http(s) prefix counts as a scheme; startswith("http") alone
    # would leave hosts such as "httpbin.org" without one and unparsable.
    if not candidate.startswith(("http://", "https://")):
        candidate = "https://" + candidate
    try:
        # hostname strips the port and any user-info that netloc would keep.
        return urlparse(candidate).hostname or "unknown"
    except ValueError:
        return "unknown"
def main():
    """Fetch the contacts, classify every businessHomePage and report.

    Prints a sectioned text report to stdout (summary, junk URLs grouped by
    pattern, suspicious URLs, the first 30 legitimate URLs, the domain
    distribution and the LinkedIn / Facebook lists) and writes the same data
    as JSON to a fixed output path.
    """
    rule = "=" * 70

    def banner(title):
        # Section heading framed by two rules.
        print(rule)
        print(title)
        print(rule)

    def has_home_page(contact):
        page = contact.get("businessHomePage")
        return bool(page and page.strip())

    def list_name_and_url(items):
        for item in items:
            print(f" - {item['displayName']}: {item['url']}")

    banner("BARDACH CONTACTS - WEBSITE/URL FIELD ANALYSIS")
    print()
    token = get_token()
    print("[OK] Token acquired")
    print("[INFO] Fetching all contacts...")
    contacts, token = fetch_all_contacts(token)
    total = len(contacts)
    print(f"[OK] Fetched {total} total contacts")
    print()

    # Analyze businessHomePage: split contacts by whether the field is set.
    contacts_with_url = [c for c in contacts if has_home_page(c)]
    contacts_without_url = [c for c in contacts if not has_home_page(c)]
    with_count = len(contacts_with_url)
    without_count = len(contacts_without_url)
    print(f"[INFO] Contacts with businessHomePage: {with_count}")
    print(f"[INFO] Contacts without businessHomePage: {without_count}")
    print()

    # Categorize every populated URL.
    junk_contacts = []
    legitimate_contacts = []
    suspicious_contacts = []
    linkedin_profiles = []
    facebook_profiles = []
    domain_counts = defaultdict(int)
    junk_by_pattern = defaultdict(list)
    # Any category other than these two lands in the suspicious list.
    buckets = {"junk": junk_contacts, "legitimate": legitimate_contacts}

    for contact in contacts_with_url:
        url = contact.get("businessHomePage", "").strip()
        category, detail = categorize_url(url)
        domain = extract_domain(url)
        domain_counts[domain] += 1
        entry = {
            "id": contact["id"],
            "displayName": contact.get("displayName", ""),
            "url": url,
            "companyName": contact.get("companyName", ""),
            "category": category,
            "detail": detail,
            "domain": domain,
        }
        buckets.get(category, suspicious_contacts).append(entry)
        if category != "junk":
            continue
        junk_by_pattern[detail].append(entry)
        # Cross-reference LinkedIn first, then Facebook.
        lowered = url.lower()
        if "linkedin.com" in lowered:
            linkedin_profiles.append(entry)
        elif "facebook.com" in lowered:
            facebook_profiles.append(entry)

    junk_count = len(junk_contacts)
    legitimate_count = len(legitimate_contacts)
    suspicious_count = len(suspicious_contacts)
    # Largest groups first; ties keep their first-seen order.
    ranked_patterns = sorted(
        junk_by_pattern.items(), key=lambda item: len(item[1]), reverse=True
    )
    ranked_domains = sorted(
        domain_counts.items(), key=lambda item: item[1], reverse=True
    )

    # Print report
    banner("RESULTS SUMMARY")
    print(f"Total contacts: {total}")
    print(f"Contacts with businessHomePage: {with_count}")
    print(f" - Junk (auto-inserted): {junk_count}")
    print(f" - Legitimate websites: {legitimate_count}")
    print(f" - Suspicious/broken: {suspicious_count}")
    print()

    # Junk URLs grouped by pattern (at most five examples each)
    banner("JUNK URLs BY PATTERN")
    for pattern, entries in ranked_patterns:
        print(f"\n Pattern: {pattern} ({len(entries)} contacts)")
        list_name_and_url(entries[:5])
        if len(entries) > 5:
            print(f" ... and {len(entries) - 5} more")

    # Suspicious URLs
    print()
    banner("SUSPICIOUS/BROKEN URLs")
    if not suspicious_contacts:
        print(" None found")
    for item in suspicious_contacts:
        print(f" - {item['displayName']}: \"{item['url']}\" (reason: {item['detail']})")

    # Legitimate URLs (first 30)
    print()
    banner("LEGITIMATE URLs (first 30)")
    for item in legitimate_contacts[:30]:
        if item['companyName']:
            company = f" [{item['companyName']}]"
        else:
            company = ""
        print(f" - {item['displayName']}{company}: {item['url']}")
    if legitimate_count > 30:
        print(f" ... and {legitimate_count - 30} more")

    # Domain distribution
    print()
    banner("DOMAIN DISTRIBUTION")
    for domain, count in ranked_domains:
        print(f" {domain}: {count}")

    # LinkedIn cross-reference
    print()
    banner(f"LINKEDIN PROFILES ({len(linkedin_profiles)})")
    list_name_and_url(linkedin_profiles)

    # Facebook cross-reference
    print()
    banner(f"FACEBOOK PROFILES ({len(facebook_profiles)})")
    list_name_and_url(facebook_profiles)

    # Save results
    summary = {
        "total_contacts": total,
        "contacts_with_url": with_count,
        "contacts_without_url": without_count,
        "junk_count": junk_count,
        "legitimate_count": legitimate_count,
        "suspicious_count": suspicious_count,
        "linkedin_count": len(linkedin_profiles),
        "facebook_count": len(facebook_profiles),
    }
    slim_junk_by_pattern = {
        pattern: [{"displayName": e["displayName"], "url": e["url"]} for e in entries]
        for pattern, entries in ranked_patterns
    }
    results = {
        "summary": summary,
        "junk_contacts": junk_contacts,
        "legitimate_contacts": legitimate_contacts,
        "suspicious_contacts": suspicious_contacts,
        "linkedin_profiles": linkedin_profiles,
        "facebook_profiles": facebook_profiles,
        "domain_distribution": dict(ranked_domains),
        "junk_by_pattern": slim_junk_by_pattern,
    }
    out_path = "D:/ClaudeTools/temp/bardach_url_analysis.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n[OK] Results saved to {out_path}")


if __name__ == "__main__":
    main()