sync: Auto-sync from ACG-M-L5090 at 2026-03-10 19:11:00
Synced files: - Quote wizard frontend (all components, hooks, types, config) - API updates (config, models, routers, schemas, services) - Client work (bg-builders, gurushow) - Scripts (BGB Lesley termination, CIPP, Datto, migration) - Temp files (Bardach contacts, VWP investigation, misc) - Credentials and session logs - Email service, PHP API, session logs Machine: ACG-M-L5090 Timestamp: 2026-03-10 19:11:00 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
349
temp/bardach_url_analysis.py
Normal file
349
temp/bardach_url_analysis.py
Normal file
@@ -0,0 +1,349 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Analyze website/URL fields for all Bardach contacts in Microsoft 365."""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
from collections import defaultdict
|
||||
|
||||
import os

# Azure AD app registration used for the OAuth2 client-credentials flow.
TENANT_ID = "dd4a82e8-85a3-44ac-8800-07945ab4d95f"
CLIENT_ID = "fabb3421-8b34-484b-bc17-e46de9703418"
# SECURITY: the client secret must not live in source control. It is read
# from the environment instead. The value that was previously committed on
# this line should be treated as compromised and rotated.
CLIENT_SECRET = os.environ.get("M365_CLIENT_SECRET", "")
SCOPE = "https://graph.microsoft.com/.default"
# Mailbox whose contacts are fetched and analyzed.
USER = "barbara@bardach.net"
TOKEN_URL = f"https://login.microsoftonline.com/{TENANT_ID}/oauth2/v2.0/token"
|
||||
|
||||
# Lower-case markers of URLs that should be classified as junk.
# Order matters: the first matching entry is the one reported.
JUNK_PATTERNS = [
    # Outlook deep links and LinkedIn profile/company pages
    "ms-outlook://", "linkedin.com/in/", "linkedin.com/company/",
    # Microsoft consumer / storage hosts
    "outlook.live.com", "profile.live.com", "people.live.com",
    "social.microsoft.com", "contact.skype.com",
    "d.docs.live.net", "storage.live.com", "onedrive.live.com", "1drv.ms",
    # Social networks and profile services
    "facebook.com", "twitter.com", "x.com", "plus.google.com",
    "instagram.com", "myspace.com", "flickr.com", "foursquare.com",
    "about.me", "gravatar.com",
    # Remaining Microsoft / Skype markers
    "apis.live.net", "cid-", "skype:",
]
|
||||
|
||||
# Exact host names of social/profile sites; a URL whose netloc is in this
# set is classified as junk.
SOCIAL_DOMAINS = {
    "linkedin.com",
    "www.linkedin.com",
    "facebook.com",
    "www.facebook.com",
    "m.facebook.com",
    "twitter.com",
    "www.twitter.com",
    "x.com",
    "www.x.com",
    "instagram.com",
    "www.instagram.com",
    "plus.google.com",
    "myspace.com",
    "www.myspace.com",
    "flickr.com",
    "www.flickr.com",
    "foursquare.com",
    "www.foursquare.com",
    "about.me",
    "gravatar.com",
    "www.gravatar.com",
}
|
||||
|
||||
# Exact host names of Microsoft-internal services; a URL whose netloc is in
# this set is classified as junk.
MS_DOMAINS = {
    "outlook.live.com",
    "profile.live.com",
    "people.live.com",
    "social.microsoft.com",
    "contact.skype.com",
    "d.docs.live.net",
    "storage.live.com",
    "onedrive.live.com",
    "1drv.ms",
    "apis.live.net",
}
|
||||
|
||||
|
||||
def get_token():
    """Acquire a Microsoft Graph access token via the client-credentials flow.

    POSTs the app credentials to ``TOKEN_URL`` with ``curl`` and returns the
    ``access_token`` field of the JSON response.

    Returns:
        The bearer token string.

    Exits the process with status 1, after printing an ``[ERROR]`` line, when
    curl fails, the response is not JSON, or it carries no ``access_token``.
    """
    # Local import: the module only imports ``urlparse`` from urllib.parse.
    from urllib.parse import urlencode

    # URL-encode every value so reserved characters in the secret or scope
    # cannot corrupt the form body.
    form_body = urlencode({
        "client_id": CLIENT_ID,
        "scope": SCOPE,
        "client_secret": CLIENT_SECRET,
        "grant_type": "client_credentials",
    })

    # "-d @-" makes curl read the body from stdin, so the client secret does
    # not appear in the process argument list.
    result = subprocess.run(
        ["curl", "-s", "-X", "POST", TOKEN_URL,
         "-H", "Content-Type: application/x-www-form-urlencoded",
         "-d", "@-"],
        input=form_body, capture_output=True, text=True,
    )

    if result.returncode != 0:
        print(f"[ERROR] Token acquisition failed: curl exited with status {result.returncode}")
        sys.exit(1)

    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError:
        print(f"[ERROR] Token acquisition failed: non-JSON response: {result.stdout[:300]}")
        sys.exit(1)

    if not isinstance(data, dict) or "access_token" not in data:
        print(f"[ERROR] Token acquisition failed: {data}")
        sys.exit(1)
    return data["access_token"]
|
||||
|
||||
|
||||
def fetch_all_contacts(token):
    """Fetch all contacts of ``USER`` from Microsoft Graph, following pagination.

    Args:
        token: Bearer token used for the first requests.

    Returns:
        A ``(contacts, token)`` tuple: the list of contact dicts collected so
        far and the token in use at the end (it is re-acquired every 500
        requests, so it may differ from the one passed in).

    On a curl failure, a non-JSON body, or a response without a ``value``
    key, an ``[ERROR]`` line is printed and the contacts gathered up to that
    point are returned.
    """
    contacts = []
    select = "id,displayName,businessHomePage,emailAddresses,personalNotes,companyName"
    url = f"https://graph.microsoft.com/v1.0/users/{USER}/contacts?$select={select}&$top=100"
    call_count = 0

    while url:
        call_count += 1
        # Re-acquire token every 500 calls
        if call_count > 1 and (call_count - 1) % 500 == 0:
            print(f"[INFO] Re-acquiring token at call {call_count}...")
            token = get_token()
            print("[OK] Token re-acquired")

        result = subprocess.run(
            ["curl", "-s", "-X", "GET", url,
             "-H", f"Authorization: Bearer {token}",
             "-H", "Content-Type: application/json"],
            capture_output=True, text=True,
        )

        # A transport failure or an empty/HTML body must not raise and throw
        # away the pages already collected; treat it like any other
        # unexpected response.
        if result.returncode != 0:
            print(f"[ERROR] curl exited with status {result.returncode}")
            break
        try:
            data = json.loads(result.stdout)
        except json.JSONDecodeError:
            print(f"[ERROR] Unexpected response: {result.stdout[:300]}")
            break

        if not isinstance(data, dict) or "value" not in data:
            print(f"[ERROR] Unexpected response: {json.dumps(data)[:300]}")
            break

        batch = data["value"]
        contacts.extend(batch)

        # Progress throttle (same truth value as the original nested ifs);
        # presumably intended to report about once per 500 contacts.
        fetched = len(contacts)
        boundary_hit = fetched % 500 == 0 or fetched % 100 < len(batch)
        if boundary_hit and fetched % 500 < 100:
            print(f"[INFO] Fetched {fetched} contacts so far...")

        url = data.get("@odata.nextLink")
        if url:
            # Brief pause between pages.
            time.sleep(0.05)

    return contacts, token
|
||||
|
||||
|
||||
def categorize_url(url_str, *, junk_patterns=None, social_domains=None, ms_domains=None):
    """Categorize a URL as junk, legitimate, or suspicious.

    Args:
        url_str: Raw URL text (may be empty or ``None``).
        junk_patterns: Optional ordered iterable of lower-case junk markers;
            defaults to the module-level ``JUNK_PATTERNS``.
        social_domains: Optional collection of exact junk host names;
            defaults to ``SOCIAL_DOMAINS``.
        ms_domains: Optional collection of exact junk host names; defaults
            to ``MS_DOMAINS``.

    Returns:
        A ``(category, detail)`` tuple. ``category`` is ``"junk"``,
        ``"legitimate"`` or ``"suspicious"``. ``detail`` is the matched
        pattern or domain, or a reason code (``"empty"``, ``"too_short"``,
        ``"no_valid_domain"``, ``"parse_error"``).

    Junk patterns that look like a bare domain (contain ``.`` but no ``/``
    or ``:``) are matched against the URL's host on a label boundary, so
    ``x.com`` matches ``x.com`` and ``www.x.com`` but not ``fedex.com``.
    All other patterns are matched as substrings of the whole URL.
    """
    if not url_str or not url_str.strip():
        return "suspicious", "empty"

    url_lower = url_str.lower().strip()

    # Check for obviously malformed
    if len(url_lower) < 4:
        return "suspicious", "too_short"

    if junk_patterns is None:
        junk_patterns = JUNK_PATTERNS
    if social_domains is None:
        social_domains = SOCIAL_DOMAINS
    if ms_domains is None:
        ms_domains = MS_DOMAINS

    # Add "https://" only when there is no real "<scheme>://" prefix.
    # A plain startswith("http") test misreads hosts such as "httpbin.org".
    scheme, sep, _ = url_lower.partition("://")
    has_scheme = (
        bool(sep)
        and scheme[:1].isalpha()
        and all(ch.isalnum() or ch in "+-." for ch in scheme)
    )
    parse_url = url_lower if has_scheme else "https://" + url_lower

    try:
        parsed = urlparse(parse_url)
        domain = parsed.netloc.lower()
        host = parsed.hostname or ""
    except ValueError:
        # urlparse rejects some inputs, e.g. unbalanced IPv6 brackets.
        parsed = None
        domain = ""
        host = ""

    # Check junk patterns, in order; the first match is reported.
    for pattern in junk_patterns:
        looks_like_domain = "." in pattern and "/" not in pattern and ":" not in pattern
        if looks_like_domain:
            matched = host == pattern or host.endswith("." + pattern)
        else:
            matched = pattern in url_lower
        if matched:
            return "junk", pattern

    if parsed is None:
        return "suspicious", "parse_error"

    # Check social/MS domains
    if domain in social_domains or domain in ms_domains:
        return "junk", domain

    # Check for no real domain
    if not host or "." not in host:
        return "suspicious", "no_valid_domain"

    return "legitimate", domain
|
||||
|
||||
|
||||
def extract_domain(url_str):
    """Extract the network location (host, plus port if present) from a URL.

    Args:
        url_str: Raw URL text; a missing scheme is tolerated.

    Returns:
        The lower-cased netloc, or ``"unknown"`` when the input is empty,
        has no netloc, or cannot be parsed.
    """
    if not url_str:
        return "unknown"

    url_lower = url_str.lower().strip()

    # Prepend "https://" only when there is no real "<scheme>://" prefix.
    # A plain startswith("http") test misreads hosts such as "httpbin.org",
    # and would also prepend to non-HTTP schemes such as "ftp://".
    scheme, sep, _ = url_lower.partition("://")
    has_scheme = (
        bool(sep)
        and scheme[:1].isalpha()
        and all(ch.isalnum() or ch in "+-." for ch in scheme)
    )
    if not has_scheme:
        url_lower = "https://" + url_lower

    try:
        return urlparse(url_lower).netloc or "unknown"
    except ValueError:
        # urlparse rejects some inputs, e.g. unbalanced IPv6 brackets.
        return "unknown"
|
||||
|
||||
|
||||
def main():
    """Fetch all contacts, classify each businessHomePage URL, and report.

    Prints a sectioned report to stdout and writes the same data as JSON to
    a fixed output path. Takes no arguments and returns nothing.
    """
    rule = "=" * 70

    def banner(title):
        # Section header: the title framed by two 70-character rules.
        print(rule)
        print(title)
        print(rule)

    def has_homepage(contact):
        # True when businessHomePage is present and not just whitespace.
        page = contact.get("businessHomePage")
        return bool(page and page.strip())

    banner("BARDACH CONTACTS - WEBSITE/URL FIELD ANALYSIS")
    print()

    token = get_token()
    print("[OK] Token acquired")
    print("[INFO] Fetching all contacts...")

    contacts, token = fetch_all_contacts(token)
    total = len(contacts)
    print(f"[OK] Fetched {total} total contacts")
    print()

    # Split contacts by whether they carry a businessHomePage value.
    contacts_with_url = [c for c in contacts if has_homepage(c)]
    contacts_without_url = [c for c in contacts if not has_homepage(c)]

    print(f"[INFO] Contacts with businessHomePage: {len(contacts_with_url)}")
    print(f"[INFO] Contacts without businessHomePage: {len(contacts_without_url)}")
    print()

    # Classification buckets.
    junk_contacts = []
    legitimate_contacts = []
    suspicious_contacts = []
    linkedin_profiles = []
    facebook_profiles = []

    domain_counts = defaultdict(int)
    junk_by_pattern = defaultdict(list)

    # Any category other than these two lands in the suspicious bucket.
    buckets = {"junk": junk_contacts, "legitimate": legitimate_contacts}

    for contact in contacts_with_url:
        url = contact.get("businessHomePage", "").strip()
        category, detail = categorize_url(url)
        domain = extract_domain(url)
        domain_counts[domain] += 1

        entry = {
            "id": contact["id"],
            "displayName": contact.get("displayName", ""),
            "url": url,
            "companyName": contact.get("companyName", ""),
            "category": category,
            "detail": detail,
            "domain": domain,
        }

        buckets.get(category, suspicious_contacts).append(entry)
        if category != "junk":
            continue

        junk_by_pattern[detail].append(entry)

        # Cross-reference LinkedIn / Facebook (junk entries only).
        lowered = url.lower()
        if "linkedin.com" in lowered:
            linkedin_profiles.append(entry)
        elif "facebook.com" in lowered:
            facebook_profiles.append(entry)

    # Largest groups first; ties keep insertion order.
    patterns_by_size = sorted(junk_by_pattern.items(), key=lambda kv: -len(kv[1]))
    domains_by_count = sorted(domain_counts.items(), key=lambda kv: -kv[1])

    # Print report
    banner("RESULTS SUMMARY")
    print(f"Total contacts: {total}")
    print(f"Contacts with businessHomePage: {len(contacts_with_url)}")
    print(f" - Junk (auto-inserted): {len(junk_contacts)}")
    print(f" - Legitimate websites: {len(legitimate_contacts)}")
    print(f" - Suspicious/broken: {len(suspicious_contacts)}")
    print()

    # Junk URLs grouped by pattern
    banner("JUNK URLs BY PATTERN")
    for pattern, entries in patterns_by_size:
        print(f"\n Pattern: {pattern} ({len(entries)} contacts)")
        for item in entries[:5]:
            print(f" - {item['displayName']}: {item['url']}")
        if len(entries) > 5:
            print(f" ... and {len(entries) - 5} more")

    # Suspicious URLs
    print()
    banner("SUSPICIOUS/BROKEN URLs")
    if not suspicious_contacts:
        print(" None found")
    else:
        for item in suspicious_contacts:
            print(f" - {item['displayName']}: \"{item['url']}\" (reason: {item['detail']})")

    # Legitimate URLs (first 30)
    print()
    banner("LEGITIMATE URLs (first 30)")
    for item in legitimate_contacts[:30]:
        if item['companyName']:
            company = f" [{item['companyName']}]"
        else:
            company = ""
        print(f" - {item['displayName']}{company}: {item['url']}")
    if len(legitimate_contacts) > 30:
        print(f" ... and {len(legitimate_contacts) - 30} more")

    # Domain distribution
    print()
    banner("DOMAIN DISTRIBUTION")
    for domain, count in domains_by_count:
        print(f" {domain}: {count}")

    # LinkedIn cross-reference
    print()
    banner(f"LINKEDIN PROFILES ({len(linkedin_profiles)})")
    for item in linkedin_profiles:
        print(f" - {item['displayName']}: {item['url']}")

    # Facebook cross-reference
    print()
    banner(f"FACEBOOK PROFILES ({len(facebook_profiles)})")
    for item in facebook_profiles:
        print(f" - {item['displayName']}: {item['url']}")

    # Save results
    summary = {
        "total_contacts": total,
        "contacts_with_url": len(contacts_with_url),
        "contacts_without_url": len(contacts_without_url),
        "junk_count": len(junk_contacts),
        "legitimate_count": len(legitimate_contacts),
        "suspicious_count": len(suspicious_contacts),
        "linkedin_count": len(linkedin_profiles),
        "facebook_count": len(facebook_profiles),
    }
    junk_pattern_summary = {}
    for pattern, entries in patterns_by_size:
        junk_pattern_summary[pattern] = [
            {"displayName": item["displayName"], "url": item["url"]}
            for item in entries
        ]
    results = {
        "summary": summary,
        "junk_contacts": junk_contacts,
        "legitimate_contacts": legitimate_contacts,
        "suspicious_contacts": suspicious_contacts,
        "linkedin_profiles": linkedin_profiles,
        "facebook_profiles": facebook_profiles,
        "domain_distribution": dict(domains_by_count),
        "junk_by_pattern": junk_pattern_summary,
    }

    out_path = "D:/ClaudeTools/temp/bardach_url_analysis.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n[OK] Results saved to {out_path}")
|
||||
|
||||
|
||||
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user