"""Pull all Temp contacts and analyze internal duplicates.

The script authenticates against Microsoft Graph with the client-credentials
flow (via ``curl``), downloads every contact in the mailbox's ``Temp`` contact
folder, saves the raw list as JSON, and prints two reports:

* duplicates grouped by case-insensitive ``displayName``;
* how often known junk lines appear in ``personalNotes``.

All network and file I/O happens in ``main()``; the analysis helpers are pure
functions over the list of contact dicts.
"""
import json
import os
import subprocess
import sys
from collections import Counter, defaultdict
from typing import Optional
from urllib.parse import urlencode

TENANT_ID = "dd4a82e8-85a3-44ac-8800-07945ab4d95f"
CLAUDE_APP = "fabb3421-8b34-484b-bc17-e46de9703418"
# SECURITY(review): a client secret is committed in source. It should be
# rotated and supplied only through the environment; the literal fallback is
# kept solely so existing invocations keep working until that happens.
CLAUDE_SECRET = os.environ.get(
    "BARDACH_GRAPH_CLIENT_SECRET",
    "~QJ8Q~NyQSs4OcGqHZyPrA2CVnq9KBfKiimntbMO",
)
USER = "barbara@bardach.net"
SELECT = ("id,displayName,givenName,surname,emailAddresses,"
          "homePhones,businessPhones,companyName,jobTitle,"
          "personalNotes,homeAddress,businessAddress,lastModifiedDateTime")
OUTPUT_PATH = 'D:/ClaudeTools/temp/bardach_temp_all.json'


def curl_json(args, stdin_data=None):
    """Run ``curl`` with *args* and return the response body parsed as JSON.

    Args:
        args: curl arguments (everything after ``curl -sS``).
        stdin_data: optional text piped to curl's stdin (used with ``-d @-``).

    Raises:
        RuntimeError: curl exited non-zero or the body was not valid JSON.
    """
    proc = subprocess.run(
        ['curl', '-sS', *args],
        input=stdin_data,
        capture_output=True,
        # Decode explicitly as UTF-8: text=True alone would use the locale
        # encoding, which mangles or rejects non-ASCII contact data.
        encoding='utf-8',
    )
    if proc.returncode != 0:
        raise RuntimeError(
            f"curl exited with code {proc.returncode}: {proc.stderr.strip()}")
    try:
        return json.loads(proc.stdout)
    except json.JSONDecodeError as err:
        raise RuntimeError(
            f"Response was not valid JSON: {proc.stdout[:200]!r}") from err


def graph_get(token, url):
    """GET *url* with a bearer *token* and return the parsed JSON response."""
    return curl_json(['-H', f'Authorization: Bearer {token}', url])


def get_token():
    """Request an app-only access token and return it.

    Raises:
        RuntimeError: the response did not contain ``access_token``.
    """
    form = urlencode({
        'client_id': CLAUDE_APP,
        'client_secret': CLAUDE_SECRET,
        'scope': 'https://graph.microsoft.com/.default',
        'grant_type': 'client_credentials',
    })
    # The form is sent on stdin ("-d @-") so the secret is not part of the
    # curl command line.
    data = curl_json(
        ['-X', 'POST',
         f'https://login.microsoftonline.com/{TENANT_ID}/oauth2/v2.0/token',
         '-d', '@-'],
        stdin_data=form,
    )
    token = data.get('access_token')
    if not token:
        detail = data.get('error_description') or data.get('error') or data
        raise RuntimeError(f"Token request failed: {detail}")
    return token


def find_folder_id(token, display_name):
    """Return the id of the contact folder whose displayName is *display_name*.

    Follows ``@odata.nextLink`` so folders beyond the first page are found.

    Raises:
        RuntimeError: the API returned an error or no folder matched.
    """
    url = (f'https://graph.microsoft.com/v1.0/users/{USER}'
           f'/contactFolders?$select=displayName,id')
    while url:
        data = graph_get(token, url)
        if 'error' in data:
            raise RuntimeError(
                f"Listing contact folders failed: "
                f"{data['error'].get('message', '')[:200]}")
        for folder in data.get('value', []):
            if folder.get('displayName') == display_name:
                return folder['id']
        url = data.get('@odata.nextLink')
    raise RuntimeError(
        f"Contact folder {display_name!r} not found for {USER}")


def pull_contacts(token, folder_id):
    """Download every contact in *folder_id*, following pagination.

    Best-effort: on an API or transport error the error is printed and the
    contacts collected so far are returned.
    """
    url = (f"https://graph.microsoft.com/v1.0/users/{USER}"
           f"/contactFolders/{folder_id}/contacts?$top=100&$select={SELECT}")
    contacts = []
    page = 0
    while url:
        page += 1
        try:
            data = graph_get(token, url)
        except RuntimeError as err:
            print(f"Error page {page}: {str(err)[:200]}")
            break
        if 'error' in data:
            print(f"Error page {page}: {data['error'].get('message','')[:200]}")
            break
        items = data.get('value', [])
        contacts.extend(items)
        url = data.get('@odata.nextLink')
        if page % 20 == 0:
            print(f" Page {page}: {len(contacts)} contacts...")
        if not items:
            break
    return contacts


def save_contacts(contacts, path):
    """Write *contacts* to *path* as JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(contacts, f)


def group_by_name(contacts):
    """Group contacts by stripped, lower-cased ``displayName``.

    Returns:
        ``(name_groups, no_name)`` where ``name_groups`` maps each normalized
        name to its contacts and ``no_name`` lists contacts whose displayName
        is missing, null, or blank.
    """
    name_groups = defaultdict(list)
    no_name = []
    for contact in contacts:
        name = (contact.get('displayName') or '').strip().lower()
        if name:
            name_groups[name].append(contact)
        else:
            no_name.append(contact)
    return name_groups, no_name


def summarize_group(contacts):
    """Summarize one duplicate group.

    Returns:
        ``(emails, notes_count)``: the sorted, lower-cased distinct email
        addresses across *contacts*, and how many contacts have non-blank
        ``personalNotes``.
    """
    emails = set()
    notes_count = 0
    for contact in contacts:
        # "or []" also covers an explicit null value for the key.
        for entry in contact.get('emailAddresses') or []:
            if entry.get('address'):
                emails.add(entry['address'].lower())
        if (contact.get('personalNotes') or '').strip():
            notes_count += 1
    return sorted(emails), notes_count


def classify_note_line(line) -> Optional[str]:
    """Return the junk-pattern label matching *line*, or None.

    Patterns are checked in order; the first match wins.
    """
    line = line.strip()
    lowered = line.lower()
    if 'read-only' in lowered and 'outlook' in lowered:
        return 'read-only outlook warning'
    if 'tap the link' in lowered:
        return 'tap the link instruction'
    if 'edit in outlook' in lowered:
        return 'edit in outlook'
    if (line.startswith('20') and len(line) > 10
            and ('This contact' in line or 'read-only' in lowered)):
        return 'dated read-only warning'
    return None


def count_junk_patterns(all_notes):
    """Count junk-pattern occurrences per line across all notes strings."""
    patterns_found = Counter()
    for notes in all_notes:
        for line in notes.split('\n'):
            label = classify_note_line(line)
            if label is not None:
                patterns_found[label] += 1
    return patterns_found


def print_samples(all_notes, predicate, limit=5):
    """Print the first 300 chars of up to *limit* notes matching *predicate*."""
    shown = 0
    for notes in all_notes:
        if predicate(notes):
            print(" ---")
            print(f" {notes[:300]}")
            shown += 1
            if shown >= limit:
                break


def report_duplicates(all_contacts):
    """Print the duplicate-by-name analysis for *all_contacts*."""
    print(f"\n{'='*60}")
    print("DUPLICATE ANALYSIS BY NAME")
    print(f"{'='*60}")

    name_groups, no_name = group_by_name(all_contacts)
    unique_names = len(name_groups)
    dupe_names = {k: v for k, v in name_groups.items() if len(v) > 1}
    # Each group keeps one entry; the rest are removable.
    total_dupes = sum(len(v) - 1 for v in dupe_names.values())

    print(f"Total contacts: {len(all_contacts)}")
    print(f"Contacts with no name: {len(no_name)}")
    print(f"Unique names: {unique_names}")
    print(f"Names with duplicates: {len(dupe_names)}")
    print(f"Total duplicate entries (removable): {total_dupes}")
    print(f"Estimated after dedup: {unique_names + len(no_name)}")

    dupe_dist = Counter(len(v) for v in dupe_names.values())
    print("\nDuplicate distribution:")
    for count, num_names in sorted(dupe_dist.items()):
        print(f" {count}x duplicated: {num_names} names")

    sorted_dupes = sorted(dupe_names.items(), key=lambda x: -len(x[1]))
    print("\nTop 30 most duplicated:")
    for name, contacts in sorted_dupes[:30]:
        emails, notes_count = summarize_group(contacts)
        email_str = ', '.join(emails[:2]) if emails else '(no email)'
        print(f" {len(contacts)}x - {name} | {email_str} | {notes_count} have notes")


def report_notes(all_contacts):
    """Print junk-pattern counts and sample notes for *all_contacts*."""
    print(f"\n{'='*60}")
    print("NOTES CLEANUP PATTERNS")
    print(f"{'='*60}")

    all_notes = []
    for contact in all_contacts:
        notes = (contact.get('personalNotes') or '').strip()
        if notes:
            all_notes.append(notes)
    print(f"Contacts with notes: {len(all_notes)}")

    patterns_found = count_junk_patterns(all_notes)
    print("\nKnown junk patterns found:")
    for pattern, count in sorted(patterns_found.items(), key=lambda x: -x[1]):
        print(f" {pattern}: {count} occurrences")

    print("\nSample notes containing 'read-only' (first 5):")
    print_samples(all_notes, lambda n: 'read-only' in n.lower())

    # Notes without the junk marker are presumably real user data.
    print("\nSample notes WITHOUT 'read-only' junk (first 5):")
    print_samples(
        all_notes, lambda n: 'read-only' not in n.lower() and len(n) > 5)


def main():
    """Run the pull, save the raw data, and print both reports.

    Returns:
        Process exit code: 0 on success, 1 if authentication or the folder
        lookup failed.
    """
    try:
        token = get_token()
        print("[OK] Token acquired")
        temp_id = find_folder_id(token, 'Temp')
    except RuntimeError as err:
        print(f"[ERROR] {err}", file=sys.stderr)
        return 1

    print("Pulling Temp contacts...")
    all_contacts = pull_contacts(token, temp_id)
    print(f"\nTotal Temp contacts pulled: {len(all_contacts)}")

    save_contacts(all_contacts, OUTPUT_PATH)
    print("Saved to bardach_temp_all.json")

    report_duplicates(all_contacts)
    report_notes(all_contacts)
    return 0


if __name__ == "__main__":
    sys.exit(main())