From f7045ecda1af1f8f02fd9b631f82f220aa01bbe5 Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Sat, 13 Jun 2026 20:21:37 -0700 Subject: [PATCH] sync: auto-sync from GURU-5070 at 2026-06-13 20:21:10 Author: Mike Swanson Machine: GURU-5070 Timestamp: 2026-06-13 20:21:10 --- .tmp-xen-token.py | 18 ++ .../peaceful-spirit/AD-DC2-REBUILD-RUNBOOK.md | 72 ++++++++ ...ke-pst-server2-dc-rebuild-and-g-cleanup.md | 171 ++++++++++++++++++ 3 files changed, 261 insertions(+) create mode 100644 .tmp-xen-token.py create mode 100644 clients/peaceful-spirit/AD-DC2-REBUILD-RUNBOOK.md create mode 100644 clients/peaceful-spirit/session-logs/2026-06/2026-06-13-mike-pst-server2-dc-rebuild-and-g-cleanup.md diff --git a/.tmp-xen-token.py b/.tmp-xen-token.py new file mode 100644 index 00000000..a705cd92 --- /dev/null +++ b/.tmp-xen-token.py @@ -0,0 +1,18 @@ +import os, paramiko +host="192.168.0.104"; user="root"; pw=os.environ["XEN_PW"] +c=paramiko.SSHClient(); c.set_missing_host_key_policy(paramiko.AutoAddPolicy()) +c.connect(host, username=user, password=pw, timeout=20, + disabled_algorithms={'pubkeys': ['rsa-sha2-256','rsa-sha2-512']}, + look_for_keys=False, allow_agent=False) +def run(cmd): + i,o,e=c.exec_command(cmd,timeout=60); return (o.read().decode(errors="replace")+e.read().decode(errors="replace")).strip() +# Generate a temporary XenAPI session ref for the HTTP export (avoids putting root pw in the RMM command log) +pyc = ( +"import XenAPI;" +"s=XenAPI.Session('https://localhost');" +"s.login_with_password('root', __import__('os').environ['XPW'], '1.0', 'g-migration');" +"print(s._session)" +) +out=run(f"XPW='{pw}' python -c \"{pyc}\"") +print("SESSION_REF:", out) +c.close() diff --git a/clients/peaceful-spirit/AD-DC2-REBUILD-RUNBOOK.md b/clients/peaceful-spirit/AD-DC2-REBUILD-RUNBOOK.md new file mode 100644 index 00000000..d04944ed --- /dev/null +++ b/clients/peaceful-spirit/AD-DC2-REBUILD-RUNBOOK.md @@ -0,0 +1,72 @@ +# Peaceful Spirit — PST-SERVER2 evict + re-promote runbook + 
+**Created:** 2026-06-13 by Mike Swanson (GURU-5070) +**Why:** PST-SERVER2 is a past-tombstone-lifetime DC. AD replication dead both directions +(err 8614 "exceeded tombstone lifetime"; err 0x8009030C broken secure channel). SYSVOL + +data DFS-R in State 5 (InError), stale 200-224 days. A past-TSL DC must NOT be allowed to +resume replication (lingering-object reanimation risk). So: evict SERVER2, metadata-clean, +re-promote fresh. + +**Authoritative/healthy DC:** PST-SERVER (192.168.0.2) — holds ALL 5 FSMO. Server 2016 +Essentials. Domain PEACEFULSPIRIT.local (Win2016 functional level). +**DC to rebuild:** PST-SERVER2 (192.168.1.127, NW site) — Server 2019 Standard, additional DC only. + +**Execution channel:** GuruRMM (SYSTEM context). PST-SERVER `87293069-33b6-45e8-a68f-6811216cdb96`, +PST-SERVER2 `5d2d7ba0-3903-4aa3-9e97-6ca4424ffe65`. Domain admin = `sysadmin` (vault: +clients/peaceful-spirit/server.sops.yaml). NOTE: promotion needs Domain Admin creds passed in +the RMM command — that password lands in RMM command_text/history (internal). Consider rotation +after if RMM DB exposure is a concern. + +--- + +## Gates (confirm with Mike before each) + +### Gate 0 — Pre-flight + safety backup (SAFE: read-only + backup) +- Confirm PST-SERVER is a Global Catalog. (If SERVER2 were the only GC, must GC-flag SERVER first.) +- Confirm all 5 FSMO on PST-SERVER (done: yes). +- dcdiag focused (Advertising/FSMOCheck/Services) on PST-SERVER — must be clean. +- Enable Strict Replication Consistency on PST-SERVER (protective; reg key) — *change, but safe/recommended*. +- BACK UP authoritative SYSVOL: robocopy `C:\Windows\SYSVOL\domain\Policies` -> `C:\PST-Backup\SYSVOL-Policies` + and `Backup-GPO -All`. Insurance before any AD change. 
+ +### Gate 1 — Force-demote PST-SERVER2 (DESTRUCTIVE to SERVER2; reboots SERVER2) +- On SERVER2: `Uninstall-ADDSDomainController -ForceRemoval -DemoteOperationMasterRole -Force + -LocalAdministratorPassword <local-admin-pw>` (graceful demote impossible — replication dead). NOTE: the removal is blocked by prerequisite test General.21 while SERVER2 is still a DNS server + GC; if that happens, clear the GC flag (nTDSDSA options=0) and uninstall the DNS Server role (reboot) first, then force-remove. +- SERVER2 becomes a member/standalone server and reboots. Blast radius = SERVER2 only. +- Risk: AD changes made ONLY on SERVER2 during isolation are lost (already stranded; PDC authoritative). + +### Gate 2 — Metadata cleanup on PST-SERVER (DESTRUCTIVE to AD metadata for SERVER2) +- Remove SERVER2 NTDS Settings / server object (ntdsutil metadata cleanup, or Remove-ADObject of the + NTDS Settings object with -Credential domain admin). +- Remove SERVER2 from AD Sites & Services (NW site server object). +- DNS cleanup: SERVER2 host A, _msdcs CNAME/GUID, NS records, SRV records. +- DFSR cleanup: remove SERVER2 member from "Domain System Volume" (SYSVOL) and "PST-DFS" groups. +- Verify: `repadmin /viewlist *` shows only PST-SERVER; dcdiag clean. + +### Gate 3 — Re-promote PST-SERVER2 (re-introduces a DC) +- Ensure SERVER2 DNS points to PST-SERVER (192.168.0.2) primary. (Currently 192.168.0.2,192.168.1.5,8.8.8.8,1.1.1.1.) +- `Install-ADDSDomainController -DomainName PEACEFULSPIRIT.local -Credential <domain-admin-cred> -InstallDns + -SiteName NW -SafeModeAdministratorPassword <dsrm-pw>` — fresh promotion. +- SYSVOL initializes clean via DFSR initial sync from PST-SERVER (no D2/D4 needed). +- Verify: repadmin /replsummary 0% fails; SYSVOL+NETLOGON shared on SERVER2; dcdiag clean; + GPO count matches SERVER (11). + +### Gate 4 — Rebuild data DFS-R (deferred — separate decision) +- Provision SERVER2 data volume (shrink C: / add disk / folder-on-C: — TBD after G: cleanup + sizing). +- Recreate `Shares` folder target on SERVER2 + re-establish PST-DFS replication. +- Add PST-SERVER2 as 2nd namespace ROOT target (namespace HA for VPN-outage resilience). +- Confirm backlog drains to 0. 
+ +### Gate 5 — G: cleanup on PST-SERVER (separate) +- ~160 GB candidates: G:\Windows (32), G:\Program Files (x86) (13), G:\ProgramData (10), + G:\Users (51), G:\$Recycle.Bin (5.6), VSS in System Volume Information (~46). Confirm junk first. +- D: recovery junk (~700 GB): Recovery-EXT, Recovery2019, "Unknown folder" — confirm before delete. + +--- + +## Rollback notes +- Gate 0 changes (strict consistency reg) are trivially reversible. +- After Gate 1 demotion, SERVER2 is a plain member server — re-promotion (Gate 3) restores it. + No rollback needed for the eviction itself; the domain runs fine on PST-SERVER alone meanwhile. +- The SYSVOL/GPO backup from Gate 0 is the restore point if PST-SERVER's SYSVOL were ever harmed + (it should not be touched by this procedure). diff --git a/clients/peaceful-spirit/session-logs/2026-06/2026-06-13-mike-pst-server2-dc-rebuild-and-g-cleanup.md b/clients/peaceful-spirit/session-logs/2026-06/2026-06-13-mike-pst-server2-dc-rebuild-and-g-cleanup.md new file mode 100644 index 00000000..b6deeb66 --- /dev/null +++ b/clients/peaceful-spirit/session-logs/2026-06/2026-06-13-mike-pst-server2-dc-rebuild-and-g-cleanup.md @@ -0,0 +1,171 @@ +## User +- **User:** Mike Swanson (mike) +- **Machine:** GURU-5070 +- **Role:** admin + +## Session Summary + +Resumed the Peaceful Spirit multi-site AD+DFS work from the 2026-06-11 plan. Read-only recon +(via GuruRMM) revealed the environment was far past the plan baseline AND broken: PST-SERVER2 +had already been promoted as a second DC with DFS roles ~200+ days ago, then the site +disconnected for ~7 months. PST-SERVER2 was a **past-tombstone-lifetime DC** — AD replication +dead both directions (err 8614 "exceeded tombstone lifetime", err 0x8009030C broken secure +channel), SYSVOL DFSR stale 224 days (Event 4012), data DFS-R (`PST-DFS`/`Shares`) stale 200 +days, and its DFS-R data target drive (a dynamic disk holding `D:\Shares`) was physically +**missing** (`diskpart` showed "Disk M0 Missing"). 
A past-TSL DC must never resume replication +(lingering-object reanimation risk), so the plan shifted from "fix SYSVOL" to **evict + rebuild +SERVER2**. + +Executed a gated runbook (`clients/peaceful-spirit/AD-DC2-REBUILD-RUNBOOK.md`): Gate 0 pre-flight ++ SYSVOL/GPO backup; Gate 1 force-demote PST-SERVER2; Gate 2 metadata cleanup on PST-SERVER; an +authoritative **D4 SYSVOL restore** on PST-SERVER; Gate 3 re-join + re-promote PST-SERVER2 as a +fresh DC, then a **D2 non-authoritative SYSVOL sync** to fix its stale leftover DFSR database. +End state: **two healthy DCs**, AD replication 0 errors both directions, SYSVOL DFSR State 4 +Normal on both (11 GPOs each), both GC, both advertising as logon servers. The 224-day-broken AD +is fully repaired and NW now has a local DC surviving a VPN outage. + +Then did Gate 5 — **G: cleanup on PST-SERVER**. Investigation showed G: is the OLD server's +former C: drive (live OS runs from C:; everything OS-related on G: dated 2009-2018, profiles named +`*.NEWSERVER`). Deleted the entire dead old-OS install (~99 GB: Windows, Users, Program Files*, +ProgramData, $Recycle.Bin, 13 misc folders) using takeown + robocopy-mirror for +TrustedInstaller-protected files, and bounded G: VSS shadow storage from UNBOUNDED to 25 GB +(trimmed 41 GB -> 9.6 GB). G: free went **51 GB -> 182 GB** (~131 GB reclaimed). Live data +(`G:\Shares`, `ServerFolders`, `System Volume Information`) untouched. + +Gate 4 (data DFS-R rebuild + SERVER2 data-volume decision) and several follow-ups remain. + +## Key Decisions + +- **Evict + rebuild SERVER2 (not repair).** Past tombstone lifetime -> resuming replication risks + lingering-object reanimation. Force-demote -> metadata cleanup -> clean re-promote sidesteps the + whole stale-AD/SYSVOL mess. Safe because PST-SERVER holds ALL 5 FSMO + is GC; SERVER2 was only + an additional DC. 
+- **D4 authoritative on PST-SERVER, D2 non-authoritative on SERVER2.** PST-SERVER's SYSVOL was + State 5 (InError) from the old 4012; D4 made it the authoritative source. After re-promotion, + SERVER2's SYSVOL hit 4012 again from a **leftover DFSR database** from its prior DC life -> D2 + forced it to discard the stale DB and re-sync fresh. +- **Forced-removal method.** `Uninstall-ADDSDomainController -ForceRemoval` is blocked by a + non-skippable test (General.21) while the DC is a DNS server + GC. Fix: clear the GC flag + (nTDSDSA options=0) and uninstall the DNS role (reboot), THEN force-remove. `-SkipPreChecks` / + `-Force` / `-Confirm:$false` do NOT bypass General.21. +- **AD writes need DA creds via FQDN.** Agent SYSTEM context can't do metadata cleanup. Use + `-Credential sysadmin -Server PST-SERVER.PEACEFULSPIRIT.local` (FQDN, valid SPN). `-Server + localhost` fails with a Kerberos SPN error; WinRM loopback with creds also failed. +- **G: cleanup confirmed as old-OS drive.** Live OS on C:; G: = old server's C: (2009-2018 dates, + `*.NEWSERVER` profiles). Deleted whole old OS; kept Shares/ServerFolders/SVI. Bounded VSS rather + than clearing (keep recent Previous-Versions history). + +## Problems Encountered + +- **`get-field` dot-notation bug:** `vault.sh get-field clients/peaceful-spirit/server credentials.password` + returned the literal string **"null"** -> every DA attempt sent password "null" -> auth rejected + (~6 bad attempts; harmless, domain LockoutThreshold=0). The real password (value kept in the vault only, not recorded here) was read + from the full `vault.sh get`. **Use full `get` (or verify the field path) for nested creds.** +- **PowerShell-over-RMM quoting traps (recurring):** scripts are wrapped in bash single quotes, so + (a) can't use single quotes inside (property names like `$obj."prop"` must use double quotes), + (b) `\"` over-escaping breaks `-join`, (c) `$g:` parsed as a drive ref -> use `${g}`. Several + commands failed to parse (no-ops) until fixed. 
+- **`Uninstall-ADDSDomainController` parameter-set / General.21** — see Key Decisions. +- **AV grabbed mingw curl** ("curl: Permission denied") intermittently; Mike disabled AV. +- **SERVER2 slow reboots** — DNS-role removal + DC promotion finalize add several minutes to boot; + RMM `last_seen` gave false "online" before the real reboot took hold (verify actual state, not + just the timestamp). +- **dcdiag SysVolCheck/NetLogons failed** right after SYSVOL init -> fixed by `Restart-Service + Netlogon` (re-advertise). +- **Stubborn OS files** survived `rd /s /q` even after takeown -> robocopy-mirror-from-empty + + `rd` cleared them. +- **Free-space readout bug:** WMI filter `DeviceID=''G:''` (doubled quotes) returned 0; used + `(Get-PSDrive G).Free` / `Get-Volume` instead. + +## Configuration Changes + +AD / DC (domain PEACEFULSPIRIT.local, via GuruRMM): +- PST-SERVER: SYSVOL backup -> `C:\PST-Backup\SYSVOL-Policies-20260613-1611`; GPO backup -> + `C:\PST-Backup\GPO-20260613-1611` (11 GPOs). Strict Replication Consistency already =1. +- PST-SERVER: **D4 authoritative SYSVOL restore** (msDFSR-Enabled FALSE+options=1, then TRUE) -> + Event 4602, SYSVOL State 4. +- PST-SERVER: metadata cleanup of PST-SERVER2 (server object + NTDS Settings + computer account + removed; DNS A/NS/CNAME/7xSRV removed in both zones). +- PST-SERVER2: cleared GC flag (nTDSDSA options 1->0); uninstalled DNS Server role; force-removed + AD DS (became WORKGROUP member); re-joined PEACEFULSPIRIT.local; **re-promoted** as DC (site NW, + InstallDns, GC); **D2 non-authoritative SYSVOL sync** (Event 4604) -> State 4, 11 GPOs, SYSVOL+ + NETLOGON shared; Netlogon restarted (dcdiag green); DNS client set to 192.168.0.2. +- PST-SERVER: **G: cleanup** — deleted old-OS dirs (Windows, Users, Program Files, Program Files + (x86), ProgramData, $Recycle.Bin, MSOCache, v12, v13, RAPID, 8x8, Brother, Intel, inetpub, + Support, BackupTemp, Documents and Settings, $WINDOWS.~BT). 
VSS shadow storage for G: resized + to maxsize=25GB. G: free 51 -> 182 GB. + +Repo / vault: +- Created `clients/peaceful-spirit/AD-DC2-REBUILD-RUNBOOK.md`. +- Vault: created `clients/peaceful-spirit/server2.sops.yaml` with `local_admin_password` and + `dsrm_password` (see below). **Pending publish** via sync. + +## Credentials & Secrets + +- **PST domain admin:** `PEACEFULSPIRIT\sysadmin` / password stored in the vault only, not pasted here per policy (vault + `clients/peaceful-spirit/server`, field `credentials.password`). NOTE the get-field dot bug above. +- **PST-SERVER2 new local admin + DSRM:** generated this session, stored in vault + `clients/peaceful-spirit/server2` (`local_admin_password`, `dsrm_password`). Values are in the + vault only (not pasted here per policy); entry is **not yet pushed** to Gitea. +- **[WARNING] DA password exposure:** the sysadmin password was passed (base64-wrapped) in RMM + command_text for the cleanup/D4/D2/promote/join commands, so it is recoverable from the GuruRMM + command history/DB. RMM is internal; consider rotating `sysadmin` if RMM DB exposure is a concern. + +## Infrastructure & Servers + +- Domain **PEACEFULSPIRIT.local** (NetBIOS name presumed PEACEFULSPIRIT, unverified; logons use the `PEACEFULSPIRIT\` prefix). Win2016 + domain/forest functional level. LockoutThreshold=0 (lockout disabled). +- **PST-SERVER** — 192.168.0.2, site **CC**, holds all 5 FSMO, GC, DNS, Server 2016 Essentials. + GuruRMM `87293069-33b6-45e8-a68f-6811216cdb96` (v0.6.66). G: 465.7 GB (182 GB free post-cleanup); + C: 931 GB; D: 931 GB (Recovery-EXT/Recovery2019/"Unknown folder" ~700 GB of old backup junk). +- **PST-SERVER2** — 192.168.1.127 (DHCP), site **NW**, GC, DNS, Server 2019 Standard. GuruRMM + `5d2d7ba0-3903-4aa3-9e97-6ca4424ffe65` (v0.6.66). Single 1 TB NVMe (Crucial P2), C: only (no + D:); old dynamic data disk is MISSING. +- AD Sites & Services (pre-existing, correct): sites CC + NW; subnets 192.168.0.0/24 -> CC, + 192.168.1.0/24 -> NW. S2S VPN between sites is UP (389/445/135/88 reachable SERVER2->SERVER). 
+- DFS namespace `\\PEACEFULSPIRIT.local\PST-Files` -> folder `Shares` (root target only on + PST-SERVER). DFS-R group `PST-DFS` replicated `Shares` (G:\Shares <-> D:\Shares) — currently + broken/stale, to be rebuilt at Gate 4. +- Business data on PST-SERVER `G:\Shares`: Private ~154 GB, Scanned ~105 GB, ITServices ~5 GB + (~265 GB). A service `SshWiaRestart` runs from `G:\Shares\ITServices\...` (Fujitsu scanner). + +## Commands & Outputs + +- Tombstone evidence: `repadmin /replsummary` -> 8614 / 0x8009030C, 100% fails (pre-rebuild). +- Force-demote (after clearing GC + DNS role): `Uninstall-ADDSDomainController -ForceRemoval + -LocalAdministratorPassword <local-admin-pw> -NoRebootOnCompletion -Force` -> Status=Success. +- Metadata cleanup: `Remove-ADObject -Recursive -Server PST-SERVER.PEACEFULSPIRIT.local -Credential + $cred` on `CN=PST-SERVER2,CN=Servers,CN=NW,CN=Sites,...` + computer object; DNS via + `Remove-DnsServerResourceRecord`. +- D4/D2: `Set-ADObject -Replace @{ "msDFSR-Enabled"=$false/$true; + "msDFSR-options"=1 (D4 only) }` + `net stop/start dfsr` + `dfsrdiag pollad`. Events 4114 -> 4602 + (D4) / 4614 -> 4604 (D2). +- Re-promote: `Install-ADDSDomainController -DomainName PEACEFULSPIRIT.local -Credential $cred + -SafeModeAdministratorPassword $dsrm -SiteName NW -InstallDns -Force -NoRebootOnCompletion`. +- Final health: `repadmin /replsummary` 0/5 fails both DCs; dcdiag PST-SERVER2 passed + Advertising/SysVolCheck/NetLogons after `Restart-Service Netlogon`. +- G: cleanup: takeown /f <path> /r /d y; icacls grant; robocopy /MIR; rd /s /q. + `vssadmin resize shadowstorage /for=G: /on=G: /maxsize=25GB`. + +## Pending / Incomplete Tasks + +- **Gate 4 — data DFS-R rebuild:** decide PST-SERVER2 data volume (shrink C: / add disk / + folder-on-C:), recreate `Shares` folder target on SERVER2, re-establish PST-DFS replication, + and ADD PST-SERVER2 as a 2nd **namespace root target** (namespace HA for VPN-outage). Clean the + orphaned PST-DFS member left from metadata cleanup. 
+- **PST-SERVER2 static IP** — currently DHCP 192.168.1.127; DCs should be static (dcpromo warned). +- **Stale DNS tidy:** extra `192.168.1.5` A record for PEACEFULSPIRIT.local; `.240` RRAS-pool + registration on PST-SERVER. Identify what 192.168.1.5 is (a resolver in SERVER2's DNS list). +- **Vault publish:** push `clients/peaceful-spirit/server2.sops.yaml` (handled by this save's sync). +- **Optional:** rotate `sysadmin` (RMM command_text exposure); clean D: backup junk on PST-SERVER + (~700 GB) if not needed; remove `C:\PST-Backup\*` once rebuild confirmed stable. +- **Wiki:** update `wiki/clients/peaceful-spirit.md` (now 2 DCs; SERVER2 rebuilt; G: cleaned). + +## Reference Information + +- Runbook: `clients/peaceful-spirit/AD-DC2-REBUILD-RUNBOOK.md`. +- Backups on PST-SERVER: `C:\PST-Backup\SYSVOL-Policies-20260613-1611`, `C:\PST-Backup\GPO-20260613-1611`. +- Vault: `clients/peaceful-spirit/server` (DA), `clients/peaceful-spirit/server2` (new local admin+DSRM). +- GuruRMM agents: PST-SERVER `87293069-33b6-45e8-a68f-6811216cdb96`; PST-SERVER2 + `5d2d7ba0-3903-4aa3-9e97-6ca4424ffe65`. RMM API `http://172.16.3.30:3001`. +- KB 2218556 (authoritative/non-authoritative DFSR SYSVOL D4/D2).