From bf74f967eb5b43bff6e381be7053fc5af4bca5ae Mon Sep 17 00:00:00 2001 From: Howard Enos Date: Mon, 15 Jun 2026 21:59:51 -0700 Subject: [PATCH] sync: auto-sync from HOWARD-HOME at 2026-06-15 21:59:42 Author: Howard Enos Machine: HOWARD-HOME Timestamp: 2026-06-15 21:59:42 --- .claude/skills/unifi-wifi/scripts/watch-ap.sh | 22 ++- ...6-06-15-howard-cs-server-raid-vpn-reset.md | 182 ++++++++++++++++++ 2 files changed, 200 insertions(+), 4 deletions(-) create mode 100644 clients/cascades-tucson/session-logs/2026-06/2026-06-15-howard-cs-server-raid-vpn-reset.md diff --git a/.claude/skills/unifi-wifi/scripts/watch-ap.sh b/.claude/skills/unifi-wifi/scripts/watch-ap.sh index 2a291f9..668204d 100644 --- a/.claude/skills/unifi-wifi/scripts/watch-ap.sh +++ b/.claude/skills/unifi-wifi/scripts/watch-ap.sh @@ -6,7 +6,10 @@ # # REQUIRES: L3 reach to the AP's mgmt IP. At Cascades the APs are on 192.168.2.x/3.x (mgmt VLANs) — # bring up the Cascades VPN first. Device-auth SSH cred is vaulted (clients/cascades-tucson/unifi-ap-ssh). -# Needs `sshpass` locally (UniFi device-auth is password-based). Find AP IPs via: +# AUTH (UniFi device-auth is password-based): uses `sshpass` if installed, otherwise falls back to +# OpenSSH's SSH_ASKPASS helper (no sshpass needed). NOTE the fallback uses `ssh` from PATH: on Windows +# that must be MSYS/Git-bash ssh — Win10/11 system OpenSSH cannot exec a shell askpass (CreateProcessW +# error 193); on Linux/macOS system ssh works fine. 
Find AP IPs via: # echo 'db.device.find({site_id:"685f39068e65331c46ef6dd2",type:"uap"},{name:1,ip:1}).forEach(printjson)' | bash .claude/scripts/uos-mongo.sh # # Usage: bash .claude/skills/unifi-wifi/scripts/watch-ap.sh [interval=2] [vault-path] @@ -17,12 +20,23 @@ AP="${1:?usage: watch-ap.sh [interval] [vault-path]}"; INT="${2:-2}"; VP U="$(bash "$VAULT" get-field "$VP" credentials.username 2>/dev/null)" P="$(bash "$VAULT" get-field "$VP" credentials.password 2>/dev/null)" [ -n "$U" ] && [ -n "$P" ] || { echo "[ERROR] no device-auth cred at vault:$VP"; exit 1; } -command -v sshpass >/dev/null || { echo "[ERROR] sshpass not installed (apt-get install sshpass / brew install sshpass)"; exit 1; } + +# Auth method: sshpass if available, else SSH_ASKPASS fallback (no sshpass needed). +SSH_OPTS=(-o ConnectTimeout=8 -o StrictHostKeyChecking=accept-new -o UserKnownHostsFile=/dev/null \ + -o PreferredAuthentications=password -o PubkeyAuthentication=no -o NumberOfPasswordPrompts=1) +if command -v sshpass >/dev/null 2>&1; then + run_ssh() { SSHPASS="$P" sshpass -e ssh "${SSH_OPTS[@]}" "$@"; } + echo "[INFO] auth: sshpass" +else + ASKPASS="$(mktemp)"; printf '#!/bin/sh\nprintf "%%s\\n" "$WATCH_AP_PW"\n' > "$ASKPASS"; chmod +x "$ASKPASS" + trap 'rm -f "$ASKPASS"' EXIT + run_ssh() { WATCH_AP_PW="$P" SSH_ASKPASS="$ASKPASS" SSH_ASKPASS_REQUIRE=force DISPLAY="${DISPLAY:-:0}" ssh "${SSH_OPTS[@]}" "$@"; } + echo "[INFO] auth: SSH_ASKPASS fallback (sshpass not installed)" +fi echo "[INFO] watching $AP every ${INT}s (Ctrl-C to stop). Needs Cascades VPN reach." # Run the sampling loop ON the AP so each tick is one round-trip; mca-dump for cu/clients, iw survey for busy%/noise. 
-SSHPASS="$P" sshpass -e ssh -o ConnectTimeout=8 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ - "$U@$AP" "INT=$INT sh -s" <<'REMOTE' 2>&1 | grep -viE 'Warning: Permanently|pq.html' +run_ssh "$U@$AP" "INT=$INT sh -s" <<'REMOTE' 2>&1 | grep -viE 'Warning: Permanently|pq.html' radios=$(iw dev 2>/dev/null | awk '/Interface/{print $2}' | grep -E 'wifi|ath' || echo "wifi0 wifi1 wifi2") prev="" while :; do diff --git a/clients/cascades-tucson/session-logs/2026-06/2026-06-15-howard-cs-server-raid-vpn-reset.md b/clients/cascades-tucson/session-logs/2026-06/2026-06-15-howard-cs-server-raid-vpn-reset.md new file mode 100644 index 0000000..b46640d --- /dev/null +++ b/clients/cascades-tucson/session-logs/2026-06/2026-06-15-howard-cs-server-raid-vpn-reset.md @@ -0,0 +1,182 @@ +--- +title: CS-SERVER slowness diagnosis (degraded RAID) + pfSense OpenVPN password reset +date: 2026-06-15 +user: howard +client: cascades-tucson +tags: [cascades-tucson, cs-server, raid, hardware, pfsense, openvpn, rmm, bitdefender] +--- + +## User +- **User:** Howard Enos (howard) +- **Machine:** Howard-Home +- **Role:** tech + +## Session Summary + +Investigated a "CS-SERVER running slow, check for infections" report for Cascades of Tucson. +Resolved CS-SERVER to GuruRMM agent `c39f1de7-d5b6-45ae-b132-e06977ab1713` (online, agent +v0.6.66) and ran a multi-pass health/security probe over RMM (PowerShell as SYSTEM). The box +is NOT resource-starved: 48 GB RAM (~72% free), CPU avg 16-26%, C: 151 GB free / D: 490 GB +free, 10-day uptime, no unexpected-shutdown or crash events. So slowness is not RAM, CPU, or +disk-space exhaustion. + +Root cause is a **degraded RAID-1 array**. Dell OMSA (`omreport`) on the box shows Physical +Disk 0:0:3 (WDC WD3200BEVT, 320 GB SATA laptop drive) as **Critical / Removed**, and Virtual +Disk2 (RAID-1, = Windows Disk 1 = C:) as **Non-Critical / Degraded**. 
The C: mirror (OS / AD / +SQL / page file) is running on a single surviving 320 GB Hitachi 5400 RPM laptop drive (0:0:2) +with zero redundancy. The healthy D: volume is a separate RAID-1 of two 1.2 TB SAS drives +(Virtual Disk0, OK). A 1.2 TB SAS disk (1:0:4) sits "Ready" but is the wrong size/type to +rebuild the 320 GB SATA mirror, so no auto-rebuild occurred. Controller is the basic SAS 6/iR +Integrated (3 Gbps, no real cache) — degraded mirror on a slow laptop spindle fully explains +the slowness, and combined with the documented no-backup / single-DC posture it is a +data-loss emergency. + +Infection check came back clean. CS-SERVER is NOT enrolled in the ACG GravityZone/Bitdefender +tenant (only 3 Cascades endpoints there, including LAPTOP-DCQNDJJ2 and RECEPTIONIST-PC). Windows Defender +is replaced by a third-party "Endpoint Protection Service" (Syncro-managed) with no detected +threats. IOC sweep (external connections, scheduled tasks, Run keys, recent temp/ProgramData +binaries) found only legitimate management/EDR tooling. Notable: the box carries the previous +MSP's leftover Datto stack (Datto RMM/CentraStage + Datto EDR/Infocyte) on top of ACG's Syncro, +GuruRMM, ScreenConnect, and KPAX — multiple overlapping agents thrashing the degraded spindle. + +Discussed the SSD migration question: the rebuild-then-swap approach (replace failed member +with SSD #1, rebuild, then swap the other member for SSD #2) is valid and will materially help +(workload is random-I/O bound; SSDs win big even at the controller's 3 Gbps cap), but with hard +caveats: back up/image C: BEFORE any rebuild, use enterprise SATA SSDs >= 320 GB (no TRIM +through this controller), and it is a band-aid on EOL hardware — the planned DC migration is +still the real fix. Mike then installed ACG's cloud backup (MSP360/CloudBerry to the +ACG-backup server) and started a backup, addressing the #1 prerequisite. + +Second task: Howard had lost the OpenVPN password for his pfSense user. 
Vault held only the +pfSense admin GUI login (no per-user OpenVPN credential), and pfSense stores user passwords as +bcrypt (unrecoverable) — so a reset, not a lookup. Drove the pfSense web UI from CS-SERVER via +RMM (CS-SERVER can reach 192.168.0.1:443/22; OpenVPN local-DB user is `Howard`, userid 0). +Reset the password to the value Howard supplied using pfSense's native +`local_user_set_password()` via the Diagnostics PHP-exec endpoint (surgical — preserves +groups/cert/OpenVPN bindings), verified it with `local_backed()` returning AUTHOK, vaulted it, and +synced. + +## Key Decisions + +- Used Dell OMSA `omreport storage pdisk/vdisk/controller` rather than `Get-PhysicalDisk` to + see real drive health — `Get-PhysicalDisk` only shows the SAS 6/iR virtual disks (both "Healthy") + and hides the failed member behind the RAID controller. +- Reset the pfSense password via the Diagnostics PHP console calling `local_user_set_password()`, + `local_user_set()`, and `write_config()`, instead of scripting the user-edit form POST. The + form approach requires resubmitting every field (groups, cert) and risks wiping the user's + OpenVPN cert/group bindings; the PHP API touches only the password. +- Did not use SSH for the reset: pfSense SSH presents the admin menu and Windows OpenSSH cannot + feed a password non-interactively, so the HTTPS GUI + curl path was the only scriptable option. +- Verified at every write step (read user list before changing; `local_backed()` auth test + after) rather than trusting the OKRESET echo — which caught that the first reset attempts had + silently failed (see Problems). +- Left the leftover Datto/CentraStage/Infocyte agents in place — cleanup is a separate, + lower-priority decision, not done this session. +- Held all CS-SERVER changes: only read-only diagnostics were run on the server; the drive swap + must wait until the first full backup completes and verifies. 
+ +## Problems Encountered + +- **Windows `curl.exe` invoked from PowerShell strips embedded double-quotes** from + `--data-urlencode` arguments (CommandLineToArgvW), silently mangling the POST body. My pfSense + PHP payload `echo "PHPRUNS-OK";` arrived as `echo PHPRUNS-OK;` -> PHP "Undefined constant", + and the multi-line reset PHP hit a parse error so it never executed (no partial change). Fixed + by writing the PHP with single-quotes only, building `$` via `[char]36`, on a single line. Cost + ~4 wasted RMM round-trips. Logged to errorlog.md with `--friction`. +- pfSense `diag_command.php` dispatches the PHP branch on the submit button value + `submit=EXECPHP` (not on field presence); omitting it returned the bare form. Confirmed by + GETting the form and enumerating field names (`txtPHPCommand` textarea + `submit=EXECPHP`). +- `vault-helper.sh new` failed with `ModuleNotFoundError: No module named 'yaml'` — the `py` + launcher lacked PyYAML. Fixed with `py -m pip install pyyaml` (installed 6.0.3), then the + entry created/encrypted/verified cleanly. +- First RMM health probe returned `interrupted` ("Agent restarted during execution"); simply + re-dispatched and it completed. The big probes exit 1 because individual `Get-WinEvent` + no-match branches and the client-only `root\SecurityCenter2` namespace throw on a Server OS — + cosmetic, the useful sections still print. + +## Configuration Changes + +- **pfSense (192.168.0.1) — Cascades:** reset local user `Howard` (userid 0) OpenVPN/GUI + password. `write_config` revision committed on the firewall ("Reset Howard local/OpenVPN + password (ACG remote support)"). No other pfSense settings touched. +- **Vault (new file):** `clients/cascades-tucson/pfsense-openvpn-howard.sops.yaml` (kind vpn, + encrypted, pushed). +- **Howard-Home:** `py -m pip install pyyaml` (6.0.3) — PyYAML was not previously present for + the `py` launcher; needed by `vault-helper.sh`. 
+- **errorlog.md:** one `--friction` entry (curl.exe double-quote stripping). +- No changes made on CS-SERVER itself (read-only diagnostics only). + +## Credentials & Secrets + +- **Cascades pfSense OpenVPN — user `Howard`:** password reset to `Gptf*77ttb#` (supplied by + Howard). Verified authenticating against pfSense Local Database (`local_backed` AUTHOK). + Vaulted: `clients/cascades-tucson/pfsense-openvpn-howard.sops.yaml` + (credentials.username=Howard, credentials.password). +- **Cascades pfSense admin GUI:** `admin` — vault `clients/cascades-tucson/pfsense-firewall.sops.yaml` + (used to drive the reset). Unchanged. + +## Infrastructure & Servers + +- **CS-SERVER** — Cascades of Tucson DC/DNS/file/print/Hyper-V host. Dell PowerEdge R610 + (~2009). Win Server 2019 Std. 48 GB RAM, 16 logical procs. GuruRMM agent + `c39f1de7-d5b6-45ae-b132-e06977ab1713` (v0.6.66). LAN 192.168.2.254. + - **RAID:** Controller SAS 6/iR Integrated (fw 00.25.47.00.06.22.03.00). + - Virtual Disk0 = Windows Disk 0 = **D:** (1117 GB), RAID-1, **OK** — members 0:0:0 + 0:0:1 + (Seagate ST1200MM0088 1.2 TB SAS). + - Virtual Disk2 = Windows Disk 1 = **C:** (297 GB), RAID-1, **DEGRADED / Non-Critical** — + surviving member 0:0:2 (Hitachi HTS545032B9A300 320 GB SATA, 5400 RPM); failed member + 0:0:3 (WDC WD3200BEVT 320 GB SATA) = **Critical / Removed**. + - 1:0:4 (ST1200MM0088 1.2 TB SAS) = "Ready" — wrong size to rebuild the 320 GB mirror. + - **Volumes:** C: 145.8 used / 151.1 free GB; D: 627.5 used / 489.7 free GB. + - **AV:** Windows Defender disabled (Get-MpComputerStatus empty); third-party "Endpoint + Protection Service" (Syncro-managed) running, no threats. NOT in GravityZone tenant. + - **Agent stack present:** GuruRMM, Syncro (+ SyncroLive, FilePusher), ScreenConnect, Datto + RMM (CentraStage), Datto EDR (Infocyte, C:\Program Files\infocyte\agent), KPAX (managed + print), Dell OMSA (dsm_om_connsvc64), Entra Connect (miiserver, ~598 MB), DNS, 2x sqlservr. 
+ - Logged on: `sysadmin` (console, since 6/8). +- **Cascades pfSense** — 192.168.0.1 (HTTPS 443 + SSH 22 reachable from CS-SERVER), pfSense + 24.x ("pfsense.cascades.local"). Local users: Howard(0), admin(1), rturner(2), sysadmin(3). +- **Bitdefender / GravityZone:** Cascades company id `66b0448e1e0441d02508bad8` + ("Cascades of Tucson - Monica Ramirez_20149445"); 3 endpoints, all OK, CS-SERVER absent. +- **Backup:** Mike installed ACG cloud backup (MSP360/CloudBerry -> ACG-backup server) and + started a job. Not yet confirmed complete/verified this session. + +## Commands & Outputs + +- Resolve agent: `bash .claude/scripts/rmm-search.sh cs-server` -> CS-SERVER, Cascades, online. +- OMSA physical disks: `& 'C:\Program Files\Dell\SysMgt\oma\bin\omreport.exe' storage pdisk + controller=0` -> 0:0:3 Status Critical / State Removed; `... storage vdisk controller=0` -> + Virtual Disk2 Status Non-Critical / State Degraded. +- pfSense reset (single-quote-only, one-line PHP via diag_command.php EXECPHP): + `local_user_set_password($ue,'Gptf*77ttb#'); local_user_set($ue); write_config(...)` -> + `OKRESET idx=0 name=Howard hashlen=60`. +- Verify: `echo local_backed('Howard','Gptf*77ttb#')` -> `AUTHOK user=1`. +- Vault: `vault-helper.sh new clients/cascades-tucson/pfsense-openvpn-howard --kind vpn ...` -> + encrypted + decrypts cleanly. +- Friction: Windows curl.exe drops embedded `"`; use single quotes + `[char]36` for `$`, one line. + +## Pending / Incomplete Tasks + +- **Backup must complete AND verify before any drive work.** Confirm the MSP360 job is + image-based / bare-metal + system-state (not file-only) for DC recoverability, and that it + finished a first full. (Offered to check progress via RMM — not yet done.) +- **RAID remediation:** either source a matching 320 GB+ SATA SSD/HDD to rebuild the C: mirror, + or (preferred) migrate C: to enterprise SATA SSDs via rebuild-then-swap — only after backup + verifies. 
The existing 1.2 TB "Ready" disk cannot rebuild the 320 GB mirror. +- **Strategic:** accelerate the planned DC migration off the EOL R610; still single-DC + the + array is now actively unprotected. +- **Optional:** clean up leftover previous-MSP Datto RMM (CentraStage) + Datto EDR (Infocyte) + agents to cut I/O churn. +- **Consider** enrolling CS-SERVER into GravityZone (currently no Bitdefender coverage on the DC). + +## Reference Information + +- GuruRMM agent (CS-SERVER): `c39f1de7-d5b6-45ae-b132-e06977ab1713`. RMM API + http://172.16.3.30:3001. +- Bitdefender Cascades company id: `66b0448e1e0441d02508bad8`. +- pfSense: https://192.168.0.1 ; diag PHP-exec form fields = `txtPHPCommand` + `submit=EXECPHP`. +- Vault entries: `clients/cascades-tucson/pfsense-openvpn-howard.sops.yaml` (new), + `clients/cascades-tucson/pfsense-firewall.sops.yaml` (admin). +- RMM cmd ids this session: health b649f202/a93f6421, perf a62a414b, IOC 4a2c5950, + pf-reset 5c83ec9d, pf-verify c99adb35. +- Client wiki: `wiki/clients/cascades-tucson.md`.