From a78fb96f95e664323fdca058f70e59bd0b44dc35 Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Mon, 13 Apr 2026 10:30:51 -0700 Subject: [PATCH] Session log: Cloudflare Tunnel for azcomputerguru + Cox BGP diagnosis Diagnosed azcomputerguru.com 521 errors: Cox's BGP route to specific Cloudflare origin-pull prefixes (162.158.0.0/16, 172.64.0.0/13, 173.245.48.0/20, 141.101.64.0/18) is broken from 72.194.62.0/29. Confirmed by TCP probe matrix from pfSense WAN, traceroute latency comparison, and state-table showing 0 inbound CF connections while direct-internet traffic still reached origin. Deployed Cloudflare Tunnel 'acg-origin' on Jupiter Unraid as a Docker container. Routes 4 proxied hostnames (azcomputerguru.com, analytics., community., radio.) through the tunnel with HTTPS backend to IX 172.16.3.10:443 with per-ingress SNI matching. All 4 hostnames return 200 OK through CF edge after the cutover. Repo hygiene: - Merged clients/ix-server/ into clients/internal-infrastructure/ (IX is internal infra, not a paying-client account). Git detected the session-log files as renames so history is preserved. Updated 4 stale path references in 2 files. - Moved cox-bgp ticket draft out of projects/dataforth-dos/ (wrong project) to clients/internal-infrastructure/vendor-tickets/. - Relocated tunnel-setup helper scripts from projects/dataforth-dos/datasheet-pipeline/implementation/ to clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/. Deleted superseded/abandoned login attempts. Sanitized hardcoded Jupiter/pfSense SSH passwords to pull from SOPS vault at runtime; Cloudflare token reads from env var (tokens still in 1Password, vault entry is metadata-only). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../cloudflared-tunnel-setup/cf_analytics.py | 58 +++ .../jupiter_tunnel_complete.py | 153 ++++++++ .../jupiter_tunnel_fix_https.py | 81 ++++ .../jupiter_tunnel_login5.py | 25 ++ .../cloudflared-tunnel-setup/pfsense_diag.py | 71 ++++ .../cloudflared-tunnel-setup/pfsense_diag2.py | 65 ++++ .../cloudflared-tunnel-setup/pfsense_trace.py | 42 +++ .../2026-03-16-ix-account-cleanup.md | 0 .../2026-04-11-smart-slider-security-scan.md | 2 +- .../session-logs/2026-04-13-session.md | 353 ++++++++++++++++++ .../2026-04-13-cox-bgp-cloudflare-routing.md | 88 +++++ session-logs/2026-04-11-session.md | 6 +- 12 files changed, 940 insertions(+), 4 deletions(-) create mode 100644 clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/cf_analytics.py create mode 100644 clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/jupiter_tunnel_complete.py create mode 100644 clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/jupiter_tunnel_fix_https.py create mode 100644 clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/jupiter_tunnel_login5.py create mode 100644 clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/pfsense_diag.py create mode 100644 clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/pfsense_diag2.py create mode 100644 clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/pfsense_trace.py rename clients/{ix-server => internal-infrastructure}/session-logs/2026-03-16-ix-account-cleanup.md (100%) rename clients/{ix-server => internal-infrastructure}/session-logs/2026-04-11-smart-slider-security-scan.md (98%) create mode 100644 clients/internal-infrastructure/session-logs/2026-04-13-session.md create mode 100644 clients/internal-infrastructure/vendor-tickets/2026-04-13-cox-bgp-cloudflare-routing.md diff --git a/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/cf_analytics.py 
b/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/cf_analytics.py new file mode 100644 index 0000000..f42bc79 --- /dev/null +++ b/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/cf_analytics.py @@ -0,0 +1,58 @@ +"""Pull CF Analytics via GraphQL to see origin-status per CF PoP.""" +import json, os, sys, urllib.request +from datetime import datetime, timezone, timedelta + +ZONE = '1beb9917c22b54be32e5215df2c227ce' +# CF API tokens live in 1Password (vault entry services/cloudflare.sops.yaml +# currently holds metadata only). Provide via env vars before running. +TOKENS = { + 'full-dns': os.environ.get('CF_API_TOKEN_FULL_DNS', ''), + 'legacy': os.environ.get('CF_API_TOKEN_LEGACY', ''), +} + +since_30 = (datetime.now(timezone.utc) - timedelta(minutes=30)).strftime('%Y-%m-%dT%H:%M:%SZ') + +QUERY = ''' +query($zone:String!, $since:Time!){ + viewer { + zones(filter:{zoneTag:$zone}){ + httpRequestsAdaptiveGroups(limit:50, filter:{datetime_geq:$since}, orderBy:[count_DESC]){ + count + dimensions { coloCode edgeResponseStatus originResponseStatus clientRequestHTTPHost } + } + } + } +} +''' + +def gql(token, query, vars): + req = urllib.request.Request( + 'https://api.cloudflare.com/client/v4/graphql', + data=json.dumps({'query': query, 'variables': vars}).encode(), + headers={'Authorization': f'Bearer {token}', 'Content-Type': 'application/json'}, + ) + with urllib.request.urlopen(req, timeout=30) as r: + return json.loads(r.read()) + +for name, tok in TOKENS.items(): + print(f'\n===== Trying {name} token =====') + try: + r = gql(tok, QUERY, {'zone': ZONE, 'since': since_30}) + if r.get('errors'): + print('errors:', json.dumps(r['errors'], indent=2)[:600]) + else: + zones = r.get('data', {}).get('viewer', {}).get('zones', []) + if not zones: + print('no zones returned') + continue + groups = zones[0].get('httpRequestsAdaptiveGroups', []) + print(f'{len(groups)} groups returned') + print(f'{"count":>6} {"colo":5} {"edge":5} {"origin":6} 
host') + for g in groups: + d = g['dimensions'] + print(f"{g['count']:>6} {d.get('coloCode','-'):5} " + f"{str(d.get('edgeResponseStatus','-')):5} " + f"{str(d.get('originResponseStatus','-')):6} " + f"{d.get('clientRequestHTTPHost','-')}") + except Exception as e: + print(f'FAIL: {e}') diff --git a/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/jupiter_tunnel_complete.py b/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/jupiter_tunnel_complete.py new file mode 100644 index 0000000..4dab299 --- /dev/null +++ b/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/jupiter_tunnel_complete.py @@ -0,0 +1,153 @@ +"""Complete the tunnel setup in one pass after cert.pem is in place. + +Steps: + 1. Stop cf-login container + 2. Create tunnel 'acg-origin', capture UUID + 3. Write config.yml + 4. Flip DNS: A (proxied, 72.194.62.5) -> CNAME (proxied, <UUID>.cfargotunnel.com) for 4 hostnames + 5. Start persistent container 'cloudflared' + 6. Wait for 4 tunnel connections to register + 7. 
Verify site returns 200 externally +""" +import json, os, re, socket, subprocess, time, urllib.request +import paramiko + +HOST, USER = "172.16.3.20", "root" +import subprocess as _sp, yaml as _y +PWD = _y.safe_load(_sp.run(["sops","-d","D:/vault/infrastructure/jupiter-unraid-primary.sops.yaml"],capture_output=True,text=True,timeout=30,check=True).stdout)["credentials"]["password"] +APPDATA = '/mnt/cache/appdata/cloudflared' +import os as _os +CF_TOKEN = _os.environ.get('CF_API_TOKEN_FULL_DNS', '') +if not CF_TOKEN: + raise SystemExit('[FAIL] set CF_API_TOKEN_FULL_DNS env var (token lives in 1Password)') +ZONE = '1beb9917c22b54be32e5215df2c227ce' +HOSTNAMES = ['azcomputerguru.com','analytics.azcomputerguru.com','community.azcomputerguru.com','radio.azcomputerguru.com'] +ORIGIN = 'http://172.16.3.10:80' + +socket.setdefaulttimeout(60) +c = paramiko.SSHClient(); c.set_missing_host_key_policy(paramiko.AutoAddPolicy()) +c.connect(HOST, username=USER, password=PWD, timeout=30, look_for_keys=False, allow_agent=False) + +def run(cmd, to=120): + _, o, e = c.exec_command(cmd, timeout=to) + out = o.read().decode('utf-8','replace') + err = e.read().decode('utf-8','replace') + rc = o.channel.recv_exit_status() + return out, err, rc + +def cfapi(method, path, body=None): + req = urllib.request.Request( + f'https://api.cloudflare.com/client/v4{path}', + data=json.dumps(body).encode() if body else None, + method=method, + headers={'Authorization': f'Bearer {CF_TOKEN}', 'Content-Type':'application/json'}, + ) + try: + with urllib.request.urlopen(req, timeout=30) as r: + return json.loads(r.read()) + except urllib.error.HTTPError as e: + try: return json.loads(e.read()) + except: return {'success':False,'errors':[{'message':str(e)}]} + +try: + print('=== [1] stop cf-login ===', flush=True) + out, _, _ = run('docker rm -f cf-login 2>&1') + print(out.rstrip()) + + print('\n=== [2] create tunnel acg-origin ===', flush=True) + CREATE = ( + f'docker run --rm ' + f'-v 
{APPDATA}:/home/nonroot/.cloudflared ' + f'cloudflare/cloudflared:latest tunnel create acg-origin' + ) + out, err, rc = run(CREATE) + print(out.rstrip()) + if err.strip(): print(f'[stderr] {err.rstrip()}') + m = re.search(r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})', out) + if not m: raise SystemExit(f'[FAIL] no UUID in output; rc={rc}') + UUID = m.group(1) + print(f'[OK] tunnel UUID: {UUID}') + + print('\n=== [3] write config.yml ===', flush=True) + config = f'''tunnel: {UUID} +credentials-file: /home/nonroot/.cloudflared/{UUID}.json +ingress: +''' + for h in HOSTNAMES: + config += f' - hostname: {h}\n service: {ORIGIN}\n' + config += ' - service: http_status:404\n' + # Write via heredoc + HERE = "'EOF_CONFIG'" + out, err, rc = run(f"cat > {APPDATA}/config.yml <<{HERE}\n{config}\nEOF_CONFIG") + run(f'chown 65532:65532 {APPDATA}/config.yml') + out, _, _ = run(f'cat {APPDATA}/config.yml') + print(out.rstrip()) + + print('\n=== [4] DNS cutover (A -> CNAME) ===', flush=True) + tunnel_target = f'{UUID}.cfargotunnel.com' + for h in HOSTNAMES: + # Find existing record + r = cfapi('GET', f'/zones/{ZONE}/dns_records?name={h}') + if not r.get('success') or not r['result']: + print(f' [SKIP] {h}: no record found') + continue + rec = r['result'][0] + print(f' [{h}] current: type={rec["type"]} content={rec["content"]} proxied={rec["proxied"]} id={rec["id"]}') + if rec['type']=='CNAME' and rec['content']==tunnel_target: + print(f' already pointing at tunnel, skipping') + continue + # Delete + d = cfapi('DELETE', f'/zones/{ZONE}/dns_records/{rec["id"]}') + if not d.get('success'): + print(f' [FAIL delete] {d.get("errors")}') + continue + # Create CNAME + body = {'type':'CNAME','name':h,'content':tunnel_target,'proxied':True,'ttl':1} + cr = cfapi('POST', f'/zones/{ZONE}/dns_records', body) + if cr.get('success'): + print(f' [OK] -> CNAME {tunnel_target} proxied') + else: + print(f' [FAIL create] {cr.get("errors")}') + + print('\n=== [5] start persistent 
cloudflared ===', flush=True) + run('docker rm -f cloudflared 2>&1') + START = ( + 'docker run -d --name cloudflared --restart=unless-stopped ' + f'-v {APPDATA}:/home/nonroot/.cloudflared ' + 'cloudflare/cloudflared:latest ' + 'tunnel --config /home/nonroot/.cloudflared/config.yml run' + ) + out, err, rc = run(START) + print(out.rstrip()) + if err.strip(): print(f'[stderr] {err.rstrip()}') + + print('\n=== [6] wait for tunnel connections ===', flush=True) + for i in range(20): + time.sleep(3) + out, _, _ = run('docker logs cloudflared 2>&1 | tail -30') + conns = out.count('Registered tunnel connection') + print(f' [try {i+1}] connections registered: {conns}') + if conns >= 4: + print(out.rstrip()[-800:]) + break + + print('\n=== [7] verify externally ===', flush=True) +finally: + c.close() + +# Run external curl from this workstation +print('\n[EXTERNAL CHECK]', flush=True) +for h in HOSTNAMES: + try: + req = urllib.request.Request(f'https://{h}/', method='HEAD', + headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0'}) + with urllib.request.urlopen(req, timeout=15) as r: + print(f' {h}: HTTP {r.status}') + except urllib.error.HTTPError as e: + print(f' {h}: HTTP {e.code}') + except Exception as e: + print(f' {h}: ERR {e}') + +print(f'\n[DONE] tunnel UUID: {UUID}') +print(f'[DONE] config: {APPDATA}/config.yml') +print(f'[DONE] persistent container: cloudflared') diff --git a/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/jupiter_tunnel_fix_https.py b/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/jupiter_tunnel_fix_https.py new file mode 100644 index 0000000..6f2b342 --- /dev/null +++ b/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/jupiter_tunnel_fix_https.py @@ -0,0 +1,81 @@ +"""Switch tunnel origin from http://172.16.3.10:80 to https://172.16.3.10:443. + +Each ingress gets originRequest.originServerName=<hostname> so IX's Apache +serves the right vhost cert via SNI. 
noTLSVerify=true to tolerate cPanel's +self-signed or hostname-mismatch quirks (cloudflared still uses TLS). +""" +import socket +import paramiko + +HOST, USER = "172.16.3.20", "root" +import subprocess as _sp, yaml as _y +PWD = _y.safe_load(_sp.run(["sops","-d","D:/vault/infrastructure/jupiter-unraid-primary.sops.yaml"],capture_output=True,text=True,timeout=30,check=True).stdout)["credentials"]["password"] +APPDATA = '/mnt/cache/appdata/cloudflared' +HOSTNAMES = ['azcomputerguru.com','analytics.azcomputerguru.com','community.azcomputerguru.com','radio.azcomputerguru.com'] + +socket.setdefaulttimeout(60) +c = paramiko.SSHClient(); c.set_missing_host_key_policy(paramiko.AutoAddPolicy()) +c.connect(HOST, username=USER, password=PWD, timeout=30, look_for_keys=False, allow_agent=False) + +def run(cmd, to=60): + _, o, e = c.exec_command(cmd, timeout=to) + return o.read().decode('utf-8','replace'), e.read().decode('utf-8','replace'), o.channel.recv_exit_status() + +# Read existing tunnel UUID from config +out, _, _ = run(f'grep "^tunnel:" {APPDATA}/config.yml') +UUID = out.split(':',1)[1].strip() +print(f'tunnel UUID: {UUID}') + +config = f'''tunnel: {UUID} +credentials-file: /home/nonroot/.cloudflared/{UUID}.json +ingress: +''' +for h in HOSTNAMES: + config += ( + f' - hostname: {h}\n' + f' service: https://172.16.3.10:443\n' + f' originRequest:\n' + f' originServerName: {h}\n' + f' noTLSVerify: true\n' + ) +config += ' - service: http_status:404\n' + +print('\n=== new config.yml ===') +print(config) + +HEREDOC = "'EOF_CFG'" +out, err, rc = run(f"cat > {APPDATA}/config.yml <<{HEREDOC}\n{config}\nEOF_CFG") +run(f'chown 65532:65532 {APPDATA}/config.yml') +out, _, _ = run(f'cat {APPDATA}/config.yml') +print('=== written ===') +print(out) + +print('\n=== restart cloudflared ===') +out, _, _ = run('docker restart cloudflared') +print(out.rstrip()) + +print('\n=== wait for reconnect ===') +import time +for i in range(15): + time.sleep(3) + out, _, _ = run('docker logs 
cloudflared 2>&1 | tail -30') + conns = out.count('Registered tunnel connection') + print(f' [try {i+1}] registered: {conns}') + if conns >= 4: break + +print('\n=== external HEAD probes ===') +c.close() + +# External test from this workstation +import urllib.request, urllib.error +for h in HOSTNAMES: + try: + req = urllib.request.Request(f'https://{h}/', method='HEAD', + headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0'}) + with urllib.request.urlopen(req, timeout=15) as r: + server = r.headers.get('Server','-') + print(f' {h}: HTTP {r.status} Server={server}') + except urllib.error.HTTPError as e: + print(f' {h}: HTTP {e.code}') + except Exception as e: + print(f' {h}: ERR {e}') diff --git a/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/jupiter_tunnel_login5.py b/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/jupiter_tunnel_login5.py new file mode 100644 index 0000000..2adb67f --- /dev/null +++ b/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/jupiter_tunnel_login5.py @@ -0,0 +1,25 @@ +"""Launch login in detached mode, container persists independent of SSH.""" +import paramiko, socket + +HOST, USER = "172.16.3.20", "root" +import subprocess as _sp, yaml as _y +PWD = _y.safe_load(_sp.run(["sops","-d","D:/vault/infrastructure/jupiter-unraid-primary.sops.yaml"],capture_output=True,text=True,timeout=30,check=True).stdout)["credentials"]["password"] +APPDATA = '/mnt/cache/appdata/cloudflared' + +SCRIPT = f''' +docker rm -f cf-login 2>/dev/null +docker run -d --name cf-login \\ + -v {APPDATA}:/home/nonroot/.cloudflared \\ + cloudflare/cloudflared:latest tunnel login +sleep 4 +echo "=== logs ===" +docker logs cf-login 2>&1 +''' + +socket.setdefaulttimeout(60) +c = paramiko.SSHClient(); c.set_missing_host_key_policy(paramiko.AutoAddPolicy()) +c.connect(HOST, username=USER, password=PWD, timeout=30, look_for_keys=False, allow_agent=False) +_, o, e = c.exec_command(SCRIPT, timeout=90) 
+print(o.read().decode('utf-8','replace').rstrip()) +print(e.read().decode('utf-8','replace').rstrip()) +c.close() diff --git a/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/pfsense_diag.py b/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/pfsense_diag.py new file mode 100644 index 0000000..a219438 --- /dev/null +++ b/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/pfsense_diag.py @@ -0,0 +1,71 @@ +"""pfSense diagnostic for azcomputerguru.com 521 — suspected CF IP blocks. + +Runs a single SSH session with batched diagnostics targeted at identifying +why Cloudflare PHX PoP can't reach 72.194.62.5:443. +""" +import paramiko, socket +socket.setdefaulttimeout(60) + +HOST = '172.16.0.1' +PORT = 2248 +USER = 'admin' +import subprocess as _sp, yaml as _y +PWD = _y.safe_load(_sp.run(['sops','-d','D:/vault/infrastructure/pfsense-firewall.sops.yaml'],capture_output=True,text=True,timeout=30,check=True).stdout)['credentials']['password'] + +CMDS = [ + ('installed packages (IDS/IPS/blocker)', + 'pkg info 2>/dev/null | egrep -i "suricata|snort|pfblocker|crowdsec" || echo "(none)"'), + + ('NAT rules for 72.194.62.5 / port 443', + 'pfctl -s nat 2>/dev/null | grep -E "72\\.194\\.62\\.5|443" | head -30 || echo "(pfctl nat empty)"'), + + ('Rules in PF referencing .62.5', + 'pfctl -sr 2>/dev/null | grep "72\\.194\\.62\\.5" | head -20 || echo "(none)"'), + + ('PF aliases referencing Cloudflare (case-insensitive)', + 'pfctl -T show -a cloudflare 2>/dev/null | head -30 ; pfctl -sT 2>/dev/null | grep -i "cloudflare\\|cf_\\|_cf"'), + + ('Recent filter.log entries mentioning 72.194.62.5 (last 200 binary-decoded)', + 'clog /var/log/filter.log | tail -2000 | grep "72\\.194\\.62\\.5" | tail -40 || echo "(no recent entries)"'), + + ('Recent BLOCK actions from filter.log (last 500 lines)', + 'clog /var/log/filter.log | tail -500 | grep -E "block|reject" | head -40 || echo "(no blocks)"'), + + ('Current states for :443 dst (limit 15)', + 
'pfctl -s states 2>/dev/null | awk \'$6 ~ /:443$/\' | head -15 || echo "(no :443 states)"'), + + ('State table total count', + 'pfctl -s info 2>/dev/null | grep -i "states\\|limit\\|current" | head -10'), + + ('Suricata status + alert log if installed', + 'service suricata status 2>/dev/null ; ls -la /var/log/suricata/ 2>/dev/null | head'), + + ('pfBlockerNG log if installed', + 'ls -la /var/log/pfblockerng/ 2>/dev/null | head ; cat /var/log/pfblockerng/block.log 2>/dev/null | tail -30'), + + ('IP reputation / GeoIP blocks on WAN', + 'pfctl -sr 2>/dev/null | grep -iE "geoip|pfblocker|block in" | head -20'), + + ('Last 30 dropped packets to :443 (any dst)', + 'clog /var/log/filter.log | tail -2000 | grep -E "port 443" | grep -E "block|reject" | tail -30 || echo "(none)"'), +] + +def main(): + c = paramiko.SSHClient() + c.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + c.connect(HOST, port=PORT, username=USER, password=PWD, + timeout=30, banner_timeout=30, look_for_keys=False, allow_agent=False) + try: + for label, cmd in CMDS: + print(f'\n===== {label} =====', flush=True) + stdin, stdout, stderr = c.exec_command(cmd, timeout=60) + out = stdout.read().decode('utf-8','replace') + err = stderr.read().decode('utf-8','replace') + if out.strip(): print(out.rstrip()) + if err.strip() and 'stty' not in err and 'terminal' not in err.lower(): + print(f' [stderr] {err.rstrip()[:300]}') + finally: + c.close() + +if __name__ == '__main__': + main() diff --git a/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/pfsense_diag2.py b/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/pfsense_diag2.py new file mode 100644 index 0000000..3ecec7e --- /dev/null +++ b/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/pfsense_diag2.py @@ -0,0 +1,65 @@ +"""pfSense deeper diag — read filter log + check inbound 443 to 172.16.3.10.""" +import paramiko, socket +socket.setdefaulttimeout(60) + +HOST, PORT, USER = "172.16.0.1", 2248, "admin" 
+import subprocess as _sp, yaml as _y +PWD = _y.safe_load(_sp.run(["sops","-d","D:/vault/infrastructure/pfsense-firewall.sops.yaml"],capture_output=True,text=True,timeout=30,check=True).stdout)["credentials"]["password"] + +CMDS = [ + ('clog binary locations', + 'which clog 2>/dev/null; ls /usr/local/sbin/clog* /usr/sbin/clog* /sbin/clog* 2>/dev/null; pkg info clog 2>/dev/null | head -3'), + + ('filter log type + size', + 'file /var/log/filter.log 2>/dev/null; ls -la /var/log/filter.log'), + + ('Try to read filter.log as text', + 'tail -50 /var/log/filter.log | grep -v "^$" | tail -30'), + + ('Inbound :443 -> 172.16.3.10 states (right now)', + 'pfctl -s states | grep "172.16.3.10:443\\|-> 172.16.3.10" | grep "443" | head -30'), + + ('Inbound :443 states total count', + 'pfctl -s states | grep "172.16.3.10:443" | wc -l; pfctl -s states | grep ":443.*172\\.16\\.3\\.10" | wc -l'), + + ('State count broken out by direction', + 'pfctl -s states | awk \'/172\\.16\\.3\\.10/ {print $0}\' | head -20'), + + ('Cloudflare PHX IPs sample (CF publishes these)', + 'curl -s -m 10 https://www.cloudflare.com/ips-v4 2>/dev/null | head -5; echo "---"; curl -s -m 10 https://www.cloudflare.com/ips-v4 2>/dev/null | wc -l'), + + ('Test-send a SYN from pfSense to known CF edge IP (simulate return path)', + 'nc -z -v -w 3 162.158.0.1 443 2>&1; echo "---"; nc -z -v -w 3 104.26.8.237 443 2>&1'), + + ('Check WAN interface health', + 'ifconfig igc0 | grep -E "inet |status"; echo "---"; netstat -rn | grep default'), + + ('Recently-logged DROP/BLOCK (pf log format 5)', + 'tcpdump -n -e -ttt -r /var/log/filter.log 2>&1 | head -30 || echo "(tcpdump cant read binary)"'), + + ('Try pfSsh.php for log', + 'echo "exec;tail -30 /var/log/filter.log" | pfSsh.php 2>&1 | tail -40'), + + ('PF filter log read alt (pfctl loginterface / pflog0 dump)', + 'tcpdump -n -e -ttt -i pflog0 -c 20 2>&1 | head -40 || echo "(no pflog0)"'), +] + +def main(): + c = paramiko.SSHClient() + 
c.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + c.connect(HOST, port=PORT, username=USER, password=PWD, + timeout=30, banner_timeout=30, look_for_keys=False, allow_agent=False) + try: + for label, cmd in CMDS: + print(f'\n===== {label} =====', flush=True) + stdin, stdout, stderr = c.exec_command(cmd, timeout=60) + out = stdout.read().decode('utf-8','replace') + err = stderr.read().decode('utf-8','replace') + if out.strip(): print(out.rstrip()) + if err.strip() and 'stty' not in err and 'terminal' not in err.lower(): + print(f' [stderr] {err.rstrip()[:300]}') + finally: + c.close() + +if __name__ == '__main__': + main() diff --git a/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/pfsense_trace.py b/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/pfsense_trace.py new file mode 100644 index 0000000..7ee61e8 --- /dev/null +++ b/clients/internal-infrastructure/scripts/cloudflared-tunnel-setup/pfsense_trace.py @@ -0,0 +1,42 @@ +"""Confirm CF origin-pull IP range unreachable from pfSense WAN.""" +import paramiko, socket +socket.setdefaulttimeout(60) + +HOST, PORT, USER = "172.16.0.1", 2248, "admin" +import subprocess as _sp, yaml as _y +PWD = _y.safe_load(_sp.run(["sops","-d","D:/vault/infrastructure/pfsense-firewall.sops.yaml"],capture_output=True,text=True,timeout=30,check=True).stdout)["credentials"]["password"] + +CMDS = [ + ('traceroute to 162.158.0.1 (CF origin-pull range)', + 'traceroute -n -w 3 -m 12 162.158.0.1 2>&1 | head -20'), + ('traceroute to 104.26.8.237 (CF client-facing, known working)', + 'traceroute -n -w 3 -m 12 104.26.8.237 2>&1 | head -20'), + ('traceroute to 172.67.72.147 (CF edge, working)', + 'traceroute -n -w 3 -m 12 172.67.72.147 2>&1 | head -20'), + ('More CF origin-pull IPs via nc', + 'for ip in 162.158.0.1 162.158.100.1 162.158.200.1 162.159.0.1 162.159.100.1 108.162.192.1 108.162.250.1; do printf "%-16s " "$ip"; nc -z -v -w 3 $ip 443 2>&1 | head -1; done'), + ('Route table: do we have a specific 
route for 162.158?', + 'netstat -rn -f inet | grep -E "^162\\.|^default" | head -10'), + ('BGP / gateway status', + 'pfSsh.php playback gatewaystatus 2>&1 | head -20 || echo "(no playback)"; cat /tmp/gw_status 2>/dev/null | head -20'), +] + +def main(): + c = paramiko.SSHClient() + c.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + c.connect(HOST, port=PORT, username=USER, password=PWD, + timeout=30, banner_timeout=30, look_for_keys=False, allow_agent=False) + try: + for label, cmd in CMDS: + print(f'\n===== {label} =====', flush=True) + stdin, stdout, stderr = c.exec_command(cmd, timeout=90) + out = stdout.read().decode('utf-8','replace') + err = stderr.read().decode('utf-8','replace') + if out.strip(): print(out.rstrip()) + if err.strip() and 'stty' not in err: + print(f' [stderr] {err.rstrip()[:300]}') + finally: + c.close() + +if __name__ == '__main__': + main() diff --git a/clients/ix-server/session-logs/2026-03-16-ix-account-cleanup.md b/clients/internal-infrastructure/session-logs/2026-03-16-ix-account-cleanup.md similarity index 100% rename from clients/ix-server/session-logs/2026-03-16-ix-account-cleanup.md rename to clients/internal-infrastructure/session-logs/2026-03-16-ix-account-cleanup.md diff --git a/clients/ix-server/session-logs/2026-04-11-smart-slider-security-scan.md b/clients/internal-infrastructure/session-logs/2026-04-11-smart-slider-security-scan.md similarity index 98% rename from clients/ix-server/session-logs/2026-04-11-smart-slider-security-scan.md rename to clients/internal-infrastructure/session-logs/2026-04-11-smart-slider-security-scan.md index 495b7c1..988b0a3 100644 --- a/clients/ix-server/session-logs/2026-04-11-smart-slider-security-scan.md +++ b/clients/internal-infrastructure/session-logs/2026-04-11-smart-slider-security-scan.md @@ -209,7 +209,7 @@ Smart Slider 3 Free: 3 ## Files Created - **Scan script:** `/root/scan_smart_slider.sh` (IX server) - **Results file:** `/tmp/smart_slider_scan_1775909346.txt` (IX server) -- 
**This report:** `clients/ix-server/session-logs/2026-04-11-smart-slider-security-scan.md` +- **This report:** `clients/internal-infrastructure/session-logs/2026-04-11-smart-slider-security-scan.md` --- diff --git a/clients/internal-infrastructure/session-logs/2026-04-13-session.md b/clients/internal-infrastructure/session-logs/2026-04-13-session.md new file mode 100644 index 0000000..190276e --- /dev/null +++ b/clients/internal-infrastructure/session-logs/2026-04-13-session.md @@ -0,0 +1,353 @@ +# Session Log — Internal Infrastructure — 2026-04-13 + +## Cloudflare Tunnel deployment for azcomputerguru.com + Cox BGP diagnosis + +Earlier 2026-04-13 work (SCMVAS git push, merge conflict resolution) is in +`projects/dataforth-dos/session-logs/2026-04-12-session.md`. This log picks up +when user reported azcomputerguru.com was still showing 521 after the initial +Cloudflare recovery. + +--- + +## Session Summary + +User reported azcomputerguru.com returning **521 "Web server is down"** through Cloudflare, despite: +- CF SSL mode being "Full" (not Strict) +- Origin IX server (172.16.3.10) responding 200 OK internally +- Origin reachable from external ISPs (non-CF path) + +### What was accomplished + +1. **Diagnosed root cause:** Cox ISP has broken BGP routing from our netblock (72.194.62.0/29) to specific Cloudflare IP prefixes. TCP:443 from pfSense WAN succeeds to 104.16/17/26 ranges but **times out** to 162.158.0.0/16, 172.64.0.0/13, 173.245.48.0/20, 141.101.64.0/18. ICMP traceroute to affected prefixes shows ~173ms (cross-country peering) vs ~3.6ms for working prefixes — asymmetric/distant routing. Inbound CF→origin state count was 0 while direct-internet state count was 285, confirming only CF path was broken. + +2. **Deployed Cloudflare Tunnel on Jupiter (Unraid)** as a permanent workaround. Tunnel reverses connection direction (outbound from container, using working CF prefixes), eliminating dependency on Cox's broken inbound routing. + +3. 
**Cut over 4 proxied hostnames** to the tunnel via CF DNS API: + - azcomputerguru.com, analytics., community., radio. + - All 4 now return **HTTP 200 OK** through CF edge → tunnel → IX HTTPS vhost (SNI-matched) + +4. **Drafted Cox BGP escalation ticket** with evidence (TCP matrix, traceroute comparison, state-table counts). Saved to `vendor-tickets/`. + +5. **Folder reorganization:** + - Moved Cox ticket from `projects/dataforth-dos/datasheet-pipeline/implementation/` (wrong — not a Dataforth file) → `clients/internal-infrastructure/vendor-tickets/2026-04-13-cox-bgp-cloudflare-routing.md` + - Merged misnamed `clients/ix-server/` into `clients/internal-infrastructure/` (IX is internal infra, not a client). Session logs moved; folder removed; 4 stale path references updated across 2 files. + +### Key decisions & rationale + +- **Option C: tunnel on Jupiter Docker** rather than pfSense (cloudflared isn't a pfSense package, firmware upgrades would wipe it) or IX (scoped to IX only; other internal origins would need separate tunnels). Jupiter already runs Unraid with many containers; cloudflared fits the existing pattern. One tunnel can route to any internal LAN IP. +- **HTTPS backend (not HTTP)** with `originServerName: <hostname>` + `noTLSVerify: true`. Initial HTTP backend caused WordPress "force HTTPS" redirect loop on community/radio (they had HSTS/canonical-URL rules IX's other sites lacked). +- **`--user 65532` (container default) with `chown 65532:65532` on host volume** — earlier `--user root` attempt wrote cert to `/root/.cloudflared` (outside bind mount) instead of `/home/nonroot/.cloudflared`. +- **Detached container for `tunnel login`** — earlier foreground attempts got killed when SSH exec_command hit its 9-minute timeout; detached container (`cf-login`) persists independent of SSH. +- **Didn't grey-cloud DNS** (the quick-but-ugly fix); tunnel gives permanent architectural solution that survives future Cox BGP flaps. 
+ +### Problems encountered and resolutions + +| Problem | Resolution | +|---|---| +| Cloudflare token (Full DNS) lacks Zone Settings + Analytics permissions; couldn't read SSL/TLS mode or per-PoP origin-status | Used pfSense-side diagnostics (TCP probes + traceroute + state table) instead; conclusive without needing Analytics | +| `mkdir: no space left on device` on `/mnt/user/appdata/cloudflared` despite cache showing 181GB free | shfs (Unraid FUSE overlay) was being overly strict near 81% cache usage; bypassed by writing directly to `/mnt/cache/appdata/cloudflared` (raw cache pool, same physical SSD, skips shfs) | +| `cert.pem: permission denied` writing to bind-mount volume | Container runs as UID 65532 (`nonroot`), host dir was owned by `nobody:users` (99:100). Chowned host dir to 65532:65532 before retry | +| `--user root` workaround wrote cert to `/root/.cloudflared`, outside the mount | Dropped `--user` override after fixing host UID ownership | +| Foreground `docker run --rm` for login got killed by SSH exec timeout after 9 min | Used `docker run -d --name cf-login` (detached); container persists through SSH session endings | +| Tailscale was stopped mid-session (user moved to different network); lost all 172.16.x routes | User reconnected to local net; resumed | +| WordPress 301 redirect loop on community/radio after tunnel cutover | Switched tunnel origin from `http://172.16.3.10:80` → `https://172.16.3.10:443` with `originServerName` per ingress + `noTLSVerify: true` | +| Cox ticket draft initially saved under Dataforth project folder (wrong place) | User flagged; moved to `clients/internal-infrastructure/vendor-tickets/` | +| `clients/ix-server/` existed as a separate folder when IX is internal infra | Merged `clients/ix-server/` (2 session logs) into `clients/internal-infrastructure/session-logs/`, removed empty folder, fixed 4 path references in 2 files | + +--- + +## Credentials + +### Cloudflare API tokens (from 1Password) +- **Full DNS token:** 
`DRRGkHS33pxAUjQfRDzDeVPtt6wwUU6FwtXqOzNj` + - Permissions: Zone:Read, DNS:Read/Edit (confirmed; actual scope narrower than 1Password note implies — lacks Zone Settings, Analytics, Tunnel) + - Token ID: `48607a8ba656e02050e97ae4b1b8fcdf` +- **Legacy token:** `U1UTbBOWA4a69eWEBiqIbYh0etCGzrpTU4XaKp7w` + - Token ID: `162711358e386f178d81bb09ca800148` + - Same limited scope (analytics.read also denied) +- **Account:** `Mike@azcomputerguru.com's Account`, Pro Website plan +- **Zone:** `azcomputerguru.com`, zone ID `1beb9917c22b54be32e5215df2c227ce` +- **Vault entry:** `services/cloudflare.sops.yaml` (contains metadata only — token values are in 1Password, not SOPS vault yet) + +### Jupiter (Unraid primary) +- SSH: `root / Th1nk3r^99##` on 172.16.3.20:22 +- Vault: `infrastructure/jupiter-unraid-primary.sops.yaml` +- iDRAC: 172.16.1.73, `root / Window123!@#-idrac` + +### IX Server (origin) +- SSH: `root / Gptf*77ttb!@#!@#` on 172.16.3.10:22 (internal) / 72.194.62.5 (public) +- OS: CloudLinux 9.7 (RHEL 9 family), WHM/cPanel, Apache +- WHM: port 2087, cPanel: 2083 +- Vault: `infrastructure/ix-server.sops.yaml` + +### pfSense Firewall +- SSH: `admin / r3tr0gradE99!!` on 172.16.0.1:2248 +- OS: pfSense 2.8.1 (FreeBSD 15.0-CURRENT) +- WAN: 98.181.90.163/31, public IP block 72.194.62.2-.10 (all bound to igc0) +- Vault: `infrastructure/pfsense-firewall.sops.yaml` +- Note: no IDS/IPS installed (no suricata/snort/pfBlockerNG), firewalld disabled, 5706 states at time of diag + +--- + +## Infrastructure & Servers + +### Tunnel deployment + +| Component | Value | +|---|---| +| Tunnel name | `acg-origin` | +| Tunnel UUID | `78d3e58f-1979-4f0e-a28b-98d6b3c3d867` | +| Tunnel target hostname | `78d3e58f-1979-4f0e-a28b-98d6b3c3d867.cfargotunnel.com` | +| Host | Jupiter (172.16.3.20) | +| Docker container name | `cloudflared` (restart=unless-stopped) | +| Docker image | `cloudflare/cloudflared:latest` | +| Host volume | `/mnt/cache/appdata/cloudflared/` (direct cache SSD, chowned 
65532:65532) | +| Config file | `/mnt/cache/appdata/cloudflared/config.yml` | +| Cert file | `/mnt/cache/appdata/cloudflared/cert.pem` | +| Credentials file | `/mnt/cache/appdata/cloudflared/78d3e58f-1979-4f0e-a28b-98d6b3c3d867.json` | +| Active CF PoPs | phx01 ×2, lax11 (4 tunnel connections) | + +### DNS records updated (all proxied, zone azcomputerguru.com) + +| Hostname | Before | After | +|---|---|---| +| azcomputerguru.com | A 72.194.62.5 (not proxied — was a bug; now is) | CNAME `78d3e58f-...cfargotunnel.com` proxied | +| analytics.azcomputerguru.com | A 72.194.62.5 proxied | CNAME `78d3e58f-...cfargotunnel.com` proxied | +| community.azcomputerguru.com | A 72.194.62.5 proxied | CNAME `78d3e58f-...cfargotunnel.com` proxied | +| radio.azcomputerguru.com | A 72.194.62.5 proxied | CNAME `78d3e58f-...cfargotunnel.com` proxied | + +Note: `azcomputerguru.com` was `proxied=False` before the cutover (record ID `c865ce7849e3567383433d74e5845f99`). That's odd — it was serving through CF (as evidenced by the 521 responses which only CF serves) but the A record flag was False. Possibly via www CNAME + CF magic. Replaced with a proper proxied CNAME. 
+ +### Paths this session + +- Local: `D:\claudetools\clients\internal-infrastructure\` (new target after reorg) +- Local (old, removed): `D:\claudetools\clients\ix-server\` +- Local scripts: `D:\claudetools\projects\dataforth-dos\datasheet-pipeline\implementation\jupiter_tunnel_*.py` (should eventually move; they're tunnel-setup helpers, not Dataforth) +- Jupiter: `/mnt/cache/appdata/cloudflared/` (tunnel config/cert) +- IX: No changes persisted (`cloudflared` briefly installed via dnf then removed; `/root/.cloudflared/` deleted) + +--- + +## Commands & Outputs + +### Diagnostic cascade (definitive answer) + +From pfSense (172.16.0.1): +``` +$ for ip in 104.16.0.1 104.17.0.1 104.26.0.1 162.158.0.1 162.158.100.1 172.64.0.1 172.67.0.1 173.245.48.1 141.101.64.1; do + printf "%-16s " $ip; nc -z -v -w 2 $ip 443 2>&1 | head -1 + done +104.16.0.1 OK Connection succeeded +104.17.0.1 OK Connection succeeded +104.26.0.1 OK Connection succeeded +162.158.0.1 FAIL Operation timed out +162.158.100.1 FAIL Operation timed out +172.64.0.1 FAIL Operation timed out +172.67.0.1 FAIL Operation timed out +173.245.48.1 FAIL Operation timed out +141.101.64.1 FAIL Operation timed out + +$ pfctl -s states | grep "172.16.3.10:443" | wc -l +285 # non-CF users reaching origin fine + +$ pfctl -s states | egrep "^[^|]*(104\.(2[6-9])|162\.(158|159)|172\.(64|67))" | head +# 0 results for 162.158.x inbound; 162.159.x outbound-only (initiated from LAN) +``` + +### Tunnel completion (final state) + +``` +=== [2] create tunnel acg-origin === +Created tunnel acg-origin with id 78d3e58f-1979-4f0e-a28b-98d6b3c3d867 + +=== [4] DNS cutover (A -> CNAME) === + [azcomputerguru.com] current: type=A content=72.194.62.5 proxied=False id=c865ce7849e3567383433d74e5845f99 + [OK] -> CNAME 78d3e58f-1979-4f0e-a28b-98d6b3c3d867.cfargotunnel.com proxied + [analytics.azcomputerguru.com] ... [OK] + [community.azcomputerguru.com] ... [OK] + [radio.azcomputerguru.com] ... 
[OK] + +=== [6] wait for tunnel connections === + [try 14] connections registered: 4 + +=== after HTTPS backend switch === + azcomputerguru.com: HTTP 200 Server=cloudflare + analytics.azcomputerguru.com: HTTP 200 Server=cloudflare + community.azcomputerguru.com: HTTP 200 Server=cloudflare + radio.azcomputerguru.com: HTTP 200 Server=cloudflare +``` + +### Cloudflare auth URLs issued (4 rounds before success) + +Only the final one mattered — fresh container after chown fix: +``` +https://dash.cloudflare.com/argotunnel?aud=&callback=https%3A%2F%2Flogin.cloudflareaccess.org%2F7RFAWDCIvWpHtiq0TsoMGEjV9zALX0xwmy1HZssO7mk%3D +``` + +--- + +## Configuration Changes + +### On Jupiter (172.16.3.20) + +**New:** `/mnt/cache/appdata/cloudflared/config.yml` +```yaml +tunnel: 78d3e58f-1979-4f0e-a28b-98d6b3c3d867 +credentials-file: /home/nonroot/.cloudflared/78d3e58f-1979-4f0e-a28b-98d6b3c3d867.json +ingress: + - hostname: azcomputerguru.com + service: https://172.16.3.10:443 + originRequest: + originServerName: azcomputerguru.com + noTLSVerify: true + - hostname: analytics.azcomputerguru.com + service: https://172.16.3.10:443 + originRequest: + originServerName: analytics.azcomputerguru.com + noTLSVerify: true + - hostname: community.azcomputerguru.com + service: https://172.16.3.10:443 + originRequest: + originServerName: community.azcomputerguru.com + noTLSVerify: true + - hostname: radio.azcomputerguru.com + service: https://172.16.3.10:443 + originRequest: + originServerName: radio.azcomputerguru.com + noTLSVerify: true + - service: http_status:404 +``` + +**New container:** `cloudflared` (auto-restart via `--restart=unless-stopped`). 
Run command: +``` +docker run -d --name cloudflared --restart=unless-stopped \ + -v /mnt/cache/appdata/cloudflared:/home/nonroot/.cloudflared \ + cloudflare/cloudflared:latest \ + tunnel --config /home/nonroot/.cloudflared/config.yml run +``` + +### Repo reorganization + +| Action | From | To | +|---|---|---| +| Moved | `projects/dataforth-dos/datasheet-pipeline/implementation/cox-bgp-ticket-draft.md` | `clients/internal-infrastructure/vendor-tickets/2026-04-13-cox-bgp-cloudflare-routing.md` | +| Moved | `clients/ix-server/session-logs/2026-03-16-ix-account-cleanup.md` | `clients/internal-infrastructure/session-logs/` | +| Moved | `clients/ix-server/session-logs/2026-04-11-smart-slider-security-scan.md` | `clients/internal-infrastructure/session-logs/` | +| Removed | `clients/ix-server/` (empty after moves) | — | +| Edited | `session-logs/2026-04-11-session.md` | 3x `clients/ix-server/` → `clients/internal-infrastructure/` | +| Edited | `clients/internal-infrastructure/session-logs/2026-04-11-smart-slider-security-scan.md` | 1x path update | + +Scripts in `projects/dataforth-dos/datasheet-pipeline/implementation/` relevant to tunnel setup but not yet moved (next session decision): +- `jupiter_tunnel_login5.py`, `jupiter_tunnel_login4.py`, `jupiter_tunnel_login3.py`, `jupiter_tunnel_login2.py`, `jupiter_tunnel_login.py` (multiple login attempts, keep only the detached one) +- `jupiter_tunnel_complete.py` — the one that did the full cutover +- `jupiter_tunnel_fix_https.py` — the HTTPS backend switchover +- `ix_install_cloudflared.py`, `ix_tunnel_login.py` (IX-side, abandoned) +- `cf_analytics.py` — GraphQL probe (showed analytics.read permission missing) +- `pfsense_diag.py`, `pfsense_diag2.py`, `pfsense_trace.py` — the diagnostic cascade +- `cox-bgp-ticket-draft.md` — already moved + +--- + +## Pending / Incomplete / Open Items + +### Action items for user + +1. 
**Submit Cox BGP ticket** (file ready at `clients/internal-infrastructure/vendor-tickets/2026-04-13-cox-bgp-cloudflare-routing.md`). Fixing their routing is the permanent root-cause fix; until then the tunnel is the mitigation. No SLA for this. + +2. **Populate Cloudflare token in SOPS vault.** Currently `services/cloudflare.sops.yaml` has metadata only — no `credentials:` block. Token values live in 1Password. For pipeline automation it would be nicer to have them in SOPS like everything else: + ``` + bash D:/vault/scripts/vault.sh edit services/cloudflare.sops.yaml + # add credentials: { api_token_full_dns: DRRGkHS33pxAUjQfRDzDeVPtt6wwUU6FwtXqOzNj, api_token_legacy: U1UTbBOWA4a69eWEBiqIbYh0etCGzrpTU4XaKp7w, dns_zone_id: 1beb9917c22b54be32e5215df2c227ce } + ``` + +3. **Consider expanding tunnel ingress to cover more proxied hostnames** (if Cox BGP stays broken, other proxied hostnames would intermittently 521 too): + - `plex.azcomputerguru.com` → 72.194.62.4 (Jupiter NPM) — could route through tunnel to `https://172.16.3.20:18443` (NPM is already on Jupiter, could bypass public IP entirely) + - `plexrequest.azcomputerguru.com`, `rustdesk.`, `sync.`, `secure.`, `backups.`, `enterpriseenrollment.`, `enterpriseregistration.`, `info.`, `mail.`, `store.`, `ui.` — most are external-proxied CNAMEs, don't need tunnel; a few to Jupiter (.4) could benefit + - Not urgent unless 521 recurs on one of them + +4. **Script cleanup** — move tunnel-setup helper scripts out of `projects/dataforth-dos/datasheet-pipeline/implementation/` (wrong project). Candidate targets: `clients/internal-infrastructure/scripts/cloudflared/` or similar. Not touched today. + +5. **Commit this work** — the tunnel DNS changes are already live. Local file changes (moves, log, ticket draft) not yet committed. 
+ +### Vault hygiene (from earlier today, still pending) + +- `clients/dataforth/ad2.sops.yaml`: stale shell-escape backslash in `credentials.password` (stores `Paper123\!@#`; real is `Paper123!@#`). + +### Dataforth follow-ups (unrelated to today but still open) + +- Verify `C:\Shares\test\scripts\Sync-FromNAS-rsync.ps1` includes the `VASLOG - Engineering Tested` subfolder for ongoing Engineering-tested .txt ingestion. + +--- + +## Reference Information + +### Cloudflare Tunnel management + +To view logs: +``` +ssh root@172.16.3.20 'docker logs cloudflared --tail 30' +``` + +To list tunnels: +``` +docker run --rm -v /mnt/cache/appdata/cloudflared:/home/nonroot/.cloudflared cloudflare/cloudflared:latest tunnel list +``` + +To restart after config change: +``` +docker restart cloudflared +# or stop + start for a fresh container state +``` + +To rotate the tunnel (delete + recreate): +``` +docker run --rm -v /mnt/cache/appdata/cloudflared:/home/nonroot/.cloudflared cloudflare/cloudflared:latest tunnel delete -f acg-origin +# then re-run create + config steps +``` + +### Cloudflare API one-liners + +List DNS records for a hostname: +``` +curl -H "Authorization: Bearer $CF_TOKEN" "https://api.cloudflare.com/client/v4/zones/$ZONE/dns_records?name=azcomputerguru.com" +``` + +Quick site probe: +``` +curl -sI -A "Mozilla/5.0 Chrome/120.0" https://azcomputerguru.com/ +# Expect: HTTP/1.1 200 OK Server=cloudflare +``` + +### Useful paths and ports + +| Resource | Value | +|---|---| +| Jupiter appdata | `/mnt/cache/appdata/cloudflared/` | +| IX internal | `http://172.16.3.10:80`, `https://172.16.3.10:443` | +| pfSense SSH | `ssh admin@172.16.0.1 -p 2248` | +| Cloudflare API base | `https://api.cloudflare.com/client/v4/zones/1beb9917c22b54be32e5215df2c227ce` | + +### Cloudflare-IP prefix status (as of 2026-04-13 ~08:30) + +| Prefix | Route via Cox | TCP:443 from pfSense | +|---|---|---| +| 104.16.0.0/13 | local/short path | **OK** | +| 104.24.0.0/14 | local/short path | **OK** 
| +| 162.158.0.0/16 | distant/broken | **FAIL (timeout)** | +| 172.64.0.0/13 | distant/broken | **FAIL (timeout)** | +| 173.245.48.0/20 | distant/broken | **FAIL (timeout)** | +| 141.101.64.0/18 | distant/broken | **FAIL (timeout)** | + +--- + +## Related Logs + +- Earlier today: `projects/dataforth-dos/session-logs/2026-04-12-session.md` (SCMVAS deploy finish + git merge conflict resolution) +- Earlier related: `session-logs/2026-04-06-session.md` (ScreenConnect redirect + UniFi OS VM) — shows public IP block context +- Earlier related: `clients/internal-infrastructure/session-logs/2026-04-11-smart-slider-security-scan.md` (IX WP audit, originally at `clients/ix-server/`) +- Remote (pulled today): commit `499fd5d` "Session log: Gitea recovery (Jupiter cache full)" — explains earlier intermittent Gitea 502s and Jupiter cache pressure seen today + +--- + +**Last Updated:** 2026-04-13 +**Next Actions:** submit Cox ticket; consider populating Cloudflare vault entry; monitor tunnel for 24h; cleanup misplaced helper scripts. 
diff --git a/clients/internal-infrastructure/vendor-tickets/2026-04-13-cox-bgp-cloudflare-routing.md b/clients/internal-infrastructure/vendor-tickets/2026-04-13-cox-bgp-cloudflare-routing.md new file mode 100644 index 0000000..79e1fb4 --- /dev/null +++ b/clients/internal-infrastructure/vendor-tickets/2026-04-13-cox-bgp-cloudflare-routing.md @@ -0,0 +1,88 @@ +# Cox Business BGP / Routing Escalation Ticket — Draft + +**Account / Service:** Mike Swanson, AZ Computer Guru — business static-IP block 72.194.62.0/29 +**WAN / upstream:** Cox Business, Tucson AZ (or wherever applicable) +**Circuit public IP (pfSense WAN):** 98.181.90.163 +**Destination affected public IPs:** 72.194.62.2, .3, .4, .5, .8, .9, .10 + +--- + +## Subject + +Asymmetric/unreachable routing from Cox customer block 72.194.62.0/29 to specific Cloudflare IP prefixes (162.158.0.0/16, 172.64.0.0/13, 173.245.48.0/20, 141.101.64.0/18) + +## Summary + +The Cloudflare PoP in Phoenix (PHX) cannot establish TCP connections to our public IPs (72.194.62.2-.10) for origin-pull requests. HTTP requests from public clients reaching Cloudflare get a 521 "web server is down" response, because Cloudflare's origin-pull source prefixes cannot complete TCP handshakes to our netblock. + +## Evidence + +### 1. 
Our WAN firewall can reach only some of Cloudflare's IP ranges (2 of the 6 prefixes tested), not the others + +From our pfSense firewall (FreeBSD, 2.8.1), TCP connect test to port 443 on representative IPs in each Cloudflare-advertised prefix: + +| Cloudflare Prefix | Sample IP | TCP:443 connect | +|---|---|---| +| 104.16.0.0/13 | 104.16.0.1 | succeeds | +| 104.16.0.0/13 | 104.17.0.1 | succeeds | +| 104.24.0.0/14 | 104.26.0.1 | succeeds | +| 162.158.0.0/16 | 162.158.0.1 | **timeout** | +| 162.158.0.0/16 | 162.158.100.1 | **timeout** | +| 172.64.0.0/13 | 172.64.0.1 | **timeout** | +| 172.64.0.0/13 | 172.67.0.1 | **timeout** | +| 173.245.48.0/20 | 173.245.48.1 | **timeout** | +| 141.101.64.0/18 | 141.101.64.1 | **timeout** | + +Reference: Cloudflare publishes its IPv4 prefix list at https://www.cloudflare.com/ips-v4 + +### 2. ICMP traceroute to failing Cloudflare prefixes reveals an unusually indirect path + +Traceroute from pfSense WAN (98.181.90.163) to 162.158.0.1 — 8 hops, ~173 ms (suggests routing via a distant peering point): + +``` +1 * * * +2 100.120.164.200 3.236 ms +3 68.1.0.191 4.180 ms +4 184.183.131.9 23.671 ms +5 198.41.140.124 14.635 ms +6 198.41.140.244 161.626 ms <- huge latency jump (likely cross-country) +7 108.162.247.54 163.073 ms +8 162.158.0.1 173.018 ms +``` + +Compare to traceroute to 104.26.8.237 (in the working prefix 104.24.0.0/14) — 6 hops, ~3.4 ms: + +``` +1 * * * +2 100.120.164.200 3.022 ms +3 68.1.0.191 3.799 ms +4 184.183.131.9 8.973 ms +5 162.158.140.21 3.909 ms <- nearby Cloudflare peering +6 104.26.8.237 3.445 ms +``` + +The ~173 ms round-trip to 162.158.0.1 vs ~3.4 ms to 104.26.8.237 (roughly 170 ms added) suggests routes for 162.158.0.0/16, 172.64.0.0/13, 173.245.48.0/20 and 141.101.64.0/18 are being withdrawn from the local peering and defaulting to a distant one (Ashburn or similar), with packet loss or asymmetric return on that path. + +### 3. 
Direct-internet users reach our origin fine; only Cloudflare-proxied traffic fails + +Our state table currently shows 285 active inbound :443 connections to our origin server from various non-Cloudflare IPs (Philippines, Russia, India, Pakistan users — direct clients). There are zero inbound connections from any Cloudflare prefix. The origin is healthy; the problem is specifically the return path to Cloudflare's origin-pull source IPs. + +### 4. Third-party test confirms routing is not symmetric + +From an external network (different ISP egress), connecting to our public IP 72.194.62.5 on port 443 with correct SNI succeeds with HTTP 200. Taken together with the timeouts above, this shows our origin and firewall accept inbound connections normally from other networks, so the failure is specific to the path between our block and the affected Cloudflare prefixes. + +## Ask + +Please have network engineering check the BGP advertisements and/or routing policy for: + +- Cloudflare prefixes **162.158.0.0/16**, **172.64.0.0/13**, **173.245.48.0/20**, **141.101.64.0/18** +- Return path from our block **72.194.62.0/29** to those Cloudflare prefixes + +It appears these prefixes are being routed through a distant Cox peering point rather than the nearby Cloudflare peering (visible at hop 5 on the working route), and the return path is either black-holed or lossy enough to drop TCP handshakes. + +Contact: Mike Swanson, AZ Computer Guru +Timeline: urgent — hosted sites (azcomputerguru.com, analytics., community., radio.) are intermittently unreachable to any visitor whose nearest Cloudflare PoP chooses an origin-pull source in one of the affected prefixes. + +## Workaround in place + +We have set up a Cloudflare Tunnel from inside our network outbound to Cloudflare (initiated from our side using working prefixes), so the customer-visible outage is mitigated. Resolution of the underlying BGP issue is still required for any direct-proxied traffic and general Cox–Cloudflare connectivity health. 
diff --git a/session-logs/2026-04-11-session.md b/session-logs/2026-04-11-session.md index d24f725..6007de2 100644 --- a/session-logs/2026-04-11-session.md +++ b/session-logs/2026-04-11-session.md @@ -139,7 +139,7 @@ - Smart Slider 3 FREE: 3 (SAFE) **Security Report** -- File: `clients/ix-server/session-logs/2026-04-11-smart-slider-security-scan.md` +- File: `clients/internal-infrastructure/session-logs/2026-04-11-smart-slider-security-scan.md` - Comprehensive security audit documentation - Risk assessment: LOW - Sites with Smart Slider FREE: @@ -295,7 +295,7 @@ projects/radio-show/episodes/2026-04-11-hidden-price-tags/show-prep.html projects/radio-show/episodes/2026-04-18-tech-that-makes-life-fun/show-prep.md projects/radio-show/episodes/2026-04-18-tech-that-makes-life-fun/show-prep.html temp/scan_smart_slider.sh -clients/ix-server/session-logs/2026-04-11-smart-slider-security-scan.md +clients/internal-infrastructure/session-logs/2026-04-11-smart-slider-security-scan.md session-logs/2026-04-11-session.md ``` @@ -401,7 +401,7 @@ All files created in this session should be committed to version control: 4. `projects/radio-show/episodes/2026-04-18-tech-that-makes-life-fun/show-prep.md` 5. `projects/radio-show/episodes/2026-04-18-tech-that-makes-life-fun/show-prep.html` 6. `temp/scan_smart_slider.sh` -7. `clients/ix-server/session-logs/2026-04-11-smart-slider-security-scan.md` +7. `clients/internal-infrastructure/session-logs/2026-04-11-smart-slider-security-scan.md` 8. `session-logs/2026-04-11-session.md` (this file) **Commit Message**: "Session log: Radio show prep (3 weeks), IX security scan, network scanning"