diff --git a/.claude/gururmm-tunnel-plan.md b/.claude/gururmm-tunnel-plan.md new file mode 100644 index 0000000..ec3ede4 --- /dev/null +++ b/.claude/gururmm-tunnel-plan.md @@ -0,0 +1,396 @@ +# GuruRMM Real-Time Tunnel Implementation Plan + +## Overview + +Transform GuruRMM agents from periodic check-in mode (30-second heartbeats) to persistent tunnel mode, enabling Claude Code on tech workstation to execute commands on remote machines through secure multiplexed channels. + +--- + +## Architecture Summary + +### Current State (Confirmed via exploration) +- **Server:** Axum 0.7 @ 172.16.3.30:3001, WebSocket endpoint, AgentConnections HashMap +- **Agent:** Tokio async, 30-second heartbeat confirmed, 3 concurrent tasks (metrics/network/heartbeat) +- **Protocol:** Tagged JSON enums (ServerMessage/AgentMessage) with serde + +### Key Architectural Decisions + +1. **Tunnel Lifecycle:** Hybrid - WebSocket stays persistent, tunnel mode is operational state change + - Agent modes: Heartbeat (default) ↔ Tunnel (active session) + - One tunnel per agent, on-demand activation, instant mode switching + +2. **Channel Multiplexing:** Unified protocol with channel_id routing + - Single WebSocket, multiple logical channels + - Enables concurrent operations (multiple terminals, simultaneous file transfers) + - Channel types: Terminal, FileRead, FileWrite, FileList, Registry, Services + +3. **Claude Integration:** Custom MCP server + - Tools: `gururmm_run_command`, `gururmm_read_file`, `gururmm_write_file`, `gururmm_list_directory`, `gururmm_list_agents` + - JWT authentication via environment variable + - Auto-manages tunnel sessions (open on first use, keep-alive, close on idle) + +4. 
**Security:** Three-layer model + - Layer 1: JWT authentication (24h expiration) + - Layer 2: Session authorization (tech_sessions table, 4h inactivity timeout) + - Layer 3: Command validation (working directory allowlist, rate limiting 100/min, audit logging) + +--- + +## Protocol Extensions + +### New Message Types + +```rust +// Server → Agent +enum ServerMessage { + // ... existing ... + TunnelOpen { session_id: String, tech_id: i32 }, + TunnelClose { session_id: String }, + TunnelData { channel_id: String, data: TunnelDataPayload }, +} + +// Agent → Server +enum AgentMessage { + // ... existing ... + TunnelReady { session_id: String }, + TunnelData { channel_id: String, data: TunnelDataPayload }, + TunnelError { channel_id: String, error: String }, +} + +enum TunnelDataPayload { + Terminal { command: String }, + TerminalOutput { stdout: String, stderr: String, exit_code: Option }, + FileRead { path: String }, + FileContent { content: Vec, mime_type: String }, + FileWrite { path: String, content: Vec }, + FileList { path: String }, + FileListResult { entries: Vec }, +} +``` + +### Agent Mode State Machine + +```rust +enum AgentMode { + Heartbeat, // Default: 30s heartbeats, metrics, network monitoring + Tunnel { + session_id: String, + tech_id: i32, + channels: HashMap, + }, +} +``` + +--- + +## Implementation Phases + +### Phase 1: Core Tunnel Infrastructure (Week 1) +**Goal:** Establish tunnel mode switching and channel routing + +**Server:** +- Add TunnelOpen/TunnelClose/TunnelData to ServerMessage enum +- Create tech_sessions table (id, session_id, tech_id, agent_id, opened_at, last_activity, status) +- Implement endpoints: POST /api/v1/tunnel/open, POST /close, GET /status/:session_id +- Add channel routing in WebSocket handler (route by channel_id) +- Session validation middleware (JWT + ownership check) + +**Agent:** +- Add TunnelReady/TunnelData/TunnelError to AgentMessage enum +- Implement AgentMode state machine +- Add channel manager (HashMap) +- 
Handle TunnelOpen → respond TunnelReady +- Handle TunnelClose → cleanup channels, return to heartbeat mode + +**Critical Files:** +- `server/src/ws/mod.rs` - WebSocket handler, protocol definitions +- `server/src/routes/tunnel.rs` - NEW: Tunnel API endpoints +- `server/src/middleware/auth.rs` - Session validation +- `agent/src/transport/websocket.rs` - WebSocket client, protocol handling +- `agent/src/tunnel/mod.rs` - NEW: Tunnel mode manager +- `migrations/XXX_create_tech_sessions.sql` - NEW: Database schema + +### Phase 2: Terminal Channel (Week 2) +**Goal:** Execute PowerShell/cmd/bash commands through tunnel + +**Implementation:** +- Create TerminalChannel handler on agent (spawn child process, capture streams) +- Implement TunnelDataPayload::Terminal on server +- Working directory validation on agent (configurable allowlist) +- Command result streaming for long-running commands +- Endpoint: POST /api/v1/tunnel/:session_id/command + +**Critical Files:** +- `agent/src/tunnel/terminal.rs` - NEW: Terminal channel handler +- `server/src/routes/tunnel.rs` - Add command execution endpoint +- `agent/config.toml` - Add allowed_paths configuration + +### Phase 3: File Operations (Week 3) +**Goal:** Read, write, list files through tunnel + +**Implementation:** +- Create FileChannel handler on agent +- Chunked transfer for files > 1MB (transfer_id tracking) +- Base64 encoding for binary data +- MIME type detection (magic numbers) +- Endpoints: GET /file, PUT /file, POST /file/list + +**Critical Files:** +- `agent/src/tunnel/file.rs` - NEW: File channel handler +- `server/src/routes/tunnel.rs` - Add file operation endpoints +- `common/src/transfer.rs` - NEW: Chunked transfer utilities + +### Phase 4: MCP Server Integration (Week 4) +**Goal:** Expose tunnel operations as MCP tools for Claude Code + +**Implementation:** +- Create new project: `gururmm-mcp-server` (Rust) +- Use `mcp-server-rs` crate +- Implement 5 core tools (run_command, read_file, write_file, list_dir, 
list_agents)
+- JWT token from environment variable (GURURMM_AUTH_TOKEN)
+- Auto-manage tunnel sessions (open on first tool use, 5min idle timeout)
+
+**Critical Files:**
+- `mcp-server/src/main.rs` - NEW: MCP server entry point
+- `mcp-server/src/tools.rs` - NEW: Tool implementations
+- `mcp-server/src/session.rs` - NEW: Session manager
+- `mcp-server/Cargo.toml` - NEW: Dependencies
+
+**MCP Config Example:**
+```json
+{
+  "mcpServers": {
+    "gururmm": {
+      "command": "gururmm-mcp-server",
+      "env": {
+        "GURURMM_API_URL": "http://172.16.3.30:3001",
+        "GURURMM_AUTH_TOKEN": "jwt-token-here"
+      }
+    }
+  }
+}
+```
+
+### Phase 5: Advanced Features (Week 5+)
+- Registry operations (Windows winreg crate)
+- Service management (sc.exe/WMI on Windows, systemctl on Linux)
+- Interactive terminal with PTY (stretch goal)
+
+---
+
+## Database Schema
+
+```sql
+CREATE TABLE tech_sessions (
+    id SERIAL PRIMARY KEY,
+    session_id VARCHAR(36) UNIQUE NOT NULL,
+    tech_id INTEGER NOT NULL REFERENCES techs(id),
+    agent_id INTEGER NOT NULL REFERENCES agents(id),
+    opened_at TIMESTAMP NOT NULL DEFAULT NOW(),
+    last_activity TIMESTAMP NOT NULL DEFAULT NOW(),
+    closed_at TIMESTAMP,
+    status VARCHAR(20) NOT NULL DEFAULT 'active'
+);
+
+CREATE TABLE tunnel_audit (
+    id SERIAL PRIMARY KEY,
+    session_id VARCHAR(36) NOT NULL REFERENCES tech_sessions(session_id),
+    channel_id VARCHAR(36) NOT NULL,
+    operation VARCHAR(50) NOT NULL,
+    details JSONB,
+    created_at TIMESTAMP NOT NULL DEFAULT NOW()
+);
+
+CREATE UNIQUE INDEX unique_active_session ON tech_sessions(tech_id, agent_id) WHERE status = 'active'; -- partial index: a UNIQUE table constraint cannot take WHERE
+CREATE INDEX idx_tech_sessions_tech ON tech_sessions(tech_id);
+CREATE INDEX idx_tech_sessions_agent ON tech_sessions(agent_id);
+CREATE INDEX idx_tunnel_audit_session ON tunnel_audit(session_id);
+```
+
+---
+
+## API Endpoints (New)
+
+```
+POST /api/v1/tunnel/open
+  Body: { "agent_id": 123 }
+  Response: { "session_id": "uuid", "status": "active" }
+
+POST /api/v1/tunnel/close
+  Body: { "session_id": "uuid" }
+
+GET 
/api/v1/tunnel/status/:session_id + +POST /api/v1/tunnel/:session_id/command + Body: { "command": "...", "shell": "powershell", "working_dir": "...", "timeout": 30000 } + +GET /api/v1/tunnel/:session_id/file?path=... + +PUT /api/v1/tunnel/:session_id/file?path=... + +POST /api/v1/tunnel/:session_id/file/list?path=... +``` + +--- + +## MCP Tools + +``` +gururmm_run_command(agent_id, command, shell, working_dir, timeout) +gururmm_read_file(agent_id, path) +gururmm_write_file(agent_id, path, content) +gururmm_list_directory(agent_id, path) +gururmm_list_agents() +``` + +--- + +## Security Implementation + +### Working Directory Validation +```toml +# agent/config.toml +[security] +allowed_paths = ["C:\\Shares", "C:\\Temp"] +``` + +Agent validates all file operations against allowlist, rejects path traversal (`..`). + +### Rate Limiting +- Server enforces: 100 commands per minute per tech per agent +- Sliding window (in-memory or Redis) +- 429 response on limit exceeded +- Violations logged to tunnel_audit + +### Command Injection Prevention +- tokio::process::Command (no shell expansion) +- PowerShell: `-NoProfile -NonInteractive -Command` +- Input sanitization (escape quotes, reject backticks) +- Timeout enforcement + +### Session Security +- JWT 24h expiration +- Sessions auto-expire 4h inactivity +- One tunnel per agent (prevents concurrent session conflicts) +- Admin force-close endpoint + +--- + +## Testing Strategy + +### Unit Tests +- Channel routing (correct channel receives message) +- Session validation (JWT + ownership) +- Command sanitization +- Path validation (traversal prevention) + +### Integration Tests +- Full tunnel lifecycle (open → command → close) +- Concurrent sessions to different agents +- Session timeout enforcement +- Rate limiting + +### End-to-End Tests +- Claude Code MCP integration +- File upload via MCP, verify on agent +- Multi-step workflow (read file → modify → write back) + +--- + +## Rollout Plan + +1. 
**Week 5:** Internal testing (2 agents: AD2, DESKTOP-0O8A1RL) +2. **Week 6:** Beta release (3 power user techs) +3. **Week 7:** General availability (all techs, documentation, training) + +--- + +## Success Metrics + +**Infrastructure (Phase 1-2):** +- 95% tunnel open success rate +- <500ms command response time +- Zero session conflicts + +**MCP Integration (Phase 3-4):** +- 80% tech adoption within 2 weeks +- >50 tunnel sessions/day +- <5% command error rate + +**Long-term:** +- 20% reduction in RDP sessions +- 90% tech satisfaction +- <1% security incidents + +--- + +## Risks and Mitigations + +| Risk | Impact | Mitigation | +|------|--------|------------| +| Command injection | Critical | Input sanitization, no shell expansion, path allowlist | +| Session hijacking | High | Short-lived JWT, session ownership validation, audit logging | +| WebSocket instability | Medium | Auto-reconnect, session recovery | +| Rate limiting too strict | Medium | Configurable per-tech limits, user feedback | + +--- + +## Open Questions + +1. Registry operations scope (full access or specific hives only)? +2. Interactive terminal priority (defer to Phase 6)? +3. Multi-tech sessions for pair programming? +4. MCP server credential manager integration (1Password)? +5. Agent-side logging requirements (compliance)? 
+ +--- + +## Verification Plan + +### Phase 1 Verification +```bash +# Tech opens tunnel session +curl -X POST http://172.16.3.30:3001/api/v1/tunnel/open \ + -H "Authorization: Bearer $JWT" -H "Content-Type: application/json" \ + -d '{"agent_id": 1}' +# Response: {"session_id": "uuid", "status": "active"} + +# Check agent logs - should show: "Tunnel mode activated for session uuid" +# Check database: SELECT * FROM tech_sessions WHERE session_id = 'uuid'; +``` + +### Phase 2 Verification +```bash +# Execute command via tunnel +curl -X POST http://172.16.3.30:3001/api/v1/tunnel/$SESSION_ID/command \ + -H "Authorization: Bearer $JWT" -H "Content-Type: application/json" \ + -d '{"command": "Get-Date", "shell": "powershell"}' +# Response: {"stdout": "Monday, April 13, 2026...", "exit_code": 0} +``` + +### Phase 4 Verification (MCP) +```bash +# Configure MCP server in Claude Code +# Test tools appear in Claude's tool list +# Execute: "List files in C:\Shares on agent ID 1" +# Claude should call gururmm_list_directory tool +# Verify output shows directory listing +``` + +--- + +## Next Steps After Approval + +1. Create feature branch: `feature/real-time-tunnel` +2. Phase 1 database migrations (tech_sessions, tunnel_audit tables) +3. Update protocol enums (ServerMessage/AgentMessage) +4. Implement tunnel open/close endpoints +5. Update agent WebSocket handler for tunnel mode +6. Unit tests for session validation +7. 
Deploy to test environment + +**Estimated Timeline:** 5 weeks to MCP integration, 7 weeks to GA + +--- + +**Detailed plan location:** `projects/msp-tools/guru-rmm/plans/real-time-tunnel-architecture.md` diff --git a/projects/msp-tools/guru-rmm/plans/real-time-tunnel-architecture.md b/projects/msp-tools/guru-rmm/plans/real-time-tunnel-architecture.md new file mode 100644 index 0000000..15638c9 --- /dev/null +++ b/projects/msp-tools/guru-rmm/plans/real-time-tunnel-architecture.md @@ -0,0 +1,674 @@ +# GuruRMM Real-Time Tunnel Architecture Plan + +**Date:** 2026-04-13 +**Status:** DRAFT - Pending approval +**Goal:** Enable Claude Code on tech workstation to execute commands on remote machines through secure tunnel + +--- + +## Executive Summary + +This plan designs a real-time tunnel feature that transforms GuruRMM agents from periodic check-in mode (30-second heartbeats) to persistent tunnel mode when a tech opens a background session. The tunnel will support multiplexed channels for terminal access, filesystem operations, registry editor, and services management, accessible to Claude Code running on the tech's workstation. 
+ +--- + +## Current Architecture (Discovered) + +### Server (172.16.3.30:3001) +- **Framework:** Axum 0.7 with Tokio async runtime +- **WebSocket endpoint:** wss://rmm-api.azcomputerguru.com/ws +- **Connection registry:** `AgentConnections` HashMap tracking active WebSocket connections +- **Message routing:** mpsc channels with dual-channel pattern (protocol messages + WebSocket Pong frames) +- **Protocol:** Tagged JSON enums with serde (ServerMessage/AgentMessage) + +### Agent +- **Runtime:** Tokio async with multiple concurrent tasks +- **Heartbeat interval:** 30 seconds (confirmed in code) +- **Concurrent tasks:** 3 sender tasks (metrics: 60s, network: 30s, heartbeat: 30s) +- **Inactivity timeout:** 90 seconds +- **Reconnect backoff:** 10 seconds + +### Existing Protocol +```rust +// Server → Agent +enum ServerMessage { + AuthAck(AuthAckPayload), + Command(CommandPayload), + ConfigUpdate(serde_json::Value), + Update(UpdatePayload), + Ack { message_id: Option }, + Error { code: String, message: String }, +} + +// Agent → Server +enum AgentMessage { + Auth(AuthPayload), + Heartbeat, + CommandResult(CommandResultPayload), + MetricsData(MetricsPayload), + NetworkData(NetworkPayload), +} +``` + +--- + +## Architectural Decisions + +### 1. Tunnel Lifecycle: On-Demand with Persistent Connection + +**Decision:** Hybrid approach - WebSocket stays persistent, tunnel mode is a state change + +**Rationale:** +- Existing architecture already maintains persistent WebSocket connections +- Heartbeat mode and tunnel mode are operational states, not connection states +- On-demand tunnel activation avoids resource waste +- Persistent WebSocket enables instant mode switching + +**Implementation:** +```rust +enum AgentMode { + Heartbeat, // Default: 30-second heartbeats, metrics, network monitoring + Tunnel { // Active session mode + session_id: String, + tech_id: i32, + channels: HashMap, + }, +} +``` + +### 2. 
Channel Multiplexing: Unified Protocol with Channel ID Routing + +**Decision:** Single WebSocket, multiple logical channels, channel_id field for routing + +**Rationale:** +- Maintains single WebSocket connection (simpler firewall rules, NAT traversal) +- Channel IDs enable concurrent operations (multiple terminals, simultaneous file transfers) +- Fits naturally into existing tagged enum protocol +- Allows adding new channel types without protocol changes + +**Protocol Extension:** +```rust +// New message types +enum ServerMessage { + // ... existing messages ... + TunnelOpen { session_id: String, tech_id: i32 }, + TunnelClose { session_id: String }, + TunnelData { channel_id: String, data: TunnelDataPayload }, +} + +enum AgentMessage { + // ... existing messages ... + TunnelReady { session_id: String }, + TunnelData { channel_id: String, data: TunnelDataPayload }, + TunnelError { channel_id: String, error: String }, +} + +#[serde(tag = "type", content = "payload")] +enum TunnelDataPayload { + Terminal { command: String }, + TerminalOutput { stdout: String, stderr: String, exit_code: Option }, + FileRead { path: String }, + FileContent { content: Vec, mime_type: String }, + FileWrite { path: String, content: Vec }, + FileList { path: String }, + FileListResult { entries: Vec }, + RegistryRead { path: String, value_name: Option }, + RegistryWrite { path: String, value_name: String, value: RegistryValue }, + ServiceList, + ServiceControl { name: String, action: ServiceAction }, +} +``` + +### 3. 
Claude Integration: Custom MCP Server + +**Decision:** Build GuruRMM MCP server that provides remote execution tools + +**Rationale:** +- MCP is Claude's native integration protocol +- Provides fine-grained tool permissions (user can approve specific operations) +- Tools appear naturally in Claude's tool list +- Can reuse existing API authentication (JWT tokens) +- Server can enforce rate limiting and audit logging + +**MCP Tools:** +```typescript +// MCP Server tools +{ + "run_remote_command": { + "agent_id": "string", + "command": "string", + "shell": "powershell|cmd|bash", + "working_dir": "string", + "timeout": "number" + }, + "read_remote_file": { + "agent_id": "string", + "path": "string" + }, + "write_remote_file": { + "agent_id": "string", + "path": "string", + "content": "string" + }, + "list_remote_directory": { + "agent_id": "string", + "path": "string" + }, + "get_remote_services": { + "agent_id": "string", + "filter": "string" + }, + "control_remote_service": { + "agent_id": "string", + "service_name": "string", + "action": "start|stop|restart" + } +} +``` + +### 4. File Operations: Hybrid Approach + +**Decision:** Dedicated file endpoints for binary/large files, PowerShell for metadata + +**Rationale:** +- Binary files (executables, images) need raw byte transfer +- Text files and metadata operations can use PowerShell (simpler, reuses existing command execution) +- Chunked transfer for large files (prevents WebSocket message size limits) +- Base64 encoding for binary data over JSON protocol + +**Implementation:** +- Files < 1MB: Direct transfer via TunnelData.FileContent +- Files > 1MB: Chunked transfer with transfer_id for reassembly +- PowerShell used for: directory listings, file metadata, permissions, ACLs + +### 5. 
Security Model + +**Decision:** Three-layer security: JWT auth, session authorization, command validation + +**Layer 1: JWT Authentication** +- Tech authenticates to server with credentials +- Server issues JWT with tech_id, permissions, expiration +- MCP server includes JWT in all tunnel requests + +**Layer 2: Session Authorization** +- Database tracks: tech_sessions table (tech_id, agent_id, session_id, opened_at) +- Server validates: JWT valid + session exists + tech owns session +- Sessions auto-expire after 4 hours of inactivity + +**Layer 3: Command Validation** +- Agent-side working directory restrictions (configurable per agent) +- Server-side command sanitization (prevent injection) +- Rate limiting: 100 commands per minute per tech per agent +- Audit logging: All tunnel operations logged to database + +--- + +## Implementation Plan + +### Phase 1: Core Tunnel Infrastructure (Week 1) + +**Goal:** Establish tunnel mode switching and channel routing + +**Server changes:** +1. Add `TunnelOpen`, `TunnelClose`, `TunnelData` to ServerMessage enum +2. Create `tech_sessions` table (id, tech_id, agent_id, session_id, opened_at, last_activity, closed_at, status) +3. Implement tunnel session lifecycle endpoints: + - `POST /api/v1/tunnel/open` - Create session, send TunnelOpen to agent + - `POST /api/v1/tunnel/close` - Send TunnelClose, mark session closed (set `status` and `closed_at`; the row is kept for audit history) + - `GET /api/v1/tunnel/status/:session_id` - Check tunnel health +4. Add channel routing logic in WebSocket handler (route by channel_id) +5. Implement session validation middleware (JWT + session ownership) + +**Agent changes:** +1. Add `TunnelReady`, `TunnelData`, `TunnelError` to AgentMessage enum +2. Implement AgentMode state machine (Heartbeat ↔ Tunnel transitions) +3. Add channel manager (HashMap) +4. Respond to TunnelOpen with TunnelReady confirmation +5. 
Handle TunnelClose gracefully (cleanup channels, return to heartbeat mode) + +**Testing:** +- Tech can open tunnel session via API +- Agent switches to tunnel mode +- Agent returns to heartbeat mode when session closes +- Concurrent sessions rejected (one tunnel per agent) + +### Phase 2: Terminal Channel (Week 2) + +**Goal:** Execute PowerShell/cmd/bash commands through tunnel + +**Implementation:** +1. Create `TerminalChannel` handler on agent + - Spawn child process (powershell.exe, cmd.exe, or bash) + - Capture stdout/stderr streams + - Handle exit codes and timeouts +2. Implement TunnelDataPayload::Terminal on server +3. Add working directory validation on agent +4. Add command result streaming (chunked output for long-running commands) + +**API endpoint:** +``` +POST /api/v1/tunnel/:session_id/command +Body: { + "command": "Get-Process | Where-Object CPU -gt 10", + "shell": "powershell", + "working_dir": "C:\\Shares\\test", + "timeout": 30000 +} +Response: { + "stdout": "...", + "stderr": "...", + "exit_code": 0, + "duration_ms": 1234 +} +``` + +**Testing:** +- Execute simple PowerShell command (Get-Date) +- Execute long-running command (Sleep 10) +- Test timeout enforcement +- Verify working directory restriction +- Test concurrent commands (multiple channel IDs) + +### Phase 3: File Operations (Week 3) + +**Goal:** Read, write, list files through tunnel + +**Implementation:** +1. Create `FileChannel` handler on agent + - Read file: fs::read, base64 encode if binary + - Write file: base64 decode, fs::write with backup + - List directory: fs::read_dir with metadata +2. Implement chunked transfer for files > 1MB +3. Add MIME type detection (read first bytes, use magic numbers) +4. 
Implement transfer_id tracking for multi-chunk uploads/downloads + +**API endpoints:** +``` +GET /api/v1/tunnel/:session_id/file?path=C:\logs\app.log +PUT /api/v1/tunnel/:session_id/file?path=C:\config\app.json +POST /api/v1/tunnel/:session_id/file/list?path=C:\Shares +``` + +**Testing:** +- Read small text file (< 1KB) +- Read large binary file (> 5MB, verify chunking) +- Write configuration file +- List directory with 100+ files +- Verify file permissions respected + +### Phase 4: MCP Server Integration (Week 4) + +**Goal:** Expose tunnel operations as MCP tools for Claude Code + +**Implementation:** +1. Create new Rust project: `gururmm-mcp-server` +2. Use `mcp-server-rs` crate for MCP protocol +3. Implement 6 core tools (run_command, read_file, write_file, list_dir, get_services, control_service) +4. Add JWT token configuration (user provides token from GuruRMM web UI) +5. Build tunnel session manager (open session on first tool use, keep alive, close on idle) +6. Add tool result formatting (pretty-print PowerShell objects, syntax highlight code) + +**MCP server config:** +```json +{ + "mcpServers": { + "gururmm": { + "command": "gururmm-mcp-server", + "args": [], + "env": { + "GURURMM_API_URL": "http://172.16.3.30:3001", + "GURURMM_AUTH_TOKEN": "jwt-token-here" + } + } + } +} +``` + +**Testing:** +- Claude Code can list available agents +- Claude Code can execute command on remote agent +- Claude Code can read/write files on remote agent +- Session auto-closes after 5 minutes idle +- Rate limiting enforced (100 commands/min) + +### Phase 5: Advanced Features (Week 5+) + +**Registry Operations:** +- Add RegistryChannel handler (Windows-only) +- Use winreg crate for safe registry access +- Support HKLM, HKCU, read/write/delete operations + +**Service Management:** +- Add ServiceChannel handler (cross-platform) +- Windows: use sc.exe or WMI +- Linux: use systemctl +- List services, start/stop/restart, get status + +**Interactive Terminal (Stretch Goal):** +- 
WebSocket-based PTY (pseudo-terminal)
+- Bidirectional streaming (stdin → agent → process, stdout/stderr → agent → server)
+- Support for interactive programs (vim, top, htop)
+- Terminal emulation (xterm compatibility)
+
+---
+
+## Database Schema Changes
+
+### New Tables
+
+```sql
+-- Tunnel sessions
+CREATE TABLE tech_sessions (
+    id SERIAL PRIMARY KEY,
+    session_id VARCHAR(36) UNIQUE NOT NULL,
+    tech_id INTEGER NOT NULL REFERENCES techs(id),
+    agent_id INTEGER NOT NULL REFERENCES agents(id),
+    opened_at TIMESTAMP NOT NULL DEFAULT NOW(),
+    last_activity TIMESTAMP NOT NULL DEFAULT NOW(),
+    closed_at TIMESTAMP,
+    status VARCHAR(20) NOT NULL DEFAULT 'active'
+);
+
+-- Tunnel audit log
+CREATE TABLE tunnel_audit (
+    id SERIAL PRIMARY KEY,
+    session_id VARCHAR(36) NOT NULL REFERENCES tech_sessions(session_id),
+    channel_id VARCHAR(36) NOT NULL,
+    operation VARCHAR(50) NOT NULL,
+    details JSONB,
+    created_at TIMESTAMP NOT NULL DEFAULT NOW()
+);
+
+-- Indexes (the unique one is partial: a table-level UNIQUE constraint cannot take a WHERE clause)
+CREATE UNIQUE INDEX unique_active_session ON tech_sessions(tech_id, agent_id) WHERE status = 'active';
+CREATE INDEX idx_tech_sessions_tech ON tech_sessions(tech_id);
+CREATE INDEX idx_tech_sessions_agent ON tech_sessions(agent_id);
+CREATE INDEX idx_tech_sessions_status ON tech_sessions(status);
+CREATE INDEX idx_tunnel_audit_session ON tunnel_audit(session_id);
+CREATE INDEX idx_tunnel_audit_created ON tunnel_audit(created_at);
+```
+
+---
+
+## Security Considerations
+
+### Working Directory Restrictions
+- Agent config file specifies allowed paths: `allowed_paths: ["C:\\Shares", "C:\\Temp"]`
+- All file operations validated against allowlist
+- Path traversal attacks prevented (reject `..`, absolute path validation)
+
+### Rate Limiting
+- Server enforces: 100 commands per minute per tech per agent
+- Sliding window implementation (Redis or in-memory)
+- 429 Too Many Requests response on limit exceeded
+- Audit log tracks rate limit violations
+
+### Command Injection Prevention
+- Agent uses tokio::process::Command (no shell expansion)
+- 
PowerShell commands wrapped in `-NoProfile -NonInteractive -Command` +- Input sanitization: reject backticks, escape quotes +- Timeout enforcement: kill process after timeout + +### Session Management +- JWT tokens expire after 24 hours +- Sessions auto-expire after 4 hours inactivity +- Force-close endpoint for admins: `DELETE /api/v1/tunnel/:session_id/force-close` +- Concurrent session limit: 1 tunnel per agent (prevents session hijacking) + +### Audit Logging +- All tunnel operations logged to `tunnel_audit` table +- Logged fields: session_id, channel_id, operation, details (command/path/etc), timestamp +- Retention: 90 days (configurable) +- Suspicious activity alerts: >50 failed commands in 5 minutes + +--- + +## API Endpoints (New) + +``` +POST /api/v1/tunnel/open + Body: { "agent_id": 123 } + Response: { "session_id": "uuid", "status": "active" } + +POST /api/v1/tunnel/close + Body: { "session_id": "uuid" } + Response: { "status": "closed" } + +GET /api/v1/tunnel/status/:session_id + Response: { "session_id": "uuid", "agent_id": 123, "opened_at": "...", "last_activity": "..." } + +POST /api/v1/tunnel/:session_id/command + Body: { "command": "...", "shell": "powershell", "working_dir": "...", "timeout": 30000 } + Response: { "stdout": "...", "stderr": "...", "exit_code": 0, "duration_ms": 1234 } + +GET /api/v1/tunnel/:session_id/file?path=... + Response: { "content": "base64...", "mime_type": "text/plain", "size": 1234 } + +PUT /api/v1/tunnel/:session_id/file?path=... + Body: { "content": "base64..." } + Response: { "success": true, "path": "...", "size": 1234 } + +POST /api/v1/tunnel/:session_id/file/list?path=... + Response: { "entries": [{ "name": "...", "type": "file|dir", "size": 1234, "modified": "..." 
}] } +``` + +--- + +## MCP Server Implementation + +### Tool Definitions + +```json +{ + "tools": [ + { + "name": "gururmm_run_command", + "description": "Execute a command on a remote agent through GuruRMM tunnel", + "inputSchema": { + "type": "object", + "properties": { + "agent_id": { "type": "number", "description": "Agent ID to execute on" }, + "command": { "type": "string", "description": "Command to execute" }, + "shell": { "type": "string", "enum": ["powershell", "cmd", "bash"], "default": "powershell" }, + "working_dir": { "type": "string", "description": "Working directory (optional)" }, + "timeout": { "type": "number", "description": "Timeout in milliseconds", "default": 30000 } + }, + "required": ["agent_id", "command"] + } + }, + { + "name": "gururmm_read_file", + "description": "Read a file from a remote agent", + "inputSchema": { + "type": "object", + "properties": { + "agent_id": { "type": "number" }, + "path": { "type": "string", "description": "Full path to file" } + }, + "required": ["agent_id", "path"] + } + }, + { + "name": "gururmm_write_file", + "description": "Write a file to a remote agent", + "inputSchema": { + "type": "object", + "properties": { + "agent_id": { "type": "number" }, + "path": { "type": "string", "description": "Full path to file" }, + "content": { "type": "string", "description": "File content" } + }, + "required": ["agent_id", "path", "content"] + } + }, + { + "name": "gururmm_list_directory", + "description": "List files in a directory on a remote agent", + "inputSchema": { + "type": "object", + "properties": { + "agent_id": { "type": "number" }, + "path": { "type": "string", "description": "Directory path" } + }, + "required": ["agent_id", "path"] + } + }, + { + "name": "gururmm_list_agents", + "description": "List all available agents", + "inputSchema": { + "type": "object", + "properties": {}, + "required": [] + } + } + ] +} +``` + +### Session Management + +**Lifecycle:** +1. 
First tool call triggers tunnel open (POST /api/v1/tunnel/open) +2. MCP server caches session_id in memory +3. Subsequent tool calls reuse session +4. Idle timeout (5 minutes) triggers tunnel close +5. MCP server can handle concurrent sessions to different agents + +**Configuration:** +- MCP server reads JWT token from environment variable +- API URL configurable (default: http://172.16.3.30:3001) +- Session timeout configurable (default: 5 minutes) + +--- + +## Testing Strategy + +### Unit Tests +- Channel routing logic (correct channel receives message) +- Session validation (JWT + ownership) +- Command sanitization (injection prevention) +- Path validation (traversal prevention) + +### Integration Tests +- Full tunnel lifecycle (open → command → close) +- Concurrent sessions to different agents +- Session timeout enforcement +- Rate limiting triggers correctly + +### End-to-End Tests +- Claude Code MCP integration +- Tech opens session via web UI, Claude executes command +- File upload via MCP, verify on agent +- Service restart via MCP, verify status change + +--- + +## Rollout Plan + +### Phase 1: Internal Testing (Week 5) +- Deploy to test environment (172.16.3.30:3001) +- Test with 2 agents (AD2, DESKTOP-0O8A1RL) +- Tech team validates MCP integration +- Load testing: 10 concurrent sessions, 100 commands/min + +### Phase 2: Beta Release (Week 6) +- Deploy to production (rmm-api.azcomputerguru.com) +- Invite 3 beta techs (power users) +- Monitor audit logs for issues +- Gather feedback on MCP tool UX + +### Phase 3: General Availability (Week 7) +- Release to all techs +- Documentation: MCP server setup guide +- Training video: Claude Code + GuruRMM workflow +- Monitor error rates, tunnel session count + +--- + +## Risks and Mitigations + +| Risk | Impact | Mitigation | +|------|--------|------------| +| Command injection allows arbitrary code execution | Critical | Input sanitization, no shell expansion, allowlist-based path validation | +| Session hijacking 
via stolen JWT | High | Short-lived tokens (24h), session ownership validation, audit logging | +| WebSocket connection instability | Medium | Auto-reconnect logic, session recovery on reconnect | +| Rate limiting too strict (blocks legitimate use) | Medium | Configurable limits per tech, burst allowance, user feedback | +| File transfer timeouts on large files | Medium | Chunked transfer, resumable uploads | +| MCP server crashes (techs lose access) | Medium | Supervisor/systemd auto-restart, health check endpoint | + +--- + +## Open Questions + +1. **Registry operations scope**: Full registry access or restrict to specific hives (HKLM\Software, HKCU)? +2. **Interactive terminal priority**: High demand or defer to Phase 6? +3. **Multi-tech sessions**: Should multiple techs be able to share a session (pair programming)? +4. **Credential storage**: Should MCP server support credential manager integration (1Password, Windows Credential Manager)? +5. **Agent-side logging**: Should agent log tunnel operations locally (compliance requirement)? 
+ +--- + +## Success Metrics + +**Phase 1-2 (Infrastructure):** +- 95% tunnel open success rate +- <500ms average command response time (non-blocking) +- Zero session conflicts (never more than one concurrent tunnel per agent) + +**Phase 3-4 (MCP Integration):** +- 80% of techs using MCP tools within 2 weeks +- >50 tunnel sessions per day +- <5% command error rate (excluding user errors) + +**Phase 5+ (Adoption):** +- 20% reduction in remote desktop sessions (techs use tunnel instead) +- 90% tech satisfaction rating (survey) +- <1% security incidents related to tunnel misuse + +--- + +## Dependencies + +**Server:** +- Axum 0.7 (existing) +- PostgreSQL (existing) +- JWT library (existing) +- tokio-tungstenite for WebSocket (existing) + +**Agent:** +- tokio 1.x (existing) +- serde/serde_json (existing) +- base64 crate (for file encoding) +- winreg crate (Windows registry, Phase 5) + +**MCP Server:** +- mcp-server-rs crate (new dependency) +- reqwest for HTTP client (new) +- tokio runtime (new) + +**Infrastructure:** +- No new servers required (runs on existing 172.16.3.30) +- Cloudflare tunnel already configured +- Database migrations automated (existing CI/CD) + +--- + +## Next Steps After Approval + +1. Create feature branch: `feature/real-time-tunnel` +2. Implement Phase 1 database migrations +3. Update protocol definitions (ServerMessage/AgentMessage enums) +4. Create tech_sessions table +5. Implement tunnel open/close endpoints +6. Update agent to handle TunnelOpen message +7. Write unit tests for session validation +8. 
Deploy to test environment for validation
+
+**Estimated timeline:** 5 weeks to MCP integration, 6-7 weeks to GA
+
+---
+
+**Status:** READY FOR REVIEW
+**Reviewer:** User approval required
+**Questions:** See "Open Questions" section above
diff --git a/projects/msp-tools/guru-rmm/server/migrations/006_tunnel_sessions.sql b/projects/msp-tools/guru-rmm/server/migrations/006_tunnel_sessions.sql
new file mode 100644
index 0000000..0de6f39
--- /dev/null
+++ b/projects/msp-tools/guru-rmm/server/migrations/006_tunnel_sessions.sql
@@ -0,0 +1,59 @@
+-- GuruRMM Tunnel Sessions Schema
+-- Creates tables for technician tunnel sessions and audit logging.
+-- Requires the agents table (referenced by tech_sessions.agent_id) to exist.
+
+-- Tech Sessions table
+-- Stores active and historical tunnel sessions between technicians and agents.
+CREATE TABLE tech_sessions (
+    id SERIAL PRIMARY KEY,
+    -- External session identifier; 36 characters fits a textual UUID (TODO confirm format).
+    session_id VARCHAR(36) UNIQUE NOT NULL,
+    -- NOTE(review): no foreign key is declared for tech_id; confirm which table owns it.
+    tech_id INTEGER NOT NULL,
+    -- Deleting an agent also deletes its sessions.
+    agent_id UUID NOT NULL REFERENCES agents(id) ON DELETE CASCADE,
+    opened_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    last_activity TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    -- Nullable; presumably stays NULL until the session is closed (TODO confirm).
+    closed_at TIMESTAMPTZ,
+    -- NOTE(review): 'active' is the only status value visible in this migration.
+    status VARCHAR(20) NOT NULL DEFAULT 'active'
+);
+
+-- At most one active session per (technician, agent) pair.
+-- PostgreSQL does not accept a WHERE clause on a table-level UNIQUE constraint,
+-- so the rule is enforced with a partial unique index. Rows whose status is not
+-- 'active' fall outside the predicate and are not restricted.
+-- NOTE(review): two different technicians can still hold active sessions on the
+-- same agent; index agent_id alone if the intent is one tunnel per agent.
+CREATE UNIQUE INDEX unique_active_session
+    ON tech_sessions (tech_id, agent_id)
+    WHERE status = 'active';
+
+-- Index for finding sessions by technician
+CREATE INDEX idx_tech_sessions_tech ON tech_sessions(tech_id);
+
+-- Index for finding sessions by agent
+CREATE INDEX idx_tech_sessions_agent ON tech_sessions(agent_id);
+
+-- Index for filtering by session status
+CREATE INDEX idx_tech_sessions_status ON tech_sessions(status);
+
+-- Tunnel Audit table
+-- Detailed audit log for all tunnel operations and channel activity.
+-- NOTE(review): ON DELETE CASCADE removes audit rows when their session is
+-- deleted; confirm this is acceptable for audit retention.
+CREATE TABLE tunnel_audit (
+    id BIGSERIAL PRIMARY KEY,
+    session_id VARCHAR(36) NOT NULL REFERENCES tech_sessions(session_id) ON DELETE CASCADE,
+    channel_id VARCHAR(36) NOT NULL,
+    operation VARCHAR(50) NOT NULL,
+    details JSONB,
+    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+-- Index for querying audit logs by session
+CREATE INDEX idx_tunnel_audit_session ON tunnel_audit(session_id);
+
+-- Index for time-based audit queries
+CREATE INDEX idx_tunnel_audit_created ON tunnel_audit(created_at);