diff --git a/server/migrations/007_fix_machine_tags_null.sql b/server/migrations/007_fix_machine_tags_null.sql new file mode 100644 index 0000000..db9f9bb --- /dev/null +++ b/server/migrations/007_fix_machine_tags_null.sql @@ -0,0 +1,39 @@ +-- Migration: 007_fix_machine_tags_null.sql +-- Purpose: Make connect_machines.tags self-protecting against NULL. +-- +-- Migration 005 intended to create `tags TEXT[] NOT NULL DEFAULT '{}'`, but it +-- used `ADD COLUMN IF NOT EXISTS`. On the production instance the column already +-- existed (created without NOT NULL / without a default by an earlier path), so +-- 005's constraints were never applied — the column stayed nullable with no +-- default. Rows with `tags IS NULL` then failed to decode in the Rust `Machine` +-- FromRow (`unexpected null; try decoding as an Option`), which broke the startup +-- "reconcile managed sessions" task and would 500 the authenticated Machines list. +-- +-- Production was hot-patched with `UPDATE connect_machines SET tags='{}' WHERE +-- tags IS NULL`. This migration makes the schema itself enforce the invariant so +-- it cannot recur: backfill any remaining NULLs, set a column DEFAULT, then add a +-- NOT NULL constraint. The Rust decode was also made NULL-tolerant (manual FromRow) +-- as belt-and-suspenders. +-- +-- Idempotent: the backfill is a no-op once rows already hold '{}' (the prod hot-patch +-- already ran); SET DEFAULT is idempotent; SET NOT NULL is a no-op if the constraint +-- already exists. Applied on server startup by sqlx::migrate!(); never pre-applied +-- via psql. Ordered after 006. +-- See .claude/standards/gururmm/sqlx-migrations.md. + +-- 1. Backfill any rows still holding NULL (must precede SET NOT NULL). +UPDATE connect_machines +SET tags = '{}' +WHERE tags IS NULL; + +-- 2. Pin a column default so future INSERTs that omit `tags` get an empty array +-- rather than NULL. +ALTER TABLE connect_machines + ALTER COLUMN tags SET DEFAULT '{}'; + +-- 3. Enforce the invariant at the schema level. Safe: the only writer of `tags` +-- (db::machines::update_machine_metadata) binds a non-null TEXT[]; no INSERT path +-- supplies an explicit NULL for tags (upsert_machine omits the column, so it now +-- takes the DEFAULT above). After the backfill no NULL rows remain. +ALTER TABLE connect_machines + ALTER COLUMN tags SET NOT NULL; diff --git a/server/src/db/machines.rs b/server/src/db/machines.rs index a250aa1..15ee14a 100644 --- a/server/src/db/machines.rs +++ b/server/src/db/machines.rs @@ -2,11 +2,26 @@ use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; -use sqlx::PgPool; +use sqlx::{postgres::PgRow, FromRow, PgPool, Row}; use uuid::Uuid; /// Machine record from database -#[derive(Debug, Clone, Serialize, Deserialize, sqlx::FromRow)] +/// +/// `FromRow` is implemented manually (not derived) so that every column whose +/// schema definition is *nullable* decodes NULL-tolerantly. The `connect_machines` +/// table was created in migration 001 with only `DEFAULT` clauses (no `NOT NULL`) +/// on `is_elevated`, `is_persistent`, `first_seen`, `last_seen`, `status`, +/// `created_at`, and `updated_at`; `tags` (migration 005) likewise ended up +/// nullable with no default on the production instance. A derived `FromRow` maps +/// those to non-`Option` Rust types and errors at decode time the moment any cell +/// is NULL (`unexpected null; try decoding as an Option`). In production a row with +/// `tags IS NULL` broke the startup reconcile task and would 500 the authenticated +/// Machines list. The manual impl below reads every nullable column as +/// `Option` and falls back to `Default::default()`, so a NULL can never panic or +/// error regardless of how the column was created. Truly non-null columns (`id`, +/// `agent_id`, `hostname`) are decoded directly. Migration 007 additionally pins +/// `tags` to `DEFAULT '{}'` and backfills existing NULLs (defense in depth). +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct Machine { pub id: Uuid, pub agent_id: String, @@ -31,11 +46,57 @@ pub struct Machine { pub site: Option, /// Free-form tags reported by the agent (`AgentStatus.tags`). Stored as a /// Postgres `TEXT[]`; matches the `&[String]` bound by `update_machine_metadata`. - /// Migration 005. `NULL`/absent column reads back as an empty vec via the default. + /// Migration 005. A `NULL` cell decodes to an empty vec (see the manual `FromRow` + /// impl) so it can never error at decode time. #[serde(default)] pub tags: Vec, } +impl<'r> FromRow<'r, PgRow> for Machine { + fn from_row(row: &'r PgRow) -> Result { + Ok(Self { + // NOT NULL columns: decode directly. + id: row.try_get("id")?, + agent_id: row.try_get("agent_id")?, + hostname: row.try_get("hostname")?, + // Schema-nullable in `Option` form already: decode directly. + os_version: row.try_get("os_version")?, + last_session_id: row.try_get("last_session_id")?, + tenant_id: row.try_get("tenant_id")?, + organization: row.try_get("organization")?, + site: row.try_get("site")?, + // Nullable-with-default columns mapped to non-`Option` Rust types: read as + // `Option` and fall back to the type default so a NULL cell never errors. + is_elevated: row + .try_get::, _>("is_elevated")? + .unwrap_or_default(), + is_persistent: row + .try_get::, _>("is_persistent")? + .unwrap_or_default(), + first_seen: row + .try_get::>, _>("first_seen")? + .unwrap_or_default(), + last_seen: row + .try_get::>, _>("last_seen")? + .unwrap_or_default(), + status: row + .try_get::, _>("status")? + .unwrap_or_default(), + created_at: row + .try_get::>, _>("created_at")? + .unwrap_or_default(), + updated_at: row + .try_get::>, _>("updated_at")? + .unwrap_or_default(), + // The production bug: `tags` was nullable with no default. A NULL cell + // decodes to an empty vec here instead of erroring. + tags: row + .try_get::>, _>("tags")? + .unwrap_or_default(), + }) + } +} + /// Get or create a machine by agent_id (upsert) pub async fn upsert_machine( pool: &PgPool,