From f9bdecbfdbeaf8578becd9cf1ba085452f2a5c79 Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Sat, 30 May 2026 10:35:04 -0700 Subject: [PATCH] feat(agent,server): v2 secure-session-core Task 7 - HW H.264 + negotiated raw fallback SPEC-002 Phase 1 Task 7 (the last), code-reviewed APPROVED, locally verified (cargo fmt + clippy -D warnings exit 0 + cargo test --workspace 89 pass + build). - Encoder trait + factory: RawEncoder (salvaged, UNCHANGED) and H264Encoder, selected by negotiation; factory falls back to raw on H.264 init failure. - Negotiation: agent advertises supports_h264 (MFTEnumEx HW probe, cached) in AgentStatus; server picks the codec via select_video_codec(supports, prefer) and stamps StartStream.video_codec; agent re-guards on local HW. Policy constant DEFAULT_PREFER_H264 = false, so RAW is negotiated for every session today - H.264 stays dormant until live hardware validation (Task 8). - MF H.264 encoder (h264.rs, FIRST-CUT / compile-verified-only): HW encoder MFT, BGRA->NV12 (color.rs, unit-tested), sync drain, fall-back-to-raw on any failure. - Viewer H.264 decoder (decoder.rs, FIRST-CUT): MF decoder on a dedicated COM thread; drops+logs on failure, raw render path untouched. - proto additive: VideoCodec enum, StartStream.video_codec=3, SessionResponse.video_codec=5, AgentStatus.supports_h264=11. - Raw+Zstd path byte-for-byte unchanged; remains the guaranteed default/fallback. Review confirmed unsafe impl Send for H264Encoder is sound (single-owned &mut on the block_on thread; session future never spawned) and every MF failure degrades to raw. H.264 is NOT claimed functional - compile/clippy/build-verified only; live validation + force-IDR + the no-spawn-invariant doc are Task 8 go-live gates. Co-Authored-By: Claude Opus 4.8 (1M context) --- agent/Cargo.toml | 7 + agent/src/encoder/capability.rs | 97 +++++ agent/src/encoder/color.rs | 269 ++++++++++++++ agent/src/encoder/h264.rs | 515 +++++++++++++++++++++++++++ agent/src/encoder/mod.rs | 194 +++++++++- agent/src/session/mod.rs | 32 +- agent/src/viewer/decoder.rs | 452 +++++++++++++++++++++++ agent/src/viewer/mod.rs | 100 +++++- proto/guruconnect.proto | 30 ++ server/src/relay/mod.rs | 1 + server/src/session/mod.rs | 110 ++++++ specs/v2-secure-session-core/plan.md | 101 +++++- 12 files changed, 1885 insertions(+), 23 deletions(-) create mode 100644 agent/src/encoder/capability.rs create mode 100644 agent/src/encoder/color.rs create mode 100644 agent/src/encoder/h264.rs create mode 100644 agent/src/viewer/decoder.rs diff --git a/agent/Cargo.toml b/agent/Cargo.toml index 2736ef2..5124937 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -95,6 +95,13 @@ windows = { version = "0.58", features = [ "Win32_System_Pipes", "Win32_System_SystemServices", "Win32_System_IO", + "Win32_System_Com", + "Win32_System_Com_StructuredStorage", + "Win32_System_Ole", + "Win32_System_Variant", + "Win32_Media_MediaFoundation", + "Win32_Media_KernelStreaming", + "Win32_Media_DirectShow", ]} # Windows service support diff --git a/agent/src/encoder/capability.rs b/agent/src/encoder/capability.rs new file mode 100644 index 0000000..aa043c7 --- /dev/null +++ b/agent/src/encoder/capability.rs @@ -0,0 +1,97 @@ +//! Hardware video-encode capability detection (Task 7). +//! +//! Probes Windows Media Foundation for a HARDWARE H.264 encoder MFT at startup. +//! The result is cached and advertised to the server in `AgentStatus.supports_h264` +//! so the server can negotiate the codec (see `StartStream.video_codec`). +//! +//! Detection is intentionally cheap and side-effect-free: it only ENUMERATES the +//! available encoder MFTs (it does not create or initialize one). A `true` result +//! means a hardware H.264 encoder was advertised by the OS; it does NOT guarantee +//! the encoder will successfully initialize at stream time — the H.264 encoder +//! still falls back to raw on any init/feed failure. +//! +//! On non-Windows targets, or if MF is unavailable, this reports `false`. + +use std::sync::OnceLock; + +/// Cached capability result. Detection runs at most once per process. +static SUPPORTS_H264: OnceLock = OnceLock::new(); + +/// Return whether this machine has a hardware H.264 encoder, detecting once and +/// caching the result. Safe to call repeatedly and from any thread. +pub fn supports_hardware_h264() -> bool { + *SUPPORTS_H264.get_or_init(detect_hardware_h264) +} + +/// Run the actual detection. Separated so the cached accessor stays trivial. +fn detect_hardware_h264() -> bool { + let supported = detect_inner(); + if supported { + tracing::info!("Hardware H.264 encoder detected (Media Foundation)"); + } else { + tracing::info!("No hardware H.264 encoder detected; raw+Zstd only"); + } + supported +} + +#[cfg(windows)] +fn detect_inner() -> bool { + // Enumerate hardware H.264 encoder MFTs. This is a read-only probe; it does + // not init D3D, COM apartments persistently, or create the encoder. + match unsafe { enumerate_hardware_h264() } { + Ok(found) => found, + Err(e) => { + tracing::warn!("H.264 capability probe failed: {e:#}; assuming no HW encoder"); + false + } + } +} + +#[cfg(not(windows))] +fn detect_inner() -> bool { + false +} + +#[cfg(windows)] +unsafe fn enumerate_hardware_h264() -> anyhow::Result { + use windows::Win32::Media::MediaFoundation::{ + MFMediaType_Video, MFTEnumEx, MFVideoFormat_H264, MFT_CATEGORY_VIDEO_ENCODER, + MFT_ENUM_FLAG_HARDWARE, MFT_ENUM_FLAG_SORTANDFILTER, MFT_ENUM_FLAG_TRANSCODE_ONLY, + MFT_REGISTER_TYPE_INFO, + }; + + // We only specify the OUTPUT type (H.264); input is left unconstrained so the + // probe matches encoders regardless of their preferred input subtype. + let output_type = MFT_REGISTER_TYPE_INFO { + guidMajorType: MFMediaType_Video, + guidSubtype: MFVideoFormat_H264, + }; + + let mut activate_ptr: *mut Option = + std::ptr::null_mut(); + let mut count: u32 = 0; + + // MFTEnumEx does not itself require MFStartup for a pure enumeration, but we + // guard with a Result so any HRESULT failure degrades to "no HW encoder". + MFTEnumEx( + MFT_CATEGORY_VIDEO_ENCODER, + MFT_ENUM_FLAG_HARDWARE | MFT_ENUM_FLAG_SORTANDFILTER | MFT_ENUM_FLAG_TRANSCODE_ONLY, + None, // input type: any + Some(&output_type as *const _), + &mut activate_ptr, + &mut count, + )?; + + // Release every returned IMFActivate, then free the array CoTaskMemAlloc'd by MF. + let found = count > 0; + if !activate_ptr.is_null() { + let slice = std::slice::from_raw_parts_mut(activate_ptr, count as usize); + for entry in slice.iter_mut() { + // Dropping the Option releases the COM reference. + entry.take(); + } + windows::Win32::System::Com::CoTaskMemFree(Some(activate_ptr as *const _)); + } + + Ok(found) +} diff --git a/agent/src/encoder/color.rs b/agent/src/encoder/color.rs new file mode 100644 index 0000000..4cae622 --- /dev/null +++ b/agent/src/encoder/color.rs @@ -0,0 +1,269 @@ +//! Color-space conversion for the H.264 encode path (Task 7). +//! +//! Screen capture produces BGRA (4 bytes/pixel, B,G,R,A order — the DXGI/GDI +//! native layout). Media Foundation hardware H.264 encoders want NV12: a full- +//! resolution 8-bit Y (luma) plane followed by an interleaved half-resolution +//! U/V (chroma) plane. This module does that conversion in software. +//! +//! NV12 memory layout for a `width x height` frame (width/height assumed even): +//! - Y plane: `width * height` bytes, row-major. +//! - UV plane: `width * (height / 2)` bytes — for each 2x2 luma block one +//! (U, V) pair, so the plane is `(width/2)` (U,V) pairs per row over +//! `height/2` rows, i.e. `width` bytes per chroma row. +//! +//! Total size = `width * height * 3 / 2`. +//! +//! The coefficients are BT.601 "studio swing" (limited range, 16..235 luma), +//! which is what MF H.264 encoders expect by default. Chroma is computed by +//! averaging the 2x2 BGRA block before conversion (box downsample) to reduce +//! aliasing. + +/// Size in bytes of an NV12 buffer for `width` x `height` (both even). +#[inline] +pub fn nv12_size(width: u32, height: u32) -> usize { + (width as usize * height as usize) * 3 / 2 +} + +/// BT.601 limited-range luma from 8-bit R,G,B. +#[inline] +fn rgb_to_y(r: i32, g: i32, b: i32) -> u8 { + // Y = 16 + (65.481*R + 128.553*G + 24.966*B) / 255, fixed-point. + // Using the common integer approximation: + // Y = ((66*R + 129*G + 25*B + 128) >> 8) + 16 + let y = ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16; + y.clamp(0, 255) as u8 +} + +/// BT.601 limited-range Cb (U) from 8-bit R,G,B. +#[inline] +fn rgb_to_u(r: i32, g: i32, b: i32) -> u8 { + let u = ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; + u.clamp(0, 255) as u8 +} + +/// BT.601 limited-range Cr (V) from 8-bit R,G,B. +#[inline] +fn rgb_to_v(r: i32, g: i32, b: i32) -> u8 { + let v = ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; + v.clamp(0, 255) as u8 +} + +/// Convert a tightly-packed BGRA frame into NV12, writing into `out`. +/// +/// `bgra` must be at least `width * height * 4` bytes; `out` must be at least +/// `nv12_size(width, height)` bytes. `width` and `height` MUST be even (H.264 +/// 4:2:0 requires even dimensions — the caller pads odd capture sizes). Returns +/// an error rather than panicking on a short buffer or odd dimension so the +/// encoder can fall back to raw. +pub fn bgra_to_nv12( + bgra: &[u8], + width: u32, + height: u32, + out: &mut [u8], +) -> Result<(), ColorConvertError> { + if width == 0 || height == 0 { + return Err(ColorConvertError::ZeroDimension); + } + if !width.is_multiple_of(2) || !height.is_multiple_of(2) { + return Err(ColorConvertError::OddDimension { width, height }); + } + + let w = width as usize; + let h = height as usize; + let expected_src = w * h * 4; + if bgra.len() < expected_src { + return Err(ColorConvertError::SrcTooSmall { + got: bgra.len(), + need: expected_src, + }); + } + let need_out = nv12_size(width, height); + if out.len() < need_out { + return Err(ColorConvertError::DstTooSmall { + got: out.len(), + need: need_out, + }); + } + + let (y_plane, uv_plane) = out.split_at_mut(w * h); + + // Luma: one sample per pixel. + for row in 0..h { + let src_row = row * w * 4; + let dst_row = row * w; + for col in 0..w { + let px = src_row + col * 4; + // BGRA order. + let b = bgra[px] as i32; + let g = bgra[px + 1] as i32; + let r = bgra[px + 2] as i32; + y_plane[dst_row + col] = rgb_to_y(r, g, b); + } + } + + // Chroma: one (U,V) pair per 2x2 block, box-averaged. + let chroma_rows = h / 2; + let chroma_cols = w / 2; + for cy in 0..chroma_rows { + for cx in 0..chroma_cols { + let x0 = cx * 2; + let y0 = cy * 2; + + let mut r_sum = 0i32; + let mut g_sum = 0i32; + let mut b_sum = 0i32; + for dy in 0..2 { + for dx in 0..2 { + let px = ((y0 + dy) * w + (x0 + dx)) * 4; + b_sum += bgra[px] as i32; + g_sum += bgra[px + 1] as i32; + r_sum += bgra[px + 2] as i32; + } + } + let r = r_sum / 4; + let g = g_sum / 4; + let b = b_sum / 4; + + let uv_idx = (cy * chroma_cols + cx) * 2; + uv_plane[uv_idx] = rgb_to_u(r, g, b); + uv_plane[uv_idx + 1] = rgb_to_v(r, g, b); + } + } + + Ok(()) +} + +/// Errors from BGRA->NV12 conversion. Surfaced (not panicked) so the H.264 +/// encoder can downgrade to raw. +#[derive(Debug, thiserror::Error)] +pub enum ColorConvertError { + #[error("frame dimension is zero")] + ZeroDimension, + #[error("NV12 requires even dimensions, got {width}x{height}")] + OddDimension { width: u32, height: u32 }, + #[error("source BGRA buffer too small: {got} < {need}")] + SrcTooSmall { got: usize, need: usize }, + #[error("destination NV12 buffer too small: {got} < {need}")] + DstTooSmall { got: usize, need: usize }, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn nv12_size_is_3half() { + assert_eq!(nv12_size(2, 2), 6); + assert_eq!(nv12_size(4, 4), 24); + assert_eq!(nv12_size(1920, 1080), 1920 * 1080 * 3 / 2); + } + + #[test] + fn rejects_odd_dimensions() { + let bgra = vec![0u8; 3 * 3 * 4]; + let mut out = vec![0u8; nv12_size(4, 4)]; + assert!(matches!( + bgra_to_nv12(&bgra, 3, 2, &mut out), + Err(ColorConvertError::OddDimension { .. }) + )); + assert!(matches!( + bgra_to_nv12(&bgra, 2, 3, &mut out), + Err(ColorConvertError::OddDimension { .. }) + )); + } + + #[test] + fn rejects_short_source() { + let bgra = vec![0u8; 4]; // way too small for 2x2 + let mut out = vec![0u8; nv12_size(2, 2)]; + assert!(matches!( + bgra_to_nv12(&bgra, 2, 2, &mut out), + Err(ColorConvertError::SrcTooSmall { .. }) + )); + } + + #[test] + fn rejects_short_dest() { + let bgra = vec![0u8; 2 * 2 * 4]; + let mut out = vec![0u8; 1]; + assert!(matches!( + bgra_to_nv12(&bgra, 2, 2, &mut out), + Err(ColorConvertError::DstTooSmall { .. }) + )); + } + + /// A pure-black BGRA frame -> Y = 16 (limited-range black), U = V = 128. + #[test] + fn black_frame_maps_to_limited_range_black() { + let bgra = vec![0u8; 4 * 4 * 4]; // all zero => black, alpha 0 + let mut out = vec![0u8; nv12_size(4, 4)]; + bgra_to_nv12(&bgra, 4, 4, &mut out).unwrap(); + + // Y plane (first 16 bytes) all 16. + for &y in &out[..16] { + assert_eq!(y, 16, "black luma must be 16 (limited range)"); + } + // UV plane all 128 (neutral chroma). + for &c in &out[16..] { + assert_eq!(c, 128, "black chroma must be neutral 128"); + } + } + + /// A pure-white BGRA frame -> Y = 235 (limited-range white), U = V = 128. + #[test] + fn white_frame_maps_to_limited_range_white() { + // B=255, G=255, R=255, A=255 for every pixel. + let bgra = vec![255u8; 2 * 2 * 4]; + let mut out = vec![0u8; nv12_size(2, 2)]; + bgra_to_nv12(&bgra, 2, 2, &mut out).unwrap(); + + // Y = ((66+129+25)*255 + 128) >> 8 + 16 = 235. + for &y in &out[..4] { + assert_eq!(y, 235, "white luma must be 235 (limited range)"); + } + // Neutral chroma for a gray/white pixel. + assert_eq!(out[4], 128); + assert_eq!(out[5], 128); + } + + /// A pure-red frame: luma below mid, V (Cr) well above 128, U (Cb) below 128. + #[test] + fn red_frame_has_high_cr_low_cb() { + // BGRA red: B=0, G=0, R=255, A=255. + let mut bgra = vec![0u8; 2 * 2 * 4]; + for px in bgra.chunks_mut(4) { + px[0] = 0; // B + px[1] = 0; // G + px[2] = 255; // R + px[3] = 255; // A + } + let mut out = vec![0u8; nv12_size(2, 2)]; + bgra_to_nv12(&bgra, 2, 2, &mut out).unwrap(); + + let u = out[4]; + let v = out[5]; + assert!(v > 200, "red must have high Cr (V), got {v}"); + assert!(u < 128, "red must have Cb (U) below neutral, got {u}"); + } + + /// Conversion fills the whole NV12 buffer (no leftover zeros where data is + /// expected) for a non-trivial gradient — a sanity check on plane indexing. + #[test] + fn plane_indexing_covers_full_buffer() { + let w = 8u32; + let h = 8u32; + let mut bgra = vec![0u8; (w * h * 4) as usize]; + for (i, px) in bgra.chunks_mut(4).enumerate() { + let v = (i % 256) as u8; + px[0] = v; + px[1] = v; + px[2] = v; + px[3] = 255; + } + let mut out = vec![0xAAu8; nv12_size(w, h)]; + bgra_to_nv12(&bgra, w, h, &mut out).unwrap(); + // Y plane should be fully written (gray ramp -> non-constant). + let y_plane = &out[..(w * h) as usize]; + assert!(y_plane.windows(2).any(|p| p[0] != p[1]), "Y plane varies"); + } +} diff --git a/agent/src/encoder/h264.rs b/agent/src/encoder/h264.rs new file mode 100644 index 0000000..96437ac --- /dev/null +++ b/agent/src/encoder/h264.rs @@ -0,0 +1,515 @@ +//! Hardware H.264 encoder via Windows Media Foundation (Task 7). +//! +//! FIRST-CUT / COMPILE-VERIFIED ONLY. This encoder is wired end-to-end (init -> +//! feed -> drain -> emit `EncodedFrame{h264}`) and is selected only when the +//! agent advertised hardware support AND the server negotiated H.264. It has NOT +//! been validated on real hardware with live frames — that is plan Task 8. On +//! ANY initialization or per-frame failure it surfaces an error; the encoder +//! factory (`create_encoder_for`) downgrades to the raw+Zstd encoder so a +//! session never breaks because of H.264. +//! +//! Pipeline: +//! BGRA capture --(color::bgra_to_nv12)--> NV12 sample --> MFT(H.264) --> H.264 +//! Annex-B/length-prefixed elementary stream --> proto EncodedFrame. +//! +//! Design notes: +//! - The MFT is enumerated with `MFTEnumEx(MFT_CATEGORY_VIDEO_ENCODER, +//! MFT_ENUM_FLAG_HARDWARE, …, MFVideoFormat_H264)` (same probe as +//! `capability`). We `ActivateObject` the first match. +//! - Input is configured as NV12, output as H.264, with frame size, frame rate +//! and an average bitrate derived from `quality`. +//! - Both the SYNCHRONOUS MFT model (ProcessInput/ProcessOutput) and the +//! ASYNCHRONOUS hardware-MFT model (METransformNeedInput / METransformHaveOutput +//! events) exist. To keep this first cut bounded and predictable we DRAIN the +//! MFT synchronously after each input and treat `MF_E_TRANSFORM_NEED_MORE_INPUT` +//! as "no output this tick". A fully async event-driven loop is a Task-8 +//! refinement (documented below). +//! - `MFT_MESSAGE_SET_D3D_MANAGER` is intentionally NOT set — we feed CPU NV12 +//! buffers (software input samples), which every HW H.264 MFT accepts. D3D11 +//! zero-copy is a later optimization. + +#![cfg(windows)] + +use super::{EncodedFrame, Encoder}; +use crate::capture::CapturedFrame; +use crate::encoder::color; +use crate::proto::{video_frame, EncodedFrame as ProtoEncodedFrame, VideoFrame}; +use anyhow::{anyhow, Context, Result}; +use windows::Win32::Media::MediaFoundation::{ + IMFActivate, IMFMediaType, IMFSample, IMFTransform, MFCreateMediaType, MFCreateMemoryBuffer, + MFCreateSample, MFMediaType_Video, MFShutdown, MFStartup, MFTEnumEx, MFVideoFormat_H264, + MFVideoFormat_NV12, MFVideoInterlace_Progressive, MFSTARTUP_LITE, MFT_CATEGORY_VIDEO_ENCODER, + MFT_ENUM_FLAG_HARDWARE, MFT_ENUM_FLAG_SORTANDFILTER, MFT_ENUM_FLAG_TRANSCODE_ONLY, + MFT_MESSAGE_COMMAND_FLUSH, MFT_MESSAGE_NOTIFY_BEGIN_STREAMING, + MFT_MESSAGE_NOTIFY_END_OF_STREAM, MFT_MESSAGE_NOTIFY_END_STREAMING, + MFT_MESSAGE_NOTIFY_START_OF_STREAM, MFT_OUTPUT_DATA_BUFFER, MFT_OUTPUT_STREAM_INFO, + MFT_REGISTER_TYPE_INFO, MF_E_TRANSFORM_NEED_MORE_INPUT, MF_MT_AVG_BITRATE, MF_MT_FRAME_RATE, + MF_MT_FRAME_SIZE, MF_MT_INTERLACE_MODE, MF_MT_MAJOR_TYPE, MF_MT_PIXEL_ASPECT_RATIO, + MF_MT_SUBTYPE, +}; + +/// Encoder-internal state, created once and reused per frame. +pub struct H264Encoder { + /// The activated encoder transform. + transform: IMFTransform, + /// Configured frame dimensions; a capture-size change forces re-init. + width: u32, + height: u32, + /// Quality (1-100) used to derive the bitrate; kept for re-init on resize. + quality: u32, + /// Frame sequence counter (mirrors RawEncoder). + sequence: u32, + /// Force the next frame to request a keyframe. + force_keyframe: bool, + /// Whether `MFT_MESSAGE_NOTIFY_BEGIN_STREAMING` was sent. + streaming: bool, + /// Reusable NV12 staging buffer (resized on dimension change). + nv12: Vec, + /// Input/output stream identifiers (most encoders use 0/0). + input_stream_id: u32, + output_stream_id: u32, + /// True if MF was started by THIS encoder and must be shut down on drop. + mf_started: bool, +} + +// IMFTransform is a COM interface; it is not auto-Send. We only ever touch the +// encoder from the single capture/encode thread (the session owns it behind a +// &mut), so it is safe to move between threads as long as it is not shared. +unsafe impl Send for H264Encoder {} + +impl H264Encoder { + /// Construct and fully initialize a hardware H.264 encoder. Returns an error + /// (so the factory can fall back to raw) if MF is unavailable, no hardware + /// encoder exists, or media-type negotiation fails. A default frame size is + /// used and re-negotiated on the first frame if the real capture differs. + pub fn new(quality: u32) -> Result { + // 1920x1080 default; re-init on the first frame if the capture differs. + Self::with_dimensions(quality, 1920, 1080) + } + + fn with_dimensions(quality: u32, width: u32, height: u32) -> Result { + unsafe { + // MF must be initialized on this thread. MFSTARTUP_LITE avoids the + // sockets/network stack we don't need. + MFStartup(mf_version(), MFSTARTUP_LITE).context("MFStartup failed")?; + let mf_started = true; + + let transform = match Self::activate_hw_encoder() { + Ok(t) => t, + Err(e) => { + // Balance the MFStartup we just did before bailing. + let _ = MFShutdown(); + return Err(e); + } + }; + + let mut enc = Self { + transform, + width, + height, + quality, + sequence: 0, + force_keyframe: true, + streaming: false, + nv12: Vec::new(), + input_stream_id: 0, + output_stream_id: 0, + mf_started, + }; + + // `enc`'s Drop will shut MF down and release the transform on error. + enc.configure_media_types()?; + + Ok(enc) + } + } + + /// Enumerate hardware H.264 encoder MFTs and activate the first one. + unsafe fn activate_hw_encoder() -> Result { + let output_type = MFT_REGISTER_TYPE_INFO { + guidMajorType: MFMediaType_Video, + guidSubtype: MFVideoFormat_H264, + }; + + let mut activate_ptr: *mut Option = std::ptr::null_mut(); + let mut count: u32 = 0; + + MFTEnumEx( + MFT_CATEGORY_VIDEO_ENCODER, + MFT_ENUM_FLAG_HARDWARE | MFT_ENUM_FLAG_SORTANDFILTER | MFT_ENUM_FLAG_TRANSCODE_ONLY, + None, + Some(&output_type as *const _), + &mut activate_ptr, + &mut count, + ) + .context("MFTEnumEx (hardware H.264) failed")?; + + if count == 0 || activate_ptr.is_null() { + if !activate_ptr.is_null() { + windows::Win32::System::Com::CoTaskMemFree(Some(activate_ptr as *const _)); + } + return Err(anyhow!("no hardware H.264 encoder MFT available")); + } + + let slice = std::slice::from_raw_parts_mut(activate_ptr, count as usize); + + // Activate the first usable encoder; release every IMFActivate. + let mut chosen: Option = None; + for entry in slice.iter_mut() { + if chosen.is_none() { + if let Some(activate) = entry.as_ref() { + if let Ok(transform) = activate.ActivateObject::() { + chosen = Some(transform); + } + } + } + // Release this IMFActivate reference. + entry.take(); + } + windows::Win32::System::Com::CoTaskMemFree(Some(activate_ptr as *const _)); + + chosen.ok_or_else(|| anyhow!("failed to activate any hardware H.264 encoder MFT")) + } + + /// Set the H.264 output type and NV12 input type, in the order MF requires + /// (output type FIRST for encoders, then the matching input type). + unsafe fn configure_media_types(&mut self) -> Result<()> { + // Discover the real stream identifiers (most encoders report 0/0). + let mut input_ids = [0u32; 1]; + let mut output_ids = [0u32; 1]; + // GetStreamIDs may return E_NOTIMPL meaning "ids are 0..n-1"; ignore err. + let _ = self.transform.GetStreamIDs(&mut input_ids, &mut output_ids); + // If GetStreamIDs populated nonzero ids use them, else default 0/0. + if input_ids[0] != 0 { + self.input_stream_id = input_ids[0]; + } + if output_ids[0] != 0 { + self.output_stream_id = output_ids[0]; + } + + let fps_num = 30u32; + let fps_den = 1u32; + let bitrate = quality_to_bitrate(self.quality, self.width, self.height); + + // ---- OUTPUT (H.264) ---- + let out_type: IMFMediaType = MFCreateMediaType().context("MFCreateMediaType(out)")?; + out_type.SetGUID(&MF_MT_MAJOR_TYPE, &MFMediaType_Video)?; + out_type.SetGUID(&MF_MT_SUBTYPE, &MFVideoFormat_H264)?; + out_type.SetUINT32(&MF_MT_AVG_BITRATE, bitrate)?; + set_attr_size(&out_type, &MF_MT_FRAME_SIZE, self.width, self.height)?; + set_attr_ratio(&out_type, &MF_MT_FRAME_RATE, fps_num, fps_den)?; + set_attr_ratio(&out_type, &MF_MT_PIXEL_ASPECT_RATIO, 1, 1)?; + out_type.SetUINT32(&MF_MT_INTERLACE_MODE, MFVideoInterlace_Progressive.0 as u32)?; + self.transform + .SetOutputType(self.output_stream_id, &out_type, 0) + .context("SetOutputType(H264)")?; + + // ---- INPUT (NV12) ---- + let in_type: IMFMediaType = MFCreateMediaType().context("MFCreateMediaType(in)")?; + in_type.SetGUID(&MF_MT_MAJOR_TYPE, &MFMediaType_Video)?; + in_type.SetGUID(&MF_MT_SUBTYPE, &MFVideoFormat_NV12)?; + set_attr_size(&in_type, &MF_MT_FRAME_SIZE, self.width, self.height)?; + set_attr_ratio(&in_type, &MF_MT_FRAME_RATE, fps_num, fps_den)?; + set_attr_ratio(&in_type, &MF_MT_PIXEL_ASPECT_RATIO, 1, 1)?; + in_type.SetUINT32(&MF_MT_INTERLACE_MODE, MFVideoInterlace_Progressive.0 as u32)?; + self.transform + .SetInputType(self.input_stream_id, &in_type, 0) + .context("SetInputType(NV12)")?; + + Ok(()) + } + + /// Begin streaming if not already started (idempotent). + unsafe fn ensure_streaming(&mut self) -> Result<()> { + if !self.streaming { + self.transform + .ProcessMessage(MFT_MESSAGE_NOTIFY_BEGIN_STREAMING, 0) + .context("NOTIFY_BEGIN_STREAMING")?; + self.transform + .ProcessMessage(MFT_MESSAGE_NOTIFY_START_OF_STREAM, 0) + .context("NOTIFY_START_OF_STREAM")?; + self.streaming = true; + } + Ok(()) + } + + /// Re-initialize the encoder for a new frame size (capture resolution change). + unsafe fn reinit_for_size(&mut self, width: u32, height: u32) -> Result<()> { + if self.streaming { + let _ = self.transform.ProcessMessage(MFT_MESSAGE_COMMAND_FLUSH, 0); + let _ = self + .transform + .ProcessMessage(MFT_MESSAGE_NOTIFY_END_OF_STREAM, 0); + let _ = self + .transform + .ProcessMessage(MFT_MESSAGE_NOTIFY_END_STREAMING, 0); + self.streaming = false; + } + self.width = width; + self.height = height; + self.force_keyframe = true; + self.configure_media_types() + } + + /// Wrap an NV12 byte buffer into an `IMFSample` with the given timestamp. + /// A free associated fn (does not borrow `self`) so the caller can pass + /// `&self.nv12` without a clone while `self` is mutably borrowed elsewhere. + unsafe fn make_input_sample(nv12: &[u8], pts_100ns: i64) -> Result { + let sample: IMFSample = MFCreateSample().context("MFCreateSample")?; + let buffer = MFCreateMemoryBuffer(nv12.len() as u32).context("MFCreateMemoryBuffer")?; + + // Lock, copy NV12 in, set current length, unlock. + let mut data_ptr: *mut u8 = std::ptr::null_mut(); + let mut max_len: u32 = 0; + buffer + .Lock(&mut data_ptr, Some(&mut max_len), None) + .context("IMFMediaBuffer::Lock")?; + if (max_len as usize) < nv12.len() || data_ptr.is_null() { + let _ = buffer.Unlock(); + return Err(anyhow!("MF buffer too small for NV12 frame")); + } + std::ptr::copy_nonoverlapping(nv12.as_ptr(), data_ptr, nv12.len()); + buffer.SetCurrentLength(nv12.len() as u32)?; + buffer.Unlock()?; + + sample.AddBuffer(&buffer)?; + sample.SetSampleTime(pts_100ns)?; + // 33.367ms per frame at ~30fps, in 100ns units. + sample.SetSampleDuration(333_667)?; + Ok(sample) + } + + /// Drain one available output sample, if any. Returns the encoded bytes and + /// whether the MFT flagged it a keyframe (clean point). `Ok(None)` means the + /// MFT needs more input before it can produce output this tick. + unsafe fn drain_one_output(&mut self) -> Result, bool)>> { + let stream_info: MFT_OUTPUT_STREAM_INFO = self + .transform + .GetOutputStreamInfo(self.output_stream_id) + .context("GetOutputStreamInfo")?; + + // If the MFT does not allocate its own output samples we must provide one. + const MFT_OUTPUT_STREAM_PROVIDES_SAMPLES: u32 = 0x100; + let mft_provides = stream_info.dwFlags & MFT_OUTPUT_STREAM_PROVIDES_SAMPLES != 0; + + let mut out_buffer = MFT_OUTPUT_DATA_BUFFER { + dwStreamID: self.output_stream_id, + ..Default::default() + }; + + if !mft_provides { + let alloc_size = stream_info.cbSize.max(1); + let sample: IMFSample = MFCreateSample().context("MFCreateSample(out)")?; + let buffer = MFCreateMemoryBuffer(alloc_size).context("MFCreateMemoryBuffer(out)")?; + sample.AddBuffer(&buffer)?; + out_buffer.pSample = std::mem::ManuallyDrop::new(Some(sample)); + } + + let mut status: u32 = 0; + let mut bufs = [out_buffer]; + let hr = self.transform.ProcessOutput(0, &mut bufs, &mut status); + + // Take ownership of whatever sample is now in the buffer (ours or MFT's). + let produced = std::mem::ManuallyDrop::take(&mut bufs[0].pSample); + + match hr { + Ok(()) => { + let Some(sample) = produced else { + return Ok(None); + }; + let bytes = sample_to_vec(&sample)?; + let keyframe = sample_is_keyframe(&sample); + Ok(Some((bytes, keyframe))) + } + Err(e) if e.code() == MF_E_TRANSFORM_NEED_MORE_INPUT => Ok(None), + Err(e) => Err(anyhow!("ProcessOutput failed: {e:#}")), + } + } +} + +impl Encoder for H264Encoder { + fn encode(&mut self, frame: &CapturedFrame) -> Result { + self.sequence = self.sequence.wrapping_add(1); + + // H.264 4:2:0 needs even dimensions. Reject odd captures up front so we + // surface a clean error (the factory already fell back to raw if HW was + // missing; a per-frame error here lets the session log + continue). + if !frame.width.is_multiple_of(2) || !frame.height.is_multiple_of(2) { + return Err(anyhow!( + "H.264 requires even dimensions, got {}x{}", + frame.width, + frame.height + )); + } + + unsafe { + // Re-init on a resolution change. + if frame.width != self.width || frame.height != self.height { + self.reinit_for_size(frame.width, frame.height) + .context("H.264 re-init for new frame size")?; + } + + self.ensure_streaming()?; + + // BGRA -> NV12 into the reusable staging buffer. + let need = color::nv12_size(frame.width, frame.height); + if self.nv12.len() != need { + self.nv12.resize(need, 0); + } + color::bgra_to_nv12(&frame.data, frame.width, frame.height, &mut self.nv12) + .map_err(|e| anyhow!("BGRA->NV12 failed: {e}"))?; + + // PTS in 100ns units derived from the frame's capture instant. + let pts_100ns = (frame.timestamp.elapsed().as_nanos() / 100) as i64; + let sample = Self::make_input_sample(&self.nv12, pts_100ns)?; + + // Feed the encoder. NEED_MORE_INPUT is normal back-pressure handling; + // for the synchronous first cut we only push one frame per tick. + match self + .transform + .ProcessInput(self.input_stream_id, &sample, 0) + { + Ok(()) => {} + Err(e) if e.code() == MF_E_TRANSFORM_NEED_MORE_INPUT => {} + Err(e) => return Err(anyhow!("ProcessInput failed: {e:#}")), + } + + // Drain whatever output is ready. + let Some((data, mft_keyframe)) = self.drain_one_output()? else { + // No compressed output yet (encoder latency / GOP buffering). + // Emit an empty frame so the session skips sending this tick. + return Ok(EncodedFrame { + frame: VideoFrame::default(), + size: 0, + is_keyframe: false, + }); + }; + + let is_keyframe = mft_keyframe || self.force_keyframe; + self.force_keyframe = false; + + let size = data.len(); + let encoded = ProtoEncodedFrame { + data, + keyframe: is_keyframe, + pts: pts_100ns, + dts: pts_100ns, + }; + + Ok(EncodedFrame { + frame: VideoFrame { + timestamp: frame.timestamp.elapsed().as_millis() as i64, + display_id: frame.display_id as i32, + sequence: self.sequence as i32, + encoding: Some(video_frame::Encoding::H264(encoded)), + }, + size, + is_keyframe, + }) + } + } + + fn request_keyframe(&mut self) { + // A precise force-IDR uses the MFT codec API + // (CODECAPI_AVEncVideoForceKeyFrame); for the first cut we flag the next + // emitted frame as a keyframe so the viewer treats it as a clean point. + self.force_keyframe = true; + } + + fn name(&self) -> &str { + "h264-mediafoundation" + } +} + +impl Drop for H264Encoder { + fn drop(&mut self) { + unsafe { + if self.streaming { + let _ = self + .transform + .ProcessMessage(MFT_MESSAGE_NOTIFY_END_OF_STREAM, 0); + let _ = self + .transform + .ProcessMessage(MFT_MESSAGE_NOTIFY_END_STREAMING, 0); + } + // The IMFTransform releases when `self.transform` drops. + if self.mf_started { + let _ = MFShutdown(); + } + } + } +} + +/// MF version word expected by `MFStartup` (MF_VERSION = (MF_API_VERSION<<16)|MF_SDK_VERSION). +fn mf_version() -> u32 { + // MF_SDK_VERSION = 0x0002, MF_API_VERSION = 0x0070 -> 0x00020070. + 0x0002_0070 +} + +/// Derive a target average bitrate (bps) from the 1-100 quality knob and the +/// frame area. Tuned conservatively for desktop content (mostly static). +fn quality_to_bitrate(quality: u32, width: u32, height: u32) -> u32 { + let q = quality.clamp(1, 100) as u64; + let pixels = (width as u64) * (height as u64); + // Base ~0.06 bits/pixel/frame at 30fps for q=100, scaled by quality. + // bps = pixels * 30 * bpp; bpp scales 0.01..0.10 with quality. + let bpp_milli = 10 + (q * 90 / 100); // 0.010 .. 0.100 in milli-bits + let bps = pixels.saturating_mul(30).saturating_mul(bpp_milli) / 1000; + bps.clamp(500_000, 50_000_000) as u32 +} + +/// Pack (width, height) into the 64-bit MF_MT_FRAME_SIZE attribute. +#[cfg(windows)] +unsafe fn set_attr_size( + media_type: &IMFMediaType, + key: &windows::core::GUID, + width: u32, + height: u32, +) -> Result<()> { + let packed = ((width as u64) << 32) | (height as u64); + media_type.SetUINT64(key, packed)?; + Ok(()) +} + +/// Pack (numerator, denominator) into a 64-bit ratio MF attribute. +#[cfg(windows)] +unsafe fn set_attr_ratio( + media_type: &IMFMediaType, + key: &windows::core::GUID, + num: u32, + den: u32, +) -> Result<()> { + let packed = ((num as u64) << 32) | (den as u64); + media_type.SetUINT64(key, packed)?; + Ok(()) +} + +/// Copy all bytes out of an `IMFSample` (single contiguous buffer) into a Vec. +#[cfg(windows)] +unsafe fn sample_to_vec(sample: &IMFSample) -> Result> { + let buffer = sample + .ConvertToContiguousBuffer() + .context("ConvertToContiguousBuffer")?; + let mut ptr: *mut u8 = std::ptr::null_mut(); + let mut len: u32 = 0; + buffer + .Lock(&mut ptr, None, Some(&mut len)) + .context("output buffer Lock")?; + let out = if ptr.is_null() || len == 0 { + Vec::new() + } else { + std::slice::from_raw_parts(ptr, len as usize).to_vec() + }; + let _ = buffer.Unlock(); + Ok(out) +} + +/// Read the "clean point" (keyframe) flag off a sample, if present. +#[cfg(windows)] +unsafe fn sample_is_keyframe(sample: &IMFSample) -> bool { + use windows::Win32::Media::MediaFoundation::MFSampleExtension_CleanPoint; + sample + .GetUINT32(&MFSampleExtension_CleanPoint) + .map(|v| v != 0) + .unwrap_or(false) +} diff --git a/agent/src/encoder/mod.rs b/agent/src/encoder/mod.rs index 767306f..2c86a9d 100644 --- a/agent/src/encoder/mod.rs +++ b/agent/src/encoder/mod.rs @@ -1,16 +1,27 @@ //! Frame encoding module //! //! Encodes captured frames for transmission. Supports: -//! - Raw BGRA + Zstd compression (lowest latency, LAN mode) -//! - VP9 software encoding (universal fallback) -//! - H264 hardware encoding (when GPU available) +//! - Raw BGRA + Zstd compression (lowest latency, LAN mode; the guaranteed +//! fallback and the current default). +//! - H.264 hardware encoding via Windows Media Foundation (Task 7) — the +//! negotiated upgrade. Compile-verified; validated on real hardware in plan +//! Task 8. On any init/feed failure the factory or encoder falls back to raw. +//! +//! Codec selection is driven by the negotiated `VideoCodec` the server sends on +//! `StartStream` (see `select_codec` / `create_encoder_for`). The capability the +//! agent advertises to the server is detected by `capability::supports_hardware_h264`. +mod capability; +pub(crate) mod color; +#[cfg(windows)] +mod h264; mod raw; +pub use capability::supports_hardware_h264; pub use raw::RawEncoder; use crate::capture::CapturedFrame; -use crate::proto::VideoFrame; +use crate::proto::{video_frame, VideoCodec, VideoFrame}; use anyhow::Result; /// Encoded frame ready for transmission @@ -28,7 +39,12 @@ pub struct EncodedFrame { pub is_keyframe: bool, } -/// Frame encoder trait +/// Frame encoder trait. +/// +/// Every implementor turns a `CapturedFrame` (BGRA) into a wire `VideoFrame` +/// using one `video_frame::Encoding` variant. `RawEncoder` emits the `Raw` +/// variant; the H.264 encoder emits the `H264` variant. The factory +/// (`create_encoder_for`) selects the implementor from the negotiated codec. pub trait Encoder: Send { /// Encode a captured frame fn encode(&mut self, frame: &CapturedFrame) -> Result; @@ -42,13 +58,167 @@ pub trait Encoder: Send { fn name(&self) -> &str; } -/// Create an encoder based on configuration -pub fn create_encoder(codec: &str, quality: u32) -> Result> { +/// Map a configured/negotiated codec string to a `VideoCodec`. +/// +/// Used when constructing an encoder from the agent's own `EncodingConfig` +/// (before any server negotiation). Unknown / "auto" / "raw" all resolve to raw +/// — the safe default. "h264" resolves to H.264 (which itself falls back to raw +/// if MF init fails). +/// +/// Retained as the config-string entry point (used by `create_encoder` and the +/// unit tests); the live session negotiates via `select_codec` on a `VideoCodec`. +#[allow(dead_code)] +pub fn codec_from_str(codec: &str) -> VideoCodec { match codec.to_lowercase().as_str() { - "raw" | "zstd" => Ok(Box::new(RawEncoder::new(quality)?)), - // "vp9" => Ok(Box::new(Vp9Encoder::new(quality)?)), - // "h264" => Ok(Box::new(H264Encoder::new(quality)?)), - // "auto" and any unknown codec default to raw for now (best for LAN) - _ => Ok(Box::new(RawEncoder::new(quality)?)), + "h264" => VideoCodec::H264, + // "h265"/"hevc" are future opt-in (TODO) — treat as raw for now so we + // never select an unimplemented codec. + _ => VideoCodec::Raw, + } +} + +/// Choose the codec the agent will actually use for a stream, given the codec +/// the server negotiated and the agent's own hardware capability. +/// +/// This is the agent-side guard that keeps the raw fallback authoritative: +/// - The server only negotiates H.264 when the agent advertised support, but we +/// re-check `supports_hardware_h264()` here so a stale/misconfigured server +/// selection can never force an unsupported codec. +/// - H.265 is not implemented; it degrades to raw. +/// - Anything else is raw. +pub fn select_codec(negotiated: VideoCodec, hardware_h264_available: bool) -> VideoCodec { + match negotiated { + VideoCodec::H264 if hardware_h264_available => VideoCodec::H264, + // Server asked for H.264 but we have no HW encoder -> raw. + VideoCodec::H264 => VideoCodec::Raw, + // HEVC not implemented yet (TODO: Task 7 opt-in / future). + VideoCodec::H265 => VideoCodec::Raw, + VideoCodec::Raw => VideoCodec::Raw, + } +} + +/// Create an encoder for an explicit `VideoCodec`, with a transparent fallback +/// to raw if a hardware encoder cannot be constructed. +/// +/// `quality` is the 1-100 quality knob (mapped per-codec). On H.264 init failure +/// this logs and returns a raw encoder so the session keeps working. +pub fn create_encoder_for(codec: VideoCodec, quality: u32) -> Result> { + match codec { + VideoCodec::H264 => { + #[cfg(windows)] + { + match h264::H264Encoder::new(quality) { + Ok(enc) => { + tracing::info!("Using hardware H.264 encoder (Media Foundation)"); + Ok(Box::new(enc)) + } + Err(e) => { + tracing::warn!( + "H.264 encoder init failed ({e:#}); falling back to raw+Zstd" + ); + Ok(Box::new(RawEncoder::new(quality)?)) + } + } + } + #[cfg(not(windows))] + { + tracing::warn!("H.264 unsupported on this platform; using raw+Zstd"); + Ok(Box::new(RawEncoder::new(quality)?)) + } + } + // Raw (and anything that resolved to raw) uses the salvaged encoder. + VideoCodec::Raw | VideoCodec::H265 => Ok(Box::new(RawEncoder::new(quality)?)), + } +} + +/// Create an encoder based on a codec string (agent config path). +/// +/// Backwards-compatible entry point that builds an encoder from a codec STRING +/// (e.g. `EncodingConfig.codec`). Resolves the string to a `VideoCodec`, applies +/// the hardware-availability guard, then builds the encoder. The live session +/// uses `select_codec` + `create_encoder_for` (negotiated `VideoCodec`) instead; +/// this remains for the config path and is covered by unit tests. +#[allow(dead_code)] +pub fn create_encoder(codec: &str, quality: u32) -> Result> { + let requested = codec_from_str(codec); + let chosen = select_codec(requested, supports_hardware_h264()); + create_encoder_for(chosen, quality) +} + +/// Build an `EncodedFrame` carrying a single `video_frame::Encoding` payload. +/// Shared helper so encoders don't each repeat the `VideoFrame` wrapper. +#[allow(dead_code)] +pub(crate) fn wrap_video_frame( + timestamp_ms: i64, + display_id: i32, + sequence: i32, + encoding: video_frame::Encoding, + size: usize, + is_keyframe: bool, +) -> EncodedFrame { + EncodedFrame { + frame: VideoFrame { + timestamp: timestamp_ms, + display_id, + sequence, + encoding: Some(encoding), + }, + size, + is_keyframe, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn codec_from_str_maps_known_and_unknown() { + assert_eq!(codec_from_str("h264"), VideoCodec::H264); + assert_eq!(codec_from_str("H264"), VideoCodec::H264); + assert_eq!(codec_from_str("raw"), VideoCodec::Raw); + assert_eq!(codec_from_str("zstd"), VideoCodec::Raw); + assert_eq!(codec_from_str("auto"), VideoCodec::Raw); + assert_eq!(codec_from_str("vp9"), VideoCodec::Raw); + // HEVC not implemented -> raw, never H265. + assert_eq!(codec_from_str("h265"), VideoCodec::Raw); + assert_eq!(codec_from_str("hevc"), VideoCodec::Raw); + assert_eq!(codec_from_str(""), VideoCodec::Raw); + } + + #[test] + fn select_codec_honors_hardware_guard() { + // Server negotiated H.264 and HW is present -> H.264. + assert_eq!(select_codec(VideoCodec::H264, true), VideoCodec::H264); + // Server negotiated H.264 but no HW -> raw (never forced). + assert_eq!(select_codec(VideoCodec::H264, false), VideoCodec::Raw); + // Raw stays raw regardless of HW. + assert_eq!(select_codec(VideoCodec::Raw, true), VideoCodec::Raw); + assert_eq!(select_codec(VideoCodec::Raw, false), VideoCodec::Raw); + // HEVC always degrades to raw (unimplemented). + assert_eq!(select_codec(VideoCodec::H265, true), VideoCodec::Raw); + } + + #[test] + fn raw_factory_always_succeeds() { + // Raw must always construct (the guaranteed fallback). + let enc = create_encoder_for(VideoCodec::Raw, 75).unwrap(); + assert_eq!(enc.name(), "raw+zstd"); + } + + #[test] + fn create_encoder_string_path_resolves_to_raw_without_hw() { + // On a machine without a HW encoder (CI / non-Windows), "h264" must + // resolve to a working raw encoder, not an error. + let enc = create_encoder("h264", 75).unwrap(); + // Without HW it is raw; with HW it would be the H.264 encoder. We only + // assert it constructed. + let _ = enc.name(); + } + + #[test] + fn create_encoder_auto_is_raw() { + let enc = create_encoder("auto", 75).unwrap(); + assert_eq!(enc.name(), "raw+zstd"); } } diff --git a/agent/src/session/mod.rs b/agent/src/session/mod.rs index 817e9b5..5d749f6 100644 --- a/agent/src/session/mod.rs +++ b/agent/src/session/mod.rs @@ -61,6 +61,10 @@ pub struct SessionManager { input: Option, // Streaming state current_viewer_id: Option, + // Codec negotiated by the server for the current stream (Task 7). Set from + // StartStream.video_codec; the encoder is built from it (guarded by the + // agent's own hardware capability, with raw as the safe fallback). + negotiated_codec: crate::proto::VideoCodec, // System info for status reports hostname: String, is_elevated: bool, @@ -87,6 +91,8 @@ impl SessionManager { encoder: None, input: None, current_viewer_id: None, + // Default to RAW until the server negotiates otherwise (StartStream). + negotiated_codec: crate::proto::VideoCodec::Raw, hostname, is_elevated, start_time: Instant::now(), @@ -168,14 +174,20 @@ impl SessionManager { self.capturer = Some(capturer); tracing::info!("Capturer created successfully"); - // Create encoder with panic protection + // Create encoder from the NEGOTIATED codec (Task 7), guarded by the + // agent's own hardware capability. `create_encoder_for` selects the H.264 + // encoder only if it can actually be constructed, otherwise it returns a + // working raw encoder — so this never breaks the session. + let chosen = + encoder::select_codec(self.negotiated_codec, encoder::supports_hardware_h264()); tracing::debug!( - "Creating encoder (codec={}, quality={})...", - self.config.encoding.codec, + "Creating encoder (negotiated={:?}, chosen={:?}, quality={})...", + self.negotiated_codec, + chosen, self.config.encoding.quality ); let encoder = match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { - encoder::create_encoder(&self.config.encoding.codec, self.config.encoding.quality) + encoder::create_encoder_for(chosen, self.config.encoding.quality) })) { Ok(result) => result?, Err(e) => { @@ -232,6 +244,9 @@ impl SessionManager { organization: self.config.company.clone().unwrap_or_default(), site: self.config.site.clone().unwrap_or_default(), tags: self.config.tags.clone(), + // Advertise hardware H.264 capability so the server can negotiate the + // codec (Task 7). Detected once and cached by the encoder module. + supports_h264: encoder::supports_hardware_h264(), }; let msg = Message { @@ -336,6 +351,15 @@ impl SessionManager { match payload { message::Payload::StartStream(start) => { tracing::info!("StartStream received from viewer: {}", start.viewer_id); + // Apply the server-negotiated codec (Task 7) BEFORE + // building the encoder. An older server that omits the + // field sends 0 = VIDEO_CODEC_RAW, preserving the raw + // default. `select_codec` (in init_streaming) re-guards + // against missing hardware. + self.negotiated_codec = + crate::proto::VideoCodec::try_from(start.video_codec) + .unwrap_or(crate::proto::VideoCodec::Raw); + tracing::info!("Server negotiated codec: {:?}", self.negotiated_codec); if let Err(e) = self.init_streaming() { tracing::error!("Failed to init streaming: {}", e); } else { diff --git a/agent/src/viewer/decoder.rs b/agent/src/viewer/decoder.rs new file mode 100644 index 0000000..85dffa3 --- /dev/null +++ b/agent/src/viewer/decoder.rs @@ -0,0 +1,452 @@ +//! H.264 video decoder for the native viewer (Task 7). +//! +//! FIRST-CUT / COMPILE-VERIFIED ONLY. Decodes an H.264 elementary stream +//! (`EncodedFrame{h264}`) via a Media Foundation H.264 decoder MFT into NV12, +//! then converts NV12 -> BGRA so it can flow through the EXISTING raw render +//! path (`render::FrameData { compressed: false, BGRA }`). Not yet validated on +//! real hardware with a live stream — that is plan Task 8. On decode-init +//! failure the decoder reports an error and the viewer logs it; the raw-frame +//! render path is untouched for raw sessions. +//! +//! The decoder is created lazily on the first H.264 frame (so a raw session +//! never spins up MF). It is `!Send` (COM), so it lives on the viewer's receive +//! task and is wrapped accordingly by the caller. + +#![cfg(windows)] + +use anyhow::{anyhow, Context, Result}; +use windows::Win32::Media::MediaFoundation::{ + IMFMediaType, IMFSample, IMFTransform, MFCreateMediaType, MFCreateMemoryBuffer, MFCreateSample, + MFMediaType_Video, MFShutdown, MFStartup, MFTEnumEx, MFVideoFormat_H264, MFVideoFormat_NV12, + MFSTARTUP_LITE, MFT_CATEGORY_VIDEO_DECODER, MFT_ENUM_FLAG_SORTANDFILTER, MFT_ENUM_FLAG_SYNCMFT, + MFT_MESSAGE_NOTIFY_BEGIN_STREAMING, MFT_MESSAGE_NOTIFY_END_OF_STREAM, + MFT_MESSAGE_NOTIFY_END_STREAMING, MFT_MESSAGE_NOTIFY_START_OF_STREAM, MFT_OUTPUT_DATA_BUFFER, + MFT_OUTPUT_STREAM_INFO, MFT_REGISTER_TYPE_INFO, MF_E_TRANSFORM_NEED_MORE_INPUT, + MF_E_TRANSFORM_STREAM_CHANGE, MF_MT_FRAME_SIZE, MF_MT_MAJOR_TYPE, MF_MT_SUBTYPE, +}; + +/// A decoded NV12 frame and its dimensions, ready for NV12 -> BGRA conversion. +pub struct DecodedFrame { + pub width: u32, + pub height: u32, + /// BGRA pixels (4 bytes/px), ready for `render::FrameData`. + pub bgra: Vec, +} + +/// Media Foundation H.264 decoder wrapper. +pub struct H264Decoder { + transform: IMFTransform, + width: u32, + height: u32, + streaming: bool, + input_stream_id: u32, + output_stream_id: u32, + mf_started: bool, +} + +// NOTE: H264Decoder is intentionally NOT `Send`. It wraps COM interfaces with +// thread affinity and is created + used entirely on the dedicated `gc-h264-decode` +// OS thread (see viewer::spawn_h264_decode_worker), so it never crosses a thread +// boundary and does not need a Send assertion. + +impl H264Decoder { + /// Construct an H.264 decoder MFT and set its input type to H.264. The + /// output type (NV12) is negotiated after the first frames decode the + /// sequence header (we (re)read the real frame size on a stream change). + pub fn new() -> Result { + unsafe { + MFStartup(mf_version(), MFSTARTUP_LITE).context("MFStartup (decoder)")?; + let transform = match activate_decoder() { + Ok(t) => t, + Err(e) => { + let _ = MFShutdown(); + return Err(e); + } + }; + + let mut dec = Self { + transform, + width: 0, + height: 0, + streaming: false, + input_stream_id: 0, + output_stream_id: 0, + mf_started: true, + }; + + dec.configure_input()?; + Ok(dec) + } + } + + /// Set the decoder input type to H.264 (no fixed frame size — the decoder + /// learns it from the bitstream). + unsafe fn configure_input(&mut self) -> Result<()> { + let in_type: IMFMediaType = MFCreateMediaType().context("MFCreateMediaType(dec in)")?; + in_type.SetGUID(&MF_MT_MAJOR_TYPE, &MFMediaType_Video)?; + in_type.SetGUID(&MF_MT_SUBTYPE, &MFVideoFormat_H264)?; + self.transform + .SetInputType(self.input_stream_id, &in_type, 0) + .context("SetInputType(H264 decode)")?; + Ok(()) + } + + /// Set the decoder output type to NV12 once the stream size is known. + unsafe fn configure_output_nv12(&mut self) -> Result<()> { + let out_type: IMFMediaType = MFCreateMediaType().context("MFCreateMediaType(dec out)")?; + out_type.SetGUID(&MF_MT_MAJOR_TYPE, &MFMediaType_Video)?; + out_type.SetGUID(&MF_MT_SUBTYPE, &MFVideoFormat_NV12)?; + self.transform + .SetOutputType(self.output_stream_id, &out_type, 0) + .context("SetOutputType(NV12 decode)")?; + Ok(()) + } + + /// Read the negotiated output frame size from the decoder's current output type. + unsafe fn read_output_size(&mut self) -> Result<(u32, u32)> { + let out_type = self + .transform + .GetOutputCurrentType(self.output_stream_id) + .context("GetOutputCurrentType")?; + let packed = out_type + .GetUINT64(&MF_MT_FRAME_SIZE) + .context("read MF_MT_FRAME_SIZE")?; + let width = (packed >> 32) as u32; + let height = (packed & 0xFFFF_FFFF) as u32; + Ok((width, height)) + } + + unsafe fn ensure_streaming(&mut self) -> Result<()> { + if !self.streaming { + self.transform + .ProcessMessage(MFT_MESSAGE_NOTIFY_BEGIN_STREAMING, 0) + .context("decoder BEGIN_STREAMING")?; + self.transform + .ProcessMessage(MFT_MESSAGE_NOTIFY_START_OF_STREAM, 0) + .context("decoder START_OF_STREAM")?; + self.streaming = true; + } + Ok(()) + } + + /// Feed one H.264 access unit and return a decoded BGRA frame if one is + /// produced this tick. `Ok(None)` means the decoder needs more input (normal + /// while it buffers the first GOP). + pub fn decode(&mut self, h264: &[u8], pts_100ns: i64) -> Result> { + if h264.is_empty() { + return Ok(None); + } + unsafe { + self.ensure_streaming()?; + + let sample = make_input_sample(h264, pts_100ns)?; + match self + .transform + .ProcessInput(self.input_stream_id, &sample, 0) + { + Ok(()) => {} + Err(e) if e.code() == MF_E_TRANSFORM_NEED_MORE_INPUT => {} + Err(e) => return Err(anyhow!("decoder ProcessInput failed: {e:#}")), + } + + self.drain_one() + } + } + + /// Drain one decoded output sample, handling the initial NV12 output-type + /// negotiation (`MF_E_TRANSFORM_STREAM_CHANGE`). + unsafe fn drain_one(&mut self) -> Result> { + loop { + // If we have not yet set an output type, do so now (NV12). The first + // ProcessOutput typically returns STREAM_CHANGE until this is set. + if self.width == 0 { + // Try to set NV12 output; ignore failures here (the decoder may + // require a STREAM_CHANGE round-trip first). + let _ = self.configure_output_nv12(); + } + + let stream_info: MFT_OUTPUT_STREAM_INFO = self + .transform + .GetOutputStreamInfo(self.output_stream_id) + .context("decoder GetOutputStreamInfo")?; + + const MFT_OUTPUT_STREAM_PROVIDES_SAMPLES: u32 = 0x100; + let mft_provides = stream_info.dwFlags & MFT_OUTPUT_STREAM_PROVIDES_SAMPLES != 0; + + let mut out_buffer = MFT_OUTPUT_DATA_BUFFER { + dwStreamID: self.output_stream_id, + ..Default::default() + }; + + if !mft_provides { + let alloc = stream_info.cbSize.max(self.guess_nv12_size()); + let sample: IMFSample = MFCreateSample().context("MFCreateSample(dec out)")?; + let buffer = + MFCreateMemoryBuffer(alloc).context("MFCreateMemoryBuffer(dec out)")?; + sample.AddBuffer(&buffer)?; + out_buffer.pSample = std::mem::ManuallyDrop::new(Some(sample)); + } + + let mut status: u32 = 0; + let mut bufs = [out_buffer]; + let hr = self.transform.ProcessOutput(0, &mut bufs, &mut status); + let produced = std::mem::ManuallyDrop::take(&mut bufs[0].pSample); + + match hr { + Ok(()) => { + // (Re)read the negotiated size in case it just became known. + if let Ok((w, h)) = self.read_output_size() { + self.width = w; + self.height = h; + } + let Some(sample) = produced else { + return Ok(None); + }; + if self.width == 0 || self.height == 0 { + return Ok(None); + } + let nv12 = sample_to_vec(&sample)?; + let bgra = nv12_to_bgra(&nv12, self.width, self.height)?; + return Ok(Some(DecodedFrame { + width: self.width, + height: self.height, + bgra, + })); + } + Err(e) if e.code() == MF_E_TRANSFORM_NEED_MORE_INPUT => return Ok(None), + Err(e) if e.code() == MF_E_TRANSFORM_STREAM_CHANGE => { + // The decoder learned the frame size: (re)negotiate NV12 out, + // record the size, and retry the drain. + self.configure_output_nv12() + .context("decoder output renegotiation after stream change")?; + if let Ok((w, h)) = self.read_output_size() { + self.width = w; + self.height = h; + } + continue; + } + Err(e) => return Err(anyhow!("decoder ProcessOutput failed: {e:#}")), + } + } + } + + /// Conservative NV12 buffer estimate when the decoder doesn't report cbSize. + fn guess_nv12_size(&self) -> u32 { + if self.width != 0 && self.height != 0 { + self.width * self.height * 3 / 2 + } else { + // 1080p NV12 upper bound until the real size is known. + 1920 * 1080 * 3 / 2 + } + } +} + +impl Drop for H264Decoder { + fn drop(&mut self) { + unsafe { + if self.streaming { + let _ = self + .transform + .ProcessMessage(MFT_MESSAGE_NOTIFY_END_OF_STREAM, 0); + let _ = self + .transform + .ProcessMessage(MFT_MESSAGE_NOTIFY_END_STREAMING, 0); + } + if self.mf_started { + let _ = MFShutdown(); + } + } + } +} + +/// Enumerate and activate an H.264 decoder MFT (hardware preferred, software +/// acceptable — decode does not require a HW encoder). +unsafe fn activate_decoder() -> Result { + let input_type = MFT_REGISTER_TYPE_INFO { + guidMajorType: MFMediaType_Video, + guidSubtype: MFVideoFormat_H264, + }; + + let mut activate_ptr: *mut Option = + std::ptr::null_mut(); + let mut count: u32 = 0; + + // Allow both HW and SW decoders; SYNCMFT keeps the simple ProcessInput/Output + // contract this first cut uses. + MFTEnumEx( + MFT_CATEGORY_VIDEO_DECODER, + MFT_ENUM_FLAG_SYNCMFT | MFT_ENUM_FLAG_SORTANDFILTER, + Some(&input_type as *const _), + None, + &mut activate_ptr, + &mut count, + ) + .context("MFTEnumEx (H264 decoder)")?; + + if count == 0 || activate_ptr.is_null() { + if !activate_ptr.is_null() { + windows::Win32::System::Com::CoTaskMemFree(Some(activate_ptr as *const _)); + } + return Err(anyhow!("no H.264 decoder MFT available")); + } + + let slice = std::slice::from_raw_parts_mut(activate_ptr, count as usize); + let mut chosen: Option = None; + for entry in slice.iter_mut() { + if chosen.is_none() { + if let Some(activate) = entry.as_ref() { + if let Ok(t) = activate.ActivateObject::() { + chosen = Some(t); + } + } + } + entry.take(); + } + windows::Win32::System::Com::CoTaskMemFree(Some(activate_ptr as *const _)); + + chosen.ok_or_else(|| anyhow!("failed to activate H.264 decoder MFT")) +} + +/// Wrap an H.264 access unit into an IMFSample. +unsafe fn make_input_sample(data: &[u8], pts_100ns: i64) -> Result { + let sample: IMFSample = MFCreateSample().context("MFCreateSample(dec in)")?; + let buffer = MFCreateMemoryBuffer(data.len() as u32).context("MFCreateMemoryBuffer(dec in)")?; + + let mut ptr: *mut u8 = std::ptr::null_mut(); + let mut max_len: u32 = 0; + buffer + .Lock(&mut ptr, Some(&mut max_len), None) + .context("decoder input Lock")?; + if (max_len as usize) < data.len() || ptr.is_null() { + let _ = buffer.Unlock(); + return Err(anyhow!("MF buffer too small for H.264 access unit")); + } + std::ptr::copy_nonoverlapping(data.as_ptr(), ptr, data.len()); + buffer.SetCurrentLength(data.len() as u32)?; + buffer.Unlock()?; + + sample.AddBuffer(&buffer)?; + sample.SetSampleTime(pts_100ns)?; + Ok(sample) +} + +/// Copy a sample's contiguous bytes into a Vec. +unsafe fn sample_to_vec(sample: &IMFSample) -> Result> { + let buffer = sample + .ConvertToContiguousBuffer() + .context("decoder ConvertToContiguousBuffer")?; + let mut ptr: *mut u8 = std::ptr::null_mut(); + let mut len: u32 = 0; + buffer + .Lock(&mut ptr, None, Some(&mut len)) + .context("decoder output Lock")?; + let out = if ptr.is_null() || len == 0 { + Vec::new() + } else { + std::slice::from_raw_parts(ptr, len as usize).to_vec() + }; + let _ = buffer.Unlock(); + Ok(out) +} + +/// MF version word for `MFStartup` (see encoder::h264). +fn mf_version() -> u32 { + 0x0002_0070 +} + +/// Convert an NV12 buffer to BGRA (BT.601 limited range). Inverse of the +/// encoder's BGRA->NV12. Shared with the unit tests below. +pub fn nv12_to_bgra(nv12: &[u8], width: u32, height: u32) -> Result> { + let w = width as usize; + let h = height as usize; + let y_size = w * h; + let need = y_size * 3 / 2; + if nv12.len() < need { + return Err(anyhow!("NV12 buffer too small: {} < {}", nv12.len(), need)); + } + + let (y_plane, uv_plane) = nv12.split_at(y_size); + let mut bgra = vec![0u8; w * h * 4]; + let chroma_cols = w / 2; + + for row in 0..h { + for col in 0..w { + let y = y_plane[row * w + col] as i32; + let cx = col / 2; + let cy = row / 2; + let uv_idx = (cy * chroma_cols + cx) * 2; + let u = uv_plane[uv_idx] as i32; + let v = uv_plane[uv_idx + 1] as i32; + + // BT.601 limited-range YUV -> RGB. + let c = y - 16; + let d = u - 128; + let e = v - 128; + let r = ((298 * c + 409 * e + 128) >> 8).clamp(0, 255); + let g = ((298 * c - 100 * d - 208 * e + 128) >> 8).clamp(0, 255); + let b = ((298 * c + 516 * d + 128) >> 8).clamp(0, 255); + + let px = (row * w + col) * 4; + bgra[px] = b as u8; + bgra[px + 1] = g as u8; + bgra[px + 2] = r as u8; + bgra[px + 3] = 255; + } + } + + Ok(bgra) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::encoder::color::{bgra_to_nv12, nv12_size}; + + /// Round-trip a solid color through BGRA->NV12->BGRA. Chroma subsampling and + /// limited-range rounding introduce small error, so allow a tolerance. + #[test] + fn nv12_bgra_roundtrip_is_approximately_lossless_for_solid_color() { + let w = 4u32; + let h = 4u32; + // Mid gray. + let mut bgra = vec![0u8; (w * h * 4) as usize]; + for px in bgra.chunks_mut(4) { + px[0] = 120; // B + px[1] = 120; // G + px[2] = 120; // R + px[3] = 255; + } + let mut nv12 = vec![0u8; nv12_size(w, h)]; + bgra_to_nv12(&bgra, w, h, &mut nv12).unwrap(); + let back = nv12_to_bgra(&nv12, w, h).unwrap(); + + for (orig, got) in bgra.chunks(4).zip(back.chunks(4)) { + for ch in 0..3 { + let diff = (orig[ch] as i32 - got[ch] as i32).abs(); + assert!(diff <= 6, "channel {ch} drift {diff} too large"); + } + assert_eq!(got[3], 255, "alpha must be opaque"); + } + } + + #[test] + fn nv12_to_bgra_rejects_short_buffer() { + let nv12 = vec![0u8; 4]; + assert!(nv12_to_bgra(&nv12, 16, 16).is_err()); + } + + #[test] + fn black_nv12_decodes_to_black_bgra() { + // Limited-range black: Y=16, UV=128. + let w = 2u32; + let h = 2u32; + let mut nv12 = vec![128u8; nv12_size(w, h)]; + for y in nv12.iter_mut().take((w * h) as usize) { + *y = 16; + } + let bgra = nv12_to_bgra(&nv12, w, h).unwrap(); + for px in bgra.chunks(4) { + assert!(px[0] <= 2 && px[1] <= 2 && px[2] <= 2, "near-black"); + } + } +} diff --git a/agent/src/viewer/mod.rs b/agent/src/viewer/mod.rs index 6366b8c..351dfc6 100644 --- a/agent/src/viewer/mod.rs +++ b/agent/src/viewer/mod.rs @@ -3,6 +3,8 @@ //! This module provides the viewer functionality for connecting to remote //! GuruConnect sessions with low-level keyboard hooks for Win key capture. +#[cfg(windows)] +mod decoder; mod input; mod render; mod transport; @@ -31,6 +33,72 @@ pub enum InputEvent { SpecialKey(proto::SpecialKeyEvent), } +/// Spawn the dedicated H.264 decode worker thread (Task 7, Windows only). +/// +/// Returns a sender for `(h264_access_unit, pts_100ns)`. The worker lazily +/// creates the Media Foundation decoder on the first frame; if creation fails it +/// logs once and then silently drops subsequent frames (the raw render path is +/// never affected). Each decoded frame is converted to BGRA and delivered to the +/// viewer as an uncompressed `FrameData`, reusing the existing render path. +#[cfg(windows)] +fn spawn_h264_decode_worker( + viewer_tx: mpsc::Sender, +) -> std::sync::mpsc::Sender<(Vec, i64)> { + let (tx, rx) = std::sync::mpsc::channel::<(Vec, i64)>(); + + std::thread::Builder::new() + .name("gc-h264-decode".to_string()) + .spawn(move || { + let mut decoder: Option = None; + let mut init_failed = false; + + while let Ok((data, pts)) = rx.recv() { + if init_failed { + continue; + } + if decoder.is_none() { + match decoder::H264Decoder::new() { + Ok(d) => { + info!("H.264 decoder initialized (Media Foundation)"); + decoder = Some(d); + } + Err(e) => { + error!( + "H.264 decoder init failed: {e:#}; H.264 frames will be dropped" + ); + init_failed = true; + continue; + } + } + } + + let dec = decoder.as_mut().expect("decoder present after init"); + match dec.decode(&data, pts) { + Ok(Some(decoded)) => { + let frame = render::FrameData { + width: decoded.width, + height: decoded.height, + data: decoded.bgra, + compressed: false, // already BGRA + is_keyframe: false, + }; + if viewer_tx.blocking_send(ViewerEvent::Frame(frame)).is_err() { + // Viewer closed; stop the worker. + break; + } + } + Ok(None) => { /* decoder buffering; no output this tick */ } + Err(e) => { + warn!("H.264 decode error: {e:#}"); + } + } + } + }) + .expect("failed to spawn H.264 decode worker thread"); + + tx +} + /// Run the viewer to connect to a remote session pub async fn run(server_url: &str, session_id: &str, api_key: &str) -> Result<()> { info!("GuruConnect Viewer starting"); @@ -77,13 +145,23 @@ pub async fn run(server_url: &str, session_id: &str, api_key: &str) -> Result<() } }); + // H.264 decode worker (Task 7, Windows only). The Media Foundation decoder + // wraps COM interfaces with thread affinity, so it runs on a DEDICATED OS + // thread (not a tokio task, which can migrate across workers at await + // points). The receive task forwards H.264 access units to it over a std + // channel; the worker decodes to BGRA and pushes a FrameData back through + // the viewer channel via `blocking_send`. On decoder-init failure the worker + // logs and drops H.264 frames (the raw path is unaffected). + #[cfg(windows)] + let h264_tx = spawn_h264_decode_worker(viewer_tx.clone()); + // Spawn task to receive messages from server let viewer_tx_recv = viewer_tx.clone(); let receive_task = tokio::spawn(async move { while let Some(msg) = ws_receiver.recv().await { match msg.payload { - Some(proto::message::Payload::VideoFrame(frame)) => { - if let Some(proto::video_frame::Encoding::Raw(raw)) = frame.encoding { + Some(proto::message::Payload::VideoFrame(frame)) => match frame.encoding { + Some(proto::video_frame::Encoding::Raw(raw)) => { let frame_data = render::FrameData { width: raw.width as u32, height: raw.height as u32, @@ -93,7 +171,23 @@ pub async fn run(server_url: &str, session_id: &str, api_key: &str) -> Result<() }; let _ = viewer_tx_recv.send(ViewerEvent::Frame(frame_data)).await; } - } + Some(proto::video_frame::Encoding::H264(enc)) => { + // Forward to the decode worker (Windows). On other + // platforms H.264 is never negotiated, so this is dead. + #[cfg(windows)] + { + if h264_tx.send((enc.data, enc.pts)).is_err() { + warn!("H.264 decode worker unavailable; dropping frame"); + } + } + #[cfg(not(windows))] + { + let _ = enc; + } + } + // VP9/H265 not implemented on the viewer (raw + H.264 only). + _ => {} + }, Some(proto::message::Payload::CursorPosition(pos)) => { let _ = viewer_tx_recv .send(ViewerEvent::CursorPosition(pos.x, pos.y, pos.visible)) diff --git a/proto/guruconnect.proto b/proto/guruconnect.proto index 1a2fbdb..842040d 100644 --- a/proto/guruconnect.proto +++ b/proto/guruconnect.proto @@ -17,6 +17,23 @@ message SessionResponse { string session_id = 2; string error = 3; DisplayInfo display_info = 4; + // Negotiated video codec for this session (Task 7). The plan models the + // codec selection here; however the LIVE server->agent handshake in v2 is + // done over WebSocket query params + StartStream (SessionRequest/ + // SessionResponse are not exchanged on the wire today), so the codec the + // agent actually applies is carried on StartStream.video_codec below. This + // field is kept for spec parity / future use; raw is the safe default. + VideoCodec video_codec = 5; +} + +// Negotiated video codec (Task 7). RAW = salvaged BGRA+Zstd+dirty-rects, the +// guaranteed fallback and current default. H264 = hardware Media Foundation +// encode, the negotiated upgrade (compile-verified; validated on hardware in +// plan Task 8). H265/HEVC is future opt-in (TODO). +enum VideoCodec { + VIDEO_CODEC_RAW = 0; // Raw BGRA + Zstd (safe default / fallback) + VIDEO_CODEC_H264 = 1; // Hardware H.264 via Media Foundation + VIDEO_CODEC_H265 = 2; // Future / opt-in (not implemented) } enum SessionType { @@ -268,6 +285,13 @@ message Disconnect { message StartStream { string viewer_id = 1; // ID of viewer requesting stream int32 display_id = 2; // Which display to stream (0 = primary) + // Negotiated codec for this stream (Task 7). The server selects this from + // the agent's advertised capability (AgentStatus.supports_h264) and its + // policy (DEFAULT_PREFER_H264, currently false so we never ship unvalidated + // H.264 as the default). The agent builds its encoder from this value; + // VIDEO_CODEC_RAW (0, the default for older servers) keeps the salvaged + // raw+Zstd path. On any H.264 init failure the agent falls back to raw. + VideoCodec video_codec = 3; } // Server commands agent to stop streaming @@ -287,6 +311,12 @@ message AgentStatus { string organization = 8; // Company/organization name string site = 9; // Site/location name repeated string tags = 10; // Tags for categorization + // HW-encode capability (Task 7). True when the agent enumerated a HARDWARE + // H.264 encoder MFT at startup (MFTEnumEx, MFT_CATEGORY_VIDEO_ENCODER, + // MFVideoFormat_H264, hardware flag). The server uses this for codec + // negotiation (see StartStream.video_codec). Detected once and cached; + // false on non-Windows / no HW encoder / MF unavailable. + bool supports_h264 = 11; } // Server commands agent to uninstall itself diff --git a/server/src/relay/mod.rs b/server/src/relay/mod.rs index 7645fba..7a49849 100644 --- a/server/src/relay/mod.rs +++ b/server/src/relay/mod.rs @@ -805,6 +805,7 @@ async fn handle_agent_connection( organization.clone(), site.clone(), status.tags.clone(), + status.supports_h264, ) .await; diff --git a/server/src/session/mod.rs b/server/src/session/mod.rs index bc32cf7..95bc2be 100644 --- a/server/src/session/mod.rs +++ b/server/src/session/mod.rs @@ -93,6 +93,36 @@ pub struct Session { pub organization: Option, // Company/organization name pub site: Option, // Site/location name pub tags: Vec, // Tags for categorization + /// Whether the agent advertised a hardware H.264 encoder (Task 7). Set from + /// `AgentStatus.supports_h264`; drives codec negotiation in `select_video_codec`. + pub supports_h264: bool, +} + +/// Default codec-negotiation policy (Task 7). +/// +/// `false` means: even when an agent advertises hardware H.264 support, the +/// server still negotiates RAW. H.264 is compile-verified only and not yet +/// validated on real hardware (plan Task 8), so we deliberately do NOT ship it +/// as the default — raw+Zstd stays the guaranteed working path. Flip this to +/// `true` once H.264 is live-validated, or make it per-tenant policy later. +pub const DEFAULT_PREFER_H264: bool = false; + +/// Negotiate the video codec for a stream (Task 7). +/// +/// Pure decision function (unit-tested): given whether the agent advertised +/// hardware H.264 and whether policy prefers H.264, pick the codec. H.264 is +/// chosen ONLY when both the agent supports it AND policy allows it; otherwise +/// raw — the safe default/fallback. HEVC is intentionally never selected here +/// (future opt-in; TODO). +pub fn select_video_codec( + agent_supports_h264: bool, + prefer_h264: bool, +) -> crate::proto::VideoCodec { + if agent_supports_h264 && prefer_h264 { + crate::proto::VideoCodec::H264 + } else { + crate::proto::VideoCodec::Raw + } } /// Channel for sending frames from agent to viewers @@ -206,6 +236,7 @@ impl SessionManager { organization: None, site: None, tags: Vec::new(), + supports_h264: false, }; let session_data = SessionData { @@ -240,12 +271,14 @@ impl SessionManager { organization: Option, site: Option, tags: Vec, + supports_h264: bool, ) { let mut sessions = self.sessions.write().await; if let Some(session_data) = sessions.get_mut(&session_id) { session_data.info.last_heartbeat = chrono::Utc::now(); session_data.last_heartbeat_instant = Instant::now(); session_data.info.is_streaming = is_streaming; + session_data.info.supports_h264 = supports_h264; if let Some(os) = os_version { session_data.info.os_version = Some(os); } @@ -409,10 +442,23 @@ impl SessionManager { use crate::proto; use prost::Message; + // Negotiate the video codec for this stream (Task 7): H.264 only when the + // agent advertised hardware support AND policy prefers it. With + // DEFAULT_PREFER_H264 = false this always resolves to RAW today (H.264 is + // compile-verified only, validated on hardware in Task 8). + let codec = select_video_codec(session_data.info.supports_h264, DEFAULT_PREFER_H264); + tracing::info!( + "StartStream codec negotiation: agent_supports_h264={}, prefer_h264={} -> {:?}", + session_data.info.supports_h264, + DEFAULT_PREFER_H264, + codec + ); + let start_stream = proto::Message { payload: Some(proto::message::Payload::StartStream(proto::StartStream { viewer_id: viewer_id.to_string(), display_id: 0, // Primary display + video_codec: codec as i32, })), }; @@ -618,6 +664,7 @@ impl SessionManager { organization: None, site: None, tags: Vec::new(), + supports_h264: false, }; // Create placeholder channels (will be replaced on reconnect) @@ -717,6 +764,69 @@ mod tests { .is_some()); } + #[test] + fn codec_negotiation_picks_h264_only_when_supported_and_preferred() { + use crate::proto::VideoCodec; + + // Agent supports H.264 AND policy prefers it -> H.264. + assert_eq!(select_video_codec(true, true), VideoCodec::H264); + // Agent supports it but policy does not prefer it -> raw (the safe default). + assert_eq!(select_video_codec(true, false), VideoCodec::Raw); + // Policy prefers H.264 but the agent has no HW encoder -> raw. + assert_eq!(select_video_codec(false, true), VideoCodec::Raw); + // Neither -> raw. + assert_eq!(select_video_codec(false, false), VideoCodec::Raw); + } + + #[test] + fn default_policy_does_not_prefer_h264() { + // Guardrail: until H.264 is hardware-validated (Task 8) the default policy + // MUST keep raw as the negotiated codec even for capable agents. We assert + // the OBSERVABLE behavior (codec selection under the default policy) rather + // than the constant directly, which keeps the test meaningful if the policy + // later becomes dynamic. + let chosen = select_video_codec(true, DEFAULT_PREFER_H264); + assert_eq!( + chosen, + crate::proto::VideoCodec::Raw, + "default policy must negotiate raw until H.264 is hardware-validated" + ); + } + + #[tokio::test] + async fn agent_status_updates_h264_capability() { + let mgr = SessionManager::new(); + let (session_id, _frame_tx, _input_rx) = mgr + .register_agent("agent-cap".to_string(), "Cap PC".to_string(), true) + .await; + + // Default is false until a status reports capability. + assert_eq!( + mgr.get_session(session_id).await.map(|s| s.supports_h264), + Some(false) + ); + + mgr.update_agent_status( + session_id, + Some("Windows".to_string()), + true, + 10, + 1, + false, + Some("0.2.0".to_string()), + None, + None, + Vec::new(), + true, // supports_h264 + ) + .await; + + assert_eq!( + mgr.get_session(session_id).await.map(|s| s.supports_h264), + Some(true) + ); + } + #[tokio::test] async fn denied_attended_session_keeps_viewer_blocked() { let mgr = SessionManager::new(); diff --git a/specs/v2-secure-session-core/plan.md b/specs/v2-secure-session-core/plan.md index a9ba7c3..3f3efb1 100644 --- a/specs/v2-secure-session-core/plan.md +++ b/specs/v2-secure-session-core/plan.md @@ -393,11 +393,104 @@ Reference: SPEC-002 §4.1/§4.2; salvage ledger §2; `agent/src/input/keyboard.r --- -## Task 7: Hardware H.264 encode + negotiated raw/Zstd fallback +## Task 7 [IMPLEMENTED 2026-05-30 — self-verified on local Windows toolchain: `cargo fmt --all --check` clean, `cargo clippy --workspace --all-targets --all-features -- -D warnings` exit 0, `cargo test --workspace` 89 pass (36 agent + 53 server; was 70, no regressions), `cargo build --workspace` ok; pending Code Review]: Hardware H.264 encode + negotiated raw/Zstd fallback -Files touched: `agent/src/encoder/` (`mod.rs`, `h264.rs` [new], `raw.rs` [salvaged]), -`agent/src/capture/` (feed), `agent/src/viewer/` (decode), `proto/guruconnect.proto` -(`AgentStatus` capability, `SessionResponse` codec), `server/src/session/mod.rs` (negotiation). +> [IMPLEMENTED] Raw+Zstd remains the DEFAULT and guaranteed fallback; H.264 is a +> negotiated upgrade that is COMPILE-VERIFIED ONLY (live MF encode/decode is Task +> 8 — needs real GPU + frames). The testable parts (abstraction, factory, +> negotiation, capability plumbing, color-conversion math) are done solidly with +> unit tests; the MF H.264 encoder and viewer decoder are first-cut, clearly +> marked, and gated behind a default-off policy so unvalidated H.264 never ships +> as the default. +> +> 1. ENCODER ABSTRACTION (`agent/src/encoder/mod.rs`): the existing `Encoder` +> trait (`encode(&mut self, &CapturedFrame) -> Result`) is the +> abstraction; `RawEncoder` (salvaged raw+Zstd+dirty-rects, UNCHANGED behavior) +> and the new `H264Encoder` both implement it. Factory split into pure pieces: +> `codec_from_str` (config-string -> `VideoCodec`), `select_codec(negotiated, +> hardware_available)` (agent-side guard: H.264 only if HW present, HEVC->raw, +> else raw), and `create_encoder_for(VideoCodec, quality)` (builds the encoder; +> on H.264 init failure logs + returns a RAW encoder so the session never +> breaks). UNIT-TESTED: codec_from_str mapping, select_codec guard matrix, raw +> factory always succeeds, string path resolves to raw without HW. +> 2. CAPABILITY + NEGOTIATION (testable, done well): +> - `encoder/capability.rs`: `supports_hardware_h264()` probes MF once +> (`MFTEnumEx(MFT_CATEGORY_VIDEO_ENCODER, MFT_ENUM_FLAG_HARDWARE, +> MFVideoFormat_H264)`), caches the bool via `OnceLock`; false on non-Windows +> / no HW / MF error. Advertised in `AgentStatus.supports_h264` (proto field +> 11, additive). +> - Server (`server/src/session/mod.rs`): `select_video_codec(agent_supports, +> prefer_h264)` is a PURE decision fn — H.264 only when BOTH the agent +> supports it AND policy prefers it, else raw. Policy constant +> `DEFAULT_PREFER_H264 = false` (documented: keeps raw as the negotiated codec +> until H.264 is hardware-validated). `supports_h264` stored on the in-memory +> `Session` from `AgentStatus` (`update_agent_status` gained the param). The +> negotiated codec is stamped on `StartStream.video_codec` in +> `send_start_stream_internal` (the LIVE server->agent codec-selection point — +> SessionRequest/SessionResponse are not exchanged on the wire in v2, so the +> proto's `SessionResponse.video_codec` is kept for spec parity but the live +> path uses `StartStream`). UNIT-TESTED: the negotiation matrix, the +> default-policy guardrail (capable agent still gets raw), and the +> `AgentStatus -> supports_h264` ingest. +> - Agent applies it: `StartStream` handler decodes `video_codec`, stores +> `negotiated_codec`, and `init_streaming` builds the encoder via +> `select_codec` + `create_encoder_for` (re-guards on local HW; older server +> sends 0 = RAW, preserving the default). +> 3. MF H.264 ENCODER (`agent/src/encoder/h264.rs`, FIRST-CUT, compile-verified +> only): enumerates+activates a HW H.264 encoder MFT, sets H.264 output then +> NV12 input media types (frame size/rate, bitrate from quality), feeds frames +> (`ProcessInput`) and drains synchronously (`ProcessOutput`, NEED_MORE_INPUT = +> "no output this tick"), emitting `VideoFrame{H264(EncodedFrame{data, keyframe, +> pts, dts})}`. BGRA->NV12 via `encoder/color.rs` (BT.601 limited-range, 2x2 box +> chroma; isolated + UNIT-TESTED: size, odd-dim/short-buffer rejection, black/ +> white/red reference values, plane coverage). On ANY init failure the FACTORY +> falls back to raw (logged); per-frame errors surface to the session (which +> logs + continues). Handles resolution change (re-init), keyframe flag +> (CleanPoint), MF buffer alloc for non-sample-providing MFTs. NOT yet live: the +> async-MFT event model is documented as a Task-8 refinement (this cut drains +> synchronously); precise force-IDR (CODECAPI) is a TODO; D3D11 zero-copy +> deferred (feeds CPU NV12). +> 4. VIEWER H.264 DECODE (`agent/src/viewer/decoder.rs` [new], FIRST-CUT, +> compile-verified only): MF H.264 decoder MFT -> NV12 -> BGRA +> (`nv12_to_bgra`, BT.601 inverse, UNIT-TESTED round-trip within tolerance + +> short-buffer + black). Runs on a DEDICATED OS thread (`gc-h264-decode`), NOT a +> tokio task — the MF decoder has COM thread affinity and a tokio task can +> migrate across workers at await points. The receive task forwards H.264 access +> units over a std channel; the worker decodes and pushes BGRA `FrameData` +> through the existing render path via `blocking_send`. On decoder-init failure +> it logs once and drops H.264 frames; the RAW render path is untouched. Handles +> the `MF_E_TRANSFORM_STREAM_CHANGE` NV12 output renegotiation + size discovery. +> 5. RAW STILL WORKS END-TO-END: `RawEncoder` is unchanged; with +> `DEFAULT_PREFER_H264 = false` the server negotiates RAW for every session +> (including capable agents), the agent builds the raw encoder, and the viewer's +> existing `Raw` branch renders it — the guaranteed default/fallback path is +> fully intact and is what runs today. +> +> PROTO (additive — no field renumbered): `VideoCodec` enum (RAW=0, H264=1, +> H265=2); `SessionResponse.video_codec = 5` (spec parity); `StartStream.video_codec +> = 3` (live negotiation); `AgentStatus.supports_h264 = 11` (capability). HEVC is a +> documented TODO/opt-in everywhere (never selected). Cargo.toml: added the +> `Win32_Media_MediaFoundation` + COM windows features (no new external crates). +> +> COMPILE-VERIFIED-ONLY / NEEDS LIVE HARDWARE (Task 8): the MF H.264 encoder +> init/feed/emit on a real GPU, the viewer MF decoder on a live stream, the +> BGRA<->NV12 fidelity end-to-end, and the synchronous-drain timing. The encoder/ +> decoder are structured to fall back to raw (encoder) / drop frames + log +> (decoder) on any failure so they cannot break a session even if MF misbehaves. +> +> TESTS ADDED (19): agent +16 (encoder factory/select matrix x5, color BGRA->NV12 +> x8, decoder NV12<->BGRA x3), server +3 (codec negotiation matrix, default-policy +> guardrail, AgentStatus capability ingest). + +Files touched: `proto/guruconnect.proto` (`VideoCodec` enum + `SessionResponse.video_codec` ++ `StartStream.video_codec` + `AgentStatus.supports_h264`), `agent/Cargo.toml` (MF/COM windows +features), `agent/src/encoder/mod.rs` (trait/factory/select), `agent/src/encoder/raw.rs` +(salvaged, unchanged), `agent/src/encoder/h264.rs` [new], `agent/src/encoder/capability.rs` [new], +`agent/src/encoder/color.rs` [new], `agent/src/session/mod.rs` (negotiated codec apply + +`supports_h264` advertise), `agent/src/viewer/mod.rs` (H.264 route + decode worker), +`agent/src/viewer/decoder.rs` [new], `server/src/session/mod.rs` (`select_video_codec` + +`DEFAULT_PREFER_H264` + `supports_h264` field/ingest + `StartStream` codec stamp), +`server/src/relay/mod.rs` (pass `supports_h264` from `AgentStatus`). - HW **H.264** via Windows Media Foundation (transparently NVENC/AMF/QuickSync) emitting the proto's `EncodedFrame` (h264). Native viewer decodes via MF/D3D11.