//! Hardware H.264 encoder via Windows Media Foundation (Task 7).
//!
//! FIRST-CUT / COMPILE-VERIFIED ONLY. This encoder is wired end-to-end (init ->
//! feed -> drain -> emit `EncodedFrame{h264}`) and is selected only when the
//! agent advertised hardware support AND the server negotiated H.264. It has NOT
//! been validated on real hardware with live frames — that is plan Task 8. On
//! ANY initialization or per-frame failure it surfaces an error; the encoder
//! factory (`create_encoder_for`) downgrades to the raw+Zstd encoder so a
//! session never breaks because of H.264.
//!
//! Pipeline:
//!   BGRA capture --(color::bgra_to_nv12)--> NV12 sample --> MFT(H.264) --> H.264
//!   Annex-B/length-prefixed elementary stream --> proto EncodedFrame.
//!
//! Design notes:
//! - The MFT is enumerated with `MFTEnumEx(MFT_CATEGORY_VIDEO_ENCODER,
//!   MFT_ENUM_FLAG_HARDWARE, …, MFVideoFormat_H264)` (same probe as
//!   `capability`). We `ActivateObject` the first match.
//! - Input is configured as NV12, output as H.264, with frame size, frame rate
//!   and an average bitrate derived from `quality`.
//! - Both the SYNCHRONOUS MFT model (ProcessInput/ProcessOutput) and the
//!   ASYNCHRONOUS hardware-MFT model (METransformNeedInput / METransformHaveOutput
//!   events) exist. To keep this first cut bounded and predictable we DRAIN the
//!   MFT synchronously after each input and treat `MF_E_TRANSFORM_NEED_MORE_INPUT`
//!   as "no output this tick". A fully async event-driven loop is a Task-8
//!   refinement (documented below).
//! - `MFT_MESSAGE_SET_D3D_MANAGER` is intentionally NOT set — we feed CPU NV12
//!   buffers (software input samples), which every HW H.264 MFT accepts. D3D11
//!   zero-copy is a later optimization.
#![cfg(windows)]

use super::{EncodedFrame, Encoder};
use crate::capture::CapturedFrame;
use crate::encoder::color;
use crate::proto::{video_frame, EncodedFrame as ProtoEncodedFrame, VideoFrame};
use anyhow::{anyhow, Context, Result};
use windows::Win32::Media::MediaFoundation::{
    IMFActivate, IMFMediaType, IMFSample, IMFTransform, MFCreateMediaType, MFCreateMemoryBuffer,
    MFCreateSample, MFMediaType_Video, MFShutdown, MFStartup, MFTEnumEx, MFVideoFormat_H264,
    MFVideoFormat_NV12, MFVideoInterlace_Progressive, MFSTARTUP_LITE, MFT_CATEGORY_VIDEO_ENCODER,
    MFT_ENUM_FLAG_HARDWARE, MFT_ENUM_FLAG_SORTANDFILTER, MFT_ENUM_FLAG_TRANSCODE_ONLY,
    MFT_MESSAGE_COMMAND_FLUSH, MFT_MESSAGE_NOTIFY_BEGIN_STREAMING,
    MFT_MESSAGE_NOTIFY_END_OF_STREAM, MFT_MESSAGE_NOTIFY_END_STREAMING,
    MFT_MESSAGE_NOTIFY_START_OF_STREAM, MFT_OUTPUT_DATA_BUFFER, MFT_OUTPUT_STREAM_INFO,
    MFT_REGISTER_TYPE_INFO, MF_E_TRANSFORM_NEED_MORE_INPUT, MF_MT_AVG_BITRATE, MF_MT_FRAME_RATE,
    MF_MT_FRAME_SIZE, MF_MT_INTERLACE_MODE, MF_MT_MAJOR_TYPE, MF_MT_PIXEL_ASPECT_RATIO,
    MF_MT_SUBTYPE,
};

/// Nominal frame rate negotiated with the MFT (numerator / denominator).
const FRAME_RATE_NUM: u32 = 30;
const FRAME_RATE_DEN: u32 = 1;

/// Duration of one frame in 100ns units, derived from the negotiated frame
/// rate so the per-sample duration always agrees with `MF_MT_FRAME_RATE`.
const FRAME_DURATION_100NS: i64 = 10_000_000 * FRAME_RATE_DEN as i64 / FRAME_RATE_NUM as i64;

/// Encoder-internal state, created once and reused per frame.
pub struct H264Encoder {
    /// The activated encoder transform. Wrapped in `ManuallyDrop` so `Drop`
    /// can release it explicitly BEFORE calling `MFShutdown` (struct fields
    /// are otherwise dropped only after `Drop::drop` has returned).
    transform: std::mem::ManuallyDrop<IMFTransform>,
    /// Configured frame dimensions; a capture-size change forces re-init.
    width: u32,
    height: u32,
    /// Quality (1-100) used to derive the bitrate; kept for re-init on resize.
    quality: u32,
    /// Frame sequence counter (mirrors RawEncoder).
    sequence: u32,
    /// Force the next frame to request a keyframe.
    force_keyframe: bool,
    /// Whether `MFT_MESSAGE_NOTIFY_BEGIN_STREAMING` was sent.
    streaming: bool,
    /// Reusable NV12 staging buffer (resized on dimension change).
    nv12: Vec<u8>,
    /// Input/output stream identifiers (most encoders use 0/0).
    input_stream_id: u32,
    output_stream_id: u32,
    /// True if MF was started by THIS encoder and must be shut down on drop.
    mf_started: bool,
    /// Instant the encoder was created; presentation timestamps are measured
    /// from here so they grow over the lifetime of the stream.
    stream_epoch: std::time::Instant,
    /// Last PTS handed to the MFT (100ns units); -1 before the first frame.
    /// Used to keep timestamps strictly increasing.
    last_pts_100ns: i64,
}

// IMFTransform is a COM interface; it is not auto-Send. We only ever touch the
// encoder from the single capture/encode thread (the session owns it behind a
// &mut), so it is safe to move between threads as long as it is not shared.
unsafe impl Send for H264Encoder {}

impl H264Encoder {
    /// Construct and fully initialize a hardware H.264 encoder.
    ///
    /// A default frame size is used and re-negotiated on the first frame if
    /// the real capture differs.
    ///
    /// # Errors
    /// Returns an error (so the factory can fall back to raw) if MF is
    /// unavailable, no hardware encoder exists, or media-type negotiation
    /// fails.
    pub fn new(quality: u32) -> Result<Self> {
        // 1920x1080 default; re-init on the first frame if the capture differs.
        Self::with_dimensions(quality, 1920, 1080)
    }

    /// Start Media Foundation, activate a hardware encoder and negotiate the
    /// media types for `width` x `height`.
    fn with_dimensions(quality: u32, width: u32, height: u32) -> Result<Self> {
        // SAFETY: plain Media Foundation FFI calls; every out-pointer handed
        // to them is a valid local, and MFStartup is balanced by MFShutdown on
        // every exit path (directly below, or through `Drop`).
        unsafe {
            // MFSTARTUP_LITE avoids the sockets/network stack we don't need.
            MFStartup(mf_version(), MFSTARTUP_LITE).context("MFStartup failed")?;
            let mf_started = true;

            let transform = match Self::activate_hw_encoder() {
                Ok(t) => t,
                Err(e) => {
                    // Balance the MFStartup we just did before bailing.
                    let _ = MFShutdown();
                    return Err(e);
                }
            };

            let mut enc = Self {
                transform: std::mem::ManuallyDrop::new(transform),
                width,
                height,
                quality,
                sequence: 0,
                force_keyframe: true,
                streaming: false,
                nv12: Vec::new(),
                input_stream_id: 0,
                output_stream_id: 0,
                mf_started,
                stream_epoch: std::time::Instant::now(),
                last_pts_100ns: -1,
            };
            // On error `enc`'s Drop releases the transform and shuts MF down.
            enc.configure_media_types()?;
            Ok(enc)
        }
    }

    /// Enumerate hardware H.264 encoder MFTs and activate the first one.
    ///
    /// # Safety
    /// Media Foundation must have been started (`MFStartup`) by the caller.
    unsafe fn activate_hw_encoder() -> Result<IMFTransform> {
        let output_type = MFT_REGISTER_TYPE_INFO {
            guidMajorType: MFMediaType_Video,
            guidSubtype: MFVideoFormat_H264,
        };
        let mut activate_ptr: *mut Option<IMFActivate> = std::ptr::null_mut();
        let mut count: u32 = 0;
        MFTEnumEx(
            MFT_CATEGORY_VIDEO_ENCODER,
            MFT_ENUM_FLAG_HARDWARE | MFT_ENUM_FLAG_SORTANDFILTER | MFT_ENUM_FLAG_TRANSCODE_ONLY,
            None,
            Some(&output_type as *const _),
            &mut activate_ptr,
            &mut count,
        )
        .context("MFTEnumEx (hardware H.264) failed")?;

        if count == 0 || activate_ptr.is_null() {
            if !activate_ptr.is_null() {
                windows::Win32::System::Com::CoTaskMemFree(Some(activate_ptr as *const _));
            }
            return Err(anyhow!("no hardware H.264 encoder MFT available"));
        }

        // MFTEnumEx returned a CoTaskMem-allocated array of `count` activation
        // objects; we own both the array and one reference on each entry.
        let slice = std::slice::from_raw_parts_mut(activate_ptr, count as usize);
        // Activate the first usable encoder; release every IMFActivate.
        let mut chosen: Option<IMFTransform> = None;
        for entry in slice.iter_mut() {
            if chosen.is_none() {
                if let Some(activate) = entry.as_ref() {
                    if let Ok(transform) = activate.ActivateObject::<IMFTransform>() {
                        chosen = Some(transform);
                    }
                }
            }
            // Release this IMFActivate reference.
            entry.take();
        }
        windows::Win32::System::Com::CoTaskMemFree(Some(activate_ptr as *const _));

        chosen.ok_or_else(|| anyhow!("failed to activate any hardware H.264 encoder MFT"))
    }

    /// Set the H.264 output type and NV12 input type, in the order MF requires
    /// (output type FIRST for encoders, then the matching input type).
    ///
    /// # Safety
    /// `self.transform` must be a live transform and MF must be started.
    unsafe fn configure_media_types(&mut self) -> Result<()> {
        // Discover the real stream identifiers (most encoders report 0/0).
        let mut input_ids = [0u32; 1];
        let mut output_ids = [0u32; 1];
        // GetStreamIDs may return E_NOTIMPL meaning "ids are 0..n-1"; ignore err.
        let _ = self.transform.GetStreamIDs(&mut input_ids, &mut output_ids);
        // If GetStreamIDs populated nonzero ids use them, else default 0/0.
        if input_ids[0] != 0 {
            self.input_stream_id = input_ids[0];
        }
        if output_ids[0] != 0 {
            self.output_stream_id = output_ids[0];
        }

        let bitrate = quality_to_bitrate(self.quality, self.width, self.height);

        // ---- OUTPUT (H.264) ----
        let out_type: IMFMediaType = MFCreateMediaType().context("MFCreateMediaType(out)")?;
        out_type.SetGUID(&MF_MT_MAJOR_TYPE, &MFMediaType_Video)?;
        out_type.SetGUID(&MF_MT_SUBTYPE, &MFVideoFormat_H264)?;
        out_type.SetUINT32(&MF_MT_AVG_BITRATE, bitrate)?;
        set_attr_size(&out_type, &MF_MT_FRAME_SIZE, self.width, self.height)?;
        set_attr_ratio(&out_type, &MF_MT_FRAME_RATE, FRAME_RATE_NUM, FRAME_RATE_DEN)?;
        set_attr_ratio(&out_type, &MF_MT_PIXEL_ASPECT_RATIO, 1, 1)?;
        out_type.SetUINT32(&MF_MT_INTERLACE_MODE, MFVideoInterlace_Progressive.0 as u32)?;
        self.transform
            .SetOutputType(self.output_stream_id, &out_type, 0)
            .context("SetOutputType(H264)")?;

        // ---- INPUT (NV12) ----
        let in_type: IMFMediaType = MFCreateMediaType().context("MFCreateMediaType(in)")?;
        in_type.SetGUID(&MF_MT_MAJOR_TYPE, &MFMediaType_Video)?;
        in_type.SetGUID(&MF_MT_SUBTYPE, &MFVideoFormat_NV12)?;
        set_attr_size(&in_type, &MF_MT_FRAME_SIZE, self.width, self.height)?;
        set_attr_ratio(&in_type, &MF_MT_FRAME_RATE, FRAME_RATE_NUM, FRAME_RATE_DEN)?;
        set_attr_ratio(&in_type, &MF_MT_PIXEL_ASPECT_RATIO, 1, 1)?;
        in_type.SetUINT32(&MF_MT_INTERLACE_MODE, MFVideoInterlace_Progressive.0 as u32)?;
        self.transform
            .SetInputType(self.input_stream_id, &in_type, 0)
            .context("SetInputType(NV12)")?;

        Ok(())
    }

    /// Begin streaming if not already started (idempotent).
    ///
    /// # Safety
    /// `self.transform` must be a live, configured transform.
    unsafe fn ensure_streaming(&mut self) -> Result<()> {
        if !self.streaming {
            self.transform
                .ProcessMessage(MFT_MESSAGE_NOTIFY_BEGIN_STREAMING, 0)
                .context("NOTIFY_BEGIN_STREAMING")?;
            self.transform
                .ProcessMessage(MFT_MESSAGE_NOTIFY_START_OF_STREAM, 0)
                .context("NOTIFY_START_OF_STREAM")?;
            self.streaming = true;
        }
        Ok(())
    }

    /// Re-initialize the encoder for a new frame size (capture resolution change).
    ///
    /// Any in-flight stream is flushed and ended (best effort) before the media
    /// types are re-negotiated; the next emitted frame is flagged a keyframe.
    ///
    /// # Safety
    /// `self.transform` must be a live transform and MF must be started.
    unsafe fn reinit_for_size(&mut self, width: u32, height: u32) -> Result<()> {
        if self.streaming {
            let _ = self.transform.ProcessMessage(MFT_MESSAGE_COMMAND_FLUSH, 0);
            let _ = self
                .transform
                .ProcessMessage(MFT_MESSAGE_NOTIFY_END_OF_STREAM, 0);
            let _ = self
                .transform
                .ProcessMessage(MFT_MESSAGE_NOTIFY_END_STREAMING, 0);
            self.streaming = false;
        }
        self.width = width;
        self.height = height;
        self.force_keyframe = true;
        self.configure_media_types()
    }

    /// Wrap an NV12 byte buffer into an `IMFSample` with the given timestamp.
    ///
    /// A free associated fn (does not borrow `self`) so the caller can pass
    /// `&self.nv12` without a clone while `self` is mutably borrowed elsewhere.
    ///
    /// # Errors
    /// Fails if the frame does not fit a 32-bit MF buffer length or any MF
    /// allocation/lock call fails.
    ///
    /// # Safety
    /// Media Foundation must be started.
    unsafe fn make_input_sample(nv12: &[u8], pts_100ns: i64) -> Result<IMFSample> {
        // MF buffer lengths are 32-bit; refuse rather than silently truncate.
        let byte_len =
            u32::try_from(nv12.len()).context("NV12 frame exceeds the 4 GiB MF buffer limit")?;

        let sample: IMFSample = MFCreateSample().context("MFCreateSample")?;
        let buffer = MFCreateMemoryBuffer(byte_len).context("MFCreateMemoryBuffer")?;

        // Lock, copy NV12 in, set current length, unlock.
        let mut data_ptr: *mut u8 = std::ptr::null_mut();
        let mut max_len: u32 = 0;
        buffer
            .Lock(&mut data_ptr, Some(&mut max_len), None)
            .context("IMFMediaBuffer::Lock")?;
        if (max_len as usize) < nv12.len() || data_ptr.is_null() {
            let _ = buffer.Unlock();
            return Err(anyhow!("MF buffer too small for NV12 frame"));
        }
        std::ptr::copy_nonoverlapping(nv12.as_ptr(), data_ptr, nv12.len());
        buffer.SetCurrentLength(byte_len)?;
        buffer.Unlock()?;

        sample.AddBuffer(&buffer)?;
        sample.SetSampleTime(pts_100ns)?;
        // One frame period at the negotiated frame rate, in 100ns units.
        sample.SetSampleDuration(FRAME_DURATION_100NS)?;
        Ok(sample)
    }

    /// Drain one available output sample, if any.
    ///
    /// Returns the encoded bytes and whether the MFT flagged it a keyframe
    /// (clean point). `Ok(None)` means the MFT needs more input before it can
    /// produce output this tick.
    ///
    /// # Safety
    /// `self.transform` must be a live, streaming transform.
    unsafe fn drain_one_output(&mut self) -> Result<Option<(Vec<u8>, bool)>> {
        let stream_info: MFT_OUTPUT_STREAM_INFO = self
            .transform
            .GetOutputStreamInfo(self.output_stream_id)
            .context("GetOutputStreamInfo")?;

        // If the MFT does not allocate its own output samples we must provide one.
        const MFT_OUTPUT_STREAM_PROVIDES_SAMPLES: u32 = 0x100;
        let mft_provides = stream_info.dwFlags & MFT_OUTPUT_STREAM_PROVIDES_SAMPLES != 0;

        let mut out_buffer = MFT_OUTPUT_DATA_BUFFER {
            dwStreamID: self.output_stream_id,
            ..Default::default()
        };

        if !mft_provides {
            // Some MFTs report cbSize == 0; a raw NV12 frame is a generous
            // upper bound for one compressed frame in that case.
            let alloc_size = if stream_info.cbSize == 0 {
                u32::try_from(color::nv12_size(self.width, self.height)).unwrap_or(u32::MAX)
            } else {
                stream_info.cbSize
            }
            .max(1);
            let sample: IMFSample = MFCreateSample().context("MFCreateSample(out)")?;
            let buffer = MFCreateMemoryBuffer(alloc_size).context("MFCreateMemoryBuffer(out)")?;
            sample.AddBuffer(&buffer)?;
            out_buffer.pSample = std::mem::ManuallyDrop::new(Some(sample));
        }

        let mut status: u32 = 0;
        let mut bufs = [out_buffer];
        let hr = self.transform.ProcessOutput(0, &mut bufs, &mut status);

        // Take ownership of whatever sample is now in the buffer (ours or MFT's).
        let produced = std::mem::ManuallyDrop::take(&mut bufs[0].pSample);
        // The MFT may also hand back an event collection that the caller must
        // release; take it so the reference is dropped instead of leaked.
        let _events = std::mem::ManuallyDrop::take(&mut bufs[0].pEvents);

        match hr {
            Ok(()) => {
                let Some(sample) = produced else {
                    return Ok(None);
                };
                let bytes = sample_to_vec(&sample)?;
                let keyframe = sample_is_keyframe(&sample);
                Ok(Some((bytes, keyframe)))
            }
            Err(e) if e.code() == MF_E_TRANSFORM_NEED_MORE_INPUT => Ok(None),
            Err(e) => Err(anyhow!("ProcessOutput failed: {e:#}")),
        }
    }
}

impl Encoder for H264Encoder {
    /// Encode one captured BGRA frame.
    ///
    /// Returns an `EncodedFrame` with `size == 0` and a default `VideoFrame`
    /// when the MFT produced no output for this tick.
    ///
    /// # Errors
    /// Fails on odd frame dimensions, on re-init failure after a resolution
    /// change, on colour-conversion failure, or on any MFT error.
    fn encode(&mut self, frame: &CapturedFrame) -> Result<EncodedFrame> {
        self.sequence = self.sequence.wrapping_add(1);

        // H.264 4:2:0 needs even dimensions. Reject odd captures up front so we
        // surface a clean error (the factory already fell back to raw if HW was
        // missing; a per-frame error here lets the session log + continue).
        if !frame.width.is_multiple_of(2) || !frame.height.is_multiple_of(2) {
            return Err(anyhow!(
                "H.264 requires even dimensions, got {}x{}",
                frame.width,
                frame.height
            ));
        }

        // SAFETY: `self.transform` is live for the whole lifetime of `self`
        // (only released in `Drop`) and MF was started in the constructor.
        unsafe {
            // Re-init on a resolution change.
            if frame.width != self.width || frame.height != self.height {
                self.reinit_for_size(frame.width, frame.height)
                    .context("H.264 re-init for new frame size")?;
            }
            self.ensure_streaming()?;

            // BGRA -> NV12 into the reusable staging buffer.
            let need = color::nv12_size(frame.width, frame.height);
            if self.nv12.len() != need {
                self.nv12.resize(need, 0);
            }
            color::bgra_to_nv12(&frame.data, frame.width, frame.height, &mut self.nv12)
                .map_err(|e| anyhow!("BGRA->NV12 failed: {e}"))?;

            // PTS in 100ns units: the frame's capture instant measured from the
            // encoder's epoch (time since encoder start minus the frame's age).
            // `frame.timestamp.elapsed()` on its own is only the frame's age,
            // which does not grow from frame to frame. The result is clamped so
            // timestamps handed to the MFT are strictly increasing.
            let since_epoch = self
                .stream_epoch
                .elapsed()
                .saturating_sub(frame.timestamp.elapsed());
            let raw_pts = i64::try_from(since_epoch.as_nanos() / 100).unwrap_or(i64::MAX);
            let pts_100ns = raw_pts.max(self.last_pts_100ns.saturating_add(1));
            self.last_pts_100ns = pts_100ns;

            let sample = Self::make_input_sample(&self.nv12, pts_100ns)?;

            // Feed the encoder. NEED_MORE_INPUT is normal back-pressure handling;
            // for the synchronous first cut we only push one frame per tick.
            match self
                .transform
                .ProcessInput(self.input_stream_id, &sample, 0)
            {
                Ok(()) => {}
                Err(e) if e.code() == MF_E_TRANSFORM_NEED_MORE_INPUT => {}
                Err(e) => return Err(anyhow!("ProcessInput failed: {e:#}")),
            }

            // Drain whatever output is ready.
            let Some((data, mft_keyframe)) = self.drain_one_output()? else {
                // No compressed output yet (encoder latency / GOP buffering).
                // Emit an empty frame so the session skips sending this tick.
                return Ok(EncodedFrame {
                    frame: VideoFrame::default(),
                    size: 0,
                    is_keyframe: false,
                });
            };

            let is_keyframe = mft_keyframe || self.force_keyframe;
            self.force_keyframe = false;

            let size = data.len();
            let encoded = ProtoEncodedFrame {
                data,
                keyframe: is_keyframe,
                pts: pts_100ns,
                dts: pts_100ns,
            };

            Ok(EncodedFrame {
                frame: VideoFrame {
                    timestamp: frame.timestamp.elapsed().as_millis() as i64,
                    display_id: frame.display_id as i32,
                    sequence: self.sequence as i32,
                    encoding: Some(video_frame::Encoding::H264(encoded)),
                },
                size,
                is_keyframe,
            })
        }
    }

    fn request_keyframe(&mut self) {
        // A precise force-IDR uses the MFT codec API
        // (CODECAPI_AVEncVideoForceKeyFrame); for the first cut we flag the next
        // emitted frame as a keyframe so the viewer treats it as a clean point.
        self.force_keyframe = true;
    }

    fn name(&self) -> &str {
        "h264-mediafoundation"
    }
}

impl Drop for H264Encoder {
    fn drop(&mut self) {
        // SAFETY: `self.transform` is still live here; it is released exactly
        // once below and never touched again afterwards.
        unsafe {
            if self.streaming {
                let _ = self
                    .transform
                    .ProcessMessage(MFT_MESSAGE_NOTIFY_END_OF_STREAM, 0);
                let _ = self
                    .transform
                    .ProcessMessage(MFT_MESSAGE_NOTIFY_END_STREAMING, 0);
            }
            // Release the transform BEFORE shutting the platform down: fields
            // are dropped only after this function returns, which would be
            // after MFShutdown if left to the compiler.
            std::mem::ManuallyDrop::drop(&mut self.transform);
            if self.mf_started {
                let _ = MFShutdown();
            }
        }
    }
}

/// MF version word expected by `MFStartup`
/// (MF_VERSION = (MF_SDK_VERSION << 16) | MF_API_VERSION).
fn mf_version() -> u32 {
    // MF_SDK_VERSION = 0x0002, MF_API_VERSION = 0x0070 -> 0x00020070.
    0x0002_0070
}

/// Derive a target average bitrate (bps) from the 1-100 quality knob and the
/// frame area. Tuned conservatively for desktop content (mostly static).
///
/// `quality` is clamped to 1..=100 and the result to 500 kbps..=50 Mbps.
fn quality_to_bitrate(quality: u32, width: u32, height: u32) -> u32 {
    let q = quality.clamp(1, 100) as u64;
    let pixels = (width as u64) * (height as u64);
    // bps = pixels * fps * bpp, where bpp scales linearly with quality from
    // 0.010 (q=1) to 0.100 (q=100) bits per pixel per frame. `bpp_milli` holds
    // that value in thousandths of a bit.
    let bpp_milli = 10 + (q * 90 / 100);
    let frames_per_sec = u64::from(FRAME_RATE_NUM) / u64::from(FRAME_RATE_DEN);
    let bps = pixels
        .saturating_mul(frames_per_sec)
        .saturating_mul(bpp_milli)
        / 1000;
    bps.clamp(500_000, 50_000_000) as u32
}

/// Pack (width, height) into the 64-bit MF_MT_FRAME_SIZE attribute
/// (width in the high 32 bits, height in the low 32 bits).
///
/// # Safety
/// `media_type` must be a live `IMFMediaType`.
#[cfg(windows)]
unsafe fn set_attr_size(
    media_type: &IMFMediaType,
    key: &windows::core::GUID,
    width: u32,
    height: u32,
) -> Result<()> {
    let packed = ((width as u64) << 32) | (height as u64);
    media_type.SetUINT64(key, packed)?;
    Ok(())
}

/// Pack (numerator, denominator) into a 64-bit ratio MF attribute
/// (numerator in the high 32 bits, denominator in the low 32 bits).
///
/// # Safety
/// `media_type` must be a live `IMFMediaType`.
#[cfg(windows)]
unsafe fn set_attr_ratio(
    media_type: &IMFMediaType,
    key: &windows::core::GUID,
    num: u32,
    den: u32,
) -> Result<()> {
    let packed = ((num as u64) << 32) | (den as u64);
    media_type.SetUINT64(key, packed)?;
    Ok(())
}

/// Copy all bytes out of an `IMFSample` (single contiguous buffer) into a Vec.
///
/// Returns an empty Vec when the buffer is empty or its data pointer is null.
///
/// # Safety
/// `sample` must be a live `IMFSample`.
#[cfg(windows)]
unsafe fn sample_to_vec(sample: &IMFSample) -> Result<Vec<u8>> {
    let buffer = sample
        .ConvertToContiguousBuffer()
        .context("ConvertToContiguousBuffer")?;
    let mut ptr: *mut u8 = std::ptr::null_mut();
    let mut len: u32 = 0;
    buffer
        .Lock(&mut ptr, None, Some(&mut len))
        .context("output buffer Lock")?;
    let out = if ptr.is_null() || len == 0 {
        Vec::new()
    } else {
        // Copy while the buffer is locked; `len` is the current data length.
        std::slice::from_raw_parts(ptr, len as usize).to_vec()
    };
    let _ = buffer.Unlock();
    Ok(out)
}

/// Read the "clean point" (keyframe) flag off a sample, if present.
///
/// A missing or unreadable attribute is treated as "not a keyframe".
///
/// # Safety
/// `sample` must be a live `IMFSample`.
#[cfg(windows)]
unsafe fn sample_is_keyframe(sample: &IMFSample) -> bool {
    use windows::Win32::Media::MediaFoundation::MFSampleExtension_CleanPoint;
    sample
        .GetUINT32(&MFSampleExtension_CleanPoint)
        .map(|v| v != 0)
        .unwrap_or(false)
}