Refactor video decoding (#7625)

### What * Closes #7583 This refactors the video decoder to unify the native and web decoder logic. It changes how error handling is done quite a bit, so this will require some testing. This PR also hides the spinner if the current frame is less than 400 ms out of date. This means a small hiccup during decoding will not cause the spinner to show up, but jumping a big step in the video stream (moving the time cursor by a large step) will make the spinner show up immediately, making Rerun feel responsive. ### Checklist * [x] I have read and agree to [Contributor Guide](https://github.com/rerun-io/rerun/blob/main/CONTRIBUTING.md) and the [Code of Conduct](https://github.com/rerun-io/rerun/blob/main/CODE_OF_CONDUCT.md) * [x] I've included a screenshot or gif (if applicable) * [x] I have tested the web demo (if applicable): * Using examples from latest `main` build: [rerun.io/viewer](https://rerun.io/viewer/pr/7625?manifest_url=https://app.rerun.io/version/main/examples_manifest.json) * Using full set of examples from `nightly` build: [rerun.io/viewer](https://rerun.io/viewer/pr/7625?manifest_url=https://app.rerun.io/version/nightly/examples_manifest.json) * [x] The PR title and labels are set such as to maximize their usefulness for the next release's CHANGELOG * [x] If applicable, add a new check to the [release checklist](https://github.com/rerun-io/rerun/blob/main/tests/python/release_checklist)! * [x] I have noted any breaking changes to the log API in `CHANGELOG.md` and the migration guide - [PR Build Summary](https://build.rerun.io/pr/7625) - [Recent benchmark results](https://build.rerun.io/graphs/crates.html) - [Wasm size tracking](https://build.rerun.io/graphs/sizes.html) To run all checks from `main`, comment on the PR with `@rerun-bot full-check`.
rerun-io · Oct 8, 2024 · 187b673 · 187b673
1 parent 87e76f8
commit 187b673
Show file tree

Hide file tree

Showing 13 changed files with 642 additions and 814 deletions.
diff --git a/crates/store/re_video/examples/frames.rs b/crates/store/re_video/examples/frames.rs
@@ -13,8 +13,6 @@ use std::{
 use indicatif::ProgressBar;
 use parking_lot::Mutex;
 
-use re_video::demux::mp4::load_mp4;
-
 fn main() {
     // frames <video.mp4>
     let args: Vec<_> = std::env::args().collect();
@@ -27,11 +25,11 @@ fn main() {
     println!("Decoding {video_path}");
 
     let video = std::fs::read(video_path).expect("failed to read video");
-    let video = load_mp4(&video).expect("failed to load video");
+    let video = re_video::VideoData::load_mp4(&video).expect("failed to load video");
 
     println!(
         "{} {}x{}",
-        video.segments.len(),
+        video.gops.len(),
         video.config.coded_width,
         video.config.coded_height
     );

diff --git a/crates/store/re_video/src/demux/mod.rs b/crates/store/re_video/src/demux/mod.rs
@@ -1,6 +1,6 @@
 //! Video demultiplexing.
 //!
-//! Parses a video file into a raw [`VideoData`] struct, which contains basic metadata and a list of [`Segment`]s.
+//! Parses a video file into a raw [`VideoData`] struct, which contains basic metadata and a list of [`GroupOfPictures`]s.
 //!
 //! The entry point is [`VideoData::load_from_bytes`]
 //! which produces an instance of [`VideoData`] from any supported video container.
@@ -26,9 +26,9 @@ pub struct VideoData {
     /// Duration of the video, in time units.
     pub duration: Time,
 
-    /// We split video into segments, each beginning with a key frame,
+    /// We split video into GOPs, each beginning with a key frame,
     /// followed by any number of delta frames.
-    pub segments: Vec<Segment>,
+    pub gops: Vec<GroupOfPictures>,
 
     /// Samples contain the byte offsets into `data` for each frame.
     ///
@@ -54,7 +54,8 @@ impl VideoData {
     /// at the very least the should be a way to extract only metadata.
     pub fn load_from_bytes(data: &[u8], media_type: &str) -> Result<Self, VideoLoadError> {
         match media_type {
-            "video/mp4" => mp4::load_mp4(data),
+            "video/mp4" => Self::load_mp4(data),
+
             media_type => {
                 if media_type.starts_with("video/") {
                     Err(VideoLoadError::UnsupportedMimeType {
@@ -111,7 +112,7 @@ impl VideoData {
     pub fn frame_timestamps_ns(&self) -> impl Iterator<Item = i64> + '_ {
         // Segments are guaranteed to be sorted among each other, but within a segment,
         // presentation timestamps may not be sorted since this is sorted by decode timestamps.
-        self.segments.iter().flat_map(|seg| {
+        self.gops.iter().flat_map(|seg| {
             self.samples[seg.range()]
                 .iter()
                 .map(|sample| sample.composition_timestamp.into_nanos(self.timescale))
@@ -138,18 +139,20 @@ impl VideoData {
     }
 }
 
-/// A segment of a video.
+/// A Group of Pictures (GOP) always starts with an I-frame, followed by delta-frames.
+///
+/// See <https://en.wikipedia.org/wiki/Group_of_pictures> for more.
 #[derive(Debug, Clone)]
-pub struct Segment {
-    /// Decode timestamp of the first sample in this segment, in time units.
+pub struct GroupOfPictures {
+    /// Decode timestamp of the first sample in this GOP, in time units.
     pub start: Time,
 
-    /// Range of samples contained in this segment.
+    /// Range of samples contained in this GOP.
     pub sample_range: Range<u32>,
 }
 
-impl Segment {
-    /// The segment's `sample_range` mapped to `usize` for slicing.
+impl GroupOfPictures {
+    /// The GOP's `sample_range` mapped to `usize` for slicing.
     pub fn range(&self) -> Range<usize> {
         Range {
             start: self.sample_range.start as usize,
@@ -163,11 +166,15 @@ impl Segment {
 pub struct Sample {
     /// Time at which this sample appears in the decoded bitstream, in time units.
     ///
+    /// Samples should be decoded in this order.
+    ///
     /// `decode_timestamp <= composition_timestamp`
     pub decode_timestamp: Time,
 
     /// Time at which this sample appears in the frame stream, in time units.
     ///
+    /// The frame should be shown at this time.
+    ///
     /// `decode_timestamp <= composition_timestamp`
     pub composition_timestamp: Time,
 
@@ -245,7 +252,7 @@ impl std::fmt::Debug for VideoData {
             .field("config", &self.config)
             .field("timescale", &self.timescale)
             .field("duration", &self.duration)
-            .field("segments", &self.segments)
+            .field("gops", &self.gops)
             .field(
                 "samples",
                 &self.samples.iter().enumerate().collect::<Vec<_>>(),

diff --git a/crates/store/re_video/src/demux/mp4.rs b/crates/store/re_video/src/demux/mp4.rs
@@ -1,89 +1,91 @@
 #![allow(clippy::map_err_ignore)]
 
-use super::{Config, Sample, Segment, VideoData, VideoLoadError};
+use super::{Config, GroupOfPictures, Sample, VideoData, VideoLoadError};
 
 use crate::{Time, Timescale};
 
-pub fn load_mp4(bytes: &[u8]) -> Result<VideoData, VideoLoadError> {
-    let mp4 = re_mp4::Mp4::read_bytes(bytes)?;
-
-    let mp4_tracks = mp4.tracks().iter().map(|(k, t)| (*k, t.kind)).collect();
-
-    let track = mp4
-        .tracks()
-        .values()
-        .find(|t| t.kind == Some(re_mp4::TrackKind::Video))
-        .ok_or_else(|| VideoLoadError::NoVideoTrack)?;
-
-    let codec = track
-        .codec_string(&mp4)
-        .ok_or_else(|| VideoLoadError::UnsupportedCodec(unknown_codec_fourcc(&mp4, track)))?;
-    let description = track
-        .raw_codec_config(&mp4)
-        .ok_or_else(|| VideoLoadError::UnsupportedCodec(unknown_codec_fourcc(&mp4, track)))?;
-
-    let coded_height = track.height;
-    let coded_width = track.width;
-
-    let config = Config {
-        codec,
-        description,
-        coded_height,
-        coded_width,
-    };
-
-    let timescale = Timescale::new(track.timescale);
-    let duration = Time::new(track.duration as i64);
-    let mut samples = Vec::<Sample>::new();
-    let mut segments = Vec::<Segment>::new();
-    let mut segment_sample_start_index = 0;
-    let data = track.data.clone();
-
-    for sample in &track.samples {
-        if sample.is_sync && !samples.is_empty() {
-            let start = samples[segment_sample_start_index].decode_timestamp;
-            let sample_range = segment_sample_start_index as u32..samples.len() as u32;
-            segments.push(Segment {
+impl VideoData {
+    pub fn load_mp4(bytes: &[u8]) -> Result<Self, VideoLoadError> {
+        let mp4 = re_mp4::Mp4::read_bytes(bytes)?;
+
+        let mp4_tracks = mp4.tracks().iter().map(|(k, t)| (*k, t.kind)).collect();
+
+        let track = mp4
+            .tracks()
+            .values()
+            .find(|t| t.kind == Some(re_mp4::TrackKind::Video))
+            .ok_or_else(|| VideoLoadError::NoVideoTrack)?;
+
+        let codec = track
+            .codec_string(&mp4)
+            .ok_or_else(|| VideoLoadError::UnsupportedCodec(unknown_codec_fourcc(&mp4, track)))?;
+        let description = track
+            .raw_codec_config(&mp4)
+            .ok_or_else(|| VideoLoadError::UnsupportedCodec(unknown_codec_fourcc(&mp4, track)))?;
+
+        let coded_height = track.height;
+        let coded_width = track.width;
+
+        let config = Config {
+            codec,
+            description,
+            coded_height,
+            coded_width,
+        };
+
+        let timescale = Timescale::new(track.timescale);
+        let duration = Time::new(track.duration as i64);
+        let mut samples = Vec::<Sample>::new();
+        let mut gops = Vec::<GroupOfPictures>::new();
+        let mut gop_sample_start_index = 0;
+        let data = track.data.clone();
+
+        for sample in &track.samples {
+            if sample.is_sync && !samples.is_empty() {
+                let start = samples[gop_sample_start_index].decode_timestamp;
+                let sample_range = gop_sample_start_index as u32..samples.len() as u32;
+                gops.push(GroupOfPictures {
+                    start,
+                    sample_range,
+                });
+                gop_sample_start_index = samples.len();
+            }
+
+            let decode_timestamp = Time::new(sample.decode_timestamp as i64);
+            let composition_timestamp = Time::new(sample.composition_timestamp as i64);
+            let duration = Time::new(sample.duration as i64);
+
+            let byte_offset = sample.offset as u32;
+            let byte_length = sample.size as u32;
+
+            samples.push(Sample {
+                decode_timestamp,
+                composition_timestamp,
+                duration,
+                byte_offset,
+                byte_length,
+            });
+        }
+
+        if !samples.is_empty() {
+            let start = samples[gop_sample_start_index].decode_timestamp;
+            let sample_range = gop_sample_start_index as u32..samples.len() as u32;
+            gops.push(GroupOfPictures {
                 start,
                 sample_range,
             });
-            segment_sample_start_index = samples.len();
         }
 
-        let decode_timestamp = Time::new(sample.decode_timestamp as i64);
-        let composition_timestamp = Time::new(sample.composition_timestamp as i64);
-        let duration = Time::new(sample.duration as i64);
-
-        let byte_offset = sample.offset as u32;
-        let byte_length = sample.size as u32;
-
-        samples.push(Sample {
-            decode_timestamp,
-            composition_timestamp,
+        Ok(Self {
+            config,
+            timescale,
             duration,
-            byte_offset,
-            byte_length,
-        });
-    }
-
-    if !samples.is_empty() {
-        let start = samples[segment_sample_start_index].decode_timestamp;
-        let sample_range = segment_sample_start_index as u32..samples.len() as u32;
-        segments.push(Segment {
-            start,
-            sample_range,
-        });
+            gops,
+            samples,
+            data,
+            mp4_tracks,
+        })
     }
-
-    Ok(VideoData {
-        config,
-        timescale,
-        duration,
-        segments,
-        samples,
-        data,
-        mp4_tracks,
-    })
 }
 
 fn unknown_codec_fourcc(mp4: &re_mp4::Mp4, track: &re_mp4::Track) -> re_mp4::FourCC {

diff --git a/crates/store/re_video/src/lib.rs b/crates/store/re_video/src/lib.rs
@@ -88,6 +88,15 @@ impl Time {
     }
 }
 
+impl std::ops::Add for Time {
+    type Output = Self;
+
+    #[inline]
+    fn add(self, rhs: Self) -> Self::Output {
+        Self(self.0.saturating_add(rhs.0))
+    }
+}
+
 impl std::ops::Sub for Time {
     type Output = Self;
 

diff --git a/crates/store/re_video/src/mp4.rs b/crates/store/re_video/src/mp4.rs