pytorch · NicolasHug · Mar 14, 2025 · Mar 13, 2025 · Mar 13, 2025 · NicolasHug
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -170,6 +170,9 @@ void VideoDecoder::initializeDecoder() {
       }
       containerMetadata_.numVideoStreams++;
     } else if (avStream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
+      AVSampleFormat format =
+          static_cast<AVSampleFormat>(avStream->codecpar->format);
+      streamMetadata.sampleFormat = av_get_sample_fmt_name(format);
       containerMetadata_.numAudioStreams++;
     }
 

diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -81,6 +81,7 @@ class VideoDecoder {
     // Audio-only fields
     std::optional<int64_t> sampleRate;
     std::optional<int64_t> numChannels;
+    std::optional<std::string> sampleFormat;
   };
 
   struct ContainerMetadata {

diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
@@ -495,12 +495,15 @@ std::string get_stream_json_metadata(
   if (streamMetadata.numChannels.has_value()) {
     map["numChannels"] = std::to_string(*streamMetadata.numChannels);
   }
+  if (streamMetadata.sampleFormat.has_value()) {
+    map["sampleFormat"] = quoteValue(streamMetadata.sampleFormat.value());
+  }
   if (streamMetadata.mediaType == AVMEDIA_TYPE_VIDEO) {
-    map["mediaType"] = "\"video\"";
+    map["mediaType"] = quoteValue("video");
   } else if (streamMetadata.mediaType == AVMEDIA_TYPE_AUDIO) {
-    map["mediaType"] = "\"audio\"";
+    map["mediaType"] = quoteValue("audio");
   } else {
-    map["mediaType"] = "\"other\"";
+    map["mediaType"] = quoteValue("other");
   }
   return mapToJson(map);
 }

diff --git a/src/torchcodec/decoders/_core/_metadata.py b/src/torchcodec/decoders/_core/_metadata.py
@@ -161,9 +161,9 @@ def __repr__(self):
 class AudioStreamMetadata(StreamMetadata):
     """Metadata of a single audio stream."""
 
-    # TODO-AUDIO Add sample format field
     sample_rate: Optional[int]
     num_channels: Optional[int]
+    sample_format: Optional[str]
 
     def __repr__(self):
         return super().__repr__()
@@ -240,6 +240,7 @@ def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
                 AudioStreamMetadata(
                     sample_rate=stream_dict.get("sampleRate"),
                     num_channels=stream_dict.get("numChannels"),
+                    sample_format=stream_dict.get("sampleFormat"),
                     **common_meta,
                 )
             )

diff --git a/test/decoders/test_decoders.py b/test/decoders/test_decoders.py
@@ -25,6 +25,7 @@
     NASA_AUDIO,
     NASA_AUDIO_MP3,
     NASA_VIDEO,
+    SINE_MONO_S32,
 )
 
 
@@ -940,7 +941,7 @@ def get_some_frames(decoder):
 
 
 class TestAudioDecoder:
-    @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
+    @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3, SINE_MONO_S32))
     def test_metadata(self, asset):
         decoder = AudioDecoder(asset.path)
         assert isinstance(decoder.metadata, AudioStreamMetadata)
@@ -955,6 +956,7 @@ def test_metadata(self, asset):
         )
         assert decoder.metadata.sample_rate == asset.sample_rate
         assert decoder.metadata.num_channels == asset.num_channels
+        assert decoder.metadata.sample_format == asset.sample_format
 
     @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
     def test_error(self, asset):

diff --git a/test/decoders/test_metadata.py b/test/decoders/test_metadata.py
@@ -90,6 +90,7 @@ def test_get_metadata(metadata_getter):
     )
     assert best_audio_stream_metadata.bit_rate == 128837
     assert best_audio_stream_metadata.codec == "aac"
+    assert best_audio_stream_metadata.sample_format == "fltp"
 
 
 @pytest.mark.parametrize(
@@ -109,6 +110,7 @@ def test_get_metadata_audio_file(metadata_getter):
     )
     assert best_audio_stream_metadata.bit_rate == 64000
     assert best_audio_stream_metadata.codec == "mp3"
+    assert best_audio_stream_metadata.sample_format == "fltp"
 
 
 @pytest.mark.parametrize(

diff --git a/test/resources/sine_mono_s32.wav b/test/resources/sine_mono_s32.wav
diff --git a/test/resources/sine_mono_s32.wav.stream0.all_frames_info.json b/test/resources/sine_mono_s32.wav.stream0.all_frames_info.json
@@ -0,0 +1,254 @@
+[
+  {
+    "duration_time": "0.064000",
+    "pts_time": "0.000000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "0.064000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "0.128000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "0.192000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "0.256000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "0.320000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "0.384000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "0.448000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "0.512000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "0.576000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "0.640000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "0.704000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "0.768000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "0.832000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "0.896000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "0.960000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "1.024000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "1.088000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "1.152000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "1.216000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "1.280000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "1.344000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "1.408000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "1.472000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "1.536000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "1.600000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "1.664000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "1.728000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "1.792000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "1.856000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "1.920000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "1.984000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "2.048000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "2.112000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "2.176000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "2.240000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "2.304000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "2.368000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "2.432000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "2.496000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "2.560000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "2.624000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "2.688000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "2.752000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "2.816000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "2.880000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "2.944000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "3.008000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "3.072000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "3.136000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "3.200000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "3.264000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "3.328000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "3.392000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "3.456000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "3.520000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "3.584000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "3.648000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "3.712000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "3.776000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "3.840000"
+  },
+  {
+    "duration_time": "0.064000",
+    "pts_time": "3.904000"
+  },
+  {
+    "duration_time": "0.032000",
+    "pts_time": "3.968000"
+  }
+]