Nit

NicolasHug · NicolasHug · commit af4e88afa455 · 2025-03-19T15:03:05.000Z
diff --git a/src/torchcodec/decoders/_audio_decoder.py b/src/torchcodec/decoders/_audio_decoder.py
@@ -42,9 +42,8 @@ def __init__(
             decoder=self._decoder, stream_index=stream_index, media_type="audio"
         )
         assert isinstance(self.metadata, core.AudioStreamMetadata)  # mypy
-        self._source_sample_rate = self.metadata.sample_rate
         self._desired_sample_rate = (
-            sample_rate if sample_rate is not None else self._source_sample_rate
+            sample_rate if sample_rate is not None else self.metadata.sample_rate
         )
 
     def get_samples_played_in_range(
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -1431,9 +1431,9 @@ UniqueAVFrame VideoDecoder::convertAudioAVFrameSampleFormatAndSampleRate(
   if (sourceSampleRate != desiredSampleRate) {
     // Note that this is an upper bound on the number of output samples.
     // `swr_convert()` will likely not fill convertedAVFrame with that many
-    // samples, it will buffer the last few ones because those require future
-    // samples. That's also why we reset nb_samples after the call to
-    // `swr_convert()`.
+    // samples if sample rate conversion is needed. It will buffer the last few
+    // ones because those require future samples. That's also why we reset
+    // nb_samples after the call to `swr_convert()`.
     convertedAVFrame->nb_samples = av_rescale_rnd(
         swr_get_delay(streamInfo.swrContext.get(), sourceSampleRate) +
             srcAVFrame->nb_samples,
@@ -1464,7 +1464,6 @@ UniqueAVFrame VideoDecoder::convertAudioAVFrameSampleFormatAndSampleRate(
 
   // See comment above about nb_samples
   convertedAVFrame->nb_samples = numConvertedSamples;
-  // TODO need to flush properly to retrieve the last few samples.
 
   return convertedAVFrame;
 }
diff --git a/test/decoders/test_decoders.py b/test/decoders/test_decoders.py
@@ -1104,8 +1104,11 @@ def test_format_conversion(self):
     def test_sample_rate_conversion(self, start_seconds, stop_seconds):
         # When start_seconds is not exactly 0, we have to increase the tolerance
         # a bit. This is because sample_rate conversion relies on a sliding
-        # window of samples: if we start a stream in the middle, the first few
-        # samples aren't able to take advantage of the preceeding samples.
+        # window of samples: if we start decoding a stream in the middle, the
+        # first few samples we're decoding aren't able to take advantage of the
+        # preceding samples for sample-rate conversion. This leads to a
+        # slightly different sample-rate conversion than we would otherwise get,
+        # had we started the stream from the beginning.
         atol = 1e-4 if start_seconds == 0 else 1e-2
         rtol = 1e-6
 

Original file line number	Diff line number	Diff line change
`@@ -42,9 +42,8 @@ def __init__(`
`42`	`42`	`decoder=self._decoder, stream_index=stream_index, media_type="audio"`
`43`	`43`	`)`
`44`	`44`	`assert isinstance(self.metadata, core.AudioStreamMetadata) # mypy`
`45`		`- self._source_sample_rate = self.metadata.sample_rate`
`46`	`45`	`self._desired_sample_rate = (`
`47`		`- sample_rate if sample_rate is not None else self._source_sample_rate`
	`46`	`+ sample_rate if sample_rate is not None else self.metadata.sample_rate`
`48`	`47`	`)`
`49`	`48`
`50`	`49`	`def get_samples_played_in_range(`