Skip to content

Commit db740a6

Browse files
Committed: Merge branch 'main' of github.com:pytorch/torchcodec into sample_rate
Merge commit db740a6 (2 parents: ef93be4 + 23c73ea)

File tree

8 files changed

+62
-84
lines changed

8 files changed

+62
-84
lines changed

src/torchcodec/decoders/_core/CPUOnlyDevice.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ namespace facebook::torchcodec {
1717
void convertAVFrameToFrameOutputOnCuda(
1818
const torch::Device& device,
1919
[[maybe_unused]] const VideoDecoder::VideoStreamOptions& videoStreamOptions,
20-
[[maybe_unused]] VideoDecoder::AVFrameStream& avFrameStream,
20+
[[maybe_unused]] UniqueAVFrame& avFrame,
2121
[[maybe_unused]] VideoDecoder::FrameOutput& frameOutput,
2222
[[maybe_unused]] std::optional<torch::Tensor> preAllocatedOutputTensor) {
2323
throwUnsupportedDeviceError(device);

src/torchcodec/decoders/_core/CudaDevice.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -190,17 +190,15 @@ void initializeContextOnCuda(
190190
void convertAVFrameToFrameOutputOnCuda(
191191
const torch::Device& device,
192192
const VideoDecoder::VideoStreamOptions& videoStreamOptions,
193-
VideoDecoder::AVFrameStream& avFrameStream,
193+
UniqueAVFrame& avFrame,
194194
VideoDecoder::FrameOutput& frameOutput,
195195
std::optional<torch::Tensor> preAllocatedOutputTensor) {
196-
AVFrame* avFrame = avFrameStream.avFrame.get();
197-
198196
TORCH_CHECK(
199197
avFrame->format == AV_PIX_FMT_CUDA,
200198
"Expected format to be AV_PIX_FMT_CUDA, got " +
201199
std::string(av_get_pix_fmt_name((AVPixelFormat)avFrame->format)));
202200
auto frameDims =
203-
getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, *avFrame);
201+
getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame);
204202
int height = frameDims.height;
205203
int width = frameDims.width;
206204
torch::Tensor& dst = frameOutput.data;

src/torchcodec/decoders/_core/DeviceInterface.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ void initializeContextOnCuda(
3232
void convertAVFrameToFrameOutputOnCuda(
3333
const torch::Device& device,
3434
const VideoDecoder::VideoStreamOptions& videoStreamOptions,
35-
VideoDecoder::AVFrameStream& avFrameStream,
35+
UniqueAVFrame& avFrame,
3636
VideoDecoder::FrameOutput& frameOutput,
3737
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
3838

src/torchcodec/decoders/_core/FFMPEGCommon.cpp

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -48,15 +48,11 @@ std::string getFFMPEGErrorStringFromErrorCode(int errorCode) {
4848
return std::string(errorBuffer);
4949
}
5050

51-
int64_t getDuration(const UniqueAVFrame& frame) {
52-
return getDuration(frame.get());
53-
}
54-
55-
int64_t getDuration(const AVFrame* frame) {
51+
int64_t getDuration(const UniqueAVFrame& avFrame) {
5652
#if LIBAVUTIL_VERSION_MAJOR < 58
57-
return frame->pkt_duration;
53+
return avFrame->pkt_duration;
5854
#else
59-
return frame->duration;
55+
return avFrame->duration;
6056
#endif
6157
}
6258

src/torchcodec/decoders/_core/FFMPEGCommon.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,6 @@ std::string getFFMPEGErrorStringFromErrorCode(int errorCode);
140140
// struct member representing duration has changed across the versions we
141141
// support.
142142
int64_t getDuration(const UniqueAVFrame& frame);
143-
int64_t getDuration(const AVFrame* frame);
144143

145144
int getNumChannels(const UniqueAVFrame& avFrame);
146145
int getNumChannels(const UniqueAVCodecContext& avCodecContext);

src/torchcodec/decoders/_core/VideoDecoder.cpp

Lines changed: 36 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -587,9 +587,9 @@ VideoDecoder::FrameOutput VideoDecoder::getNextFrame() {
587587
VideoDecoder::FrameOutput VideoDecoder::getNextFrameInternal(
588588
std::optional<torch::Tensor> preAllocatedOutputTensor) {
589589
validateActiveStream();
590-
AVFrameStream avFrameStream = decodeAVFrame(
591-
[this](AVFrame* avFrame) { return avFrame->pts >= cursor_; });
592-
return convertAVFrameToFrameOutput(avFrameStream, preAllocatedOutputTensor);
590+
UniqueAVFrame avFrame = decodeAVFrame(
591+
[this](const UniqueAVFrame& avFrame) { return avFrame->pts >= cursor_; });
592+
return convertAVFrameToFrameOutput(avFrame, preAllocatedOutputTensor);
593593
}
594594

595595
VideoDecoder::FrameOutput VideoDecoder::getFrameAtIndex(int64_t frameIndex) {
@@ -719,8 +719,8 @@ VideoDecoder::FrameOutput VideoDecoder::getFramePlayedAt(double seconds) {
719719
}
720720

721721
setCursorPtsInSeconds(seconds);
722-
AVFrameStream avFrameStream =
723-
decodeAVFrame([seconds, this](AVFrame* avFrame) {
722+
UniqueAVFrame avFrame =
723+
decodeAVFrame([seconds, this](const UniqueAVFrame& avFrame) {
724724
StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
725725
double frameStartTime = ptsToSeconds(avFrame->pts, streamInfo.timeBase);
726726
double frameEndTime = ptsToSeconds(
@@ -739,7 +739,7 @@ VideoDecoder::FrameOutput VideoDecoder::getFramePlayedAt(double seconds) {
739739
});
740740

741741
// Convert the frame to tensor.
742-
FrameOutput frameOutput = convertAVFrameToFrameOutput(avFrameStream);
742+
FrameOutput frameOutput = convertAVFrameToFrameOutput(avFrame);
743743
frameOutput.data = maybePermuteHWC2CHW(frameOutput.data);
744744
return frameOutput;
745745
}
@@ -895,14 +895,11 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
895895
auto finished = false;
896896
while (!finished) {
897897
try {
898-
AVFrameStream avFrameStream = decodeAVFrame([startPts](AVFrame* avFrame) {
899-
return startPts < avFrame->pts + getDuration(avFrame);
900-
});
901-
// TODO: it's not great that we are getting a FrameOutput, which is
902-
// intended for videos. We should consider bypassing
903-
// convertAVFrameToFrameOutput and directly call
904-
// convertAudioAVFrameToFrameOutputOnCPU.
905-
auto frameOutput = convertAVFrameToFrameOutput(avFrameStream);
898+
UniqueAVFrame avFrame =
899+
decodeAVFrame([startPts](const UniqueAVFrame& avFrame) {
900+
return startPts < avFrame->pts + getDuration(avFrame);
901+
});
902+
auto frameOutput = convertAVFrameToFrameOutput(avFrame);
906903
firstFramePtsSeconds =
907904
std::min(firstFramePtsSeconds, frameOutput.ptsSeconds);
908905
frames.push_back(frameOutput.data);
@@ -1039,8 +1036,8 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
10391036
// LOW-LEVEL DECODING
10401037
// --------------------------------------------------------------------------
10411038

1042-
VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
1043-
std::function<bool(AVFrame*)> filterFunction) {
1039+
UniqueAVFrame VideoDecoder::decodeAVFrame(
1040+
std::function<bool(const UniqueAVFrame&)> filterFunction) {
10441041
validateActiveStream();
10451042

10461043
resetDecodeStats();
@@ -1068,7 +1065,7 @@ VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
10681065

10691066
decodeStats_.numFramesReceivedByDecoder++;
10701067
// Is this the kind of frame we're looking for?
1071-
if (status == AVSUCCESS && filterFunction(avFrame.get())) {
1068+
if (status == AVSUCCESS && filterFunction(avFrame)) {
10721069
// Yes, this is the frame we'll return; break out of the decoding loop.
10731070
break;
10741071
} else if (status == AVSUCCESS) {
@@ -1154,37 +1151,35 @@ VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
11541151
streamInfo.lastDecodedAvFramePts = avFrame->pts;
11551152
streamInfo.lastDecodedAvFrameDuration = getDuration(avFrame);
11561153

1157-
return AVFrameStream(std::move(avFrame), activeStreamIndex_);
1154+
return avFrame;
11581155
}
11591156

11601157
// --------------------------------------------------------------------------
11611158
// AVFRAME <-> FRAME OUTPUT CONVERSION
11621159
// --------------------------------------------------------------------------
11631160

11641161
VideoDecoder::FrameOutput VideoDecoder::convertAVFrameToFrameOutput(
1165-
VideoDecoder::AVFrameStream& avFrameStream,
1162+
UniqueAVFrame& avFrame,
11661163
std::optional<torch::Tensor> preAllocatedOutputTensor) {
11671164
// Convert the frame to tensor.
11681165
FrameOutput frameOutput;
1169-
int streamIndex = avFrameStream.streamIndex;
1170-
AVFrame* avFrame = avFrameStream.avFrame.get();
1171-
frameOutput.streamIndex = streamIndex;
1172-
auto& streamInfo = streamInfos_[streamIndex];
1166+
auto& streamInfo = streamInfos_[activeStreamIndex_];
11731167
frameOutput.ptsSeconds = ptsToSeconds(
1174-
avFrame->pts, formatContext_->streams[streamIndex]->time_base);
1168+
avFrame->pts, formatContext_->streams[activeStreamIndex_]->time_base);
11751169
frameOutput.durationSeconds = ptsToSeconds(
1176-
getDuration(avFrame), formatContext_->streams[streamIndex]->time_base);
1170+
getDuration(avFrame),
1171+
formatContext_->streams[activeStreamIndex_]->time_base);
11771172
if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
11781173
convertAudioAVFrameToFrameOutputOnCPU(
1179-
avFrameStream, frameOutput, preAllocatedOutputTensor);
1174+
avFrame, frameOutput, preAllocatedOutputTensor);
11801175
} else if (streamInfo.videoStreamOptions.device.type() == torch::kCPU) {
11811176
convertAVFrameToFrameOutputOnCPU(
1182-
avFrameStream, frameOutput, preAllocatedOutputTensor);
1177+
avFrame, frameOutput, preAllocatedOutputTensor);
11831178
} else if (streamInfo.videoStreamOptions.device.type() == torch::kCUDA) {
11841179
convertAVFrameToFrameOutputOnCuda(
11851180
streamInfo.videoStreamOptions.device,
11861181
streamInfo.videoStreamOptions,
1187-
avFrameStream,
1182+
avFrame,
11881183
frameOutput,
11891184
preAllocatedOutputTensor);
11901185
} else {
@@ -1205,14 +1200,13 @@ VideoDecoder::FrameOutput VideoDecoder::convertAVFrameToFrameOutput(
12051200
// Dimension order of the preAllocatedOutputTensor must be HWC, regardless of
12061201
// `dimension_order` parameter. It's up to callers to re-shape it if needed.
12071202
void VideoDecoder::convertAVFrameToFrameOutputOnCPU(
1208-
VideoDecoder::AVFrameStream& avFrameStream,
1203+
UniqueAVFrame& avFrame,
12091204
FrameOutput& frameOutput,
12101205
std::optional<torch::Tensor> preAllocatedOutputTensor) {
1211-
AVFrame* avFrame = avFrameStream.avFrame.get();
12121206
auto& streamInfo = streamInfos_[activeStreamIndex_];
12131207

12141208
auto frameDims = getHeightAndWidthFromOptionsOrAVFrame(
1215-
streamInfo.videoStreamOptions, *avFrame);
1209+
streamInfo.videoStreamOptions, avFrame);
12161210
int expectedOutputHeight = frameDims.height;
12171211
int expectedOutputWidth = frameDims.width;
12181212

@@ -1306,7 +1300,7 @@ void VideoDecoder::convertAVFrameToFrameOutputOnCPU(
13061300
}
13071301

13081302
int VideoDecoder::convertAVFrameToTensorUsingSwsScale(
1309-
const AVFrame* avFrame,
1303+
const UniqueAVFrame& avFrame,
13101304
torch::Tensor& outputTensor) {
13111305
StreamInfo& activeStreamInfo = streamInfos_[activeStreamIndex_];
13121306
SwsContext* swsContext = activeStreamInfo.swsContext.get();
@@ -1326,11 +1320,11 @@ int VideoDecoder::convertAVFrameToTensorUsingSwsScale(
13261320
}
13271321

13281322
torch::Tensor VideoDecoder::convertAVFrameToTensorUsingFilterGraph(
1329-
const AVFrame* avFrame) {
1323+
const UniqueAVFrame& avFrame) {
13301324
FilterGraphContext& filterGraphContext =
13311325
streamInfos_[activeStreamIndex_].filterGraphContext;
13321326
int status =
1333-
av_buffersrc_write_frame(filterGraphContext.sourceContext, avFrame);
1327+
av_buffersrc_write_frame(filterGraphContext.sourceContext, avFrame.get());
13341328
if (status < AVSUCCESS) {
13351329
throw std::runtime_error("Failed to add frame to buffer source context");
13361330
}
@@ -1354,18 +1348,18 @@ torch::Tensor VideoDecoder::convertAVFrameToTensorUsingFilterGraph(
13541348
}
13551349

13561350
void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
1357-
VideoDecoder::AVFrameStream& avFrameStream,
1351+
UniqueAVFrame& srcAVFrame,
13581352
FrameOutput& frameOutput,
13591353
std::optional<torch::Tensor> preAllocatedOutputTensor) {
13601354
TORCH_CHECK(
13611355
!preAllocatedOutputTensor.has_value(),
13621356
"pre-allocated audio tensor not supported yet.");
13631357

13641358
AVSampleFormat sourceSampleFormat =
1365-
static_cast<AVSampleFormat>(avFrameStream.avFrame->format);
1359+
static_cast<AVSampleFormat>(srcAVFrame->format);
13661360
AVSampleFormat desiredSampleFormat = AV_SAMPLE_FMT_FLTP;
13671361

1368-
int sourceSampleRate = avFrameStream.avFrame->sample_rate;
1362+
int sourceSampleRate = srcAVFrame->sample_rate;
13691363
int desiredSampleRate =
13701364
streamInfos_[activeStreamIndex_].audioStreamOptions.sampleRate.value_or(
13711365
sourceSampleRate);
@@ -1377,14 +1371,13 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
13771371
UniqueAVFrame convertedAVFrame;
13781372
if (mustConvert) {
13791373
convertedAVFrame = convertAudioAVFrameSampleFormatAndSampleRate(
1380-
avFrameStream.avFrame,
1374+
srcAVFrame,
13811375
sourceSampleFormat,
13821376
desiredSampleFormat,
13831377
sourceSampleRate,
13841378
desiredSampleRate);
13851379
}
1386-
const UniqueAVFrame& avFrame =
1387-
mustConvert ? convertedAVFrame : avFrameStream.avFrame;
1380+
const UniqueAVFrame& avFrame = mustConvert ? convertedAVFrame : srcAVFrame;
13881381

13891382
AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
13901383
TORCH_CHECK(
@@ -1981,10 +1974,10 @@ FrameDims getHeightAndWidthFromOptionsOrMetadata(
19811974

19821975
FrameDims getHeightAndWidthFromOptionsOrAVFrame(
19831976
const VideoDecoder::VideoStreamOptions& videoStreamOptions,
1984-
const AVFrame& avFrame) {
1977+
const UniqueAVFrame& avFrame) {
19851978
return FrameDims(
1986-
videoStreamOptions.height.value_or(avFrame.height),
1987-
videoStreamOptions.width.value_or(avFrame.width));
1979+
videoStreamOptions.height.value_or(avFrame->height),
1980+
videoStreamOptions.width.value_or(avFrame->width));
19881981
}
19891982

19901983
} // namespace facebook::torchcodec

src/torchcodec/decoders/_core/VideoDecoder.h

Lines changed: 15 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -161,9 +161,13 @@ class VideoDecoder {
161161
// They are the equivalent of the user-facing Frame and FrameBatch classes in
162162
// Python. They contain RGB decoded frames along with some associated data
163163
// like PTS and duration.
164+
// FrameOutput is also relevant for audio decoding, typically as the output of
165+
// getNextFrame(), or as a temporary output variable.
164166
struct FrameOutput {
165-
torch::Tensor data; // 3D: of shape CHW or HWC.
166-
int streamIndex;
167+
// data shape is:
168+
// - 3D (C, H, W) or (H, W, C) for videos
169+
// - 2D (numChannels, numSamples) for audio
170+
torch::Tensor data;
167171
double ptsSeconds;
168172
double durationSeconds;
169173
};
@@ -252,23 +256,6 @@ class VideoDecoder {
252256
// These are APIs that should be private, but that are effectively exposed for
253257
// practical reasons, typically for testing purposes.
254258

255-
// This struct is needed because AVFrame doesn't retain the streamIndex. Only
256-
// the AVPacket knows its stream. This is what the low-level private decoding
257-
// entry points return. The AVFrameStream is then converted to a FrameOutput
258-
// with convertAVFrameToFrameOutput. It should be private, but is currently
259-
// used by DeviceInterface.
260-
struct AVFrameStream {
261-
// The actual decoded output as a unique pointer to an AVFrame.
262-
// Usually, this is a YUV frame. It'll be converted to RGB in
263-
// convertAVFrameToFrameOutput.
264-
UniqueAVFrame avFrame;
265-
// The stream index of the decoded frame.
266-
int streamIndex;
267-
268-
explicit AVFrameStream(UniqueAVFrame&& a, int s)
269-
: avFrame(std::move(a)), streamIndex(s) {}
270-
};
271-
272259
// Once getFrameAtIndex supports the preAllocatedOutputTensor parameter, we
273260
// can move it back to private.
274261
FrameOutput getFrameAtIndexInternal(
@@ -385,31 +372,33 @@ class VideoDecoder {
385372

386373
void maybeSeekToBeforeDesiredPts();
387374

388-
AVFrameStream decodeAVFrame(std::function<bool(AVFrame*)> filterFunction);
375+
UniqueAVFrame decodeAVFrame(
376+
std::function<bool(const UniqueAVFrame&)> filterFunction);
389377

390378
FrameOutput getNextFrameInternal(
391379
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
392380

393381
torch::Tensor maybePermuteHWC2CHW(torch::Tensor& hwcTensor);
394382

395383
FrameOutput convertAVFrameToFrameOutput(
396-
AVFrameStream& avFrameStream,
384+
UniqueAVFrame& avFrame,
397385
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
398386

399387
void convertAVFrameToFrameOutputOnCPU(
400-
AVFrameStream& avFrameStream,
388+
UniqueAVFrame& avFrame,
401389
FrameOutput& frameOutput,
402390
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
403391

404392
void convertAudioAVFrameToFrameOutputOnCPU(
405-
AVFrameStream& avFrameStream,
393+
UniqueAVFrame& srcAVFrame,
406394
FrameOutput& frameOutput,
407395
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
408396

409-
torch::Tensor convertAVFrameToTensorUsingFilterGraph(const AVFrame* avFrame);
397+
torch::Tensor convertAVFrameToTensorUsingFilterGraph(
398+
const UniqueAVFrame& avFrame);
410399

411400
int convertAVFrameToTensorUsingSwsScale(
412-
const AVFrame* avFrame,
401+
const UniqueAVFrame& avFrame,
413402
torch::Tensor& outputTensor);
414403

415404
UniqueAVFrame convertAudioAVFrameSampleFormatAndSampleRate(
@@ -580,7 +569,7 @@ FrameDims getHeightAndWidthFromOptionsOrMetadata(
580569

581570
FrameDims getHeightAndWidthFromOptionsOrAVFrame(
582571
const VideoDecoder::VideoStreamOptions& videoStreamOptions,
583-
const AVFrame& avFrame);
572+
const UniqueAVFrame& avFrame);
584573

585574
torch::Tensor allocateEmptyHWCTensor(
586575
int height,

test/decoders/test_ops.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -650,12 +650,15 @@ def test_next(self, asset):
650650
frame_index = 0
651651
while True:
652652
try:
653-
frame, *_ = get_next_frame(decoder)
653+
frame, pts_seconds, duration_seconds = get_next_frame(decoder)
654654
except IndexError:
655655
break
656656
torch.testing.assert_close(
657657
frame, asset.get_frame_data_by_index(frame_index)
658658
)
659+
frame_info = asset.get_frame_info(frame_index)
660+
assert pts_seconds == frame_info.pts_seconds
661+
assert duration_seconds == frame_info.duration_seconds
659662
frame_index += 1
660663

661664
@pytest.mark.parametrize(

0 commit comments

Comments (0)