Move stream options and frame output structs to dedicated headers #620

Merged · 6 commits · Apr 7, 2025 · Changes shown from all commits

src/torchcodec/_core/CudaDevice.cpp (2 additions, 2 deletions)

```diff
@@ -190,9 +190,9 @@ void CudaDevice::initializeContext(AVCodecContext* codecContext) {
 }
 
 void CudaDevice::convertAVFrameToFrameOutput(
-    const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions,
+    const VideoStreamOptions& videoStreamOptions,
     UniqueAVFrame& avFrame,
-    SingleStreamDecoder::FrameOutput& frameOutput,
+    FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   TORCH_CHECK(
       avFrame->format == AV_PIX_FMT_CUDA,
```

src/torchcodec/_core/CudaDevice.h (2 additions, 2 deletions)

```diff
@@ -21,9 +21,9 @@ class CudaDevice : public DeviceInterface {
   void initializeContext(AVCodecContext* codecContext) override;
 
   void convertAVFrameToFrameOutput(
-      const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions,
+      const VideoStreamOptions& videoStreamOptions,
       UniqueAVFrame& avFrame,
-      SingleStreamDecoder::FrameOutput& frameOutput,
+      FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor =
          std::nullopt) override;
```

src/torchcodec/_core/DeviceInterface.h (4 additions, 3 deletions)

```diff
@@ -12,7 +12,8 @@
 #include <stdexcept>
 #include <string>
 #include "FFMPEGCommon.h"
-#include "src/torchcodec/_core/SingleStreamDecoder.h"
+#include "src/torchcodec/_core/Frame.h"
+#include "src/torchcodec/_core/StreamOptions.h"
 
 namespace facebook::torchcodec {
 
@@ -41,9 +42,9 @@ class DeviceInterface {
  virtual void initializeContext(AVCodecContext* codecContext) = 0;
 
  virtual void convertAVFrameToFrameOutput(
-      const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions,
+      const VideoStreamOptions& videoStreamOptions,
       UniqueAVFrame& avFrame,
-      SingleStreamDecoder::FrameOutput& frameOutput,
+      FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt) = 0;
 
 protected:
```

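With the structs at namespace scope, a device backend can now be written against DeviceInterface.h alone instead of pulling in all of SingleStreamDecoder.h. The following is a minimal sketch of a hypothetical backend, only to illustrate the new unqualified signatures; the class name and the dummy tensor allocation are assumptions, and members not shown in this diff (such as findCodec or any construction/registration machinery) are omitted:

```cpp
#include "src/torchcodec/_core/DeviceInterface.h"

namespace facebook::torchcodec {

// Hypothetical backend, for illustration only; not part of this PR.
class FakeDevice : public DeviceInterface {
 public:
  void initializeContext(AVCodecContext* codecContext) override {
    // A real backend would attach a hardware device context here;
    // this sketch leaves the codec context untouched.
    (void)codecContext;
  }

  void convertAVFrameToFrameOutput(
      const VideoStreamOptions& videoStreamOptions, // was SingleStreamDecoder::VideoStreamOptions
      UniqueAVFrame& avFrame,
      FrameOutput& frameOutput, // was SingleStreamDecoder::FrameOutput
      std::optional<torch::Tensor> preAllocatedOutputTensor =
          std::nullopt) override {
    (void)videoStreamOptions;
    // Dummy conversion: reuse the pre-allocated tensor if given, otherwise
    // allocate an HWC uint8 tensor matching the frame's dimensions.
    frameOutput.data = preAllocatedOutputTensor.value_or(
        torch::empty({avFrame->height, avFrame->width, 3}, torch::kUInt8));
  }
};

} // namespace facebook::torchcodec
```
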
src/torchcodec/_core/Frame.h (new file, 47 additions)

```cpp
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <torch/types.h>
#include "src/torchcodec/_core/Metadata.h"
#include "src/torchcodec/_core/StreamOptions.h"

namespace facebook::torchcodec {

// All public video decoding entry points return either a FrameOutput or a
// FrameBatchOutput.
// They are the equivalent of the user-facing Frame and FrameBatch classes in
// Python. They contain RGB decoded frames along with some associated data
// like PTS and duration.
// FrameOutput is also relevant for audio decoding, typically as the output of
// getNextFrame(), or as a temporary output variable.
struct FrameOutput {
  // data shape is:
  // - 3D (C, H, W) or (H, W, C) for videos
  // - 2D (numChannels, numSamples) for audio
  torch::Tensor data;
  double ptsSeconds;
  double durationSeconds;
};

struct FrameBatchOutput {
  torch::Tensor data; // 4D: of shape NCHW or NHWC.
  torch::Tensor ptsSeconds; // 1D of shape (N,)
  torch::Tensor durationSeconds; // 1D of shape (N,)

  explicit FrameBatchOutput(
      int64_t numFrames,
      const VideoStreamOptions& videoStreamOptions,
      const StreamMetadata& streamMetadata);
};

struct AudioFramesOutput {
  torch::Tensor data; // shape is (numChannels, numSamples)
  double ptsSeconds;
};

} // namespace facebook::torchcodec
```

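For orientation, a small usage sketch of the struct callers now receive; it takes an already-configured decoder by reference, since decoder construction is not part of this diff:

```cpp
#include <iostream>
#include "src/torchcodec/_core/Frame.h"
#include "src/torchcodec/_core/SingleStreamDecoder.h"

using namespace facebook::torchcodec;

void printNextFrame(SingleStreamDecoder& decoder) {
  // Before this PR, the return type was spelled
  // SingleStreamDecoder::FrameOutput; it is now simply FrameOutput.
  FrameOutput frame = decoder.getNextFrame();
  std::cout << "shape=" << frame.data.sizes() // (C, H, W) for video
            << " pts=" << frame.ptsSeconds << "s"
            << " duration=" << frame.durationSeconds << "s\n";
}
```
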
src/torchcodec/_core/Metadata.h (new file, 70 additions)

```cpp
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <optional>
#include <string>
#include <vector>

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavutil/avutil.h>
}

namespace facebook::torchcodec {

struct StreamMetadata {
  // Common (video and audio) fields derived from the AVStream.
  int streamIndex;
  // See this link for what various values are available:
  // https://ffmpeg.org/doxygen/trunk/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48
  AVMediaType mediaType;
  std::optional<AVCodecID> codecId;
  std::optional<std::string> codecName;
  std::optional<double> durationSeconds;
  std::optional<double> beginStreamFromHeader;
  std::optional<int64_t> numFrames;
  std::optional<int64_t> numKeyFrames;
  std::optional<double> averageFps;
  std::optional<double> bitRate;

  // More accurate duration, obtained by scanning the file.
  // These presentation timestamps are in time base.
  std::optional<int64_t> minPtsFromScan;
  std::optional<int64_t> maxPtsFromScan;
  // These presentation timestamps are in seconds.
  std::optional<double> minPtsSecondsFromScan;
  std::optional<double> maxPtsSecondsFromScan;
  // This can be useful for index-based seeking.
  std::optional<int64_t> numFramesFromScan;

  // Video-only fields derived from the AVCodecContext.
  std::optional<int64_t> width;
  std::optional<int64_t> height;

  // Audio-only fields
  std::optional<int64_t> sampleRate;
  std::optional<int64_t> numChannels;
  std::optional<std::string> sampleFormat;
};

struct ContainerMetadata {
  std::vector<StreamMetadata> allStreamMetadata;
  int numAudioStreams = 0;
  int numVideoStreams = 0;
  // Note that this is the container-level duration, which is usually the max
  // of all stream durations available in the container.
  std::optional<double> durationSeconds;
  // Total BitRate level information at the container level in bit/s
  std::optional<double> bitRate;
  // If set, this is the index to the default audio stream.
  std::optional<int> bestAudioStreamIndex;
  // If set, this is the index to the default video stream.
  std::optional<int> bestVideoStreamIndex;
};

} // namespace facebook::torchcodec
```

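A short sketch of how a caller might walk this metadata, with the optional fields handled explicitly; getContainerMetadata() appears in the SingleStreamDecoder.cpp diff below, and everything else here is standard C++:

```cpp
#include <iostream>
#include "src/torchcodec/_core/Metadata.h"

using namespace facebook::torchcodec;

void summarize(const ContainerMetadata& container) {
  std::cout << container.numVideoStreams << " video / "
            << container.numAudioStreams << " audio streams\n";
  for (const StreamMetadata& stream : container.allStreamMetadata) {
    std::cout << "stream " << stream.streamIndex << ": "
              << stream.codecName.value_or("<unknown codec>");
    // Most fields are std::optional because they may be absent from the
    // header; prefer the scan-derived frame count when it exists.
    if (stream.numFramesFromScan.has_value()) {
      std::cout << ", " << *stream.numFramesFromScan << " frames (scanned)";
    } else if (stream.numFrames.has_value()) {
      std::cout << ", " << *stream.numFrames << " frames (from header)";
    }
    std::cout << "\n";
  }
}
```
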
src/torchcodec/_core/SingleStreamDecoder.cpp (26 additions, 34 deletions)

```diff
@@ -13,7 +13,6 @@
 #include <sstream>
 #include <stdexcept>
 #include <string_view>
-#include "src/torchcodec/_core/DeviceInterface.h"
 #include "torch/types.h"
 
 extern "C" {
@@ -350,8 +349,7 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() {
   scannedAllStreams_ = true;
 }
 
-SingleStreamDecoder::ContainerMetadata
-SingleStreamDecoder::getContainerMetadata() const {
+ContainerMetadata SingleStreamDecoder::getContainerMetadata() const {
   return containerMetadata_;
 }
 
@@ -406,7 +404,7 @@ void SingleStreamDecoder::addStream(
   streamInfo.stream = formatContext_->streams[activeStreamIndex_];
   streamInfo.avMediaType = mediaType;
 
-  deviceInterface = createDeviceInterface(device);
+  deviceInterface_ = createDeviceInterface(device);
 
   // This should never happen, checking just to be safe.
   TORCH_CHECK(
@@ -418,9 +416,9 @@
   // TODO_CODE_QUALITY it's pretty meh to have a video-specific logic within
   // addStream() which is supposed to be generic
   if (mediaType == AVMEDIA_TYPE_VIDEO) {
-    if (deviceInterface) {
+    if (deviceInterface_) {
       avCodec = makeAVCodecOnlyUseForCallingAVFindBestStream(
-          deviceInterface->findCodec(streamInfo.stream->codecpar->codec_id)
+          deviceInterface_->findCodec(streamInfo.stream->codecpar->codec_id)
              .value_or(avCodec));
    }
  }
@@ -438,8 +436,8 @@
 
  // TODO_CODE_QUALITY same as above.
  if (mediaType == AVMEDIA_TYPE_VIDEO) {
-    if (deviceInterface) {
-      deviceInterface->initializeContext(codecContext);
+    if (deviceInterface_) {
+      deviceInterface_->initializeContext(codecContext);
    }
  }
 
@@ -501,9 +499,8 @@ void SingleStreamDecoder::addVideoStream(
   // swscale requires widths to be multiples of 32:
   // https://stackoverflow.com/questions/74351955/turn-off-sw-scale-conversion-to-planar-yuv-32-byte-alignment-requirements
   // so we fall back to filtergraph if the width is not a multiple of 32.
-  auto defaultLibrary = (width % 32 == 0)
-      ? SingleStreamDecoder::ColorConversionLibrary::SWSCALE
-      : SingleStreamDecoder::ColorConversionLibrary::FILTERGRAPH;
+  auto defaultLibrary = (width % 32 == 0) ? ColorConversionLibrary::SWSCALE
+                                          : ColorConversionLibrary::FILTERGRAPH;
 
   streamInfo.colorConversionLibrary =
       videoStreamOptions.colorConversionLibrary.value_or(defaultLibrary);
```

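The default above can be illustrated in isolation (a standalone sketch; the widths are arbitrary examples, not taken from the PR):

```cpp
// Sketch of the default color-conversion choice made in addVideoStream().
auto defaultLibraryFor = [](int width) {
  return (width % 32 == 0) ? ColorConversionLibrary::SWSCALE
                           : ColorConversionLibrary::FILTERGRAPH;
};
// defaultLibraryFor(1920) == SWSCALE      (1920 = 60 * 32)
// defaultLibraryFor(1918) == FILTERGRAPH  (1918 % 32 == 30)
```
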
src/torchcodec/_core/SingleStreamDecoder.cpp (continued)

```diff
@@ -539,30 +536,29 @@ void SingleStreamDecoder::addAudioStream(
 // HIGH-LEVEL DECODING ENTRY-POINTS
 // --------------------------------------------------------------------------
 
-SingleStreamDecoder::FrameOutput SingleStreamDecoder::getNextFrame() {
+FrameOutput SingleStreamDecoder::getNextFrame() {
   auto output = getNextFrameInternal();
   if (streamInfos_[activeStreamIndex_].avMediaType == AVMEDIA_TYPE_VIDEO) {
     output.data = maybePermuteHWC2CHW(output.data);
   }
   return output;
 }
 
-SingleStreamDecoder::FrameOutput SingleStreamDecoder::getNextFrameInternal(
+FrameOutput SingleStreamDecoder::getNextFrameInternal(
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   validateActiveStream();
   UniqueAVFrame avFrame = decodeAVFrame(
       [this](const UniqueAVFrame& avFrame) { return avFrame->pts >= cursor_; });
   return convertAVFrameToFrameOutput(avFrame, preAllocatedOutputTensor);
 }
 
-SingleStreamDecoder::FrameOutput SingleStreamDecoder::getFrameAtIndex(
-    int64_t frameIndex) {
+FrameOutput SingleStreamDecoder::getFrameAtIndex(int64_t frameIndex) {
   auto frameOutput = getFrameAtIndexInternal(frameIndex);
   frameOutput.data = maybePermuteHWC2CHW(frameOutput.data);
   return frameOutput;
 }
 
-SingleStreamDecoder::FrameOutput SingleStreamDecoder::getFrameAtIndexInternal(
+FrameOutput SingleStreamDecoder::getFrameAtIndexInternal(
     int64_t frameIndex,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   validateActiveStream(AVMEDIA_TYPE_VIDEO);
@@ -577,7 +573,7 @@ SingleStreamDecoder::FrameOutput SingleStreamDecoder::getFrameAtIndexInternal(
   return getNextFrameInternal(preAllocatedOutputTensor);
 }
 
-SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesAtIndices(
+FrameBatchOutput SingleStreamDecoder::getFramesAtIndices(
     const std::vector<int64_t>& frameIndices) {
   validateActiveStream(AVMEDIA_TYPE_VIDEO);
 
@@ -636,7 +632,7 @@ SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesAtIndices(
   return frameBatchOutput;
 }
 
-SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesInRange(
+FrameBatchOutput SingleStreamDecoder::getFramesInRange(
     int64_t start,
     int64_t stop,
     int64_t step) {
@@ -670,8 +666,7 @@ SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesInRange(
   return frameBatchOutput;
 }
 
-SingleStreamDecoder::FrameOutput SingleStreamDecoder::getFramePlayedAt(
-    double seconds) {
+FrameOutput SingleStreamDecoder::getFramePlayedAt(double seconds) {
   validateActiveStream(AVMEDIA_TYPE_VIDEO);
   StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
   double frameStartTime =
@@ -711,7 +706,7 @@ SingleStreamDecoder::FrameOutput SingleStreamDecoder::getFramePlayedAt(
   return frameOutput;
 }
 
-SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesPlayedAt(
+FrameBatchOutput SingleStreamDecoder::getFramesPlayedAt(
     const std::vector<double>& timestamps) {
   validateActiveStream(AVMEDIA_TYPE_VIDEO);
 
@@ -741,8 +736,7 @@ SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesPlayedAt(
   return getFramesAtIndices(frameIndices);
 }
 
-SingleStreamDecoder::FrameBatchOutput
-SingleStreamDecoder::getFramesPlayedInRange(
+FrameBatchOutput SingleStreamDecoder::getFramesPlayedInRange(
     double startSeconds,
     double stopSeconds) {
   validateActiveStream(AVMEDIA_TYPE_VIDEO);
@@ -875,8 +869,7 @@ SingleStreamDecoder::getFramesPlayedInRange(
 // [2] If you're brave and curious, you can read the long "Seek offset for
 // audio" note in https://github.com/pytorch/torchcodec/pull/507/files, which
 // sums up past (and failed) attemps at working around this issue.
-SingleStreamDecoder::AudioFramesOutput
-SingleStreamDecoder::getFramesPlayedInRangeAudio(
+AudioFramesOutput SingleStreamDecoder::getFramesPlayedInRangeAudio(
     double startSeconds,
     std::optional<double> stopSecondsOptional) {
   validateActiveStream(AVMEDIA_TYPE_AUDIO);
@@ -1196,8 +1189,7 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame(
 // AVFRAME <-> FRAME OUTPUT CONVERSION
 // --------------------------------------------------------------------------
 
-SingleStreamDecoder::FrameOutput
-SingleStreamDecoder::convertAVFrameToFrameOutput(
+FrameOutput SingleStreamDecoder::convertAVFrameToFrameOutput(
     UniqueAVFrame& avFrame,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   // Convert the frame to tensor.
@@ -1210,11 +1202,11 @@ SingleStreamDecoder::convertAVFrameToFrameOutput(
       formatContext_->streams[activeStreamIndex_]->time_base);
   if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
     convertAudioAVFrameToFrameOutputOnCPU(avFrame, frameOutput);
-  } else if (!deviceInterface) {
+  } else if (!deviceInterface_) {
     convertAVFrameToFrameOutputOnCPU(
         avFrame, frameOutput, preAllocatedOutputTensor);
-  } else if (deviceInterface) {
-    deviceInterface->convertAVFrameToFrameOutput(
+  } else if (deviceInterface_) {
+    deviceInterface_->convertAVFrameToFrameOutput(
         streamInfo.videoStreamOptions,
         avFrame,
         frameOutput,
@@ -1547,7 +1539,7 @@ std::optional<torch::Tensor> SingleStreamDecoder::maybeFlushSwrBuffers() {
 // OUTPUT ALLOCATION AND SHAPE CONVERSION
 // --------------------------------------------------------------------------
 
-SingleStreamDecoder::FrameBatchOutput::FrameBatchOutput(
+FrameBatchOutput::FrameBatchOutput(
     int64_t numFrames,
     const VideoStreamOptions& videoStreamOptions,
     const StreamMetadata& streamMetadata)
@@ -2047,15 +2039,15 @@ FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame) {
 }
 
 FrameDims getHeightAndWidthFromOptionsOrMetadata(
-    const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions,
-    const SingleStreamDecoder::StreamMetadata& streamMetadata) {
+    const VideoStreamOptions& videoStreamOptions,
+    const StreamMetadata& streamMetadata) {
   return FrameDims(
       videoStreamOptions.height.value_or(*streamMetadata.height),
       videoStreamOptions.width.value_or(*streamMetadata.width));
 }
 
 FrameDims getHeightAndWidthFromOptionsOrAVFrame(
-    const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions,
+    const VideoStreamOptions& videoStreamOptions,
     const UniqueAVFrame& avFrame) {
   return FrameDims(
       videoStreamOptions.height.value_or(avFrame->height),
```