Audio encoding - part 1 of N (#524)

NicolasHug · web-flow · commit bea7360c9bee · 2025-04-03T16:37:02.000+01:00
diff --git a/src/torchcodec/_core/CMakeLists.txt b/src/torchcodec/_core/CMakeLists.txt
@@ -61,6 +61,9 @@ function(make_torchcodec_libraries
         AVIOContextHolder.cpp
         FFMPEGCommon.cpp
         SingleStreamDecoder.cpp
+        # TODO: lib name should probably not be "*_decoder*" now that it also
+        # contains an encoder
+        Encoder.cpp
     )
 
     if(ENABLE_CUDA)
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -0,0 +1,219 @@
+#include "src/torchcodec/_core/Encoder.h"
+
+#include <algorithm>
+#include <cstring>
+#include <string>
+
+#include "torch/types.h"
+
+namespace facebook::torchcodec {
+
+// Closes the output file that the constructor opened with avio_open().
+// avformat_free_context() (run by avFormatContext_'s deleter once this body
+// returns) frees the AVFormatContext but does not close its AVIOContext, so
+// without this the file handle would leak and buffered bytes might never be
+// flushed to disk.
+AudioEncoder::~AudioEncoder() {
+  if (avFormatContext_ != nullptr && avFormatContext_->pb != nullptr) {
+    // Destructors must not throw, so the return code is deliberately ignored.
+    // avio_closep() also resets pb to nullptr.
+    avio_closep(&avFormatContext_->pb);
+  }
+}
+
+// TODO-ENCODING: disable ffmpeg logs by default
+
+// Prepares everything needed to encode `wf` into the file `fileName`:
+// allocates the output format context from the file name, opens the file for
+// writing, opens the container's default audio encoder and registers a single
+// audio stream.
+//
+// wf: float32 waveform with 2 dimensions. Dimension 0 is used as the number
+//   of channels and dimension 1 as the number of samples per channel.
+// sampleRate: sample rate handed to the encoder.
+// fileName: path of the output file.
+//
+// Raises (via TORCH_CHECK) when the input is invalid or an FFmpeg call fails.
+AudioEncoder::AudioEncoder(
+    const torch::Tensor wf,
+    int sampleRate,
+    std::string_view fileName)
+    // encode() walks the raw buffer channel by channel, which is only valid
+    // for a contiguous tensor. contiguous() returns the same tensor when wf is
+    // already contiguous.
+    : wf_(wf.contiguous()), sampleRate_(sampleRate) {
+  TORCH_CHECK(
+      wf_.dtype() == torch::kFloat32,
+      "waveform must have float32 dtype, got ",
+      wf_.dtype());
+  TORCH_CHECK(
+      wf_.dim() == 2, "waveform must have 2 dimensions, got ", wf_.dim());
+
+  // std::string_view::data() is not guaranteed to be NUL-terminated, but the
+  // FFmpeg functions below expect a C string. Make an owning copy.
+  const std::string fileNameStr(fileName);
+
+  AVFormatContext* avFormatContext = nullptr;
+  auto status = avformat_alloc_output_context2(
+      &avFormatContext, nullptr, nullptr, fileNameStr.c_str());
+  TORCH_CHECK(
+      avFormatContext != nullptr,
+      "Couldn't allocate AVFormatContext. ",
+      "Check the desired extension? ",
+      getFFMPEGErrorStringFromErrorCode(status));
+  avFormatContext_.reset(avFormatContext);
+
+  // TODO-ENCODING: Should also support encoding into bytes (use
+  // AVIOBytesContext)
+  TORCH_CHECK(
+      !(avFormatContext_->oformat->flags & AVFMT_NOFILE),
+      "AVFMT_NOFILE is set. We only support writing to a file.");
+  status =
+      avio_open(&avFormatContext_->pb, fileNameStr.c_str(), AVIO_FLAG_WRITE);
+  TORCH_CHECK(
+      status >= 0,
+      "avio_open failed: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+
+  // We use the AVFormatContext's default codec for that
+  // specific format/container.
+  const AVCodec* avCodec =
+      avcodec_find_encoder(avFormatContext_->oformat->audio_codec);
+  TORCH_CHECK(avCodec != nullptr, "Codec not found");
+
+  AVCodecContext* avCodecContext = avcodec_alloc_context3(avCodec);
+  TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
+  avCodecContext_.reset(avCodecContext);
+
+  // TODO-ENCODING I think this sets the bit rate to the minimum supported.
+  // That's not what the ffmpeg CLI would choose by default, so we should try to
+  // do the same.
+  // TODO-ENCODING Should also let user choose for compressed formats like mp3.
+  avCodecContext_->bit_rate = 0;
+
+  avCodecContext_->sample_rate = sampleRate_;
+
+  // Note: This is the format of the **input** waveform. This doesn't determine
+  // the output. wf_ is contiguous (see the initializer list), so each row of
+  // the (channels, samples) tensor is one plane.
+  // TODO-ENCODING If the encoder doesn't support FLTP (like flac), FFmpeg will
+  // raise. We need to handle this, probably converting the format with
+  // libswresample.
+  avCodecContext_->sample_fmt = AV_SAMPLE_FMT_FLTP;
+
+  int numChannels = static_cast<int>(wf_.sizes()[0]);
+  TORCH_CHECK(
+      // TODO-ENCODING is this even true / needed? We can probably support more
+      // with non-planar data?
+      numChannels <= AV_NUM_DATA_POINTERS,
+      "Trying to encode ",
+      numChannels,
+      " channels, but FFmpeg only supports ",
+      AV_NUM_DATA_POINTERS,
+      " channels per frame.");
+
+  setDefaultChannelLayout(avCodecContext_, numChannels);
+
+  status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
+  TORCH_CHECK(
+      status == AVSUCCESS,
+      "avcodec_open2 failed: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+
+  TORCH_CHECK(
+      avCodecContext_->frame_size > 0,
+      "frame_size is ",
+      avCodecContext_->frame_size,
+      ". Cannot encode. This should probably never happen?");
+
+  // We're allocating the stream here. Streams are meant to be freed by
+  // avformat_free_context(avFormatContext), which we call in the
+  // avFormatContext_'s destructor.
+  AVStream* avStream = avformat_new_stream(avFormatContext_.get(), nullptr);
+  TORCH_CHECK(avStream != nullptr, "Couldn't create new stream.");
+  status = avcodec_parameters_from_context(
+      avStream->codecpar, avCodecContext_.get());
+  TORCH_CHECK(
+      status == AVSUCCESS,
+      "avcodec_parameters_from_context failed: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+  streamIndex_ = avStream->index;
+}
+
+// Encodes the whole waveform and writes it to the output file: the container
+// header, then the samples in chunks of at most AVCodecContext.frame_size
+// samples per channel, then whatever the encoder still has buffered, then the
+// container trailer.
+//
+// Raises (via TORCH_CHECK) when any FFmpeg call fails.
+void AudioEncoder::encode() {
+  UniqueAVFrame avFrame(av_frame_alloc());
+  TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame.");
+  avFrame->nb_samples = avCodecContext_->frame_size;
+  avFrame->format = avCodecContext_->sample_fmt;
+  avFrame->sample_rate = avCodecContext_->sample_rate;
+  avFrame->pts = 0;
+  setChannelLayout(avFrame, avCodecContext_);
+
+  auto status = av_frame_get_buffer(avFrame.get(), 0);
+  TORCH_CHECK(
+      status == AVSUCCESS,
+      "Couldn't allocate avFrame's buffers: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+
+  AutoAVPacket autoAVPacket;
+
+  uint8_t* pwf = static_cast<uint8_t*>(wf_.data_ptr());
+  const int numChannels = static_cast<int>(wf_.sizes()[0]);
+  // Sample counts and byte offsets are kept in 64 bits: a long waveform can
+  // exceed what fits in an int once multiplied by the sample size.
+  const int64_t numSamples = wf_.sizes()[1]; // per channel
+  int64_t numEncodedSamples = 0; // per channel
+  const int numSamplesPerFrame = avCodecContext_->frame_size; // per channel
+  const int numBytesPerSample = static_cast<int>(wf_.element_size());
+  const int64_t numBytesPerChannel = numSamples * numBytesPerSample;
+
+  status = avformat_write_header(avFormatContext_.get(), nullptr);
+  TORCH_CHECK(
+      status == AVSUCCESS,
+      "Error in avformat_write_header: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+
+  while (numEncodedSamples < numSamples) {
+    status = av_frame_make_writable(avFrame.get());
+    TORCH_CHECK(
+        status == AVSUCCESS,
+        "Couldn't make AVFrame writable: ",
+        getFFMPEGErrorStringFromErrorCode(status));
+
+    // Bounded by numSamplesPerFrame, so the narrowing cast is safe.
+    const int numSamplesToEncode = static_cast<int>(std::min<int64_t>(
+        numSamplesPerFrame, numSamples - numEncodedSamples));
+    const int numBytesToEncode = numSamplesToEncode * numBytesPerSample;
+
+    // Channel `ch` starts ch * numBytesPerChannel bytes after channel 0; pwf
+    // tracks how far into each channel we already are.
+    for (int ch = 0; ch < numChannels; ch++) {
+      std::memcpy(
+          avFrame->data[ch], pwf + ch * numBytesPerChannel, numBytesToEncode);
+    }
+    pwf += numBytesToEncode;
+
+    // Above, we set the AVFrame's .nb_samples to AVCodecContext.frame_size so
+    // that the frame buffers are allocated to a big enough size. Here, we reset
+    // it to the exact number of samples that need to be encoded, otherwise the
+    // encoded frame would contain more samples than necessary and our results
+    // wouldn't match the ffmpeg CLI.
+    avFrame->nb_samples = numSamplesToEncode;
+    encodeInnerLoop(autoAVPacket, avFrame);
+
+    avFrame->pts += static_cast<int64_t>(numSamplesToEncode);
+    numEncodedSamples += numSamplesToEncode;
+  }
+  TORCH_CHECK(numEncodedSamples == numSamples, "Hmmmmmm something went wrong.");
+
+  flushBuffers();
+
+  status = av_write_trailer(avFormatContext_.get());
+  TORCH_CHECK(
+      status == AVSUCCESS,
+      "Error in av_write_trailer: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+}
+
+// Sends avFrame to the encoder, then pulls packets out of the encoder one at
+// a time and writes each of them to the output, until the encoder reports
+// EAGAIN or EOF. Any other FFmpeg failure raises via TORCH_CHECK.
+void AudioEncoder::encodeInnerLoop(
+    AutoAVPacket& autoAVPacket,
+    const UniqueAVFrame& avFrame) {
+  const int sendStatus =
+      avcodec_send_frame(avCodecContext_.get(), avFrame.get());
+  TORCH_CHECK(
+      sendStatus == AVSUCCESS,
+      "Error while sending frame: ",
+      getFFMPEGErrorStringFromErrorCode(sendStatus));
+
+  // The only way out of this loop is the EAGAIN / EOF return below, or a
+  // failed TORCH_CHECK.
+  while (true) {
+    ReferenceAVPacket packet(autoAVPacket);
+    const int receiveStatus =
+        avcodec_receive_packet(avCodecContext_.get(), packet.get());
+
+    const bool noPacketAvailable =
+        receiveStatus == AVERROR(EAGAIN) || receiveStatus == AVERROR_EOF;
+    if (noPacketAvailable) {
+      // TODO-ENCODING TorchAudio additionally calls, on AVERROR_EOF only,
+      //   av_interleaved_write_frame(avFormatContext_.get(), nullptr)
+      // and checks its status ("Failed to flush packet "). Probably needed,
+      // but not sure.
+      return;
+    }
+    TORCH_CHECK(
+        receiveStatus >= 0,
+        "Error receiving packet: ",
+        getFFMPEGErrorStringFromErrorCode(receiveStatus));
+
+    packet->stream_index = streamIndex_;
+
+    const int writeStatus =
+        av_interleaved_write_frame(avFormatContext_.get(), packet.get());
+    TORCH_CHECK(
+        writeStatus == AVSUCCESS,
+        "Error in av_interleaved_write_frame: ",
+        getFFMPEGErrorStringFromErrorCode(writeStatus));
+  }
+}
+
+// Runs the send/receive/write loop one more time with a null frame, using a
+// fresh packet.
+void AudioEncoder::flushBuffers() {
+  AutoAVPacket autoAVPacket;
+  const UniqueAVFrame nullFrame(nullptr);
+  encodeInnerLoop(autoAVPacket, nullFrame);
+}
+} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
@@ -0,0 +1,36 @@
+#pragma once
+#include <torch/types.h>
+#include "src/torchcodec/_core/FFMPEGCommon.h"
+
+namespace facebook::torchcodec {
+// Encodes a waveform tensor into an audio file with FFmpeg. The constructor
+// sets up the output file, the encoder and the audio stream; encode() does the
+// actual encoding and writing.
+class AudioEncoder {
+ public:
+  ~AudioEncoder();
+
+  // wf: float32 tensor with 2 dimensions (both checked by the constructor).
+  //   Dimension 0 is used as the number of channels and dimension 1 as the
+  //   number of samples per channel.
+  // sampleRate: see sampleRate_ below.
+  // fileName: path of the file to write. The constructor opens it for
+  //   writing.
+  AudioEncoder(
+      const torch::Tensor wf,
+      int sampleRate,
+      std::string_view fileName);
+
+  // Encodes wf_ and writes the header, the encoded packets and the trailer to
+  // the output file.
+  // NOTE(review): looks like this is meant to be called at most once per
+  // instance (it writes the header and trailer each time) - confirm.
+  void encode();
+
+ private:
+  // Sends avFrame to the encoder and writes every packet the encoder returns
+  // to the output.
+  void encodeInnerLoop(
+      AutoAVPacket& autoAVPacket,
+      const UniqueAVFrame& avFrame);
+  // Calls encodeInnerLoop with a null frame.
+  void flushBuffers();
+
+  UniqueEncodingAVFormatContext avFormatContext_;
+  UniqueAVCodecContext avCodecContext_;
+  // Index of the audio stream created by the constructor. -1 until then, so
+  // that the member never holds an indeterminate value.
+  int streamIndex_ = -1;
+
+  const torch::Tensor wf_;
+  // The *output* sample rate. We can't really decide for the user what it
+  // should be. Particularly, the sample rate of the input waveform should match
+  // this, and that's up to the user. If sample rates don't match, encoding will
+  // still work but audio will be distorted.
+  // We technically could let the user also specify the input sample rate, and
+  // resample the waveform internally to match them, but that's not in scope for
+  // an initial version (if at all).
+  int sampleRate_;
+};
+} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp
@@ -74,6 +74,38 @@ int getNumChannels(const UniqueAVCodecContext& avCodecContext) {
 #endif
 }
 
+// Sets avCodecContext's channel layout to FFmpeg's default layout for
+// numChannels channels. On FFmpeg 4 this also sets the channel count, which
+// is a separate field there.
+void setDefaultChannelLayout(
+    UniqueAVCodecContext& avCodecContext,
+    int numChannels) {
+#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
+  // Zero-initialize the struct: when FFmpeg has no predefined layout for
+  // numChannels, av_channel_layout_default() may only fill in some of the
+  // fields, and the whole struct is copied into the codec context below.
+  AVChannelLayout channel_layout = {};
+  av_channel_layout_default(&channel_layout, numChannels);
+  avCodecContext->ch_layout = channel_layout;
+
+#else
+  uint64_t channel_layout = av_get_default_channel_layout(numChannels);
+  avCodecContext->channel_layout = channel_layout;
+  avCodecContext->channels = numChannels;
+#endif
+}
+
+// Gives dstAVFrame the channel layout of avCodecContext. On FFmpeg > 4 the
+// layout is copied with av_channel_layout_copy and a failed copy raises via
+// TORCH_CHECK; on FFmpeg 4 the layout mask and the channel count are plain
+// fields that get assigned.
+void setChannelLayout(
+    UniqueAVFrame& dstAVFrame,
+    const UniqueAVCodecContext& avCodecContext) {
+#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
+  const AVChannelLayout* srcLayout = &avCodecContext->ch_layout;
+  const int copyStatus =
+      av_channel_layout_copy(&dstAVFrame->ch_layout, srcLayout);
+  TORCH_CHECK(
+      copyStatus == AVSUCCESS,
+      "Couldn't copy channel layout to avFrame: ",
+      getFFMPEGErrorStringFromErrorCode(copyStatus));
+#else
+  dstAVFrame->channels = avCodecContext->channels;
+  dstAVFrame->channel_layout = avCodecContext->channel_layout;
+#endif
+}
+
 void setChannelLayout(
     UniqueAVFrame& dstAVFrame,
     const UniqueAVFrame& srcAVFrame) {
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
@@ -50,9 +50,12 @@ struct Deleter {
 };
 
 // Unique pointers for FFMPEG structures.
-using UniqueAVFormatContext = std::unique_ptr<
+using UniqueDecodingAVFormatContext = std::unique_ptr<
     AVFormatContext,
     Deleterp<AVFormatContext, void, avformat_close_input>>;
+// Format context used for encoding. Unlike the decoding variant above, which
+// is released with avformat_close_input, this one is released with
+// avformat_free_context.
+// NOTE(review): avformat_free_context presumably does not close the
+// context's AVIOContext (pb); whoever opens it has to close it - confirm.
+using UniqueEncodingAVFormatContext = std::unique_ptr<
+    AVFormatContext,
+    Deleter<AVFormatContext, void, avformat_free_context>>;
 using UniqueAVCodecContext = std::unique_ptr<
     AVCodecContext,
     Deleterp<AVCodecContext, void, avcodec_free_context>>;
@@ -144,6 +147,14 @@ int64_t getDuration(const UniqueAVFrame& frame);
 int getNumChannels(const UniqueAVFrame& avFrame);
 int getNumChannels(const UniqueAVCodecContext& avCodecContext);
 
+// Sets avCodecContext's channel layout to FFmpeg's default layout for
+// numChannels channels. On FFmpeg 4 this also sets the channel count.
+void setDefaultChannelLayout(
+    UniqueAVCodecContext& avCodecContext,
+    int numChannels);
+
+// Copies the channel layout of avCodecContext into dstAVFrame. On FFmpeg > 4,
+// raises (via TORCH_CHECK) if the copy fails.
+void setChannelLayout(
+    UniqueAVFrame& dstAVFrame,
+    const UniqueAVCodecContext& avCodecContext);
+
 void setChannelLayout(
     UniqueAVFrame& dstAVFrame,
     const UniqueAVFrame& srcAVFrame);
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -1443,7 +1443,7 @@ void SingleStreamDecoder::convertAudioAVFrameToFrameOutputOnCPU(
     auto numBytesPerChannel = numSamples * av_get_bytes_per_sample(format);
     for (auto channel = 0; channel < numChannels;
          ++channel, outputChannelData += numBytesPerChannel) {
-      memcpy(
+      std::memcpy(
           outputChannelData,
           avFrame->extended_data[channel],
           numBytesPerChannel);
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h
@@ -492,7 +492,7 @@ class SingleStreamDecoder {
 
   SeekMode seekMode_;
   ContainerMetadata containerMetadata_;
-  UniqueAVFormatContext formatContext_;
+  UniqueDecodingAVFormatContext formatContext_;
   std::map<int, StreamInfo> streamInfos_;
   const int NO_ACTIVE_STREAM = -2;
   int activeStreamIndex_ = NO_ACTIVE_STREAM;
diff --git a/src/torchcodec/_core/__init__.py b/src/torchcodec/_core/__init__.py
@@ -18,10 +18,12 @@
     _test_frame_pts_equality,
     add_audio_stream,
     add_video_stream,
+    create_audio_encoder,
     create_from_bytes,
     create_from_file,
     create_from_file_like,
     create_from_tensor,
+    encode_audio,
     get_ffmpeg_library_versions,
     get_frame_at_index,
     get_frame_at_pts,
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
diff --git a/test/test_ops.py b/test/test_ops.py
diff --git a/test/utils.py b/test/utils.py

Original file line number	Diff line number	Diff line change
`@@ -61,6 +61,9 @@ function(make_torchcodec_libraries`
`61`	`61`	`AVIOContextHolder.cpp`
`62`	`62`	`FFMPEGCommon.cpp`
`63`	`63`	`SingleStreamDecoder.cpp`
	`64`	`+ # TODO: lib name should probably not be "*_decoder*" now that it also`
	`65`	`+ # contains an encoder`
	`66`	`+ Encoder.cpp`
`64`	`67`	`)`
`65`	`68`
`66`	`69`	`if(ENABLE_CUDA)`