Skip to content

Audio encoding - part 1 of N #524

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Apr 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
d5fe996
Super WIP encoder
NicolasHug Feb 26, 2025
779f19e
Write output file through AVFormatContext
NicolasHug Feb 27, 2025
b110dac
Cleanup
NicolasHug Feb 28, 2025
0906fb3
Properly free AVFormatContext and streams
NicolasHug Feb 28, 2025
a1532c9
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_yolo
NicolasHug Apr 1, 2025
3890227
don't return encoded bytes for now
NicolasHug Apr 1, 2025
52d1753
Write TODOs, avoid raw pointers
NicolasHug Apr 1, 2025
45fd0ec
Add (failing) round-trip test
NicolasHug Apr 1, 2025
9beec1a
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_yolo
NicolasHug Apr 2, 2025
2c05e88
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_yolo
NicolasHug Apr 2, 2025
75b099b
Create new file
NicolasHug Apr 2, 2025
01dc1b1
NULL -> nullptr
NicolasHug Apr 2, 2025
691dde7
Use 'status' instead of ffmpegRet
NicolasHug Apr 2, 2025
eb2a86c
Stuff
NicolasHug Apr 2, 2025
3cec761
Add tests
NicolasHug Apr 2, 2025
8c5479c
Flags
NicolasHug Apr 2, 2025
f609052
hopefully fix ffmpeg4
NicolasHug Apr 2, 2025
0c900be
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_yolo
NicolasHug Apr 2, 2025
debb32c
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_yolo
NicolasHug Apr 2, 2025
42f5160
Fix MacOS build??
NicolasHug Apr 2, 2025
52c4d54
more tests
NicolasHug Apr 2, 2025
bbf2af2
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_yolo
NicolasHug Apr 3, 2025
061c60f
Address some comments
NicolasHug Apr 3, 2025
9b90894
cast
NicolasHug Apr 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/torchcodec/_core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ function(make_torchcodec_libraries
AVIOContextHolder.cpp
FFMPEGCommon.cpp
SingleStreamDecoder.cpp
# TODO: lib name should probably not be "*_decoder*" now that it also
# contains an encoder
Encoder.cpp
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe it should be libtorchcodec_coreN.so?

)

if(ENABLE_CUDA)
Expand Down
219 changes: 219 additions & 0 deletions src/torchcodec/_core/Encoder.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
#include "src/torchcodec/_core/Encoder.h"

#include <algorithm>
#include <cstring>
#include <string>

#include "torch/types.h"

namespace facebook::torchcodec {

// Closes the output file that the constructor opened with avio_open().
//
// avFormatContext_'s deleter is avformat_free_context(), which frees the
// context and its streams but does not close the AVIOContext stored in `pb`.
// Without this explicit close, the underlying file handle would leak every
// time an AudioEncoder is destroyed.
AudioEncoder::~AudioEncoder() {
  if (avFormatContext_ != nullptr && avFormatContext_->pb != nullptr) {
    // The return value is deliberately ignored: a destructor must not throw.
    // avio_closep() also resets `pb` to nullptr.
    avio_closep(&avFormatContext_->pb);
  }
}

// TODO-ENCODING: disable ffmpeg logs by default

// Sets up everything needed to encode `wf` into the file `fileName`:
// the output AVFormatContext (container is deduced from the file name), the
// output file itself, the encoder for the container's default audio codec,
// and the single audio stream of the output.
//
// Parameters:
// - wf: the waveform to encode. Must be a float32 tensor with 2 dimensions.
//   Dimension 0 is used as the number of channels and dimension 1 as the
//   number of samples per channel.
// - sampleRate: sample rate set on the encoder.
// - fileName: path of the output file.
//
// Raises (through TORCH_CHECK) if the waveform is invalid or if any of the
// FFmpeg setup calls fails.
AudioEncoder::AudioEncoder(
    const torch::Tensor wf,
    int sampleRate,
    std::string_view fileName)
    // encode() reads the samples through a raw data pointer, channel after
    // channel, which is only valid for a contiguous tensor. contiguous()
    // returns the tensor itself when it already is contiguous.
    : wf_(wf.contiguous()), sampleRate_(sampleRate) {
  TORCH_CHECK(
      wf_.dtype() == torch::kFloat32,
      "waveform must have float32 dtype, got ",
      wf_.dtype());
  TORCH_CHECK(
      wf_.dim() == 2, "waveform must have 2 dimensions, got ", wf_.dim());

  // A std::string_view isn't guaranteed to be NUL-terminated, but the FFmpeg
  // APIs below expect a C string. Make an owned, terminated copy.
  const std::string fileNameStr(fileName);

  AVFormatContext* avFormatContext = nullptr;
  auto status = avformat_alloc_output_context2(
      &avFormatContext, nullptr, nullptr, fileNameStr.c_str());
  TORCH_CHECK(
      avFormatContext != nullptr,
      "Couldn't allocate AVFormatContext. ",
      "Check the desired extension? ",
      getFFMPEGErrorStringFromErrorCode(status));
  avFormatContext_.reset(avFormatContext);

  // TODO-ENCODING: Should also support encoding into bytes (use
  // AVIOBytesContext)
  TORCH_CHECK(
      !(avFormatContext_->oformat->flags & AVFMT_NOFILE),
      "AVFMT_NOFILE is set. We only support writing to a file.");
  status =
      avio_open(&avFormatContext_->pb, fileNameStr.c_str(), AVIO_FLAG_WRITE);
  TORCH_CHECK(
      status >= 0,
      "avio_open failed: ",
      getFFMPEGErrorStringFromErrorCode(status));

  // We use the AVFormatContext's default codec for that
  // specific format/container.
  const AVCodec* avCodec =
      avcodec_find_encoder(avFormatContext_->oformat->audio_codec);
  TORCH_CHECK(avCodec != nullptr, "Codec not found");

  AVCodecContext* avCodecContext = avcodec_alloc_context3(avCodec);
  TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
  avCodecContext_.reset(avCodecContext);

  // TODO-ENCODING I think this sets the bit rate to the minimum supported.
  // That's not what the ffmpeg CLI would choose by default, so we should try to
  // do the same.
  // TODO-ENCODING Should also let user choose for compressed formats like mp3.
  avCodecContext_->bit_rate = 0;

  avCodecContext_->sample_rate = sampleRate_;

  // Note: This is the format of the **input** waveform. This doesn't determine
  // the output.
  // TODO-ENCODING If the encoder doesn't support FLTP (like flac), FFmpeg will
  // raise. We need to handle this, probably converting the format with
  // libswresample.
  avCodecContext_->sample_fmt = AV_SAMPLE_FMT_FLTP;

  const int numChannels = static_cast<int>(wf_.sizes()[0]);
  TORCH_CHECK(
      // TODO-ENCODING is this even true / needed? We can probably support more
      // with non-planar data?
      numChannels <= AV_NUM_DATA_POINTERS,
      "Trying to encode ",
      numChannels,
      " channels, but FFmpeg only supports ",
      AV_NUM_DATA_POINTERS,
      " channels per frame.");

  setDefaultChannelLayout(avCodecContext_, numChannels);

  status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
  TORCH_CHECK(
      status == AVSUCCESS,
      "avcodec_open2 failed: ",
      getFFMPEGErrorStringFromErrorCode(status));

  // encode() sizes its AVFrame from frame_size, so it must be positive.
  TORCH_CHECK(
      avCodecContext_->frame_size > 0,
      "frame_size is ",
      avCodecContext_->frame_size,
      ". Cannot encode. This should probably never happen?");

  // We're allocating the stream here. Streams are meant to be freed by
  // avformat_free_context(avFormatContext), which we call in the
  // avFormatContext_'s destructor.
  AVStream* avStream = avformat_new_stream(avFormatContext_.get(), nullptr);
  TORCH_CHECK(avStream != nullptr, "Couldn't create new stream.");
  status = avcodec_parameters_from_context(
      avStream->codecpar, avCodecContext_.get());
  TORCH_CHECK(
      status == AVSUCCESS,
      "avcodec_parameters_from_context failed: ",
      getFFMPEGErrorStringFromErrorCode(status));
  streamIndex_ = avStream->index;
}

// Encodes the whole waveform and writes it to the output file.
//
// The container header is written first. The waveform is then cut into chunks
// of at most AVCodecContext.frame_size samples per channel; each chunk is
// copied into a reusable AVFrame and handed to encodeInnerLoop(). Finally the
// encoder is flushed and the container trailer is written.
//
// Raises (through TORCH_CHECK) if any FFmpeg call fails.
void AudioEncoder::encode() {
  UniqueAVFrame avFrame(av_frame_alloc());
  TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame.");
  avFrame->nb_samples = avCodecContext_->frame_size;
  avFrame->format = avCodecContext_->sample_fmt;
  avFrame->sample_rate = avCodecContext_->sample_rate;
  avFrame->pts = 0;
  setChannelLayout(avFrame, avCodecContext_);

  auto status = av_frame_get_buffer(avFrame.get(), 0);
  TORCH_CHECK(
      status == AVSUCCESS,
      "Couldn't allocate avFrame's buffers: ",
      getFFMPEGErrorStringFromErrorCode(status));

  AutoAVPacket autoAVPacket;

  const auto* pwf = static_cast<const uint8_t*>(wf_.data_ptr());
  const int numChannels = static_cast<int>(wf_.sizes()[0]);
  const int numSamples = static_cast<int>(wf_.sizes()[1]); // per channel
  int numEncodedSamples = 0; // per channel
  const int numSamplesPerFrame = avCodecContext_->frame_size; // per channel
  // Byte counts are kept in 64 bits: numSamples * numBytesPerSample and
  // ch * numBytesPerChannel can exceed the range of int on long waveforms.
  const int64_t numBytesPerSample = static_cast<int64_t>(wf_.element_size());
  const int64_t numBytesPerChannel = numSamples * numBytesPerSample;

  status = avformat_write_header(avFormatContext_.get(), nullptr);
  TORCH_CHECK(
      status == AVSUCCESS,
      "Error in avformat_write_header: ",
      getFFMPEGErrorStringFromErrorCode(status));

  while (numEncodedSamples < numSamples) {
    status = av_frame_make_writable(avFrame.get());
    TORCH_CHECK(
        status == AVSUCCESS,
        "Couldn't make AVFrame writable: ",
        getFFMPEGErrorStringFromErrorCode(status));

    const int numSamplesToEncode =
        std::min(numSamplesPerFrame, numSamples - numEncodedSamples);
    const int64_t numBytesToEncode = numSamplesToEncode * numBytesPerSample;

    // Channel `ch` starts ch * numBytesPerChannel bytes after channel 0; pwf
    // always points at the next sample to encode within channel 0.
    for (int ch = 0; ch < numChannels; ch++) {
      std::memcpy(
          avFrame->data[ch],
          pwf + ch * numBytesPerChannel,
          static_cast<size_t>(numBytesToEncode));
    }
    pwf += numBytesToEncode;

    // Above, we set the AVFrame's .nb_samples to AVCodecContext.frame_size so
    // that the frame buffers are allocated to a big enough size. Here, we reset
    // it to the exact number of samples that need to be encoded, otherwise the
    // encoded frame would contain more samples than necessary and our results
    // wouldn't match the ffmpeg CLI.
    avFrame->nb_samples = numSamplesToEncode;
    encodeInnerLoop(autoAVPacket, avFrame);

    avFrame->pts += static_cast<int64_t>(numSamplesToEncode);
    numEncodedSamples += numSamplesToEncode;
  }
  TORCH_CHECK(numEncodedSamples == numSamples, "Hmmmmmm something went wrong.");

  flushBuffers();

  status = av_write_trailer(avFormatContext_.get());
  TORCH_CHECK(
      status == AVSUCCESS,
      "Error in av_write_trailer: ",
      getFFMPEGErrorStringFromErrorCode(status));
}

// Sends `avFrame` to the encoder, then writes every packet the encoder makes
// available to the output file, tagged with our stream index.
//
// Returns once the encoder reports EAGAIN or EOF on receive. Raises (through
// TORCH_CHECK) on any other FFmpeg error.
void AudioEncoder::encodeInnerLoop(
    AutoAVPacket& autoAVPacket,
    const UniqueAVFrame& avFrame) {
  const int sendStatus =
      avcodec_send_frame(avCodecContext_.get(), avFrame.get());
  TORCH_CHECK(
      sendStatus == AVSUCCESS,
      "Error while sending frame: ",
      getFFMPEGErrorStringFromErrorCode(sendStatus));

  // Every exit from this loop is explicit: either the early return below, or
  // one of the checks raising.
  for (;;) {
    ReferenceAVPacket packet(autoAVPacket);

    const int receiveStatus =
        avcodec_receive_packet(avCodecContext_.get(), packet.get());
    const bool noPacketAvailable =
        receiveStatus == AVERROR(EAGAIN) || receiveStatus == AVERROR_EOF;
    if (noPacketAvailable) {
      // TODO-ENCODING TorchAudio additionally does the following on EOF. It's
      // probably needed, but not sure:
      //   status = av_interleaved_write_frame(avFormatContext_.get(), nullptr);
      //   TORCH_CHECK(
      //       status == AVSUCCESS,
      //       "Failed to flush packet ",
      //       getFFMPEGErrorStringFromErrorCode(status));
      return;
    }
    TORCH_CHECK(
        receiveStatus >= 0,
        "Error receiving packet: ",
        getFFMPEGErrorStringFromErrorCode(receiveStatus));

    packet->stream_index = streamIndex_;

    const int writeStatus =
        av_interleaved_write_frame(avFormatContext_.get(), packet.get());
    TORCH_CHECK(
        writeStatus == AVSUCCESS,
        "Error in av_interleaved_write_frame: ",
        getFFMPEGErrorStringFromErrorCode(writeStatus));
  }
}

void AudioEncoder::flushBuffers() {
AutoAVPacket autoAVPacket;
encodeInnerLoop(autoAVPacket, UniqueAVFrame(nullptr));
}
} // namespace facebook::torchcodec
36 changes: 36 additions & 0 deletions src/torchcodec/_core/Encoder.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#pragma once
#include <torch/types.h>
#include "src/torchcodec/_core/FFMPEGCommon.h"

namespace facebook::torchcodec {
// Encodes a waveform tensor into an audio file, using the default audio codec
// of the container deduced from the output file name.
//
// Usage: construct, then call encode().
class AudioEncoder {
 public:
  ~AudioEncoder();

  // wf: float32 tensor with 2 dimensions; dimension 0 is used as the number of
  //   channels and dimension 1 as the number of samples per channel.
  // sampleRate: see the comment on sampleRate_ below.
  // fileName: path of the output file.
  AudioEncoder(
      const torch::Tensor wf,
      int sampleRate,
      std::string_view fileName);

  // The encoder exclusively owns FFmpeg contexts, so it cannot be copied.
  // (This was already the case implicitly, because of the unique_ptr members.)
  AudioEncoder(const AudioEncoder&) = delete;
  AudioEncoder& operator=(const AudioEncoder&) = delete;

  // Encodes the whole waveform and writes it to the output file.
  void encode();

 private:
  // Sends one frame to the encoder and writes the resulting packets.
  void encodeInnerLoop(
      AutoAVPacket& autoAVPacket,
      const UniqueAVFrame& avFrame);
  // Calls encodeInnerLoop() with a null frame, after the last real frame.
  void flushBuffers();

  // Note: the declaration order of these members determines their destruction
  // order, do not reorder them lightly.
  UniqueEncodingAVFormatContext avFormatContext_;
  UniqueAVCodecContext avCodecContext_;
  // Index of the output audio stream, set by the constructor. -1 until then,
  // so that it never holds an indeterminate value.
  int streamIndex_ = -1;

  const torch::Tensor wf_;
  // The *output* sample rate. We can't really decide for the user what it
  // should be. Particularly, the sample rate of the input waveform should match
  // this, and that's up to the user. If sample rates don't match, encoding will
  // still work but audio will be distorted.
  // We technically could let the user also specify the input sample rate, and
  // resample the waveform internally to match them, but that's not in scope for
  // an initial version (if at all).
  int sampleRate_;
};
} // namespace facebook::torchcodec
32 changes: 32 additions & 0 deletions src/torchcodec/_core/FFMPEGCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,38 @@ int getNumChannels(const UniqueAVCodecContext& avCodecContext) {
#endif
}

// Sets the channel layout of `avCodecContext` to FFmpeg's default layout for
// `numChannels` channels. Which fields are written depends on the FFmpeg
// version: `ch_layout` on FFmpeg > 4, `channel_layout` and `channels`
// otherwise.
void setDefaultChannelLayout(
    UniqueAVCodecContext& avCodecContext,
    int numChannels) {
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
  // Value-initialize the struct before handing it to FFmpeg, so that no field
  // is copied into the codec context with an indeterminate value should
  // av_channel_layout_default() not fill all of them (e.g. when there is no
  // default layout for numChannels).
  AVChannelLayout channel_layout{};
  av_channel_layout_default(&channel_layout, numChannels);
  avCodecContext->ch_layout = channel_layout;

#else
  uint64_t channel_layout = av_get_default_channel_layout(numChannels);
  avCodecContext->channel_layout = channel_layout;
  avCodecContext->channels = numChannels;
#endif
}

// Copies the channel layout of `avCodecContext` into `dstAVFrame`, using the
// layout fields that exist for the FFmpeg version we're compiled against.
// On FFmpeg > 4, raises (through TORCH_CHECK) if the copy fails.
void setChannelLayout(
    UniqueAVFrame& dstAVFrame,
    const UniqueAVCodecContext& avCodecContext) {
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
  const AVChannelLayout* srcLayout = &avCodecContext->ch_layout;
  const int copyStatus =
      av_channel_layout_copy(&dstAVFrame->ch_layout, srcLayout);
  TORCH_CHECK(
      copyStatus == AVSUCCESS,
      "Couldn't copy channel layout to avFrame: ",
      getFFMPEGErrorStringFromErrorCode(copyStatus));
#else
  // Older FFmpeg: the layout is a plain bitmask plus a channel count.
  dstAVFrame->channels = avCodecContext->channels;
  dstAVFrame->channel_layout = avCodecContext->channel_layout;
#endif
}

void setChannelLayout(
UniqueAVFrame& dstAVFrame,
const UniqueAVFrame& srcAVFrame) {
Expand Down
13 changes: 12 additions & 1 deletion src/torchcodec/_core/FFMPEGCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,12 @@ struct Deleter {
};

// Unique pointers for FFMPEG structures.
using UniqueAVFormatContext = std::unique_ptr<
using UniqueDecodingAVFormatContext = std::unique_ptr<
AVFormatContext,
Deleterp<AVFormatContext, void, avformat_close_input>>;
using UniqueEncodingAVFormatContext = std::unique_ptr<
AVFormatContext,
Deleter<AVFormatContext, void, avformat_free_context>>;
using UniqueAVCodecContext = std::unique_ptr<
AVCodecContext,
Deleterp<AVCodecContext, void, avcodec_free_context>>;
Expand Down Expand Up @@ -144,6 +147,14 @@ int64_t getDuration(const UniqueAVFrame& frame);
int getNumChannels(const UniqueAVFrame& avFrame);
int getNumChannels(const UniqueAVCodecContext& avCodecContext);

// Sets the channel layout of avCodecContext to FFmpeg's default layout for
// numChannels channels.
void setDefaultChannelLayout(
UniqueAVCodecContext& avCodecContext,
int numChannels);

// Copies the channel layout of avCodecContext into dstAVFrame.
void setChannelLayout(
UniqueAVFrame& dstAVFrame,
const UniqueAVCodecContext& avCodecContext);

void setChannelLayout(
UniqueAVFrame& dstAVFrame,
const UniqueAVFrame& srcAVFrame);
Expand Down
2 changes: 1 addition & 1 deletion src/torchcodec/_core/SingleStreamDecoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1443,7 +1443,7 @@ void SingleStreamDecoder::convertAudioAVFrameToFrameOutputOnCPU(
auto numBytesPerChannel = numSamples * av_get_bytes_per_sample(format);
for (auto channel = 0; channel < numChannels;
++channel, outputChannelData += numBytesPerChannel) {
memcpy(
std::memcpy(
outputChannelData,
avFrame->extended_data[channel],
numBytesPerChannel);
Expand Down
2 changes: 1 addition & 1 deletion src/torchcodec/_core/SingleStreamDecoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,7 @@ class SingleStreamDecoder {

SeekMode seekMode_;
ContainerMetadata containerMetadata_;
UniqueAVFormatContext formatContext_;
UniqueDecodingAVFormatContext formatContext_;
std::map<int, StreamInfo> streamInfos_;
const int NO_ACTIVE_STREAM = -2;
int activeStreamIndex_ = NO_ACTIVE_STREAM;
Expand Down
2 changes: 2 additions & 0 deletions src/torchcodec/_core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@
_test_frame_pts_equality,
add_audio_stream,
add_video_stream,
create_audio_encoder,
create_from_bytes,
create_from_file,
create_from_file_like,
create_from_tensor,
encode_audio,
get_ffmpeg_library_versions,
get_frame_at_index,
get_frame_at_pts,
Expand Down
Loading
Loading