-
Notifications
You must be signed in to change notification settings - Fork 39
Audio encoding - part 1 of N #524
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
24 commits
Select commit
Hold shift + click to select a range
d5fe996
Super WIP encoder
NicolasHug 779f19e
Write output file through AVFormatContext
NicolasHug b110dac
Cleanup
NicolasHug 0906fb3
Properly free AVFormatContext and streams
NicolasHug a1532c9
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_yolo
NicolasHug 3890227
don't return encoded bytes for now
NicolasHug 52d1753
Write TODOs, avoid raw pointers
NicolasHug 45fd0ec
Add (failing) round-trip test
NicolasHug 9beec1a
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_yolo
NicolasHug 2c05e88
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_yolo
NicolasHug 75b099b
Create new file
NicolasHug 01dc1b1
NULL -> nullptr
NicolasHug 691dde7
Use 'status' instead of ffmpegRet
NicolasHug eb2a86c
Stuff
NicolasHug 3cec761
Add tests
NicolasHug 8c5479c
Flags
NicolasHug f609052
hopefully fix ffmpeg4
NicolasHug 0c900be
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_yolo
NicolasHug debb32c
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_yolo
NicolasHug 42f5160
Fix MacOS build??
NicolasHug 52c4d54
more tests
NicolasHug bbf2af2
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_yolo
NicolasHug 061c60f
Address some comments
NicolasHug 9b90894
cast
NicolasHug File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,219 @@ | ||
#include "src/torchcodec/_core/Encoder.h"

#include <algorithm>
#include <cstring>
#include <string>

#include "torch/types.h"
|
||
namespace facebook::torchcodec { | ||
|
||
// Closes the output file opened by avio_open() in the constructor.
//
// avFormatContext_'s deleter frees the context and its streams through
// avformat_free_context(), but for muxing the AVIOContext stored in `pb` is
// owned by the caller and has to be closed explicitly. avio_closep() flushes
// pending bytes, closes the file and resets `pb` to nullptr.
// This body runs before the members are destroyed, so avFormatContext_ is
// still valid here.
AudioEncoder::~AudioEncoder() {
  if (avFormatContext_ && avFormatContext_->pb != nullptr) {
    avio_closep(&avFormatContext_->pb);
  }
}
|
||
// TODO-ENCODING: disable ffmpeg logs by default | ||
|
||
// Prepares everything needed to encode the waveform `wf` into `fileName`.
//
// Parameters:
//   wf:         2D float32 tensor. Dimension 0 is used as the number of
//               channels, dimension 1 as the number of samples per channel.
//   sampleRate: sample rate set on the encoder (i.e. of the output).
//   fileName:   path of the output file. The container format is guessed by
//               FFmpeg from this name, and the container's default audio
//               codec is used.
//
// Raises (through TORCH_CHECK) if the waveform has the wrong dtype or number
// of dimensions, or if any of the FFmpeg setup steps fails.
//
// Nothing is encoded or written here apart from opening the output file; the
// actual work is done by encode().
AudioEncoder::AudioEncoder(
    const torch::Tensor wf,
    int sampleRate,
    std::string_view fileName)
    : wf_(wf), sampleRate_(sampleRate) {
  TORCH_CHECK(
      wf_.dtype() == torch::kFloat32,
      "waveform must have float32 dtype, got ",
      wf_.dtype());
  TORCH_CHECK(
      wf_.dim() == 2, "waveform must have 2 dimensions, got ", wf_.dim());

  // A std::string_view is not guaranteed to be NUL-terminated, but the FFmpeg
  // functions below expect a C string. Make an owned, terminated copy.
  const std::string fileNameStr(fileName);

  AVFormatContext* avFormatContext = nullptr;
  auto status = avformat_alloc_output_context2(
      &avFormatContext, nullptr, nullptr, fileNameStr.c_str());
  TORCH_CHECK(
      avFormatContext != nullptr,
      "Couldn't allocate AVFormatContext. ",
      "Check the desired extension? ",
      getFFMPEGErrorStringFromErrorCode(status));
  avFormatContext_.reset(avFormatContext);

  // TODO-ENCODING: Should also support encoding into bytes (use
  // AVIOBytesContext)
  TORCH_CHECK(
      !(avFormatContext_->oformat->flags & AVFMT_NOFILE),
      "AVFMT_NOFILE is set. We only support writing to a file.");
  status =
      avio_open(&avFormatContext_->pb, fileNameStr.c_str(), AVIO_FLAG_WRITE);
  TORCH_CHECK(
      status >= 0,
      "avio_open failed: ",
      getFFMPEGErrorStringFromErrorCode(status));

  // We use the AVFormatContext's default codec for that
  // specific format/container.
  const AVCodec* avCodec =
      avcodec_find_encoder(avFormatContext_->oformat->audio_codec);
  TORCH_CHECK(avCodec != nullptr, "Codec not found");

  AVCodecContext* avCodecContext = avcodec_alloc_context3(avCodec);
  TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
  avCodecContext_.reset(avCodecContext);

  // TODO-ENCODING I think this sets the bit rate to the minimum supported.
  // That's not what the ffmpeg CLI would choose by default, so we should try to
  // do the same.
  // TODO-ENCODING Should also let user choose for compressed formats like mp3.
  avCodecContext_->bit_rate = 0;

  avCodecContext_->sample_rate = sampleRate_;

  // Note: This is the format of the **input** waveform. This doesn't determine
  // the output.
  // TODO-ENCODING If the encoder doesn't support FLTP (like flac), FFmpeg will
  // raise. We need to handle this, probably converting the format with
  // libswresample.
  avCodecContext_->sample_fmt = AV_SAMPLE_FMT_FLTP;

  const int numChannels = static_cast<int>(wf_.sizes()[0]);
  TORCH_CHECK(
      // TODO-ENCODING is this even true / needed? We can probably support more
      // with non-planar data?
      numChannels <= AV_NUM_DATA_POINTERS,
      "Trying to encode ",
      numChannels,
      " channels, but FFmpeg only supports ",
      AV_NUM_DATA_POINTERS,
      " channels per frame.");

  setDefaultChannelLayout(avCodecContext_, numChannels);

  status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
  TORCH_CHECK(
      status == AVSUCCESS,
      "avcodec_open2 failed: ",
      getFFMPEGErrorStringFromErrorCode(status));

  // encode() sizes its frames from frame_size, so it must be usable.
  TORCH_CHECK(
      avCodecContext_->frame_size > 0,
      "frame_size is ",
      avCodecContext_->frame_size,
      ". Cannot encode. This should probably never happen?");

  // We're allocating the stream here. Streams are meant to be freed by
  // avformat_free_context(avFormatContext), which we call in the
  // avFormatContext_'s destructor.
  AVStream* avStream = avformat_new_stream(avFormatContext_.get(), nullptr);
  TORCH_CHECK(avStream != nullptr, "Couldn't create new stream.");
  status = avcodec_parameters_from_context(
      avStream->codecpar, avCodecContext_.get());
  TORCH_CHECK(
      status == AVSUCCESS,
      "avcodec_parameters_from_context failed: ",
      getFFMPEGErrorStringFromErrorCode(status));
  streamIndex_ = avStream->index;
}
|
||
void AudioEncoder::encode() { | ||
UniqueAVFrame avFrame(av_frame_alloc()); | ||
TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame."); | ||
avFrame->nb_samples = avCodecContext_->frame_size; | ||
avFrame->format = avCodecContext_->sample_fmt; | ||
avFrame->sample_rate = avCodecContext_->sample_rate; | ||
avFrame->pts = 0; | ||
setChannelLayout(avFrame, avCodecContext_); | ||
|
||
auto status = av_frame_get_buffer(avFrame.get(), 0); | ||
TORCH_CHECK( | ||
status == AVSUCCESS, | ||
"Couldn't allocate avFrame's buffers: ", | ||
getFFMPEGErrorStringFromErrorCode(status)); | ||
|
||
AutoAVPacket autoAVPacket; | ||
|
||
uint8_t* pwf = static_cast<uint8_t*>(wf_.data_ptr()); | ||
int numSamples = static_cast<int>(wf_.sizes()[1]); // per channel | ||
int numEncodedSamples = 0; // per channel | ||
int numSamplesPerFrame = avCodecContext_->frame_size; // per channel | ||
int numBytesPerSample = static_cast<int>(wf_.element_size()); | ||
int numBytesPerChannel = numSamples * numBytesPerSample; | ||
|
||
status = avformat_write_header(avFormatContext_.get(), nullptr); | ||
TORCH_CHECK( | ||
status == AVSUCCESS, | ||
"Error in avformat_write_header: ", | ||
getFFMPEGErrorStringFromErrorCode(status)); | ||
|
||
while (numEncodedSamples < numSamples) { | ||
status = av_frame_make_writable(avFrame.get()); | ||
TORCH_CHECK( | ||
status == AVSUCCESS, | ||
"Couldn't make AVFrame writable: ", | ||
getFFMPEGErrorStringFromErrorCode(status)); | ||
|
||
int numSamplesToEncode = | ||
std::min(numSamplesPerFrame, numSamples - numEncodedSamples); | ||
int numBytesToEncode = numSamplesToEncode * numBytesPerSample; | ||
|
||
for (int ch = 0; ch < wf_.sizes()[0]; ch++) { | ||
std::memcpy( | ||
avFrame->data[ch], pwf + ch * numBytesPerChannel, numBytesToEncode); | ||
NicolasHug marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
pwf += numBytesToEncode; | ||
|
||
// Above, we set the AVFrame's .nb_samples to AVCodecContext.frame_size so | ||
// that the frame buffers are allocated to a big enough size. Here, we reset | ||
// it to the exact number of samples that need to be encoded, otherwise the | ||
// encoded frame would contain more samples than necessary and our results | ||
// wouldn't match the ffmpeg CLI. | ||
avFrame->nb_samples = numSamplesToEncode; | ||
encodeInnerLoop(autoAVPacket, avFrame); | ||
|
||
avFrame->pts += static_cast<int64_t>(numSamplesToEncode); | ||
numEncodedSamples += numSamplesToEncode; | ||
} | ||
TORCH_CHECK(numEncodedSamples == numSamples, "Hmmmmmm something went wrong."); | ||
|
||
flushBuffers(); | ||
|
||
status = av_write_trailer(avFormatContext_.get()); | ||
TORCH_CHECK( | ||
status == AVSUCCESS, | ||
"Error in: av_write_trailer", | ||
getFFMPEGErrorStringFromErrorCode(status)); | ||
} | ||
|
||
// Sends `avFrame` to the encoder, then writes every packet the encoder makes
// available to the output file.
//
// Returns once the encoder reports that it needs more input (EAGAIN) or that
// it has nothing left to output (EOF). Raises (through TORCH_CHECK) on any
// other FFmpeg error.
void AudioEncoder::encodeInnerLoop(
    AutoAVPacket& autoAVPacket,
    const UniqueAVFrame& avFrame) {
  auto status = avcodec_send_frame(avCodecContext_.get(), avFrame.get());
  TORCH_CHECK(
      status == AVSUCCESS,
      "Error while sending frame: ",
      getFFMPEGErrorStringFromErrorCode(status));

  while (status >= 0) {
    ReferenceAVPacket encodedPacket(autoAVPacket);
    status =
        avcodec_receive_packet(avCodecContext_.get(), encodedPacket.get());

    const bool encoderWantsMoreInput = (status == AVERROR(EAGAIN));
    const bool encoderIsDrained = (status == AVERROR_EOF);
    if (encoderWantsMoreInput || encoderIsDrained) {
      // TODO-ENCODING TorchAudio additionally calls
      // av_interleaved_write_frame(avFormatContext_.get(), nullptr) when the
      // status is AVERROR_EOF, and checks its result ("Failed to flush
      // packet "). This is probably needed here too, but not sure.
      return;
    }

    TORCH_CHECK(
        status >= 0,
        "Error receiving packet: ",
        getFFMPEGErrorStringFromErrorCode(status));

    // Tag the packet with the stream created in the constructor.
    encodedPacket->stream_index = streamIndex_;

    status = av_interleaved_write_frame(
        avFormatContext_.get(), encodedPacket.get());
    TORCH_CHECK(
        status == AVSUCCESS,
        "Error in av_interleaved_write_frame: ",
        getFFMPEGErrorStringFromErrorCode(status));
  }
}
|
||
void AudioEncoder::flushBuffers() { | ||
AutoAVPacket autoAVPacket; | ||
encodeInnerLoop(autoAVPacket, UniqueAVFrame(nullptr)); | ||
} | ||
} // namespace facebook::torchcodec |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
#pragma once | ||
#include <torch/types.h> | ||
#include "src/torchcodec/_core/FFMPEGCommon.h" | ||
|
||
namespace facebook::torchcodec { | ||
class AudioEncoder { | ||
public: | ||
NicolasHug marked this conversation as resolved.
Show resolved
Hide resolved
|
||
~AudioEncoder(); | ||
|
||
AudioEncoder( | ||
const torch::Tensor wf, | ||
int sampleRate, | ||
std::string_view fileName); | ||
void encode(); | ||
|
||
private: | ||
void encodeInnerLoop( | ||
AutoAVPacket& autoAVPacket, | ||
const UniqueAVFrame& avFrame); | ||
void flushBuffers(); | ||
|
||
UniqueEncodingAVFormatContext avFormatContext_; | ||
UniqueAVCodecContext avCodecContext_; | ||
int streamIndex_; | ||
|
||
const torch::Tensor wf_; | ||
// The *output* sample rate. We can't really decide for the user what it | ||
// should be. Particularly, the sample rate of the input waveform should match | ||
// this, and that's up to the user. If sample rates don't match, encoding will | ||
// still work but audio will be distorted. | ||
// We technically could let the user also specify the input sample rate, and | ||
// resample the waveform internally to match them, but that's not in scope for | ||
// an initial version (if at all). | ||
int sampleRate_; | ||
}; | ||
} // namespace facebook::torchcodec |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe it should be `libtorchcodec_coreN.so`?