Skip to content

Support encoding into a bytes tensor #635

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 30 commits into from
Apr 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
7921558
Disable FFmpeg logs for encoder
NicolasHug Apr 7, 2025
2a19014
Merge branch 'main' of github.com:pytorch/torchcodec into loglevelenc…
NicolasHug Apr 8, 2025
73bdc85
Use c++ strings
NicolasHug Apr 8, 2025
54f5543
Merge branch 'main' of github.com:pytorch/torchcodec into loglevelenc…
NicolasHug Apr 8, 2025
24842b6
Account for frame_size being 0
NicolasHug Apr 8, 2025
c3ac80a
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_wav
NicolasHug Apr 9, 2025
5b39c8f
WIP
NicolasHug Apr 9, 2025
1f9f904
Move createSwrContext in ffmpeg file
NicolasHug Apr 9, 2025
f525848
WIP
NicolasHug Apr 9, 2025
9150137
Move convertAudioAVFrameSampleFormatAndSampleRate in ffmpeg file
NicolasHug Apr 9, 2025
872b569
Automatically find output sample format
NicolasHug Apr 9, 2025
a0dcafd
Convert sample format, update tests
NicolasHug Apr 9, 2025
f49d507
Skip wav on FFmpeg4
NicolasHug Apr 9, 2025
ee3a199
Add assertion
NicolasHug Apr 9, 2025
485ee2e
Move comment
NicolasHug Apr 9, 2025
27fdbac
Better default heuristic
NicolasHug Apr 9, 2025
8467b92
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_wav
NicolasHug Apr 10, 2025
ee7a217
Support encoding into tensor
NicolasHug Apr 10, 2025
7b3847f
Merge branch 'main' of github.com:pytorch/torchcodec into avio
NicolasHug Apr 14, 2025
d85baa2
nits
NicolasHug Apr 14, 2025
42c6373
Allow output tensor re-allocation
NicolasHug Apr 14, 2025
254529f
Fix compilation on FFmpeg7?
NicolasHug Apr 14, 2025
3f0417c
Fix?
NicolasHug Apr 14, 2025
290c96e
Use int64_t consistently
NicolasHug Apr 14, 2025
5f42d15
cmake
NicolasHug Apr 14, 2025
0f415c1
Merge branch 'main' of github.com:pytorch/torchcodec into avio
NicolasHug Apr 22, 2025
2954c9b
Move type aliases, fix avioAllocContext name
NicolasHug Apr 22, 2025
29866fc
Merge branch 'main' of github.com:pytorch/torchcodec into avio
NicolasHug Apr 24, 2025
9cb31c9
Merge branch 'main' of github.com:pytorch/torchcodec into avio
NicolasHug Apr 28, 2025
ff6c1e0
Create 2 separate constructors
NicolasHug Apr 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 68 additions & 1 deletion src/torchcodec/_core/AVIOBytesContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ AVIOBytesContext::AVIOBytesContext(const void* data, int64_t dataSize)
: dataContext_{static_cast<const uint8_t*>(data), dataSize, 0} {
TORCH_CHECK(data != nullptr, "Video data buffer cannot be nullptr!");
TORCH_CHECK(dataSize > 0, "Video data size must be positive");
createAVIOContext(&read, &seek, &dataContext_);
createAVIOContext(&read, nullptr, &seek, &dataContext_);
}

// The signature of this function is defined by FFMPEG.
Expand Down Expand Up @@ -67,4 +67,71 @@ int64_t AVIOBytesContext::seek(void* opaque, int64_t offset, int whence) {
return ret;
}

// Creates an encoding sink backed by a uint8 tensor of INITIAL_TENSOR_SIZE
// elements, with the write position starting at 0. The AVIOContext is created
// without a read callback: only write() and seek() are registered.
AVIOToTensorContext::AVIOToTensorContext()
: dataContext_{
torch::empty(
{AVIOToTensorContext::INITIAL_TENSOR_SIZE},
{torch::kUInt8}),
0} {
// &dataContext_ is the opaque pointer that write() and seek() cast back to
// DataContext*.
createAVIOContext(nullptr, &write, &seek, &dataContext_);
}

// The signature of this function is defined by FFMPEG.
int AVIOToTensorContext::write(void* opaque, const uint8_t* buf, int buf_size) {
auto dataContext = static_cast<DataContext*>(opaque);

int64_t bufSize = static_cast<int64_t>(buf_size);
if (dataContext->current + bufSize > dataContext->outputTensor.numel()) {
TORCH_CHECK(
dataContext->outputTensor.numel() * 2 <=
AVIOToTensorContext::MAX_TENSOR_SIZE,
"We tried to allocate an output encoded tensor larger than ",
AVIOToTensorContext::MAX_TENSOR_SIZE,
" bytes. If you think this should be supported, please report.");

// We double the size of the outpout tensor. Calling cat() may not be the
// most efficient, but it's simple.
dataContext->outputTensor =
torch::cat({dataContext->outputTensor, dataContext->outputTensor});
}

TORCH_CHECK(
dataContext->current + bufSize <= dataContext->outputTensor.numel(),
"Re-allocation of the output tensor didn't work. ",
"This should not happen, please report on TorchCodec bug tracker");

uint8_t* outputTensorData = dataContext->outputTensor.data_ptr<uint8_t>();
std::memcpy(outputTensorData + dataContext->current, buf, bufSize);
dataContext->current += bufSize;
return buf_size;
}

// The signature of this function is defined by FFMPEG.
// Note: This `seek()` implementation is very similar to that of
// AVIOBytesContext. We could consider merging both classes, or do some kind of
// refac, but this doesn't seem worth it ATM.
//
// Supported `whence` values:
//   - AVSEEK_SIZE: returns the number of elements of the output tensor.
//   - SEEK_SET: moves the write position to `offset` and returns it.
// Any other `whence`, or a negative `offset`, returns -1 and leaves the write
// position untouched.
int64_t AVIOToTensorContext::seek(void* opaque, int64_t offset, int whence) {
  auto dataContext = static_cast<DataContext*>(opaque);
  int64_t ret = -1;

  switch (whence) {
    case AVSEEK_SIZE:
      ret = dataContext->outputTensor.numel();
      break;
    case SEEK_SET:
      // Reject negative offsets: write() uses `current` as an offset into the
      // tensor storage, so a negative value would write before its start.
      if (offset >= 0) {
        dataContext->current = offset;
        ret = offset;
      }
      break;
    default:
      break;
  }

  return ret;
}

// Returns the first `current` elements of the output tensor, i.e. everything
// up to the current write position.
// NOTE(review): seek() can move `current` backwards, so if the last operation
// was such a seek this may be shorter than what was written - confirm.
torch::Tensor AVIOToTensorContext::getOutputTensor() {
  const int64_t numBytes = dataContext_.current;
  return dataContext_.outputTensor.narrow(
      /*dim=*/0, /*start=*/0, /*length=*/numBytes);
}

} // namespace facebook::torchcodec
26 changes: 24 additions & 2 deletions src/torchcodec/_core/AVIOBytesContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@

#pragma once

#include <torch/types.h>
#include "src/torchcodec/_core/AVIOContextHolder.h"

namespace facebook::torchcodec {

// Enables users to pass in the entire video as bytes. Our read and seek
// functions then traverse the bytes in memory.
// For Decoding: enables users to pass in the entire video or audio as bytes.
// Our read and seek functions then traverse the bytes in memory.
class AVIOBytesContext : public AVIOContextHolder {
public:
explicit AVIOBytesContext(const void* data, int64_t dataSize);
Expand All @@ -29,4 +30,25 @@ class AVIOBytesContext : public AVIOContextHolder {
DataContext dataContext_;
};

// For Encoding: used to encode into an output uint8 (bytes) tensor.
// The tensor is created by the constructor; the bytes written so far are
// retrieved with getOutputTensor().
class AVIOToTensorContext : public AVIOContextHolder {
public:
explicit AVIOToTensorContext();
// Returns the part of the output tensor that ends at the current write
// position.
torch::Tensor getOutputTensor();

private:
// State handed to the write() and seek() callbacks through their opaque
// pointer.
struct DataContext {
// Destination buffer for the encoded bytes.
torch::Tensor outputTensor;
// Current write position in outputTensor.
int64_t current;
};

static constexpr int64_t INITIAL_TENSOR_SIZE = 10'000'000; // 10MB
static constexpr int64_t MAX_TENSOR_SIZE = 320'000'000; // 320 MB
// Callback with the FFmpeg write signature: copies buf_size bytes from buf
// into the output tensor.
static int write(void* opaque, const uint8_t* buf, int buf_size);
// We need to expose seek() for some formats like mp3.
static int64_t seek(void* opaque, int64_t offset, int whence);

DataContext dataContext_;
};

} // namespace facebook::torchcodec
11 changes: 8 additions & 3 deletions src/torchcodec/_core/AVIOContextHolder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ namespace facebook::torchcodec {

void AVIOContextHolder::createAVIOContext(
AVIOReadFunction read,
AVIOWriteFunction write,
AVIOSeekFunction seek,
void* heldData,
int bufferSize) {
Expand All @@ -22,13 +23,17 @@ void AVIOContextHolder::createAVIOContext(
buffer != nullptr,
"Failed to allocate buffer of size " + std::to_string(bufferSize));

avioContext_.reset(avio_alloc_context(
TORCH_CHECK(
(seek != nullptr) && ((write != nullptr) ^ (read != nullptr)),
"seek method must be defined, and either write or read must be defined. "
"But not both!")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We may relax the mutual-exclusivity check above eventually, if we implement both write and read within the same class. For now, mutual-exclusivity is assumed and enforced, because we use the existence of write to set the write_flag below.

avioContext_.reset(avioAllocContext(
buffer,
bufferSize,
0,
/*write_flag=*/write != nullptr,
heldData,
read,
nullptr, // write function; not supported yet
write,
seek));

if (!avioContext_) {
Expand Down
11 changes: 4 additions & 7 deletions src/torchcodec/_core/AVIOContextHolder.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ namespace facebook::torchcodec {
// freed.
// 2. It is a base class for AVIOContext specializations. When specializing a
// AVIOContext, we need to provide four things:
// 1. A read callback function.
// 2. A seek callback function.
// 3. A write callback function. (Not supported yet; it's for encoding.)
// 1. A read callback function, for decoding.
// 2. A seek callback function, for decoding and encoding.
// 3. A write callback function, for encoding.
// 4. A pointer to some context object that has the same lifetime as the
// AVIOContext itself. This context object holds the custom state that
// tracks the custom behavior of reading, seeking and writing. It is
Expand All @@ -44,13 +44,10 @@ class AVIOContextHolder {
// enforced by having a pure virtual methods, but we don't have any.)
AVIOContextHolder() = default;

// These signatures are defined by FFmpeg.
using AVIOReadFunction = int (*)(void*, uint8_t*, int);
using AVIOSeekFunction = int64_t (*)(void*, int64_t, int);

// Deriving classes should call this function in their constructor.
void createAVIOContext(
AVIOReadFunction read,
AVIOWriteFunction write,
AVIOSeekFunction seek,
void* heldData,
int bufferSize = defaultBufferSize);
Expand Down
2 changes: 1 addition & 1 deletion src/torchcodec/_core/AVIOFileLikeContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ AVIOFileLikeContext::AVIOFileLikeContext(py::object fileLike)
py::hasattr(fileLike, "seek"),
"File like object must implement a seek method.");
}
createAVIOContext(&read, &seek, &fileLike_);
createAVIOContext(&read, nullptr, &seek, &fileLike_);
}

int AVIOFileLikeContext::read(void* opaque, uint8_t* buf, int buf_size) {
Expand Down
3 changes: 2 additions & 1 deletion src/torchcodec/_core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,9 @@ function(make_torchcodec_libraries
set(decoder_library_name "libtorchcodec_decoder${ffmpeg_major_version}")
set(decoder_sources
AVIOContextHolder.cpp
AVIOBytesContext.cpp
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Encoder.[cpp, h] rely on AVIOToTensorContext, and specifically on the AVIOToTensorContext::getOutputTensor() method. They don't rely on the AVIOContextHolder base class, like the decoder. For this reason we have to add AVIOBytesContext.cpp to the source dependency here.

Alternatively, I think we could make getOutputTensor a virtual method of the base class? But this method wouldn't make much sense for the existing child classes (like AVIOBytesContext), so this doesn't sound like a great OOP design.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed on not making getOutputTensor() virtual on the base class, since it's only applicable to one of the derived classes. We could potentially pull AVIOToTensorContext out of AVIOBytesContext.[h|cpp] to limit what gets put into libtorchcodec_decoderN.so.

I think the issue here is that the Encoder class needs to call getOutputTensor(), which means that it must actually hold a reference to an actual AVIOToTensorContext rather than just the base AVIOContextHolder. (Which is what SingleStreamDecoder does.) A potential way around this is for AVIOToTensorContext to accept a tensor rather than creating its own. The caller, however, would still need to keep track of how many bytes it asked it to encode in order to do the final narrow on the tensor, so that's not necessarily any cleaner. Your call on what you think makes the most sense.

FFMPEGCommon.cpp
DeviceInterface.cpp
DeviceInterface.cpp
SingleStreamDecoder.cpp
# TODO: lib name should probably not be "*_decoder*" now that it also
# contains an encoder
Expand Down
74 changes: 57 additions & 17 deletions src/torchcodec/_core/Encoder.cpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,24 @@
#include <sstream>

#include "src/torchcodec/_core/AVIOBytesContext.h"
#include "src/torchcodec/_core/Encoder.h"
#include "torch/types.h"

namespace facebook::torchcodec {

namespace {

// Validates the input waveform and returns it unchanged, so that it can be
// used directly in a constructor's member-initializer list.
// Raises (TORCH_CHECK) if the dtype is not float32 or if the tensor is not
// 2-dimensional, in that order.
torch::Tensor validateWf(torch::Tensor wf) {
  const auto wfDtype = wf.dtype();
  TORCH_CHECK(
      wfDtype == torch::kFloat32,
      "waveform must have float32 dtype, got ",
      wfDtype);

  // TODO-ENCODING check contiguity of the input wf to ensure that it is indeed
  // planar (fltp).
  const auto numDims = wf.dim();
  TORCH_CHECK(numDims == 2, "waveform must have 2 dimensions, got ", numDims);

  return wf;
}

void validateSampleRate(const AVCodec& avCodec, int sampleRate) {
if (avCodec.supported_samplerates == nullptr) {
return;
Expand Down Expand Up @@ -80,38 +92,55 @@ AudioEncoder::AudioEncoder(
int sampleRate,
std::string_view fileName,
std::optional<int64_t> bitRate)
: wf_(wf) {
TORCH_CHECK(
wf_.dtype() == torch::kFloat32,
"waveform must have float32 dtype, got ",
wf_.dtype());
// TODO-ENCODING check contiguity of the input wf to ensure that it is indeed
// planar (fltp).
TORCH_CHECK(
wf_.dim() == 2, "waveform must have 2 dimensions, got ", wf_.dim());

: wf_(validateWf(wf)) {
setFFmpegLogLevel();
AVFormatContext* avFormatContext = nullptr;
auto status = avformat_alloc_output_context2(
int status = avformat_alloc_output_context2(
&avFormatContext, nullptr, nullptr, fileName.data());

TORCH_CHECK(
avFormatContext != nullptr,
"Couldn't allocate AVFormatContext. ",
"Check the desired extension? ",
getFFMPEGErrorStringFromErrorCode(status));
avFormatContext_.reset(avFormatContext);

// TODO-ENCODING: Should also support encoding into bytes (use
// AVIOBytesContext)
TORCH_CHECK(
!(avFormatContext->oformat->flags & AVFMT_NOFILE),
"AVFMT_NOFILE is set. We only support writing to a file.");
status = avio_open(&avFormatContext_->pb, fileName.data(), AVIO_FLAG_WRITE);
TORCH_CHECK(
status >= 0,
"avio_open failed: ",
getFFMPEGErrorStringFromErrorCode(status));

initializeEncoder(sampleRate, bitRate);
}

// Builds an encoder whose output goes to an in-memory tensor instead of a
// file.
//
//   wf:                waveform to encode; validated by validateWf().
//   sampleRate:        forwarded to initializeEncoder().
//   formatName:        name of the output format/container, passed to
//                      avformat_alloc_output_context2().
//   avioContextHolder: owns the AVIOContext that receives the encoded bytes;
//                      must not be null. Ownership is taken by the encoder.
//   bitRate:           forwarded to initializeEncoder().
//
// Raises (TORCH_CHECK) if avioContextHolder is null or if the AVFormatContext
// cannot be allocated.
AudioEncoder::AudioEncoder(
    const torch::Tensor wf,
    int sampleRate,
    std::string_view formatName,
    std::unique_ptr<AVIOToTensorContext> avioContextHolder,
    std::optional<int64_t> bitRate)
    : wf_(validateWf(wf)), avioContextHolder_(std::move(avioContextHolder)) {
  // Checked before any FFmpeg allocation so that nothing leaks on failure.
  TORCH_CHECK(
      avioContextHolder_ != nullptr,
      "Cannot encode to tensor, avio context doesn't exist.");

  setFFmpegLogLevel();

  // A string_view is not guaranteed to be NUL-terminated, but FFmpeg expects a
  // C string: go through an owning std::string.
  const std::string formatNameStr(formatName);

  AVFormatContext* avFormatContext = nullptr;
  int status = avformat_alloc_output_context2(
      &avFormatContext, nullptr, formatNameStr.c_str(), nullptr);

  TORCH_CHECK(
      avFormatContext != nullptr,
      "Couldn't allocate AVFormatContext. ",
      "Check the desired extension? ",
      getFFMPEGErrorStringFromErrorCode(status));
  avFormatContext_.reset(avFormatContext);

  // Route the muxer's output through our custom AVIOContext.
  avFormatContext_->pb = avioContextHolder_->getAVIOContext();

  initializeEncoder(sampleRate, bitRate);
}

void AudioEncoder::initializeEncoder(
int sampleRate,
std::optional<int64_t> bitRate) {
// We use the AVFormatContext's default codec for that
// specific format/container.
const AVCodec* avCodec =
Expand Down Expand Up @@ -150,7 +179,7 @@ AudioEncoder::AudioEncoder(

setDefaultChannelLayout(avCodecContext_, numChannels);

status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
int status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
TORCH_CHECK(
status == AVSUCCESS,
"avcodec_open2 failed: ",
Expand All @@ -170,7 +199,18 @@ AudioEncoder::AudioEncoder(
streamIndex_ = avStream->index;
}

// Runs encode() and returns the encoded bytes held by the AVIO tensor context.
// Raises (TORCH_CHECK) if this encoder has no AVIO tensor context.
torch::Tensor AudioEncoder::encodeToTensor() {
  const bool hasTensorContext = (avioContextHolder_ != nullptr);
  TORCH_CHECK(
      hasTensorContext,
      "Cannot encode to tensor, avio context doesn't exist.");

  encode();

  torch::Tensor encoded = avioContextHolder_->getOutputTensor();
  return encoded;
}

void AudioEncoder::encode() {
// TODO-ENCODING: Need to check, but consecutive calls to encode() are
// probably invalid. We can address this once we (re)design the public and
// private encoding APIs.
UniqueAVFrame avFrame(av_frame_alloc());
TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame.");
// Default to 256 like in torchaudio
Expand Down
14 changes: 14 additions & 0 deletions src/torchcodec/_core/Encoder.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once
#include <torch/types.h>
#include "src/torchcodec/_core/AVIOBytesContext.h"
#include "src/torchcodec/_core/FFMPEGCommon.h"

namespace facebook::torchcodec {
Expand All @@ -21,9 +22,19 @@ class AudioEncoder {
int sampleRate,
std::string_view fileName,
std::optional<int64_t> bitRate = std::nullopt);
AudioEncoder(
const torch::Tensor wf,
int sampleRate,
std::string_view formatName,
std::unique_ptr<AVIOToTensorContext> avioContextHolder,
std::optional<int64_t> bitRate = std::nullopt);
void encode();
torch::Tensor encodeToTensor();

private:
void initializeEncoder(
int sampleRate,
std::optional<int64_t> bitRate = std::nullopt);
void encodeInnerLoop(
AutoAVPacket& autoAVPacket,
const UniqueAVFrame& srcAVFrame);
Expand All @@ -35,5 +46,8 @@ class AudioEncoder {
UniqueSwrContext swrContext_;

const torch::Tensor wf_;

// Stores the AVIOContext for the output tensor buffer.
std::unique_ptr<AVIOToTensorContext> avioContextHolder_;
};
} // namespace facebook::torchcodec
23 changes: 23 additions & 0 deletions src/torchcodec/_core/FFMPEGCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -261,4 +261,27 @@ void setFFmpegLogLevel() {
av_log_set_level(logLevel);
}

// Wrapper around FFmpeg's avio_alloc_context() that forwards every argument
// unchanged, except for the write callback, which is cast to the older
// function-pointer type when building against FFmpeg < 7. Returns whatever
// avio_alloc_context() returns.
// NOTE(review): the version guard below tests libavfilter's major version as a
// proxy for the FFmpeg release - confirm this is the intended macro.
AVIOContext* avioAllocContext(
uint8_t* buffer,
int buffer_size,
int write_flag,
void* opaque,
AVIOReadFunction read_packet,
AVIOWriteFunction write_packet,
AVIOSeekFunction seek) {
return avio_alloc_context(
buffer,
buffer_size,
write_flag,
opaque,
read_packet,
// The buf parameter of the write function is not const before FFmpeg 7.
#if LIBAVFILTER_VERSION_MAJOR >= 10 // FFmpeg >= 7
write_packet,
#else
reinterpret_cast<AVIOWriteFunctionOld>(write_packet),
#endif
seek);
}

} // namespace facebook::torchcodec
Loading
Loading