Skip to content

Add sample_format to audio metadata #557

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/torchcodec/decoders/_core/VideoDecoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,9 @@ void VideoDecoder::initializeDecoder() {
}
containerMetadata_.numVideoStreams++;
} else if (avStream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
AVSampleFormat format =
static_cast<AVSampleFormat>(avStream->codecpar->format);
streamMetadata.sampleFormat = av_get_sample_fmt_name(format);
containerMetadata_.numAudioStreams++;
}

Expand Down
1 change: 1 addition & 0 deletions src/torchcodec/decoders/_core/VideoDecoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ class VideoDecoder {
// Audio-only fields
std::optional<int64_t> sampleRate;
std::optional<int64_t> numChannels;
std::optional<std::string> sampleFormat;
};

struct ContainerMetadata {
Expand Down
9 changes: 6 additions & 3 deletions src/torchcodec/decoders/_core/VideoDecoderOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -495,12 +495,15 @@ std::string get_stream_json_metadata(
if (streamMetadata.numChannels.has_value()) {
map["numChannels"] = std::to_string(*streamMetadata.numChannels);
}
if (streamMetadata.sampleFormat.has_value()) {
map["sampleFormat"] = quoteValue(streamMetadata.sampleFormat.value());
}
if (streamMetadata.mediaType == AVMEDIA_TYPE_VIDEO) {
map["mediaType"] = "\"video\"";
map["mediaType"] = quoteValue("video");
} else if (streamMetadata.mediaType == AVMEDIA_TYPE_AUDIO) {
map["mediaType"] = "\"audio\"";
map["mediaType"] = quoteValue("audio");
} else {
map["mediaType"] = "\"other\"";
map["mediaType"] = quoteValue("other");
}
return mapToJson(map);
}
Expand Down
3 changes: 2 additions & 1 deletion src/torchcodec/decoders/_core/_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,9 @@ def __repr__(self):
class AudioStreamMetadata(StreamMetadata):
"""Metadata of a single audio stream."""

# TODO-AUDIO Add sample format field
sample_rate: Optional[int]
num_channels: Optional[int]
sample_format: Optional[str]

def __repr__(self):
return super().__repr__()
Expand Down Expand Up @@ -240,6 +240,7 @@ def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
AudioStreamMetadata(
sample_rate=stream_dict.get("sampleRate"),
num_channels=stream_dict.get("numChannels"),
sample_format=stream_dict.get("sampleFormat"),
**common_meta,
)
)
Expand Down
4 changes: 3 additions & 1 deletion test/decoders/test_decoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
NASA_AUDIO,
NASA_AUDIO_MP3,
NASA_VIDEO,
SINE_MONO_S32,
)


Expand Down Expand Up @@ -940,7 +941,7 @@ def get_some_frames(decoder):


class TestAudioDecoder:
@pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
@pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3, SINE_MONO_S32))
def test_metadata(self, asset):
decoder = AudioDecoder(asset.path)
assert isinstance(decoder.metadata, AudioStreamMetadata)
Expand All @@ -955,6 +956,7 @@ def test_metadata(self, asset):
)
assert decoder.metadata.sample_rate == asset.sample_rate
assert decoder.metadata.num_channels == asset.num_channels
assert decoder.metadata.sample_format == asset.sample_format

@pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
def test_error(self, asset):
Expand Down
2 changes: 2 additions & 0 deletions test/decoders/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def test_get_metadata(metadata_getter):
)
assert best_audio_stream_metadata.bit_rate == 128837
assert best_audio_stream_metadata.codec == "aac"
assert best_audio_stream_metadata.sample_format == "fltp"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The expected value is hard-coded here and below; this test already has lots of hard-coded values like these.



@pytest.mark.parametrize(
Expand All @@ -109,6 +110,7 @@ def test_get_metadata_audio_file(metadata_getter):
)
assert best_audio_stream_metadata.bit_rate == 64000
assert best_audio_stream_metadata.codec == "mp3"
assert best_audio_stream_metadata.sample_format == "fltp"


@pytest.mark.parametrize(
Expand Down
Binary file added test/resources/sine_mono_s32.wav
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm also adding this new asset. It is not strictly needed for this PR, but it is still useful for checking a sample format other than fltp.
It will be needed in #556 anyway. The file comes from TorchAudio.

Binary file not shown.
254 changes: 254 additions & 0 deletions test/resources/sine_mono_s32.wav.stream0.all_frames_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
[
{
"duration_time": "0.064000",
"pts_time": "0.000000"
},
{
"duration_time": "0.064000",
"pts_time": "0.064000"
},
{
"duration_time": "0.064000",
"pts_time": "0.128000"
},
{
"duration_time": "0.064000",
"pts_time": "0.192000"
},
{
"duration_time": "0.064000",
"pts_time": "0.256000"
},
{
"duration_time": "0.064000",
"pts_time": "0.320000"
},
{
"duration_time": "0.064000",
"pts_time": "0.384000"
},
{
"duration_time": "0.064000",
"pts_time": "0.448000"
},
{
"duration_time": "0.064000",
"pts_time": "0.512000"
},
{
"duration_time": "0.064000",
"pts_time": "0.576000"
},
{
"duration_time": "0.064000",
"pts_time": "0.640000"
},
{
"duration_time": "0.064000",
"pts_time": "0.704000"
},
{
"duration_time": "0.064000",
"pts_time": "0.768000"
},
{
"duration_time": "0.064000",
"pts_time": "0.832000"
},
{
"duration_time": "0.064000",
"pts_time": "0.896000"
},
{
"duration_time": "0.064000",
"pts_time": "0.960000"
},
{
"duration_time": "0.064000",
"pts_time": "1.024000"
},
{
"duration_time": "0.064000",
"pts_time": "1.088000"
},
{
"duration_time": "0.064000",
"pts_time": "1.152000"
},
{
"duration_time": "0.064000",
"pts_time": "1.216000"
},
{
"duration_time": "0.064000",
"pts_time": "1.280000"
},
{
"duration_time": "0.064000",
"pts_time": "1.344000"
},
{
"duration_time": "0.064000",
"pts_time": "1.408000"
},
{
"duration_time": "0.064000",
"pts_time": "1.472000"
},
{
"duration_time": "0.064000",
"pts_time": "1.536000"
},
{
"duration_time": "0.064000",
"pts_time": "1.600000"
},
{
"duration_time": "0.064000",
"pts_time": "1.664000"
},
{
"duration_time": "0.064000",
"pts_time": "1.728000"
},
{
"duration_time": "0.064000",
"pts_time": "1.792000"
},
{
"duration_time": "0.064000",
"pts_time": "1.856000"
},
{
"duration_time": "0.064000",
"pts_time": "1.920000"
},
{
"duration_time": "0.064000",
"pts_time": "1.984000"
},
{
"duration_time": "0.064000",
"pts_time": "2.048000"
},
{
"duration_time": "0.064000",
"pts_time": "2.112000"
},
{
"duration_time": "0.064000",
"pts_time": "2.176000"
},
{
"duration_time": "0.064000",
"pts_time": "2.240000"
},
{
"duration_time": "0.064000",
"pts_time": "2.304000"
},
{
"duration_time": "0.064000",
"pts_time": "2.368000"
},
{
"duration_time": "0.064000",
"pts_time": "2.432000"
},
{
"duration_time": "0.064000",
"pts_time": "2.496000"
},
{
"duration_time": "0.064000",
"pts_time": "2.560000"
},
{
"duration_time": "0.064000",
"pts_time": "2.624000"
},
{
"duration_time": "0.064000",
"pts_time": "2.688000"
},
{
"duration_time": "0.064000",
"pts_time": "2.752000"
},
{
"duration_time": "0.064000",
"pts_time": "2.816000"
},
{
"duration_time": "0.064000",
"pts_time": "2.880000"
},
{
"duration_time": "0.064000",
"pts_time": "2.944000"
},
{
"duration_time": "0.064000",
"pts_time": "3.008000"
},
{
"duration_time": "0.064000",
"pts_time": "3.072000"
},
{
"duration_time": "0.064000",
"pts_time": "3.136000"
},
{
"duration_time": "0.064000",
"pts_time": "3.200000"
},
{
"duration_time": "0.064000",
"pts_time": "3.264000"
},
{
"duration_time": "0.064000",
"pts_time": "3.328000"
},
{
"duration_time": "0.064000",
"pts_time": "3.392000"
},
{
"duration_time": "0.064000",
"pts_time": "3.456000"
},
{
"duration_time": "0.064000",
"pts_time": "3.520000"
},
{
"duration_time": "0.064000",
"pts_time": "3.584000"
},
{
"duration_time": "0.064000",
"pts_time": "3.648000"
},
{
"duration_time": "0.064000",
"pts_time": "3.712000"
},
{
"duration_time": "0.064000",
"pts_time": "3.776000"
},
{
"duration_time": "0.064000",
"pts_time": "3.840000"
},
{
"duration_time": "0.064000",
"pts_time": "3.904000"
},
{
"duration_time": "0.032000",
"pts_time": "3.968000"
}
]
Loading
Loading