Skip to content

feat: support arbitrary attributes for speak provider #532

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion deepgram/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,6 @@
Listen,
ListenProvider,
Speak,
SpeakProvider,
Header,
Item,
Properties,
Expand Down
1 change: 0 additions & 1 deletion deepgram/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,6 @@
Listen,
ListenProvider,
Speak,
SpeakProvider,
Header,
Item,
Properties,
Expand Down
1 change: 0 additions & 1 deletion deepgram/clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,6 @@
Listen,
ListenProvider,
Speak,
SpeakProvider,
Header,
Item,
Properties,
Expand Down
1 change: 0 additions & 1 deletion deepgram/clients/agent/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
Listen,
ListenProvider,
Speak,
SpeakProvider,
Header,
Item,
Properties,
Expand Down
2 changes: 0 additions & 2 deletions deepgram/clients/agent/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
Listen as LatestListen,
ListenProvider as LatestListenProvider,
Speak as LatestSpeak,
SpeakProvider as LatestSpeakProvider,
Header as LatestHeader,
Item as LatestItem,
Properties as LatestProperties,
Expand Down Expand Up @@ -87,7 +86,6 @@
Listen = LatestListen
ListenProvider = LatestListenProvider
Speak = LatestSpeak
SpeakProvider = LatestSpeakProvider
Header = LatestHeader
Item = LatestItem
Properties = LatestProperties
Expand Down
1 change: 0 additions & 1 deletion deepgram/clients/agent/v1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@
Listen,
ListenProvider,
Speak,
SpeakProvider,
Header,
Item,
Properties,
Expand Down
1 change: 0 additions & 1 deletion deepgram/clients/agent/v1/websocket/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
Listen,
ListenProvider,
Speak,
SpeakProvider,
Header,
Item,
Properties,
Expand Down
67 changes: 17 additions & 50 deletions deepgram/clients/agent/v1/websocket/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Use of this source code is governed by a MIT license that can be found in the LICENSE file.
# SPDX-License-Identifier: MIT

from typing import List, Optional, Union, Any, Tuple
from typing import List, Optional, Union, Any, Tuple, Dict
import logging

from dataclasses import dataclass, field
Expand Down Expand Up @@ -167,52 +167,6 @@ class ThinkProvider(BaseResponse):
)


@dataclass
class SpeakProvider(BaseResponse):
    """
    Provider settings for the Speak model.

    Declares the provider ``type`` plus a handful of optional, vendor-specific
    fields. Every field other than ``type`` carries an ``exclude`` predicate
    that is true when the value is ``None``.
    """

    type: Optional[str] = field(default="deepgram")
    """
    Deepgram OR OpenAI model to use.
    """
    model: Optional[str] = field(
        default="aura-2-thalia-en",
        metadata=dataclass_config(exclude=lambda value: value is None),
    )
    """
    ElevenLabs or Cartesia model to use.
    """
    model_id: Optional[str] = field(
        default=None,
        metadata=dataclass_config(exclude=lambda value: value is None),
    )
    """
    Cartesia voice configuration.
    """
    voice: Optional[CartesiaVoice] = field(
        default=None,
        metadata=dataclass_config(exclude=lambda value: value is None),
    )
    """
    Cartesia language.
    """
    language: Optional[str] = field(
        default=None,
        metadata=dataclass_config(exclude=lambda value: value is None),
    )
    """
    ElevenLabs language.
    """
    language_code: Optional[str] = field(
        default=None,
        metadata=dataclass_config(exclude=lambda value: value is None),
    )

    def __getitem__(self, key):
        """
        Look up ``key`` in this object's ``to_dict()`` representation.

        A ``voice`` entry that serialized to a plain dict is rebuilt as a
        ``CartesiaVoice`` before the lookup.
        """
        serialized = self.to_dict()
        voice = serialized.get("voice")
        if isinstance(voice, dict):
            serialized["voice"] = CartesiaVoice.from_dict(voice)
        return serialized[key]


Comment on lines -170 to -215
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking of something along the lines of

from dataclasses import dataclass, field, asdict
from typing import Any, Dict

@dataclass
class SpeakProvider:
    """
    This class defines the provider for the Speak model.

    ``type`` is the only declared setting. Any other public attribute or key
    assigned to an instance is kept in an internal dict and emitted by
    ``to_dict()``, so provider-specific options need no schema change:

        >>> provider = SpeakProvider()
        >>> provider.model = "aura-2-thalia-en"
        >>> provider["voice"] = "thalia"
        >>> provider.to_dict()
        {'type': 'deepgram', 'model': 'aura-2-thalia-en', 'voice': 'thalia'}
    """

    # The provider type. The only truly mandatory property.
    # Marked optional because it has a default value.
    type: Optional[str] = field(default="deepgram")

    # Internal storage for arbitrary properties.
    # NOTE: the double underscore is name-mangled inside the class body, so
    # the dataclass field (and its __init__ keyword) is really called
    # ``_SpeakProvider__extra``.
    __extra: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        # The generated __init__ only assigns declared fields, so this sweep
        # normally finds nothing. It is kept as a safety net for values that
        # were placed directly in __dict__ (e.g. by a subclass) before this
        # hook runs.
        known_fields = set(self.__dataclass_fields__)
        for key in list(self.__dict__):
            if key not in known_fields:
                self.__extra[key] = self.__dict__.pop(key)

    def __setattr__(self, name, value):
        """
        Assign an attribute.

        Declared fields and underscore-prefixed names are stored as regular
        instance attributes. Every other name goes into the extras dict so
        that ``provider.some_key = value`` is visible to ``to_dict()``.
        """
        if name in self.__dataclass_fields__ or name.startswith("_"):
            object.__setattr__(self, name, value)
        else:
            self.__extra[name] = value

    def __getattr__(self, name):
        """
        Resolve attributes that normal lookup did not find from the extras.

        Raises:
            AttributeError: if ``name`` is private or not a stored extra.
        """
        # Refuse private names up front: this keeps protocol probes such as
        # ``__deepcopy__`` cheap and prevents infinite recursion if the extras
        # dict itself has not been created yet.
        if name.startswith("_"):
            raise AttributeError(name)
        try:
            return self.__extra[name]
        except KeyError:
            raise AttributeError(name) from None

    def __getitem__(self, key):
        """Return ``type`` or a stored extra; raises ``KeyError`` if absent."""
        if key == "type":
            return self.type
        return self.__extra[key]

    def __setitem__(self, key, value):
        """Set ``type`` or store ``value`` as an arbitrary extra property."""
        if key == "type":
            self.type = value
        else:
            self.__extra[key] = value

    def to_dict(self):
        """Return a new dict holding ``type`` followed by every extra property."""
        return {"type": self.type, **self.__extra}

@dataclass
class Think(BaseResponse):
"""
Expand Down Expand Up @@ -264,15 +218,28 @@ class Speak(BaseResponse):
This class defines any configuration settings for the Speak model.
"""

provider: SpeakProvider = field(default_factory=SpeakProvider)
provider: dict = field(default_factory=dict)
endpoint: Optional[Endpoint] = field(
default=None, metadata=dataclass_config(exclude=lambda f: f is None)
)

def __post_init__(self):
    """
    Wrap ``provider`` so that its keys can also be read and written as
    attributes (``speak.provider.model = ...``).
    """

    class AttrDict(dict):
        """dict whose keys double as attributes."""

        def __getattr__(self, name):
            try:
                return self[name]
            except KeyError:
                # A missing key is reported as a missing attribute; the
                # KeyError itself adds nothing to the traceback.
                raise AttributeError(name) from None

        def __setattr__(self, name, value):
            self[name] = value

    # AttrDict is a brand-new class on every call, so no incoming provider can
    # already be an instance of it: an isinstance guard here would never be
    # true. Wrap unconditionally.
    # NOTE(review): hoisting AttrDict to module level would give every Speak
    # the same wrapper type and make such a guard meaningful.
    self.provider = AttrDict(self.provider)

def __getitem__(self, key):
    """
    Look up ``key`` in this object's ``to_dict()`` representation.

    A ``provider`` or ``endpoint`` entry that serialized to a plain dict is
    rebuilt through ``SpeakProvider.from_dict`` / ``Endpoint.from_dict``
    before the lookup.
    """
    serialized = self.to_dict()

    provider = serialized.get("provider")
    if isinstance(provider, dict):
        serialized["provider"] = SpeakProvider.from_dict(provider)

    endpoint = serialized.get("endpoint")
    if isinstance(endpoint, dict):
        serialized["endpoint"] = Endpoint.from_dict(endpoint)

    return serialized[key]
Expand Down
100 changes: 100 additions & 0 deletions examples/agent/arbitrary_keys/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Copyright 2025 Deepgram SDK contributors. All Rights Reserved.
# Use of this source code is governed by a MIT license that can be found in the LICENSE file.
# SPDX-License-Identifier: MIT

# Import dependencies and set up the main function
import requests
import wave
import io
import time
import os
import json
import threading
from datetime import datetime

from deepgram import (
DeepgramClient,
DeepgramClientOptions,
AgentWebSocketEvents,
AgentKeepAlive,
)
from deepgram.clients.agent.v1.websocket.options import SettingsOptions

def main():
    """
    Open an Agent WebSocket, send settings that include an arbitrary
    speak-provider key, then close the connection.

    Reads the API key from the DEEPGRAM_API_KEY environment variable. Any
    exception raised along the way is caught and printed, not propagated.
    """

    def append_to_chatlog(entry):
        # Append a single entry to the running chat log file.
        with open("chatlog.txt", 'a') as chatlog:
            chatlog.write(entry)

    # Event callbacks: echo each event to stdout and record it in the log.
    def on_welcome(self, welcome, **kwargs):
        print(f"Welcome message received: {welcome}")
        append_to_chatlog(f"Welcome message: {welcome}\n")

    def on_settings_applied(self, settings_applied, **kwargs):
        print(f"Settings applied: {settings_applied}")
        append_to_chatlog(f"Settings applied: {settings_applied}\n")

    def on_error(self, error, **kwargs):
        print(f"Error received: {error}")
        append_to_chatlog(f"Error: {error}\n")

    try:
        # Initialize the Voice Agent
        token = os.getenv("DEEPGRAM_API_KEY")
        if not token:
            raise ValueError("DEEPGRAM_API_KEY environment variable is not set")
        print("API Key found:")

        # Initialize Deepgram client
        client_options = DeepgramClientOptions(
            options={
                "keepalive": "true",
                # "speaker_playback": "true",
            },
        )
        client = DeepgramClient(token, client_options)
        connection = client.agent.websocket.v("1")
        print("Created WebSocket connection...")

        # Configure the Agent
        options = SettingsOptions()

        # Audio input configuration
        audio_in = options.audio.input
        audio_in.encoding = "linear16"
        audio_in.sample_rate = 24000

        # Audio output configuration
        audio_out = options.audio.output
        audio_out.encoding = "linear16"
        audio_out.sample_rate = 24000
        audio_out.container = "wav"

        # Agent configuration
        agent = options.agent
        agent.language = "en"
        agent.listen.provider.type = "deepgram"
        agent.listen.provider.model = "nova-3"
        agent.think.provider.type = "open_ai"
        agent.think.provider.model = "gpt-4o-mini"
        agent.think.prompt = "You are a friendly AI assistant."
        agent.speak.provider.type = "deepgram"
        agent.speak.provider.model = "aura-2-thalia-en"
        agent.greeting = "Hello! How can I help you today?"
        # The point of this example: a key the SDK has no declared field for.
        agent.speak.provider.arbitrary_key = "test"

        # Register handlers
        for event, handler in (
            (AgentWebSocketEvents.Welcome, on_welcome),
            (AgentWebSocketEvents.SettingsApplied, on_settings_applied),
            (AgentWebSocketEvents.Error, on_error),
        ):
            connection.on(event, handler)
        print("Event handlers registered")

        # Start the connection
        print("Starting WebSocket connection...")
        print(options)
        started = connection.start(options)
        if not started:
            print("Failed to start connection")
            return
        print("WebSocket connection started successfully")

        # Cleanup
        connection.finish()
        print("Finished")

    except Exception as e:
        print(f"Error: {str(e)}")


if __name__ == "__main__":
    main()