Python: restructured the client classes, agent verbs #10834

Merged · 4 commits · Mar 6, 2025
10 changes: 5 additions & 5 deletions python/samples/concepts/realtime/README.md
@@ -11,7 +11,7 @@ To run these samples, you will need to have the following setup:
- pyaudio
- sounddevice
- pydub
e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
e.g. pip install pyaudio sounddevice pydub semantic-kernel[realtime]

The samples all run as Python scripts that can be started directly or through your IDE.

@@ -41,10 +41,10 @@ The following two samples use function calling with the following functions:

A line is logged whenever one of these functions is called.

### [Chat with function calling Websocket](./realtime_chat_with_function_calling_websocket.py)
### [Chat with function calling Websocket](./realtime_agent_with_function_calling_websocket.py)

This sample uses the websocket api with Azure OpenAI to run the interaction with the voice model, but now with function calling.
This sample uses the websocket api with Azure OpenAI to run a voice agent, capable of taking actions on your behalf.

### [Chat with function calling WebRTC](./realtime_chat_with_function_calling_webrtc.py)
### [Chat with function calling WebRTC](./realtime_agent_with_function_calling_webrtc.py)

This sample uses the WebRTC api with OpenAI to run the interaction with the voice model, but now with function calling.
This sample uses the WebRTC api with OpenAI to run a voice agent, capable of taking actions on your behalf.
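Taken together, the renamed samples all follow the same voice-agent pattern. Below is a minimal sketch of that pattern, assembled from the imports and calls visible in the diffs on this page; the `AudioPlayerWebRTC`/`AudioRecorderWebRTC` helpers come from the samples' `utils.py`, the instructions text is a placeholder, and the transcript attribute is assumed from the sample code:

```python
import asyncio

from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC
from semantic_kernel.connectors.ai.open_ai import (
    OpenAIRealtimeExecutionSettings,
    OpenAIRealtimeWebRTC,
)
from semantic_kernel.contents import RealtimeTextEvent


async def main() -> None:
    # For WebRTC the audio track is required at construction time.
    agent = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC())
    player = AudioPlayerWebRTC()
    settings = OpenAIRealtimeExecutionSettings(instructions="You are a helpful voice agent.")

    # Entering the context manager creates the session;
    # create_response=True makes the agent speak first.
    async with player, agent(settings=settings, create_response=True):
        async for event in agent.receive(audio_output_callback=player.client_callback):
            match event:
                case RealtimeTextEvent():
                    print(event.text.text, end="")  # streamed transcript (attribute name assumed)


if __name__ == "__main__":
    asyncio.run(main())
```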
python/samples/concepts/realtime/realtime_agent_with_function_calling_webrtc.py
@@ -6,16 +6,14 @@
from random import randint

from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
ListenEvents,
OpenAIRealtimeExecutionSettings,
OpenAIRealtimeWebRTC,
TurnDetection,
)
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.realtime_events import RealtimeTextEvent
from semantic_kernel.contents import ChatHistory, RealtimeTextEvent
from semantic_kernel.functions import kernel_function

logging.basicConfig(level=logging.WARNING)
@@ -26,13 +24,13 @@

"""
This simple sample demonstrates how to use the OpenAI Realtime API to create
a chat bot that can listen and respond directly through audio.
an agent that can listen and respond directly through audio.
It requires installing:
- semantic-kernel[realtime]
- pyaudio
- sounddevice
- pydub
e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
e.g. pip install pyaudio sounddevice pydub semantic-kernel[realtime]

For more details of the exact setup, see the README.md in the realtime folder.
"""
@@ -46,41 +44,42 @@
check_audio_devices()


@kernel_function
def get_weather(location: str) -> str:
"""Get the weather for a location."""
weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing")
weather = weather_conditions[randint(0, len(weather_conditions) - 1)] # nosec
logger.info(f"@ Getting weather for {location}: {weather}")
return f"The weather in {location} is {weather}."
class Helpers:
"""A set of helper functions for the Realtime Agent."""

@kernel_function
def get_weather(self, location: str) -> str:
"""Get the weather for a location."""
weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing")
weather = weather_conditions[randint(0, len(weather_conditions) - 1)] # nosec
logger.info(f"@ Getting weather for {location}: {weather}")
return f"The weather in {location} is {weather}."

@kernel_function
def get_date_time() -> str:
"""Get the current date and time."""
logger.info("@ Getting current datetime")
return f"The current date and time is {datetime.now().isoformat()}."
@kernel_function
def get_date_time(self) -> str:
"""Get the current date and time."""
logger.info("@ Getting current datetime")
return f"The current date and time is {datetime.now().isoformat()}."


@kernel_function
def goodbye():
"""When the user is done, say goodbye and then call this function."""
logger.info("@ Goodbye has been called!")
raise KeyboardInterrupt
@kernel_function
def goodbye(self):
"""When the user is done, say goodbye and then call this function."""
logger.info("@ Goodbye has been called!")
raise KeyboardInterrupt


async def main() -> None:
print_transcript = True
# create the Kernel and add a simple function for function calling.
kernel = Kernel()
kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time])

# create the audio player and audio track
# both take a device_id parameter, which is the index of the device to use, if None the default device is used
# both take a device_id parameter, which is the index of the device to use,
# if None the default device will be used
audio_player = AudioPlayerWebRTC()
# create the realtime client and optionally add the audio output function, this is optional

# create the realtime agent and optionally add the audio output function, this is optional
# and can also be passed in the receive method
realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC())
# You can also pass in kernel, plugins, chat_history or settings here.
# For WebRTC the audio_track is required
realtime_agent = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), plugins=[Helpers()])

# Create the settings for the session
# The realtime api does not use a system message, but takes instructions as a parameter for a session
@@ -111,14 +110,13 @@ async def main() -> None:
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with (
audio_player,
realtime_client(
realtime_agent(
settings=settings,
chat_history=chat_history,
kernel=kernel,
create_response=True,
),
):
async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
async for event in realtime_agent.receive(audio_output_callback=audio_player.client_callback):
match event:
case RealtimeTextEvent():
if print_transcript:
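The substantive change in this sample: the loose `@kernel_function` functions are grouped into a `Helpers` class and handed to the agent as a plugin, so the explicit `Kernel` disappears. Per the constructor comment in the diff, `kernel`, `plugins`, `chat_history`, and `settings` can all be passed at construction. A sketch of the two registration routes, using the `Helpers` class defined in the diff above; the `add_plugin` route is the standard Kernel API, not something this diff exercises:

```python
from samples.concepts.realtime.utils import AudioRecorderWebRTC
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.open_ai import OpenAIRealtimeWebRTC

# Route 1 (this sample): hand the plugin instance straight to the agent.
# Helpers is the plugin class defined in the diff above.
agent = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), plugins=[Helpers()])

# Route 2 (previous style): register the functions on a Kernel
# and pass the kernel to the constructor instead.
kernel = Kernel()
kernel.add_plugin(Helpers(), plugin_name="helpers")
agent = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), kernel=kernel)
```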
python/samples/concepts/realtime/realtime_agent_with_function_calling_websocket.py
@@ -14,8 +14,7 @@
ListenEvents,
TurnDetection,
)
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.realtime_events import RealtimeTextEvent
from semantic_kernel.contents import ChatHistory, RealtimeTextEvent
from semantic_kernel.functions import kernel_function

logger = logging.getLogger(__name__)
@@ -29,7 +28,7 @@
- pyaudio
- sounddevice
- pydub
e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
e.g. pip install pyaudio sounddevice pydub semantic-kernel[realtime]

For more details of the exact setup, see the README.md in the realtime folder.
"""
@@ -60,20 +59,21 @@ def goodbye():

async def main() -> None:
print_transcript = True
# create the Kernel and add a simple function for function calling.
# create a Kernel and add a simple function for function calling.
kernel = Kernel()
kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time])

# create the realtime client, in this the Azure Websocket client, there are also OpenAI Websocket and WebRTC clients
# See 02b-chat_with_function_calling_webrtc.py for an example of the WebRTC client
realtime_client = AzureRealtimeWebsocket()
# create the realtime agent, in this case using Azure OpenAI through Websockets,
# there are also OpenAI Websocket and WebRTC clients
# See realtime_agent_with_function_calling_webrtc.py for an example of the WebRTC client
realtime_agent = AzureRealtimeWebsocket()
# create the audio player and audio track
# both take a device_id parameter, which is the index of the device to use, if None the default device is used
audio_player = AudioPlayerWebsocket()
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_agent)

# Create the settings for the session
# The realtime api does not use a system message, but takes instructions as a parameter for a session
# The realtime api does not use a system message, but, like agents, takes instructions as a parameter for a session
# Another important setting is to tune the server_vad turn detection
# if this is turned off (by setting turn_detection=None), you will have to send
# the "input_audio_buffer.commit" and "response.create" event to the realtime api
@@ -102,20 +102,20 @@ async def main() -> None:
"I can tell you what the weather is or the time."
)

# the context manager calls the create_session method on the client and starts listening to the audio stream
# the context manager calls the create_session method on the agent and starts listening to the audio stream
async with (
audio_player,
audio_recorder,
realtime_client(
realtime_agent(
settings=settings,
chat_history=chat_history,
kernel=kernel,
create_response=True,
),
):
# the audio_output_callback can be added here or in the client constructor
# the audio_output_callback can be added here or in the constructor
# using this gives the smoothest experience
async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
async for event in realtime_agent.receive(audio_output_callback=audio_player.client_callback):
match event:
case RealtimeTextEvent():
if print_transcript:
@@ -133,7 +133,7 @@ async def main() -> None:

if __name__ == "__main__":
print(
"Instructions: The model will start speaking immediately,"
"Instructions: The agent will start speaking immediately,"
"this can be turned off by removing `create_response=True` above."
"The model will detect when you stop and automatically generate a response. "
"Press ctrl + c to stop the program."
@@ -24,7 +24,7 @@
- pyaudio
- sounddevice
- pydub
e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
e.g. pip install pyaudio sounddevice pydub semantic-kernel[realtime]

For more details of the exact setup, see the README.md in the realtime folder.
"""
@@ -42,8 +42,6 @@ async def main() -> None:
# create the realtime client and optionally add the audio output function, this is optional
# you can define the protocol to use, either "websocket" or "webrtc"
# they will behave the same way, even though the underlying protocol is quite different
realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC())
# Create the settings for the session
settings = OpenAIRealtimeExecutionSettings(
instructions="""
You are a chat bot. Your name is Mosscap and
@@ -58,9 +56,11 @@ async def main() -> None:
# for more details.
voice="alloy",
)
realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), settings=settings)
# Create the audio player
audio_player = AudioPlayerWebRTC()
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with audio_player, realtime_client(settings=settings, create_response=True):
async with audio_player, realtime_client:
async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
match event.event_type:
case "text":
@@ -76,9 +76,8 @@ async def main() -> None:

if __name__ == "__main__":
print(
"Instructions: The model will start speaking immediately,"
"this can be turned off by removing `create_response=True` above."
"The model will detect when you stop and automatically generate a response. "
"Instructions: start speaking. "
"The model will detect when you stop and automatically start responding. "
"Press ctrl + c to stop the program."
)
asyncio.run(main())
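The structural change here: `settings` moves into the constructor and the bare client becomes the context manager, while the function-calling samples keep passing session options at call time. Both entry styles side by side, a sketch reusing the names from this diff:

```python
from samples.concepts.realtime.utils import AudioRecorderWebRTC
from semantic_kernel.connectors.ai.open_ai import (
    OpenAIRealtimeExecutionSettings,
    OpenAIRealtimeWebRTC,
)

settings = OpenAIRealtimeExecutionSettings(instructions="...", voice="alloy")

# This sample: configure at construction, enter the bare client.
realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), settings=settings)
# async with audio_player, realtime_client: ...

# Function-calling samples: configure when entering the session.
# async with audio_player, realtime_client(settings=settings, create_response=True): ...
```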
@@ -25,7 +25,7 @@
- pyaudio
- sounddevice
- pydub
e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
e.g. pip install pyaudio sounddevice pydub semantic-kernel[realtime]

For more details of the exact setup, see the README.md in the realtime folder.
"""
@@ -43,10 +43,6 @@ async def main() -> None:
# create the realtime client and optionally add the audio output function, this is optional
# you can define the protocol to use, either "websocket" or "webrtc"
# they will behave the same way, even though the underlying protocol is quite different
realtime_client = AzureRealtimeWebsocket()
audio_player = AudioPlayerWebsocket()
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
# Create the settings for the session
settings = AzureRealtimeExecutionSettings(
instructions="""
You are a chat bot. Your name is Mosscap and
@@ -61,8 +57,12 @@ async def main() -> None:
# for more details.
voice="shimmer",
)
realtime_client = AzureRealtimeWebsocket(settings=settings)
audio_player = AudioPlayerWebsocket()
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with audio_player, audio_recorder, realtime_client(settings=settings, create_response=True):
async with audio_player, audio_recorder, realtime_client:
async for event in realtime_client.receive():
match event:
# this can be used as an alternative to the callback function used in other samples,
@@ -82,9 +82,8 @@ async def main() -> None:

if __name__ == "__main__":
print(
"Instructions: The model will start speaking immediately,"
"this can be turned off by removing `create_response=True` above."
"The model will detect when you stop and automatically generate a response. "
"Instructions: Start speaking. "
"The model will detect when you stop and automatically start responding. "
"Press ctrl + c to stop the program."
)
asyncio.run(main())
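This sample's receive loop (truncated in the diff above) handles audio through typed events rather than the `audio_output_callback` used elsewhere. A sketch of that alternative; the `add_audio` helper and the event's `audio` attribute are assumptions about the sample's `utils.py`, not confirmed by this diff:

```python
from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
from semantic_kernel.contents import RealtimeAudioEvent


async def play_from_events(realtime_client: RealtimeClientBase, audio_player) -> None:
    # Consume typed events instead of passing audio_output_callback to receive().
    async for event in realtime_client.receive():
        match event:
            case RealtimeAudioEvent():
                # hand the audio chunk to the player; method and
                # attribute names are assumptions about utils.py
                await audio_player.add_audio(event.audio)
```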
3 changes: 1 addition & 2 deletions python/samples/concepts/realtime/utils.py
@@ -16,8 +16,7 @@
from sounddevice import InputStream, OutputStream

from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
from semantic_kernel.contents import AudioContent
from semantic_kernel.contents.realtime_events import RealtimeAudioEvent
from semantic_kernel.contents import AudioContent, RealtimeAudioEvent

logger = logging.getLogger(__name__)

2 changes: 1 addition & 1 deletion python/samples/demos/call_automation/call_automation.py
@@ -46,8 +46,8 @@
from semantic_kernel.connectors.ai.open_ai import (
AzureRealtimeExecutionSettings,
AzureRealtimeWebsocket,
ListenEvents,
)
from semantic_kernel.connectors.ai.open_ai.services._open_ai_realtime import ListenEvents
from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
from semantic_kernel.contents import AudioContent, RealtimeAudioEvent
from semantic_kernel.functions import kernel_function
3 changes: 1 addition & 2 deletions python/semantic_kernel/connectors/ai/open_ai/__init__.py
@@ -34,6 +34,7 @@
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_image_execution_settings import (
OpenAITextToImageExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.services._open_ai_realtime import ListenEvents, SendEvents
from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
from semantic_kernel.connectors.ai.open_ai.services.azure_chat_completion import AzureChatCompletion
from semantic_kernel.connectors.ai.open_ai.services.azure_realtime import AzureRealtimeWebsocket
@@ -44,10 +45,8 @@
from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText
from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion import OpenAIChatCompletion
from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import (
ListenEvents,
OpenAIRealtimeWebRTC,
OpenAIRealtimeWebsocket,
SendEvents,
)
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_completion import OpenAITextCompletion
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_embedding import OpenAITextEmbedding
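Net effect of this and the `call_automation` change above: `ListenEvents` and `SendEvents` are now defined in the private `services._open_ai_realtime` module and re-exported from the package `__init__`. Downstream code keeps the public path; only in-repo code reaches into the private module:

```python
# Public path, unchanged for downstream code:
from semantic_kernel.connectors.ai.open_ai import ListenEvents, SendEvents

# Private defining module, used directly by the call_automation demo:
from semantic_kernel.connectors.ai.open_ai.services._open_ai_realtime import ListenEvents
```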