Python: restructured the client classes, agent verbs #10834

Merged · 4 commits · Mar 6, 2025
10 changes: 5 additions & 5 deletions python/samples/concepts/realtime/README.md
@@ -11,7 +11,7 @@ To run these samples, you will need to have the following setup:
- pyaudio
- sounddevice
- pydub
e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
e.g. pip install pyaudio sounddevice pydub semantic-kernel[realtime]

The samples all run as Python scripts that can be started directly or through your IDE.

@@ -41,10 +41,10 @@ The following two samples use function calling with the following functions:

A line is logged whenever one of these functions is called.

### [Chat with function calling Websocket](./realtime_chat_with_function_calling_websocket.py)
### [Chat with function calling Websocket](./realtime_agent_with_function_calling_websocket.py)

This sample uses the websocket api with Azure OpenAI to run the interaction with the voice model, but now with function calling.
This sample uses the websocket api with Azure OpenAI to run a voice agent, capable of taking actions on your behalf.

### [Chat with function calling WebRTC](./realtime_chat_with_function_calling_webrtc.py)
### [Chat with function calling WebRTC](./realtime_agent_with_function_calling_webrtc.py)

This sample uses the WebRTC api with OpenAI to run the interaction with the voice model, but now with function calling.
This sample uses the WebRTC api with OpenAI to run a voice agent, capable of taking actions on your behalf.
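Taken together, the renamed samples all follow the same voice-agent pattern. Below is a minimal sketch of that pattern, assembled from the imports and calls visible in the diffs on this page; the `AudioPlayerWebRTC`/`AudioRecorderWebRTC` helpers come from the samples' `utils.py`, the instructions text is a placeholder, and the transcript attribute is assumed from the sample code:

```python
import asyncio

from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC
from semantic_kernel.connectors.ai.open_ai import (
    OpenAIRealtimeExecutionSettings,
    OpenAIRealtimeWebRTC,
)
from semantic_kernel.contents import RealtimeTextEvent


async def main() -> None:
    # For WebRTC the audio track is required at construction time.
    agent = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC())
    player = AudioPlayerWebRTC()
    settings = OpenAIRealtimeExecutionSettings(instructions="You are a helpful voice agent.")

    # Entering the context manager creates the session;
    # create_response=True makes the agent speak first.
    async with player, agent(settings=settings, create_response=True):
        async for event in agent.receive(audio_output_callback=player.client_callback):
            match event:
                case RealtimeTextEvent():
                    print(event.text.text, end="")  # streamed transcript (attribute name assumed)


if __name__ == "__main__":
    asyncio.run(main())
```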
python/samples/concepts/realtime/realtime_agent_with_function_calling_webrtc.py
@@ -6,16 +6,14 @@
from random import randint

from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
ListenEvents,
OpenAIRealtimeExecutionSettings,
OpenAIRealtimeWebRTC,
TurnDetection,
)
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.realtime_events import RealtimeTextEvent
from semantic_kernel.contents import ChatHistory, RealtimeTextEvent
from semantic_kernel.functions import kernel_function

logging.basicConfig(level=logging.WARNING)
@@ -26,13 +24,13 @@

"""
This simple sample demonstrates how to use the OpenAI Realtime API to create
a chat bot that can listen and respond directly through audio.
an agent that can listen and respond directly through audio.
It requires installing:
- semantic-kernel[realtime]
- pyaudio
- sounddevice
- pydub
e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
e.g. pip install pyaudio sounddevice pydub semantic-kernel[realtime]

For more details of the exact setup, see the README.md in the realtime folder.
"""
@@ -46,41 +44,42 @@
check_audio_devices()


@kernel_function
def get_weather(location: str) -> str:
"""Get the weather for a location."""
weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing")
weather = weather_conditions[randint(0, len(weather_conditions) - 1)] # nosec
logger.info(f"@ Getting weather for {location}: {weather}")
return f"The weather in {location} is {weather}."
class Helpers:
"""A set of helper functions for the Realtime Agent."""

@kernel_function
def get_weather(self, location: str) -> str:
"""Get the weather for a location."""
weather_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing")
weather = weather_conditions[randint(0, len(weather_conditions) - 1)] # nosec
logger.info(f"@ Getting weather for {location}: {weather}")
return f"The weather in {location} is {weather}."

@kernel_function
def get_date_time() -> str:
"""Get the current date and time."""
logger.info("@ Getting current datetime")
return f"The current date and time is {datetime.now().isoformat()}."
@kernel_function
def get_date_time(self) -> str:
"""Get the current date and time."""
logger.info("@ Getting current datetime")
return f"The current date and time is {datetime.now().isoformat()}."


@kernel_function
def goodbye():
"""When the user is done, say goodbye and then call this function."""
logger.info("@ Goodbye has been called!")
raise KeyboardInterrupt
@kernel_function
def goodbye(self):
"""When the user is done, say goodbye and then call this function."""
logger.info("@ Goodbye has been called!")
raise KeyboardInterrupt


async def main() -> None:
print_transcript = True
# create the Kernel and add a simple function for function calling.
kernel = Kernel()
kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time])

# create the audio player and audio track
# both take a device_id parameter, which is the index of the device to use, if None the default device is used
# both take a device_id parameter, which is the index of the device to use,
# if None the default device will be used
audio_player = AudioPlayerWebRTC()
# create the realtime client and optionally add the audio output function, this is optional

# create the realtime agent and optionally add the audio output function, this is optional
# and can also be passed in the receive method
realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC())
# You can also pass in kernel, plugins, chat_history or settings here.
# For WebRTC the audio_track is required
realtime_agent = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), plugins=[Helpers()])

# Create the settings for the session
# The realtime api does not use a system message, but takes instructions as a parameter for a session
@@ -111,14 +110,13 @@ async def main() -> None:
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with (
audio_player,
realtime_client(
realtime_agent(
settings=settings,
chat_history=chat_history,
kernel=kernel,
create_response=True,
),
):
async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
async for event in realtime_agent.receive(audio_output_callback=audio_player.client_callback):
match event:
case RealtimeTextEvent():
if print_transcript:
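The substantive change in this sample: the loose `@kernel_function` functions are grouped into a `Helpers` class and handed to the agent as a plugin, so the explicit `Kernel` disappears. Per the constructor comment in the diff, `kernel`, `plugins`, `chat_history`, and `settings` can all be passed at construction. A sketch of the two registration routes, using the `Helpers` class defined in the diff above; the `add_plugin` route is the standard Kernel API, not something this diff exercises:

```python
from samples.concepts.realtime.utils import AudioRecorderWebRTC
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.open_ai import OpenAIRealtimeWebRTC

# Route 1 (this sample): hand the plugin instance straight to the agent.
# Helpers is the plugin class defined in the diff above.
agent = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), plugins=[Helpers()])

# Route 2 (previous style): register the functions on a Kernel
# and pass the kernel to the constructor instead.
kernel = Kernel()
kernel.add_plugin(Helpers(), plugin_name="helpers")
agent = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), kernel=kernel)
```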
python/samples/concepts/realtime/realtime_agent_with_function_calling_websocket.py
@@ -14,8 +14,7 @@
ListenEvents,
TurnDetection,
)
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.realtime_events import RealtimeTextEvent
from semantic_kernel.contents import ChatHistory, RealtimeTextEvent
from semantic_kernel.functions import kernel_function

logger = logging.getLogger(__name__)
@@ -29,7 +28,7 @@
- pyaudio
- sounddevice
- pydub
e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
e.g. pip install pyaudio sounddevice pydub semantic-kernel[realtime]

For more details of the exact setup, see the README.md in the realtime folder.
"""
@@ -60,20 +59,21 @@ def goodbye():

async def main() -> None:
print_transcript = True
# create the Kernel and add a simple function for function calling.
# create a Kernel and add a simple function for function calling.
kernel = Kernel()
kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time])

# create the realtime client, in this the Azure Websocket client, there are also OpenAI Websocket and WebRTC clients
# See 02b-chat_with_function_calling_webrtc.py for an example of the WebRTC client
realtime_client = AzureRealtimeWebsocket()
# create the realtime agent, in this case using Azure OpenAI through Websockets,
# there are also OpenAI Websocket and WebRTC clients
# See realtime_agent_with_function_calling_webrtc.py for an example of the WebRTC client
realtime_agent = AzureRealtimeWebsocket()
# create the audio player and audio track
# both take a device_id parameter, which is the index of the device to use, if None the default device is used
audio_player = AudioPlayerWebsocket()
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_agent)

# Create the settings for the session
# The realtime api does not use a system message, but takes instructions as a parameter for a session
# The realtime api does not use a system message, but, like agents, takes instructions as a parameter for a session
# Another important setting is to tune the server_vad turn detection
# if this is turned off (by setting turn_detection=None), you will have to send
# the "input_audio_buffer.commit" and "response.create" event to the realtime api
@@ -102,20 +102,20 @@ async def main() -> None:
"I can tell you what the weather is or the time."
)

# the context manager calls the create_session method on the client and starts listening to the audio stream
# the context manager calls the create_session method on the agent and starts listening to the audio stream
async with (
audio_player,
audio_recorder,
realtime_client(
realtime_agent(
settings=settings,
chat_history=chat_history,
kernel=kernel,
create_response=True,
),
):
# the audio_output_callback can be added here or in the client constructor
# the audio_output_callback can be added here or in the constructor
# using this gives the smoothest experience
async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
async for event in realtime_agent.receive(audio_output_callback=audio_player.client_callback):
match event:
case RealtimeTextEvent():
if print_transcript:
@@ -133,7 +133,7 @@ async def main() -> None:

if __name__ == "__main__":
print(
"Instructions: The model will start speaking immediately,"
"Instructions: The agent will start speaking immediately,"
"this can be turned off by removing `create_response=True` above."
"The model will detect when you stop and automatically generate a response. "
"Press ctrl + c to stop the program."
@@ -24,7 +24,7 @@
- pyaudio
- sounddevice
- pydub
e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
e.g. pip install pyaudio sounddevice pydub semantic-kernel[realtime]

For more details of the exact setup, see the README.md in the realtime folder.
"""
@@ -42,8 +42,6 @@ async def main() -> None:
# create the realtime client and optionally add the audio output function, this is optional
# you can define the protocol to use, either "websocket" or "webrtc"
# they will behave the same way, even though the underlying protocol is quite different
realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC())
# Create the settings for the session
settings = OpenAIRealtimeExecutionSettings(
instructions="""
You are a chat bot. Your name is Mosscap and
@@ -58,9 +56,11 @@ async def main() -> None:
# for more details.
voice="alloy",
)
realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), settings=settings)
# Create the audio player
audio_player = AudioPlayerWebRTC()
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with audio_player, realtime_client(settings=settings, create_response=True):
async with audio_player, realtime_client:
async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
match event.event_type:
case "text":
@@ -76,9 +76,8 @@ async def main() -> None:

if __name__ == "__main__":
print(
"Instructions: The model will start speaking immediately,"
"this can be turned off by removing `create_response=True` above."
"The model will detect when you stop and automatically generate a response. "
"Instructions: start speaking. "
"The model will detect when you stop and automatically start responding. "
"Press ctrl + c to stop the program."
)
asyncio.run(main())
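The structural change here: `settings` moves into the constructor and the bare client becomes the context manager, while the function-calling samples keep passing session options at call time. Both entry styles side by side, a sketch reusing the names from this diff:

```python
from samples.concepts.realtime.utils import AudioRecorderWebRTC
from semantic_kernel.connectors.ai.open_ai import (
    OpenAIRealtimeExecutionSettings,
    OpenAIRealtimeWebRTC,
)

settings = OpenAIRealtimeExecutionSettings(instructions="...", voice="alloy")

# This sample: configure at construction, enter the bare client.
realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), settings=settings)
# async with audio_player, realtime_client: ...

# Function-calling samples: configure when entering the session.
# async with audio_player, realtime_client(settings=settings, create_response=True): ...
```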
@@ -25,7 +25,7 @@
- pyaudio
- sounddevice
- pydub
e.g. pip install pyaudio sounddevice pydub semantic_kernel[realtime]
e.g. pip install pyaudio sounddevice pydub semantic-kernel[realtime]

For more details of the exact setup, see the README.md in the realtime folder.
"""
@@ -43,10 +43,6 @@ async def main() -> None:
# create the realtime client and optionally add the audio output function, this is optional
# you can define the protocol to use, either "websocket" or "webrtc"
# they will behave the same way, even though the underlying protocol is quite different
realtime_client = AzureRealtimeWebsocket()
audio_player = AudioPlayerWebsocket()
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
# Create the settings for the session
settings = AzureRealtimeExecutionSettings(
instructions="""
You are a chat bot. Your name is Mosscap and
@@ -61,8 +57,12 @@ async def main() -> None:
# for more details.
voice="shimmer",
)
realtime_client = AzureRealtimeWebsocket(settings=settings)
audio_player = AudioPlayerWebsocket()
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with audio_player, audio_recorder, realtime_client(settings=settings, create_response=True):
async with audio_player, audio_recorder, realtime_client:
async for event in realtime_client.receive():
match event:
# this can be used as an alternative to the callback function used in other samples,
@@ -82,9 +82,8 @@ async def main() -> None:

if __name__ == "__main__":
print(
"Instructions: The model will start speaking immediately,"
"this can be turned off by removing `create_response=True` above."
"The model will detect when you stop and automatically generate a response. "
"Instructions: Start speaking. "
"The model will detect when you stop and automatically start responding. "
"Press ctrl + c to stop the program."
)
asyncio.run(main())
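This sample's receive loop (truncated in the diff above) handles audio through typed events rather than the `audio_output_callback` used elsewhere. A sketch of that alternative; the `add_audio` helper and the event's `audio` attribute are assumptions about the sample's `utils.py`, not confirmed by this diff:

```python
from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
from semantic_kernel.contents import RealtimeAudioEvent


async def play_from_events(realtime_client: RealtimeClientBase, audio_player) -> None:
    # Consume typed events instead of passing audio_output_callback to receive().
    async for event in realtime_client.receive():
        match event:
            case RealtimeAudioEvent():
                # hand the audio chunk to the player; method and
                # attribute names are assumptions about utils.py
                await audio_player.add_audio(event.audio)
```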
3 changes: 1 addition & 2 deletions python/samples/concepts/realtime/utils.py
@@ -16,8 +16,7 @@
from sounddevice import InputStream, OutputStream

from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
from semantic_kernel.contents import AudioContent
from semantic_kernel.contents.realtime_events import RealtimeAudioEvent
from semantic_kernel.contents import AudioContent, RealtimeAudioEvent

logger = logging.getLogger(__name__)

2 changes: 1 addition & 1 deletion python/samples/demos/call_automation/call_automation.py
@@ -46,8 +46,8 @@
from semantic_kernel.connectors.ai.open_ai import (
AzureRealtimeExecutionSettings,
AzureRealtimeWebsocket,
ListenEvents,
)
from semantic_kernel.connectors.ai.open_ai.services._open_ai_realtime import ListenEvents
from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
from semantic_kernel.contents import AudioContent, RealtimeAudioEvent
from semantic_kernel.functions import kernel_function
3 changes: 1 addition & 2 deletions python/semantic_kernel/connectors/ai/open_ai/__init__.py
@@ -34,6 +34,7 @@
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_image_execution_settings import (
OpenAITextToImageExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.services._open_ai_realtime import ListenEvents, SendEvents
from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
from semantic_kernel.connectors.ai.open_ai.services.azure_chat_completion import AzureChatCompletion
from semantic_kernel.connectors.ai.open_ai.services.azure_realtime import AzureRealtimeWebsocket
@@ -44,10 +45,8 @@
from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText
from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion import OpenAIChatCompletion
from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import (
ListenEvents,
OpenAIRealtimeWebRTC,
OpenAIRealtimeWebsocket,
SendEvents,
)
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_completion import OpenAITextCompletion
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_embedding import OpenAITextEmbedding
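Net effect of this and the `call_automation` change above: `ListenEvents` and `SendEvents` are now defined in the private `services._open_ai_realtime` module and re-exported from the package `__init__`. Downstream code keeps the public path; only in-repo code reaches into the private module:

```python
# Public path, unchanged for downstream code:
from semantic_kernel.connectors.ai.open_ai import ListenEvents, SendEvents

# Private defining module, used directly by the call_automation demo:
from semantic_kernel.connectors.ai.open_ai.services._open_ai_realtime import ListenEvents
```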