Skip to content

Commit 63d1dc7

Browse files
authoredNov 12, 2024
Python: Text to audio (microsoft#9625)
### Motivation and Context <!-- Thank you for your contribution to the semantic-kernel repo! Please help reviewers and future users, providing the following information: 1. Why is this change required? 2. What problem does it solve? 3. What scenario does it contribute to? 4. If it fixes an open issue, please link to the issue here. --> Addresses: microsoft#7433 ### Description <!-- Describe your changes, the overall approach, the underlying design. These notes will help understanding how your code works. Thanks! --> Add text to audio interface to Python Semantic Kernel. ### Contribution Checklist 1. Text-to-audio client base. 2. OpenAI and Azure OpenAI implementations of the client. 3. Unit tests and integration tests. 4. Samples. <!-- Before submitting this PR, please make sure: --> - [x] The code builds clean without any errors or warnings - [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations - [x] All unit tests pass, and I have added new tests where possible - [x] I didn't break anyone 😄
1 parent 5764c8c commit 63d1dc7

34 files changed

+1019
-36
lines changed
 

‎.github/workflows/python-integration-tests.yml

+14-2
Original file line numberDiff line numberDiff line change
@@ -64,13 +64,19 @@ jobs:
6464
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME }} # azure-text-embedding-ada-002
6565
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }}
6666
AZURE_OPENAI_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_DEPLOYMENT_NAME }}
67-
AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
67+
AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME }}
68+
AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME }}
69+
AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME }}
6870
AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
6971
AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
72+
AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
73+
AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT: ${{ secrets.AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT }}
7074
BING_API_KEY: ${{ secrets.BING_API_KEY }}
7175
OPENAI_CHAT_MODEL_ID: ${{ vars.OPENAI_CHAT_MODEL_ID }}
7276
OPENAI_TEXT_MODEL_ID: ${{ vars.OPENAI_TEXT_MODEL_ID }}
7377
OPENAI_EMBEDDING_MODEL_ID: ${{ vars.OPENAI_EMBEDDING_MODEL_ID }}
78+
OPENAI_AUDIO_TO_TEXT_MODEL_ID: ${{ vars.OPENAI_AUDIO_TO_TEXT_MODEL_ID }}
79+
OPENAI_TEXT_TO_AUDIO_MODEL_ID: ${{ vars.OPENAI_TEXT_TO_AUDIO_MODEL_ID }}
7480
OPENAI_TEXT_TO_IMAGE_MODEL_ID: ${{ vars.OPENAI_TEXT_TO_IMAGE_MODEL_ID }}
7581
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
7682
PINECONE_API_KEY: ${{ secrets.PINECONE__APIKEY }}
@@ -233,13 +239,19 @@ jobs:
233239
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME }} # azure-text-embedding-ada-002
234240
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }}
235241
AZURE_OPENAI_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_DEPLOYMENT_NAME }}
236-
AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
242+
AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME }}
243+
AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME }}
244+
AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME }}
237245
AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
238246
AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
247+
AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
248+
AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT: ${{ secrets.AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT }}
239249
BING_API_KEY: ${{ secrets.BING_API_KEY }}
240250
OPENAI_CHAT_MODEL_ID: ${{ vars.OPENAI_CHAT_MODEL_ID }}
241251
OPENAI_TEXT_MODEL_ID: ${{ vars.OPENAI_TEXT_MODEL_ID }}
242252
OPENAI_EMBEDDING_MODEL_ID: ${{ vars.OPENAI_EMBEDDING_MODEL_ID }}
253+
OPENAI_AUDIO_TO_TEXT_MODEL_ID: ${{ vars.OPENAI_AUDIO_TO_TEXT_MODEL_ID }}
254+
OPENAI_TEXT_TO_AUDIO_MODEL_ID: ${{ vars.OPENAI_TEXT_TO_AUDIO_MODEL_ID }}
243255
OPENAI_TEXT_TO_IMAGE_MODEL_ID: ${{ vars.OPENAI_TEXT_TO_IMAGE_MODEL_ID }}
244256
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
245257
PINECONE_API_KEY: ${{ secrets.PINECONE__APIKEY }}

‎python/samples/concepts/audio_to_text/chat_with_audio_input.py ‎python/samples/concepts/audio/01-chat_with_audio_input.py

+9-6
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,22 @@
44
import logging
55
import os
66

7-
from samples.concepts.audio_to_text.audio_recorder import AudioRecorder
8-
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
9-
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
7+
from samples.concepts.audio.audio_recorder import AudioRecorder
8+
from semantic_kernel.connectors.ai.open_ai import (
9+
AzureAudioToText,
10+
AzureChatCompletion,
1011
OpenAIChatPromptExecutionSettings,
1112
)
12-
from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
13-
from semantic_kernel.contents import ChatHistory
14-
from semantic_kernel.contents.audio_content import AudioContent
13+
from semantic_kernel.contents import AudioContent, ChatHistory
1514

1615
# This simple sample demonstrates how to use the AzureChatCompletion and AzureAudioToText services
1716
# to create a chat bot that can communicate with the user using audio input.
1817
# The user can engage in a long conversation with the chat bot by speaking to it.
1918

19+
# Resources required for this sample:
20+
# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
21+
# 2. An Azure Speech to Text deployment (e.g. whisper).
22+
2023
# Additional dependencies required for this sample:
2124
# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
2225
# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
import asyncio
4+
import logging
5+
6+
from samples.concepts.audio.audio_player import AudioPlayer
7+
from semantic_kernel.connectors.ai.open_ai import (
8+
AzureChatCompletion,
9+
AzureTextToAudio,
10+
OpenAIChatPromptExecutionSettings,
11+
OpenAITextToAudioExecutionSettings,
12+
)
13+
from semantic_kernel.contents import ChatHistory
14+
15+
# This simple sample demonstrates how to use the AzureChatCompletion and AzureTextToAudio services
16+
# to create a chat bot that can communicate with the user using audio output.
17+
# The chatbot will engage in a conversation with the user and respond using audio output.
18+
19+
# Resources required for this sample:
20+
# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
21+
# 2. An Azure Text to Speech deployment (e.g. tts).
22+
23+
# Additional dependencies required for this sample:
24+
# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
25+
# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.
26+
27+
28+
logging.basicConfig(level=logging.WARNING)
29+
30+
system_message = """
31+
You are a chat bot. Your name is Mosscap and
32+
you have one goal: figure out what people need.
33+
Your full name, should you need to know it, is
34+
Splendid Speckled Mosscap. You communicate
35+
effectively, but you tend to answer with long
36+
flowery prose.
37+
"""
38+
39+
40+
chat_service = AzureChatCompletion()
41+
text_to_audio_service = AzureTextToAudio()
42+
43+
history = ChatHistory()
44+
history.add_user_message("Hi there, who are you?")
45+
history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")
46+
47+
48+
async def chat() -> bool:
49+
try:
50+
user_input = input("User:> ")
51+
except KeyboardInterrupt:
52+
print("\n\nExiting chat...")
53+
return False
54+
except EOFError:
55+
print("\n\nExiting chat...")
56+
return False
57+
58+
if user_input == "exit":
59+
print("\n\nExiting chat...")
60+
return False
61+
62+
history.add_user_message(user_input)
63+
64+
# No need to stream the response since we can only pass the
65+
# response to the text to audio service as a whole
66+
response = await chat_service.get_chat_message_content(
67+
chat_history=history,
68+
settings=OpenAIChatPromptExecutionSettings(
69+
max_tokens=2000,
70+
temperature=0.7,
71+
top_p=0.8,
72+
),
73+
)
74+
75+
# Need to set the response format to wav since the audio player only supports wav files
76+
audio_content = await text_to_audio_service.get_audio_content(
77+
response.content, OpenAITextToAudioExecutionSettings(response_format="wav")
78+
)
79+
AudioPlayer(audio_content=audio_content).play()
80+
81+
print(f"Mosscap:> {response.content}")
82+
83+
history.add_message(response)
84+
85+
return True
86+
87+
88+
async def main() -> None:
89+
chatting = True
90+
while chatting:
91+
chatting = await chat()
92+
93+
94+
if __name__ == "__main__":
95+
asyncio.run(main())
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
import asyncio
4+
import logging
5+
import os
6+
7+
from samples.concepts.audio.audio_player import AudioPlayer
8+
from samples.concepts.audio.audio_recorder import AudioRecorder
9+
from semantic_kernel.connectors.ai.open_ai import (
10+
AzureAudioToText,
11+
AzureChatCompletion,
12+
AzureTextToAudio,
13+
OpenAIChatPromptExecutionSettings,
14+
OpenAITextToAudioExecutionSettings,
15+
)
16+
from semantic_kernel.contents import AudioContent, ChatHistory
17+
18+
# This simple sample demonstrates how to use the AzureChatCompletion, AzureTextToAudio, and AzureAudioToText
19+
# services to create a chat bot that can communicate with the user using both audio input and output.
20+
# The chatbot will engage in a conversation with the user by audio only.
21+
# This sample combines the functionality of the samples/concepts/audio/01-chat_with_audio_input.py and
22+
# samples/concepts/audio/02-chat_with_audio_output.py samples.
23+
24+
# Resources required for this sample:
25+
# 1. An Azure OpenAI model deployment (e.g. GPT-4o-mini).
26+
# 2. An Azure Text to Speech deployment (e.g. tts).
27+
# 3. An Azure Speech to Text deployment (e.g. whisper).
28+
29+
# Additional dependencies required for this sample:
30+
# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
31+
# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.
32+
33+
34+
logging.basicConfig(level=logging.WARNING)
35+
AUDIO_FILEPATH = os.path.join(os.path.dirname(__file__), "output.wav")
36+
37+
38+
system_message = """
39+
You are a chat bot. Your name is Mosscap and
40+
you have one goal: figure out what people need.
41+
Your full name, should you need to know it, is
42+
Splendid Speckled Mosscap. You communicate
43+
effectively, but you tend to answer with long
44+
flowery prose.
45+
"""
46+
47+
48+
chat_service = AzureChatCompletion()
49+
text_to_audio_service = AzureTextToAudio()
50+
audio_to_text_service = AzureAudioToText()
51+
52+
history = ChatHistory()
53+
history.add_user_message("Hi there, who are you?")
54+
history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")
55+
56+
57+
async def chat() -> bool:
58+
try:
59+
print("User:> ", end="", flush=True)
60+
with AudioRecorder(output_filepath=AUDIO_FILEPATH) as recorder:
61+
recorder.start_recording()
62+
user_input = await audio_to_text_service.get_text_content(AudioContent.from_audio_file(AUDIO_FILEPATH))
63+
print(user_input.text)
64+
except KeyboardInterrupt:
65+
print("\n\nExiting chat...")
66+
return False
67+
except EOFError:
68+
print("\n\nExiting chat...")
69+
return False
70+
71+
if "exit" in user_input.text.lower():
72+
print("\n\nExiting chat...")
73+
return False
74+
75+
history.add_user_message(user_input.text)
76+
77+
# No need to stream the response since we can only pass the
78+
# response to the text to audio service as a whole
79+
response = await chat_service.get_chat_message_content(
80+
chat_history=history,
81+
settings=OpenAIChatPromptExecutionSettings(
82+
max_tokens=2000,
83+
temperature=0.7,
84+
top_p=0.8,
85+
),
86+
)
87+
88+
# Need to set the response format to wav since the audio player only supports wav files
89+
audio_content = await text_to_audio_service.get_audio_content(
90+
response.content, OpenAITextToAudioExecutionSettings(response_format="wav")
91+
)
92+
print("Mosscap:> ", end="", flush=True)
93+
AudioPlayer(audio_content=audio_content).play(text=response.content)
94+
95+
history.add_message(response)
96+
97+
return True
98+
99+
100+
async def main() -> None:
101+
print(
102+
"Instruction: when it's your turn to speak, press the spacebar to start recording."
103+
" Release the spacebar to stop recording."
104+
)
105+
106+
chatting = True
107+
while chatting:
108+
chatting = await chat()
109+
110+
111+
if __name__ == "__main__":
112+
asyncio.run(main())
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
import io
4+
import logging
5+
import wave
6+
from typing import ClassVar
7+
8+
import pyaudio
9+
from pydantic import BaseModel
10+
11+
from semantic_kernel.contents import AudioContent
12+
13+
logging.basicConfig(level=logging.WARNING)
14+
logger: logging.Logger = logging.getLogger(__name__)
15+
16+
17+
class AudioPlayer(BaseModel):
    """A class to play an audio file to the default audio output device."""

    # Audio replay parameters: number of frames read/written per chunk.
    CHUNK: ClassVar[int] = 1024

    # The audio content to play; `data` is expected to be WAV bytes.
    audio_content: AudioContent

    def play(self, text: str | None = None) -> None:
        """Play the audio content to the default audio output device.

        Args:
            text (str, optional): The text to display while playing the audio. Defaults to None.
        """
        audio_stream = io.BytesIO(self.audio_content.data)
        with wave.open(audio_stream, "rb") as wf:
            audio = pyaudio.PyAudio()
            stream = audio.open(
                format=audio.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True,
            )

            if text:
                # Simulate the output of text while playing the audio:
                # read all frames first so we know how many chunks there are.
                data_frames = []

                data = wf.readframes(self.CHUNK)
                while data:
                    data_frames.append(data)
                    data = wf.readframes(self.CHUNK)

                if len(data_frames) < len(text):
                    # BUG FIX: the original passed two comma-separated strings to
                    # logger.warning, which makes logging treat the second string
                    # as a %-format argument and fail to render the message.
                    logger.warning(
                        "The audio is too short to play the entire text. "
                        "The text will be displayed without synchronization."
                    )
                    print(text)
                else:
                    for data_frame, text_frame in self._zip_text_and_audio(text, data_frames):
                        stream.write(data_frame)
                        print(text_frame, end="", flush=True)
                    print()
            else:
                # No text to display: just stream the audio chunk by chunk.
                data = wf.readframes(self.CHUNK)
                while data:
                    stream.write(data)
                    data = wf.readframes(self.CHUNK)

            stream.stop_stream()
            stream.close()
            audio.terminate()

    def _zip_text_and_audio(self, text: str, audio_frames: list) -> zip:
        """Zip the text and audio frames together so that they can be displayed in sync.

        This is done by evenly distributing empty strings between each character and
        appending the remaining empty strings at the end.

        Args:
            text (str): The text to display while playing the audio.
            audio_frames (list): The audio frames to play.

        Returns:
            zip: The zipped text and audio frames.
        """
        text_frames = list(text)
        empty_string_count = len(audio_frames) - len(text_frames)

        # BUG FIX: the original divided by empty_string_count, which is 0 when the
        # audio has exactly as many frames as the text has characters (ZeroDivisionError).
        if empty_string_count <= 0:
            return zip(audio_frames, text_frames)

        # BUG FIX: the original used len(text_frames) // empty_string_count directly,
        # which is 0 whenever there are more than twice as many padding slots as
        # characters, making the later `i % empty_string_spacing` raise ZeroDivisionError.
        empty_string_spacing = max(1, len(text_frames) // empty_string_count)

        modified_text_frames = []
        current_empty_string_count = 0
        for i, text_frame in enumerate(text_frames):
            modified_text_frames.append(text_frame)
            if current_empty_string_count < empty_string_count and i % empty_string_spacing == 0:
                modified_text_frames.append("")
                current_empty_string_count += 1

        if current_empty_string_count < empty_string_count:
            modified_text_frames.extend([""] * (empty_string_count - current_empty_string_count))

        return zip(audio_frames, modified_text_frames)

‎python/samples/concepts/audio_to_text/audio_recorder.py ‎python/samples/concepts/audio/audio_recorder.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,10 @@
66

77
import keyboard
88
import pyaudio
9+
from pydantic import BaseModel
910

10-
from semantic_kernel.kernel_pydantic import KernelBaseModel
1111

12-
13-
class AudioRecorder(KernelBaseModel):
12+
class AudioRecorder(BaseModel):
1413
"""A class to record audio from the microphone and save it to a WAV file.
1514
1615
To start recording, press the spacebar. To stop recording, release the spacebar.

‎python/samples/concepts/setup/ALL_SETTINGS.md

+20-4
Original file line numberDiff line numberDiff line change
@@ -18,27 +18,43 @@ OpenAI | [OpenAIChatCompletion](../../../semantic_kernel/connectors/ai/open_ai/s
1818
| | | ai_model_id | OPENAI_TEXT_TO_IMAGE_MODEL_ID | Yes
1919
| | | api_key | OPENAI_API_KEY | Yes
2020
| | | org_id | OPENAI_ORG_ID | No
21+
| | [OpenAITextToAudio](../../../semantic_kernel/connectors/ai/open_ai/services/open_ai_text_to_audio.py)
22+
| | | ai_model_id | OPENAI_TEXT_TO_AUDIO_MODEL_ID | Yes
23+
| | | api_key | OPENAI_API_KEY | Yes
24+
| | | org_id | OPENAI_ORG_ID | No
25+
| | [OpenAIAudioToText](../../../semantic_kernel/connectors/ai/open_ai/services/open_ai_audio_to_text.py)
26+
| | | ai_model_id | OPENAI_AUDIO_TO_TEXT_MODEL_ID | Yes
27+
| | | api_key | OPENAI_API_KEY | Yes
28+
| | | org_id | OPENAI_ORG_ID | No
2129
Azure OpenAI | [AzureOpenAIChatCompletion](../../../semantic_kernel/connectors/ai/open_ai/services/azure_chat_completion.py) | | | | [AzureOpenAISettings](../../../semantic_kernel/connectors/ai/open_ai/settings/azure_open_ai_settings.py)
2230
| | | deployment_name | AZURE_OPENAI_CHAT_DEPLOYMENT_NAME | Yes
23-
| | | api_key | AZURE_OPENAI_API_KEY | Yes
31+
| | | api_key | AZURE_OPENAI_API_KEY | No
2432
| | | endpoint | AZURE_OPENAI_ENDPOINT | Yes
2533
| | | api_version | AZURE_OPENAI_API_VERSION | Yes
2634
| | | base_url | AZURE_OPENAI_BASE_URL | Yes
2735
| | [AzureOpenAITextCompletion](../../../semantic_kernel/connectors/ai/open_ai/services/azure_text_completion.py)
2836
| | | deployment_name | AZURE_OPENAI_TEXT_DEPLOYMENT_NAME | Yes
29-
| | | api_key | AZURE_OPENAI_API_KEY | Yes
37+
| | | api_key | AZURE_OPENAI_API_KEY | No
3038
| | | endpoint | AZURE_OPENAI_ENDPOINT | Yes
3139
| | | api_version | AZURE_OPENAI_API_VERSION | Yes
3240
| | | base_url | AZURE_OPENAI_BASE_URL | Yes
3341
| | [AzureOpenAITextEmbedding](../../../semantic_kernel/connectors/ai/open_ai/services/azure_text_embedding.py)
3442
| | | deployment_name | AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME | Yes
35-
| | | api_key | AZURE_OPENAI_API_KEY | Yes
43+
| | | api_key | AZURE_OPENAI_API_KEY | No
3644
| | | endpoint | AZURE_OPENAI_ENDPOINT | Yes
3745
| | | api_version | AZURE_OPENAI_API_VERSION | Yes
3846
| | | base_url | AZURE_OPENAI_BASE_URL | Yes
3947
| | [AzureTextToImage](../../../semantic_kernel/connectors/ai/open_ai/services/azure_text_to_image.py)
4048
| | | deployment_name | AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME | Yes
41-
| | | api_key | AZURE_OPENAI_API_KEY | Yes
49+
| | | api_key | AZURE_OPENAI_API_KEY | No
50+
| | | endpoint | AZURE_OPENAI_ENDPOINT | Yes
51+
| | [AzureTextToAudio](../../../semantic_kernel/connectors/ai/open_ai/services/azure_text_to_audio.py)
52+
| | | deployment_name | AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME | Yes
53+
| | | api_key | AZURE_OPENAI_API_KEY | No
54+
| | | endpoint | AZURE_OPENAI_ENDPOINT | Yes
55+
| | [AzureAudioToText](../../../semantic_kernel/connectors/ai/open_ai/services/azure_audio_to_text.py)
56+
| | | deployment_name | AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME | Yes
57+
| | | api_key | AZURE_OPENAI_API_KEY | No
4258
| | | endpoint | AZURE_OPENAI_ENDPOINT | Yes
4359

4460
## Memory Service Settings used across SK:

‎python/semantic_kernel/connectors/ai/open_ai/__init__.py

+20
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,39 @@
1313
DataSourceFieldsMapping,
1414
ExtraBody,
1515
)
16+
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_audio_to_text_execution_settings import (
17+
OpenAIAudioToTextExecutionSettings,
18+
)
1619
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
1720
OpenAIChatPromptExecutionSettings,
1821
OpenAIEmbeddingPromptExecutionSettings,
1922
OpenAIPromptExecutionSettings,
2023
OpenAITextPromptExecutionSettings,
2124
)
25+
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import (
26+
OpenAITextToAudioExecutionSettings,
27+
)
28+
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_image_execution_settings import (
29+
OpenAITextToImageExecutionSettings,
30+
)
31+
from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
2232
from semantic_kernel.connectors.ai.open_ai.services.azure_chat_completion import AzureChatCompletion
2333
from semantic_kernel.connectors.ai.open_ai.services.azure_text_completion import AzureTextCompletion
2434
from semantic_kernel.connectors.ai.open_ai.services.azure_text_embedding import AzureTextEmbedding
35+
from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_audio import AzureTextToAudio
2536
from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_image import AzureTextToImage
37+
from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText
2638
from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion import OpenAIChatCompletion
2739
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_completion import OpenAITextCompletion
2840
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_embedding import OpenAITextEmbedding
41+
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_audio import OpenAITextToAudio
2942
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_image import OpenAITextToImage
3043

3144
__all__ = [
3245
"ApiKeyAuthentication",
3346
"AzureAISearchDataSource",
3447
"AzureAISearchDataSourceParameters",
48+
"AzureAudioToText",
3549
"AzureChatCompletion",
3650
"AzureChatPromptExecutionSettings",
3751
"AzureCosmosDBDataSource",
@@ -40,17 +54,23 @@
4054
"AzureEmbeddingDependency",
4155
"AzureTextCompletion",
4256
"AzureTextEmbedding",
57+
"AzureTextToAudio",
4358
"AzureTextToImage",
4459
"ConnectionStringAuthentication",
4560
"DataSourceFieldsMapping",
4661
"DataSourceFieldsMapping",
4762
"ExtraBody",
63+
"OpenAIAudioToText",
64+
"OpenAIAudioToTextExecutionSettings",
4865
"OpenAIChatCompletion",
4966
"OpenAIChatPromptExecutionSettings",
5067
"OpenAIEmbeddingPromptExecutionSettings",
5168
"OpenAIPromptExecutionSettings",
5269
"OpenAITextCompletion",
5370
"OpenAITextEmbedding",
5471
"OpenAITextPromptExecutionSettings",
72+
"OpenAITextToAudio",
73+
"OpenAITextToAudioExecutionSettings",
5574
"OpenAITextToImage",
75+
"OpenAITextToImageExecutionSettings",
5676
]

‎python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_audio_to_text_execution_settings.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@ class OpenAIAudioToTextExecutionSettings(PromptExecutionSettings):
1414
"""Request settings for OpenAI audio to text services."""
1515

1616
ai_model_id: str | None = Field(None, serialization_alias="model")
17-
filename: str | None = None
17+
filename: str | None = Field(
18+
None, description="Do not set this manually. It is set by the service based on the audio content."
19+
)
1820
language: str | None = None
1921
prompt: str | None = None
2022
response_format: str | None = None

‎python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_prompt_execution_settings.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,9 @@ class OpenAIPromptExecutionSettings(PromptExecutionSettings):
3838
class OpenAITextPromptExecutionSettings(OpenAIPromptExecutionSettings):
3939
"""Specific settings for the completions endpoint."""
4040

41-
prompt: str | None = None
41+
prompt: str | None = Field(
42+
None, description="Do not set this manually. It is set by the service based on the text content."
43+
)
4244
best_of: int | None = Field(None, ge=1)
4345
echo: bool = False
4446
logprobs: int | None = Field(None, ge=0, le=5)
@@ -66,7 +68,9 @@ class OpenAIChatPromptExecutionSettings(OpenAIPromptExecutionSettings):
6668
) = None
6769
function_call: str | None = None
6870
functions: list[dict[str, Any]] | None = None
69-
messages: list[dict[str, Any]] | None = None
71+
messages: list[dict[str, Any]] | None = Field(
72+
None, description="Do not set this manually. It is set by the service based on the chat history."
73+
)
7074
function_call_behavior: FunctionCallBehavior | None = Field(None, exclude=True)
7175
parallel_tool_calls: bool = True
7276
tools: list[dict[str, Any]] | None = Field(
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
import logging
4+
from typing import Literal
5+
6+
from pydantic import Field, model_validator
7+
8+
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
9+
from semantic_kernel.exceptions.service_exceptions import ServiceInvalidExecutionSettingsError
10+
11+
logger = logging.getLogger(__name__)
12+
13+
14+
class OpenAITextToAudioExecutionSettings(PromptExecutionSettings):
    """Request settings for OpenAI text to audio services."""

    ai_model_id: str | None = Field(None, serialization_alias="model")
    input: str | None = Field(
        None, description="Do not set this manually. It is set by the service based on the text content."
    )
    voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"] = "alloy"
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] | None = None
    speed: float | None = None

    @model_validator(mode="after")
    def validate_speed(self) -> "OpenAITextToAudioExecutionSettings":
        """Reject a speed outside the range supported by the service."""
        if self.speed is None:
            return self
        if not 0.25 <= self.speed <= 4.0:
            raise ServiceInvalidExecutionSettingsError("Speed must be between 0.25 and 4.0.")
        return self

‎python/semantic_kernel/connectors/ai/open_ai/services/azure_audio_to_text.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def __init__(
4242
api_key: The optional api key. If provided, will override the value in the
4343
env vars or .env file.
4444
deployment_name: The optional deployment. If provided, will override the value
45-
(text_to_image_deployment_name) in the env vars or .env file.
45+
(audio_to_text_deployment_name) in the env vars or .env file.
4646
endpoint: The optional deployment endpoint. If provided will override the value
4747
in the env vars or .env file.
4848
base_url: The optional deployment base_url. If provided will override the value
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
from collections.abc import Mapping
4+
from typing import Any, TypeVar
5+
6+
from openai import AsyncAzureOpenAI
7+
from openai.lib.azure import AsyncAzureADTokenProvider
8+
from pydantic import ValidationError
9+
10+
from semantic_kernel.connectors.ai.open_ai.services.azure_config_base import AzureOpenAIConfigBase
11+
from semantic_kernel.connectors.ai.open_ai.services.open_ai_model_types import OpenAIModelTypes
12+
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_audio_base import OpenAITextToAudioBase
13+
from semantic_kernel.connectors.ai.open_ai.settings.azure_open_ai_settings import AzureOpenAISettings
14+
from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError
15+
16+
T_ = TypeVar("T_", bound="AzureTextToAudio")
17+
18+
19+
class AzureTextToAudio(AzureOpenAIConfigBase, OpenAITextToAudioBase):
20+
"""Azure text to audio service."""
21+
22+
def __init__(
23+
self,
24+
service_id: str | None = None,
25+
api_key: str | None = None,
26+
deployment_name: str | None = None,
27+
endpoint: str | None = None,
28+
base_url: str | None = None,
29+
api_version: str | None = "2024-10-01-preview",
30+
ad_token: str | None = None,
31+
ad_token_provider: AsyncAzureADTokenProvider | None = None,
32+
token_endpoint: str | None = None,
33+
default_headers: Mapping[str, str] | None = None,
34+
async_client: AsyncAzureOpenAI | None = None,
35+
env_file_path: str | None = None,
36+
env_file_encoding: str | None = None,
37+
) -> None:
38+
"""Initialize an AzureTextToAudio service.
39+
40+
Args:
41+
service_id: The service ID. (Optional)
42+
api_key: The optional api key. If provided, will override the value in the
43+
env vars or .env file.
44+
deployment_name: The optional deployment. If provided, will override the value
45+
(text_to_audio_deployment_name) in the env vars or .env file.
46+
endpoint: The optional deployment endpoint. If provided will override the value
47+
in the env vars or .env file.
48+
base_url: The optional deployment base_url. If provided will override the value
49+
in the env vars or .env file.
50+
api_version: The optional deployment api version. If provided will override the value
51+
in the env vars or .env file. Default is "2024-10-01-preview".
52+
ad_token: The Azure AD token for authentication. (Optional)
53+
ad_token_provider: Azure AD Token provider. (Optional)
54+
token_endpoint: The Azure AD token endpoint. (Optional)
55+
default_headers: The default headers mapping of string keys to
56+
string values for HTTP requests. (Optional)
57+
async_client: An existing client to use. (Optional)
58+
env_file_path: Use the environment settings file as a fallback to
59+
environment variables. (Optional)
60+
env_file_encoding: The encoding of the environment settings file. (Optional)
61+
"""
62+
try:
63+
azure_openai_settings = AzureOpenAISettings.create(
64+
env_file_path=env_file_path,
65+
env_file_encoding=env_file_encoding,
66+
api_key=api_key,
67+
text_to_audio_deployment_name=deployment_name,
68+
endpoint=endpoint,
69+
base_url=base_url,
70+
api_version=api_version,
71+
token_endpoint=token_endpoint,
72+
)
73+
except ValidationError as exc:
74+
raise ServiceInitializationError(f"Invalid settings: {exc}") from exc
75+
if not azure_openai_settings.text_to_audio_deployment_name:
76+
raise ServiceInitializationError("The Azure OpenAI text to audio deployment name is required.")
77+
78+
super().__init__(
79+
deployment_name=azure_openai_settings.text_to_audio_deployment_name,
80+
endpoint=azure_openai_settings.endpoint,
81+
base_url=azure_openai_settings.base_url,
82+
api_version=azure_openai_settings.api_version,
83+
service_id=service_id,
84+
api_key=azure_openai_settings.api_key.get_secret_value() if azure_openai_settings.api_key else None,
85+
ad_token=ad_token,
86+
ad_token_provider=ad_token_provider,
87+
token_endpoint=azure_openai_settings.token_endpoint,
88+
default_headers=default_headers,
89+
ai_model_type=OpenAIModelTypes.TEXT_TO_AUDIO,
90+
client=async_client,
91+
)
92+
93+
@classmethod
def from_dict(cls: type[T_], settings: dict[str, Any]) -> T_:
    """Build an Azure OpenAI text-to-audio service from a settings dictionary.

    Args:
        settings: A dictionary of settings for the service.
            should contain keys: deployment_name, endpoint, api_key
            and optionally: api_version, ad_auth
    """
    accepted_keys = (
        "service_id",
        "api_key",
        "deployment_name",
        "endpoint",
        "base_url",
        "api_version",
        "ad_token",
        "ad_token_provider",
        "default_headers",
        "env_file_path",
    )
    # Keys absent from the dictionary fall back to None, matching dict.get semantics.
    return cls(**{key: settings.get(key) for key in accepted_keys})

‎python/semantic_kernel/connectors/ai/open_ai/services/open_ai_audio_to_text_base.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@
1818
)
1919
from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler
2020
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
21-
from semantic_kernel.contents.audio_content import AudioContent
22-
from semantic_kernel.contents.text_content import TextContent
21+
from semantic_kernel.contents import AudioContent, TextContent
2322

2423

2524
class OpenAIAudioToTextBase(OpenAIHandler, AudioToTextClientBase):
@@ -58,3 +57,7 @@ async def get_text_contents(
5857
inner_content=response,
5958
)
6059
]
60+
61+
def get_prompt_execution_settings_class(self) -> type[PromptExecutionSettings]:
    """Get the request settings class.

    Returns:
        The audio-to-text execution settings type this service expects;
        generic PromptExecutionSettings passed by callers are converted to it.
    """
    return OpenAIAudioToTextExecutionSettings

‎python/semantic_kernel/connectors/ai/open_ai/services/open_ai_handler.py

+25-7
Original file line numberDiff line numberDiff line change
@@ -4,26 +4,23 @@
44
from abc import ABC
55
from typing import Any, Union
66

7-
from openai import AsyncOpenAI, AsyncStream, BadRequestError
7+
from openai import AsyncOpenAI, AsyncStream, BadRequestError, _legacy_response
88
from openai.lib._parsing._completions import type_to_response_format_param
99
from openai.types import Completion, CreateEmbeddingResponse
1010
from openai.types.audio import Transcription
1111
from openai.types.chat import ChatCompletion, ChatCompletionChunk
1212
from openai.types.images_response import ImagesResponse
1313
from pydantic import BaseModel
1414

15-
from semantic_kernel.connectors.ai.open_ai.exceptions.content_filter_ai_exception import ContentFilterAIException
16-
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_audio_to_text_execution_settings import (
15+
from semantic_kernel.connectors.ai.open_ai import (
1716
OpenAIAudioToTextExecutionSettings,
18-
)
19-
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
2017
OpenAIChatPromptExecutionSettings,
2118
OpenAIEmbeddingPromptExecutionSettings,
2219
OpenAIPromptExecutionSettings,
23-
)
24-
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_image_execution_settings import (
20+
OpenAITextToAudioExecutionSettings,
2521
OpenAITextToImageExecutionSettings,
2622
)
23+
from semantic_kernel.connectors.ai.open_ai.exceptions.content_filter_ai_exception import ContentFilterAIException
2724
from semantic_kernel.connectors.ai.open_ai.services.open_ai_model_types import OpenAIModelTypes
2825
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
2926
from semantic_kernel.connectors.utils.structured_output_schema import generate_structured_output_response_format_schema
@@ -42,6 +39,7 @@
4239
list[Any],
4340
ImagesResponse,
4441
Transcription,
42+
_legacy_response.HttpxBinaryResponseContent,
4543
]
4644

4745

@@ -68,6 +66,9 @@ async def _send_request(self, settings: PromptExecutionSettings) -> RESPONSE_TYP
6866
if self.ai_model_type == OpenAIModelTypes.AUDIO_TO_TEXT:
6967
assert isinstance(settings, OpenAIAudioToTextExecutionSettings) # nosec
7068
return await self._send_audio_to_text_request(settings)
69+
if self.ai_model_type == OpenAIModelTypes.TEXT_TO_AUDIO:
70+
assert isinstance(settings, OpenAITextToAudioExecutionSettings) # nosec
71+
return await self._send_text_to_audio_request(settings)
7172

7273
raise NotImplementedError(f"Model type {self.ai_model_type} is not supported")
7374

@@ -144,6 +145,23 @@ async def _send_audio_to_text_request(self, settings: OpenAIAudioToTextExecution
144145
ex,
145146
) from ex
146147

148+
async def _send_text_to_audio_request(
    self, settings: OpenAITextToAudioExecutionSettings
) -> _legacy_response.HttpxBinaryResponseContent:
    """Send a request to the OpenAI text to audio endpoint.

    The OpenAI API returns the content of the generated audio file.

    Args:
        settings: Execution settings whose prepared dict is forwarded verbatim
            to the OpenAI ``audio.speech.create`` endpoint.

    Returns:
        The binary httpx response wrapper holding the generated audio bytes.

    Raises:
        ServiceResponseException: If the underlying client call fails for any reason;
            the original exception is attached and chained.
    """
    try:
        return await self.client.audio.speech.create(
            **settings.prepare_settings_dict(),
        )
    except Exception as ex:
        # Wrap every client-side failure so callers only handle SK exception types.
        raise ServiceResponseException(
            f"{type(self)} service failed to generate audio",
            ex,
        ) from ex
) from ex
164+
147165
def _handle_structured_output(
148166
self, request_settings: OpenAIChatPromptExecutionSettings, settings: dict[str, Any]
149167
) -> None:

‎python/semantic_kernel/connectors/ai/open_ai/services/open_ai_model_types.py

+1
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@ class OpenAIModelTypes(Enum):
1111
EMBEDDING = "embedding"
1212
TEXT_TO_IMAGE = "text-to-image"
1313
AUDIO_TO_TEXT = "audio-to-text"
14+
TEXT_TO_AUDIO = "text-to-audio"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
from collections.abc import Mapping
4+
from typing import Any, TypeVar
5+
6+
from openai import AsyncOpenAI
7+
from pydantic import ValidationError
8+
9+
from semantic_kernel.connectors.ai.open_ai.services.open_ai_config_base import OpenAIConfigBase
10+
from semantic_kernel.connectors.ai.open_ai.services.open_ai_model_types import OpenAIModelTypes
11+
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_audio_base import OpenAITextToAudioBase
12+
from semantic_kernel.connectors.ai.open_ai.settings.open_ai_settings import OpenAISettings
13+
from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError
14+
15+
T_ = TypeVar("T_", bound="OpenAITextToAudio")
16+
17+
18+
class OpenAITextToAudio(OpenAIConfigBase, OpenAITextToAudioBase):
    """OpenAI Text to Audio service."""
    # Fixed docstring: previously said "Text to Image service" (copy-paste error).

    def __init__(
        self,
        ai_model_id: str | None = None,
        api_key: str | None = None,
        org_id: str | None = None,
        service_id: str | None = None,
        default_headers: Mapping[str, str] | None = None,
        async_client: AsyncOpenAI | None = None,
        env_file_path: str | None = None,
        env_file_encoding: str | None = None,
    ) -> None:
        """Initializes a new instance of the OpenAITextToAudio class.

        Args:
            ai_model_id: OpenAI model name, see
                https://platform.openai.com/docs/models
            service_id: Service ID tied to the execution settings.
            api_key: The optional API key to use. If provided will override,
                the env vars or .env file value.
            org_id: The optional org ID to use. If provided will override,
                the env vars or .env file value.
            default_headers: The default headers mapping of string keys to
                string values for HTTP requests. (Optional)
            async_client: An existing client to use. (Optional)
            env_file_path: Use the environment settings file as
                a fallback to environment variables. (Optional)
            env_file_encoding: The encoding of the environment settings file. (Optional)

        Raises:
            ServiceInitializationError: If settings validation fails or no
                text-to-audio model id is configured.
        """
        try:
            openai_settings = OpenAISettings.create(
                api_key=api_key,
                org_id=org_id,
                text_to_audio_model_id=ai_model_id,
                env_file_path=env_file_path,
                env_file_encoding=env_file_encoding,
            )
        except ValidationError as ex:
            raise ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex
        # The model id may come from the argument, env vars, or .env file; it is mandatory.
        if not openai_settings.text_to_audio_model_id:
            raise ServiceInitializationError("The OpenAI text to audio model ID is required.")
        super().__init__(
            ai_model_id=openai_settings.text_to_audio_model_id,
            api_key=openai_settings.api_key.get_secret_value() if openai_settings.api_key else None,
            ai_model_type=OpenAIModelTypes.TEXT_TO_AUDIO,
            org_id=openai_settings.org_id,
            service_id=service_id,
            default_headers=default_headers,
            client=async_client,
        )

    @classmethod
    def from_dict(cls: type[T_], settings: dict[str, Any]) -> T_:
        """Initialize an Open AI service from a dictionary of settings.

        Args:
            settings: A dictionary of settings for the service.
        """
        return cls(
            ai_model_id=settings.get("ai_model_id"),
            api_key=settings.get("api_key"),
            org_id=settings.get("org_id"),
            service_id=settings.get("service_id"),
            default_headers=settings.get("default_headers", {}),
            env_file_path=settings.get("env_file_path"),
        )
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
import sys
4+
from typing import Any
5+
6+
from openai import _legacy_response
7+
8+
if sys.version_info >= (3, 12):
9+
from typing import override # pragma: no cover
10+
else:
11+
from typing_extensions import override # pragma: no cover
12+
13+
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import (
14+
OpenAITextToAudioExecutionSettings,
15+
)
16+
from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler
17+
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
18+
from semantic_kernel.connectors.ai.text_to_audio_client_base import TextToAudioClientBase
19+
from semantic_kernel.contents.audio_content import AudioContent
20+
21+
22+
class OpenAITextToAudioBase(OpenAIHandler, TextToAudioClientBase):
    """OpenAI text to audio client base class."""

    @override
    async def get_audio_contents(
        self,
        text: str,
        settings: PromptExecutionSettings | None = None,
        **kwargs: Any,
    ) -> list[AudioContent]:
        """Generate speech for the given text and wrap the bytes in AudioContent."""
        # Normalize the incoming settings to the OpenAI-specific type.
        if not settings:
            settings = OpenAITextToAudioExecutionSettings(ai_model_id=self.ai_model_id)
        elif not isinstance(settings, OpenAITextToAudioExecutionSettings):
            settings = self.get_prompt_execution_settings_from_settings(settings)

        assert isinstance(settings, OpenAITextToAudioExecutionSettings)  # nosec

        if settings.ai_model_id is None:
            settings.ai_model_id = self.ai_model_id
        settings.input = text

        raw_response = await self._send_request(settings)
        assert isinstance(raw_response, _legacy_response.HttpxBinaryResponseContent)  # nosec

        generated_audio = AudioContent(
            ai_model_id=settings.ai_model_id,
            data=raw_response.read(),
            data_format="base64",
        )
        return [generated_audio]

    def get_prompt_execution_settings_class(self) -> type[PromptExecutionSettings]:
        """Get the request settings class."""
        return OpenAITextToAudioExecutionSettings

‎python/semantic_kernel/connectors/ai/open_ai/services/open_ai_text_to_image_base.py

+5
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
OpenAITextToImageExecutionSettings,
1010
)
1111
from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler
12+
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
1213
from semantic_kernel.connectors.ai.text_to_image_client_base import TextToImageClientBase
1314
from semantic_kernel.exceptions.service_exceptions import ServiceResponseException
1415

@@ -42,3 +43,7 @@ async def generate_image(self, description: str, width: int, height: int, **kwar
4243
raise ServiceResponseException("Failed to generate image.")
4344

4445
return response.data[0].url
46+
47+
def get_prompt_execution_settings_class(self) -> type[PromptExecutionSettings]:
    """Get the request settings class.

    Returns:
        The text-to-image execution settings type this service expects;
        generic PromptExecutionSettings passed by callers are converted to it.
    """
    return OpenAITextToImageExecutionSettings

‎python/semantic_kernel/connectors/ai/open_ai/settings/azure_open_ai_settings.py

+7
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,12 @@ class AzureOpenAISettings(KernelBaseSettings):
4949
Resource Management > Deployments in the Azure portal or, alternatively,
5050
under Management > Deployments in Azure OpenAI Studio.
5151
(Env var AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME)
52+
- text_to_audio_deployment_name: str - The name of the Azure Text to Audio deployment. This
53+
value will correspond to the custom name you chose for your deployment
54+
when you deployed a model. This value can be found under
55+
Resource Management > Deployments in the Azure portal or, alternatively,
56+
under Management > Deployments in Azure OpenAI Studio.
57+
(Env var AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME)
5258
- api_key: SecretStr - The API key for the Azure deployment. This value can be
5359
found in the Keys & Endpoint section when examining your resource in
5460
the Azure portal. You can use either KEY1 or KEY2.
@@ -78,6 +84,7 @@ class AzureOpenAISettings(KernelBaseSettings):
7884
embedding_deployment_name: str | None = None
7985
text_to_image_deployment_name: str | None = None
8086
audio_to_text_deployment_name: str | None = None
87+
text_to_audio_deployment_name: str | None = None
8188
endpoint: HttpsUrl | None = None
8289
base_url: HttpsUrl | None = None
8390
api_key: SecretStr | None = None

‎python/semantic_kernel/connectors/ai/open_ai/settings/open_ai_settings.py

+3
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ class OpenAISettings(KernelBaseSettings):
3030
(Env var OPENAI_TEXT_TO_IMAGE_MODEL_ID)
3131
- audio_to_text_model_id: str | None - The OpenAI audio to text model ID to use, for example, whisper-1.
3232
(Env var OPENAI_AUDIO_TO_TEXT_MODEL_ID)
33+
- text_to_audio_model_id: str | None - The OpenAI text to audio model ID to use, for example, tts-1.
34+
(Env var OPENAI_TEXT_TO_AUDIO_MODEL_ID)
3335
- env_file_path: str | None - if provided, the .env settings are read from this file path location
3436
"""
3537

@@ -42,3 +44,4 @@ class OpenAISettings(KernelBaseSettings):
4244
embedding_model_id: str | None = None
4345
text_to_image_model_id: str | None = None
4446
audio_to_text_model_id: str | None = None
47+
text_to_audio_model_id: str | None = None
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
from abc import ABC, abstractmethod
4+
from typing import Any
5+
6+
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
7+
from semantic_kernel.contents.audio_content import AudioContent
8+
from semantic_kernel.services.ai_service_client_base import AIServiceClientBase
9+
10+
11+
class TextToAudioClientBase(AIServiceClientBase, ABC):
    """Base class for text to audio client."""

    @abstractmethod
    async def get_audio_contents(
        self,
        text: str,
        settings: PromptExecutionSettings | None = None,
        **kwargs: Any,
    ) -> list[AudioContent]:
        """Get audio contents from text.

        Args:
            text: The text to convert to audio.
            settings: Prompt execution settings.
            kwargs: Additional arguments.

        Returns:
            list[AudioContent]: The generated audio contents.

        Some services may return multiple audio contents in one call, while
        others return only one; returning a list with a single element is acceptable.
        """
        raise NotImplementedError

    async def get_audio_content(
        self,
        text: str,
        settings: PromptExecutionSettings | None = None,
        **kwargs: Any,
    ) -> AudioContent:
        """Get audio content from text.

        Convenience wrapper over ``get_audio_contents`` that returns only the
        first generated audio content.

        Args:
            text: The text to convert to audio.
            settings: Prompt execution settings.
            kwargs: Additional arguments.

        Returns:
            AudioContent: The generated audio content.
        """
        return (await self.get_audio_contents(text, settings, **kwargs))[0]

‎python/semantic_kernel/contents/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Copyright (c) Microsoft. All rights reserved.
22

33
from semantic_kernel.contents.annotation_content import AnnotationContent
4+
from semantic_kernel.contents.audio_content import AudioContent
45
from semantic_kernel.contents.chat_history import ChatHistory
56
from semantic_kernel.contents.chat_message_content import ChatMessageContent
67
from semantic_kernel.contents.function_call_content import FunctionCallContent
@@ -16,6 +17,7 @@
1617

1718
__all__ = [
1819
"AnnotationContent",
20+
"AudioContent",
1921
"AuthorRole",
2022
"ChatHistory",
2123
"ChatMessageContent",

‎python/semantic_kernel/contents/binary_content.py

+5
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,11 @@ def from_element(cls: type[_T], element: Element) -> _T:
165165

166166
return cls(uri=element.get("uri", None))
167167

168+
def write_to_file(self, path: str | FilePath) -> None:
    """Persist this content's raw bytes to the given file path."""
    with open(path, "wb") as out_stream:
        out_stream.write(self.data)
172+
168173
def to_dict(self) -> dict[str, Any]:
169174
"""Convert the instance to a dictionary."""
170175
return {"type": "binary", "binary": {"uri": str(self)}}

‎python/tests/conftest.py

+2
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,7 @@ def azure_openai_unit_test_env(monkeypatch, exclude_list, override_env_param_dic
231231
"AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME": "test_embedding_deployment",
232232
"AZURE_OPENAI_TEXT_TO_IMAGE_DEPLOYMENT_NAME": "test_text_to_image_deployment",
233233
"AZURE_OPENAI_AUDIO_TO_TEXT_DEPLOYMENT_NAME": "test_audio_to_text_deployment",
234+
"AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME": "test_text_to_audio_deployment",
234235
"AZURE_OPENAI_API_KEY": "test_api_key",
235236
"AZURE_OPENAI_ENDPOINT": "https://test-endpoint.com",
236237
"AZURE_OPENAI_API_VERSION": "2023-03-15-preview",
@@ -266,6 +267,7 @@ def openai_unit_test_env(monkeypatch, exclude_list, override_env_param_dict):
266267
"OPENAI_EMBEDDING_MODEL_ID": "test_embedding_model_id",
267268
"OPENAI_TEXT_TO_IMAGE_MODEL_ID": "test_text_to_image_model_id",
268269
"OPENAI_AUDIO_TO_TEXT_MODEL_ID": "test_audio_to_text_model_id",
270+
"OPENAI_TEXT_TO_AUDIO_MODEL_ID": "test_text_to_audio_model_id",
269271
}
270272

271273
env_vars.update(override_env_param_dict)

‎python/tests/integration/audio_to_text/audio_to_text_test_base.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@
55
import pytest
66

77
from semantic_kernel.connectors.ai.audio_to_text_client_base import AudioToTextClientBase
8-
from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
9-
from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText
10-
from tests.integration.test_utils import is_service_setup_for_testing
8+
from semantic_kernel.connectors.ai.open_ai import AzureAudioToText, OpenAIAudioToText
9+
from tests.integration.utils import is_service_setup_for_testing
1110

1211
# There is only the whisper model available on Azure OpenAI for audio to text. And that model is
1312
# only available in the North Switzerland region. Therefore, the endpoint is different than the one

‎python/tests/integration/audio_to_text/test_audio_to_text.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import pytest
66

77
from semantic_kernel.connectors.ai.audio_to_text_client_base import AudioToTextClientBase
8-
from semantic_kernel.contents.audio_content import AudioContent
8+
from semantic_kernel.contents import AudioContent
99
from tests.integration.audio_to_text.audio_to_text_test_base import AudioToTextTestBase
1010

1111
pytestmark = pytest.mark.parametrize(
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
4+
import pytest
5+
6+
from semantic_kernel.connectors.ai.text_to_audio_client_base import TextToAudioClientBase
7+
from semantic_kernel.contents import AudioContent
8+
from tests.integration.text_to_audio.text_to_audio_test_base import TextToAudioTestBase
9+
10+
pytestmark = pytest.mark.parametrize(
11+
"service_id, text",
12+
[
13+
pytest.param(
14+
"openai",
15+
"Hello World!",
16+
id="openai",
17+
),
18+
pytest.param(
19+
"azure_openai",
20+
"Hello World!",
21+
id="azure_openai",
22+
),
23+
],
24+
)
25+
26+
27+
@pytest.mark.asyncio(scope="module")
class TestTextToAudio(TextToAudioTestBase):
    """Test text-to-audio services."""

    @pytest.mark.asyncio
    async def test_text_to_audio(
        self,
        services: dict[str, TextToAudioClientBase],
        service_id: str,
        text: str,
    ) -> None:
        """Test text-to-audio services.

        Renamed from ``test_audio_to_text``: this test exercises the
        text-to-audio direction, and the old name described the opposite flow.

        Args:
            services: text-to-audio services.
            service_id: Service ID.
            text: Text content.
        """
        service = services[service_id]
        result = await service.get_audio_content(text)

        assert isinstance(result, AudioContent)
        assert result.data is not None
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
import os
4+
5+
import pytest
6+
7+
from semantic_kernel.connectors.ai.open_ai import AzureTextToAudio, OpenAITextToAudio
8+
from semantic_kernel.connectors.ai.text_to_audio_client_base import TextToAudioClientBase
9+
from tests.integration.utils import is_service_setup_for_testing
10+
11+
# The TTS model on Azure OpenAI is not available in the regions where we have chat completion models.
# Therefore, we need to use a different endpoint for testing.
13+
is_service_setup_for_testing(["AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT"])
14+
15+
16+
class TextToAudioTestBase:
    """Base class for testing text-to-audio services."""

    @pytest.fixture(scope="module")
    def services(self) -> dict[str, TextToAudioClientBase]:
        """Return text-to-audio services.

        Builds one OpenAI and one Azure OpenAI client, keyed by the service id
        used in the parametrized tests. The Azure client reads its endpoint
        from AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT; both clients otherwise pull
        credentials/model ids from env vars or a .env file.
        """
        return {
            "openai": OpenAITextToAudio(),
            "azure_openai": AzureTextToAudio(endpoint=os.environ["AZURE_OPENAI_TEXT_TO_AUDIO_ENDPOINT"]),
        }

‎python/tests/unit/connectors/ai/open_ai/services/test_azure_audio_to_text.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
from openai.resources.audio.transcriptions import AsyncTranscriptions
99
from openai.types.audio import Transcription
1010

11-
from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
12-
from semantic_kernel.contents.audio_content import AudioContent
11+
from semantic_kernel.connectors.ai.open_ai import AzureAudioToText
12+
from semantic_kernel.contents import AudioContent
1313
from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError, ServiceInvalidRequestError
1414

1515

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
from unittest.mock import patch
4+
5+
import httpx
6+
import pytest
7+
from openai import AsyncAzureOpenAI, _legacy_response
8+
from openai.resources.audio.speech import AsyncSpeech
9+
10+
from semantic_kernel.connectors.ai.open_ai import AzureTextToAudio
11+
from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError
12+
13+
14+
def test_azure_text_to_audio_init(azure_openai_unit_test_env) -> None:
15+
azure_text_to_audio = AzureTextToAudio()
16+
17+
assert azure_text_to_audio.client is not None
18+
assert isinstance(azure_text_to_audio.client, AsyncAzureOpenAI)
19+
assert azure_text_to_audio.ai_model_id == azure_openai_unit_test_env["AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME"]
20+
21+
22+
@pytest.mark.parametrize("exclude_list", [["AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME"]], indirect=True)
23+
def test_azure_text_to_audio_init_with_empty_deployment_name(azure_openai_unit_test_env) -> None:
24+
with pytest.raises(ServiceInitializationError, match="The Azure OpenAI text to audio deployment name is required."):
25+
AzureTextToAudio(env_file_path="test.env")
26+
27+
28+
@pytest.mark.parametrize("exclude_list", [["AZURE_OPENAI_API_KEY"]], indirect=True)
29+
def test_azure_text_to_audio_init_with_empty_api_key(azure_openai_unit_test_env) -> None:
30+
with pytest.raises(ServiceInitializationError):
31+
AzureTextToAudio(env_file_path="test.env")
32+
33+
34+
@pytest.mark.parametrize("exclude_list", [["AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_BASE_URL"]], indirect=True)
35+
def test_azure_text_to_audio_init_with_empty_endpoint_and_base_url(azure_openai_unit_test_env) -> None:
36+
with pytest.raises(ServiceInitializationError, match="Please provide an endpoint or a base_url"):
37+
AzureTextToAudio(env_file_path="test.env")
38+
39+
40+
@pytest.mark.parametrize("override_env_param_dict", [{"AZURE_OPENAI_ENDPOINT": "http://test.com"}], indirect=True)
41+
def test_azure_text_to_audio_init_with_invalid_http_endpoint(azure_openai_unit_test_env) -> None:
42+
with pytest.raises(ServiceInitializationError, match="Invalid settings: "):
43+
AzureTextToAudio()
44+
45+
46+
@pytest.mark.parametrize(
47+
"override_env_param_dict",
48+
[{"AZURE_OPENAI_BASE_URL": "https://test_text_to_audio_deployment.test-base-url.com"}],
49+
indirect=True,
50+
)
51+
def test_azure_text_to_audio_init_with_from_dict(azure_openai_unit_test_env) -> None:
52+
default_headers = {"test_header": "test_value"}
53+
54+
settings = {
55+
"deployment_name": azure_openai_unit_test_env["AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME"],
56+
"endpoint": azure_openai_unit_test_env["AZURE_OPENAI_ENDPOINT"],
57+
"api_key": azure_openai_unit_test_env["AZURE_OPENAI_API_KEY"],
58+
"api_version": azure_openai_unit_test_env["AZURE_OPENAI_API_VERSION"],
59+
"default_headers": default_headers,
60+
}
61+
62+
azure_text_to_audio = AzureTextToAudio.from_dict(settings=settings)
63+
64+
assert azure_text_to_audio.client is not None
65+
assert isinstance(azure_text_to_audio.client, AsyncAzureOpenAI)
66+
assert azure_text_to_audio.ai_model_id == azure_openai_unit_test_env["AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME"]
67+
assert settings["deployment_name"] in str(azure_text_to_audio.client.base_url)
68+
assert azure_text_to_audio.client.api_key == azure_openai_unit_test_env["AZURE_OPENAI_API_KEY"]
69+
70+
# Assert that the default header we added is present in the client's default headers
71+
for key, value in default_headers.items():
72+
assert key in azure_text_to_audio.client.default_headers
73+
assert azure_text_to_audio.client.default_headers[key] == value
74+
75+
76+
@pytest.mark.asyncio
@patch.object(AsyncSpeech, "create", return_value=_legacy_response.HttpxBinaryResponseContent(httpx.Response(200)))
async def test_azure_text_to_audio_get_audio_contents(mock_speech_create, azure_openai_unit_test_env) -> None:
    """get_audio_contents returns one AudioContent tagged with the deployment name."""
    # Renamed local variable: this is an Azure text-to-audio client,
    # not an "openai_audio_to_text" one as the original (misleading) name said.
    azure_text_to_audio = AzureTextToAudio()

    audio_contents = await azure_text_to_audio.get_audio_contents("Hello World!")
    assert len(audio_contents) == 1
    assert audio_contents[0].ai_model_id == azure_openai_unit_test_env["AZURE_OPENAI_TEXT_TO_AUDIO_DEPLOYMENT_NAME"]

‎python/tests/unit/connectors/ai/open_ai/services/test_openai_audio_to_text.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@
99
from openai.resources.audio.transcriptions import AsyncTranscriptions
1010
from openai.types.audio import Transcription
1111

12+
from semantic_kernel.connectors.ai.open_ai import OpenAIAudioToTextExecutionSettings
1213
from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText
13-
from semantic_kernel.contents.audio_content import AudioContent
14+
from semantic_kernel.contents import AudioContent
1415
from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError, ServiceInvalidRequestError
1516

1617

@@ -57,6 +58,11 @@ def test_init_to_from_dict(openai_unit_test_env):
5758
assert dumped_settings["api_key"] == settings["api_key"]
5859

5960

61+
def test_prompt_execution_settings_class(openai_unit_test_env) -> None:
62+
openai_audio_to_text = OpenAIAudioToText()
63+
assert openai_audio_to_text.get_prompt_execution_settings_class() == OpenAIAudioToTextExecutionSettings
64+
65+
6066
@pytest.mark.asyncio
6167
@patch.object(AsyncTranscriptions, "create", return_value=Transcription(text="This is a test audio file."))
6268
async def test_get_text_contents(mock_transcription_create, openai_unit_test_env):
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Copyright (c) Microsoft. All rights reserved.
2+
3+
4+
from unittest.mock import patch
5+
6+
import httpx
7+
import pytest
8+
from openai import AsyncClient, _legacy_response
9+
from openai.resources.audio.speech import AsyncSpeech
10+
11+
from semantic_kernel.connectors.ai.open_ai import OpenAITextToAudio, OpenAITextToAudioExecutionSettings
12+
from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError
13+
14+
15+
def test_init(openai_unit_test_env):
16+
openai_text_to_audio = OpenAITextToAudio()
17+
18+
assert openai_text_to_audio.client is not None
19+
assert isinstance(openai_text_to_audio.client, AsyncClient)
20+
assert openai_text_to_audio.ai_model_id == openai_unit_test_env["OPENAI_TEXT_TO_AUDIO_MODEL_ID"]
21+
22+
23+
def test_init_validation_fail() -> None:
24+
with pytest.raises(ServiceInitializationError, match="Failed to create OpenAI settings."):
25+
OpenAITextToAudio(api_key="34523", ai_model_id={"test": "dict"})
26+
27+
28+
@pytest.mark.parametrize("exclude_list", [["OPENAI_TEXT_TO_AUDIO_MODEL_ID"]], indirect=True)
29+
def test_init_text_to_audio_model_not_provided(openai_unit_test_env) -> None:
30+
with pytest.raises(ServiceInitializationError, match="The OpenAI text to audio model ID is required."):
31+
OpenAITextToAudio(
32+
env_file_path="test.env",
33+
)
34+
35+
36+
@pytest.mark.parametrize("exclude_list", [["OPENAI_API_KEY"]], indirect=True)
37+
def test_init_with_empty_api_key(openai_unit_test_env) -> None:
38+
with pytest.raises(ServiceInitializationError):
39+
OpenAITextToAudio(
40+
env_file_path="test.env",
41+
)
42+
43+
44+
def test_init_to_from_dict(openai_unit_test_env):
45+
default_headers = {"X-Unit-Test": "test-guid"}
46+
47+
settings = {
48+
"ai_model_id": openai_unit_test_env["OPENAI_TEXT_TO_AUDIO_MODEL_ID"],
49+
"api_key": openai_unit_test_env["OPENAI_API_KEY"],
50+
"default_headers": default_headers,
51+
}
52+
audio_to_text = OpenAITextToAudio.from_dict(settings)
53+
dumped_settings = audio_to_text.to_dict()
54+
assert dumped_settings["ai_model_id"] == settings["ai_model_id"]
55+
assert dumped_settings["api_key"] == settings["api_key"]
56+
57+
58+
def test_prompt_execution_settings_class(openai_unit_test_env) -> None:
59+
openai_text_to_audio = OpenAITextToAudio()
60+
assert openai_text_to_audio.get_prompt_execution_settings_class() == OpenAITextToAudioExecutionSettings
61+
62+
63+
@pytest.mark.asyncio
@patch.object(AsyncSpeech, "create", return_value=_legacy_response.HttpxBinaryResponseContent(httpx.Response(200)))
async def test_get_audio_contents(mock_speech_create, openai_unit_test_env):
    """get_audio_contents returns one AudioContent tagged with the model id.

    Renamed from ``test_get_text_contents``: the function under test is
    ``get_audio_contents`` (text-to-audio), not a text-content API.
    """
    openai_text_to_audio = OpenAITextToAudio()

    audio_contents = await openai_text_to_audio.get_audio_contents("Hello World!")
    assert len(audio_contents) == 1
    assert audio_contents[0].ai_model_id == openai_unit_test_env["OPENAI_TEXT_TO_AUDIO_MODEL_ID"]

‎python/tests/unit/connectors/ai/open_ai/services/test_openai_text_to_image.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from openai.types.image import Image
99
from openai.types.images_response import ImagesResponse
1010

11-
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_image import OpenAITextToImage
11+
from semantic_kernel.connectors.ai.open_ai import OpenAITextToImage, OpenAITextToImageExecutionSettings
1212
from semantic_kernel.exceptions.service_exceptions import (
1313
ServiceInitializationError,
1414
ServiceInvalidExecutionSettingsError,
@@ -59,6 +59,11 @@ def test_init_with_no_model_id(openai_unit_test_env) -> None:
5959
)
6060

6161

62+
def test_prompt_execution_settings_class(openai_unit_test_env) -> None:
63+
openai_text_to_image = OpenAITextToImage()
64+
assert openai_text_to_image.get_prompt_execution_settings_class() == OpenAITextToImageExecutionSettings
65+
66+
6267
@pytest.mark.asyncio
6368
@patch.object(AsyncImages, "generate", return_value=AsyncMock(spec=ImagesResponse))
6469
async def test_generate_calls_with_parameters(mock_generate, openai_unit_test_env) -> None:

0 commit comments

Comments
 (0)
Please sign in to comment.