markwallace-microsoft · Feb 19, 2025
diff --git a/‎python/samples/concepts/caching/semantic_caching.py
+143 b/‎python/samples/concepts/caching/semantic_caching.py
+143
diff --git a/‎python/samples/concepts/filtering/function_invocation_filters_stream.py
+15-12 b/‎python/samples/concepts/filtering/function_invocation_filters_stream.py
+15-12
diff --git a/‎python/samples/concepts/filtering/retry_with_different_model.py
+98 b/‎python/samples/concepts/filtering/retry_with_different_model.py
+98
diff --git a/‎python/samples/concepts/filtering/retry_with_filters.py
+3-4 b/‎python/samples/concepts/filtering/retry_with_filters.py
+3-4
diff --git a/‎python/semantic_kernel/connectors/ai/chat_completion_client_base.py
+3 b/‎python/semantic_kernel/connectors/ai/chat_completion_client_base.py
+3
diff --git a/‎python/semantic_kernel/connectors/memory/in_memory/in_memory_collection.py
+4-4 b/‎python/semantic_kernel/connectors/memory/in_memory/in_memory_collection.py
+4-4
@@ -0,0 +1,143 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import asyncio
+import time
+from collections.abc import Awaitable, Callable
+from dataclasses import dataclass, field
+from typing import Annotated
+from uuid import uuid4
+
+from semantic_kernel.connectors.ai.embeddings.embedding_generator_base import EmbeddingGeneratorBase
+from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion import OpenAIChatCompletion
+from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_embedding import OpenAITextEmbedding
+from semantic_kernel.connectors.memory.in_memory.in_memory_store import InMemoryVectorStore
+from semantic_kernel.data.record_definition import vectorstoremodel
+from semantic_kernel.data.record_definition.vector_store_record_fields import (
+    VectorStoreRecordDataField,
+    VectorStoreRecordKeyField,
+    VectorStoreRecordVectorField,
+)
+from semantic_kernel.data.vector_search.vector_search_options import VectorSearchOptions
+from semantic_kernel.data.vector_search.vectorized_search import VectorizedSearchMixin
+from semantic_kernel.data.vector_storage.vector_store import VectorStore
+from semantic_kernel.data.vector_storage.vector_store_record_collection import VectorStoreRecordCollection
+from semantic_kernel.filters.filter_types import FilterTypes
+from semantic_kernel.filters.functions.function_invocation_context import FunctionInvocationContext
+from semantic_kernel.filters.prompts.prompt_render_context import PromptRenderContext
+from semantic_kernel.functions.function_result import FunctionResult
+from semantic_kernel.kernel import Kernel
+
+COLLECTION_NAME = "llm_responses"
+RECORD_ID_KEY = "cache_record_id"
+
+
+# Define a simple data model to store the prompt, the result, and the prompt embedding.
+@vectorstoremodel
+@dataclass
+class CacheRecord:
+    """A cached response, stored together with the prompt that produced it.
+
+    The field annotations declare the role of each attribute in the vector store
+    record: a key field, two data fields and one vector field.
+    """
+
+    # The rendered prompt; its annotation names ``prompt_embedding`` as the field that holds its embedding.
+    prompt: Annotated[str, VectorStoreRecordDataField(embedding_property_name="prompt_embedding")]
+    # The result that was produced for the prompt, stored as text.
+    result: Annotated[str, VectorStoreRecordDataField(is_full_text_searchable=True)]
+    # Embedding of ``prompt``. OpenAI's text-embedding-ada-002 and text-embedding-3-small models return
+    # 1536-dimensional vectors; adjust this value if a model with a different vector size is configured.
+    prompt_embedding: Annotated[list[float], VectorStoreRecordVectorField(dimensions=1536)] = field(
+        default_factory=list
+    )
+    # Record key; a random UUID string is generated when no id is supplied.
+    id: Annotated[str, VectorStoreRecordKeyField] = field(default_factory=lambda: str(uuid4()))
+
+
+# Define the filters, one for caching the results and one for using the cache.
+class PromptCacheFilter:
+    """A semantic cache for prompt functions, implemented as a pair of filters.
+
+    ``on_prompt_render`` looks the rendered prompt up in the cache and, when a close
+    enough match exists, sets the cached value as the function result.
+    ``on_function_invocation`` stores results that did not come from the cache.
+    """
+
+    def __init__(
+        self,
+        embedding_service: EmbeddingGeneratorBase,
+        vector_store: VectorStore,
+        collection_name: str = COLLECTION_NAME,
+        score_threshold: float = 0.2,
+    ):
+        """Initialize the filter.
+
+        Args:
+            embedding_service: The service used to embed the rendered prompts.
+            vector_store: The store from which the cache collection is obtained.
+            collection_name: The name of the collection that holds the cache records.
+            score_threshold: A search hit is only used when its score is below this value.
+        """
+        self.embedding_service = embedding_service
+        self.vector_store = vector_store
+        self.collection: VectorStoreRecordCollection[str, CacheRecord] = vector_store.get_collection(
+            collection_name, data_model_type=CacheRecord
+        )
+        self.score_threshold = score_threshold
+
+    async def on_prompt_render(
+        self, context: PromptRenderContext, next: Callable[[PromptRenderContext], Awaitable[None]]
+    ):
+        """Filter to look up the rendered prompt in the cache and reuse a cached result.
+
+        After the prompt is rendered, its embedding is used to search the collection
+        for the single closest record. The score threshold determines if that record
+        is used as the function result.
+        The direction of the comparison is based on the default distance metric for
+        the in memory vector store, which is cosine distance, so the closer to 0 the
+        closer the match.
+        """
+        await next(context)
+        assert context.rendered_prompt  # nosec
+        prompt_embedding = await self.embedding_service.generate_raw_embeddings([context.rendered_prompt])
+        await self.collection.create_collection_if_not_exists()
+        assert isinstance(self.collection, VectorizedSearchMixin)  # nosec
+        results = await self.collection.vectorized_search(
+            vector=prompt_embedding[0], options=VectorSearchOptions(vector_field_name="prompt_embedding", top=1)
+        )
+        async for result in results.results:
+            # a result without a score cannot be compared, so it is never treated as a hit
+            if result.score is not None and result.score < self.score_threshold:
+                # the record id in the metadata marks the result as coming from the cache,
+                # on_function_invocation checks for it so the result is not stored again
+                context.function_result = FunctionResult(
+                    function=context.function.metadata,
+                    value=result.record.result,
+                    rendered_prompt=context.rendered_prompt,
+                    metadata={RECORD_ID_KEY: result.record.id},
+                )
+                # the closest match has been applied, no need to look at further results
+                break
+
+    async def on_function_invocation(
+        self, context: FunctionInvocationContext, next: Callable[[FunctionInvocationContext], Awaitable[None]]
+    ):
+        """Filter to store the result in the cache if it is new.
+
+        A result is stored when it has a rendered prompt and its metadata does not
+        contain the cache record id, i.e. it was not served from the cache.
+        """
+        await next(context)
+        result = context.result
+        if not result or not result.rendered_prompt or RECORD_ID_KEY in result.metadata:
+            # nothing to store: no result, no rendered prompt, or the result came from the cache
+            return
+        prompt_embedding = await self.embedding_service.generate_embeddings([result.rendered_prompt])
+        cache_record = CacheRecord(
+            prompt=result.rendered_prompt,
+            result=str(result),
+            prompt_embedding=prompt_embedding[0],
+        )
+        await self.collection.create_collection_if_not_exists()
+        await self.collection.upsert(cache_record)
+
+
+async def execute_async(kernel: Kernel, title: str, prompt: str):
+    """Helper method to invoke a prompt and log the elapsed time.
+
+    Args:
+        kernel: The kernel on which the prompt is invoked.
+        title: A label that is printed in front of the prompt.
+        prompt: The prompt to invoke.
+
+    Returns:
+        The value returned by ``kernel.invoke_prompt``.
+    """
+    print(f"{title}: {prompt}")
+    # perf_counter is monotonic, so the measurement is not affected by system clock adjustments
+    start = time.perf_counter()
+    result = await kernel.invoke_prompt(prompt)
+    elapsed = time.perf_counter() - start
+    print(f"\tElapsed Time: {elapsed:.3f}")
+    return result
+
+
+async def main():
+    """Run three prompts through a kernel that has the cache filters installed."""
+    # create the chat service and the embedding service and add both to a new kernel
+    chat_service = OpenAIChatCompletion(service_id="default")
+    embedding_service = OpenAITextEmbedding(service_id="embedder")
+    kernel = Kernel()
+    kernel.add_service(chat_service)
+    kernel.add_service(embedding_service)
+    # create the cache filter on top of an in-memory vector store and register both of its filters
+    cache = PromptCacheFilter(embedding_service=embedding_service, vector_store=InMemoryVectorStore())
+    kernel.add_filter(FilterTypes.PROMPT_RENDERING, cache.on_prompt_render)
+    kernel.add_filter(FilterTypes.FUNCTION_INVOCATION, cache.on_function_invocation)
+
+    # Run the sample, the third prompt is a rephrasing of the first one
+    print("\nIn-memory cache sample:")
+    runs = (
+        ("First run", "What's the tallest building in New York?"),
+        ("Second run", "How are you today?"),
+        ("Third run", "What is the highest building in New York City?"),
+    )
+    for number, (title, prompt) in enumerate(runs, start=1):
+        answer = await execute_async(kernel, title, prompt)
+        print(f"\tResult {number}: {answer}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
@@ -4,7 +4,6 @@
 import logging
 import os
 from collections.abc import Callable, Coroutine
-from functools import reduce
 from typing import Any
 
 from semantic_kernel import Kernel
@@ -38,17 +37,21 @@ async def streaming_exception_handling(
 ):
     await next(context)
 
-    async def override_stream(stream):
-        try:
-            async for partial in stream:
-                yield partial
-        except Exception as e:
-            yield [
-                StreamingChatMessageContent(role=AuthorRole.ASSISTANT, content=f"Exception caught: {e}", choice_index=0)
-            ]
+    if context.is_streaming:
 
-    stream = context.result.value
-    context.result = FunctionResult(function=context.result.function, value=override_stream(stream))
+        async def override_stream(stream):
+            try:
+                async for partial in stream:
+                    yield partial
+            except Exception as e:
+                yield [
+                    StreamingChatMessageContent(
+                        role=AuthorRole.ASSISTANT, content=f"Exception caught: {e}", choice_index=0
+                    )
+                ]
+
+        stream = context.result.value
+        context.result = FunctionResult(function=context.result.function, value=override_stream(stream))
 
 
 async def chat(chat_history: ChatHistory) -> bool:
@@ -77,7 +80,7 @@ async def chat(chat_history: ChatHistory) -> bool:
     print("")
     chat_history.add_user_message(user_input)
     if streamed_chunks:
-        streaming_chat_message = reduce(lambda first, second: first + second, streamed_chunks)
+        streaming_chat_message = sum(streamed_chunks[1:], streamed_chunks[0])
         chat_history.add_message(streaming_chat_message)
     return True
 
 
@@ -0,0 +1,98 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import asyncio
+import logging
+from collections.abc import Awaitable, Callable
+
+from semantic_kernel import Kernel
+from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
+    OpenAIChatPromptExecutionSettings,
+)
+from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion import OpenAIChatCompletion
+from semantic_kernel.filters import FunctionInvocationContext
+from semantic_kernel.filters.filter_types import FilterTypes
+from semantic_kernel.functions.kernel_arguments import KernelArguments
+
+# This sample shows how to use a filter to fall back to a second service
+# if the default service fails to execute the function.
+# It works by replacing the settings that point to the default service
+# with settings that point to the fallback service, and then invoking the function again.
+
+logger = logging.getLogger(__name__)
+
+
+class RetryFilter:
+    """A filter that retries the function invocation with a different model if it fails."""
+
+    def __init__(self, default_service_id: str, fallback_service_id: str):
+        """Initialize the filter with the default and fallback service ids.
+
+        Args:
+            default_service_id: The id under which the failing settings are looked up.
+            fallback_service_id: The id the settings are re-targeted to for the retry.
+        """
+        self.default_service_id = default_service_id
+        self.fallback_service_id = fallback_service_id
+
+    async def retry_filter(
+        self,
+        context: FunctionInvocationContext,
+        next: Callable[[FunctionInvocationContext], Awaitable[None]],
+    ) -> None:
+        """A filter that retries the function invocation with a different model if it fails.
+
+        The function is first invoked as-is. If that raises and the arguments hold
+        execution settings for the default service, those settings are moved to the
+        fallback service id and the function is invoked once more. Otherwise the
+        original exception is re-raised.
+        """
+        try:
+            # try the default function
+            await next(context)
+        except Exception as ex:
+            print("Expected failure to execute the function: ", ex)
+            execution_settings = context.arguments.execution_settings if context.arguments else None
+            if not execution_settings or self.default_service_id not in execution_settings:
+                # there are no settings to redirect, so let the original exception propagate;
+                # a bare raise re-raises it without adding this line to the traceback
+                raise
+            # get the settings for the default service
+            settings = execution_settings.pop(self.default_service_id)
+            settings.service_id = self.fallback_service_id
+            # add them back with the right service id
+            execution_settings[self.fallback_service_id] = settings
+            # try again!
+            await next(context)
+
+
+async def main() -> None:
+    """Invoke a prompt that targets a service with an invalid key and let the filter retry it."""
+    # the ids under which the default and fallback services are registered
+    default_service_id = "default_service"
+    fallback_service_id = "fallback_service"
+
+    kernel = Kernel()
+    # add the default service, created with an invalid key, and the fallback service, created without an explicit key
+    kernel.add_service(OpenAIChatCompletion(service_id=default_service_id, api_key="invalid_key"))
+    kernel.add_service(OpenAIChatCompletion(service_id=fallback_service_id))
+    # create the filter with the ids and add it to the kernel
+    kernel.add_filter(
+        FilterTypes.FUNCTION_INVOCATION,
+        RetryFilter(default_service_id=default_service_id, fallback_service_id=fallback_service_id).retry_filter,
+    )
+
+    # the settings for the request point to the default service
+    arguments = KernelArguments(settings=OpenAIChatPromptExecutionSettings(service_id=default_service_id))
+    # invoke a simple prompt function
+    response = await kernel.invoke_prompt(
+        function_name="retry_function",
+        prompt="How are you today?",
+        arguments=arguments,
+    )
+
+    print("Model response: ", response)
+
+    # Sample output:
+    # Expected failure to execute the function:  Error occurred while invoking function retry_function:
+    # ("<class 'semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion.OpenAIChatCompletion'> service
+    # failed to complete the prompt", AuthenticationError("Error code: 401 - {'error': {'message': 'Incorrect API key
+    # provided: invalid_key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type':
+    # 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}"))
+    # Model response:  I'm just a program, so I don't experience feelings, but I'm here and ready to help you out.
+    # How can I assist you today?
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
@@ -2,8 +2,7 @@
 
 import asyncio
 import logging
-from collections.abc import Callable, Coroutine
-from typing import Any
+from collections.abc import Awaitable, Callable
 
 from samples.concepts.setup.chat_completion_services import Services, get_chat_completion_service_and_request_settings
 from semantic_kernel import Kernel
@@ -34,7 +33,7 @@ def __init__(self):
         self._invocation_count = 0
 
     @kernel_function(name="GetWeather", description="Get the weather of the day at the current location.")
-    def get_wather(self) -> str:
+    def get_weather(self) -> str:
         """Get the weather of the day at the current location.
 
         Simulates a call to an external service to get the weather.
@@ -50,7 +49,7 @@ def get_wather(self) -> str:
 
 async def retry_filter(
     context: FunctionInvocationContext,
-    next: Callable[[FunctionInvocationContext], Coroutine[Any, Any, None]],
+    next: Callable[[FunctionInvocationContext], Awaitable[None]],
 ) -> None:
     """A filter that retries the function invocation if it fails.
 
 
@@ -157,6 +157,7 @@ async def get_chat_message_contents(
                             function_call=function_call,
                             chat_history=chat_history,
                             arguments=kwargs.get("arguments"),
+                            execution_settings=settings,
                             function_call_count=fc_count,
                             request_index=request_index,
                             function_behavior=settings.function_choice_behavior,
@@ -289,6 +290,8 @@ async def get_streaming_chat_message_contents(
                             function_call=function_call,
                             chat_history=chat_history,
                             arguments=kwargs.get("arguments"),
+                            is_streaming=True,
+                            execution_settings=settings,
                             function_call_count=fc_count,
                             request_index=request_index,
                             function_behavior=settings.function_choice_behavior,
 
@@ -6,16 +6,15 @@
 
 from pydantic import Field
 
-from semantic_kernel.data.filter_clauses.any_tags_equal_to_filter_clause import AnyTagsEqualTo
-from semantic_kernel.data.filter_clauses.equal_to_filter_clause import EqualTo
-
 if sys.version_info >= (3, 12):
     from typing import override  # pragma: no cover
 else:
     from typing_extensions import override  # pragma: no cover
 
 from semantic_kernel.connectors.memory.in_memory.const import DISTANCE_FUNCTION_MAP
 from semantic_kernel.data.const import DistanceFunction
+from semantic_kernel.data.filter_clauses.any_tags_equal_to_filter_clause import AnyTagsEqualTo
+from semantic_kernel.data.filter_clauses.equal_to_filter_clause import EqualTo
 from semantic_kernel.data.filter_clauses.filter_clause_base import FilterClauseBase
 from semantic_kernel.data.kernel_search_results import KernelSearchResults
 from semantic_kernel.data.record_definition.vector_store_model_definition import VectorStoreRecordDefinition
@@ -29,6 +28,7 @@
 from semantic_kernel.data.vector_search.vectorized_search import VectorizedSearchMixin
 from semantic_kernel.exceptions import VectorSearchExecutionException, VectorStoreModelValidationError
 from semantic_kernel.kernel_types import OneOrMany
+from semantic_kernel.utils.list_handler import empty_generator
 
 KEY_TYPES = str | int | float
 
@@ -171,7 +171,7 @@ async def _inner_search_vectorized(
                 ),
                 total_count=len(return_records) if options and options.include_total_count else None,
             )
-        return KernelSearchResults(results=None)
+        return KernelSearchResults(results=empty_generator())
 
     async def _generate_return_list(
         self, return_records: dict[KEY_TYPES, float], options: VectorSearchOptions | None