From 201bdf71487cde67a12a9ede002bd528b58b5145 Mon Sep 17 00:00:00 2001 From: Kyle Winkelman <39207896+kyle-winkelman@users.noreply.github.com> Date: Thu, 29 Aug 2024 11:48:04 -0500 Subject: [PATCH 1/3] community: Cap AzureOpenAIEmbeddings chunk_size at 2048 instead of 16. (#25852) **Description:** Within AzureOpenAIEmbeddings there is a validation to cap `chunk_size` at 16. The value of 16 is either an old limitation or was erroneously chosen. I have checked all of the `preview` and `stable` releases to ensure that the `embeddings` endpoint can handle 2048 entries [Azure/azure-rest-api-specs](https://github.com/Azure/azure-rest-api-specs/tree/main/specification/cognitiveservices/data-plane/AzureOpenAI/inference). I have also found many locations that confirm this limit should be 2048: - https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings - https://learn.microsoft.com/en-us/azure/ai-services/openai/quotas-limits **Issue:** fixes #25462 --- .../langchain_community/embeddings/azure_openai.py | 4 ++-- libs/community/langchain_community/embeddings/openai.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/libs/community/langchain_community/embeddings/azure_openai.py b/libs/community/langchain_community/embeddings/azure_openai.py index 61aee8a8668..cf06a795d03 100644 --- a/libs/community/langchain_community/embeddings/azure_openai.py +++ b/libs/community/langchain_community/embeddings/azure_openai.py @@ -91,10 +91,10 @@ class AzureOpenAIEmbeddings(OpenAIEmbeddings): values["azure_ad_token"] = values.get("azure_ad_token") or os.getenv( "AZURE_OPENAI_AD_TOKEN" ) - # Azure OpenAI embedding models allow a maximum of 16 texts + # Azure OpenAI embedding models allow a maximum of 2048 texts # at a time in each batch # See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings - values["chunk_size"] = min(values["chunk_size"], 16) + values["chunk_size"] = min(values["chunk_size"], 2048) try: import 
openai # noqa: F401 except ImportError: diff --git a/libs/community/langchain_community/embeddings/openai.py b/libs/community/langchain_community/embeddings/openai.py index 09936de8057..0c097b11aca 100644 --- a/libs/community/langchain_community/embeddings/openai.py +++ b/libs/community/langchain_community/embeddings/openai.py @@ -307,10 +307,10 @@ class OpenAIEmbeddings(BaseModel, Embeddings): ) if values["openai_api_type"] in ("azure", "azure_ad", "azuread"): default_api_version = "2023-05-15" - # Azure OpenAI embedding models allow a maximum of 16 texts - # at a time in each batch + # Azure OpenAI embedding models allow a maximum of 2048 + # texts at a time in each batch # See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings - values["chunk_size"] = min(values["chunk_size"], 16) + values["chunk_size"] = min(values["chunk_size"], 2048) else: default_api_version = "" values["openai_api_version"] = get_from_dict_or_env( From 09c2d8faca42f09a924599c9d75cc38e05ba1ad4 Mon Sep 17 00:00:00 2001 From: Kyle Winkelman <39207896+kyle-winkelman@users.noreply.github.com> Date: Thu, 29 Aug 2024 12:54:43 -0500 Subject: [PATCH 2/3] langchain_openai: Cleanup OpenAIEmbeddings validate_environment. (#25855) **Description:** [This portion of code](https://github.com/langchain-ai/langchain/blob/v0.1.16/libs/partners/openai/langchain_openai/embeddings/base.py#L189-L196) has no use as a couple lines later a [`ValueError` is thrown](https://github.com/langchain-ai/langchain/blob/v0.1.16/libs/partners/openai/langchain_openai/embeddings/base.py#L209-L213). **Issue:** A follow up to #25852. 
--- .../openai/langchain_openai/embeddings/base.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/libs/partners/openai/langchain_openai/embeddings/base.py b/libs/partners/openai/langchain_openai/embeddings/base.py index 1e78302a9e4..3625c34a8ad 100644 --- a/libs/partners/openai/langchain_openai/embeddings/base.py +++ b/libs/partners/openai/langchain_openai/embeddings/base.py @@ -302,19 +302,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings): values["openai_proxy"] = get_from_dict_or_env( values, "openai_proxy", "OPENAI_PROXY", default="" ) - if values["openai_api_type"] in ("azure", "azure_ad", "azuread"): - default_api_version = "2023-05-15" - # Azure OpenAI embedding models allow a maximum of 16 texts - # at a time in each batch - # See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings - values["chunk_size"] = min(values["chunk_size"], 16) - else: - default_api_version = "" values["openai_api_version"] = get_from_dict_or_env( - values, - "openai_api_version", - "OPENAI_API_VERSION", - default=default_api_version, + values, "openai_api_version", "OPENAI_API_VERSION", default="" ) # Check OPENAI_ORGANIZATION for backwards compatibility. 
values["openai_organization"] = ( From fabd3295fabb4c79fedb4dbbe725a308658ef8d8 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Thu, 29 Aug 2024 13:34:54 -0700 Subject: [PATCH 3/3] core[patch]: don't mutate merged lists/dicts (#25858) Update merging utils to - not mutate objects - have special handling for 'type' keys in dicts --- libs/core/langchain_core/utils/_merge.py | 21 +++++++++-- .../core/tests/unit_tests/utils/test_utils.py | 37 +++++++++++++++++++ 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/libs/core/langchain_core/utils/_merge.py b/libs/core/langchain_core/utils/_merge.py index 42cdde85493..d058b8041aa 100644 --- a/libs/core/langchain_core/utils/_merge.py +++ b/libs/core/langchain_core/utils/_merge.py @@ -41,6 +41,19 @@ def merge_dicts(left: Dict[str, Any], *others: Dict[str, Any]) -> Dict[str, Any] " but with a different type." ) elif isinstance(merged[right_k], str): + # TODO: Add below special handling for 'type' key in 0.3 and remove + # merge_lists 'type' logic. + # + # if right_k == "type": + # if merged[right_k] == right_v: + # continue + # else: + # raise ValueError( + # "Unable to merge. Two different values seen for special " + # f"key 'type': {merged[right_k]} and {right_v}. 'type' " + # "should either occur once or have the same value across " + # "all dicts." + # ) merged[right_k] += right_v elif isinstance(merged[right_k], dict): merged[right_k] = merge_dicts(merged[right_k], right_v) @@ -81,10 +94,10 @@ def merge_lists(left: Optional[List], *others: Optional[List]) -> Optional[List] if e_left["index"] == e["index"] ] if to_merge: - # If a top-level "type" has been set for a chunk, it should no - # longer be overridden by the "type" field in future chunks. - if "type" in merged[to_merge[0]] and "type" in e: - e.pop("type") + # TODO: Remove this once merge_dict is updated with special + # handling for 'type'. 
+ if "type" in e: + e = {k: v for k, v in e.items() if k != "type"} merged[to_merge[0]] = merge_dicts(merged[to_merge[0]], e) else: merged.append(e) diff --git a/libs/core/tests/unit_tests/utils/test_utils.py b/libs/core/tests/unit_tests/utils/test_utils.py index 7905bfb62dc..419e6309fd9 100644 --- a/libs/core/tests/unit_tests/utils/test_utils.py +++ b/libs/core/tests/unit_tests/utils/test_utils.py @@ -1,6 +1,7 @@ import os import re from contextlib import AbstractContextManager, nullcontext +from copy import deepcopy from typing import Any, Callable, Dict, Optional, Tuple, Type, Union from unittest.mock import patch @@ -120,9 +121,45 @@ def test_merge_dicts( else: err = nullcontext() + left_copy = deepcopy(left) + right_copy = deepcopy(right) with err: actual = merge_dicts(left, right) assert actual == expected + # no mutation + assert left == left_copy + assert right == right_copy + + +@pytest.mark.parametrize( + ("left", "right", "expected"), + ( + # 'type' special key handling + ({"type": "foo"}, {"type": "foo"}, {"type": "foo"}), + ( + {"type": "foo"}, + {"type": "bar"}, + pytest.raises(ValueError, match="Unable to merge."), + ), + ), +) +@pytest.mark.xfail(reason="Refactors to make in 0.3") +def test_merge_dicts_0_3( + left: dict, right: dict, expected: Union[dict, AbstractContextManager] +) -> None: + if isinstance(expected, AbstractContextManager): + err = expected + else: + err = nullcontext() + + left_copy = deepcopy(left) + right_copy = deepcopy(right) + with err: + actual = merge_dicts(left, right) + assert actual == expected + # no mutation + assert left == left_copy + assert right == right_copy @pytest.mark.parametrize(