From 201bdf71487cde67a12a9ede002bd528b58b5145 Mon Sep 17 00:00:00 2001
From: Kyle Winkelman <39207896+kyle-winkelman@users.noreply.github.com>
Date: Thu, 29 Aug 2024 11:48:04 -0500
Subject: [PATCH 1/3] community: Cap AzureOpenAIEmbeddings chunk_size at 2048
 instead of 16. (#25852)

**Description:** Within AzureOpenAIEmbeddings there is a validation to
cap `chunk_size` at 16. The value of 16 is either an old limitation or
was erroneously chosen. I have checked all of the `preview` and `stable`
releases in
[Azure/azure-rest-api-specs](https://github.com/Azure/azure-rest-api-specs/tree/main/specification/cognitiveservices/data-plane/AzureOpenAI/inference)
to ensure that the `embeddings` endpoint can handle 2048 entries.
I have also found documentation confirming that this limit should be 2048:
- https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings
- https://learn.microsoft.com/en-us/azure/ai-services/openai/quotas-limits

**Issue:** fixes #25462
---
 .../langchain_community/embeddings/azure_openai.py          | 4 ++--
 libs/community/langchain_community/embeddings/openai.py     | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/libs/community/langchain_community/embeddings/azure_openai.py b/libs/community/langchain_community/embeddings/azure_openai.py
index 61aee8a8668..cf06a795d03 100644
--- a/libs/community/langchain_community/embeddings/azure_openai.py
+++ b/libs/community/langchain_community/embeddings/azure_openai.py
@@ -91,10 +91,10 @@ class AzureOpenAIEmbeddings(OpenAIEmbeddings):
         values["azure_ad_token"] = values.get("azure_ad_token") or os.getenv(
             "AZURE_OPENAI_AD_TOKEN"
         )
-        # Azure OpenAI embedding models allow a maximum of 16 texts
+        # Azure OpenAI embedding models allow a maximum of 2048 texts
         # at a time in each batch
         # See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings
-        values["chunk_size"] = min(values["chunk_size"], 16)
+        values["chunk_size"] = min(values["chunk_size"], 2048)
         try:
             import openai  # noqa: F401
         except ImportError:
diff --git a/libs/community/langchain_community/embeddings/openai.py b/libs/community/langchain_community/embeddings/openai.py
index 09936de8057..0c097b11aca 100644
--- a/libs/community/langchain_community/embeddings/openai.py
+++ b/libs/community/langchain_community/embeddings/openai.py
@@ -307,10 +307,10 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
         )
         if values["openai_api_type"] in ("azure", "azure_ad", "azuread"):
             default_api_version = "2023-05-15"
-            # Azure OpenAI embedding models allow a maximum of 16 texts
-            # at a time in each batch
+            # Azure OpenAI embedding models allow a maximum of 2048
+            # texts at a time in each batch
             # See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings
-            values["chunk_size"] = min(values["chunk_size"], 16)
+            values["chunk_size"] = min(values["chunk_size"], 2048)
         else:
             default_api_version = ""
         values["openai_api_version"] = get_from_dict_or_env(

From 09c2d8faca42f09a924599c9d75cc38e05ba1ad4 Mon Sep 17 00:00:00 2001
From: Kyle Winkelman <39207896+kyle-winkelman@users.noreply.github.com>
Date: Thu, 29 Aug 2024 12:54:43 -0500
Subject: [PATCH 2/3] langchain_openai: Cleanup OpenAIEmbeddings
 validate_environment. (#25855)

**Description:** [This portion of
code](https://github.com/langchain-ai/langchain/blob/v0.1.16/libs/partners/openai/langchain_openai/embeddings/base.py#L189-L196)
serves no purpose, because a couple of lines later a [`ValueError` is
raised](https://github.com/langchain-ai/langchain/blob/v0.1.16/libs/partners/openai/langchain_openai/embeddings/base.py#L209-L213).
**Issue:** A follow-up to #25852.
---
 .../openai/langchain_openai/embeddings/base.py      | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/libs/partners/openai/langchain_openai/embeddings/base.py b/libs/partners/openai/langchain_openai/embeddings/base.py
index 1e78302a9e4..3625c34a8ad 100644
--- a/libs/partners/openai/langchain_openai/embeddings/base.py
+++ b/libs/partners/openai/langchain_openai/embeddings/base.py
@@ -302,19 +302,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
         values["openai_proxy"] = get_from_dict_or_env(
             values, "openai_proxy", "OPENAI_PROXY", default=""
         )
-        if values["openai_api_type"] in ("azure", "azure_ad", "azuread"):
-            default_api_version = "2023-05-15"
-            # Azure OpenAI embedding models allow a maximum of 16 texts
-            # at a time in each batch
-            # See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings
-            values["chunk_size"] = min(values["chunk_size"], 16)
-        else:
-            default_api_version = ""
         values["openai_api_version"] = get_from_dict_or_env(
-            values,
-            "openai_api_version",
-            "OPENAI_API_VERSION",
-            default=default_api_version,
+            values, "openai_api_version", "OPENAI_API_VERSION", default=""
         )
         # Check OPENAI_ORGANIZATION for backwards compatibility.
         values["openai_organization"] = (

From fabd3295fabb4c79fedb4dbbe725a308658ef8d8 Mon Sep 17 00:00:00 2001
From: Bagatur <22008038+baskaryan@users.noreply.github.com>
Date: Thu, 29 Aug 2024 13:34:54 -0700
Subject: [PATCH 3/3] core[patch]: don't mutate merged lists/dicts (#25858)

Update merging utils to
- not mutate objects
- have special handling for 'type' keys in dicts
---
 libs/core/langchain_core/utils/_merge.py      | 21 +++++++++--
 .../core/tests/unit_tests/utils/test_utils.py | 37 +++++++++++++++++++
 2 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/libs/core/langchain_core/utils/_merge.py b/libs/core/langchain_core/utils/_merge.py
index 42cdde85493..d058b8041aa 100644
--- a/libs/core/langchain_core/utils/_merge.py
+++ b/libs/core/langchain_core/utils/_merge.py
@@ -41,6 +41,19 @@ def merge_dicts(left: Dict[str, Any], *others: Dict[str, Any]) -> Dict[str, Any]
                     " but with a different type."
                 )
             elif isinstance(merged[right_k], str):
+                # TODO: Add below special handling for 'type' key in 0.3 and remove
+                # merge_lists 'type' logic.
+                #
+                # if right_k == "type":
+                #     if merged[right_k] == right_v:
+                #         continue
+                #     else:
+                #         raise ValueError(
+                #             "Unable to merge. Two different values seen for special "
+                #             f"key 'type': {merged[right_k]} and {right_v}. 'type' "
+                #             "should either occur once or have the same value across "
+                #             "all dicts."
+                #         )
                 merged[right_k] += right_v
             elif isinstance(merged[right_k], dict):
                 merged[right_k] = merge_dicts(merged[right_k], right_v)
@@ -81,10 +94,10 @@ def merge_lists(left: Optional[List], *others: Optional[List]) -> Optional[List]
                         if e_left["index"] == e["index"]
                     ]
                     if to_merge:
-                        # If a top-level "type" has been set for a chunk, it should no
-                        # longer be overridden by the "type" field in future chunks.
-                        if "type" in merged[to_merge[0]] and "type" in e:
-                            e.pop("type")
+                        # TODO: Remove this once merge_dicts is updated with special
+                        # handling for 'type'.
+                        if "type" in e:
+                            e = {k: v for k, v in e.items() if k != "type"}
                         merged[to_merge[0]] = merge_dicts(merged[to_merge[0]], e)
                     else:
                         merged.append(e)
diff --git a/libs/core/tests/unit_tests/utils/test_utils.py b/libs/core/tests/unit_tests/utils/test_utils.py
index 7905bfb62dc..419e6309fd9 100644
--- a/libs/core/tests/unit_tests/utils/test_utils.py
+++ b/libs/core/tests/unit_tests/utils/test_utils.py
@@ -1,6 +1,7 @@
 import os
 import re
 from contextlib import AbstractContextManager, nullcontext
+from copy import deepcopy
 from typing import Any, Callable, Dict, Optional, Tuple, Type, Union
 from unittest.mock import patch
 
@@ -120,9 +121,45 @@ def test_merge_dicts(
     else:
         err = nullcontext()
 
+    left_copy = deepcopy(left)
+    right_copy = deepcopy(right)
     with err:
         actual = merge_dicts(left, right)
         assert actual == expected
+        # no mutation
+        assert left == left_copy
+        assert right == right_copy
+
+
+@pytest.mark.parametrize(
+    ("left", "right", "expected"),
+    (
+        # 'type' special key handling
+        ({"type": "foo"}, {"type": "foo"}, {"type": "foo"}),
+        (
+            {"type": "foo"},
+            {"type": "bar"},
+            pytest.raises(ValueError, match="Unable to merge."),
+        ),
+    ),
+)
+@pytest.mark.xfail(reason="Refactors to make in 0.3")
+def test_merge_dicts_0_3(
+    left: dict, right: dict, expected: Union[dict, AbstractContextManager]
+) -> None:
+    if isinstance(expected, AbstractContextManager):
+        err = expected
+    else:
+        err = nullcontext()
+
+    left_copy = deepcopy(left)
+    right_copy = deepcopy(right)
+    with err:
+        actual = merge_dicts(left, right)
+        assert actual == expected
+        # no mutation
+        assert left == left_copy
+        assert right == right_copy
 
 
 @pytest.mark.parametrize(