Merge branch 'master' into bagatur/format_content_as

This commit is contained in:
Bagatur 2024-08-29 13:37:08 -07:00
commit fb002faba4
5 changed files with 60 additions and 21 deletions

View File

@ -91,10 +91,10 @@ class AzureOpenAIEmbeddings(OpenAIEmbeddings):
values["azure_ad_token"] = values.get("azure_ad_token") or os.getenv( values["azure_ad_token"] = values.get("azure_ad_token") or os.getenv(
"AZURE_OPENAI_AD_TOKEN" "AZURE_OPENAI_AD_TOKEN"
) )
# Azure OpenAI embedding models allow a maximum of 16 texts # Azure OpenAI embedding models allow a maximum of 2048 texts
# at a time in each batch # at a time in each batch
# See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings # See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings
values["chunk_size"] = min(values["chunk_size"], 16) values["chunk_size"] = min(values["chunk_size"], 2048)
try: try:
import openai # noqa: F401 import openai # noqa: F401
except ImportError: except ImportError:

View File

@ -307,10 +307,10 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
) )
if values["openai_api_type"] in ("azure", "azure_ad", "azuread"): if values["openai_api_type"] in ("azure", "azure_ad", "azuread"):
default_api_version = "2023-05-15" default_api_version = "2023-05-15"
# Azure OpenAI embedding models allow a maximum of 16 texts # Azure OpenAI embedding models allow a maximum of 2048
# at a time in each batch # texts at a time in each batch
# See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings # See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings
values["chunk_size"] = min(values["chunk_size"], 16) values["chunk_size"] = min(values["chunk_size"], 2048)
else: else:
default_api_version = "" default_api_version = ""
values["openai_api_version"] = get_from_dict_or_env( values["openai_api_version"] = get_from_dict_or_env(

View File

@ -41,6 +41,19 @@ def merge_dicts(left: Dict[str, Any], *others: Dict[str, Any]) -> Dict[str, Any]
" but with a different type." " but with a different type."
) )
elif isinstance(merged[right_k], str): elif isinstance(merged[right_k], str):
# TODO: In 0.3, enable the special handling for the 'type' key shown below
# and remove the 'type' logic from merge_lists.
#
# if right_k == "type":
# if merged[right_k] == right_v:
# continue
# else:
# raise ValueError(
# "Unable to merge. Two different values seen for special "
# f"key 'type': {merged[right_k]} and {right_v}. 'type' "
# "should either occur once or have the same value across "
# "all dicts."
# )
merged[right_k] += right_v merged[right_k] += right_v
elif isinstance(merged[right_k], dict): elif isinstance(merged[right_k], dict):
merged[right_k] = merge_dicts(merged[right_k], right_v) merged[right_k] = merge_dicts(merged[right_k], right_v)
@ -81,10 +94,10 @@ def merge_lists(left: Optional[List], *others: Optional[List]) -> Optional[List]
if e_left["index"] == e["index"] if e_left["index"] == e["index"]
] ]
if to_merge: if to_merge:
# If a top-level "type" has been set for a chunk, it should no # TODO: Remove this once merge_dicts is updated with special
# longer be overridden by the "type" field in future chunks. # handling for 'type'.
if "type" in merged[to_merge[0]] and "type" in e: if "type" in e:
e.pop("type") e = {k: v for k, v in e.items() if k != "type"}
merged[to_merge[0]] = merge_dicts(merged[to_merge[0]], e) merged[to_merge[0]] = merge_dicts(merged[to_merge[0]], e)
else: else:
merged.append(e) merged.append(e)

View File

@ -1,6 +1,7 @@
import os import os
import re import re
from contextlib import AbstractContextManager, nullcontext from contextlib import AbstractContextManager, nullcontext
from copy import deepcopy
from typing import Any, Callable, Dict, Optional, Tuple, Type, Union from typing import Any, Callable, Dict, Optional, Tuple, Type, Union
from unittest.mock import patch from unittest.mock import patch
@ -120,9 +121,45 @@ def test_merge_dicts(
else: else:
err = nullcontext() err = nullcontext()
left_copy = deepcopy(left)
right_copy = deepcopy(right)
with err: with err:
actual = merge_dicts(left, right) actual = merge_dicts(left, right)
assert actual == expected assert actual == expected
# no mutation
assert left == left_copy
assert right == right_copy
@pytest.mark.parametrize(
    ("left", "right", "expected"),
    (
        # 'type' special key handling: identical values on both sides.
        ({"type": "foo"}, {"type": "foo"}, {"type": "foo"}),
        # 'type' special key handling: differing values on the two sides.
        (
            {"type": "foo"},
            {"type": "bar"},
            pytest.raises(ValueError, match="Unable to merge."),
        ),
    ),
)
@pytest.mark.xfail(reason="Refactors to make in 0.3")
def test_merge_dicts_0_3(
    left: dict, right: dict, expected: Union[dict, AbstractContextManager]
) -> None:
    """Check ``merge_dicts`` against the 'type'-key cases deferred to 0.3.

    The test is marked ``xfail``; the cases describe behaviour that is not
    expected to hold yet.

    Args:
        left: First dict passed to ``merge_dicts``.
        right: Second dict passed to ``merge_dicts``.
        expected: Either the dict the merge should produce, or a context
            manager (e.g. ``pytest.raises(...)``) to run the merge under.
    """
    expects_context = isinstance(expected, AbstractContextManager)
    guard = expected if expects_context else nullcontext()

    # Snapshot both inputs up front so any in-place mutation is detectable.
    left_before, right_before = deepcopy(left), deepcopy(right)

    with guard:
        merged = merge_dicts(left, right)
        assert merged == expected
        # no mutation
        assert left == left_before
        assert right == right_before
@pytest.mark.parametrize( @pytest.mark.parametrize(

View File

@ -302,19 +302,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
values["openai_proxy"] = get_from_dict_or_env( values["openai_proxy"] = get_from_dict_or_env(
values, "openai_proxy", "OPENAI_PROXY", default="" values, "openai_proxy", "OPENAI_PROXY", default=""
) )
if values["openai_api_type"] in ("azure", "azure_ad", "azuread"):
default_api_version = "2023-05-15"
# Azure OpenAI embedding models allow a maximum of 16 texts
# at a time in each batch
# See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings
values["chunk_size"] = min(values["chunk_size"], 16)
else:
default_api_version = ""
values["openai_api_version"] = get_from_dict_or_env( values["openai_api_version"] = get_from_dict_or_env(
values, values, "openai_api_version", "OPENAI_API_VERSION", default=""
"openai_api_version",
"OPENAI_API_VERSION",
default=default_api_version,
) )
# Check OPENAI_ORGANIZATION for backwards compatibility. # Check OPENAI_ORGANIZATION for backwards compatibility.
values["openai_organization"] = ( values["openai_organization"] = (