1
0
mirror of https://github.com/hwchase17/langchain.git synced 2025-05-04 22:58:42 +00:00

langchain: make numpy optional

Co-authored-by: Chester Curme <chester.curme@gmail.com>
This commit is contained in:
Erick Friis 2025-02-26 06:35:24 -08:00 committed by GitHub
parent 8c28742980
commit 3c96012f5e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 5214 additions and 38 deletions
libs/langchain
langchain
chains
evaluation/embedding_distance
retrievers/document_compressors
poetry.lock
pyproject.toml
tests/unit_tests
uv.lock

View File

@ -1,9 +1,9 @@
from __future__ import annotations
import logging
import re
from typing import Any, Dict, List, Optional, Sequence, Tuple
import numpy as np
from langchain_core.callbacks import (
CallbackManagerForChainRun,
)
@ -23,6 +23,8 @@ from langchain.chains.flare.prompts import (
)
from langchain.chains.llm import LLMChain
logger = logging.getLogger(__name__)
def _extract_tokens_and_log_probs(response: AIMessage) -> Tuple[List[str], List[float]]:
"""Extract tokens and log probabilities from chat model response."""
@ -57,7 +59,24 @@ def _low_confidence_spans(
min_token_gap: int,
num_pad_tokens: int,
) -> List[str]:
_low_idx = np.where(np.exp(log_probs) < min_prob)[0]
try:
import numpy as np
_low_idx = np.where(np.exp(log_probs) < min_prob)[0]
except ImportError:
logger.warning(
"NumPy not found in the current Python environment. FlareChain will use a "
"pure Python implementation for internal calculations, which may "
"significantly impact performance, especially for large datasets. For "
"optimal speed and efficiency, consider installing NumPy: pip install numpy"
)
import math
_low_idx = [ # type: ignore[assignment]
idx
for idx, log_prob in enumerate(log_probs)
if math.exp(log_prob) < min_prob
]
low_idx = [i for i in _low_idx if re.search(r"\w", tokens[i])]
if len(low_idx) == 0:
return []

View File

@ -5,9 +5,9 @@ https://arxiv.org/abs/2212.10496
from __future__ import annotations
import logging
from typing import Any, Dict, List, Optional
import numpy as np
from langchain_core.callbacks import CallbackManagerForChainRun
from langchain_core.embeddings import Embeddings
from langchain_core.language_models import BaseLanguageModel
@ -20,6 +20,8 @@ from langchain.chains.base import Chain
from langchain.chains.hyde.prompts import PROMPT_MAP
from langchain.chains.llm import LLMChain
logger = logging.getLogger(__name__)
class HypotheticalDocumentEmbedder(Chain, Embeddings):
"""Generate hypothetical document for query, and then embed that.
@ -54,7 +56,22 @@ class HypotheticalDocumentEmbedder(Chain, Embeddings):
def combine_embeddings(self, embeddings: List[List[float]]) -> List[float]:
    """Combine embeddings into final embeddings.

    The vectors are averaged element-wise. NumPy is used when it can be
    imported; otherwise a pure Python mean is computed and a warning is
    logged.

    Args:
        embeddings: The embedding vectors to average.

    Returns:
        The element-wise mean of ``embeddings``, or an empty list when
        ``embeddings`` is empty.
    """
    # Handle empty input up front so both code paths agree: without this the
    # NumPy path would reduce to a scalar NaN and fail inside ``list(...)``.
    if not embeddings:
        return []
    # Only the import is guarded, so an error raised by the computation
    # itself is never mistaken for a missing dependency.
    try:
        import numpy as np
    except ImportError:
        logger.warning(
            "NumPy not found in the current Python environment. "
            "HypotheticalDocumentEmbedder will use a pure Python implementation "
            "for internal calculations, which may significantly impact "
            "performance, especially for large datasets. For optimal speed and "
            "efficiency, consider installing NumPy: pip install numpy"
        )
        num_vectors = len(embeddings)
        # zip(*embeddings) walks the vectors dimension by dimension.
        return [sum(dim_values) / num_vectors for dim_values in zip(*embeddings)]
    return list(np.array(embeddings).mean(axis=0))
def embed_query(self, text: str) -> List[float]:
"""Generate a hypothetical document and embedded it."""

View File

@ -1,9 +1,11 @@
"""A chain for comparing the output of two models using embeddings."""
import functools
import logging
from enum import Enum
from importlib import util
from typing import Any, Dict, List, Optional
import numpy as np
from langchain_core.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
@ -18,6 +20,34 @@ from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
from langchain.schema import RUN_KEY
def _import_numpy() -> Any:
try:
import numpy as np
return np
except ImportError as e:
raise ImportError(
"Could not import numpy, please install with `pip install numpy`."
) from e
logger = logging.getLogger(__name__)
@functools.lru_cache(maxsize=1)
def _check_numpy() -> bool:
if bool(util.find_spec("numpy")):
return True
logger.warning(
"NumPy not found in the current Python environment. "
"langchain will use a pure Python implementation for embedding distance "
"operations, which may significantly impact performance, especially for large "
"datasets. For optimal speed and efficiency, consider installing NumPy: "
"pip install numpy"
)
return False
def _embedding_factory() -> Embeddings:
"""Create an Embeddings object.
Returns:
@ -158,7 +188,7 @@ class _EmbeddingDistanceChainMixin(Chain):
raise ValueError(f"Invalid metric: {metric}")
@staticmethod
def _cosine_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
def _cosine_distance(a: Any, b: Any) -> Any:
"""Compute the cosine distance between two vectors.
Args:
@ -179,7 +209,7 @@ class _EmbeddingDistanceChainMixin(Chain):
return 1.0 - cosine_similarity(a, b)
@staticmethod
def _euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
def _euclidean_distance(a: Any, b: Any) -> Any:
"""Compute the Euclidean distance between two vectors.
Args:
@ -189,10 +219,15 @@ class _EmbeddingDistanceChainMixin(Chain):
Returns:
np.floating: The Euclidean distance.
"""
return np.linalg.norm(a - b)
if _check_numpy():
import numpy as np
return np.linalg.norm(a - b)
return sum((x - y) * (x - y) for x, y in zip(a, b)) ** 0.5
@staticmethod
def _manhattan_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
def _manhattan_distance(a: Any, b: Any) -> Any:
"""Compute the Manhattan distance between two vectors.
Args:
@ -202,10 +237,14 @@ class _EmbeddingDistanceChainMixin(Chain):
Returns:
np.floating: The Manhattan distance.
"""
return np.sum(np.abs(a - b))
if _check_numpy():
np = _import_numpy()
return np.sum(np.abs(a - b))
return sum(abs(x - y) for x, y in zip(a, b))
@staticmethod
def _chebyshev_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
def _chebyshev_distance(a: Any, b: Any) -> Any:
"""Compute the Chebyshev distance between two vectors.
Args:
@ -215,10 +254,14 @@ class _EmbeddingDistanceChainMixin(Chain):
Returns:
np.floating: The Chebyshev distance.
"""
return np.max(np.abs(a - b))
if _check_numpy():
np = _import_numpy()
return np.max(np.abs(a - b))
return max(abs(x - y) for x, y in zip(a, b))
@staticmethod
def _hamming_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
def _hamming_distance(a: Any, b: Any) -> Any:
"""Compute the Hamming distance between two vectors.
Args:
@ -228,9 +271,13 @@ class _EmbeddingDistanceChainMixin(Chain):
Returns:
np.floating: The Hamming distance.
"""
return np.mean(a != b)
if _check_numpy():
np = _import_numpy()
return np.mean(a != b)
def _compute_score(self, vectors: np.ndarray) -> float:
return sum(1 for x, y in zip(a, b) if x != y) / len(a)
def _compute_score(self, vectors: Any) -> float:
"""Compute the score based on the distance metric.
Args:
@ -240,8 +287,11 @@ class _EmbeddingDistanceChainMixin(Chain):
float: The computed score.
"""
metric = self._get_metric(self.distance_metric)
score = metric(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1)).item()
return score
if _check_numpy() and isinstance(vectors, _import_numpy().ndarray):
score = metric(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1)).item()
else:
score = metric(vectors[0], vectors[1])
return float(score)
class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
@ -292,9 +342,12 @@ class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
Returns:
Dict[str, Any]: The computed score.
"""
vectors = np.array(
self.embeddings.embed_documents([inputs["prediction"], inputs["reference"]])
vectors = self.embeddings.embed_documents(
[inputs["prediction"], inputs["reference"]]
)
if _check_numpy():
np = _import_numpy()
vectors = np.array(vectors)
score = self._compute_score(vectors)
return {"score": score}
@ -313,13 +366,15 @@ class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
Returns:
Dict[str, Any]: The computed score.
"""
embedded = await self.embeddings.aembed_documents(
vectors = await self.embeddings.aembed_documents(
[
inputs["prediction"],
inputs["reference"],
]
)
vectors = np.array(embedded)
if _check_numpy():
np = _import_numpy()
vectors = np.array(vectors)
score = self._compute_score(vectors)
return {"score": score}
@ -432,14 +487,15 @@ class PairwiseEmbeddingDistanceEvalChain(
Returns:
Dict[str, Any]: The computed score.
"""
vectors = np.array(
self.embeddings.embed_documents(
[
inputs["prediction"],
inputs["prediction_b"],
]
)
vectors = self.embeddings.embed_documents(
[
inputs["prediction"],
inputs["prediction_b"],
]
)
if _check_numpy():
np = _import_numpy()
vectors = np.array(vectors)
score = self._compute_score(vectors)
return {"score": score}
@ -458,13 +514,15 @@ class PairwiseEmbeddingDistanceEvalChain(
Returns:
Dict[str, Any]: The computed score.
"""
embedded = await self.embeddings.aembed_documents(
vectors = await self.embeddings.aembed_documents(
[
inputs["prediction"],
inputs["prediction_b"],
]
)
vectors = np.array(embedded)
if _check_numpy():
np = _import_numpy()
vectors = np.array(vectors)
score = self._compute_score(vectors)
return {"score": score}

View File

@ -1,6 +1,5 @@
from typing import Callable, Dict, Optional, Sequence
import numpy as np
from langchain_core.callbacks.manager import Callbacks
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
@ -69,6 +68,13 @@ class EmbeddingsFilter(BaseDocumentCompressor):
"To use please install langchain-community "
"with `pip install langchain-community`."
)
try:
import numpy as np
except ImportError as e:
raise ImportError(
"Could not import numpy, please install with `pip install numpy`."
) from e
stateful_documents = get_stateful_documents(documents)
embedded_documents = _get_embeddings_from_stateful_docs(
self.embeddings, stateful_documents
@ -104,6 +110,13 @@ class EmbeddingsFilter(BaseDocumentCompressor):
"To use please install langchain-community "
"with `pip install langchain-community`."
)
try:
import numpy as np
except ImportError as e:
raise ImportError(
"Could not import numpy, please install with `pip install numpy`."
) from e
stateful_documents = get_stateful_documents(documents)
embedded_documents = await _aget_embeddings_from_stateful_docs(
self.embeddings, stateful_documents

5067
libs/langchain/poetry.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -14,8 +14,6 @@ dependencies = [
"SQLAlchemy<3,>=1.4",
"requests<3,>=2",
"PyYAML>=5.3",
"numpy<2,>=1.26.4; python_version < \"3.12\"",
"numpy<3,>=1.26.2; python_version >= \"3.12\"",
"async-timeout<5.0.0,>=4.0.0; python_version < \"3.11\"",
]
name = "langchain"
@ -74,6 +72,7 @@ test = [
"langchain-openai",
"toml>=0.10.2",
"packaging>=24.2",
"numpy<3,>=1.26.4",
]
codespell = ["codespell<3.0.0,>=2.2.0"]
test_integration = [
@ -102,6 +101,7 @@ typing = [
"mypy-protobuf<4.0.0,>=3.0.0",
"langchain-core",
"langchain-text-splitters",
"numpy<3,>=1.26.4",
]
dev = [
"jupyter<2.0.0,>=1.0.0",

View File

@ -37,7 +37,6 @@ def test_required_dependencies(uv_conf: Mapping[str, Any]) -> None:
"langchain-core",
"langchain-text-splitters",
"langsmith",
"numpy",
"pydantic",
"requests",
]
@ -82,5 +81,6 @@ def test_test_group_dependencies(uv_conf: Mapping[str, Any]) -> None:
"requests-mock",
# TODO: temporary hack since cffi 1.17.1 doesn't work with py 3.9.
"cffi",
"numpy",
]
)

View File

@ -2247,8 +2247,6 @@ dependencies = [
{ name = "langchain-core" },
{ name = "langchain-text-splitters" },
{ name = "langsmith" },
{ name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
{ name = "numpy", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
{ name = "pydantic" },
{ name = "pyyaml" },
{ name = "requests" },
@ -2329,6 +2327,8 @@ test = [
{ name = "langchain-tests" },
{ name = "langchain-text-splitters" },
{ name = "lark" },
{ name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
{ name = "numpy", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
{ name = "packaging" },
{ name = "pandas" },
{ name = "pytest" },
@ -2359,6 +2359,8 @@ typing = [
{ name = "langchain-text-splitters" },
{ name = "mypy" },
{ name = "mypy-protobuf" },
{ name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
{ name = "numpy", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
{ name = "types-chardet" },
{ name = "types-pytz" },
{ name = "types-pyyaml" },
@ -2389,8 +2391,6 @@ requires-dist = [
{ name = "langchain-together", marker = "extra == 'together'" },
{ name = "langchain-xai", marker = "extra == 'xai'" },
{ name = "langsmith", specifier = ">=0.1.17,<0.4" },
{ name = "numpy", marker = "python_full_version < '3.12'", specifier = ">=1.26.4,<2" },
{ name = "numpy", marker = "python_full_version >= '3.12'", specifier = ">=1.26.2,<3" },
{ name = "pydantic", specifier = ">=2.7.4,<3.0.0" },
{ name = "pyyaml", specifier = ">=5.3" },
{ name = "requests", specifier = ">=2,<3" },
@ -2422,6 +2422,7 @@ test = [
{ name = "langchain-tests", editable = "../standard-tests" },
{ name = "langchain-text-splitters", editable = "../text-splitters" },
{ name = "lark", specifier = ">=1.1.5,<2.0.0" },
{ name = "numpy", specifier = ">=1.26.4,<3" },
{ name = "packaging", specifier = ">=24.2" },
{ name = "pandas", specifier = ">=2.0.0,<3.0.0" },
{ name = "pytest", specifier = ">=8,<9" },
@ -2452,6 +2453,7 @@ typing = [
{ name = "langchain-text-splitters", editable = "../text-splitters" },
{ name = "mypy", specifier = ">=1.10,<2.0" },
{ name = "mypy-protobuf", specifier = ">=3.0.0,<4.0.0" },
{ name = "numpy", specifier = ">=1.26.4,<3" },
{ name = "types-chardet", specifier = ">=5.0.4.6,<6.0.0.0" },
{ name = "types-pytz", specifier = ">=2023.3.0.0,<2024.0.0.0" },
{ name = "types-pyyaml", specifier = ">=6.0.12.2,<7.0.0.0" },