langchain: make numpy optional (#29182)

Co-authored-by: Chester Curme <chester.curme@gmail.com>
2025-06-30 18:33:40 +00:00 · 2025-02-26 06:35:24 -08:00 · 2025-02-26 06:35:24 -08:00 · 3c96012f5e
commit 3c96012f5e
parent 8c28742980
8 changed files with 5214 additions and 38 deletions
--- a/libs/langchain/langchain/chains/flare/base.py
+++ b/libs/langchain/langchain/chains/flare/base.py
@ -1,9 +1,9 @@
 from __future__ import annotations

+import logging
 import re
 from typing import Any, Dict, List, Optional, Sequence, Tuple

-import numpy as np
 from langchain_core.callbacks import (
    CallbackManagerForChainRun,
 )
@ -23,6 +23,8 @@ from langchain.chains.flare.prompts import (
 )
 from langchain.chains.llm import LLMChain

+logger = logging.getLogger(__name__)
+

 def _extract_tokens_and_log_probs(response: AIMessage) -> Tuple[List[str], List[float]]:
    """Extract tokens and log probabilities from chat model response."""
@ -57,7 +59,24 @@ def _low_confidence_spans(
    min_token_gap: int,
    num_pad_tokens: int,
 ) -> List[str]:
-    _low_idx = np.where(np.exp(log_probs) < min_prob)[0]
+    try:
+        import numpy as np
+
+        _low_idx = np.where(np.exp(log_probs) < min_prob)[0]
+    except ImportError:
+        logger.warning(
+            "NumPy not found in the current Python environment. FlareChain will use a "
+            "pure Python implementation for internal calculations, which may "
+            "significantly impact performance, especially for large datasets. For "
+            "optimal speed and efficiency, consider installing NumPy: pip install numpy"
+        )
+        import math
+
+        _low_idx = [  # type: ignore[assignment]
+            idx
+            for idx, log_prob in enumerate(log_probs)
+            if math.exp(log_prob) < min_prob
+        ]
    low_idx = [i for i in _low_idx if re.search(r"\w", tokens[i])]
    if len(low_idx) == 0:
        return []
--- a/libs/langchain/langchain/chains/hyde/base.py
+++ b/libs/langchain/langchain/chains/hyde/base.py
@ -5,9 +5,9 @@ https://arxiv.org/abs/2212.10496

 from __future__ import annotations

+import logging
 from typing import Any, Dict, List, Optional

-import numpy as np
 from langchain_core.callbacks import CallbackManagerForChainRun
 from langchain_core.embeddings import Embeddings
 from langchain_core.language_models import BaseLanguageModel
@ -20,6 +20,8 @@ from langchain.chains.base import Chain
 from langchain.chains.hyde.prompts import PROMPT_MAP
 from langchain.chains.llm import LLMChain

+logger = logging.getLogger(__name__)
+

 class HypotheticalDocumentEmbedder(Chain, Embeddings):
    """Generate hypothetical document for query, and then embed that.
@ -54,7 +56,22 @@ class HypotheticalDocumentEmbedder(Chain, Embeddings):

    def combine_embeddings(self, embeddings: List[List[float]]) -> List[float]:
        """Combine embeddings into final embeddings."""
-        return list(np.array(embeddings).mean(axis=0))
+        try:
+            import numpy as np
+
+            return list(np.array(embeddings).mean(axis=0))
+        except ImportError:
+            logger.warning(
+                "NumPy not found in the current Python environment. "
+                "HypotheticalDocumentEmbedder will use a pure Python implementation "
+                "for internal calculations, which may significantly impact "
+                "performance, especially for large datasets. For optimal speed and "
+                "efficiency, consider installing NumPy: pip install numpy"
+            )
+            if not embeddings:
+                return []
+            num_vectors = len(embeddings)
+            return [sum(dim_values) / num_vectors for dim_values in zip(*embeddings)]

    def embed_query(self, text: str) -> List[float]:
        """Generate a hypothetical document and embedded it."""
--- a/libs/langchain/langchain/evaluation/embedding_distance/base.py
+++ b/libs/langchain/langchain/evaluation/embedding_distance/base.py
@ -1,9 +1,11 @@
 """A chain for comparing the output of two models using embeddings."""

+import functools
+import logging
 from enum import Enum
+from importlib import util
 from typing import Any, Dict, List, Optional

-import numpy as np
 from langchain_core.callbacks.manager import (
    AsyncCallbackManagerForChainRun,
    CallbackManagerForChainRun,
@ -18,6 +20,34 @@ from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
 from langchain.schema import RUN_KEY


+def _import_numpy() -> Any:
+    """Return the ``numpy`` module, or raise ImportError with install guidance."""
+    try:
+        import numpy
+    except ImportError as e:
+        raise ImportError(
+            "Could not import numpy, please install with `pip install numpy`."
+        ) from e
+    return numpy
+
+
+logger = logging.getLogger(__name__)
+
+
+@functools.lru_cache(maxsize=1)
+def _check_numpy() -> bool:
+    """Tell whether numpy is installed; the cached result limits repeat warnings."""
+    if util.find_spec("numpy") is not None:
+        return True
+    logger.warning(
+        "NumPy not found in the current Python environment. "
+        "langchain will use a pure Python implementation for embedding distance "
+        "operations, which may significantly impact performance, especially for large "
+        "datasets. For optimal speed and efficiency, consider installing NumPy: "
+        "pip install numpy"
+    )
+    return False
+
+
 def _embedding_factory() -> Embeddings:
    """Create an Embeddings object.
    Returns:
@ -158,7 +188,7 @@ class _EmbeddingDistanceChainMixin(Chain):
            raise ValueError(f"Invalid metric: {metric}")

    @staticmethod
-    def _cosine_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
+    def _cosine_distance(a: Any, b: Any) -> Any:
        """Compute the cosine distance between two vectors.

        Args:
@ -179,7 +209,7 @@ class _EmbeddingDistanceChainMixin(Chain):
        return 1.0 - cosine_similarity(a, b)

    @staticmethod
-    def _euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
+    def _euclidean_distance(a: Any, b: Any) -> Any:
        """Compute the Euclidean distance between two vectors.

        Args:
@ -189,10 +219,15 @@ class _EmbeddingDistanceChainMixin(Chain):
        Returns:
            np.floating: The Euclidean distance.
        """
-        return np.linalg.norm(a - b)
+        if _check_numpy():
+            import numpy as np
+
+            return np.linalg.norm(a - b)
+
+        return sum((x - y) * (x - y) for x, y in zip(a, b)) ** 0.5

    @staticmethod
-    def _manhattan_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
+    def _manhattan_distance(a: Any, b: Any) -> Any:
        """Compute the Manhattan distance between two vectors.

        Args:
@ -202,10 +237,14 @@ class _EmbeddingDistanceChainMixin(Chain):
        Returns:
            np.floating: The Manhattan distance.
        """
-        return np.sum(np.abs(a - b))
+        if _check_numpy():
+            np = _import_numpy()
+            return np.sum(np.abs(a - b))
+
+        return sum(abs(x - y) for x, y in zip(a, b))

    @staticmethod
-    def _chebyshev_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
+    def _chebyshev_distance(a: Any, b: Any) -> Any:
        """Compute the Chebyshev distance between two vectors.

        Args:
@ -215,10 +254,14 @@ class _EmbeddingDistanceChainMixin(Chain):
        Returns:
            np.floating: The Chebyshev distance.
        """
-        return np.max(np.abs(a - b))
+        if _check_numpy():
+            np = _import_numpy()
+            return np.max(np.abs(a - b))
+
+        return max(abs(x - y) for x, y in zip(a, b))

    @staticmethod
-    def _hamming_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
+    def _hamming_distance(a: Any, b: Any) -> Any:
        """Compute the Hamming distance between two vectors.

        Args:
@ -228,9 +271,13 @@ class _EmbeddingDistanceChainMixin(Chain):
        Returns:
            np.floating: The Hamming distance.
        """
-        return np.mean(a != b)
+        if _check_numpy():
+            np = _import_numpy()
+            return np.mean(a != b)

-    def _compute_score(self, vectors: np.ndarray) -> float:
+        return sum(1 for x, y in zip(a, b) if x != y) / len(a)
+
+    def _compute_score(self, vectors: Any) -> float:
        """Compute the score based on the distance metric.

        Args:
@ -240,8 +287,11 @@ class _EmbeddingDistanceChainMixin(Chain):
            float: The computed score.
        """
        metric = self._get_metric(self.distance_metric)
-        score = metric(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1)).item()
-        return score
+        if _check_numpy() and isinstance(vectors, _import_numpy().ndarray):
+            score = metric(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1)).item()
+        else:
+            score = metric(vectors[0], vectors[1])
+        return float(score)


 class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
@ -292,9 +342,12 @@ class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
        Returns:
            Dict[str, Any]: The computed score.
        """
-        vectors = np.array(
-            self.embeddings.embed_documents([inputs["prediction"], inputs["reference"]])
+        vectors = self.embeddings.embed_documents(
+            [inputs["prediction"], inputs["reference"]]
        )
+        if _check_numpy():
+            np = _import_numpy()
+            vectors = np.array(vectors)
        score = self._compute_score(vectors)
        return {"score": score}

@ -313,13 +366,15 @@ class EmbeddingDistanceEvalChain(_EmbeddingDistanceChainMixin, StringEvaluator):
        Returns:
            Dict[str, Any]: The computed score.
        """
-        embedded = await self.embeddings.aembed_documents(
+        vectors = await self.embeddings.aembed_documents(
            [
                inputs["prediction"],
                inputs["reference"],
            ]
        )
-        vectors = np.array(embedded)
+        if _check_numpy():
+            np = _import_numpy()
+            vectors = np.array(vectors)
        score = self._compute_score(vectors)
        return {"score": score}

@ -432,14 +487,15 @@ class PairwiseEmbeddingDistanceEvalChain(
        Returns:
            Dict[str, Any]: The computed score.
        """
-        vectors = np.array(
-            self.embeddings.embed_documents(
-                [
-                    inputs["prediction"],
-                    inputs["prediction_b"],
-                ]
-            )
+        vectors = self.embeddings.embed_documents(
+            [
+                inputs["prediction"],
+                inputs["prediction_b"],
+            ]
        )
+        if _check_numpy():
+            np = _import_numpy()
+            vectors = np.array(vectors)
        score = self._compute_score(vectors)
        return {"score": score}

@ -458,13 +514,15 @@ class PairwiseEmbeddingDistanceEvalChain(
        Returns:
            Dict[str, Any]: The computed score.
        """
-        embedded = await self.embeddings.aembed_documents(
+        vectors = await self.embeddings.aembed_documents(
            [
                inputs["prediction"],
                inputs["prediction_b"],
            ]
        )
-        vectors = np.array(embedded)
+        if _check_numpy():
+            np = _import_numpy()
+            vectors = np.array(vectors)
        score = self._compute_score(vectors)
        return {"score": score}

--- a/libs/langchain/langchain/retrievers/document_compressors/embeddings_filter.py
+++ b/libs/langchain/langchain/retrievers/document_compressors/embeddings_filter.py
@ -1,6 +1,5 @@
 from typing import Callable, Dict, Optional, Sequence

-import numpy as np
 from langchain_core.callbacks.manager import Callbacks
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
@ -69,6 +68,13 @@ class EmbeddingsFilter(BaseDocumentCompressor):
                "To use please install langchain-community "
                "with `pip install langchain-community`."
            )
+
+        try:
+            import numpy as np
+        except ImportError as e:
+            raise ImportError(
+                "Could not import numpy, please install with `pip install numpy`."
+            ) from e
        stateful_documents = get_stateful_documents(documents)
        embedded_documents = _get_embeddings_from_stateful_docs(
            self.embeddings, stateful_documents
@ -104,6 +110,13 @@ class EmbeddingsFilter(BaseDocumentCompressor):
                "To use please install langchain-community "
                "with `pip install langchain-community`."
            )
+
+        try:
+            import numpy as np
+        except ImportError as e:
+            raise ImportError(
+                "Could not import numpy, please install with `pip install numpy`."
+            ) from e
        stateful_documents = get_stateful_documents(documents)
        embedded_documents = await _aget_embeddings_from_stateful_docs(
            self.embeddings, stateful_documents
--- a/libs/langchain/poetry.lock
+++ b/libs/langchain/poetry.lock
--- a/libs/langchain/pyproject.toml
+++ b/libs/langchain/pyproject.toml
@ -14,8 +14,6 @@ dependencies = [
    "SQLAlchemy<3,>=1.4",
    "requests<3,>=2",
    "PyYAML>=5.3",
-    "numpy<2,>=1.26.4; python_version < \"3.12\"",
-    "numpy<3,>=1.26.2; python_version >= \"3.12\"",
    "async-timeout<5.0.0,>=4.0.0; python_version < \"3.11\"",
 ]
 name = "langchain"
@ -74,6 +72,7 @@ test = [
    "langchain-openai",
    "toml>=0.10.2",
    "packaging>=24.2",
+    "numpy<3,>=1.26.4",
 ]
 codespell = ["codespell<3.0.0,>=2.2.0"]
 test_integration = [
@ -102,6 +101,7 @@ typing = [
    "mypy-protobuf<4.0.0,>=3.0.0",
    "langchain-core",
    "langchain-text-splitters",
+    "numpy<3,>=1.26.4",
 ]
 dev = [
    "jupyter<2.0.0,>=1.0.0",
--- a/libs/langchain/tests/unit_tests/test_dependencies.py
+++ b/libs/langchain/tests/unit_tests/test_dependencies.py
@ -37,7 +37,6 @@ def test_required_dependencies(uv_conf: Mapping[str, Any]) -> None:
            "langchain-core",
            "langchain-text-splitters",
            "langsmith",
-            "numpy",
            "pydantic",
            "requests",
        ]
@ -82,5 +81,6 @@ def test_test_group_dependencies(uv_conf: Mapping[str, Any]) -> None:
            "requests-mock",
            # TODO: temporary hack since cffi 1.17.1 doesn't work with py 3.9.
            "cffi",
+            "numpy",
        ]
    )
--- a/libs/langchain/uv.lock
+++ b/libs/langchain/uv.lock
@ -2247,8 +2247,6 @@ dependencies = [
    { name = "langchain-core" },
    { name = "langchain-text-splitters" },
    { name = "langsmith" },
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
-    { name = "numpy", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
    { name = "pydantic" },
    { name = "pyyaml" },
    { name = "requests" },
@ -2329,6 +2327,8 @@ test = [
    { name = "langchain-tests" },
    { name = "langchain-text-splitters" },
    { name = "lark" },
+    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
+    { name = "numpy", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
    { name = "packaging" },
    { name = "pandas" },
    { name = "pytest" },
@ -2359,6 +2359,8 @@ typing = [
    { name = "langchain-text-splitters" },
    { name = "mypy" },
    { name = "mypy-protobuf" },
+    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" },
+    { name = "numpy", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" },
    { name = "types-chardet" },
    { name = "types-pytz" },
    { name = "types-pyyaml" },
@ -2389,8 +2391,6 @@ requires-dist = [
    { name = "langchain-together", marker = "extra == 'together'" },
    { name = "langchain-xai", marker = "extra == 'xai'" },
    { name = "langsmith", specifier = ">=0.1.17,<0.4" },
-    { name = "numpy", marker = "python_full_version < '3.12'", specifier = ">=1.26.4,<2" },
-    { name = "numpy", marker = "python_full_version >= '3.12'", specifier = ">=1.26.2,<3" },
    { name = "pydantic", specifier = ">=2.7.4,<3.0.0" },
    { name = "pyyaml", specifier = ">=5.3" },
    { name = "requests", specifier = ">=2,<3" },
@ -2422,6 +2422,7 @@ test = [
    { name = "langchain-tests", editable = "../standard-tests" },
    { name = "langchain-text-splitters", editable = "../text-splitters" },
    { name = "lark", specifier = ">=1.1.5,<2.0.0" },
+    { name = "numpy", specifier = ">=1.26.4,<3" },
    { name = "packaging", specifier = ">=24.2" },
    { name = "pandas", specifier = ">=2.0.0,<3.0.0" },
    { name = "pytest", specifier = ">=8,<9" },
@ -2452,6 +2453,7 @@ typing = [
    { name = "langchain-text-splitters", editable = "../text-splitters" },
    { name = "mypy", specifier = ">=1.10,<2.0" },
    { name = "mypy-protobuf", specifier = ">=3.0.0,<4.0.0" },
+    { name = "numpy", specifier = ">=1.26.4,<3" },
    { name = "types-chardet", specifier = ">=5.0.4.6,<6.0.0.0" },
    { name = "types-pytz", specifier = ">=2023.3.0.0,<2024.0.0.0" },
    { name = "types-pyyaml", specifier = ">=6.0.12.2,<7.0.0.0" },