core[patch], langchain[patch], experimental[patch]: import CI (#14414)

Erick Friis 2023-12-08 11:28:55 -08:00 committed by GitHub
parent ba083887e5
commit b3f226e8f8
12 changed files with 177 additions and 113 deletions

View File

@@ -52,8 +52,8 @@ jobs:
       working-directory: ${{ inputs.working-directory }}
     secrets: inherit
-  pydantic-compatibility:
-    uses: ./.github/workflows/_pydantic_compatibility.yml
+  dependencies:
+    uses: ./.github/workflows/_dependencies.yml
     with:
       working-directory: ${{ inputs.working-directory }}
     secrets: inherit
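
The caller workflow above swaps the reusable pydantic-compatibility job for the renamed `_dependencies.yml` workflow (shown next); the `working-directory` input and secrets wiring are unchanged.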

View File

@@ -1,4 +1,4 @@
-name: pydantic v1/v2 compatibility
+name: dependencies

 on:
   workflow_call:
@@ -28,7 +28,7 @@ jobs:
           - "3.9"
           - "3.10"
           - "3.11"
-    name: Pydantic v1/v2 compatibility - Python ${{ matrix.python-version }}
+    name: dependencies - Python ${{ matrix.python-version }}
     steps:
       - uses: actions/checkout@v4
@@ -42,7 +42,15 @@ jobs:
       - name: Install dependencies
        shell: bash
-        run: poetry install --with test
+        run: poetry install
+      - name: Check imports with base dependencies
+        shell: bash
+        run: poetry run make check_imports
+      - name: Install test dependencies
+        shell: bash
+        run: poetry install --with test
      - name: Install langchain editable
        working-directory: ${{ inputs.working-directory }}
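
The step ordering in the renamed workflow is the point of this PR: `poetry install` now pulls in only the package's base dependencies, `make check_imports` verifies that every module can be imported in that minimal environment, and only then are the test extras installed. As a rough sketch of what such an import smoke test does (the `langchain_core` path is just an example directory, and this loop is an illustration rather than the exact CI code):

# Sketch: try to import every .py file in a package using only base dependencies.
import sys
from importlib.machinery import SourceFileLoader
from pathlib import Path

pkg_dir = Path("langchain_core")  # example package directory

for path in sorted(pkg_dir.rglob("*.py")):
    try:
        # Load each file as a throwaway module, as the Makefile one-liner does.
        SourceFileLoader("x", str(path)).load_module()
    except Exception as exc:
        print(f"import failed for {path}: {exc}", file=sys.stderr)
        sys.exit(1)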

View File

@@ -15,6 +15,10 @@ tests:
 test_watch:
 	poetry run ptw --snapshot-update --now . -- -vv -x tests/unit_tests

+check_imports: langchain_core/**/*.py
+	for f in $^ ; do \
+		python -c "from importlib.machinery import SourceFileLoader; SourceFileLoader('x', '$$f').load_module()" || exit 1; \
+	done
 extended_tests:
 	poetry run pytest --only-extended $(TEST_FILE)
@@ -32,7 +36,7 @@ lint_tests: PYTHON_FILES=tests
 lint lint_diff lint_package lint_tests:
 	./scripts/check_pydantic.sh .
-	./scripts/check_imports.sh
+	./scripts/lint_imports.sh
 	poetry run ruff .
 	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff
 	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I $(PYTHON_FILES)

View File

@@ -21,6 +21,11 @@ extended_tests:
 integration_tests:
 	poetry run pytest tests/integration_tests

+check_imports: langchain_experimental/**/*.py
+	for f in $^ ; do \
+		python -c "from importlib.machinery import SourceFileLoader; SourceFileLoader('x', '$$f').load_module()" || exit 1; \
+	done
+
 ######################
 # LINTING AND FORMATTING

View File

@@ -1,10 +1,11 @@
 import re
 from collections import defaultdict
 from dataclasses import dataclass, field
-from typing import Dict, List
+from typing import TYPE_CHECKING, Dict, List

-from presidio_analyzer import RecognizerResult
-from presidio_anonymizer.entities import EngineResult
+if TYPE_CHECKING:
+    from presidio_analyzer import RecognizerResult
+    from presidio_anonymizer.entities import EngineResult

 MappingDataType = Dict[str, Dict[str, str]]
@@ -62,8 +63,8 @@ class DeanonymizerMapping:

 def create_anonymizer_mapping(
     original_text: str,
-    analyzer_results: List[RecognizerResult],
-    anonymizer_results: EngineResult,
+    analyzer_results: List["RecognizerResult"],
+    anonymizer_results: "EngineResult",
     is_reversed: bool = False,
 ) -> MappingDataType:
     """Creates or updates the mapping used to anonymize and/or deanonymize text.

View File

@@ -23,28 +23,62 @@ from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
     get_pseudoanonymizer_mapping,
 )

-try:
-    from presidio_analyzer import AnalyzerEngine
+if TYPE_CHECKING:
+    from presidio_analyzer import AnalyzerEngine, EntityRecognizer
     from presidio_analyzer.nlp_engine import NlpEngineProvider
-except ImportError as e:
-    raise ImportError(
-        "Could not import presidio_analyzer, please install with "
-        "`pip install presidio-analyzer`. You will also need to download a "
-        "spaCy model to use the analyzer, e.g. "
-        "`python -m spacy download en_core_web_lg`."
-    ) from e
-try:
     from presidio_anonymizer import AnonymizerEngine
     from presidio_anonymizer.entities import OperatorConfig
-except ImportError as e:
-    raise ImportError(
-        "Could not import presidio_anonymizer, please install with "
-        "`pip install presidio-anonymizer`."
-    ) from e
-
-if TYPE_CHECKING:
-    from presidio_analyzer import EntityRecognizer
+
+
+def _import_analyzer_engine() -> "AnalyzerEngine":
+    try:
+        from presidio_analyzer import AnalyzerEngine
+    except ImportError as e:
+        raise ImportError(
+            "Could not import presidio_analyzer, please install with "
+            "`pip install presidio-analyzer`. You will also need to download a "
+            "spaCy model to use the analyzer, e.g. "
+            "`python -m spacy download en_core_web_lg`."
+        ) from e
+    return AnalyzerEngine
+
+
+def _import_nlp_engine_provider() -> "NlpEngineProvider":
+    try:
+        from presidio_analyzer.nlp_engine import NlpEngineProvider
+    except ImportError as e:
+        raise ImportError(
+            "Could not import presidio_analyzer, please install with "
+            "`pip install presidio-analyzer`. You will also need to download a "
+            "spaCy model to use the analyzer, e.g. "
+            "`python -m spacy download en_core_web_lg`."
+        ) from e
+    return NlpEngineProvider
+
+
+def _import_anonymizer_engine() -> "AnonymizerEngine":
+    try:
+        from presidio_anonymizer import AnonymizerEngine
+    except ImportError as e:
+        raise ImportError(
+            "Could not import presidio_anonymizer, please install with "
+            "`pip install presidio-anonymizer`."
+        ) from e
+    return AnonymizerEngine
+
+
+def _import_operator_config() -> "OperatorConfig":
+    try:
+        from presidio_anonymizer.entities import OperatorConfig
+    except ImportError as e:
+        raise ImportError(
+            "Could not import presidio_anonymizer, please install with "
+            "`pip install presidio-anonymizer`."
+        ) from e
+    return OperatorConfig
+

 # Configuring Anonymizer for multiple languages
 # Detailed description and examples can be found here:
@@ -89,6 +123,11 @@ class PresidioAnonymizerBase(AnonymizerBase):
         Defaults to None, in which case faker will be seeded randomly
         and provide random values.
         """
+        OperatorConfig = _import_operator_config()
+        AnalyzerEngine = _import_analyzer_engine()
+        NlpEngineProvider = _import_nlp_engine_provider()
+        AnonymizerEngine = _import_anonymizer_engine()
+
         self.analyzed_fields = (
             analyzed_fields
             if analyzed_fields is not None
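
With the imports wrapped in `_import_*` helpers, `import langchain_experimental.data_anonymizer.presidio` now succeeds even when presidio is not installed; the ImportError surfaces only when `PresidioAnonymizerBase` is instantiated, which is when the helpers run. The shape of the pattern, condensed with stand-in names:

def _import_engine():
    # Import deferred until first use; module import stays dependency-free.
    try:
        from heavy_lib import Engine  # stand-in for an optional dependency
    except ImportError as e:
        raise ImportError("Please install heavy_lib to use this feature.") from e
    return Engine

class Wrapper:
    def __init__(self) -> None:
        Engine = _import_engine()  # fails here, not at module import time
        self._engine = Engine()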

View File

@@ -40,6 +40,11 @@ docker_tests:
 	docker build -t my-langchain-image:test .
 	docker run --rm my-langchain-image:test

+check_imports: langchain/**/*.py
+	for f in $^ ; do \
+		python -c "from importlib.machinery import SourceFileLoader; SourceFileLoader('x', '$$f').load_module()" || exit 1; \
+	done
+
 ######################
 # LINTING AND FORMATTING
 ######################
@@ -53,7 +58,7 @@ lint_tests: PYTHON_FILES=tests
 lint lint_diff lint_package lint_tests:
 	./scripts/check_pydantic.sh .
-	./scripts/check_imports.sh
+	./scripts/lint_imports.sh
 	poetry run ruff .
 	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff
 	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I $(PYTHON_FILES)

View File

@@ -5,7 +5,6 @@ import json
 from typing import TYPE_CHECKING, Any, Dict, List, Optional

 import requests
-import tiktoken
 from langchain_core.pydantic_v1 import BaseModel, Extra, root_validator

 from langchain.utils import get_from_dict_or_env
@@ -15,6 +14,18 @@ if TYPE_CHECKING:
     from github.PullRequest import PullRequest


+def _import_tiktoken() -> Any:
+    """Import tiktoken."""
+    try:
+        import tiktoken
+    except ImportError:
+        raise ImportError(
+            "tiktoken is not installed. "
+            "Please install it with `pip install tiktoken`"
+        )
+    return tiktoken
+
+
 class GitHubAPIWrapper(BaseModel):
     """Wrapper for GitHub API."""
@@ -385,6 +396,7 @@ class GitHubAPIWrapper(BaseModel):
             dict: A dictionary containing the issue's title,
             body, and comments as a string
         """
+        tiktoken = _import_tiktoken()
         MAX_TOKENS_FOR_FILES = 3_000
         pr_files = []
         pr = self.github_repo_instance.get_pull(number=int(pr_number))
@@ -453,6 +465,7 @@ class GitHubAPIWrapper(BaseModel):
         total_tokens = 0

         def get_tokens(text: str) -> int:
+            tiktoken = _import_tiktoken()
             return len(tiktoken.get_encoding("cl100k_base").encode(text))

         def add_to_dict(data_dict: Dict[str, Any], key: str, value: str) -> None:
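
`get_tokens` re-runs `_import_tiktoken()` on every call; that is cheap because Python caches imported modules in sys.modules, so repeating the try/except is mostly a style choice. A common variant, not used in this commit, memoizes the helper instead:

from functools import lru_cache
from typing import Any

@lru_cache(maxsize=1)
def _import_tiktoken_cached() -> Any:
    # Memoized stand-in for the helper above; raises on first call if missing.
    import tiktoken

    return tiktoken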

View File

@@ -1,71 +0,0 @@
-from typing import Optional, Tuple
-
-import sqlalchemy
-from pgvector.sqlalchemy import Vector
-from sqlalchemy.dialects.postgresql import JSON, UUID
-from sqlalchemy.orm import Session, relationship
-
-from langchain.vectorstores.pgvector import BaseModel
-
-
-class CollectionStore(BaseModel):
-    """Collection store."""
-
-    __tablename__ = "langchain_pg_collection"
-
-    name = sqlalchemy.Column(sqlalchemy.String)
-    cmetadata = sqlalchemy.Column(JSON)
-
-    embeddings = relationship(
-        "EmbeddingStore",
-        back_populates="collection",
-        passive_deletes=True,
-    )
-
-    @classmethod
-    def get_by_name(cls, session: Session, name: str) -> Optional["CollectionStore"]:
-        return session.query(cls).filter(cls.name == name).first()  # type: ignore
-
-    @classmethod
-    def get_or_create(
-        cls,
-        session: Session,
-        name: str,
-        cmetadata: Optional[dict] = None,
-    ) -> Tuple["CollectionStore", bool]:
-        """
-        Get or create a collection.
-        Returns [Collection, bool] where the bool is True if the collection was created.
-        """
-        created = False
-        collection = cls.get_by_name(session, name)
-        if collection:
-            return collection, created
-
-        collection = cls(name=name, cmetadata=cmetadata)
-        session.add(collection)
-        session.commit()
-        created = True
-        return collection, created
-
-
-class EmbeddingStore(BaseModel):
-    """Embedding store."""
-
-    __tablename__ = "langchain_pg_embedding"
-
-    collection_id = sqlalchemy.Column(
-        UUID(as_uuid=True),
-        sqlalchemy.ForeignKey(
-            f"{CollectionStore.__tablename__}.uuid",
-            ondelete="CASCADE",
-        ),
-    )
-    collection = relationship(CollectionStore, back_populates="embeddings")
-
-    embedding: Vector = sqlalchemy.Column(Vector(None))
-    document = sqlalchemy.Column(sqlalchemy.String, nullable=True)
-    cmetadata = sqlalchemy.Column(JSON, nullable=True)
-
-    # custom_id : any user defined id
-    custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True)
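
The deleted module imported `pgvector.sqlalchemy` at the top level, so the file itself could never pass the new import check without the optional pgvector package. Its two models move into `pgvector.py` (next file), where the `Vector` import is deferred into a `_get_embedding_store()` factory.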

View File

@@ -7,7 +7,6 @@ import logging
 import uuid
 from functools import partial
 from typing import (
-    TYPE_CHECKING,
     Any,
     Callable,
     Dict,
@@ -22,8 +21,8 @@ from typing import (

 import numpy as np
 import sqlalchemy
 from sqlalchemy import delete
-from sqlalchemy.dialects.postgresql import UUID
-from sqlalchemy.orm import Session
+from sqlalchemy.dialects.postgresql import JSON, UUID
+from sqlalchemy.orm import Session, relationship

 try:
     from sqlalchemy.orm import declarative_base
@@ -37,9 +36,6 @@ from langchain_core.vectorstores import VectorStore
 from langchain.utils import get_from_dict_or_env
 from langchain.vectorstores.utils import maximal_marginal_relevance

-if TYPE_CHECKING:
-    from langchain.vectorstores._pgvector_data_models import CollectionStore
-

 class DistanceStrategy(str, enum.Enum):
     """Enumerator of the Distance strategies."""
@@ -64,6 +60,74 @@ class BaseModel(Base):
     uuid = sqlalchemy.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)


+class CollectionStore(BaseModel):
+    """Collection store."""
+
+    __tablename__ = "langchain_pg_collection"
+
+    name = sqlalchemy.Column(sqlalchemy.String)
+    cmetadata = sqlalchemy.Column(JSON)
+
+    embeddings = relationship(
+        "EmbeddingStore",
+        back_populates="collection",
+        passive_deletes=True,
+    )
+
+    @classmethod
+    def get_by_name(cls, session: Session, name: str) -> Optional["CollectionStore"]:
+        return session.query(cls).filter(cls.name == name).first()  # type: ignore
+
+    @classmethod
+    def get_or_create(
+        cls,
+        session: Session,
+        name: str,
+        cmetadata: Optional[dict] = None,
+    ) -> Tuple["CollectionStore", bool]:
+        """
+        Get or create a collection.
+        Returns [Collection, bool] where the bool is True if the collection was created.
+        """
+        created = False
+        collection = cls.get_by_name(session, name)
+        if collection:
+            return collection, created
+
+        collection = cls(name=name, cmetadata=cmetadata)
+        session.add(collection)
+        session.commit()
+        created = True
+        return collection, created
+
+
+def _get_embedding_store() -> Any:
+    from pgvector.sqlalchemy import Vector
+
+    class EmbeddingStore(BaseModel):
+        """Embedding store."""
+
+        __tablename__ = "langchain_pg_embedding"
+
+        collection_id = sqlalchemy.Column(
+            UUID(as_uuid=True),
+            sqlalchemy.ForeignKey(
+                f"{CollectionStore.__tablename__}.uuid",
+                ondelete="CASCADE",
+            ),
+        )
+        collection = relationship(CollectionStore, back_populates="embeddings")
+
+        embedding: Vector = sqlalchemy.Column(Vector(None))
+        document = sqlalchemy.Column(sqlalchemy.String, nullable=True)
+        cmetadata = sqlalchemy.Column(JSON, nullable=True)
+
+        # custom_id : any user defined id
+        custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True)
+
+    return EmbeddingStore
+
+
 def _results_to_docs(docs_and_scores: Any) -> List[Document]:
     """Return docs from docs and scores."""
     return [doc for doc, _ in docs_and_scores]
@@ -138,13 +202,9 @@ class PGVector(VectorStore):
     ) -> None:
         """Initialize the store."""
         self.create_vector_extension()
-        from langchain.vectorstores._pgvector_data_models import (
-            CollectionStore,
-            EmbeddingStore,
-        )

         self.CollectionStore = CollectionStore
-        self.EmbeddingStore = EmbeddingStore
+        self.EmbeddingStore = _get_embedding_store()
         self.create_tables_if_not_exists()
         self.create_collection()
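
Defining `EmbeddingStore` inside `_get_embedding_store()` postpones `from pgvector.sqlalchemy import Vector` until a `PGVector` store is actually constructed, so the module now imports cleanly with base dependencies; note that each call defines a fresh class object. The pattern, condensed with stand-in names:

from typing import Any

def _get_model() -> Any:
    from heavy_lib import SpecialType  # stand-in for an optional dependency

    class Model:
        # Defined lazily so importing this module never touches heavy_lib.
        special = SpecialType

    return Model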