core[patch], langchain[patch], experimental[patch]: import CI (#14414)

Erick Friis 2023-12-08 11:28:55 -08:00 committed by GitHub
parent ba083887e5
commit b3f226e8f8
12 changed files with 177 additions and 113 deletions

View File

@@ -52,8 +52,8 @@ jobs:
       working-directory: ${{ inputs.working-directory }}
     secrets: inherit
-  pydantic-compatibility:
-    uses: ./.github/workflows/_pydantic_compatibility.yml
+  dependencies:
+    uses: ./.github/workflows/_dependencies.yml
     with:
       working-directory: ${{ inputs.working-directory }}
     secrets: inherit

View File

@@ -1,4 +1,4 @@
-name: pydantic v1/v2 compatibility
+name: dependencies
 
 on:
   workflow_call:
@@ -28,7 +28,7 @@ jobs:
           - "3.9"
           - "3.10"
           - "3.11"
-    name: Pydantic v1/v2 compatibility - Python ${{ matrix.python-version }}
+    name: dependencies - Python ${{ matrix.python-version }}
     steps:
       - uses: actions/checkout@v4
@@ -41,6 +41,14 @@ jobs:
           cache-key: pydantic-cross-compat
       - name: Install dependencies
+        shell: bash
+        run: poetry install
+
+      - name: Check imports with base dependencies
+        shell: bash
+        run: poetry run make check_imports
+
+      - name: Install test dependencies
         shell: bash
         run: poetry install --with test

View File

@@ -15,6 +15,10 @@ tests:
 test_watch:
 	poetry run ptw --snapshot-update --now . -- -vv -x tests/unit_tests
 
+check_imports: langchain_core/**/*.py
+	for f in $^ ; do \
+		python -c "from importlib.machinery import SourceFileLoader; SourceFileLoader('x', '$$f').load_module()" || exit 1; \
+	done
 
 extended_tests:
 	poetry run pytest --only-extended $(TEST_FILE)
@@ -32,7 +36,7 @@ lint_tests: PYTHON_FILES=tests
 
 lint lint_diff lint_package lint_tests:
 	./scripts/check_pydantic.sh .
-	./scripts/check_imports.sh
+	./scripts/lint_imports.sh
 	poetry run ruff .
 	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff
 	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I $(PYTHON_FILES)
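The check_imports target above is what the new CI step (poetry run make check_imports) invokes: it loads every source file in the package with only the base dependencies installed, so any module that imports an optional package at import time fails the build before the test suite even runs. A rough standalone Python equivalent of that Makefile loop, shown only as an illustrative sketch (the walk_and_load helper and the hard-coded package path are not part of the commit):

import sys
from importlib.machinery import SourceFileLoader
from pathlib import Path


def walk_and_load(package_dir: str) -> None:
    """Load every .py file under package_dir, mirroring the Makefile for-loop."""
    for path in sorted(Path(package_dir).rglob("*.py")):
        try:
            # Same call the Makefile issues via `python -c` for each file.
            SourceFileLoader("x", str(path)).load_module()
        except Exception as exc:  # any import-time failure should fail the check
            print(f"import check failed for {path}: {exc}", file=sys.stderr)
            sys.exit(1)


if __name__ == "__main__":
    walk_and_load("langchain_core")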

View File

@@ -21,6 +21,11 @@ extended_tests:
 integration_tests:
 	poetry run pytest tests/integration_tests
 
+check_imports: langchain_experimental/**/*.py
+	for f in $^ ; do \
+		python -c "from importlib.machinery import SourceFileLoader; SourceFileLoader('x', '$$f').load_module()" || exit 1; \
+	done
+
 ######################
 # LINTING AND FORMATTING

View File

@@ -1,10 +1,11 @@
 import re
 from collections import defaultdict
 from dataclasses import dataclass, field
-from typing import Dict, List
+from typing import TYPE_CHECKING, Dict, List
 
-from presidio_analyzer import RecognizerResult
-from presidio_anonymizer.entities import EngineResult
+if TYPE_CHECKING:
+    from presidio_analyzer import RecognizerResult
+    from presidio_anonymizer.entities import EngineResult
 
 MappingDataType = Dict[str, Dict[str, str]]
@@ -62,8 +63,8 @@ class DeanonymizerMapping:
 
 def create_anonymizer_mapping(
     original_text: str,
-    analyzer_results: List[RecognizerResult],
-    anonymizer_results: EngineResult,
+    analyzer_results: List["RecognizerResult"],
+    anonymizer_results: "EngineResult",
     is_reversed: bool = False,
 ) -> MappingDataType:
     """Creates or updates the mapping used to anonymize and/or deanonymize text.

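The deanonymizer_mapping change follows the standard typing.TYPE_CHECKING pattern: the presidio imports are evaluated only by static type checkers, and the annotations become forward-reference strings, so the module imports cleanly even when presidio is not installed. A minimal sketch of the same pattern with placeholder names (heavy_pkg and OptionalThing are invented for illustration):

from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
    # Seen by mypy/pyright, never executed at runtime.
    from heavy_pkg import OptionalThing


def count_things(things: List["OptionalThing"]) -> int:
    # The quoted annotation is resolved lazily, so heavy_pkg is not needed
    # just to import this module.
    return len(things)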
View File

@@ -23,28 +23,62 @@ from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
     get_pseudoanonymizer_mapping,
 )
 
-try:
-    from presidio_analyzer import AnalyzerEngine
-    from presidio_analyzer.nlp_engine import NlpEngineProvider
-except ImportError as e:
-    raise ImportError(
-        "Could not import presidio_analyzer, please install with "
-        "`pip install presidio-analyzer`. You will also need to download a "
-        "spaCy model to use the analyzer, e.g. "
-        "`python -m spacy download en_core_web_lg`."
-    ) from e
-try:
-    from presidio_anonymizer import AnonymizerEngine
-    from presidio_anonymizer.entities import OperatorConfig
-except ImportError as e:
-    raise ImportError(
-        "Could not import presidio_anonymizer, please install with "
-        "`pip install presidio-anonymizer`."
-    ) from e
-
-if TYPE_CHECKING:
-    from presidio_analyzer import EntityRecognizer
+if TYPE_CHECKING:
+    from presidio_analyzer import AnalyzerEngine, EntityRecognizer
+    from presidio_analyzer.nlp_engine import NlpEngineProvider
+    from presidio_anonymizer import AnonymizerEngine
+    from presidio_anonymizer.entities import OperatorConfig
+
+
+def _import_analyzer_engine() -> "AnalyzerEngine":
+    try:
+        from presidio_analyzer import AnalyzerEngine
+    except ImportError as e:
+        raise ImportError(
+            "Could not import presidio_analyzer, please install with "
+            "`pip install presidio-analyzer`. You will also need to download a "
+            "spaCy model to use the analyzer, e.g. "
+            "`python -m spacy download en_core_web_lg`."
+        ) from e
+    return AnalyzerEngine
+
+
+def _import_nlp_engine_provider() -> "NlpEngineProvider":
+    try:
+        from presidio_analyzer.nlp_engine import NlpEngineProvider
+    except ImportError as e:
+        raise ImportError(
+            "Could not import presidio_analyzer, please install with "
+            "`pip install presidio-analyzer`. You will also need to download a "
+            "spaCy model to use the analyzer, e.g. "
+            "`python -m spacy download en_core_web_lg`."
+        ) from e
+    return NlpEngineProvider
+
+
+def _import_anonymizer_engine() -> "AnonymizerEngine":
+    try:
+        from presidio_anonymizer import AnonymizerEngine
+    except ImportError as e:
+        raise ImportError(
+            "Could not import presidio_anonymizer, please install with "
+            "`pip install presidio-anonymizer`."
+        ) from e
+    return AnonymizerEngine
+
+
+def _import_operator_config() -> "OperatorConfig":
+    try:
+        from presidio_anonymizer.entities import OperatorConfig
+    except ImportError as e:
+        raise ImportError(
+            "Could not import presidio_anonymizer, please install with "
+            "`pip install presidio-anonymizer`."
+        ) from e
+    return OperatorConfig
 
 # Configuring Anonymizer for multiple languages
 # Detailed description and examples can be found here:
@@ -89,6 +123,11 @@ class PresidioAnonymizerBase(AnonymizerBase):
                 Defaults to None, in which case faker will be seeded randomly
                 and provide random values.
         """
+        OperatorConfig = _import_operator_config()
+        AnalyzerEngine = _import_analyzer_engine()
+        NlpEngineProvider = _import_nlp_engine_provider()
+        AnonymizerEngine = _import_anonymizer_engine()
+
         self.analyzed_fields = (
             analyzed_fields
             if analyzed_fields is not None

View File

@@ -40,6 +40,11 @@ docker_tests:
 	docker build -t my-langchain-image:test .
 	docker run --rm my-langchain-image:test
 
+check_imports: langchain/**/*.py
+	for f in $^ ; do \
+		python -c "from importlib.machinery import SourceFileLoader; SourceFileLoader('x', '$$f').load_module()" || exit 1; \
+	done
+
 ######################
 # LINTING AND FORMATTING
 ######################
@@ -53,7 +58,7 @@ lint_tests: PYTHON_FILES=tests
 
 lint lint_diff lint_package lint_tests:
 	./scripts/check_pydantic.sh .
-	./scripts/check_imports.sh
+	./scripts/lint_imports.sh
 	poetry run ruff .
 	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff
 	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I $(PYTHON_FILES)

View File

@@ -5,7 +5,6 @@ import json
 from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 import requests
-import tiktoken
 from langchain_core.pydantic_v1 import BaseModel, Extra, root_validator
 
 from langchain.utils import get_from_dict_or_env
@@ -15,6 +14,18 @@ if TYPE_CHECKING:
     from github.PullRequest import PullRequest
 
 
+def _import_tiktoken() -> Any:
+    """Import tiktoken."""
+    try:
+        import tiktoken
+    except ImportError:
+        raise ImportError(
+            "tiktoken is not installed. "
+            "Please install it with `pip install tiktoken`"
+        )
+    return tiktoken
+
+
 class GitHubAPIWrapper(BaseModel):
     """Wrapper for GitHub API."""
@@ -385,6 +396,7 @@ class GitHubAPIWrapper(BaseModel):
             dict: A dictionary containing the issue's title,
             body, and comments as a string
         """
+        tiktoken = _import_tiktoken()
         MAX_TOKENS_FOR_FILES = 3_000
         pr_files = []
         pr = self.github_repo_instance.get_pull(number=int(pr_number))
@@ -453,6 +465,7 @@ class GitHubAPIWrapper(BaseModel):
         total_tokens = 0
 
         def get_tokens(text: str) -> int:
+            tiktoken = _import_tiktoken()
             return len(tiktoken.get_encoding("cl100k_base").encode(text))
 
         def add_to_dict(data_dict: Dict[str, Any], key: str, value: str) -> None:
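The net effect in github.py is that tiktoken moves from a module-level requirement to a call-time one: importing the utility no longer needs tiktoken, only the PR-summarization paths do. A condensed, self-contained sketch of the pattern (count_tokens is an invented name; the tiktoken calls mirror the ones used above):

from typing import Any


def _import_tiktoken() -> Any:
    """Import tiktoken lazily, failing with install instructions if absent."""
    try:
        import tiktoken
    except ImportError:
        raise ImportError(
            "tiktoken is not installed. Please install it with `pip install tiktoken`"
        )
    return tiktoken


def count_tokens(text: str) -> int:
    # Importing this module never needs tiktoken; calling this function does.
    tiktoken = _import_tiktoken()
    return len(tiktoken.get_encoding("cl100k_base").encode(text))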

View File

@@ -1,71 +0,0 @@
-from typing import Optional, Tuple
-
-import sqlalchemy
-from pgvector.sqlalchemy import Vector
-from sqlalchemy.dialects.postgresql import JSON, UUID
-from sqlalchemy.orm import Session, relationship
-
-from langchain.vectorstores.pgvector import BaseModel
-
-
-class CollectionStore(BaseModel):
-    """Collection store."""
-
-    __tablename__ = "langchain_pg_collection"
-
-    name = sqlalchemy.Column(sqlalchemy.String)
-    cmetadata = sqlalchemy.Column(JSON)
-
-    embeddings = relationship(
-        "EmbeddingStore",
-        back_populates="collection",
-        passive_deletes=True,
-    )
-
-    @classmethod
-    def get_by_name(cls, session: Session, name: str) -> Optional["CollectionStore"]:
-        return session.query(cls).filter(cls.name == name).first()  # type: ignore
-
-    @classmethod
-    def get_or_create(
-        cls,
-        session: Session,
-        name: str,
-        cmetadata: Optional[dict] = None,
-    ) -> Tuple["CollectionStore", bool]:
-        """
-        Get or create a collection.
-        Returns [Collection, bool] where the bool is True if the collection was created.
-        """
-        created = False
-        collection = cls.get_by_name(session, name)
-        if collection:
-            return collection, created
-
-        collection = cls(name=name, cmetadata=cmetadata)
-        session.add(collection)
-        session.commit()
-        created = True
-        return collection, created
-
-
-class EmbeddingStore(BaseModel):
-    """Embedding store."""
-
-    __tablename__ = "langchain_pg_embedding"
-
-    collection_id = sqlalchemy.Column(
-        UUID(as_uuid=True),
-        sqlalchemy.ForeignKey(
-            f"{CollectionStore.__tablename__}.uuid",
-            ondelete="CASCADE",
-        ),
-    )
-    collection = relationship(CollectionStore, back_populates="embeddings")
-
-    embedding: Vector = sqlalchemy.Column(Vector(None))
-    document = sqlalchemy.Column(sqlalchemy.String, nullable=True)
-    cmetadata = sqlalchemy.Column(JSON, nullable=True)
-
-    # custom_id : any user defined id
-    custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True)

View File

@@ -7,7 +7,6 @@ import logging
 import uuid
 from functools import partial
 from typing import (
-    TYPE_CHECKING,
     Any,
     Callable,
     Dict,
@@ -22,8 +21,8 @@ from typing import (
 import numpy as np
 import sqlalchemy
 from sqlalchemy import delete
-from sqlalchemy.dialects.postgresql import UUID
-from sqlalchemy.orm import Session
+from sqlalchemy.dialects.postgresql import JSON, UUID
+from sqlalchemy.orm import Session, relationship
 
 try:
     from sqlalchemy.orm import declarative_base
@@ -37,9 +36,6 @@ from langchain_core.vectorstores import VectorStore
 from langchain.utils import get_from_dict_or_env
 from langchain.vectorstores.utils import maximal_marginal_relevance
 
-if TYPE_CHECKING:
-    from langchain.vectorstores._pgvector_data_models import CollectionStore
-
 
 class DistanceStrategy(str, enum.Enum):
     """Enumerator of the Distance strategies."""
@@ -64,6 +60,74 @@ class BaseModel(Base):
     uuid = sqlalchemy.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
 
 
+class CollectionStore(BaseModel):
+    """Collection store."""
+
+    __tablename__ = "langchain_pg_collection"
+
+    name = sqlalchemy.Column(sqlalchemy.String)
+    cmetadata = sqlalchemy.Column(JSON)
+
+    embeddings = relationship(
+        "EmbeddingStore",
+        back_populates="collection",
+        passive_deletes=True,
+    )
+
+    @classmethod
+    def get_by_name(cls, session: Session, name: str) -> Optional["CollectionStore"]:
+        return session.query(cls).filter(cls.name == name).first()  # type: ignore
+
+    @classmethod
+    def get_or_create(
+        cls,
+        session: Session,
+        name: str,
+        cmetadata: Optional[dict] = None,
+    ) -> Tuple["CollectionStore", bool]:
+        """
+        Get or create a collection.
+        Returns [Collection, bool] where the bool is True if the collection was created.
+        """
+        created = False
+        collection = cls.get_by_name(session, name)
+        if collection:
+            return collection, created
+
+        collection = cls(name=name, cmetadata=cmetadata)
+        session.add(collection)
+        session.commit()
+        created = True
+        return collection, created
+
+
+def _get_embedding_store() -> Any:
+    from pgvector.sqlalchemy import Vector
+
+    class EmbeddingStore(BaseModel):
+        """Embedding store."""
+
+        __tablename__ = "langchain_pg_embedding"
+
+        collection_id = sqlalchemy.Column(
+            UUID(as_uuid=True),
+            sqlalchemy.ForeignKey(
+                f"{CollectionStore.__tablename__}.uuid",
+                ondelete="CASCADE",
+            ),
+        )
+        collection = relationship(CollectionStore, back_populates="embeddings")
+
+        embedding: Vector = sqlalchemy.Column(Vector(None))
+        document = sqlalchemy.Column(sqlalchemy.String, nullable=True)
+        cmetadata = sqlalchemy.Column(JSON, nullable=True)
+
+        # custom_id : any user defined id
+        custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True)
+
+    return EmbeddingStore
+
+
 def _results_to_docs(docs_and_scores: Any) -> List[Document]:
     """Return docs from docs and scores."""
     return [doc for doc, _ in docs_and_scores]
@@ -138,13 +202,9 @@ class PGVector(VectorStore):
     ) -> None:
         """Initialize the store."""
        self.create_vector_extension()
-        from langchain.vectorstores._pgvector_data_models import (
-            CollectionStore,
-            EmbeddingStore,
-        )
 
         self.CollectionStore = CollectionStore
-        self.EmbeddingStore = EmbeddingStore
+        self.EmbeddingStore = _get_embedding_store()
         self.create_tables_if_not_exists()
         self.create_collection()
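In pgvector.py, CollectionStore needs only sqlalchemy (a hard dependency), so it can live at module level, while EmbeddingStore needs the optional pgvector package for its Vector column and is therefore built lazily by _get_embedding_store() when PGVector is constructed. A stripped-down sketch of that factory pattern (WidgetStore, the table name, and the column set are placeholders invented here):

from typing import Any

import sqlalchemy
from sqlalchemy.orm import declarative_base

Base = declarative_base()


def _get_widget_store() -> Any:
    # The optional dependency is imported only when the factory runs,
    # so importing this module never requires pgvector.
    from pgvector.sqlalchemy import Vector

    class WidgetStore(Base):
        __tablename__ = "widget_store"

        id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
        embedding = sqlalchemy.Column(Vector(None))

    return WidgetStore


# Resolved only when a store is actually needed:
# WidgetStore = _get_widget_store()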