core[patch], langchain[patch], experimental[patch]: import CI (#14414)

Erick Friis 2023-12-08 11:28:55 -08:00 committed by GitHub
parent ba083887e5
commit b3f226e8f8
12 changed files with 177 additions and 113 deletions

View File

@@ -52,8 +52,8 @@ jobs:
       working-directory: ${{ inputs.working-directory }}
     secrets: inherit
-  pydantic-compatibility:
-    uses: ./.github/workflows/_pydantic_compatibility.yml
+  dependencies:
+    uses: ./.github/workflows/_dependencies.yml
     with:
       working-directory: ${{ inputs.working-directory }}
     secrets: inherit
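
The caller workflow above swaps the reusable pydantic-compatibility job for the renamed `_dependencies.yml` workflow (shown next); the `working-directory` input and secrets wiring are unchanged.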

View File

@@ -1,4 +1,4 @@
-name: pydantic v1/v2 compatibility
+name: dependencies

 on:
   workflow_call:
@@ -28,7 +28,7 @@ jobs:
           - "3.9"
           - "3.10"
           - "3.11"
-    name: Pydantic v1/v2 compatibility - Python ${{ matrix.python-version }}
+    name: dependencies - Python ${{ matrix.python-version }}
     steps:
       - uses: actions/checkout@v4
@@ -42,7 +42,15 @@ jobs:
       - name: Install dependencies
        shell: bash
-        run: poetry install --with test
+        run: poetry install
+      - name: Check imports with base dependencies
+        shell: bash
+        run: poetry run make check_imports
+      - name: Install test dependencies
+        shell: bash
+        run: poetry install --with test
      - name: Install langchain editable
        working-directory: ${{ inputs.working-directory }}
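
The step ordering in the renamed workflow is the point of this PR: `poetry install` now pulls in only the package's base dependencies, `make check_imports` verifies that every module can be imported in that minimal environment, and only then are the test extras installed. As a rough sketch of what such an import smoke test does (the `langchain_core` path is just an example directory, and this loop is an illustration rather than the exact CI code):

# Sketch: try to import every .py file in a package using only base dependencies.
import sys
from importlib.machinery import SourceFileLoader
from pathlib import Path

pkg_dir = Path("langchain_core")  # example package directory

for path in sorted(pkg_dir.rglob("*.py")):
    try:
        # Load each file as a throwaway module, as the Makefile one-liner does.
        SourceFileLoader("x", str(path)).load_module()
    except Exception as exc:
        print(f"import failed for {path}: {exc}", file=sys.stderr)
        sys.exit(1)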

View File

@@ -15,6 +15,10 @@ tests:
 test_watch:
 	poetry run ptw --snapshot-update --now . -- -vv -x tests/unit_tests

+check_imports: langchain_core/**/*.py
+	for f in $^ ; do \
+		python -c "from importlib.machinery import SourceFileLoader; SourceFileLoader('x', '$$f').load_module()" || exit 1; \
+	done
 extended_tests:
 	poetry run pytest --only-extended $(TEST_FILE)
@@ -32,7 +36,7 @@ lint_tests: PYTHON_FILES=tests
 lint lint_diff lint_package lint_tests:
 	./scripts/check_pydantic.sh .
-	./scripts/check_imports.sh
+	./scripts/lint_imports.sh
 	poetry run ruff .
 	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff
 	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I $(PYTHON_FILES)

View File

@@ -21,6 +21,11 @@ extended_tests:
 integration_tests:
 	poetry run pytest tests/integration_tests

+check_imports: langchain_experimental/**/*.py
+	for f in $^ ; do \
+		python -c "from importlib.machinery import SourceFileLoader; SourceFileLoader('x', '$$f').load_module()" || exit 1; \
+	done
+
 ######################
 # LINTING AND FORMATTING

View File

@@ -1,10 +1,11 @@
 import re
 from collections import defaultdict
 from dataclasses import dataclass, field
-from typing import Dict, List
+from typing import TYPE_CHECKING, Dict, List

-from presidio_analyzer import RecognizerResult
-from presidio_anonymizer.entities import EngineResult
+if TYPE_CHECKING:
+    from presidio_analyzer import RecognizerResult
+    from presidio_anonymizer.entities import EngineResult

 MappingDataType = Dict[str, Dict[str, str]]
@@ -62,8 +63,8 @@ class DeanonymizerMapping:

 def create_anonymizer_mapping(
     original_text: str,
-    analyzer_results: List[RecognizerResult],
-    anonymizer_results: EngineResult,
+    analyzer_results: List["RecognizerResult"],
+    anonymizer_results: "EngineResult",
     is_reversed: bool = False,
 ) -> MappingDataType:
     """Creates or updates the mapping used to anonymize and/or deanonymize text.

View File

@@ -23,28 +23,62 @@ from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
     get_pseudoanonymizer_mapping,
 )

-try:
-    from presidio_analyzer import AnalyzerEngine
+if TYPE_CHECKING:
+    from presidio_analyzer import AnalyzerEngine, EntityRecognizer
     from presidio_analyzer.nlp_engine import NlpEngineProvider
-except ImportError as e:
-    raise ImportError(
-        "Could not import presidio_analyzer, please install with "
-        "`pip install presidio-analyzer`. You will also need to download a "
-        "spaCy model to use the analyzer, e.g. "
-        "`python -m spacy download en_core_web_lg`."
-    ) from e
-try:
     from presidio_anonymizer import AnonymizerEngine
     from presidio_anonymizer.entities import OperatorConfig
-except ImportError as e:
-    raise ImportError(
-        "Could not import presidio_anonymizer, please install with "
-        "`pip install presidio-anonymizer`."
-    ) from e
-
-if TYPE_CHECKING:
-    from presidio_analyzer import EntityRecognizer
+
+
+def _import_analyzer_engine() -> "AnalyzerEngine":
+    try:
+        from presidio_analyzer import AnalyzerEngine
+    except ImportError as e:
+        raise ImportError(
+            "Could not import presidio_analyzer, please install with "
+            "`pip install presidio-analyzer`. You will also need to download a "
+            "spaCy model to use the analyzer, e.g. "
+            "`python -m spacy download en_core_web_lg`."
+        ) from e
+    return AnalyzerEngine
+
+
+def _import_nlp_engine_provider() -> "NlpEngineProvider":
+    try:
+        from presidio_analyzer.nlp_engine import NlpEngineProvider
+    except ImportError as e:
+        raise ImportError(
+            "Could not import presidio_analyzer, please install with "
+            "`pip install presidio-analyzer`. You will also need to download a "
+            "spaCy model to use the analyzer, e.g. "
+            "`python -m spacy download en_core_web_lg`."
+        ) from e
+    return NlpEngineProvider
+
+
+def _import_anonymizer_engine() -> "AnonymizerEngine":
+    try:
+        from presidio_anonymizer import AnonymizerEngine
+    except ImportError as e:
+        raise ImportError(
+            "Could not import presidio_anonymizer, please install with "
+            "`pip install presidio-anonymizer`."
+        ) from e
+    return AnonymizerEngine
+
+
+def _import_operator_config() -> "OperatorConfig":
+    try:
+        from presidio_anonymizer.entities import OperatorConfig
+    except ImportError as e:
+        raise ImportError(
+            "Could not import presidio_anonymizer, please install with "
+            "`pip install presidio-anonymizer`."
+        ) from e
+    return OperatorConfig
+

 # Configuring Anonymizer for multiple languages
 # Detailed description and examples can be found here:
@@ -89,6 +123,11 @@ class PresidioAnonymizerBase(AnonymizerBase):
         Defaults to None, in which case faker will be seeded randomly
         and provide random values.
         """
+        OperatorConfig = _import_operator_config()
+        AnalyzerEngine = _import_analyzer_engine()
+        NlpEngineProvider = _import_nlp_engine_provider()
+        AnonymizerEngine = _import_anonymizer_engine()
+
         self.analyzed_fields = (
             analyzed_fields
             if analyzed_fields is not None
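
With the imports wrapped in `_import_*` helpers, `import langchain_experimental.data_anonymizer.presidio` now succeeds even when presidio is not installed; the ImportError surfaces only when `PresidioAnonymizerBase` is instantiated, which is when the helpers run. The shape of the pattern, condensed with stand-in names:

def _import_engine():
    # Import deferred until first use; module import stays dependency-free.
    try:
        from heavy_lib import Engine  # stand-in for an optional dependency
    except ImportError as e:
        raise ImportError("Please install heavy_lib to use this feature.") from e
    return Engine

class Wrapper:
    def __init__(self) -> None:
        Engine = _import_engine()  # fails here, not at module import time
        self._engine = Engine()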

View File

@@ -40,6 +40,11 @@ docker_tests:
 	docker build -t my-langchain-image:test .
 	docker run --rm my-langchain-image:test

+check_imports: langchain/**/*.py
+	for f in $^ ; do \
+		python -c "from importlib.machinery import SourceFileLoader; SourceFileLoader('x', '$$f').load_module()" || exit 1; \
+	done
+
 ######################
 # LINTING AND FORMATTING
 ######################
@@ -53,7 +58,7 @@ lint_tests: PYTHON_FILES=tests
 lint lint_diff lint_package lint_tests:
 	./scripts/check_pydantic.sh .
-	./scripts/check_imports.sh
+	./scripts/lint_imports.sh
 	poetry run ruff .
 	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff
 	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I $(PYTHON_FILES)

View File

@@ -5,7 +5,6 @@ import json
 from typing import TYPE_CHECKING, Any, Dict, List, Optional

 import requests
-import tiktoken
 from langchain_core.pydantic_v1 import BaseModel, Extra, root_validator

 from langchain.utils import get_from_dict_or_env
@@ -15,6 +14,18 @@ if TYPE_CHECKING:
     from github.PullRequest import PullRequest


+def _import_tiktoken() -> Any:
+    """Import tiktoken."""
+    try:
+        import tiktoken
+    except ImportError:
+        raise ImportError(
+            "tiktoken is not installed. "
+            "Please install it with `pip install tiktoken`"
+        )
+    return tiktoken
+
+
 class GitHubAPIWrapper(BaseModel):
     """Wrapper for GitHub API."""
@@ -385,6 +396,7 @@ class GitHubAPIWrapper(BaseModel):
             dict: A dictionary containing the issue's title,
             body, and comments as a string
         """
+        tiktoken = _import_tiktoken()
         MAX_TOKENS_FOR_FILES = 3_000
         pr_files = []
         pr = self.github_repo_instance.get_pull(number=int(pr_number))
@@ -453,6 +465,7 @@ class GitHubAPIWrapper(BaseModel):
         total_tokens = 0

         def get_tokens(text: str) -> int:
+            tiktoken = _import_tiktoken()
             return len(tiktoken.get_encoding("cl100k_base").encode(text))

         def add_to_dict(data_dict: Dict[str, Any], key: str, value: str) -> None:
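
`get_tokens` re-runs `_import_tiktoken()` on every call; that is cheap because Python caches imported modules in sys.modules, so repeating the try/except is mostly a style choice. A common variant, not used in this commit, memoizes the helper instead:

from functools import lru_cache
from typing import Any

@lru_cache(maxsize=1)
def _import_tiktoken_cached() -> Any:
    # Memoized stand-in for the helper above; raises on first call if missing.
    import tiktoken

    return tiktoken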

View File

@@ -1,71 +0,0 @@
-from typing import Optional, Tuple
-
-import sqlalchemy
-from pgvector.sqlalchemy import Vector
-from sqlalchemy.dialects.postgresql import JSON, UUID
-from sqlalchemy.orm import Session, relationship
-
-from langchain.vectorstores.pgvector import BaseModel
-
-
-class CollectionStore(BaseModel):
-    """Collection store."""
-
-    __tablename__ = "langchain_pg_collection"
-
-    name = sqlalchemy.Column(sqlalchemy.String)
-    cmetadata = sqlalchemy.Column(JSON)
-
-    embeddings = relationship(
-        "EmbeddingStore",
-        back_populates="collection",
-        passive_deletes=True,
-    )
-
-    @classmethod
-    def get_by_name(cls, session: Session, name: str) -> Optional["CollectionStore"]:
-        return session.query(cls).filter(cls.name == name).first()  # type: ignore
-
-    @classmethod
-    def get_or_create(
-        cls,
-        session: Session,
-        name: str,
-        cmetadata: Optional[dict] = None,
-    ) -> Tuple["CollectionStore", bool]:
-        """
-        Get or create a collection.
-        Returns [Collection, bool] where the bool is True if the collection was created.
-        """
-        created = False
-        collection = cls.get_by_name(session, name)
-        if collection:
-            return collection, created
-
-        collection = cls(name=name, cmetadata=cmetadata)
-        session.add(collection)
-        session.commit()
-        created = True
-        return collection, created
-
-
-class EmbeddingStore(BaseModel):
-    """Embedding store."""
-
-    __tablename__ = "langchain_pg_embedding"
-
-    collection_id = sqlalchemy.Column(
-        UUID(as_uuid=True),
-        sqlalchemy.ForeignKey(
-            f"{CollectionStore.__tablename__}.uuid",
-            ondelete="CASCADE",
-        ),
-    )
-    collection = relationship(CollectionStore, back_populates="embeddings")
-
-    embedding: Vector = sqlalchemy.Column(Vector(None))
-    document = sqlalchemy.Column(sqlalchemy.String, nullable=True)
-    cmetadata = sqlalchemy.Column(JSON, nullable=True)
-
-    # custom_id : any user defined id
-    custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True)
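
The deleted module imported `pgvector.sqlalchemy` at the top level, so the file itself could never pass the new import check without the optional pgvector package. Its two models move into `pgvector.py` (next file), where the `Vector` import is deferred into a `_get_embedding_store()` factory.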

View File

@@ -7,7 +7,6 @@ import logging
 import uuid
 from functools import partial
 from typing import (
-    TYPE_CHECKING,
     Any,
     Callable,
     Dict,
@@ -22,8 +21,8 @@ from typing import (

 import numpy as np
 import sqlalchemy
 from sqlalchemy import delete
-from sqlalchemy.dialects.postgresql import UUID
-from sqlalchemy.orm import Session
+from sqlalchemy.dialects.postgresql import JSON, UUID
+from sqlalchemy.orm import Session, relationship

 try:
     from sqlalchemy.orm import declarative_base
@@ -37,9 +36,6 @@ from langchain_core.vectorstores import VectorStore
 from langchain.utils import get_from_dict_or_env
 from langchain.vectorstores.utils import maximal_marginal_relevance

-if TYPE_CHECKING:
-    from langchain.vectorstores._pgvector_data_models import CollectionStore
-

 class DistanceStrategy(str, enum.Enum):
     """Enumerator of the Distance strategies."""
@@ -64,6 +60,74 @@ class BaseModel(Base):
     uuid = sqlalchemy.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)


+class CollectionStore(BaseModel):
+    """Collection store."""
+
+    __tablename__ = "langchain_pg_collection"
+
+    name = sqlalchemy.Column(sqlalchemy.String)
+    cmetadata = sqlalchemy.Column(JSON)
+
+    embeddings = relationship(
+        "EmbeddingStore",
+        back_populates="collection",
+        passive_deletes=True,
+    )
+
+    @classmethod
+    def get_by_name(cls, session: Session, name: str) -> Optional["CollectionStore"]:
+        return session.query(cls).filter(cls.name == name).first()  # type: ignore
+
+    @classmethod
+    def get_or_create(
+        cls,
+        session: Session,
+        name: str,
+        cmetadata: Optional[dict] = None,
+    ) -> Tuple["CollectionStore", bool]:
+        """
+        Get or create a collection.
+        Returns [Collection, bool] where the bool is True if the collection was created.
+        """
+        created = False
+        collection = cls.get_by_name(session, name)
+        if collection:
+            return collection, created
+
+        collection = cls(name=name, cmetadata=cmetadata)
+        session.add(collection)
+        session.commit()
+        created = True
+        return collection, created
+
+
+def _get_embedding_store() -> Any:
+    from pgvector.sqlalchemy import Vector
+
+    class EmbeddingStore(BaseModel):
+        """Embedding store."""
+
+        __tablename__ = "langchain_pg_embedding"
+
+        collection_id = sqlalchemy.Column(
+            UUID(as_uuid=True),
+            sqlalchemy.ForeignKey(
+                f"{CollectionStore.__tablename__}.uuid",
+                ondelete="CASCADE",
+            ),
+        )
+        collection = relationship(CollectionStore, back_populates="embeddings")
+
+        embedding: Vector = sqlalchemy.Column(Vector(None))
+        document = sqlalchemy.Column(sqlalchemy.String, nullable=True)
+        cmetadata = sqlalchemy.Column(JSON, nullable=True)
+
+        # custom_id : any user defined id
+        custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True)
+
+    return EmbeddingStore
+
+
 def _results_to_docs(docs_and_scores: Any) -> List[Document]:
     """Return docs from docs and scores."""
     return [doc for doc, _ in docs_and_scores]
@@ -138,13 +202,9 @@ class PGVector(VectorStore):
     ) -> None:
         """Initialize the store."""
         self.create_vector_extension()
-        from langchain.vectorstores._pgvector_data_models import (
-            CollectionStore,
-            EmbeddingStore,
-        )

         self.CollectionStore = CollectionStore
-        self.EmbeddingStore = EmbeddingStore
+        self.EmbeddingStore = _get_embedding_store()
         self.create_tables_if_not_exists()
         self.create_collection()
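
Defining `EmbeddingStore` inside `_get_embedding_store()` postpones `from pgvector.sqlalchemy import Vector` until a `PGVector` store is actually constructed, so the module now imports cleanly with base dependencies; note that each call defines a fresh class object. The pattern, condensed with stand-in names:

from typing import Any

def _get_model() -> Any:
    from heavy_lib import SpecialType  # stand-in for an optional dependency

    class Model:
        # Defined lazily so importing this module never touches heavy_lib.
        special = SpecialType

    return Model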