mirror of
https://github.com/hwchase17/langchain.git
synced 2026-04-05 03:48:48 +00:00
x
This commit is contained in:
@@ -3,7 +3,6 @@ from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import enum
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from typing import (
|
||||
@@ -90,9 +89,8 @@ SUPPORTED_OPERATORS = (
|
||||
)
|
||||
|
||||
|
||||
def _get_embedding_collection_store(
|
||||
vector_dimension: Optional[int] = None, *, use_jsonb: bool = True
|
||||
) -> Any:
|
||||
def _get_embedding_collection_store(vector_dimension: int) -> Any:
|
||||
"""Get the Embedding and Collection store classes."""
|
||||
global _classes
|
||||
if _classes is not None:
|
||||
return _classes
|
||||
@@ -141,60 +139,35 @@ def _get_embedding_collection_store(
|
||||
created = True
|
||||
return collection, created
|
||||
|
||||
if use_jsonb:
|
||||
# TODO(PRIOR TO LANDING): Create a gin index on the cmetadata field
|
||||
class EmbeddingStore(BaseModel):
|
||||
"""Embedding store."""
|
||||
class EmbeddingStore(BaseModel):
|
||||
"""Embedding store."""
|
||||
|
||||
__tablename__ = "langchain_pg_embedding"
|
||||
__tablename__ = "langchain_pg_embedding"
|
||||
|
||||
collection_id = sqlalchemy.Column(
|
||||
UUID(as_uuid=True),
|
||||
sqlalchemy.ForeignKey(
|
||||
f"{CollectionStore.__tablename__}.uuid",
|
||||
ondelete="CASCADE",
|
||||
),
|
||||
)
|
||||
collection = relationship(CollectionStore, back_populates="embeddings")
|
||||
collection_id = sqlalchemy.Column(
|
||||
UUID(as_uuid=True),
|
||||
sqlalchemy.ForeignKey(
|
||||
f"{CollectionStore.__tablename__}.uuid",
|
||||
ondelete="CASCADE",
|
||||
),
|
||||
)
|
||||
collection = relationship(CollectionStore, back_populates="embeddings")
|
||||
|
||||
embedding: Vector = sqlalchemy.Column(Vector(vector_dimension))
|
||||
document = sqlalchemy.Column(sqlalchemy.String, nullable=True)
|
||||
cmetadata = sqlalchemy.Column(JSONB, nullable=True)
|
||||
embedding: Vector = sqlalchemy.Column(Vector(vector_dimension))
|
||||
document = sqlalchemy.Column(sqlalchemy.String, nullable=True)
|
||||
cmetadata = sqlalchemy.Column(JSONB, nullable=True)
|
||||
|
||||
# custom_id : any user defined id
|
||||
custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True)
|
||||
# custom_id : any user defined id
|
||||
custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True)
|
||||
|
||||
__table_args__ = (
|
||||
sqlalchemy.Index(
|
||||
"ix_cmetadata_gin",
|
||||
"cmetadata",
|
||||
postgresql_using="gin",
|
||||
postgresql_ops={"cmetadata": "jsonb_path_ops"},
|
||||
),
|
||||
)
|
||||
else:
|
||||
# For backwards compatibility with older versions of pgvector
|
||||
# This should be removed in the future (remove during migration)
|
||||
class EmbeddingStore(BaseModel): # type: ignore[no-redef]
|
||||
"""Embedding store."""
|
||||
|
||||
__tablename__ = "langchain_pg_embedding"
|
||||
|
||||
collection_id = sqlalchemy.Column(
|
||||
UUID(as_uuid=True),
|
||||
sqlalchemy.ForeignKey(
|
||||
f"{CollectionStore.__tablename__}.uuid",
|
||||
ondelete="CASCADE",
|
||||
),
|
||||
)
|
||||
collection = relationship(CollectionStore, back_populates="embeddings")
|
||||
|
||||
embedding: Vector = sqlalchemy.Column(Vector(vector_dimension))
|
||||
document = sqlalchemy.Column(sqlalchemy.String, nullable=True)
|
||||
cmetadata = sqlalchemy.Column(JSON, nullable=True)
|
||||
|
||||
# custom_id : any user defined id
|
||||
custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True)
|
||||
__table_args__ = (
|
||||
sqlalchemy.Index(
|
||||
"ix_cmetadata_gin",
|
||||
"cmetadata",
|
||||
postgresql_using="gin",
|
||||
postgresql_ops={"cmetadata": "jsonb_path_ops"},
|
||||
),
|
||||
)
|
||||
|
||||
_classes = (EmbeddingStore, CollectionStore)
|
||||
|
||||
@@ -227,11 +200,6 @@ class PGVector(VectorStore):
|
||||
pre_delete_collection: If True, will delete the collection if it exists.
|
||||
(default: False). Useful for testing.
|
||||
engine_args: SQLAlchemy's create engine arguments.
|
||||
use_jsonb: Use JSONB instead of JSON for metadata. (default: True)
|
||||
Strongly discouraged from using JSON as it's not as efficient
|
||||
for querying.
|
||||
It's provided here for backwards compatibility with older versions,
|
||||
and will be removed in the future.
|
||||
create_extension: If True, will create the vector extension if it doesn't exist.
|
||||
disabling creation is useful when using ReadOnly Databases.
|
||||
|
||||
@@ -249,7 +217,6 @@ class PGVector(VectorStore):
|
||||
documents=docs,
|
||||
collection_name=COLLECTION_NAME,
|
||||
connection_string=CONNECTION_STRING,
|
||||
use_jsonb=True,
|
||||
)
|
||||
"""
|
||||
|
||||
@@ -267,7 +234,6 @@ class PGVector(VectorStore):
|
||||
*,
|
||||
connection: Optional[sqlalchemy.engine.Connection] = None,
|
||||
engine_args: Optional[dict[str, Any]] = None,
|
||||
use_jsonb: bool = False,
|
||||
create_extension: bool = True,
|
||||
) -> None:
|
||||
"""Initialize the PGVector store."""
|
||||
@@ -282,30 +248,8 @@ class PGVector(VectorStore):
|
||||
self.override_relevance_score_fn = relevance_score_fn
|
||||
self.engine_args = engine_args or {}
|
||||
self._bind = connection if connection else self._create_engine()
|
||||
self.use_jsonb = use_jsonb
|
||||
self.create_extension = create_extension
|
||||
|
||||
if not use_jsonb:
|
||||
# Replace with a deprecation warning.
|
||||
warn_deprecated(
|
||||
"0.0.29",
|
||||
pending=True,
|
||||
message=(
|
||||
"Please use JSONB instead of JSON for metadata. "
|
||||
"This change will allow for more efficient querying that "
|
||||
"involves filtering based on metadata."
|
||||
"Please note that filtering operators have been changed "
|
||||
"when using JSOB metadata to be prefixed with a $ sign "
|
||||
"to avoid name collisions with columns. "
|
||||
"If you're using an existing database, you will need to create a"
|
||||
"db migration for your metadata column to be JSONB and update your "
|
||||
"queries to use the new operators. "
|
||||
),
|
||||
alternative=(
|
||||
"Instantiate with use_jsonb=True to use JSONB instead "
|
||||
"of JSON for metadata."
|
||||
),
|
||||
)
|
||||
self.__post_init__()
|
||||
|
||||
def __post_init__(
|
||||
@@ -316,7 +260,7 @@ class PGVector(VectorStore):
|
||||
self.create_vector_extension()
|
||||
|
||||
EmbeddingStore, CollectionStore = _get_embedding_collection_store(
|
||||
self._embedding_length, use_jsonb=self.use_jsonb
|
||||
self._embedding_length
|
||||
)
|
||||
self.CollectionStore = CollectionStore
|
||||
self.EmbeddingStore = EmbeddingStore
|
||||
@@ -434,8 +378,6 @@ class PGVector(VectorStore):
|
||||
distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
|
||||
connection_string: Optional[str] = None,
|
||||
pre_delete_collection: bool = False,
|
||||
*,
|
||||
use_jsonb: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> PGVector:
|
||||
if ids is None:
|
||||
@@ -452,7 +394,6 @@ class PGVector(VectorStore):
|
||||
embedding_function=embedding,
|
||||
distance_strategy=distance_strategy,
|
||||
pre_delete_collection=pre_delete_collection,
|
||||
use_jsonb=use_jsonb,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -711,99 +652,6 @@ class PGVector(VectorStore):
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
def _create_filter_clause_deprecated(self, key, value):  # type: ignore[no-untyped-def]
    """Build one SQL filter clause for the legacy JSON metadata schema.

    Deprecated functionality.

    This is for backwards compatibility with the JSON based schema for metadata.
    It uses incorrect operator syntax (operators are not prefixed with $).

    This implementation is not efficient, and has bugs associated with
    the way that it handles numeric filter clauses.

    Args:
        key: Metadata field name, used to index ``EmbeddingStore.cmetadata``.
        value: Mapping of operator name to operand. Operator names are
            matched case-insensitively. When several operators are present
            only the first match is used, checked in this order:
            in, nin, between, gt, lt, ne, eq, like, contains, or, and.

    Returns:
        The clause built for the first recognised operator, or ``None`` when
        ``value`` contains no recognised operator.
    """
    IN, NIN, BETWEEN, GT, LT, NE = "in", "nin", "between", "gt", "lt", "ne"
    EQ, LIKE, CONTAINS, OR, AND = "eq", "like", "contains", "or", "and"

    # Normalise operator names once so every lookup below is case-insensitive.
    operands = {name.lower(): operand for name, operand in value.items()}

    if IN in operands:
        return self.EmbeddingStore.cmetadata[key].astext.in_(operands[IN])
    if NIN in operands:
        return self.EmbeddingStore.cmetadata[key].astext.not_in(operands[NIN])

    # From here on operands are passed through str() and compared against the
    # field's ``astext`` form.
    # NOTE(review): presumably this textual comparison is the numeric-handling
    # bug the docstring warns about -- confirm before relying on gt/lt/between.
    if BETWEEN in operands:
        return self.EmbeddingStore.cmetadata[key].astext.between(
            str(operands[BETWEEN][0]),
            str(operands[BETWEEN][1]),
        )
    if GT in operands:
        return self.EmbeddingStore.cmetadata[key].astext > str(operands[GT])
    if LT in operands:
        return self.EmbeddingStore.cmetadata[key].astext < str(operands[LT])
    if NE in operands:
        return self.EmbeddingStore.cmetadata[key].astext != str(operands[NE])
    if EQ in operands:
        return self.EmbeddingStore.cmetadata[key].astext == str(operands[EQ])
    if LIKE in operands:
        return self.EmbeddingStore.cmetadata[key].astext.like(operands[LIKE])
    if CONTAINS in operands:
        return self.EmbeddingStore.cmetadata[key].astext.contains(
            operands[CONTAINS]
        )

    # Nested combinators must recurse through this legacy builder: the
    # non-deprecated ``_create_filter_clause`` takes a single ``filters``
    # argument, so calling it with (key, sub_value) raises TypeError.
    if OR in operands:
        or_clauses = [
            self._create_filter_clause_deprecated(key, sub_value)
            for sub_value in operands[OR]
        ]
        return sqlalchemy.or_(*or_clauses)
    if AND in operands:
        and_clauses = [
            self._create_filter_clause_deprecated(key, sub_value)
            for sub_value in operands[AND]
        ]
        return sqlalchemy.and_(*and_clauses)

    # No recognised operator: the caller skips this key.
    return None
|
||||
|
||||
def _create_filter_clause_json_deprecated(
    self, filter: Any
) -> List[SQLColumnExpression]:
    """Translate a metadata filter mapping into a list of SQL clauses.

    **DEPRECATED** This functionality will be deprecated in the future.

    It targets the schema that stores metadata in a JSON column rather than
    the JSONB column, which is more efficient for querying.

    Args:
        filter: Mapping of metadata field name to either a plain value
            (matched by equality against its ``str()`` form) or a dict of
            operators handled by ``_create_filter_clause_deprecated``.

    Returns:
        One clause per entry, in iteration order. Dict entries for which the
        operator builder returns ``None`` are left out.
    """
    clauses = []
    for field_name, condition in filter.items():
        if not isinstance(condition, dict):
            # Plain value: equality on the text form of the metadata field.
            clauses.append(
                self.EmbeddingStore.cmetadata[field_name].astext == str(condition)
            )
            continue

        # Operator dict: delegate, and drop entries with no recognised operator.
        clause = self._create_filter_clause_deprecated(field_name, condition)
        if clause is not None:
            clauses.append(clause)
    return clauses
|
||||
|
||||
def _create_filter_clause(self, filters: Any) -> Any:
|
||||
"""Convert LangChain IR filter representation to matching SQLAlchemy clauses.
|
||||
|
||||
@@ -904,14 +752,9 @@ class PGVector(VectorStore):
|
||||
|
||||
filter_by = [self.EmbeddingStore.collection_id == collection.uuid]
|
||||
if filter:
|
||||
if self.use_jsonb:
|
||||
filter_clauses = self._create_filter_clause(filter)
|
||||
if filter_clauses is not None:
|
||||
filter_by.append(filter_clauses)
|
||||
else:
|
||||
# Old way of doing things
|
||||
filter_clauses = self._create_filter_clause_json_deprecated(filter)
|
||||
filter_by.extend(filter_clauses)
|
||||
filter_clauses = self._create_filter_clause(filter)
|
||||
if filter_clauses is not None:
|
||||
filter_by.append(filter_clauses)
|
||||
|
||||
_type = self.EmbeddingStore
|
||||
|
||||
@@ -964,8 +807,6 @@ class PGVector(VectorStore):
|
||||
distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
|
||||
ids: Optional[List[str]] = None,
|
||||
pre_delete_collection: bool = False,
|
||||
*,
|
||||
use_jsonb: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> PGVector:
|
||||
"""
|
||||
@@ -985,7 +826,6 @@ class PGVector(VectorStore):
|
||||
collection_name=collection_name,
|
||||
distance_strategy=distance_strategy,
|
||||
pre_delete_collection=pre_delete_collection,
|
||||
use_jsonb=use_jsonb,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -1087,8 +927,6 @@ class PGVector(VectorStore):
|
||||
distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
|
||||
ids: Optional[List[str]] = None,
|
||||
pre_delete_collection: bool = False,
|
||||
*,
|
||||
use_jsonb: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> PGVector:
|
||||
"""
|
||||
@@ -1112,7 +950,6 @@ class PGVector(VectorStore):
|
||||
metadatas=metadatas,
|
||||
ids=ids,
|
||||
collection_name=collection_name,
|
||||
use_jsonb=use_jsonb,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user