diff --git a/libs/partners/postgres/langchain_postgres/vectorstores.py b/libs/partners/postgres/langchain_postgres/vectorstores.py index e27e0cd6d24..30965ef12ed 100644 --- a/libs/partners/postgres/langchain_postgres/vectorstores.py +++ b/libs/partners/postgres/langchain_postgres/vectorstores.py @@ -3,7 +3,6 @@ from __future__ import annotations import contextlib import enum -import json import logging import uuid from typing import ( @@ -90,9 +89,8 @@ SUPPORTED_OPERATORS = ( ) -def _get_embedding_collection_store( - vector_dimension: Optional[int] = None, *, use_jsonb: bool = True -) -> Any: +def _get_embedding_collection_store(vector_dimension: int) -> Any: + """Get the Embedding and Collection store classes.""" global _classes if _classes is not None: return _classes @@ -141,60 +139,35 @@ def _get_embedding_collection_store( created = True return collection, created - if use_jsonb: - # TODO(PRIOR TO LANDING): Create a gin index on the cmetadata field - class EmbeddingStore(BaseModel): - """Embedding store.""" + class EmbeddingStore(BaseModel): + """Embedding store.""" - __tablename__ = "langchain_pg_embedding" + __tablename__ = "langchain_pg_embedding" - collection_id = sqlalchemy.Column( - UUID(as_uuid=True), - sqlalchemy.ForeignKey( - f"{CollectionStore.__tablename__}.uuid", - ondelete="CASCADE", - ), - ) - collection = relationship(CollectionStore, back_populates="embeddings") + collection_id = sqlalchemy.Column( + UUID(as_uuid=True), + sqlalchemy.ForeignKey( + f"{CollectionStore.__tablename__}.uuid", + ondelete="CASCADE", + ), + ) + collection = relationship(CollectionStore, back_populates="embeddings") - embedding: Vector = sqlalchemy.Column(Vector(vector_dimension)) - document = sqlalchemy.Column(sqlalchemy.String, nullable=True) - cmetadata = sqlalchemy.Column(JSONB, nullable=True) + embedding: Vector = sqlalchemy.Column(Vector(vector_dimension)) + document = sqlalchemy.Column(sqlalchemy.String, nullable=True) + cmetadata = sqlalchemy.Column(JSONB, nullable=True) - # custom_id : any user defined id - custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True) + # custom_id : any user defined id + custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True) - __table_args__ = ( - sqlalchemy.Index( - "ix_cmetadata_gin", - "cmetadata", - postgresql_using="gin", - postgresql_ops={"cmetadata": "jsonb_path_ops"}, - ), - ) - else: - # For backwards comaptibilty with older versions of pgvector - # This should be removed in the future (remove during migration) - class EmbeddingStore(BaseModel): # type: ignore[no-redef] - """Embedding store.""" - - __tablename__ = "langchain_pg_embedding" - - collection_id = sqlalchemy.Column( - UUID(as_uuid=True), - sqlalchemy.ForeignKey( - f"{CollectionStore.__tablename__}.uuid", - ondelete="CASCADE", - ), - ) - collection = relationship(CollectionStore, back_populates="embeddings") - - embedding: Vector = sqlalchemy.Column(Vector(vector_dimension)) - document = sqlalchemy.Column(sqlalchemy.String, nullable=True) - cmetadata = sqlalchemy.Column(JSON, nullable=True) - - # custom_id : any user defined id - custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True) + __table_args__ = ( + sqlalchemy.Index( + "ix_cmetadata_gin", + "cmetadata", + postgresql_using="gin", + postgresql_ops={"cmetadata": "jsonb_path_ops"}, + ), + ) _classes = (EmbeddingStore, CollectionStore) @@ -227,11 +200,6 @@ class PGVector(VectorStore): pre_delete_collection: If True, will delete the collection if it exists. (default: False). Useful for testing. engine_args: SQLAlchemy's create engine arguments. - use_jsonb: Use JSONB instead of JSON for metadata. (default: True) - Strongly discouraged from using JSON as it's not as efficient - for querying. - It's provided here for backwards compatibility with older versions, - and will be removed in the future. create_extension: If True, will create the vector extension if it doesn't exist. disabling creation is useful when using ReadOnly Databases. @@ -249,7 +217,6 @@ class PGVector(VectorStore): documents=docs, collection_name=COLLECTION_NAME, connection_string=CONNECTION_STRING, - use_jsonb=True, ) """ @@ -267,7 +234,6 @@ class PGVector(VectorStore): *, connection: Optional[sqlalchemy.engine.Connection] = None, engine_args: Optional[dict[str, Any]] = None, - use_jsonb: bool = False, create_extension: bool = True, ) -> None: """Initialize the PGVector store.""" @@ -282,30 +248,8 @@ class PGVector(VectorStore): self.override_relevance_score_fn = relevance_score_fn self.engine_args = engine_args or {} self._bind = connection if connection else self._create_engine() - self.use_jsonb = use_jsonb self.create_extension = create_extension - if not use_jsonb: - # Replace with a deprecation warning. - warn_deprecated( - "0.0.29", - pending=True, - message=( - "Please use JSONB instead of JSON for metadata. " - "This change will allow for more efficient querying that " - "involves filtering based on metadata." - "Please note that filtering operators have been changed " - "when using JSOB metadata to be prefixed with a $ sign " - "to avoid name collisions with columns. " - "If you're using an existing database, you will need to create a" - "db migration for your metadata column to be JSONB and update your " - "queries to use the new operators. " - ), - alternative=( - "Instantiate with use_jsonb=True to use JSONB instead " - "of JSON for metadata." - ), - ) self.__post_init__() def __post_init__( @@ -316,7 +260,7 @@ class PGVector(VectorStore): self.create_vector_extension() EmbeddingStore, CollectionStore = _get_embedding_collection_store( - self._embedding_length, use_jsonb=self.use_jsonb + self._embedding_length ) self.CollectionStore = CollectionStore self.EmbeddingStore = EmbeddingStore @@ -434,8 +378,6 @@ class PGVector(VectorStore): distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, connection_string: Optional[str] = None, pre_delete_collection: bool = False, - *, - use_jsonb: bool = False, **kwargs: Any, ) -> PGVector: if ids is None: @@ -452,7 +394,6 @@ class PGVector(VectorStore): embedding_function=embedding, distance_strategy=distance_strategy, pre_delete_collection=pre_delete_collection, - use_jsonb=use_jsonb, **kwargs, ) @@ -711,99 +652,6 @@ class PGVector(VectorStore): else: raise NotImplementedError() - def _create_filter_clause_deprecated(self, key, value): # type: ignore[no-untyped-def] - """Deprecated functionality. - - This is for backwards compatibility with the JSON based schema for metadata. - It uses incorrect operator syntax (operators are not prefixed with $). - - This implementation is not efficient, and has bugs associated with - the way that it handles numeric filter clauses. - """ - IN, NIN, BETWEEN, GT, LT, NE = "in", "nin", "between", "gt", "lt", "ne" - EQ, LIKE, CONTAINS, OR, AND = "eq", "like", "contains", "or", "and" - - value_case_insensitive = {k.lower(): v for k, v in value.items()} - if IN in map(str.lower, value): - filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext.in_( - value_case_insensitive[IN] - ) - elif NIN in map(str.lower, value): - filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext.not_in( - value_case_insensitive[NIN] - ) - elif BETWEEN in map(str.lower, value): - filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext.between( - str(value_case_insensitive[BETWEEN][0]), - str(value_case_insensitive[BETWEEN][1]), - ) - elif GT in map(str.lower, value): - filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext > str( - value_case_insensitive[GT] - ) - elif LT in map(str.lower, value): - filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext < str( - value_case_insensitive[LT] - ) - elif NE in map(str.lower, value): - filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext != str( - value_case_insensitive[NE] - ) - elif EQ in map(str.lower, value): - filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext == str( - value_case_insensitive[EQ] - ) - elif LIKE in map(str.lower, value): - filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext.like( - value_case_insensitive[LIKE] - ) - elif CONTAINS in map(str.lower, value): - filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext.contains( - value_case_insensitive[CONTAINS] - ) - elif OR in map(str.lower, value): - or_clauses = [ - self._create_filter_clause(key, sub_value) - for sub_value in value_case_insensitive[OR] - ] - filter_by_metadata = sqlalchemy.or_(*or_clauses) - elif AND in map(str.lower, value): - and_clauses = [ - self._create_filter_clause(key, sub_value) - for sub_value in value_case_insensitive[AND] - ] - filter_by_metadata = sqlalchemy.and_(*and_clauses) - - else: - filter_by_metadata = None - - return filter_by_metadata - - def _create_filter_clause_json_deprecated( - self, filter: Any - ) -> List[SQLColumnExpression]: - """Convert filters from IR to SQL clauses. - - **DEPRECATED** This functionality will be deprecated in the future. - - It implements translation of filters for a schema that uses JSON - for metadata rather than the JSONB field which is more efficient - for querying. - """ - filter_clauses = [] - for key, value in filter.items(): - if isinstance(value, dict): - filter_by_metadata = self._create_filter_clause_deprecated(key, value) - - if filter_by_metadata is not None: - filter_clauses.append(filter_by_metadata) - else: - filter_by_metadata = self.EmbeddingStore.cmetadata[key].astext == str( - value - ) - filter_clauses.append(filter_by_metadata) - return filter_clauses - def _create_filter_clause(self, filters: Any) -> Any: """Convert LangChain IR filter representation to matching SQLAlchemy clauses. @@ -904,14 +752,9 @@ class PGVector(VectorStore): filter_by = [self.EmbeddingStore.collection_id == collection.uuid] if filter: - if self.use_jsonb: - filter_clauses = self._create_filter_clause(filter) - if filter_clauses is not None: - filter_by.append(filter_clauses) - else: - # Old way of doing things - filter_clauses = self._create_filter_clause_json_deprecated(filter) - filter_by.extend(filter_clauses) + filter_clauses = self._create_filter_clause(filter) + if filter_clauses is not None: + filter_by.append(filter_clauses) _type = self.EmbeddingStore @@ -964,8 +807,6 @@ class PGVector(VectorStore): distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, ids: Optional[List[str]] = None, pre_delete_collection: bool = False, - *, - use_jsonb: bool = False, **kwargs: Any, ) -> PGVector: """ @@ -985,7 +826,6 @@ class PGVector(VectorStore): collection_name=collection_name, distance_strategy=distance_strategy, pre_delete_collection=pre_delete_collection, - use_jsonb=use_jsonb, **kwargs, ) @@ -1087,8 +927,6 @@ class PGVector(VectorStore): distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, ids: Optional[List[str]] = None, pre_delete_collection: bool = False, - *, - use_jsonb: bool = False, **kwargs: Any, ) -> PGVector: """ @@ -1112,7 +950,6 @@ class PGVector(VectorStore): metadatas=metadatas, ids=ids, collection_name=collection_name, - use_jsonb=use_jsonb, **kwargs, )