Add delete support to PGVector + add modification time to Confluence and Google Drive docs (#9604)

Description:
- Add a `delete` (delete by ids) implementation for PGVector.
- Add the modification time to document metadata for the Confluence and
Google Drive loaders (see the usage sketch below).
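
Together, the two changes make simple incremental re-indexing possible: compare each source document's `when` timestamp against what was last indexed, delete the stale vectors by id, and re-add them. A minimal sketch under assumptions, not part of this PR: `docs` come from a loader that now emits `when` (Confluence-style metadata with an `id` field), `last_indexed` is a caller-maintained id-to-timestamp mapping, and the connection string and collection name are placeholders.

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector

store = PGVector(
    connection_string="postgresql+psycopg2://user:pass@localhost:5432/vectordb",  # placeholder
    embedding_function=OpenAIEmbeddings(),
    collection_name="confluence_docs",  # placeholder
)

# Confluence "when" and Drive "modifiedTime" are ISO-8601 strings, so
# lexicographic comparison matches chronological order.
stale = [
    d for d in docs
    if d.metadata.get("when", "") > last_indexed.get(d.metadata["id"], "")
]
if stale:
    ids = [d.metadata["id"] for d in stale]
    store.delete(ids)                    # new in this PR: drop the outdated vectors
    store.add_documents(stale, ids=ids)  # re-embed and re-insert the fresh content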

Issue:
https://github.com/langchain-ai/langchain/issues/9312

Tag maintainer: @baskaryan, @eyurtsev, @hwchase17, @rlancemartin.

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
Naama Magami 2023-08-25 07:09:30 +03:00 committed by GitHub
parent 3e5cda3405
commit adb21782b8
4 changed files with 77 additions and 7 deletions


@@ -338,7 +338,9 @@ class ConfluenceLoader(BaseLoader):
),
before_sleep=before_sleep_log(logger, logging.WARNING),
)(self.confluence.get_page_by_id)
page = get_page(page_id=page_id, expand=content_format.value)
page = get_page(
page_id=page_id, expand=f"{content_format.value},version"
)
if not include_restricted_content and not self.is_public_page(page):
continue
doc = self.process_page(
@@ -505,13 +507,18 @@ class ConfluenceLoader(BaseLoader):
]
text = text + "".join(comment_texts)
metadata = {
"title": page["title"],
"id": page["id"],
"source": self.base_url.strip("/") + page["_links"]["webui"],
}
if "version" in page and "when" in page["version"]:
metadata["when"] = page["version"]["when"]
return Document(
page_content=text,
metadata={
"title": page["title"],
"id": page["id"],
"source": self.base_url.strip("/") + page["_links"]["webui"],
},
metadata=metadata,
)
def process_attachment(
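
Passing `expand=f"{content_format.value},version"` asks Confluence's REST API to include the page's `version` block in the response; its `when` field is the last-modified timestamp, and `process_page` only copies it into metadata when it is present, so responses without version info keep working. A rough usage sketch, with the URL, credentials, and space key as placeholders:

from langchain.document_loaders import ConfluenceLoader

loader = ConfluenceLoader(
    url="https://example.atlassian.net/wiki",  # placeholder instance
    username="me@example.com",                 # placeholder credentials
    api_key="...",
)
docs = loader.load(space_key="ENG", limit=50)  # placeholder space key

for doc in docs:
    # "when" is set only if the API returned version info for the page.
    print(doc.metadata["id"], doc.metadata.get("when", "<no modification time>"))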


@@ -200,7 +200,11 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
creds = self._load_credentials()
service = build("drive", "v3", credentials=creds)
file = service.files().get(fileId=id, supportsAllDrives=True).execute()
file = (
service.files()
.get(fileId=id, supportsAllDrives=True, fields="modifiedTime,name")
.execute()
)
request = service.files().export_media(fileId=id, mimeType="text/plain")
fh = BytesIO()
downloader = MediaIoBaseDownload(fh, request)
@@ -219,6 +223,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
metadata = {
"source": f"https://docs.google.com/document/d/{id}/edit",
"title": f"{file.get('name')}",
"when": f"{file.get('modifiedTime')}",
}
return Document(page_content=text, metadata=metadata)
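
On the Drive side, the `fields="modifiedTime,name"` argument tells the Drive v3 API which file attributes to return; `modifiedTime` is an RFC 3339 timestamp that now surfaces as the `when` metadata key. A rough sketch, assuming a Google Docs document id and OAuth credential/token paths (all placeholders):

from langchain.document_loaders import GoogleDriveLoader

loader = GoogleDriveLoader(
    document_ids=["<google-doc-id>"],     # placeholder document id
    credentials_path="credentials.json",  # placeholder OAuth client secrets
    token_path="token.json",              # placeholder cached token
)
docs = loader.load()
print(docs[0].metadata["when"])  # e.g. "2023-08-20T14:03:11.123Z"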


@@ -1,5 +1,6 @@
from __future__ import annotations
import contextlib
import enum
import logging
import uuid
@@ -8,6 +9,7 @@ from typing import (
Any,
Callable,
Dict,
Generator,
Iterable,
List,
Optional,
@@ -16,6 +18,7 @@ from typing import (
)
import sqlalchemy
from sqlalchemy import delete
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import Session, declarative_base
@@ -170,6 +173,33 @@ class PGVector(VectorStore):
session.delete(collection)
session.commit()
@contextlib.contextmanager
def _make_session(self) -> Generator[Session, None, None]:
"""Create a context manager for the session, bind to _conn string."""
yield Session(self._conn)
def delete(
self,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> None:
"""Delete vectors by ids or uuids.
Args:
ids: List of ids to delete.
"""
with Session(self._conn) as session:
if ids is not None:
self.logger.debug(
"Trying to delete vectors by ids (represented by the model "
"using the custom ids field)"
)
stmt = delete(self.EmbeddingStore).where(
self.EmbeddingStore.custom_id.in_(ids)
)
session.execute(stmt)
session.commit()
def get_collection(self, session: Session) -> Optional["CollectionStore"]:
return self.CollectionStore.get_by_name(session, self.collection_name)
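
The new `delete` matches the given ids against the `custom_id` column, i.e. the same ids supplied through the `ids` kwarg of `add_texts`/`from_texts`; ids that do not exist are silently skipped, and calling it without ids currently leaves the store untouched. A small sketch mirroring the integration test below (embedding class, collection name, and connection string are placeholders):

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector

store = PGVector.from_texts(
    texts=["foo", "bar", "baz"],
    embedding=OpenAIEmbeddings(),       # any Embeddings implementation works
    ids=["1", "2", "3"],
    collection_name="demo_collection",  # placeholder
    connection_string="postgresql+psycopg2://user:pass@localhost:5432/vectordb",  # placeholder
)
store.delete(["1", "2"])   # rows whose custom_id is "1" or "2" are removed
store.delete(["2", "42"])  # unknown ids are ignored; no error is raised
store.delete()             # ids=None is a no-op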


@@ -186,6 +186,34 @@ def test_pgvector_with_filter_in_set() -> None:
]
def test_pgvector_delete_docs() -> None:
"""Add and delete documents."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": str(i)} for i in range(len(texts))]
docsearch = PGVector.from_texts(
texts=texts,
collection_name="test_collection_filter",
embedding=FakeEmbeddingsWithAdaDimension(),
metadatas=metadatas,
ids=["1", "2", "3"],
connection_string=CONNECTION_STRING,
pre_delete_collection=True,
)
docsearch.delete(["1", "2"])
with docsearch._make_session() as session:
records = list(session.query(docsearch.EmbeddingStore).all())
# ignoring type error since mypy cannot determine whether
# the list is sortable
assert sorted(record.custom_id for record in records) == ["3"] # type: ignore
docsearch.delete(["2", "3"]) # Should not raise on missing ids
with docsearch._make_session() as session:
records = list(session.query(docsearch.EmbeddingStore).all())
# ignoring type error since mypy cannot determine whether
# the list is sortable
assert sorted(record.custom_id for record in records) == [] # type: ignore
def test_pgvector_relevance_score() -> None:
"""Test to make sure the relevance score is scaled to 0-1."""
texts = ["foo", "bar", "baz"]