Add delete-by-ids support to PGVector, and add modification time to Confluence and Google Drive doc metadata (#9604)

Description:
- Add an implementation of `delete` (by ids) for the PGVector vector store.
- Add the document modification time (`when`) to the metadata produced by the
Confluence and Google Drive loaders.

Issue:
https://github.com/langchain-ai/langchain/issues/9312

Tag maintainer: @baskaryan, @eyurtsev, @hwchase17, @rlancemartin.

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Naama Magami 2023-08-25 07:09:30 +03:00 committed by GitHub
parent 3e5cda3405
commit adb21782b8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 77 additions and 7 deletions

View File

@ -338,7 +338,9 @@ class ConfluenceLoader(BaseLoader):
), ),
before_sleep=before_sleep_log(logger, logging.WARNING), before_sleep=before_sleep_log(logger, logging.WARNING),
)(self.confluence.get_page_by_id) )(self.confluence.get_page_by_id)
page = get_page(page_id=page_id, expand=content_format.value) page = get_page(
page_id=page_id, expand=f"{content_format.value},version"
)
if not include_restricted_content and not self.is_public_page(page): if not include_restricted_content and not self.is_public_page(page):
continue continue
doc = self.process_page( doc = self.process_page(
@ -505,13 +507,18 @@ class ConfluenceLoader(BaseLoader):
] ]
text = text + "".join(comment_texts) text = text + "".join(comment_texts)
metadata = {
"title": page["title"],
"id": page["id"],
"source": self.base_url.strip("/") + page["_links"]["webui"],
}
if "version" in page and "when" in page["version"]:
metadata["when"] = page["version"]["when"]
return Document( return Document(
page_content=text, page_content=text,
metadata={ metadata=metadata,
"title": page["title"],
"id": page["id"],
"source": self.base_url.strip("/") + page["_links"]["webui"],
},
) )
def process_attachment( def process_attachment(

View File

@ -200,7 +200,11 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
creds = self._load_credentials() creds = self._load_credentials()
service = build("drive", "v3", credentials=creds) service = build("drive", "v3", credentials=creds)
file = service.files().get(fileId=id, supportsAllDrives=True).execute() file = (
service.files()
.get(fileId=id, supportsAllDrives=True, fields="modifiedTime,name")
.execute()
)
request = service.files().export_media(fileId=id, mimeType="text/plain") request = service.files().export_media(fileId=id, mimeType="text/plain")
fh = BytesIO() fh = BytesIO()
downloader = MediaIoBaseDownload(fh, request) downloader = MediaIoBaseDownload(fh, request)
@ -219,6 +223,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
metadata = { metadata = {
"source": f"https://docs.google.com/document/d/{id}/edit", "source": f"https://docs.google.com/document/d/{id}/edit",
"title": f"{file.get('name')}", "title": f"{file.get('name')}",
"when": f"{file.get('modifiedTime')}",
} }
return Document(page_content=text, metadata=metadata) return Document(page_content=text, metadata=metadata)

View File

@ -1,5 +1,6 @@
from __future__ import annotations from __future__ import annotations
import contextlib
import enum import enum
import logging import logging
import uuid import uuid
@ -8,6 +9,7 @@ from typing import (
Any, Any,
Callable, Callable,
Dict, Dict,
Generator,
Iterable, Iterable,
List, List,
Optional, Optional,
@ -16,6 +18,7 @@ from typing import (
) )
import sqlalchemy import sqlalchemy
from sqlalchemy import delete
from sqlalchemy.dialects.postgresql import UUID from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import Session, declarative_base from sqlalchemy.orm import Session, declarative_base
@ -170,6 +173,33 @@ class PGVector(VectorStore):
session.delete(collection) session.delete(collection)
session.commit() session.commit()
@contextlib.contextmanager
def _make_session(self) -> Generator[Session, None, None]:
    """Yield a SQLAlchemy ``Session`` bound to this store's connection.

    Yields:
        A ``Session`` created from ``self._conn``; it is closed
        automatically when the caller's ``with`` block exits.

    Note:
        The previous implementation yielded a bare ``Session`` that was
        never closed, leaking the session (and any resources it holds)
        on every use. Entering the session as a context manager
        guarantees ``Session.close()`` runs even if the caller raises.
    """
    with Session(self._conn) as session:
        yield session
def delete(
    self,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> None:
    """Remove stored vectors whose custom ids appear in ``ids``.

    Args:
        ids: List of ids to delete. When ``None``, nothing is removed
            (the session is still committed as a no-op).
        **kwargs: Unused; accepted for interface compatibility.
    """
    with Session(self._conn) as session:
        if ids is not None:
            self.logger.debug(
                "Trying to delete vectors by ids (represented by the model "
                "using the custom ids field)"
            )
            # Match rows on the caller-supplied custom id, not the row uuid.
            matches_requested_ids = self.EmbeddingStore.custom_id.in_(ids)
            session.execute(
                delete(self.EmbeddingStore).where(matches_requested_ids)
            )
        session.commit()
def get_collection(self, session: Session) -> Optional["CollectionStore"]: def get_collection(self, session: Session) -> Optional["CollectionStore"]:
return self.CollectionStore.get_by_name(session, self.collection_name) return self.CollectionStore.get_by_name(session, self.collection_name)

View File

@ -186,6 +186,34 @@ def test_pgvector_with_filter_in_set() -> None:
] ]
def test_pgvector_delete_docs() -> None:
    """Insert documents with explicit ids, then delete them in two passes."""
    texts = ["foo", "bar", "baz"]
    docsearch = PGVector.from_texts(
        texts=texts,
        collection_name="test_collection_filter",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=[{"page": str(i)} for i in range(len(texts))],
        ids=["1", "2", "3"],
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )

    def _remaining_custom_ids() -> list:
        with docsearch._make_session() as session:
            rows = list(session.query(docsearch.EmbeddingStore).all())
        # ignoring type error since mypy cannot determine whether
        # the list is sortable
        return sorted(row.custom_id for row in rows)  # type: ignore

    docsearch.delete(["1", "2"])
    assert _remaining_custom_ids() == ["3"]

    # Deleting ids that are absent (or already deleted) must not raise.
    docsearch.delete(["2", "3"])
    assert _remaining_custom_ids() == []
def test_pgvector_relevance_score() -> None: def test_pgvector_relevance_score() -> None:
"""Test to make sure the relevance score is scaled to 0-1.""" """Test to make sure the relevance score is scaled to 0-1."""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]