diff --git a/libs/langchain/langchain/document_loaders/confluence.py b/libs/langchain/langchain/document_loaders/confluence.py
index 739f52f48ed..5a12e8984f5 100644
--- a/libs/langchain/langchain/document_loaders/confluence.py
+++ b/libs/langchain/langchain/document_loaders/confluence.py
@@ -338,7 +338,9 @@ class ConfluenceLoader(BaseLoader):
                 ),
                 before_sleep=before_sleep_log(logger, logging.WARNING),
             )(self.confluence.get_page_by_id)
-            page = get_page(page_id=page_id, expand=content_format.value)
+            page = get_page(
+                page_id=page_id, expand=f"{content_format.value},version"
+            )
             if not include_restricted_content and not self.is_public_page(page):
                 continue
             doc = self.process_page(
@@ -505,13 +507,18 @@ class ConfluenceLoader(BaseLoader):
             ]
             text = text + "".join(comment_texts)
 
+        metadata = {
+            "title": page["title"],
+            "id": page["id"],
+            "source": self.base_url.strip("/") + page["_links"]["webui"],
+        }
+
+        if "version" in page and "when" in page["version"]:
+            metadata["when"] = page["version"]["when"]
+
         return Document(
             page_content=text,
-            metadata={
-                "title": page["title"],
-                "id": page["id"],
-                "source": self.base_url.strip("/") + page["_links"]["webui"],
-            },
+            metadata=metadata,
         )
 
     def process_attachment(
diff --git a/libs/langchain/langchain/document_loaders/googledrive.py b/libs/langchain/langchain/document_loaders/googledrive.py
index 881bafbd5ae..513f9bba773 100644
--- a/libs/langchain/langchain/document_loaders/googledrive.py
+++ b/libs/langchain/langchain/document_loaders/googledrive.py
@@ -200,7 +200,11 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
         creds = self._load_credentials()
         service = build("drive", "v3", credentials=creds)
 
-        file = service.files().get(fileId=id, supportsAllDrives=True).execute()
+        file = (
+            service.files()
+            .get(fileId=id, supportsAllDrives=True, fields="modifiedTime,name")
+            .execute()
+        )
         request = service.files().export_media(fileId=id, mimeType="text/plain")
         fh = BytesIO()
         downloader = MediaIoBaseDownload(fh, request)
@@ -219,6 +223,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
         metadata = {
             "source": f"https://docs.google.com/document/d/{id}/edit",
             "title": f"{file.get('name')}",
+            "when": f"{file.get('modifiedTime')}",
         }
         return Document(page_content=text, metadata=metadata)
 
diff --git a/libs/langchain/langchain/vectorstores/pgvector.py b/libs/langchain/langchain/vectorstores/pgvector.py
index a86a88cb108..6b02fc19c0a 100644
--- a/libs/langchain/langchain/vectorstores/pgvector.py
+++ b/libs/langchain/langchain/vectorstores/pgvector.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import contextlib
 import enum
 import logging
 import uuid
@@ -8,6 +9,7 @@ from typing import (
     Any,
     Callable,
     Dict,
+    Generator,
     Iterable,
     List,
     Optional,
@@ -16,6 +18,7 @@ from typing import (
 )
 
 import sqlalchemy
+from sqlalchemy import delete
 from sqlalchemy.dialects.postgresql import UUID
 from sqlalchemy.orm import Session, declarative_base
 
@@ -170,6 +173,34 @@ class PGVector(VectorStore):
             session.delete(collection)
             session.commit()
 
+    @contextlib.contextmanager
+    def _make_session(self) -> Generator[Session, None, None]:
+        """Create a session context manager bound to the store's connection."""
+        with Session(self._conn) as session:
+            yield session
+
+    def delete(
+        self,
+        ids: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Delete vectors by ids or uuids.
+
+        Args:
+            ids: List of ids to delete.
+        """
+        with Session(self._conn) as session:
+            if ids is not None:
+                self.logger.debug(
+                    "Trying to delete vectors by ids (represented by the model "
+                    "using the custom ids field)"
+                )
+                stmt = delete(self.EmbeddingStore).where(
+                    self.EmbeddingStore.custom_id.in_(ids)
+                )
+                session.execute(stmt)
+            session.commit()
+
     def get_collection(self, session: Session) -> Optional["CollectionStore"]:
         return self.CollectionStore.get_by_name(session, self.collection_name)
 
diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_pgvector.py b/libs/langchain/tests/integration_tests/vectorstores/test_pgvector.py
index 46c8f11e19a..6d6028497cd 100644
--- a/libs/langchain/tests/integration_tests/vectorstores/test_pgvector.py
+++ b/libs/langchain/tests/integration_tests/vectorstores/test_pgvector.py
@@ -186,6 +186,34 @@ def test_pgvector_with_filter_in_set() -> None:
     ]
 
 
+def test_pgvector_delete_docs() -> None:
+    """Add and delete documents."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": str(i)} for i in range(len(texts))]
+    docsearch = PGVector.from_texts(
+        texts=texts,
+        collection_name="test_collection_filter",
+        embedding=FakeEmbeddingsWithAdaDimension(),
+        metadatas=metadatas,
+        ids=["1", "2", "3"],
+        connection_string=CONNECTION_STRING,
+        pre_delete_collection=True,
+    )
+    docsearch.delete(["1", "2"])
+    with docsearch._make_session() as session:
+        records = list(session.query(docsearch.EmbeddingStore).all())
+        # ignoring type error since mypy cannot determine whether
+        # the list is sortable
+        assert sorted(record.custom_id for record in records) == ["3"]  # type: ignore
+
+    docsearch.delete(["2", "3"])  # Should not raise on missing ids
+    with docsearch._make_session() as session:
+        records = list(session.query(docsearch.EmbeddingStore).all())
+        # ignoring type error since mypy cannot determine whether
+        # the list is sortable
+        assert sorted(record.custom_id for record in records) == []  # type: ignore
+
+
 def test_pgvector_relevance_score() -> None:
     """Test to make sure the relevance score is scaled to 0-1."""
     texts = ["foo", "bar", "baz"]
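
A note on the confluence.py change: expanding `version` alongside the content body makes the Confluence API return the page's `version.when` timestamp, which `process_page` now copies into the document metadata whenever it is present. A minimal sketch of how the new field surfaces to callers; the URL, credentials, and space key below are placeholders:

from langchain.document_loaders import ConfluenceLoader

# Placeholder instance and credentials; any Confluence site works here.
loader = ConfluenceLoader(
    url="https://example.atlassian.net/wiki",
    username="me@example.com",
    api_key="<api-token>",
)
docs = loader.load(space_key="SPACE", limit=5)
for doc in docs:
    # "when" is only set if the API response included version info.
    print(doc.metadata.get("when"), doc.metadata["title"])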
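
Likewise for googledrive.py: the explicit `fields="modifiedTime,name"` narrows the metadata lookup to the two fields the loader actually reads, and the file's modification timestamp now lands in `metadata["when"]`. A sketch under the assumption that Google credentials are already configured for the loader; the document id is a placeholder:

from langchain.document_loaders import GoogleDriveLoader

loader = GoogleDriveLoader(document_ids=["<google-doc-id>"])
docs = loader.load()
# Drive returns an RFC 3339 timestamp, e.g. "2023-07-01T12:34:56.000Z".
print(docs[0].metadata["when"])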
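
And for the new `PGVector.delete`, a minimal usage sketch; the connection string is a placeholder and `FakeEmbeddings` stands in for a real embedding model:

from langchain.embeddings import FakeEmbeddings
from langchain.vectorstores import PGVector

store = PGVector.from_texts(
    texts=["alpha", "beta"],
    embedding=FakeEmbeddings(size=1536),
    ids=["a", "b"],
    collection_name="demo",
    connection_string="postgresql+psycopg2://user:pass@localhost:5432/db",
)
store.delete(ids=["a"])        # removes the row whose custom_id is "a"
store.delete(ids=["missing"])  # unknown ids are ignored; no error is raised

Note that deletion matches on the user-supplied ids (`EmbeddingStore.custom_id`), not on the internal row UUIDs, and calling `delete(ids=None)` is a no-op rather than a delete-all.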