mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-01 12:38:45 +00:00
Add vector deletion to PGVector and add modification time to Confluence and Google Drive docs (#9604)
Description: - Add an implementation of delete for PGVector. - Add the modification time to the document metadata for Confluence and Google Drive. Issue: https://github.com/langchain-ai/langchain/issues/9312 Tag maintainer: @baskaryan, @eyurtsev, @hwchase17, @rlancemartin. --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
3e5cda3405
commit
adb21782b8
@ -338,7 +338,9 @@ class ConfluenceLoader(BaseLoader):
|
||||
),
|
||||
before_sleep=before_sleep_log(logger, logging.WARNING),
|
||||
)(self.confluence.get_page_by_id)
|
||||
page = get_page(page_id=page_id, expand=content_format.value)
|
||||
page = get_page(
|
||||
page_id=page_id, expand=f"{content_format.value},version"
|
||||
)
|
||||
if not include_restricted_content and not self.is_public_page(page):
|
||||
continue
|
||||
doc = self.process_page(
|
||||
@ -505,13 +507,18 @@ class ConfluenceLoader(BaseLoader):
|
||||
]
|
||||
text = text + "".join(comment_texts)
|
||||
|
||||
metadata = {
|
||||
"title": page["title"],
|
||||
"id": page["id"],
|
||||
"source": self.base_url.strip("/") + page["_links"]["webui"],
|
||||
}
|
||||
|
||||
if "version" in page and "when" in page["version"]:
|
||||
metadata["when"] = page["version"]["when"]
|
||||
|
||||
return Document(
|
||||
page_content=text,
|
||||
metadata={
|
||||
"title": page["title"],
|
||||
"id": page["id"],
|
||||
"source": self.base_url.strip("/") + page["_links"]["webui"],
|
||||
},
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
def process_attachment(
|
||||
|
@ -200,7 +200,11 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
creds = self._load_credentials()
|
||||
service = build("drive", "v3", credentials=creds)
|
||||
|
||||
file = service.files().get(fileId=id, supportsAllDrives=True).execute()
|
||||
file = (
|
||||
service.files()
|
||||
.get(fileId=id, supportsAllDrives=True, fields="modifiedTime,name")
|
||||
.execute()
|
||||
)
|
||||
request = service.files().export_media(fileId=id, mimeType="text/plain")
|
||||
fh = BytesIO()
|
||||
downloader = MediaIoBaseDownload(fh, request)
|
||||
@ -219,6 +223,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
metadata = {
|
||||
"source": f"https://docs.google.com/document/d/{id}/edit",
|
||||
"title": f"{file.get('name')}",
|
||||
"when": f"{file.get('modifiedTime')}",
|
||||
}
|
||||
return Document(page_content=text, metadata=metadata)
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import enum
|
||||
import logging
|
||||
import uuid
|
||||
@ -8,6 +9,7 @@ from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Generator,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
@ -16,6 +18,7 @@ from typing import (
|
||||
)
|
||||
|
||||
import sqlalchemy
|
||||
from sqlalchemy import delete
|
||||
from sqlalchemy.dialects.postgresql import UUID
|
||||
from sqlalchemy.orm import Session, declarative_base
|
||||
|
||||
@ -170,6 +173,33 @@ class PGVector(VectorStore):
|
||||
session.delete(collection)
|
||||
session.commit()
|
||||
|
||||
@contextlib.contextmanager
def _make_session(self) -> Generator[Session, None, None]:
    """Yield a SQLAlchemy session bound to ``self._conn``.

    The session is opened as a context manager so that it is closed when
    the caller's ``with`` block exits, including when that block raises.
    Yielding a bare ``Session(...)`` would leave closing it to the caller.

    Yields:
        A ``Session`` created from ``self._conn``.
    """
    with Session(self._conn) as session:
        yield session
|
||||
|
||||
def delete(
    self,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> None:
    """Delete vectors by ids or uuids.

    Rows of ``self.EmbeddingStore`` whose ``custom_id`` is in ``ids`` are
    removed with a single DELETE statement, which is then committed.

    Args:
        ids: List of ids to delete. When ``None`` or empty there is nothing
            to delete, so the method returns without opening a session.
        **kwargs: Accepted but not used by this method.
    """
    # Guard clause: avoid opening a session and issuing an empty commit
    # when there is nothing to delete.
    if not ids:
        return

    self.logger.debug(
        "Trying to delete vectors by ids (represented by the model "
        "using the custom ids field)"
    )
    # NOTE(review): the statement filters on custom_id only; it is not
    # restricted to this store's collection -- confirm that is intended.
    stmt = delete(self.EmbeddingStore).where(
        self.EmbeddingStore.custom_id.in_(ids)
    )
    with Session(self._conn) as session:
        session.execute(stmt)
        session.commit()
|
||||
|
||||
def get_collection(self, session: Session) -> Optional["CollectionStore"]:
    """Look up this store's collection by name.

    Args:
        session: Session handed through to ``CollectionStore.get_by_name``.

    Returns:
        Whatever ``self.CollectionStore.get_by_name`` returns for
        ``self.collection_name``.
    """
    lookup_by_name = self.CollectionStore.get_by_name
    return lookup_by_name(session, self.collection_name)
|
||||
|
||||
|
@ -186,6 +186,34 @@ def test_pgvector_with_filter_in_set() -> None:
|
||||
]
|
||||
|
||||
|
||||
def test_pgvector_delete_docs() -> None:
    """Add documents, delete some by id, and check which ids remain."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(index)} for index, _ in enumerate(texts)]
    store = PGVector.from_texts(
        texts=texts,
        collection_name="test_collection_filter",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=metadatas,
        ids=["1", "2", "3"],
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )

    def remaining_custom_ids() -> list:
        """Return the sorted custom ids of every stored embedding row."""
        with store._make_session() as session:
            rows = list(session.query(store.EmbeddingStore).all())
            # ignoring type error since mypy cannot determine whether
            # the list is sortable
            return sorted(row.custom_id for row in rows)  # type: ignore

    store.delete(["1", "2"])
    assert remaining_custom_ids() == ["3"]

    store.delete(["2", "3"])  # Should not raise on missing ids
    assert remaining_custom_ids() == []
|
||||
|
||||
|
||||
def test_pgvector_relevance_score() -> None:
|
||||
"""Test to make sure the relevance score is scaled to 0-1."""
|
||||
texts = ["foo", "bar", "baz"]
|
||||
|
Loading…
Reference in New Issue
Block a user