Add delete for PGVector + add modification time to Confluence and Google Drive docs (#9604)

Description:
- Add an implementation of delete for the PGVector vector store.
- Add the modification time to the document metadata produced by the Confluence and Google Drive loaders.

Issue: https://github.com/langchain-ai/langchain/issues/9312
Tag maintainer: @baskaryan, @eyurtsev, @hwchase17, @rlancemartin
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
Commit adb21782b8 (parent 3e5cda3405)
confluence.py
@@ -338,7 +338,9 @@ class ConfluenceLoader(BaseLoader):
                     ),
                     before_sleep=before_sleep_log(logger, logging.WARNING),
                 )(self.confluence.get_page_by_id)
-                page = get_page(page_id=page_id, expand=content_format.value)
+                page = get_page(
+                    page_id=page_id, expand=f"{content_format.value},version"
+                )
                 if not include_restricted_content and not self.is_public_page(page):
                     continue
                 doc = self.process_page(
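Requesting the extra `version` expansion here is what makes `page["version"]["when"]` available downstream; the `process_page` change in the next hunk copies it into the document metadata.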
@@ -505,13 +507,18 @@ class ConfluenceLoader(BaseLoader):
             ]
         text = text + "".join(comment_texts)
 
+        metadata = {
+            "title": page["title"],
+            "id": page["id"],
+            "source": self.base_url.strip("/") + page["_links"]["webui"],
+        }
+
+        if "version" in page and "when" in page["version"]:
+            metadata["when"] = page["version"]["when"]
+
         return Document(
             page_content=text,
-            metadata={
-                "title": page["title"],
-                "id": page["id"],
-                "source": self.base_url.strip("/") + page["_links"]["webui"],
-            },
+            metadata=metadata,
         )
 
     def process_attachment(
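For context, a minimal usage sketch (the site URL, credentials, and space key below are hypothetical): after this change, each loaded Document carries a `when` timestamp whenever the Confluence API returned version info for the page.

    from langchain.document_loaders import ConfluenceLoader

    # Hypothetical site and credentials, for illustration only.
    loader = ConfluenceLoader(
        url="https://example.atlassian.net/wiki",
        username="me@example.com",
        api_key="<api-key>",
    )
    docs = loader.load(space_key="DOCS", limit=10)
    for doc in docs:
        # "when" is only set when the page payload included version info.
        print(doc.metadata["title"], doc.metadata.get("when"))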
googledrive.py
@@ -200,7 +200,11 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
         creds = self._load_credentials()
         service = build("drive", "v3", credentials=creds)
 
-        file = service.files().get(fileId=id, supportsAllDrives=True).execute()
+        file = (
+            service.files()
+            .get(fileId=id, supportsAllDrives=True, fields="modifiedTime,name")
+            .execute()
+        )
         request = service.files().export_media(fileId=id, mimeType="text/plain")
         fh = BytesIO()
         downloader = MediaIoBaseDownload(fh, request)
@@ -219,6 +223,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
         metadata = {
             "source": f"https://docs.google.com/document/d/{id}/edit",
             "title": f"{file.get('name')}",
+            "when": f"{file.get('modifiedTime')}",
         }
         return Document(page_content=text, metadata=metadata)
 
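A similar sketch for Google Drive (the document id is a placeholder; credentials are resolved through the loader's usual token/credentials files): the file's `modifiedTime` now surfaces as `when` in the metadata.

    from langchain.document_loaders import GoogleDriveLoader

    # Placeholder document id, for illustration only.
    loader = GoogleDriveLoader(document_ids=["<google-doc-id>"])
    docs = loader.load()
    for doc in docs:
        print(doc.metadata["source"], doc.metadata["when"])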
pgvector.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import contextlib
 import enum
 import logging
 import uuid
@@ -8,6 +9,7 @@ from typing import (
     Any,
     Callable,
     Dict,
+    Generator,
     Iterable,
     List,
     Optional,
@@ -16,6 +18,7 @@ from typing import (
 )
 
 import sqlalchemy
+from sqlalchemy import delete
 from sqlalchemy.dialects.postgresql import UUID
 from sqlalchemy.orm import Session, declarative_base
 
@@ -170,6 +173,33 @@ class PGVector(VectorStore):
             session.delete(collection)
             session.commit()
 
+    @contextlib.contextmanager
+    def _make_session(self) -> Generator[Session, None, None]:
+        """Create a context manager for the session, bind to _conn string."""
+        yield Session(self._conn)
+
+    def delete(
+        self,
+        ids: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Delete vectors by ids or uuids.
+
+        Args:
+            ids: List of ids to delete.
+        """
+        with Session(self._conn) as session:
+            if ids is not None:
+                self.logger.debug(
+                    "Trying to delete vectors by ids (represented by the model "
+                    "using the custom ids field)"
+                )
+                stmt = delete(self.EmbeddingStore).where(
+                    self.EmbeddingStore.custom_id.in_(ids)
+                )
+                session.execute(stmt)
+            session.commit()
+
     def get_collection(self, session: Session) -> Optional["CollectionStore"]:
         return self.CollectionStore.get_by_name(session, self.collection_name)
 
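A minimal sketch of the new delete path (the connection string is a placeholder, and FakeEmbeddings stands in for a real embedding model): ids passed to `delete()` are matched against the `custom_id` column, i.e. the ids supplied at insert time.

    from langchain.embeddings import FakeEmbeddings
    from langchain.vectorstores.pgvector import PGVector

    # Placeholder connection string.
    CONNECTION_STRING = "postgresql+psycopg2://user:pass@localhost:5432/db"

    store = PGVector.from_texts(
        texts=["foo", "bar"],
        embedding=FakeEmbeddings(size=1536),
        ids=["1", "2"],  # stored as custom_id and matched by delete()
        collection_name="demo",
        connection_string=CONNECTION_STRING,
    )
    store.delete(["1"])  # deletes "foo"; ids that do not exist are ignored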
test_pgvector.py
@@ -186,6 +186,34 @@ def test_pgvector_with_filter_in_set() -> None:
     ]
 
 
+def test_pgvector_delete_docs() -> None:
+    """Add and delete documents."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": str(i)} for i in range(len(texts))]
+    docsearch = PGVector.from_texts(
+        texts=texts,
+        collection_name="test_collection_filter",
+        embedding=FakeEmbeddingsWithAdaDimension(),
+        metadatas=metadatas,
+        ids=["1", "2", "3"],
+        connection_string=CONNECTION_STRING,
+        pre_delete_collection=True,
+    )
+    docsearch.delete(["1", "2"])
+    with docsearch._make_session() as session:
+        records = list(session.query(docsearch.EmbeddingStore).all())
+        # ignoring type error since mypy cannot determine whether
+        # the list is sortable
+        assert sorted(record.custom_id for record in records) == ["3"]  # type: ignore
+
+    docsearch.delete(["2", "3"])  # Should not raise on missing ids
+    with docsearch._make_session() as session:
+        records = list(session.query(docsearch.EmbeddingStore).all())
+        # ignoring type error since mypy cannot determine whether
+        # the list is sortable
+        assert sorted(record.custom_id for record in records) == []  # type: ignore
+
+
 def test_pgvector_relevance_score() -> None:
     """Test to make sure the relevance score is scaled to 0-1."""
     texts = ["foo", "bar", "baz"]
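Note that the second docsearch.delete(["2", "3"]) call runs after "2" has already been removed, so the test also covers deleting ids that no longer exist: the underlying DELETE ... WHERE custom_id IN (...) simply matches zero rows instead of raising.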