community[patch]: Add semantic info to metadata, classified by pebblo-server. (#20468)
Description: Add support for semantic topics and entities. Classification done by pebblo-server is now used to enhance the metadata of Documents loaded by document loaders.
Dependencies: None
Documentation: Updated.

Signed-off-by: Rahul Tripathi <rauhl.psit.ec@gmail.com>
Co-authored-by: Rahul Tripathi <rauhl.psit.ec@gmail.com>
parent a5028b6356
commit dc921f0823
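In practice the change means a PebbloSafeLoader user can opt in to semantic enrichment and read the classification results straight off each loaded Document. A minimal sketch (loader arguments follow the notebook example below; the two metadata keys are the ones written by the new _add_semantic_to_doc helper):

from langchain.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import PebbloSafeLoader

# Wrap any LangChain loader; load_semantic=True asks pebblo-server for
# semantic topics/entities and copies them into each Document's metadata.
loader = PebbloSafeLoader(
    CSVLoader("data/corp_sens_data.csv"),
    name="acme-corp-rag-1",  # app name (mandatory)
    owner="Joe Smith",  # optional
    load_semantic=True,  # or set the PEBBLO_LOAD_SEMANTIC environment variable
)
documents = loader.load()

# Each document now carries the classifier's labels.
print(documents[0].metadata.get("pebblo_semantic_topics"))
print(documents[0].metadata.get("pebblo_semantic_entities"))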
PebbloSafeLoader documentation notebook:

@@ -69,7 +69,7 @@
    "source": [
     "### Send semantic topics and identities to Pebblo cloud server\n",
     "\n",
-    "To send semantic data to pebblo-cloud, pass api-key to PebbloSafeLoader as an argument or alternatively, put the api-ket in `PEBBLO_API_KEY` environment variable."
+    "To send semantic data to pebblo-cloud, pass api-key to PebbloSafeLoader as an argument or alternatively, put the api-key in `PEBBLO_API_KEY` environment variable."
    ]
   },
   {
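For completeness, a small sketch of the two ways the api-key can reach the loader (the key value is a placeholder); the constructor falls back to the PEBBLO_API_KEY environment variable when the argument is omitted:

import os

from langchain.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import PebbloSafeLoader

# Option 1: pass the key explicitly.
loader = PebbloSafeLoader(
    CSVLoader("data/corp_sens_data.csv"),
    name="acme-corp-rag-1",
    api_key="my-api-key",  # placeholder value
)

# Option 2: rely on the environment variable instead.
os.environ["PEBBLO_API_KEY"] = "my-api-key"  # placeholder value
loader = PebbloSafeLoader(
    CSVLoader("data/corp_sens_data.csv"),
    name="acme-corp-rag-1",
)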
@@ -91,6 +91,41 @@
     "documents = loader.load()\n",
     "print(documents)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Add semantic topics and identities to loaded metadata\n",
+    "\n",
+    "To add semantic topics and semantic entities to the metadata of loaded documents, set load_semantic to True as an argument or, alternatively, define a new environment variable `PEBBLO_LOAD_SEMANTIC` and set it to True."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders.csv_loader import CSVLoader\n",
+    "from langchain_community.document_loaders import PebbloSafeLoader\n",
+    "\n",
+    "loader = PebbloSafeLoader(\n",
+    "    CSVLoader(\"data/corp_sens_data.csv\"),\n",
+    "    name=\"acme-corp-rag-1\", # App name (Mandatory)\n",
+    "    owner=\"Joe Smith\", # Owner (Optional)\n",
+    "    description=\"Support productivity RAG application\", # Description (Optional)\n",
+    "    api_key=\"my-api-key\", # API key (Optional, can be set in the environment variable PEBBLO_API_KEY)\n",
+    "    load_semantic=True, # Load semantic data (Optional, default is False, can be set in the environment variable PEBBLO_LOAD_SEMANTIC)\n",
+    ")\n",
+    "documents = loader.load()\n",
+    "print(documents[0].metadata)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
   }
  ],
  "metadata": {
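The print(documents[0].metadata) call in the new cell should show the two semantic keys next to CSVLoader's usual source/row fields; the shape below is illustrative only, since the actual topic and entity labels come from the pebblo-server classifier:

# Illustrative output shape; real label strings depend on the classifier.
{
    "source": "data/corp_sens_data.csv",
    "row": 0,
    "pebblo_semantic_topics": ["<topic-label>"],
    "pebblo_semantic_entities": ["<entity-label>"],
}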
PebbloSafeLoader document loader (langchain_community.document_loaders):

@@ -5,9 +5,9 @@ import logging
 import os
 import uuid
 from http import HTTPStatus
-from typing import Any, Dict, Iterator, List, Optional
+from typing import Any, Dict, Iterator, List, Optional, Union
 
-import requests
+import requests  # type: ignore
 from langchain_core.documents import Document
 
 from langchain_community.document_loaders.base import BaseLoader
@@ -19,6 +19,7 @@ from langchain_community.utilities.pebblo import (
     PLUGIN_VERSION,
     App,
     Doc,
+    IndexedDocument,
     get_full_path,
     get_loader_full_path,
     get_loader_type,
@@ -43,6 +44,7 @@ class PebbloSafeLoader(BaseLoader):
         owner: str = "",
         description: str = "",
         api_key: Optional[str] = None,
+        load_semantic: bool = False,
     ):
         if not name or not isinstance(name, str):
             raise NameError("Must specify a valid name.")
@@ -50,15 +52,17 @@ class PebbloSafeLoader(BaseLoader):
         self.api_key = os.environ.get("PEBBLO_API_KEY") or api_key
         self.load_id = str(uuid.uuid4())
         self.loader = langchain_loader
+        self.load_semantic = os.environ.get("PEBBLO_LOAD_SEMANTIC") or load_semantic
         self.owner = owner
         self.description = description
         self.source_path = get_loader_full_path(self.loader)
         self.source_owner = PebbloSafeLoader.get_file_owner_from_path(self.source_path)
         self.docs: List[Document] = []
+        self.docs_with_id: Union[List[IndexedDocument], List[Document], List] = []
         loader_name = str(type(self.loader)).split(".")[-1].split("'")[0]
         self.source_type = get_loader_type(loader_name)
         self.source_path_size = self.get_source_size(self.source_path)
-        self.source_aggr_size = 0
+        self.source_aggregate_size = 0
         self.loader_details = {
             "loader": loader_name,
             "source_path": self.source_path,
@@ -80,7 +84,15 @@ class PebbloSafeLoader(BaseLoader):
             list: Documents fetched from load method of the wrapped `loader`.
         """
         self.docs = self.loader.load()
-        self._send_loader_doc(loading_end=True)
+        if not self.load_semantic:
+            self._classify_doc(self.docs, loading_end=True)
+            return self.docs
+        self.docs_with_id = self._index_docs()
+        classified_docs = self._classify_doc(self.docs_with_id, loading_end=True)
+        self.docs_with_id = self._add_semantic_to_docs(
+            self.docs_with_id, classified_docs
+        )
+        self.docs = self._unindex_docs(self.docs_with_id)  # type: ignore
         return self.docs
 
     def lazy_load(self) -> Iterator[Document]:
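The reworked load() reads as a four-step pipeline. A rough, simplified sketch of the equivalent logic (method names as in the diff; error handling omitted):

def load_with_semantics(loader):
    # 1. Fetch documents from the wrapped loader.
    loader.docs = loader.loader.load()
    if not loader.load_semantic:
        # Classification still runs, but document metadata is left untouched.
        loader._classify_doc(loader.docs, loading_end=True)
        return loader.docs
    # 2. Attach a temporary id to every document so classifier results can be matched back.
    loader.docs_with_id = loader._index_docs()
    # 3. Send the indexed documents to pebblo-server for classification.
    classified_docs = loader._classify_doc(loader.docs_with_id, loading_end=True)
    # 4. Copy topics/entities into metadata, then strip the temporary ids.
    loader.docs_with_id = loader._add_semantic_to_docs(loader.docs_with_id, classified_docs)
    loader.docs = loader._unindex_docs(loader.docs_with_id)
    return loader.docs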
@@ -104,13 +116,19 @@ class PebbloSafeLoader(BaseLoader):
                 doc = next(doc_iterator)
             except StopIteration:
                 self.docs = []
-                self._send_loader_doc(loading_end=True)
                 break
-            self.docs = [
-                doc,
-            ]
-            self._send_loader_doc()
-            yield doc
+            self.docs = list((doc,))
+            if not self.load_semantic:
+                self._classify_doc(self.docs, loading_end=True)
+                yield self.docs[0]
+            else:
+                self.docs_with_id = self._index_docs()
+                classified_doc = self._classify_doc(self.docs)
+                self.docs_with_id = self._add_semantic_to_docs(
+                    self.docs_with_id, classified_doc
+                )
+                self.docs = self._unindex_docs(self.docs_with_id)  # type: ignore
+                yield self.docs[0]
 
     @classmethod
     def set_discover_sent(cls) -> None:
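lazy_load() follows the same branching per document; a short usage sketch (same CSV example as the notebook), where each yielded Document already carries the semantic keys when load_semantic is enabled:

from langchain.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import PebbloSafeLoader

loader = PebbloSafeLoader(
    CSVLoader("data/corp_sens_data.csv"),
    name="acme-corp-rag-1",
    load_semantic=True,
)
# Documents are classified and enriched one at a time.
for doc in loader.lazy_load():
    print(doc.metadata.get("pebblo_semantic_topics"))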
@@ -120,16 +138,23 @@ class PebbloSafeLoader(BaseLoader):
     def set_loader_sent(cls) -> None:
         cls._loader_sent = True
 
-    def _send_loader_doc(self, loading_end: bool = False) -> list:
+    def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list:
         """Send documents fetched from loader to pebblo-server. Then send
         classified documents to Daxa cloud(If api_key is present). Internal method.
 
         Args:
+
+            loaded_docs (list): List of documents fetched from loader's load operation.
             loading_end (bool, optional): Flag indicating the halt of data
                                           loading by loader. Defaults to False.
         """
-        headers = {"Accept": "application/json", "Content-Type": "application/json"}
-        doc_content = [doc.dict() for doc in self.docs]
+        headers = {
+            "Accept": "application/json",
+            "Content-Type": "application/json",
+        }
+        if loading_end is True:
+            PebbloSafeLoader.set_loader_sent()
+        doc_content = [doc.dict() for doc in loaded_docs]
         docs = []
         for doc in doc_content:
             doc_authorized_identities = doc.get("metadata", {}).get(
@@ -144,11 +169,13 @@ class PebbloSafeLoader(BaseLoader):
             doc_source_size = self.get_source_size(doc_source_path)
             page_content = str(doc.get("page_content"))
             page_content_size = self.calculate_content_size(page_content)
-            self.source_aggr_size += page_content_size
+            self.source_aggregate_size += page_content_size
+            doc_id = doc.get("id", None) or 0
             docs.append(
                 {
                     "doc": page_content,
                     "source_path": doc_source_path,
+                    "id": doc_id,
                     "last_modified": doc.get("metadata", {}).get("last_modified"),
                     "file_owner": doc_source_owner,
                     **(
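With the new id field, each entry posted to pebblo-server can be joined back to its source Document; an abridged sketch of one entry (values illustrative, and the fields behind the **(...) spread omitted):

{
    "doc": "<page content>",
    "source_path": "data/corp_sens_data.csv",
    "id": "0",  # hex(i)[2:] assigned by _index_docs
    "last_modified": None,
    "file_owner": "<file owner>",
}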
@@ -176,7 +203,9 @@ class PebbloSafeLoader(BaseLoader):
         if loading_end is True:
             payload["loading_end"] = "true"
             if "loader_details" in payload:
-                payload["loader_details"]["source_aggr_size"] = self.source_aggr_size
+                payload["loader_details"]["source_aggregate_size"] = (  # noqa
+                    self.source_aggregate_size
+                )
         payload = Doc(**payload).dict(exclude_unset=True)
         load_doc_url = f"{CLASSIFIER_URL}{LOADER_DOC_URL}"
         classified_docs = []
@@ -202,11 +231,9 @@ class PebbloSafeLoader(BaseLoader):
         except requests.exceptions.RequestException:
             logger.warning("Unable to reach pebblo server.")
         except Exception as e:
-            logger.warning("An Exception caught in _send_loader_doc: %s", e)
-
+            logger.warning("An Exception caught in _send_loader_doc: local %s", e)
         if self.api_key:
             if not classified_docs:
-                logger.warning("No classified docs to send to pebblo-cloud.")
                 return classified_docs
             try:
                 payload["docs"] = classified_docs
@@ -234,7 +261,7 @@ class PebbloSafeLoader(BaseLoader):
         except requests.exceptions.RequestException:
             logger.warning("Unable to reach Pebblo cloud server.")
         except Exception as e:
-            logger.warning("An Exception caught in _send_loader_doc: %s", e)
+            logger.warning("An Exception caught in _send_loader_doc: cloud %s", e)
 
         if loading_end is True:
             PebbloSafeLoader.set_loader_sent()
@@ -270,6 +297,12 @@ class PebbloSafeLoader(BaseLoader):
             pebblo_resp = requests.post(
                 app_discover_url, headers=headers, json=payload, timeout=20
             )
+            if self.api_key:
+                pebblo_cloud_url = f"{PEBBLO_CLOUD_URL}/v1/discover"
+                headers.update({"x-api-key": self.api_key})
+                _ = requests.post(
+                    pebblo_cloud_url, headers=headers, json=payload, timeout=20
+                )
             logger.debug(
                 "send_discover[local]: request url %s, body %s len %s\
                 response status %s body %s",
@@ -287,8 +320,8 @@ class PebbloSafeLoader(BaseLoader):
             )
         except requests.exceptions.RequestException:
             logger.warning("Unable to reach pebblo server.")
-        except Exception:
-            logger.warning("An Exception caught in _send_discover.")
+        except Exception as e:
+            logger.warning("An Exception caught in _send_discover: local %s", e)
 
         if self.api_key:
             try:
@@ -316,7 +349,7 @@ class PebbloSafeLoader(BaseLoader):
         except requests.exceptions.RequestException:
             logger.warning("Unable to reach Pebblo cloud server.")
         except Exception as e:
-            logger.warning("An Exception caught in _send_discover: %s", e)
+            logger.warning("An Exception caught in _send_discover: cloud %s", e)
 
     def _get_app_details(self) -> App:
         """Fetch app details. Internal method.
@@ -378,3 +411,80 @@ class PebbloSafeLoader(BaseLoader):
                         total_size += os.path.getsize(fp)
             size = total_size
         return size
+
+    def _index_docs(self) -> List[IndexedDocument]:
+        """
+        Indexes the documents and returns a list of IndexedDocument objects.
+
+        Returns:
+            List[IndexedDocument]: A list of IndexedDocument objects with unique IDs.
+        """
+        docs_with_id = [
+            IndexedDocument(id=hex(i)[2:], **doc.dict())
+            for i, doc in enumerate(self.docs)
+        ]
+        return docs_with_id
+
+    def _add_semantic_to_docs(
+        self, docs_with_id: List[IndexedDocument], classified_docs: List[dict]
+    ) -> List[Document]:
+        """
+        Adds semantic metadata to the given list of documents.
+
+        Args:
+            docs_with_id (List[IndexedDocument]): A list of IndexedDocument objects
+                containing the documents with their IDs.
+            classified_docs (List[dict]): A list of dictionaries containing the
+                classified documents.
+
+        Returns:
+            List[Document]: A list of Document objects with added semantic metadata.
+        """
+        indexed_docs = {
+            doc.id: Document(page_content=doc.page_content, metadata=doc.metadata)
+            for doc in docs_with_id
+        }
+
+        for classified_doc in classified_docs:
+            doc_id = classified_doc.get("id")
+            if doc_id in indexed_docs:
+                self._add_semantic_to_doc(indexed_docs[doc_id], classified_doc)
+
+        semantic_metadata_docs = [doc for doc in indexed_docs.values()]
+
+        return semantic_metadata_docs
+
+    def _unindex_docs(self, docs_with_id: List[IndexedDocument]) -> List[Document]:
+        """
+        Converts a list of IndexedDocument objects to a list of Document objects.
+
+        Args:
+            docs_with_id (List[IndexedDocument]): A list of IndexedDocument objects.
+
+        Returns:
+            List[Document]: A list of Document objects.
+        """
+        docs = [
+            Document(page_content=doc.page_content, metadata=doc.metadata)
+            for i, doc in enumerate(docs_with_id)
+        ]
+        return docs
+
+    def _add_semantic_to_doc(self, doc: Document, classified_doc: dict) -> Document:
+        """
+        Adds semantic metadata to the given document in-place.
+
+        Args:
+            doc (Document): A Document object.
+            classified_doc (dict): A dictionary containing the classified document.
+
+        Returns:
+            Document: The Document object with added semantic metadata.
+        """
+        doc.metadata["pebblo_semantic_entities"] = list(
+            classified_doc.get("entities", {}).keys()
+        )
+        doc.metadata["pebblo_semantic_topics"] = list(
+            classified_doc.get("topics", {}).keys()
+        )
+        return doc
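Taken together, the new helpers perform an id-based join between the loader's documents and the classifier's response. A small self-contained sketch of that join; the response shape is inferred from the keys() calls above, and the label strings and counts are placeholders:

from langchain_core.documents import Document

# Stand-ins for what _index_docs and pebblo-server would produce.
docs = [Document(page_content="Quarterly report ...", metadata={"source": "x.csv"})]
indexed = {hex(i)[2:]: doc for i, doc in enumerate(docs)}
classified_docs = [
    {"id": "0", "entities": {"<entity-label>": 1}, "topics": {"<topic-label>": 1}},
]

# Mirrors _add_semantic_to_doc: the keys of the classifier maps become metadata lists.
for classified_doc in classified_docs:
    doc = indexed.get(classified_doc["id"])
    if doc is not None:
        doc.metadata["pebblo_semantic_entities"] = list(classified_doc.get("entities", {}).keys())
        doc.metadata["pebblo_semantic_topics"] = list(classified_doc.get("topics", {}).keys())

print(docs[0].metadata)
# {'source': 'x.csv', 'pebblo_semantic_entities': ['<entity-label>'], 'pebblo_semantic_topics': ['<topic-label>']}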
Pebblo utilities (langchain_community.utilities.pebblo):

@@ -6,6 +6,7 @@ import pathlib
 import platform
 from typing import Optional, Tuple
 
+from langchain_core.documents import Document
 from langchain_core.env import get_runtime_environment
 from langchain_core.pydantic_v1 import BaseModel
 
@@ -61,6 +62,10 @@ SUPPORTED_LOADERS = (*file_loader, *dir_loader, *in_memory)
 logger = logging.getLogger(__name__)
 
 
+class IndexedDocument(Document):
+    id: str
+
+
 class Runtime(BaseModel):
     """Pebblo Runtime.
 
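IndexedDocument simply layers a string id on top of Document so classifier responses can be correlated with loaded documents; a quick sketch of the pattern used by _index_docs:

from langchain_core.documents import Document
from langchain_community.utilities.pebblo import IndexedDocument

doc = Document(page_content="hello", metadata={"source": "x.csv"})
indexed = IndexedDocument(id=hex(0)[2:], **doc.dict())  # id "0" for the first document
assert indexed.id == "0"
assert indexed.page_content == doc.page_content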