Mirror of https://github.com/hwchase17/langchain.git, synced 2025-06-25 08:03:39 +00:00
community[minor]: Added propagation of document metadata from O365BaseLoader (#20663)
**Description:**
- Added propagation of document metadata from O365BaseLoader to FileSystemBlobLoader (O365BaseLoader uses FileSystemBlobLoader under the hood).
- This is done by passing the dictionary `metadata_dict`: key = filename, value = dictionary containing the document's metadata.
- Modified `FileSystemBlobLoader` to accept `metadata_dict`, use the `mime_type` from it (if available), and pass the metadata further into the blob loader.

**Issue:**
- `O365BaseLoader` downloads documents to a temp folder and then runs `FileSystemBlobLoader` on it. However, metadata about the document in question is lost in this process. In particular:
  - `mime_type`: `FileSystemBlobLoader` guesses the `mime_type` from the file extension, but that does not work 100% of the time.
  - `web_url`: this is useful to keep around, since in a RAG application we might want to provide a link to the source document. In order to work well with document parsers, we pass the `web_url` as `source` (`web_url` is ignored by parsers; `source` is preserved).

**Dependencies:** None

**Twitter handle:** @martintriska1

Please review @baskaryan

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
parent: e5541d1da7
commit: 2df8ac402a
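The mechanism in a nutshell: while downloading each file, the loader records that file's O365 metadata in a dict keyed by filename; when the filesystem blob loader later re-reads the temp directory, each blob is matched back to its metadata by name. A minimal, self-contained sketch of that pattern (all names here are illustrative stand-ins, not code from this PR):

```python
# Illustrative sketch only: FakeBlob stands in for the Blob objects that
# FileSystemBlobLoader yields; the real logic lives in O365BaseLoader.
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, Iterator


@dataclass
class FakeBlob:
    path: Path
    metadata: Dict[str, Any] = field(default_factory=dict)


def yield_blobs_with_metadata(
    temp_dir: Path, metadata_dict: Dict[str, Dict[str, Any]]
) -> Iterator[FakeBlob]:
    """Yield one blob per downloaded file, enriched with its O365 metadata."""
    # metadata_dict is keyed by the bare filename (file.name in the O365 API),
    # so the lookup key must be the filename as well, not the full temp path.
    for path in sorted(temp_dir.iterdir()):
        blob = FakeBlob(path=path)
        blob.metadata.update(metadata_dict.get(path.name, {}))
        yield blob
```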
libs/community/langchain_community/document_loaders/base_o365.py:

```diff
@@ -1,4 +1,5 @@
 """Base class for all loaders that uses O365 Package"""
+
 from __future__ import annotations
 
 import logging
@@ -6,8 +7,8 @@ import os
 import tempfile
 from abc import abstractmethod
 from enum import Enum
-from pathlib import Path
-from typing import TYPE_CHECKING, Dict, Iterable, List, Sequence, Union
+from pathlib import Path, PurePath
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Sequence, Union
 
 from langchain_core.pydantic_v1 import (
     BaseModel,
@@ -108,14 +109,31 @@ class O365BaseLoader(BaseLoader, BaseModel):
         """
         file_mime_types = self._fetch_mime_types
         items = folder.get_items()
+        metadata_dict: Dict[str, Dict[str, Any]] = {}
         with tempfile.TemporaryDirectory() as temp_dir:
             os.makedirs(os.path.dirname(temp_dir), exist_ok=True)
             for file in items:
                 if file.is_file:
                     if file.mime_type in list(file_mime_types.values()):
                         file.download(to_path=temp_dir, chunk_size=self.chunk_size)
+                        metadata_dict[file.name] = {
+                            "source": file.web_url,
+                            "mime_type": file.mime_type,
+                            "created": file.created,
+                            "modified": file.modified,
+                            "created_by": str(file.created_by),
+                            "modified_by": str(file.modified_by),
+                            "description": file.description,
+                        }
 
             loader = FileSystemBlobLoader(path=temp_dir)
-            yield from loader.yield_blobs()
+            for blob in loader.yield_blobs():
+                if not isinstance(blob.path, PurePath):
+                    raise NotImplementedError("Expected blob path to be a PurePath")
+                if blob.path:
+                    file_metadata_ = metadata_dict.get(str(blob.path), {})
+                    blob.metadata.update(file_metadata_)
+                yield blob
         if self.recursive:
             for subfolder in folder.get_child_folders():
                 yield from self._load_from_folder(subfolder)
```
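One thing to watch in the hunk above: `metadata_dict` is keyed by `file.name` (the bare filename), while the lookup uses `str(blob.path)`, which for blobs produced by `FileSystemBlobLoader` is normally the full path inside the temp directory. If the two diverge, the `get` silently falls back to `{}` and no metadata is attached. A filename-based lookup keeps both sides aligned; this is an observation about the diff, not code from the commit:

```python
# Hypothetical variant of the lookup above: key by filename on both sides.
from pathlib import PurePath
from typing import Any, Dict


def lookup_file_metadata(
    blob_path: PurePath, metadata_dict: Dict[str, Dict[str, Any]]
) -> Dict[str, Any]:
    # PurePath.name drops the temp-directory prefix, matching file.name keys.
    return metadata_dict.get(blob_path.name, {})
```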
libs/community/langchain_community/document_loaders/sharepoint.py:

```diff
@@ -1,4 +1,5 @@
 """Loader that loads data from Sharepoint Document Library"""
+
 from __future__ import annotations
 
 import json
@@ -82,7 +83,9 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
         if not isinstance(target_folder, Folder):
             raise ValueError("Unable to fetch root folder")
         for blob in self._load_from_folder(target_folder):
-            yield from blob_parser.lazy_parse(blob)
+            for blob_part in blob_parser.lazy_parse(blob):
+                blob_part.metadata.update(blob.metadata)
+                yield blob_part
 
     def authorized_identities(self) -> List:
         data = self._fetch_access_token()
```
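For context, a usage sketch showing where the propagated metadata surfaces after this change (the library ID is a placeholder, and credentials are assumed to be configured via the `O365_CLIENT_ID` / `O365_CLIENT_SECRET` environment variables as in the loader's docs):

```python
# Placeholder IDs; assumes O365 credentials are already configured.
from langchain_community.document_loaders.sharepoint import SharePointLoader

loader = SharePointLoader(document_library_id="YOUR_LIBRARY_ID")

for doc in loader.lazy_load():
    # Each parsed Document now carries the O365 file metadata; in particular,
    # doc.metadata["source"] holds the file's web_url for RAG citations.
    print(doc.metadata.get("source"), doc.metadata.get("mime_type"))
```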