Harrison/unstructured io (#1200)

This commit is contained in:
Harrison Chase 2023-02-20 22:54:49 -08:00 committed by GitHub
parent d90a287d8f
commit 5bdb8dd6fe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 51 additions and 10 deletions

View File

@ -225,7 +225,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 5,
"id": "562769c6", "id": "562769c6",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],

View File

@ -28,13 +28,17 @@ from langchain.document_loaders.s3_file import S3FileLoader
from langchain.document_loaders.srt import SRTLoader from langchain.document_loaders.srt import SRTLoader
from langchain.document_loaders.telegram import TelegramChatLoader from langchain.document_loaders.telegram import TelegramChatLoader
from langchain.document_loaders.text import TextLoader from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader from langchain.document_loaders.unstructured import (
UnstructuredFileIOLoader,
UnstructuredFileLoader,
)
from langchain.document_loaders.url import UnstructuredURLLoader from langchain.document_loaders.url import UnstructuredURLLoader
from langchain.document_loaders.web_base import WebBaseLoader from langchain.document_loaders.web_base import WebBaseLoader
from langchain.document_loaders.youtube import YoutubeLoader from langchain.document_loaders.youtube import YoutubeLoader
__all__ = [ __all__ = [
"UnstructuredFileLoader", "UnstructuredFileLoader",
"UnstructuredFileIOLoader",
"UnstructuredURLLoader", "UnstructuredURLLoader",
"DirectoryLoader", "DirectoryLoader",
"NotionDirectoryLoader", "NotionDirectoryLoader",

View File

@ -1,14 +1,15 @@
"""Loader that uses unstructured to load files.""" """Loader that uses unstructured to load files."""
from typing import List from abc import ABC, abstractmethod
from typing import IO, List
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
class UnstructuredFileLoader(BaseLoader): class UnstructuredBaseLoader(BaseLoader, ABC):
"""Loader that uses unstructured to load files.""" """Loader that uses unstructured to load files."""
def __init__(self, file_path: str, mode: str = "single"): def __init__(self, mode: str = "single"):
"""Initialize with file path.""" """Initialize with file path."""
try: try:
import unstructured # noqa:F401 import unstructured # noqa:F401
@ -22,13 +23,15 @@ class UnstructuredFileLoader(BaseLoader):
raise ValueError( raise ValueError(
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`" f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
) )
self.file_path = file_path
self.mode = mode self.mode = mode
@abstractmethod
def _get_elements(self) -> List: def _get_elements(self) -> List:
from unstructured.partition.auto import partition """Get elements."""
return partition(filename=self.file_path) @abstractmethod
def _get_metadata(self) -> dict:
"""Get metadata."""
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load file.""" """Load file."""
@ -36,7 +39,7 @@ class UnstructuredFileLoader(BaseLoader):
if self.mode == "elements": if self.mode == "elements":
docs: List[Document] = list() docs: List[Document] = list()
for element in elements: for element in elements:
metadata = {"source": self.file_path} metadata = self._get_metadata()
# NOTE(MthwRobinson) - the attribute check is for backward compatibility # NOTE(MthwRobinson) - the attribute check is for backward compatibility
# with unstructured<0.4.9. The metadata attributed was added in 0.4.9. # with unstructured<0.4.9. The metadata attributed was added in 0.4.9.
if hasattr(element, "metadata"): if hasattr(element, "metadata"):
@ -45,9 +48,43 @@ class UnstructuredFileLoader(BaseLoader):
metadata["category"] = element.category metadata["category"] = element.category
docs.append(Document(page_content=str(element), metadata=metadata)) docs.append(Document(page_content=str(element), metadata=metadata))
elif self.mode == "single": elif self.mode == "single":
metadata = {"source": self.file_path} metadata = self._get_metadata()
text = "\n\n".join([str(el) for el in elements]) text = "\n\n".join([str(el) for el in elements])
docs = [Document(page_content=text, metadata=metadata)] docs = [Document(page_content=text, metadata=metadata)]
else: else:
raise ValueError(f"mode of {self.mode} not supported.") raise ValueError(f"mode of {self.mode} not supported.")
return docs return docs
class UnstructuredFileLoader(UnstructuredBaseLoader):
    """Unstructured-backed loader for a file identified by its path."""

    def __init__(self, file_path: str, mode: str = "single"):
        """Record ``file_path``, then pass ``mode`` to the base initializer."""
        # The path is stored before delegating to the base class.
        self.file_path = file_path
        super().__init__(mode=mode)

    def _get_elements(self) -> List:
        """Partition the file at ``self.file_path`` and return the elements."""
        # Imported at call time rather than at module import.
        from unstructured.partition.auto import partition

        elements = partition(filename=self.file_path)
        return elements

    def _get_metadata(self) -> dict:
        """Return metadata holding the file path under the ``source`` key."""
        return dict(source=self.file_path)
class UnstructuredFileIOLoader(UnstructuredBaseLoader):
    """Unstructured-backed loader for a file-like (IO) object."""

    def __init__(self, file: IO, mode: str = "single"):
        """Record the file object, then pass ``mode`` to the base initializer."""
        # The file object is stored before delegating to the base class.
        self.file = file
        super().__init__(mode=mode)

    def _get_elements(self) -> List:
        """Partition ``self.file`` and return the resulting elements."""
        # Imported at call time rather than at module import.
        from unstructured.partition.auto import partition

        elements = partition(file=self.file)
        return elements

    def _get_metadata(self) -> dict:
        """Return an empty metadata dict; no source entry is recorded here."""
        return dict()