community: better support of pathlib paths in document loaders (#18396)

This arose from the problem described in
https://github.com/langchain-ai/langchain/pull/18397: document
loaders not supporting `pathlib.Path`.

This pull request provides more uniform support for Path as an argument.
The core ideas for this upgrade: 
- if there is a local file path used as an argument, it should be
supported as `pathlib.Path`
- if there are some external calls that may or may not support `pathlib.Path`,
the argument is immediately converted to `str`
- if `self.file_path` is used in a way that allows it to
stay a `pathlib.Path` without conversion, it is only converted for the metadata.

Twitter handle: https://twitter.com/mwmajewsk
This commit is contained in:
mwmajewsk
2024-03-26 16:51:52 +01:00
committed by GitHub
parent 94b869a974
commit f7a1fd91b8
32 changed files with 147 additions and 80 deletions

View File

@@ -1,6 +1,7 @@
"""Loader that uses unstructured to load files."""
import collections
from abc import ABC, abstractmethod
from pathlib import Path
from typing import IO, Any, Callable, Dict, Iterator, List, Optional, Sequence, Union
from langchain_core.documents import Document
@@ -155,7 +156,7 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
def __init__(
self,
file_path: Union[str, List[str]],
file_path: Union[str, List[str], Path, List[Path]],
mode: str = "single",
**unstructured_kwargs: Any,
):
@@ -169,9 +170,13 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
if isinstance(self.file_path, list):
elements = []
for file in self.file_path:
if isinstance(file, Path):
file = str(file)
elements.extend(partition(filename=file, **self.unstructured_kwargs))
return elements
else:
if isinstance(self.file_path, Path):
self.file_path = str(self.file_path)
return partition(filename=self.file_path, **self.unstructured_kwargs)
def _get_metadata(self) -> dict:
@@ -179,14 +184,16 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
def get_elements_from_api(
file_path: Union[str, List[str], None] = None,
file_path: Union[str, List[str], Path, List[Path], None] = None,
file: Union[IO, Sequence[IO], None] = None,
api_url: str = "https://api.unstructured.io/general/v0/general",
api_key: str = "",
**unstructured_kwargs: Any,
) -> List:
"""Retrieve a list of elements from the `Unstructured API`."""
if isinstance(file, collections.abc.Sequence) or isinstance(file_path, list):
if is_list := isinstance(file_path, list):
file_path = [str(path) for path in file_path]
if isinstance(file, collections.abc.Sequence) or is_list:
from unstructured.partition.api import partition_multiple_via_api
_doc_elements = partition_multiple_via_api(
@@ -206,7 +213,7 @@ def get_elements_from_api(
from unstructured.partition.api import partition_via_api
return partition_via_api(
filename=file_path,
filename=str(file_path),
file=file,
api_key=api_key,
api_url=api_url,