community: better support of pathlib paths in document loaders (#18396)

This arose from the problem raised in
https://github.com/langchain-ai/langchain/pull/18397: document
loaders not supporting `pathlib.Path`.

This pull request provides more uniform support for Path as an argument.
The core ideas for this upgrade: 
- if there is a local file path used as an argument, it should be
supported as `pathlib.Path`
- if there are some external calls that may or may not support Pathlib,
the argument is immediately converted to `str`
- if `self.file_path` is used in a way that allows it to
stay a `pathlib.Path` without conversion, it is only converted for the metadata.

Twitter handle: https://twitter.com/mwmajewsk
This commit is contained in:
mwmajewsk 2024-03-26 16:51:52 +01:00 committed by GitHub
parent 94b869a974
commit f7a1fd91b8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
32 changed files with 147 additions and 80 deletions

View File

@ -1,6 +1,6 @@
import re
from pathlib import Path
from typing import Iterator
from typing import Iterator, Union
from langchain_core.documents import Document
@ -14,7 +14,10 @@ class AcreomLoader(BaseLoader):
"""Regex to match front matter metadata in markdown files."""
def __init__(
self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
self,
path: Union[str, Path],
encoding: str = "UTF-8",
collect_metadata: bool = True,
):
"""Initialize the loader."""
self.file_path = path

View File

@ -1,5 +1,6 @@
import json
from typing import List
from pathlib import Path
from typing import List, Union
from langchain_core.documents import Document
from langchain_core.utils import stringify_dict
@ -10,7 +11,7 @@ from langchain_community.document_loaders.base import BaseLoader
class AirbyteJSONLoader(BaseLoader):
"""Load local `Airbyte` json files."""
def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
self.file_path = file_path
"""Path to the directory containing the json files."""
@ -20,5 +21,5 @@ class AirbyteJSONLoader(BaseLoader):
for line in open(self.file_path, "r"):
data = json.loads(line)["_airbyte_data"]
text += stringify_dict(data)
metadata = {"source": self.file_path}
metadata = {"source": str(self.file_path)}
return [Document(page_content=text, metadata=metadata)]

View File

@ -1,7 +1,8 @@
from __future__ import annotations
from enum import Enum
from typing import TYPE_CHECKING, Iterator, Optional
from pathlib import Path
from typing import TYPE_CHECKING, Iterator, Optional, Union
import requests
from langchain_core.documents import Document
@ -44,7 +45,7 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader):
def __init__(
self,
file_path: str,
file_path: Union[str, Path],
*,
transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
config: Optional[assemblyai.TranscriptionConfig] = None,
@ -71,7 +72,7 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader):
if api_key is not None:
assemblyai.settings.api_key = api_key
self.file_path = file_path
self.file_path = str(file_path)
self.transcript_format = transcript_format
self.transcriber = assemblyai.Transcriber(config=config)

View File

@ -1,5 +1,6 @@
import csv
from typing import List
from pathlib import Path
from typing import List, Union
from langchain_core.documents import Document
@ -9,7 +10,7 @@ from langchain_community.document_loaders.base import BaseLoader
class CoNLLULoader(BaseLoader):
"""Load `CoNLL-U` files."""
def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with a file path."""
self.file_path = file_path
@ -29,5 +30,5 @@ class CoNLLULoader(BaseLoader):
else:
text += line[1] + " "
metadata = {"source": self.file_path}
metadata = {"source": str(self.file_path)}
return [Document(page_content=text, metadata=metadata)]

View File

@ -1,6 +1,7 @@
import csv
from io import TextIOWrapper
from typing import Any, Dict, Iterator, List, Optional, Sequence
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union
from langchain_core.documents import Document
@ -35,7 +36,7 @@ class CSVLoader(BaseLoader):
def __init__(
self,
file_path: str,
file_path: Union[str, Path],
source_column: Optional[str] = None,
metadata_columns: Sequence[str] = (),
csv_args: Optional[Dict] = None,
@ -89,7 +90,7 @@ class CSVLoader(BaseLoader):
source = (
row[self.source_column]
if self.source_column is not None
else self.file_path
else str(self.file_path)
)
except KeyError:
raise ValueError(

View File

@ -1,5 +1,6 @@
import os
from typing import Any, Iterator, List
from pathlib import Path
from typing import Any, Iterator, List, Union
from langchain_core.documents import Document
@ -41,7 +42,10 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
process_attachments = unstructured_kwargs.get("process_attachments")
attachment_partitioner = unstructured_kwargs.get("attachment_partitioner")
@ -79,17 +83,17 @@ class OutlookMessageLoader(BaseLoader):
https://github.com/TeamMsgExtractor/msg-extractor
"""
def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with a file path.
Args:
file_path: The path to the Outlook Message file.
"""
self.file_path = file_path
self.file_path = str(file_path)
if not os.path.isfile(self.file_path):
raise ValueError("File path %s is not a valid file" % self.file_path)
raise ValueError(f"File path {self.file_path} is not a valid file")
try:
import extract_msg # noqa:F401

View File

@ -5,8 +5,9 @@ https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
import hashlib
import logging
from base64 import b64decode
from pathlib import Path
from time import strptime
from typing import Any, Dict, Iterator, List, Optional
from typing import Any, Dict, Iterator, List, Optional, Union
from langchain_core.documents import Document
@ -35,9 +36,9 @@ class EverNoteLoader(BaseLoader):
the 'source' which contains the file name of the export.
""" # noqa: E501
def __init__(self, file_path: str, load_single_document: bool = True):
def __init__(self, file_path: Union[str, Path], load_single_document: bool = True):
"""Initialize with file path."""
self.file_path = file_path
self.file_path = str(file_path)
self.load_single_document = load_single_document
def _lazy_load(self) -> Iterator[Document]:

View File

@ -1,5 +1,6 @@
"""Loads Microsoft Excel files."""
from typing import Any, List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
@ -27,7 +28,10 @@ class UnstructuredExcelLoader(UnstructuredFileLoader):
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""

View File

@ -1,7 +1,7 @@
import datetime
import json
from pathlib import Path
from typing import Iterator
from typing import Iterator, Union
from langchain_core.documents import Document
@ -25,7 +25,7 @@ def concatenate_rows(row: dict) -> str:
class FacebookChatLoader(BaseLoader):
"""Load `Facebook Chat` messages directory dump."""
def __init__(self, path: str):
def __init__(self, path: Union[str, Path]):
"""Initialize with a path."""
self.file_path = path

View File

@ -1,7 +1,8 @@
"""Document loader helpers."""
import concurrent.futures
from typing import List, NamedTuple, Optional, cast
from pathlib import Path
from typing import List, NamedTuple, Optional, Union, cast
class FileEncoding(NamedTuple):
@ -15,7 +16,9 @@ class FileEncoding(NamedTuple):
"""The language of the file."""
def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
def detect_file_encodings(
file_path: Union[str, Path], timeout: int = 5
) -> List[FileEncoding]:
"""Try to detect the file encoding.
Returns a list of `FileEncoding` tuples with the detected encodings ordered
@ -27,6 +30,8 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding
"""
import chardet
file_path = str(file_path)
def read_and_detect(file_path: str) -> List[dict]:
with open(file_path, "rb") as f:
rawdata = f.read()

View File

@ -1,4 +1,5 @@
import logging
from pathlib import Path
from typing import Dict, Iterator, Union
from langchain_core.documents import Document
@ -13,7 +14,7 @@ class BSHTMLLoader(BaseLoader):
def __init__(
self,
file_path: str,
file_path: Union[str, Path],
open_encoding: Union[str, None] = None,
bs_kwargs: Union[dict, None] = None,
get_text_separator: str = "",
@ -57,7 +58,7 @@ class BSHTMLLoader(BaseLoader):
title = ""
metadata: Dict[str, Union[str, None]] = {
"source": self.file_path,
"source": str(self.file_path),
"title": title,
}
yield Document(page_content=text, metadata=metadata)

View File

@ -1,4 +1,5 @@
from io import BytesIO
from pathlib import Path
from typing import Any, List, Tuple, Union
import requests
@ -17,7 +18,7 @@ class ImageCaptionLoader(BaseLoader):
def __init__(
self,
images: Union[str, bytes, List[Union[str, bytes]]],
images: Union[str, Path, bytes, List[Union[str, bytes, Path]]],
blip_processor: str = "Salesforce/blip-image-captioning-base",
blip_model: str = "Salesforce/blip-image-captioning-base",
):
@ -29,7 +30,7 @@ class ImageCaptionLoader(BaseLoader):
blip_processor: The name of the pre-trained BLIP processor.
blip_model: The name of the pre-trained BLIP model.
"""
if isinstance(images, (str, bytes)):
if isinstance(images, (str, Path, bytes)):
self.images = [images]
else:
self.images = images
@ -61,7 +62,7 @@ class ImageCaptionLoader(BaseLoader):
return results
def _get_captions_and_metadata(
self, model: Any, processor: Any, image: Union[str, bytes]
self, model: Any, processor: Any, image: Union[str, Path, bytes]
) -> Tuple[str, dict]:
"""Helper function for getting the captions and metadata of an image."""
try:
@ -76,7 +77,9 @@ class ImageCaptionLoader(BaseLoader):
try:
if isinstance(image, bytes):
image = Image.open(BytesIO(image)).convert("RGB")
elif image.startswith("http://") or image.startswith("https://"):
elif isinstance(image, str) and (
image.startswith("http://") or image.startswith("https://")
):
image = Image.open(requests.get(image, stream=True).raw).convert("RGB")
else:
image = Image.open(image).convert("RGB")
@ -94,6 +97,6 @@ class ImageCaptionLoader(BaseLoader):
if isinstance(image_source, bytes):
metadata: dict = {"image_source": "Image bytes provided"}
else:
metadata = {"image_path": image_source}
metadata = {"image_path": str(image_source)}
return caption, metadata

View File

@ -1,5 +1,6 @@
import email
import logging
from pathlib import Path
from typing import Dict, Iterator, Union
from langchain_core.documents import Document
@ -14,7 +15,7 @@ class MHTMLLoader(BaseLoader):
def __init__(
self,
file_path: str,
file_path: Union[str, Path],
open_encoding: Union[str, None] = None,
bs_kwargs: Union[dict, None] = None,
get_text_separator: str = "",
@ -69,7 +70,7 @@ class MHTMLLoader(BaseLoader):
title = ""
metadata: Dict[str, Union[str, None]] = {
"source": self.file_path,
"source": str(self.file_path),
"title": title,
}
yield Document(page_content=text, metadata=metadata)

View File

@ -1,7 +1,7 @@
"""Loads .ipynb notebook files."""
import json
from pathlib import Path
from typing import Any, List
from typing import Any, List, Union
from langchain_core.documents import Document
@ -75,7 +75,7 @@ class NotebookLoader(BaseLoader):
def __init__(
self,
path: str,
path: Union[str, Path],
include_outputs: bool = False,
max_output_length: int = 10,
remove_newline: bool = False,

View File

@ -1,5 +1,5 @@
from pathlib import Path
from typing import List
from typing import List, Union
from langchain_core.documents import Document
@ -9,7 +9,7 @@ from langchain_community.document_loaders.base import BaseLoader
class NotionDirectoryLoader(BaseLoader):
"""Load `Notion directory` dump."""
def __init__(self, path: str, *, encoding: str = "utf-8") -> None:
def __init__(self, path: Union[str, Path], *, encoding: str = "utf-8") -> None:
"""Initialize with a file path."""
self.file_path = path
self.encoding = encoding

View File

@ -2,7 +2,7 @@ import functools
import logging
import re
from pathlib import Path
from typing import Any, Dict, Iterator
from typing import Any, Dict, Iterator, Union
import yaml
from langchain_core.documents import Document
@ -23,7 +23,10 @@ class ObsidianLoader(BaseLoader):
DATAVIEW_INLINE_PAREN_REGEX = re.compile(r"\((\w+)::\s*(.*)\)", re.MULTILINE)
def __init__(
self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
self,
path: Union[str, Path],
encoding: str = "UTF-8",
collect_metadata: bool = True,
):
"""Initialize with a path.

View File

@ -1,4 +1,5 @@
from typing import Any, List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
@ -31,7 +32,10 @@ class UnstructuredODTLoader(UnstructuredFileLoader):
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""

View File

@ -1,4 +1,5 @@
from typing import Any, List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
@ -31,7 +32,10 @@ class UnstructuredOrgModeLoader(UnstructuredFileLoader):
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""

View File

@ -80,14 +80,14 @@ class BasePDFLoader(BaseLoader, ABC):
clean up the temporary file after completion.
"""
def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
def __init__(self, file_path: Union[str, Path], *, headers: Optional[Dict] = None):
"""Initialize with a file path.
Args:
file_path: Either a local, S3 or web path to a PDF file.
headers: Headers to use for GET request to download a file from a web path.
"""
self.file_path = file_path
self.file_path = str(file_path)
self.web_path = None
self.headers = headers
if "~" in self.file_path:
@ -226,7 +226,7 @@ class PyPDFDirectoryLoader(BaseLoader):
def __init__(
self,
path: str,
path: Union[str, Path],
glob: str = "**/[!.]*.pdf",
silent_errors: bool = False,
load_hidden: bool = False,

View File

@ -1,4 +1,6 @@
import tokenize
from pathlib import Path
from typing import Union
from langchain_community.document_loaders.text import TextLoader
@ -6,7 +8,7 @@ from langchain_community.document_loaders.text import TextLoader
class PythonLoader(TextLoader):
"""Load `Python` files, respecting any non-default encoding if specified."""
def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with a file path.
Args:

View File

@ -1,5 +1,5 @@
from pathlib import Path
from typing import List
from typing import List, Union
from langchain_core.documents import Document
@ -9,7 +9,7 @@ from langchain_community.document_loaders.base import BaseLoader
class RoamLoader(BaseLoader):
"""Load `Roam` files from a directory."""
def __init__(self, path: str):
def __init__(self, path: Union[str, Path]):
"""Initialize with a path."""
self.file_path = path

View File

@ -1,5 +1,6 @@
"""Loads RST files."""
from typing import Any, List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
@ -32,7 +33,10 @@ class UnstructuredRSTLoader(UnstructuredFileLoader):
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""
Initialize with a file path.

View File

@ -1,5 +1,6 @@
"""Loads rich text files."""
from typing import Any, List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
@ -32,7 +33,10 @@ class UnstructuredRTFLoader(UnstructuredFileLoader):
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""
Initialize with a file path.

View File

@ -1,7 +1,7 @@
import json
import zipfile
from pathlib import Path
from typing import Dict, Iterator, List, Optional
from typing import Dict, Iterator, List, Optional, Union
from langchain_core.documents import Document
@ -11,7 +11,7 @@ from langchain_community.document_loaders.base import BaseLoader
class SlackDirectoryLoader(BaseLoader):
"""Load from a `Slack` directory dump."""
def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
def __init__(self, zip_path: Union[str, Path], workspace_url: Optional[str] = None):
"""Initialize the SlackDirectoryLoader.
Args:

View File

@ -1,4 +1,5 @@
from typing import List
from pathlib import Path
from typing import List, Union
from langchain_core.documents import Document
@ -8,7 +9,7 @@ from langchain_community.document_loaders.base import BaseLoader
class SRTLoader(BaseLoader):
"""Load `.srt` (subtitle) files."""
def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with a file path."""
try:
import pysrt # noqa:F401
@ -16,7 +17,7 @@ class SRTLoader(BaseLoader):
raise ImportError(
"package `pysrt` not found, please install it with `pip install pysrt`"
)
self.file_path = file_path
self.file_path = str(file_path)
def load(self) -> List[Document]:
"""Load using pysrt file."""

View File

@ -25,7 +25,7 @@ def concatenate_rows(row: dict) -> str:
class TelegramChatFileLoader(BaseLoader):
"""Load from `Telegram chat` dump."""
def __init__(self, path: str):
def __init__(self, path: Union[str, Path]):
"""Initialize with a path."""
self.file_path = path

View File

@ -1,5 +1,6 @@
import logging
from typing import Iterator, Optional
from pathlib import Path
from typing import Iterator, Optional, Union
from langchain_core.documents import Document
@ -25,7 +26,7 @@ class TextLoader(BaseLoader):
def __init__(
self,
file_path: str,
file_path: Union[str, Path],
encoding: Optional[str] = None,
autodetect_encoding: bool = False,
):
@ -56,5 +57,5 @@ class TextLoader(BaseLoader):
except Exception as e:
raise RuntimeError(f"Error loading {self.file_path}") from e
metadata = {"source": self.file_path}
metadata = {"source": str(self.file_path)}
yield Document(page_content=text, metadata=metadata)

View File

@ -1,4 +1,5 @@
from typing import Any, List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
@ -26,7 +27,10 @@ class UnstructuredTSVLoader(UnstructuredFileLoader):
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
validate_unstructured_version(min_unstructured_version="0.7.6")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

View File

@ -1,6 +1,7 @@
"""Loader that uses unstructured to load files."""
import collections
from abc import ABC, abstractmethod
from pathlib import Path
from typing import IO, Any, Callable, Dict, Iterator, List, Optional, Sequence, Union
from langchain_core.documents import Document
@ -155,7 +156,7 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
def __init__(
self,
file_path: Union[str, List[str]],
file_path: Union[str, List[str], Path, List[Path]],
mode: str = "single",
**unstructured_kwargs: Any,
):
@ -169,9 +170,13 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
if isinstance(self.file_path, list):
elements = []
for file in self.file_path:
if isinstance(file, Path):
file = str(file)
elements.extend(partition(filename=file, **self.unstructured_kwargs))
return elements
else:
if isinstance(self.file_path, Path):
self.file_path = str(self.file_path)
return partition(filename=self.file_path, **self.unstructured_kwargs)
def _get_metadata(self) -> dict:
@ -179,14 +184,16 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
def get_elements_from_api(
file_path: Union[str, List[str], None] = None,
file_path: Union[str, List[str], Path, List[Path], None] = None,
file: Union[IO, Sequence[IO], None] = None,
api_url: str = "https://api.unstructured.io/general/v0/general",
api_key: str = "",
**unstructured_kwargs: Any,
) -> List:
"""Retrieve a list of elements from the `Unstructured API`."""
if isinstance(file, collections.abc.Sequence) or isinstance(file_path, list):
if is_list := isinstance(file_path, list):
file_path = [str(path) for path in file_path]
if isinstance(file, collections.abc.Sequence) or is_list:
from unstructured.partition.api import partition_multiple_via_api
_doc_elements = partition_multiple_via_api(
@ -206,7 +213,7 @@ def get_elements_from_api(
from unstructured.partition.api import partition_via_api
return partition_via_api(
filename=file_path,
filename=str(file_path),
file=file,
api_key=api_key,
api_url=api_url,

View File

@ -1,7 +1,8 @@
import os
import tempfile
from abc import ABC
from typing import List
from pathlib import Path
from typing import List, Union
from urllib.parse import urlparse
import requests
@ -13,9 +14,9 @@ from langchain_community.document_loaders.parsers import VsdxParser
class VsdxLoader(BaseLoader, ABC):
def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with file path."""
self.file_path = file_path
self.file_path = str(file_path)
if "~" in self.file_path:
self.file_path = os.path.expanduser(self.file_path)

View File

@ -2,7 +2,8 @@
import os
import tempfile
from abc import ABC
from typing import List
from pathlib import Path
from typing import List, Union
from urllib.parse import urlparse
import requests
@ -19,9 +20,9 @@ class Docx2txtLoader(BaseLoader, ABC):
to a temporary file, and use that, then clean up the temporary file after completion
"""
def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with file path."""
self.file_path = file_path
self.file_path = str(file_path)
if "~" in self.file_path:
self.file_path = os.path.expanduser(self.file_path)

View File

@ -1,5 +1,6 @@
"""Loads Microsoft Excel files."""
from typing import Any, List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
@ -32,8 +33,12 @@ class UnstructuredXMLLoader(UnstructuredFileLoader):
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
file_path = str(file_path)
validate_unstructured_version(min_unstructured_version="0.6.7")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)