mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-30 16:24:24 +00:00
community: better support of pathlib paths in document loaders (#18396)
This arose from the https://github.com/langchain-ai/langchain/pull/18397 problem of document loaders not supporting `pathlib.Path`. This pull request provides more uniform support for `pathlib.Path` as an argument. The core ideas for this upgrade: - if a local file path is used as an argument, it should be supported as `pathlib.Path` - if there are external calls that may or may not support `pathlib.Path`, the argument is immediately converted to `str` - if `self.file_path` is used in a way that allows it to stay a `pathlib.Path` without conversion, it is only converted to `str` for the metadata. Twitter handle: https://twitter.com/mwmajewsk
This commit is contained in:
parent
94b869a974
commit
f7a1fd91b8
@ -1,6 +1,6 @@
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
from typing import Iterator, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -14,7 +14,10 @@ class AcreomLoader(BaseLoader):
|
||||
"""Regex to match front matter metadata in markdown files."""
|
||||
|
||||
def __init__(
|
||||
self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
|
||||
self,
|
||||
path: Union[str, Path],
|
||||
encoding: str = "UTF-8",
|
||||
collect_metadata: bool = True,
|
||||
):
|
||||
"""Initialize the loader."""
|
||||
self.file_path = path
|
||||
|
@ -1,5 +1,6 @@
|
||||
import json
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.utils import stringify_dict
|
||||
@ -10,7 +11,7 @@ from langchain_community.document_loaders.base import BaseLoader
|
||||
class AirbyteJSONLoader(BaseLoader):
|
||||
"""Load local `Airbyte` json files."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
def __init__(self, file_path: Union[str, Path]):
|
||||
"""Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
|
||||
self.file_path = file_path
|
||||
"""Path to the directory containing the json files."""
|
||||
@ -20,5 +21,5 @@ class AirbyteJSONLoader(BaseLoader):
|
||||
for line in open(self.file_path, "r"):
|
||||
data = json.loads(line)["_airbyte_data"]
|
||||
text += stringify_dict(data)
|
||||
metadata = {"source": self.file_path}
|
||||
metadata = {"source": str(self.file_path)}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
|
@ -1,7 +1,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from enum import Enum
|
||||
from typing import TYPE_CHECKING, Iterator, Optional
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Iterator, Optional, Union
|
||||
|
||||
import requests
|
||||
from langchain_core.documents import Document
|
||||
@ -44,7 +45,7 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, Path],
|
||||
*,
|
||||
transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
|
||||
config: Optional[assemblyai.TranscriptionConfig] = None,
|
||||
@ -71,7 +72,7 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader):
|
||||
if api_key is not None:
|
||||
assemblyai.settings.api_key = api_key
|
||||
|
||||
self.file_path = file_path
|
||||
self.file_path = str(file_path)
|
||||
self.transcript_format = transcript_format
|
||||
self.transcriber = assemblyai.Transcriber(config=config)
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
import csv
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -9,7 +10,7 @@ from langchain_community.document_loaders.base import BaseLoader
|
||||
class CoNLLULoader(BaseLoader):
|
||||
"""Load `CoNLL-U` files."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
def __init__(self, file_path: Union[str, Path]):
|
||||
"""Initialize with a file path."""
|
||||
self.file_path = file_path
|
||||
|
||||
@ -29,5 +30,5 @@ class CoNLLULoader(BaseLoader):
|
||||
else:
|
||||
text += line[1] + " "
|
||||
|
||||
metadata = {"source": self.file_path}
|
||||
metadata = {"source": str(self.file_path)}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
|
@ -1,6 +1,7 @@
|
||||
import csv
|
||||
from io import TextIOWrapper
|
||||
from typing import Any, Dict, Iterator, List, Optional, Sequence
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -35,7 +36,7 @@ class CSVLoader(BaseLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, Path],
|
||||
source_column: Optional[str] = None,
|
||||
metadata_columns: Sequence[str] = (),
|
||||
csv_args: Optional[Dict] = None,
|
||||
@ -89,7 +90,7 @@ class CSVLoader(BaseLoader):
|
||||
source = (
|
||||
row[self.source_column]
|
||||
if self.source_column is not None
|
||||
else self.file_path
|
||||
else str(self.file_path)
|
||||
)
|
||||
except KeyError:
|
||||
raise ValueError(
|
||||
|
@ -1,5 +1,6 @@
|
||||
import os
|
||||
from typing import Any, Iterator, List
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterator, List, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -41,7 +42,10 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
process_attachments = unstructured_kwargs.get("process_attachments")
|
||||
attachment_partitioner = unstructured_kwargs.get("attachment_partitioner")
|
||||
@ -79,17 +83,17 @@ class OutlookMessageLoader(BaseLoader):
|
||||
https://github.com/TeamMsgExtractor/msg-extractor
|
||||
"""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
def __init__(self, file_path: Union[str, Path]):
|
||||
"""Initialize with a file path.
|
||||
|
||||
Args:
|
||||
file_path: The path to the Outlook Message file.
|
||||
"""
|
||||
|
||||
self.file_path = file_path
|
||||
self.file_path = str(file_path)
|
||||
|
||||
if not os.path.isfile(self.file_path):
|
||||
raise ValueError("File path %s is not a valid file" % self.file_path)
|
||||
raise ValueError(f"File path {self.file_path} is not a valid file")
|
||||
|
||||
try:
|
||||
import extract_msg # noqa:F401
|
||||
|
@ -5,8 +5,9 @@ https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
|
||||
import hashlib
|
||||
import logging
|
||||
from base64 import b64decode
|
||||
from pathlib import Path
|
||||
from time import strptime
|
||||
from typing import Any, Dict, Iterator, List, Optional
|
||||
from typing import Any, Dict, Iterator, List, Optional, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -35,9 +36,9 @@ class EverNoteLoader(BaseLoader):
|
||||
the 'source' which contains the file name of the export.
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(self, file_path: str, load_single_document: bool = True):
|
||||
def __init__(self, file_path: Union[str, Path], load_single_document: bool = True):
|
||||
"""Initialize with file path."""
|
||||
self.file_path = file_path
|
||||
self.file_path = str(file_path)
|
||||
self.load_single_document = load_single_document
|
||||
|
||||
def _lazy_load(self) -> Iterator[Document]:
|
||||
|
@ -1,5 +1,6 @@
|
||||
"""Loads Microsoft Excel files."""
|
||||
from typing import Any, List
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Union
|
||||
|
||||
from langchain_community.document_loaders.unstructured import (
|
||||
UnstructuredFileLoader,
|
||||
@ -27,7 +28,10 @@ class UnstructuredExcelLoader(UnstructuredFileLoader):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
import datetime
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
from typing import Iterator, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -25,7 +25,7 @@ def concatenate_rows(row: dict) -> str:
|
||||
class FacebookChatLoader(BaseLoader):
|
||||
"""Load `Facebook Chat` messages directory dump."""
|
||||
|
||||
def __init__(self, path: str):
|
||||
def __init__(self, path: Union[str, Path]):
|
||||
"""Initialize with a path."""
|
||||
self.file_path = path
|
||||
|
||||
|
@ -1,7 +1,8 @@
|
||||
"""Document loader helpers."""
|
||||
|
||||
import concurrent.futures
|
||||
from typing import List, NamedTuple, Optional, cast
|
||||
from pathlib import Path
|
||||
from typing import List, NamedTuple, Optional, Union, cast
|
||||
|
||||
|
||||
class FileEncoding(NamedTuple):
|
||||
@ -15,7 +16,9 @@ class FileEncoding(NamedTuple):
|
||||
"""The language of the file."""
|
||||
|
||||
|
||||
def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
|
||||
def detect_file_encodings(
|
||||
file_path: Union[str, Path], timeout: int = 5
|
||||
) -> List[FileEncoding]:
|
||||
"""Try to detect the file encoding.
|
||||
|
||||
Returns a list of `FileEncoding` tuples with the detected encodings ordered
|
||||
@ -27,6 +30,8 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding
|
||||
"""
|
||||
import chardet
|
||||
|
||||
file_path = str(file_path)
|
||||
|
||||
def read_and_detect(file_path: str) -> List[dict]:
|
||||
with open(file_path, "rb") as f:
|
||||
rawdata = f.read()
|
||||
|
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterator, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
@ -13,7 +14,7 @@ class BSHTMLLoader(BaseLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, Path],
|
||||
open_encoding: Union[str, None] = None,
|
||||
bs_kwargs: Union[dict, None] = None,
|
||||
get_text_separator: str = "",
|
||||
@ -57,7 +58,7 @@ class BSHTMLLoader(BaseLoader):
|
||||
title = ""
|
||||
|
||||
metadata: Dict[str, Union[str, None]] = {
|
||||
"source": self.file_path,
|
||||
"source": str(self.file_path),
|
||||
"title": title,
|
||||
}
|
||||
yield Document(page_content=text, metadata=metadata)
|
||||
|
@ -1,4 +1,5 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Tuple, Union
|
||||
|
||||
import requests
|
||||
@ -17,7 +18,7 @@ class ImageCaptionLoader(BaseLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
images: Union[str, bytes, List[Union[str, bytes]]],
|
||||
images: Union[str, Path, bytes, List[Union[str, bytes, Path]]],
|
||||
blip_processor: str = "Salesforce/blip-image-captioning-base",
|
||||
blip_model: str = "Salesforce/blip-image-captioning-base",
|
||||
):
|
||||
@ -29,7 +30,7 @@ class ImageCaptionLoader(BaseLoader):
|
||||
blip_processor: The name of the pre-trained BLIP processor.
|
||||
blip_model: The name of the pre-trained BLIP model.
|
||||
"""
|
||||
if isinstance(images, (str, bytes)):
|
||||
if isinstance(images, (str, Path, bytes)):
|
||||
self.images = [images]
|
||||
else:
|
||||
self.images = images
|
||||
@ -61,7 +62,7 @@ class ImageCaptionLoader(BaseLoader):
|
||||
return results
|
||||
|
||||
def _get_captions_and_metadata(
|
||||
self, model: Any, processor: Any, image: Union[str, bytes]
|
||||
self, model: Any, processor: Any, image: Union[str, Path, bytes]
|
||||
) -> Tuple[str, dict]:
|
||||
"""Helper function for getting the captions and metadata of an image."""
|
||||
try:
|
||||
@ -76,7 +77,9 @@ class ImageCaptionLoader(BaseLoader):
|
||||
try:
|
||||
if isinstance(image, bytes):
|
||||
image = Image.open(BytesIO(image)).convert("RGB")
|
||||
elif image.startswith("http://") or image.startswith("https://"):
|
||||
elif isinstance(image, str) and (
|
||||
image.startswith("http://") or image.startswith("https://")
|
||||
):
|
||||
image = Image.open(requests.get(image, stream=True).raw).convert("RGB")
|
||||
else:
|
||||
image = Image.open(image).convert("RGB")
|
||||
@ -94,6 +97,6 @@ class ImageCaptionLoader(BaseLoader):
|
||||
if isinstance(image_source, bytes):
|
||||
metadata: dict = {"image_source": "Image bytes provided"}
|
||||
else:
|
||||
metadata = {"image_path": image_source}
|
||||
metadata = {"image_path": str(image_source)}
|
||||
|
||||
return caption, metadata
|
||||
|
@ -1,5 +1,6 @@
|
||||
import email
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterator, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
@ -14,7 +15,7 @@ class MHTMLLoader(BaseLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, Path],
|
||||
open_encoding: Union[str, None] = None,
|
||||
bs_kwargs: Union[dict, None] = None,
|
||||
get_text_separator: str = "",
|
||||
@ -69,7 +70,7 @@ class MHTMLLoader(BaseLoader):
|
||||
title = ""
|
||||
|
||||
metadata: Dict[str, Union[str, None]] = {
|
||||
"source": self.file_path,
|
||||
"source": str(self.file_path),
|
||||
"title": title,
|
||||
}
|
||||
yield Document(page_content=text, metadata=metadata)
|
||||
|
@ -1,7 +1,7 @@
|
||||
"""Loads .ipynb notebook files."""
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any, List
|
||||
from typing import Any, List, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -75,7 +75,7 @@ class NotebookLoader(BaseLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: str,
|
||||
path: Union[str, Path],
|
||||
include_outputs: bool = False,
|
||||
max_output_length: int = 10,
|
||||
remove_newline: bool = False,
|
||||
|
@ -1,5 +1,5 @@
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from typing import List, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -9,7 +9,7 @@ from langchain_community.document_loaders.base import BaseLoader
|
||||
class NotionDirectoryLoader(BaseLoader):
|
||||
"""Load `Notion directory` dump."""
|
||||
|
||||
def __init__(self, path: str, *, encoding: str = "utf-8") -> None:
|
||||
def __init__(self, path: Union[str, Path], *, encoding: str = "utf-8") -> None:
|
||||
"""Initialize with a file path."""
|
||||
self.file_path = path
|
||||
self.encoding = encoding
|
||||
|
@ -2,7 +2,7 @@ import functools
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterator
|
||||
from typing import Any, Dict, Iterator, Union
|
||||
|
||||
import yaml
|
||||
from langchain_core.documents import Document
|
||||
@ -23,7 +23,10 @@ class ObsidianLoader(BaseLoader):
|
||||
DATAVIEW_INLINE_PAREN_REGEX = re.compile(r"\((\w+)::\s*(.*)\)", re.MULTILINE)
|
||||
|
||||
def __init__(
|
||||
self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
|
||||
self,
|
||||
path: Union[str, Path],
|
||||
encoding: str = "UTF-8",
|
||||
collect_metadata: bool = True,
|
||||
):
|
||||
"""Initialize with a path.
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
from typing import Any, List
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Union
|
||||
|
||||
from langchain_community.document_loaders.unstructured import (
|
||||
UnstructuredFileLoader,
|
||||
@ -31,7 +32,10 @@ class UnstructuredODTLoader(UnstructuredFileLoader):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
from typing import Any, List
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Union
|
||||
|
||||
from langchain_community.document_loaders.unstructured import (
|
||||
UnstructuredFileLoader,
|
||||
@ -31,7 +32,10 @@ class UnstructuredOrgModeLoader(UnstructuredFileLoader):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""
|
||||
|
||||
|
@ -80,14 +80,14 @@ class BasePDFLoader(BaseLoader, ABC):
|
||||
clean up the temporary file after completion.
|
||||
"""
|
||||
|
||||
def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
|
||||
def __init__(self, file_path: Union[str, Path], *, headers: Optional[Dict] = None):
|
||||
"""Initialize with a file path.
|
||||
|
||||
Args:
|
||||
file_path: Either a local, S3 or web path to a PDF file.
|
||||
headers: Headers to use for GET request to download a file from a web path.
|
||||
"""
|
||||
self.file_path = file_path
|
||||
self.file_path = str(file_path)
|
||||
self.web_path = None
|
||||
self.headers = headers
|
||||
if "~" in self.file_path:
|
||||
@ -226,7 +226,7 @@ class PyPDFDirectoryLoader(BaseLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: str,
|
||||
path: Union[str, Path],
|
||||
glob: str = "**/[!.]*.pdf",
|
||||
silent_errors: bool = False,
|
||||
load_hidden: bool = False,
|
||||
|
@ -1,4 +1,6 @@
|
||||
import tokenize
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
from langchain_community.document_loaders.text import TextLoader
|
||||
|
||||
@ -6,7 +8,7 @@ from langchain_community.document_loaders.text import TextLoader
|
||||
class PythonLoader(TextLoader):
|
||||
"""Load `Python` files, respecting any non-default encoding if specified."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
def __init__(self, file_path: Union[str, Path]):
|
||||
"""Initialize with a file path.
|
||||
|
||||
Args:
|
||||
|
@ -1,5 +1,5 @@
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from typing import List, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -9,7 +9,7 @@ from langchain_community.document_loaders.base import BaseLoader
|
||||
class RoamLoader(BaseLoader):
|
||||
"""Load `Roam` files from a directory."""
|
||||
|
||||
def __init__(self, path: str):
|
||||
def __init__(self, path: Union[str, Path]):
|
||||
"""Initialize with a path."""
|
||||
self.file_path = path
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
"""Loads RST files."""
|
||||
from typing import Any, List
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Union
|
||||
|
||||
from langchain_community.document_loaders.unstructured import (
|
||||
UnstructuredFileLoader,
|
||||
@ -32,7 +33,10 @@ class UnstructuredRSTLoader(UnstructuredFileLoader):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""
|
||||
Initialize with a file path.
|
||||
|
@ -1,5 +1,6 @@
|
||||
"""Loads rich text files."""
|
||||
from typing import Any, List
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Union
|
||||
|
||||
from langchain_community.document_loaders.unstructured import (
|
||||
UnstructuredFileLoader,
|
||||
@ -32,7 +33,10 @@ class UnstructuredRTFLoader(UnstructuredFileLoader):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""
|
||||
Initialize with a file path.
|
||||
|
@ -1,7 +1,7 @@
|
||||
import json
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterator, List, Optional
|
||||
from typing import Dict, Iterator, List, Optional, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -11,7 +11,7 @@ from langchain_community.document_loaders.base import BaseLoader
|
||||
class SlackDirectoryLoader(BaseLoader):
|
||||
"""Load from a `Slack` directory dump."""
|
||||
|
||||
def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
|
||||
def __init__(self, zip_path: Union[str, Path], workspace_url: Optional[str] = None):
|
||||
"""Initialize the SlackDirectoryLoader.
|
||||
|
||||
Args:
|
||||
|
@ -1,4 +1,5 @@
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -8,7 +9,7 @@ from langchain_community.document_loaders.base import BaseLoader
|
||||
class SRTLoader(BaseLoader):
|
||||
"""Load `.srt` (subtitle) files."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
def __init__(self, file_path: Union[str, Path]):
|
||||
"""Initialize with a file path."""
|
||||
try:
|
||||
import pysrt # noqa:F401
|
||||
@ -16,7 +17,7 @@ class SRTLoader(BaseLoader):
|
||||
raise ImportError(
|
||||
"package `pysrt` not found, please install it with `pip install pysrt`"
|
||||
)
|
||||
self.file_path = file_path
|
||||
self.file_path = str(file_path)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load using pysrt file."""
|
||||
|
@ -25,7 +25,7 @@ def concatenate_rows(row: dict) -> str:
|
||||
class TelegramChatFileLoader(BaseLoader):
|
||||
"""Load from `Telegram chat` dump."""
|
||||
|
||||
def __init__(self, path: str):
|
||||
def __init__(self, path: Union[str, Path]):
|
||||
"""Initialize with a path."""
|
||||
self.file_path = path
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
import logging
|
||||
from typing import Iterator, Optional
|
||||
from pathlib import Path
|
||||
from typing import Iterator, Optional, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -25,7 +26,7 @@ class TextLoader(BaseLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, Path],
|
||||
encoding: Optional[str] = None,
|
||||
autodetect_encoding: bool = False,
|
||||
):
|
||||
@ -56,5 +57,5 @@ class TextLoader(BaseLoader):
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Error loading {self.file_path}") from e
|
||||
|
||||
metadata = {"source": self.file_path}
|
||||
metadata = {"source": str(self.file_path)}
|
||||
yield Document(page_content=text, metadata=metadata)
|
||||
|
@ -1,4 +1,5 @@
|
||||
from typing import Any, List
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Union
|
||||
|
||||
from langchain_community.document_loaders.unstructured import (
|
||||
UnstructuredFileLoader,
|
||||
@ -26,7 +27,10 @@ class UnstructuredTSVLoader(UnstructuredFileLoader):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
validate_unstructured_version(min_unstructured_version="0.7.6")
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
|
@ -1,6 +1,7 @@
|
||||
"""Loader that uses unstructured to load files."""
|
||||
import collections
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import IO, Any, Callable, Dict, Iterator, List, Optional, Sequence, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
@ -155,7 +156,7 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: Union[str, List[str]],
|
||||
file_path: Union[str, List[str], Path, List[Path]],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
@ -169,9 +170,13 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
|
||||
if isinstance(self.file_path, list):
|
||||
elements = []
|
||||
for file in self.file_path:
|
||||
if isinstance(file, Path):
|
||||
file = str(file)
|
||||
elements.extend(partition(filename=file, **self.unstructured_kwargs))
|
||||
return elements
|
||||
else:
|
||||
if isinstance(self.file_path, Path):
|
||||
self.file_path = str(self.file_path)
|
||||
return partition(filename=self.file_path, **self.unstructured_kwargs)
|
||||
|
||||
def _get_metadata(self) -> dict:
|
||||
@ -179,14 +184,16 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
|
||||
|
||||
|
||||
def get_elements_from_api(
|
||||
file_path: Union[str, List[str], None] = None,
|
||||
file_path: Union[str, List[str], Path, List[Path], None] = None,
|
||||
file: Union[IO, Sequence[IO], None] = None,
|
||||
api_url: str = "https://api.unstructured.io/general/v0/general",
|
||||
api_key: str = "",
|
||||
**unstructured_kwargs: Any,
|
||||
) -> List:
|
||||
"""Retrieve a list of elements from the `Unstructured API`."""
|
||||
if isinstance(file, collections.abc.Sequence) or isinstance(file_path, list):
|
||||
if is_list := isinstance(file_path, list):
|
||||
file_path = [str(path) for path in file_path]
|
||||
if isinstance(file, collections.abc.Sequence) or is_list:
|
||||
from unstructured.partition.api import partition_multiple_via_api
|
||||
|
||||
_doc_elements = partition_multiple_via_api(
|
||||
@ -206,7 +213,7 @@ def get_elements_from_api(
|
||||
from unstructured.partition.api import partition_via_api
|
||||
|
||||
return partition_via_api(
|
||||
filename=file_path,
|
||||
filename=str(file_path),
|
||||
file=file,
|
||||
api_key=api_key,
|
||||
api_url=api_url,
|
||||
|
@ -1,7 +1,8 @@
|
||||
import os
|
||||
import tempfile
|
||||
from abc import ABC
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
@ -13,9 +14,9 @@ from langchain_community.document_loaders.parsers import VsdxParser
|
||||
|
||||
|
||||
class VsdxLoader(BaseLoader, ABC):
|
||||
def __init__(self, file_path: str):
|
||||
def __init__(self, file_path: Union[str, Path]):
|
||||
"""Initialize with file path."""
|
||||
self.file_path = file_path
|
||||
self.file_path = str(file_path)
|
||||
if "~" in self.file_path:
|
||||
self.file_path = os.path.expanduser(self.file_path)
|
||||
|
||||
|
@ -2,7 +2,8 @@
|
||||
import os
|
||||
import tempfile
|
||||
from abc import ABC
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
@ -19,9 +20,9 @@ class Docx2txtLoader(BaseLoader, ABC):
|
||||
to a temporary file, and use that, then clean up the temporary file after completion
|
||||
"""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
def __init__(self, file_path: Union[str, Path]):
|
||||
"""Initialize with file path."""
|
||||
self.file_path = file_path
|
||||
self.file_path = str(file_path)
|
||||
if "~" in self.file_path:
|
||||
self.file_path = os.path.expanduser(self.file_path)
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
"""Loads Microsoft Excel files."""
|
||||
from typing import Any, List
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Union
|
||||
|
||||
from langchain_community.document_loaders.unstructured import (
|
||||
UnstructuredFileLoader,
|
||||
@ -32,8 +33,12 @@ class UnstructuredXMLLoader(UnstructuredFileLoader):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
file_path = str(file_path)
|
||||
validate_unstructured_version(min_unstructured_version="0.6.7")
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user