Mirror of https://github.com/hwchase17/langchain.git, synced 2026-02-10 03:00:59 +00:00
Compare commits (3 commits): langchain= ... bagatur/de
| Author | SHA1 | Date |
|---|---|---|
|  | abab93b5ec |  |
|  | 37017c0e3f |  |
|  | 40ea8a5402 |  |
@@ -19,6 +19,8 @@ import importlib
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from langchain_core.blob_loaders.file_system import FileSystemBlobLoader

    from langchain_community.document_loaders.acreom import (
        AcreomLoader,
    )

@@ -87,7 +89,6 @@ if TYPE_CHECKING:
    from langchain_community.document_loaders.blob_loaders import (
        Blob,
        BlobLoader,
        FileSystemBlobLoader,
        YoutubeAudioLoader,
    )
    from langchain_community.document_loaders.blockchain import (

@@ -800,7 +801,6 @@ __all__ = [
    "FaunaLoader",
    "FigmaFileLoader",
    "FireCrawlLoader",
    "FileSystemBlobLoader",
    "GCSDirectoryLoader",
    "GlueCatalogLoader",
    "GCSFileLoader",
@@ -10,6 +10,7 @@ from enum import Enum
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Sequence, Union

from langchain_core.blob_loaders.file_system import FileSystemBlobLoader
from langchain_core.pydantic_v1 import (
    BaseModel,
    BaseSettings,
@@ -19,9 +20,6 @@ from langchain_core.pydantic_v1 import (
)

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.blob_loaders.file_system import (
    FileSystemBlobLoader,
)
from langchain_community.document_loaders.blob_loaders.schema import Blob

if TYPE_CHECKING:
@@ -4,12 +4,11 @@ from typing import TYPE_CHECKING, Any
from langchain_core.document_loaders import Blob, BlobLoader

if TYPE_CHECKING:
    from langchain_core.blob_loaders.file_system import FileSystemBlobLoader

    from langchain_community.document_loaders.blob_loaders.cloud_blob_loader import (
        CloudBlobLoader,
    )
    from langchain_community.document_loaders.blob_loaders.file_system import (
        FileSystemBlobLoader,
    )
    from langchain_community.document_loaders.blob_loaders.youtube_audio import (
        YoutubeAudioLoader,
    )
@@ -39,6 +38,5 @@ __all__ = [
    "BlobLoader",
    "Blob",
    "CloudBlobLoader",
    "FileSystemBlobLoader",
    "YoutubeAudioLoader",
]
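Editorial note: the net effect of the re-export changes above is that the community import path keeps working while the implementation moves to core. A minimal sanity check, as a sketch (assumes this branch is installed):

from langchain_core.blob_loaders.file_system import FileSystemBlobLoader as CoreLoader
from langchain_community.document_loaders.blob_loaders import (
    FileSystemBlobLoader as CommunityLoader,
)

# Both import paths should now resolve to the single class defined in core.
assert CoreLoader is CommunityLoader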
@@ -1,149 +1,3 @@
"""Use to load blobs from the local file system."""
from langchain_core.blob_loaders.file_system import FileSystemBlobLoader

from pathlib import Path
from typing import Callable, Iterable, Iterator, Optional, Sequence, TypeVar, Union

from langchain_community.document_loaders.blob_loaders.schema import Blob, BlobLoader

T = TypeVar("T")


def _make_iterator(
    length_func: Callable[[], int], show_progress: bool = False
) -> Callable[[Iterable[T]], Iterator[T]]:
    """Create a function that optionally wraps an iterable in tqdm."""
    iterator: Callable[[Iterable[T]], Iterator[T]]
    if show_progress:
        try:
            from tqdm.auto import tqdm
        except ImportError:
            raise ImportError(
                "You must install tqdm to use show_progress=True. "
                "You can install tqdm with `pip install tqdm`."
            )

        # Make sure to provide `total` here so that tqdm can show
        # a progress bar that takes into account the total number of files.
        def _with_tqdm(iterable: Iterable[T]) -> Iterator[T]:
            """Wrap an iterable in a tqdm progress bar."""
            return tqdm(iterable, total=length_func())

        iterator = _with_tqdm
    else:
        iterator = iter

    return iterator


# PUBLIC API


class FileSystemBlobLoader(BlobLoader):
    """Load blobs in the local file system.

    Example:

    .. code-block:: python

        from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader
        loader = FileSystemBlobLoader("/path/to/directory")
        for blob in loader.yield_blobs():
            print(blob)  # noqa: T201
    """  # noqa: E501

    def __init__(
        self,
        path: Union[str, Path],
        *,
        glob: str = "**/[!.]*",
        exclude: Sequence[str] = (),
        suffixes: Optional[Sequence[str]] = None,
        show_progress: bool = False,
    ) -> None:
        """Initialize with a path to directory and how to glob over it.

        Args:
            path: Path to directory to load from or path to file to load.
                If a path to a file is provided, glob/exclude/suffixes are ignored.
            glob: Glob pattern relative to the specified path
                by default set to pick up all non-hidden files
            exclude: patterns to exclude from results, use glob syntax
            suffixes: Provide to keep only files with these suffixes
                Useful when wanting to keep files with different suffixes
                Suffixes must include the dot, e.g. ".txt"
            show_progress: If true, will show a progress bar as the files are loaded.
                This forces an iteration through all matching files
                to count them prior to loading them.

        Examples:

        .. code-block:: python

            from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader

            # Load a single file.
            loader = FileSystemBlobLoader("/path/to/file.txt")

            # Recursively load all text files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")

            # Recursively load all non-hidden files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*")

            # Load all files in a directory without recursion.
            loader = FileSystemBlobLoader("/path/to/directory", glob="*")

            # Recursively load all files in a directory, except for py or pyc files.
            loader = FileSystemBlobLoader(
                "/path/to/directory",
                glob="**/*.txt",
                exclude=["**/*.py", "**/*.pyc"]
            )
        """  # noqa: E501
        if isinstance(path, Path):
            _path = path
        elif isinstance(path, str):
            _path = Path(path)
        else:
            raise TypeError(f"Expected str or Path, got {type(path)}")

        self.path = _path.expanduser()  # Expand user to handle ~
        self.glob = glob
        self.suffixes = set(suffixes or [])
        self.show_progress = show_progress
        self.exclude = exclude

    def yield_blobs(
        self,
    ) -> Iterable[Blob]:
        """Yield blobs that match the requested pattern."""
        iterator = _make_iterator(
            length_func=self.count_matching_files, show_progress=self.show_progress
        )

        for path in iterator(self._yield_paths()):
            yield Blob.from_path(path)

    def _yield_paths(self) -> Iterable[Path]:
        """Yield paths that match the requested pattern."""
        if self.path.is_file():
            yield self.path
            return

        paths = self.path.glob(self.glob)
        for path in paths:
            if self.exclude:
                if any(path.match(glob) for glob in self.exclude):
                    continue
            if path.is_file():
                if self.suffixes and path.suffix not in self.suffixes:
                    continue
                yield path

    def count_matching_files(self) -> int:
        """Count files that match the pattern without loading them."""
        # Carry out a full iteration to count the files without
        # materializing anything expensive in memory.
        num = 0
        for _ in self._yield_paths():
            num += 1
        return num

__all__ = ["FileSystemBlobLoader"]
@@ -4,12 +4,12 @@ import concurrent.futures
from pathlib import Path
from typing import Iterator, Literal, Optional, Sequence, Union

from langchain_core.blob_loaders.file_system import FileSystemBlobLoader
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import (
    BlobLoader,
    FileSystemBlobLoader,
)
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers.registry import get_parser
@@ -3,69 +3,6 @@
This module contains some logic to help assemble more sophisticated parsers.
"""
from typing import Iterator, Mapping, Optional
from langchain_core.blob_parsers.mime_type import MimeTypeBasedParser

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders.schema import Blob


class MimeTypeBasedParser(BaseBlobParser):
    """Parser that uses `mime`-types to parse a blob.

    This parser is useful for simple pipelines where the mime-type is sufficient
    to determine how to parse a blob.

    To use, configure handlers based on mime-types and pass them to the initializer.

    Example:

    .. code-block:: python

        from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser

        parser = MimeTypeBasedParser(
            handlers={
                "application/pdf": ...,
            },
            fallback_parser=...,
        )
    """  # noqa: E501

    def __init__(
        self,
        handlers: Mapping[str, BaseBlobParser],
        *,
        fallback_parser: Optional[BaseBlobParser] = None,
    ) -> None:
        """Define a parser that uses mime-types to determine how to parse a blob.

        Args:
            handlers: A mapping from mime-types to functions that take a blob, parse it
                and return a document.
            fallback_parser: A fallback parser to use if the mime-type is not
                found in the handlers. If provided, this parser will be
                used to parse blobs with all mime-types not found in
                the handlers.
                If not provided, a ValueError will be raised if the
                mime-type is not found in the handlers.
        """
        self.handlers = handlers
        self.fallback_parser = fallback_parser

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Load documents from a blob."""
        mimetype = blob.mimetype

        if mimetype is None:
            raise ValueError(f"{blob} does not have a mimetype.")

        if mimetype in self.handlers:
            handler = self.handlers[mimetype]
            yield from handler.lazy_parse(blob)
        else:
            if self.fallback_parser is not None:
                yield from self.fallback_parser.lazy_parse(blob)
            else:
                raise ValueError(f"Unsupported mime type: {mimetype}")

__all__ = ["MimeTypeBasedParser"]
@@ -1,10 +1,11 @@
"""Module includes a registry of default parser configurations."""
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser
from langchain_core.blob_parsers.mime_type import MimeTypeBasedParser
from langchain_core.blob_parsers.txt import TextParser
from langchain_core.document_loaders.base import BaseBlobParser

from langchain_community.document_loaders.parsers.msword import MsWordParser
from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser
from langchain_community.document_loaders.parsers.txt import TextParser


def _get_default_parser() -> BaseBlobParser:
@@ -17,6 +18,7 @@ def _get_default_parser() -> BaseBlobParser:
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
                MsWordParser()
            ),
            "text/x-python": TextParser(),
        },
        fallback_parser=None,
    )
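For context, the registry's "default" entry wires mime types to these parsers. A sketch of consulting it (assumes get_parser keeps its community entry point on this branch):

from langchain_community.document_loaders.parsers.registry import get_parser

# Resolves to a MimeTypeBasedParser covering the pdf, docx, and text/x-python handlers.
parser = get_parser("default")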
@@ -1,16 +1,5 @@
"""Module for parsing text files."""
from typing import Iterator
from langchain_core.blob_parsers.txt import TextParser

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob


class TextParser(BaseBlobParser):
    """Parser for text blobs."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob."""
        yield Document(page_content=blob.as_string(), metadata={"source": blob.source})  # type: ignore[attr-defined]

__all__ = ["TextParser"]
@@ -3,11 +3,11 @@
from typing import Iterator

import pytest
from langchain_core.blob_parsers.mime_type import MimeTypeBasedParser
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser


class TestMimeBasedParser:
@@ -6,12 +6,12 @@ from pathlib import Path
from typing import Any, Generator, Iterator

import pytest
from langchain_core.blob_parsers.txt import TextParser
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob, FileSystemBlobLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers.txt import TextParser


@pytest.fixture
libs/core/langchain_core/blob_loaders/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from langchain_core.blob_loaders.file_system import FileSystemBlobLoader
from langchain_core.document_loaders.blob_loaders import BlobLoader

__all__ = ["BlobLoader", "FileSystemBlobLoader"]
libs/core/langchain_core/blob_loaders/file_system.py (new file, 150 lines)
@@ -0,0 +1,150 @@
"""Use to load blobs from the local file system."""

from pathlib import Path
from typing import Callable, Iterable, Iterator, Optional, Sequence, TypeVar, Union

from langchain_core.document_loaders import BlobLoader
from langchain_core.documents.base import Blob

T = TypeVar("T")


def _make_iterator(
    length_func: Callable[[], int], show_progress: bool = False
) -> Callable[[Iterable[T]], Iterator[T]]:
    """Create a function that optionally wraps an iterable in tqdm."""
    iterator: Callable[[Iterable[T]], Iterator[T]]
    if show_progress:
        try:
            from tqdm.auto import tqdm
        except ImportError:
            raise ImportError(
                "You must install tqdm to use show_progress=True. "
                "You can install tqdm with `pip install tqdm`."
            )

        # Make sure to provide `total` here so that tqdm can show
        # a progress bar that takes into account the total number of files.
        def _with_tqdm(iterable: Iterable[T]) -> Iterator[T]:
            """Wrap an iterable in a tqdm progress bar."""
            return tqdm(iterable, total=length_func())

        iterator = _with_tqdm
    else:
        iterator = iter

    return iterator


# PUBLIC API


class FileSystemBlobLoader(BlobLoader):
    """Load blobs in the local file system.

    Example:

    .. code-block:: python

        from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader
        loader = FileSystemBlobLoader("/path/to/directory")
        for blob in loader.yield_blobs():
            print(blob)  # noqa: T201
    """  # noqa: E501

    def __init__(
        self,
        path: Union[str, Path],
        *,
        glob: str = "**/[!.]*",
        exclude: Sequence[str] = (),
        suffixes: Optional[Sequence[str]] = None,
        show_progress: bool = False,
    ) -> None:
        """Initialize with a path to directory and how to glob over it.

        Args:
            path: Path to directory to load from or path to file to load.
                If a path to a file is provided, glob/exclude/suffixes are ignored.
            glob: Glob pattern relative to the specified path
                by default set to pick up all non-hidden files
            exclude: patterns to exclude from results, use glob syntax
            suffixes: Provide to keep only files with these suffixes
                Useful when wanting to keep files with different suffixes
                Suffixes must include the dot, e.g. ".txt"
            show_progress: If true, will show a progress bar as the files are loaded.
                This forces an iteration through all matching files
                to count them prior to loading them.

        Examples:

        .. code-block:: python

            from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader

            # Load a single file.
            loader = FileSystemBlobLoader("/path/to/file.txt")

            # Recursively load all text files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")

            # Recursively load all non-hidden files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*")

            # Load all files in a directory without recursion.
            loader = FileSystemBlobLoader("/path/to/directory", glob="*")

            # Recursively load all files in a directory, except for py or pyc files.
            loader = FileSystemBlobLoader(
                "/path/to/directory",
                glob="**/*.txt",
                exclude=["**/*.py", "**/*.pyc"]
            )
        """  # noqa: E501
        if isinstance(path, Path):
            _path = path
        elif isinstance(path, str):
            _path = Path(path)
        else:
            raise TypeError(f"Expected str or Path, got {type(path)}")

        self.path = _path.expanduser()  # Expand user to handle ~
        self.glob = glob
        self.suffixes = set(suffixes or [])
        self.show_progress = show_progress
        self.exclude = exclude if not isinstance(exclude, str) else (exclude,)

    def yield_blobs(
        self,
    ) -> Iterable[Blob]:
        """Yield blobs that match the requested pattern."""
        iterator = _make_iterator(
            length_func=self.count_matching_files, show_progress=self.show_progress
        )

        for path in iterator(self._yield_paths()):
            yield Blob.from_path(path)

    def _yield_paths(self) -> Iterable[Path]:
        """Yield paths that match the requested pattern."""
        if self.path.is_file():
            yield self.path
            return

        paths = self.path.glob(self.glob)
        for path in paths:
            if self.exclude:
                if any(path.match(glob) for glob in self.exclude):
                    continue
            if path.is_file():
                if self.suffixes and path.suffix not in self.suffixes:
                    continue
                yield path

    def count_matching_files(self) -> int:
        """Count files that match the pattern without loading them."""
        # Carry out a full iteration to count the files without
        # materializing anything expensive in memory.
        num = 0
        for _ in self._yield_paths():
            num += 1
        return num
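A quick way to exercise the loader now living in core; a sketch (the temporary directory and file names are illustrative):

import tempfile
from pathlib import Path

from langchain_core.blob_loaders.file_system import FileSystemBlobLoader

with tempfile.TemporaryDirectory() as d:
    (Path(d) / "a.txt").write_text("hello")
    (Path(d) / "b.py").write_text("print('hi')")
    loader = FileSystemBlobLoader(d, suffixes=[".txt"])
    blobs = list(loader.yield_blobs())
    assert len(blobs) == 1  # b.py is dropped by the suffix filter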
libs/core/langchain_core/blob_parsers/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from langchain_core.blob_parsers.mime_type import MimeTypeBasedParser


def __getattr__(name):
    if name == "MimeTypeBasedParser":
        from langchain_core.blob_parsers.mime_type import MimeTypeBasedParser

        return MimeTypeBasedParser
    else:
        raise AttributeError(
            f"No {name} attribute in module langchain_core.blob_parsers."
        )


__all__ = ["MimeTypeBasedParser"]
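The module-level __getattr__ above is the PEP 562 lazy-import pattern: the submodule is only imported on first attribute access. A sketch of the observable behavior (assumes this branch):

import langchain_core.blob_parsers as blob_parsers

# First attribute access triggers __getattr__, which imports mime_type lazily.
parser_cls = blob_parsers.MimeTypeBasedParser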
libs/core/langchain_core/blob_parsers/mime_type.py (new file, 65 lines)
@@ -0,0 +1,65 @@
from typing import Iterator, Mapping, Optional

from langchain_core.document_loaders import BaseBlobParser
from langchain_core.documents import Document
from langchain_core.documents.base import Blob


class MimeTypeBasedParser(BaseBlobParser):
    """Parser that uses `mime`-types to parse a blob.

    This parser is useful for simple pipelines where the mime-type is sufficient
    to determine how to parse a blob.

    To use, configure handlers based on mime-types and pass them to the initializer.

    Example:

    .. code-block:: python

        from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser

        parser = MimeTypeBasedParser(
            handlers={
                "application/pdf": ...,
            },
            fallback_parser=...,
        )
    """  # noqa: E501

    def __init__(
        self,
        handlers: Mapping[str, BaseBlobParser],
        *,
        fallback_parser: Optional[BaseBlobParser] = None,
    ) -> None:
        """Define a parser that uses mime-types to determine how to parse a blob.

        Args:
            handlers: A mapping from mime-types to functions that take a blob, parse it
                and return a document.
            fallback_parser: A fallback parser to use if the mime-type is not
                found in the handlers. If provided, this parser will be
                used to parse blobs with all mime-types not found in
                the handlers.
                If not provided, a ValueError will be raised if the
                mime-type is not found in the handlers.
        """
        self.handlers = handlers
        self.fallback_parser = fallback_parser

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Load documents from a blob."""
        mimetype = blob.mimetype

        if mimetype is None:
            raise ValueError(f"{blob} does not have a mimetype.")

        if mimetype in self.handlers:
            handler = self.handlers[mimetype]
            yield from handler.lazy_parse(blob)
        else:
            if self.fallback_parser is not None:
                yield from self.fallback_parser.lazy_parse(blob)
            else:
                raise ValueError(f"Unsupported mime type: {mimetype}")
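A minimal end-to-end sketch combining the two new core parsers (Blob.from_data is the existing langchain_core constructor; import paths assume this branch):

from langchain_core.blob_parsers.mime_type import MimeTypeBasedParser
from langchain_core.blob_parsers.txt import TextParser
from langchain_core.documents.base import Blob

parser = MimeTypeBasedParser(handlers={"text/plain": TextParser()})
blob = Blob.from_data(b"hello world", mime_type="text/plain")
docs = list(parser.lazy_parse(blob))  # one Document with page_content "hello world"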
libs/core/langchain_core/blob_parsers/txt.py (new file, 13 lines)
@@ -0,0 +1,13 @@
from typing import Iterator

from langchain_core.document_loaders import BaseBlobParser
from langchain_core.documents import Document
from langchain_core.documents.base import Blob


class TextParser(BaseBlobParser):
    """Parser for text blobs."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob."""
        yield Document(page_content=blob.as_string(), metadata={"source": blob.source})  # type: ignore[attr-defined]
@@ -18,7 +18,8 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import List, Sequence, Union
from typing import List, Optional, Sequence, Union
from uuid import UUID

from langchain_core.messages import (
    AIMessage,
@@ -206,38 +207,27 @@ class InMemoryChatMessageHistory(BaseChatMessageHistory, BaseModel):
    messages: List[BaseMessage] = Field(default_factory=list)
    """A list of messages stored in memory."""

    async def aget_messages(self) -> List[BaseMessage]:
        """Async version of getting messages.

        Can override this method to provide an efficient async implementation.
        In general, fetching messages may involve IO to the underlying
        persistence layer.

        Returns:
            List of messages.
        """
        return self.messages

    def add_message(self, message: BaseMessage) -> None:
        """Add a self-created message to the store.

        Args:
            message: The message to add.
        """
        self.messages.append(message)

    async def aadd_messages(self, messages: Sequence[BaseMessage]) -> None:
        """Async add messages to the store.

        Args:
            messages: The messages to add.
        """
        self.add_messages(messages)

    def clear(self) -> None:
        """Clear all messages from the store."""
        self.messages = []

    async def aclear(self) -> None:
        """Async clear all messages from the store."""
        self.clear()


class BaseHistoryManager(ABC):
    @abstractmethod
    def get_session(self, session_id: Union[str, UUID]) -> BaseChatMessageHistory:
        """Return the chat message history for the given session id."""


class InMemManager(BaseHistoryManager):
    """In-memory history manager: one chat history per session id."""

    def __init__(self, sessions: Optional[dict] = None) -> None:
        self.sessions = sessions or {}

    def get_session(self, session_id: Union[str, UUID]) -> InMemoryChatMessageHistory:
        if session_id not in self.sessions:
            self.sessions[session_id] = InMemoryChatMessageHistory()
        return self.sessions[session_id]
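A sketch of the intended use of the new InMemManager (assumes this branch): histories are created on demand and reused on repeat lookups for the same session id:

from langchain_core.chat_history import InMemManager
from langchain_core.messages import HumanMessage

manager = InMemManager()
history = manager.get_session("session-1")
history.add_message(HumanMessage(content="hi"))
assert manager.get_session("session-1") is history  # same history object reused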
@@ -12,15 +12,11 @@ from typing import (
    Union,
)

from langchain_core.blob_loaders import FileSystemBlobLoader
from langchain_core.document_loaders.base import BaseBlobParser, BaseLoader
from langchain_core.document_loaders.blob_loaders import BlobLoader
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser, BaseLoader
from langchain_community.document_loaders.blob_loaders import (
    BlobLoader,
    FileSystemBlobLoader,
)
from langchain_community.document_loaders.parsers.registry import get_parser

if TYPE_CHECKING:
    from langchain_text_splitters import TextSplitter
@@ -172,10 +168,24 @@ class GenericLoader(BaseLoader):
                # If there is an implementation of get_parser on the class, use it.
                blob_parser = cls.get_parser(**(parser_kwargs or {}))
            except NotImplementedError:
                # if not then use the global registry.
                blob_parser = get_parser(parser)
                # if not then try to use the global registry.
                try:
                    from langchain_community.document_loaders.parsers.registry import (
                        get_parser,
                    )
                except ImportError as e:
                    raise ValueError("") from e
                else:
                    blob_parser = get_parser(parser)
        else:
            blob_parser = get_parser(parser)
            try:
                from langchain_community.document_loaders.parsers.registry import (
                    get_parser,
                )
            except ImportError as e:
                raise ValueError("") from e
            else:
                blob_parser = get_parser(parser)
        else:
            blob_parser = parser
        return cls(blob_loader, blob_parser)
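Note the empty-string ValueError above is still a placeholder in this WIP branch. For context, a sketch of constructing the moved loader directly, which sidesteps registry resolution entirely (import paths assume this branch, and that GenericLoader keeps its community (blob_loader, blob_parser) constructor):

from langchain_core.blob_loaders import FileSystemBlobLoader
from langchain_core.blob_parsers.txt import TextParser
from langchain_core.document_loaders.generic import GenericLoader

loader = GenericLoader(
    FileSystemBlobLoader("/path/to/docs", glob="**/*.txt"),
    TextParser(),
)
docs = loader.load()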
@@ -293,3 +293,6 @@ class Document(BaseMedia):
            return f"page_content='{self.page_content}' metadata={self.metadata}"
        else:
            return f"page_content='{self.page_content}'"

    def __len__(self) -> int:
        return len(self.page_content)
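The added __len__ delegates to the page content, so len() now works on a Document:

from langchain_core.documents import Document

doc = Document(page_content="hello")
assert len(doc) == 5  # length of page_content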
@@ -35,7 +35,9 @@ from langchain_core.utils import get_pydantic_field_names
if TYPE_CHECKING:
    from langchain_core.caches import BaseCache
    from langchain_core.callbacks import Callbacks
    from langchain_core.chat_history import BaseHistoryManager
    from langchain_core.outputs import LLMResult
    from langchain_core.runnables.history import RunnableWithMessageHistory


class LangSmithParams(TypedDict, total=False):
@@ -363,7 +365,9 @@ class BaseLanguageModel(
        """
        return len(self.get_token_ids(text))

    def get_num_tokens_from_messages(self, messages: List[BaseMessage]) -> int:
    def get_num_tokens_from_messages(
        self, messages: List[MessageLikeRepresentation]
    ) -> int:
        """Get the number of tokens in the messages.

        Useful for checking if an input fits in a model's context window.
@@ -374,6 +378,7 @@ class BaseLanguageModel(
        Returns:
            The sum of the number of tokens across the messages.
        """
        messages = convert_to_messages(messages)
        return sum([self.get_num_tokens(get_buffer_string([m])) for m in messages])

    @classmethod
@@ -383,3 +388,14 @@ class BaseLanguageModel(
        Use get_pydantic_field_names.
        """
        return get_pydantic_field_names(cls)

    def with_history(
        self, get_session_history: Union[Callable, BaseHistoryManager]
    ) -> RunnableWithMessageHistory:
        from langchain_core.chat_history import BaseHistoryManager
        from langchain_core.runnables.history import RunnableWithMessageHistory

        if isinstance(get_session_history, BaseHistoryManager):
            get_session_history = get_session_history.get_session

        return RunnableWithMessageHistory(self, get_session_history)
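A sketch of the new with_history convenience (the model class is a hypothetical stand-in; "session_id" is the default config key RunnableWithMessageHistory reads). Widening the parameter to MessageLikeRepresentation also lets token counting accept tuples and dicts, normalized by convert_to_messages:

from langchain_core.chat_history import InMemManager

chat = SomeChatModel()  # hypothetical BaseLanguageModel subclass
chain = chat.with_history(InMemManager())
chain.invoke("hello", config={"configurable": {"session_id": "s1"}})

n = chat.get_num_tokens_from_messages([("human", "hello"), ("ai", "hi there")])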
@@ -6,6 +6,13 @@ from importlib import metadata
from typing import Any, Optional

from langchain_core._api.deprecation import surface_langchain_deprecation_warnings
from langchain_core.chat_history import InMemManager
from langchain_core.document_loaders.generic import GenericLoader
from langchain_core.messages import AIMessage, AnyMessage, HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.tools import tool

from langchain.chat_models import init_chat_model as chat_model

try:
    __version__ = metadata.version(__package__)
@@ -436,4 +443,13 @@ __all__ = [
    "QAWithSourcesChain",
    "LlamaCpp",
    "HuggingFaceTextGenInference",
    "chat_model",
    "InMemManager",
    "GenericLoader",
    "ChatPromptTemplate",
    "tool",
    "AnyMessage",
    "AIMessage",
    "SystemMessage",
    "HumanMessage",
]
@@ -74,7 +74,6 @@ if TYPE_CHECKING:
        FacebookChatLoader,
        FaunaLoader,
        FigmaFileLoader,
        FileSystemBlobLoader,
        GCSDirectoryLoader,
        GCSFileLoader,
        GeoDataFrameLoader,
@@ -188,6 +187,7 @@ if TYPE_CHECKING:
        YoutubeLoader,
        YuqueLoader,
    )
    from langchain_core.blob_loaders.file_system import FileSystemBlobLoader

    from langchain_core.document_loaders import Blob, BlobLoader

@@ -437,7 +437,6 @@ __all__ = [
    "FacebookChatLoader",
    "FaunaLoader",
    "FigmaFileLoader",
    "FileSystemBlobLoader",
    "GCSDirectoryLoader",
    "GCSFileLoader",
    "GeoDataFrameLoader",
@@ -6,9 +6,9 @@ from langchain._api import create_importer

if TYPE_CHECKING:
    from langchain_community.document_loaders import (
        FileSystemBlobLoader,
        YoutubeAudioLoader,
    )
    from langchain_core.blob_loaders.file_system import FileSystemBlobLoader

# Create a way to dynamically look up deprecated imports.
# Used to consolidate logic for raising deprecation warnings and
@@ -31,6 +31,5 @@ def __getattr__(name: str) -> Any:
__all__ = [
    "BlobLoader",
    "Blob",
    "FileSystemBlobLoader",
    "YoutubeAudioLoader",
]
@@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Any
from langchain._api import create_importer

if TYPE_CHECKING:
    from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser
    from langchain_core.blob_parsers.mime_type import MimeTypeBasedParser

# Create a way to dynamically look up deprecated imports.
# Used to consolidate logic for raising deprecation warnings and
@@ -20,6 +20,4 @@ def __getattr__(name: str) -> Any:
    return _import_attribute(name)


__all__ = [
    "MimeTypeBasedParser",
]
__all__ = []
@@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Any
from langchain._api import create_importer

if TYPE_CHECKING:
    from langchain_community.document_loaders.parsers.txt import TextParser
    from langchain_core.blob_parsers.txt import TextParser

# Create a way to dynamically look up deprecated imports.
# Used to consolidate logic for raising deprecation warnings and
@@ -18,6 +18,4 @@ def __getattr__(name: str) -> Any:
    return _import_attribute(name)


__all__ = [
    "TextParser",
]
__all__ = []
@@ -61,6 +61,7 @@ from langchain_core.messages import (
    ToolCall,
    ToolMessage,
    ToolMessageChunk,
    convert_to_messages,
)
from langchain_core.messages.ai import UsageMetadata
from langchain_core.messages.tool import tool_call_chunk
@@ -878,7 +879,7 @@ class BaseChatOpenAI(BaseChatModel):
        return encoding_model.encode(text)

    # TODO: Count bound tools as part of input.
    def get_num_tokens_from_messages(self, messages: List[BaseMessage]) -> int:
    def get_num_tokens_from_messages(self, messages: List[MessageLikeRepresentation]) -> int:
        """Calculate num tokens for gpt-3.5-turbo and gpt-4 with tiktoken package.

        **Requirements**: You must have the ``pillow`` package installed if you want to count
@@ -891,6 +892,7 @@ class BaseChatOpenAI(BaseChatModel):
        main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb"""
        if sys.version_info[1] <= 7:
            return super().get_num_tokens_from_messages(messages)
        messages = convert_to_messages(messages)
        model, encoding = self._get_encoding_model()
        if model.startswith("gpt-3.5-turbo-0301"):
            # every message follows <im_start>{role/name}\n{content}<im_end>\n
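With the widened signature here, OpenAI token counting accepts message-like inputs directly; a sketch (assumes this branch; constructing ChatOpenAI needs an OpenAI API key in the environment):

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-3.5-turbo")
# Tuples are normalized via convert_to_messages before tiktoken counting.
n = llm.get_num_tokens_from_messages([("human", "hello world")])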