Mirror of https://github.com/hwchase17/langchain.git (synced 2025-07-04 04:07:54 +00:00)
community: Use default load() implementation in doc loaders (#18385)
Following https://github.com/langchain-ai/langchain/pull/18289
This commit is contained in:
parent 42341bc787
commit 177f51c7bd
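Why this is safe: #18289 (linked above) gave BaseLoader in langchain-core a default load() that simply materializes lazy_load(), so every loader below can drop its hand-written copy. A minimal sketch of the resulting pattern; GreetingLoader is an invented example, not code from this commit:

from typing import Iterator, List

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document


class GreetingLoader(BaseLoader):
    """Hypothetical loader that yields one Document per greeting string."""

    def __init__(self, greetings: List[str]) -> None:
        self.greetings = greetings

    def lazy_load(self) -> Iterator[Document]:
        # Only the lazy path is written by hand; load() is inherited.
        for text in self.greetings:
            yield Document(page_content=text, metadata={"source": "inline"})


docs = GreetingLoader(["hello", "world"]).load()
# The inherited load() is equivalent to list(self.lazy_load()) -- exactly the
# boilerplate method body this commit deletes from each loader below.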
@@ -1,6 +1,6 @@
 import re
 from pathlib import Path
-from typing import Iterator, List
+from typing import Iterator
 
 from langchain_core.documents import Document
 
@@ -74,6 +74,3 @@ class AcreomLoader(BaseLoader):
             }
 
             yield Document(page_content=text, metadata=metadata)
-
-    def load(self) -> List[Document]:
-        return list(self.lazy_load())
@@ -1,4 +1,4 @@
-from typing import Any, Callable, Iterator, List, Mapping, Optional
+from typing import Any, Callable, Iterator, Mapping, Optional
 
 from langchain_core.documents import Document
 from langchain_core.utils.utils import guard_import
@@ -53,9 +53,6 @@ class AirbyteCDKLoader(BaseLoader):
         self._stream_name = stream_name
         self._state = state
 
-    def load(self) -> List[Document]:
-        return list(self.lazy_load())
-
     def lazy_load(self) -> Iterator[Document]:
         return self._integration._load_data(
             stream_name=self._stream_name, state=self._state
@@ -1,4 +1,4 @@
-from typing import Iterator, List
+from typing import Iterator
 
 from langchain_core.documents import Document
 
@@ -34,7 +34,3 @@ class AirtableLoader(BaseLoader):
                     "table_id": self.table_id,
                 },
             )
-
-    def load(self) -> List[Document]:
-        """Load Documents from table."""
-        return list(self.lazy_load())
@@ -148,7 +148,3 @@ class ArcGISLoader(BaseLoader):
             )
 
             yield Document(page_content=page_content, metadata=metadata)
-
-    def load(self) -> List[Document]:
-        """Load all records from FeatureLayer."""
-        return list(self.lazy_load())
@@ -76,9 +76,6 @@ class AstraDBLoader(BaseLoader):
         self.nb_prefetched = nb_prefetched
         self.extraction_function = extraction_function
 
-    def load(self) -> List[Document]:
-        return list(self.lazy_load())
-
     def lazy_load(self) -> Iterator[Document]:
         for doc in self.collection.paginated_find(
             filter=self.filter,
@@ -157,7 +157,3 @@ class AthenaLoader(BaseLoader):
             }
             doc = Document(page_content=page_content, metadata=metadata)
             yield doc
-
-    def load(self) -> List[Document]:
-        """Load data into document objects."""
-        return list(self.lazy_load())
@@ -1,4 +1,4 @@
-from typing import Iterator, List, Optional
+from typing import Iterator, Optional
 
 from langchain_community.docstore.document import Document
 from langchain_community.document_loaders.base import BaseLoader
@@ -16,10 +16,6 @@ class AzureAIDataLoader(BaseLoader):
         self.glob_pattern = glob
         """Optional glob pattern to select files. Defaults to None."""
 
-    def load(self) -> List[Document]:
-        """Load documents."""
-        return list(self.lazy_load())
-
     def lazy_load(self) -> Iterator[Document]:
         """A lazy loader for Documents."""
         try:
@@ -1,4 +1,4 @@
-from typing import Any, Iterator, List
+from typing import Any, Iterator
 
 from langchain_core.documents import Document
 
@@ -18,9 +18,6 @@ class BaiduBOSDirectoryLoader(BaseLoader):
         self.bucket = bucket
         self.prefix = prefix
 
-    def load(self) -> List[Document]:
-        return list(self.lazy_load())
-
     def lazy_load(self) -> Iterator[Document]:
         """Load documents."""
         try:
@@ -1,7 +1,7 @@
 import logging
 import os
 import tempfile
-from typing import Any, Iterator, List
+from typing import Any, Iterator
 
 from langchain_core.documents import Document
 
@@ -24,9 +24,6 @@ class BaiduBOSFileLoader(BaseLoader):
         self.bucket = bucket
         self.key = key
 
-    def load(self) -> List[Document]:
-        return list(self.lazy_load())
-
     def lazy_load(self) -> Iterator[Document]:
         """Load documents."""
         try:
@@ -96,16 +96,3 @@ class BibtexLoader(BaseLoader):
             doc = self._load_entry(entry)
             if doc:
                 yield doc
-
-    def load(self) -> List[Document]:
-        """Load bibtex file documents from the given bibtex file path.
-
-        See https://bibtexparser.readthedocs.io/en/master/
-
-        Args:
-            file_path: the path to the bibtex file
-
-        Returns:
-            a list of documents with the document.page_content in text format
-        """
-        return list(self.lazy_load())
@@ -61,7 +61,3 @@ class BrowserlessLoader(BaseLoader):
                         "source": url,
                     },
                 )
-
-    def load(self) -> List[Document]:
-        """Load Documents from URLs."""
-        return list(self.lazy_load())
@@ -5,7 +5,6 @@ from typing import (
     Any,
     Callable,
     Iterator,
-    List,
     Optional,
     Sequence,
     Union,
@@ -106,9 +105,6 @@ class CassandraLoader(BaseLoader):
         if query_execution_profile is not _NOT_SET:
             self.query_kwargs["execution_profile"] = query_execution_profile
 
-    def load(self) -> List[Document]:
-        return list(self.lazy_load())
-
     def lazy_load(self) -> Iterator[Document]:
         for row in self.session.execute(self.query, **self.query_kwargs):
             metadata = self.metadata.copy()
@@ -78,14 +78,3 @@ class AsyncChromiumLoader(BaseLoader):
             html_content = asyncio.run(self.ascrape_playwright(url))
             metadata = {"source": url}
             yield Document(page_content=html_content, metadata=metadata)
-
-    def load(self) -> List[Document]:
-        """
-        Load and return all Documents from the provided URLs.
-
-        Returns:
-            List[Document]: A list of Document objects
-            containing the scraped content from each URL.
-
-        """
-        return list(self.lazy_load())
@@ -68,10 +68,6 @@ class CouchbaseLoader(BaseLoader):
         self.page_content_fields = page_content_fields
         self.metadata_fields = metadata_fields
 
-    def load(self) -> List[Document]:
-        """Load Couchbase data into Document objects."""
-        return list(self.lazy_load())
-
     def lazy_load(self) -> Iterator[Document]:
         """Load Couchbase data into Document objects lazily."""
         from datetime import timedelta
@@ -1,4 +1,4 @@
-from typing import Any, Iterator, List
+from typing import Any, Iterator
 
 from langchain_core.documents import Document
 
@@ -26,10 +26,6 @@ class BaseDataFrameLoader(BaseLoader):
             metadata.pop(self.page_content_column)
             yield Document(page_content=text, metadata=metadata)
 
-    def load(self) -> List[Document]:
-        """Load full dataframe."""
-        return list(self.lazy_load())
-
 
 class DataFrameLoader(BaseDataFrameLoader):
     """Load `Pandas` DataFrame."""
@@ -1,4 +1,4 @@
-from typing import Iterator, List, Optional
+from typing import Iterator, Optional
 
 from langchain_core.documents import Document
 
@@ -77,10 +77,6 @@ class AzureAIDocumentIntelligenceLoader(BaseLoader):
             mode=mode,
         )
 
-    def load(self) -> List[Document]:
-        """Load given path as pages."""
-        return list(self.lazy_load())
-
     def lazy_load(
         self,
     ) -> Iterator[Document]:
@@ -71,10 +71,6 @@ class EtherscanLoader(BaseLoader):
         for doc in result:
             yield doc
 
-    def load(self) -> List[Document]:
-        """Load transactions from spcifc account by Etherscan."""
-        return list(self.lazy_load())
-
     def getNormTx(self) -> List[Document]:
         url = (
             f"https://api.etherscan.io/api?module=account&action=txlist&address={self.account_address}"
@@ -1,4 +1,4 @@
-from typing import Iterator, List, Optional, Sequence
+from typing import Iterator, Optional, Sequence
 
 from langchain_core.documents import Document
 
@@ -28,9 +28,6 @@ class FaunaLoader(BaseLoader):
         self.secret = secret
         self.metadata_fields = metadata_fields
 
-    def load(self) -> List[Document]:
-        return list(self.lazy_load())
-
     def lazy_load(self) -> Iterator[Document]:
         try:
             from fauna import Page, fql
@@ -115,10 +115,6 @@ class GenericLoader(BaseLoader):
         for blob in self.blob_loader.yield_blobs():
             yield from self.blob_parser.lazy_parse(blob)
 
-    def load(self) -> List[Document]:
-        """Load all documents."""
-        return list(self.lazy_load())
-
     def load_and_split(
         self, text_splitter: Optional[TextSplitter] = None
     ) -> List[Document]:
@@ -1,4 +1,4 @@
-from typing import Any, Iterator, List
+from typing import Any, Iterator
 
 from langchain_core.documents import Document
 
@@ -67,7 +67,3 @@ class GeoDataFrameLoader(BaseLoader):
 
             # using WKT instead of str() to help GIS system interoperability
            yield Document(page_content=geom.wkt, metadata=metadata)
-
-    def load(self) -> List[Document]:
-        """Load full dataframe."""
-        return list(self.lazy_load())
@@ -127,32 +127,6 @@ class GitHubIssuesLoader(BaseGitHubLoader):
         else:
             url = None
 
-    def load(self) -> List[Document]:
-        """
-        Get issues of a GitHub repository.
-
-        Returns:
-            A list of Documents with attributes:
-                - page_content
-                - metadata
-                - url
-                - title
-                - creator
-                - created_at
-                - last_update_time
-                - closed_time
-                - number of comments
-                - state
-                - labels
-                - assignee
-                - assignees
-                - milestone
-                - locked
-                - number
-                - is_pull_request
-        """
-        return list(self.lazy_load())
-
     def parse_issue(self, issue: dict) -> Document:
         """Create Document objects from a list of GitHub issues."""
         metadata = {
@@ -1,5 +1,5 @@
 import json
-from typing import Iterator, List, Mapping, Optional, Sequence, Union
+from typing import Iterator, Mapping, Optional, Sequence, Union
 
 from langchain_core.documents import Document
 
@@ -84,10 +84,6 @@ class HuggingFaceDatasetLoader(BaseLoader):
             for row in dataset[key]
         )
 
-    def load(self) -> List[Document]:
-        """Load documents."""
-        return list(self.lazy_load())
-
     def parse_obj(self, page_content: Union[str, object]) -> str:
         if isinstance(page_content, object):
             return json.dumps(page_content)
@@ -106,7 +106,3 @@ class HuggingFaceModelLoader(BaseLoader):
                 page_content=readme_content,
                 metadata=model,
             )
-
-    def load(self) -> List[Document]:
-        """Load model information, including README content."""
-        return list(self.lazy_load())
@@ -91,6 +91,3 @@ class JoplinLoader(BaseLoader):
 
     def lazy_load(self) -> Iterator[Document]:
         yield from self._get_notes()
-
-    def load(self) -> List[Document]:
-        return list(self.lazy_load())
@@ -1,6 +1,6 @@
 import json
 import urllib.request
-from typing import Any, Iterator, List
+from typing import Any, Iterator
 
 from langchain_core.documents import Document
 
@@ -46,7 +46,3 @@ class LarkSuiteDocLoader(BaseLoader):
             "title": metadata_json["data"]["document"]["title"],
         }
         yield Document(page_content=text, metadata=metadata)
-
-    def load(self) -> List[Document]:
-        """Load LarkSuite (FeiShu) document."""
-        return list(self.lazy_load())
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Any, Iterator, List, Optional, Sequence
+from typing import Any, Iterator, Optional, Sequence
 
 from langchain_core.documents import Document
 
@@ -78,6 +78,3 @@ class MaxComputeLoader(BaseLoader):
             else:
                 metadata = {k: v for k, v in row.items() if k not in page_content_data}
             yield Document(page_content=page_content, metadata=metadata)
-
-    def load(self) -> List[Document]:
-        return list(self.lazy_load())
@@ -1,6 +1,6 @@
 import logging
 from pathlib import Path
-from typing import Iterator, List, Optional, Sequence, Union
+from typing import Iterator, Optional, Sequence, Union
 
 from langchain_core.documents import Document
 
@@ -87,11 +87,6 @@ class MWDumpLoader(BaseLoader):
         metadata = {"source": page.title}
         return Document(page_content=text, metadata=metadata)
 
-    def load(self) -> List[Document]:
-        """Load from a file path."""
-
-        return [doc for doc in self.lazy_load()]
-
     def lazy_load(
         self,
     ) -> Iterator[Document]:
@@ -23,10 +23,6 @@ class MergedDataLoader(BaseLoader):
             for document in data:
                 yield document
 
-    def load(self) -> List[Document]:
-        """Load docs."""
-        return list(self.lazy_load())
-
     async def alazy_load(self) -> AsyncIterator[Document]:
         """Lazy load docs from each individual loader."""
         for loader in self.loaders:
@@ -91,7 +91,3 @@ class OneDriveLoader(O365BaseLoader):
         if self.object_ids:
             for blob in self._load_from_object_ids(drive, self.object_ids):
                 yield from blob_parser.lazy_parse(blob)
-
-    def load(self) -> List[Document]:
-        """Load all documents."""
-        return list(self.lazy_load())
@@ -109,18 +109,6 @@ class OneNoteLoader(BaseLoader, BaseModel):
             else:
                 request_url = ""
 
-    def load(self) -> List[Document]:
-        """
-        Get pages from OneNote notebooks.
-
-        Returns:
-            A list of Documents with attributes:
-                - page_content
-                - metadata
-                - title
-        """
-        return list(self.lazy_load())
-
     def _get_page_content(self, page_id: str) -> str:
         """Get page content from OneNote API"""
         request_url = self.onenote_api_base_url + f"/pages/{page_id}/content"
@@ -1,4 +1,4 @@
-from typing import Iterator, List
+from typing import Iterator
 
 from langchain_core.documents import Document
 
@@ -37,8 +37,3 @@ class OpenCityDataLoader(BaseLoader):
                     "source": self.city_id + "_" + self.dataset_id,
                 },
             )
-
-    def load(self) -> List[Document]:
-        """Load records."""
-
-        return list(self.lazy_load())
@@ -157,10 +157,6 @@ class PyPDFLoader(BasePDFLoader):
         super().__init__(file_path, headers=headers)
         self.parser = PyPDFParser(password=password, extract_images=extract_images)
 
-    def load(self) -> List[Document]:
-        """Load given path as pages."""
-        return list(self.lazy_load())
-
     def lazy_load(
         self,
     ) -> Iterator[Document]:
@@ -1,4 +1,4 @@
-from typing import Iterator, List, Optional
+from typing import Iterator, Optional
 
 from langchain_core.documents import Document
 
@@ -32,9 +32,6 @@ class PubMedLoader(BaseLoader):
             top_k_results=load_max_docs,
         )
 
-    def load(self) -> List[Document]:
-        return list(self._client.lazy_load_docs(self.query))
-
     def lazy_load(self) -> Iterator[Document]:
         for doc in self._client.lazy_load_docs(self.query):
             yield doc
@@ -84,10 +84,6 @@ class ReadTheDocsLoader(BaseLoader):
                 text = self._clean_data(f.read())
             yield Document(page_content=text, metadata={"source": str(p)})
 
-    def load(self) -> List[Document]:
-        """Load documents."""
-        return list(self.lazy_load())
-
     def _clean_data(self, data: str) -> str:
         from bs4 import BeautifulSoup
 
@@ -314,7 +314,3 @@ class RecursiveUrlLoader(BaseLoader):
             return iter(results or [])
         else:
             return self._get_child_links_recursive(self.url, visited)
-
-    def load(self) -> List[Document]:
-        """Load web pages."""
-        return list(self.lazy_load())
@@ -100,9 +100,6 @@ class RocksetLoader(BaseLoader):
                 # ignore
                 pass
 
-    def load(self) -> List[Document]:
-        return list(self.lazy_load())
-
     def lazy_load(self) -> Iterator[Document]:
         query_results = self.client.Queries.query(
             sql=self.query
@@ -124,6 +124,3 @@ class RSpaceLoader(BaseLoader):
             yield d
         else:
             raise ValueError("Unknown global ID type")
-
-    def load(self) -> List[Document]:
-        return list(self.lazy_load())
@@ -54,7 +54,3 @@ class SharePointLoader(O365BaseLoader):
         if self.object_ids:
             for blob in self._load_from_object_ids(drive, self.object_ids):
                 yield from blob_parser.lazy_parse(blob)
-
-    def load(self) -> List[Document]:
-        """Load all documents."""
-        return list(self.lazy_load())
@@ -122,7 +122,3 @@ class SnowflakeLoader(BaseLoader):
             metadata = {k: v for k, v in row.items() if k in metadata_columns}
             doc = Document(page_content=page_content, metadata=metadata)
             yield doc
-
-    def load(self) -> List[Document]:
-        """Load data into document objects."""
-        return list(self.lazy_load())
@@ -104,9 +104,6 @@ class SQLDatabaseLoader(BaseLoader):
 
             yield Document(page_content=page_content, metadata=metadata)
 
-    def load(self) -> List[Document]:
-        return list(self.lazy_load())
-
     @staticmethod
     def page_content_default_mapper(
         row: sa.RowMapping, column_names: Optional[List[str]] = None
@@ -1,4 +1,4 @@
-from typing import Any, Iterator, List
+from typing import Any, Iterator
 
 from langchain_core.documents import Document
 
@@ -19,9 +19,6 @@ class TencentCOSDirectoryLoader(BaseLoader):
         self.bucket = bucket
         self.prefix = prefix
 
-    def load(self) -> List[Document]:
-        return list(self.lazy_load())
-
     def lazy_load(self) -> Iterator[Document]:
         """Load documents."""
         try:
@@ -1,6 +1,6 @@
 import os
 import tempfile
-from typing import Any, Iterator, List
+from typing import Any, Iterator
 
 from langchain_core.documents import Document
 
@@ -21,9 +21,6 @@ class TencentCOSFileLoader(BaseLoader):
         self.bucket = bucket
         self.key = key
 
-    def load(self) -> List[Document]:
-        return list(self.lazy_load())
-
     def lazy_load(self) -> Iterator[Document]:
         """Load documents."""
         try:
@@ -1,4 +1,4 @@
-from typing import Callable, Dict, Iterator, List, Optional
+from typing import Callable, Dict, Iterator, Optional
 
 from langchain_core.documents import Document
 
@@ -75,6 +75,3 @@ class TensorflowDatasetLoader(BaseLoader):
 
     def lazy_load(self) -> Iterator[Document]:
         yield from self._tfds_client.lazy_load()
-
-    def load(self) -> List[Document]:
-        return list(self.lazy_load())
@@ -65,7 +65,3 @@ class TiDBLoader(BaseLoader):
            )
            metadata = {col: row_data[col] for col in self.metadata_columns}
            yield Document(page_content=page_content, metadata=metadata)
-
-    def load(self) -> List[Document]:
-        """Load TiDB data into document objects."""
-        return list(self.lazy_load())
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Iterator, List
+from typing import Iterator
 
 import requests
 from langchain_core.documents import Document
@@ -28,7 +28,3 @@ class ToMarkdownLoader(BaseLoader):
         text = response.json()["article"]
         metadata = {"source": self.url}
         yield Document(page_content=text, metadata=metadata)
-
-    def load(self) -> List[Document]:
-        """Load file."""
-        return list(self.lazy_load())
@@ -1,6 +1,6 @@
 import json
 from pathlib import Path
-from typing import Iterator, List, Union
+from typing import Iterator, Union
 
 from langchain_core.documents import Document
 
@@ -18,10 +18,6 @@ class TomlLoader(BaseLoader):
         """Initialize the TomlLoader with a source file or directory."""
         self.source = Path(source)
 
-    def load(self) -> List[Document]:
-        """Load and return all documents."""
-        return list(self.lazy_load())
-
     def lazy_load(self) -> Iterator[Document]:
         """Lazily load the TOML documents from the source file or directory."""
         import tomli
@@ -2,7 +2,7 @@
 from __future__ import annotations
 
 from datetime import datetime
-from typing import Iterator, List, Optional, Sequence
+from typing import Iterator, Optional, Sequence
 
 from langchain_core.documents import Document
 
@@ -43,9 +43,3 @@ class WeatherDataLoader(BaseLoader):
             metadata = {"queried_at": datetime.now()}
             content = self.client.run(place)
             yield Document(page_content=content, metadata=metadata)
-
-    def load(
-        self,
-    ) -> List[Document]:
-        """Load weather data for the given locations."""
-        return list(self.lazy_load())
@@ -251,10 +251,6 @@ class WebBaseLoader(BaseLoader):
             metadata = _build_metadata(soup, path)
             yield Document(page_content=text, metadata=metadata)
 
-    def load(self) -> List[Document]:
-        """Load text from the url(s) in web_path."""
-        return list(self.lazy_load())
-
     def aload(self) -> List[Document]:
         """Load text from the urls in web_path async into Documents."""
 
@@ -36,10 +36,6 @@ class ToyLoader(BaseLoader):
     ) -> Iterator[Document]:
         yield from self.documents
 
-    def load(self) -> List[Document]:
-        """Load the documents from the source."""
-        return list(self.lazy_load())
-
     async def alazy_load(
         self,
    ) -> AsyncIterator[Document]:
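Every hunk above is the same mechanical change: delete a load() whose body was return list(self.lazy_load()) (or an equivalent list comprehension) and drop the now-unused List import. Callers are unaffected, as this hedged usage sketch illustrates (the constructor arguments are assumptions, not taken from this commit):

from langchain_community.document_loaders import ToMarkdownLoader

loader = ToMarkdownLoader(url="https://example.com", api_key="...")  # assumed signature

# Streaming path: one Document at a time, constant memory.
for doc in loader.lazy_load():
    print(doc.metadata["source"])

# Eager path: the inherited default, equivalent to list(loader.lazy_load()).
docs = loader.load()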