mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-02 13:08:57 +00:00
Add object parsing functionality (#13864)
* **Description:** Parses huggingface dataset Sequence objects into strings for Document loading. * **Issue:** Fixes #10674 * **Tag maintainer:** @baskaryan @eyurtsev --------- Co-authored-by: Amy Han <amyhan@Amys-Air.lan> Co-authored-by: Amy Han <amyhan@Amys-MacBook-Air.local>
This commit is contained in:
parent
981f78f920
commit
750485eaa8
@ -1,3 +1,4 @@
|
|||||||
|
import json
|
||||||
from typing import Iterator, List, Mapping, Optional, Sequence, Union
|
from typing import Iterator, List, Mapping, Optional, Sequence, Union
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
@ -28,10 +29,6 @@ class HuggingFaceDatasetLoader(BaseLoader):
|
|||||||
Args:
|
Args:
|
||||||
path: Path or name of the dataset.
|
path: Path or name of the dataset.
|
||||||
page_content_column: Page content column name. Default is "text".
|
page_content_column: Page content column name. Default is "text".
|
||||||
Note: Currently the function assumes the content is a string.
|
|
||||||
If it is not download the dataset using huggingface library and convert
|
|
||||||
using the json or pandas loaders.
|
|
||||||
https://github.com/langchain-ai/langchain/issues/10674
|
|
||||||
name: Name of the dataset configuration.
|
name: Name of the dataset configuration.
|
||||||
data_dir: Data directory of the dataset configuration.
|
data_dir: Data directory of the dataset configuration.
|
||||||
data_files: Path(s) to source data file(s).
|
data_files: Path(s) to source data file(s).
|
||||||
@ -80,7 +77,7 @@ class HuggingFaceDatasetLoader(BaseLoader):
|
|||||||
|
|
||||||
yield from (
|
yield from (
|
||||||
Document(
|
Document(
|
||||||
page_content=row.pop(self.page_content_column),
|
page_content=self.parse_obj(row.pop(self.page_content_column)),
|
||||||
metadata=row,
|
metadata=row,
|
||||||
)
|
)
|
||||||
for key in dataset.keys()
|
for key in dataset.keys()
|
||||||
@ -90,3 +87,8 @@ class HuggingFaceDatasetLoader(BaseLoader):
|
|||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load documents."""
|
"""Load documents."""
|
||||||
return list(self.lazy_load())
|
return list(self.lazy_load())
|
||||||
|
|
||||||
|
def parse_obj(self, page_content: Union[str, object]) -> str:
    """Convert a page-content value into a string.

    Args:
        page_content: The raw value taken from the page content column.
            It may already be a string, or it may be a non-string value
            such as a list or dict.

    Returns:
        ``page_content`` unchanged when it is already a string; otherwise
        its JSON serialization as produced by ``json.dumps``.

    Raises:
        TypeError: If a non-string value is not JSON serializable
            (raised by ``json.dumps``).
    """
    # Check for ``str`` explicitly: ``isinstance(x, object)`` is true for
    # every Python value, which would JSON-encode plain strings too and
    # wrap them in an extra pair of quotes.
    if isinstance(page_content, str):
        return page_content
    return json.dumps(page_content)
|
||||||
|
Loading…
Reference in New Issue
Block a user