mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-21 12:01:47 +00:00
upstage[minor]: add merge_and_split function for document loader (#21603)
- Introduce the `merge_and_split` function in the `UpstageLayoutAnalysisLoader`. - The `merge_and_split` function takes a list of documents and an optional splitter as inputs. - When a splitter is given, the documents are divided using its `split_documents` method, which the splitter itself must implement. - If the provided splitter is `None` (the default), the function simply merges the documents into a single document without splitting them.
This commit is contained in:
parent
500569da48
commit
480c02bf55
@ -1,7 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
import warnings
|
import warnings
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterator, List, Literal, Optional, Union
|
from typing import Any, Dict, Iterator, List, Literal, Optional, Union
|
||||||
|
|
||||||
from langchain_core.document_loaders import BaseLoader, Blob
|
from langchain_core.document_loaders import BaseLoader, Blob
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
@ -204,3 +204,45 @@ class UpstageLayoutAnalysisLoader(BaseLoader):
|
|||||||
exclude=self.exclude,
|
exclude=self.exclude,
|
||||||
)
|
)
|
||||||
yield from parser.lazy_parse(blob)
|
yield from parser.lazy_parse(blob)
|
||||||
|
|
||||||
|
def merge_and_split(
    self, documents: List[Document], splitter: Optional[object] = None
) -> List[Document]:
    """Merge documents into one, or split them with a custom splitter.

    Without a splitter, the page contents of all documents are joined with a
    single space and their metadata is combined: every metadata key maps to
    the list of values seen for that key, in document order. With a splitter,
    the documents are passed unchanged to ``splitter.split_documents`` and its
    result is returned as-is.

    Args:
        documents: The Document objects to merge or split.
        splitter: Optional object implementing a ``split_documents`` method.
            Defaults to None, in which case the documents are merged.

    Returns:
        A list of Document objects. Without a splitter the list holds exactly
        one merged Document (empty content and empty metadata when
        ``documents`` is empty). With a splitter it is whatever
        ``splitter.split_documents(documents)`` returns.

    Raises:
        AssertionError: If a splitter is provided but it does not implement
            the ``split_documents`` method.
    """
    if splitter is not None:
        # Raise explicitly instead of using an ``assert`` statement: asserts
        # are removed under ``python -O``, which would let an invalid splitter
        # slip through and fail later with an unrelated AttributeError.
        if not hasattr(splitter, "split_documents"):
            raise AssertionError("splitter must implement split_documents method")
        return splitter.split_documents(documents)

    merged_content = " ".join(doc.page_content for doc in documents)

    # Group metadata values per key; a key missing from some documents simply
    # ends up with a shorter list.
    metadatas: Dict[str, Any] = {}
    for doc in documents:
        for key, value in doc.metadata.items():
            metadatas.setdefault(key, []).append(value)

    return [Document(page_content=merged_content, metadata=metadatas)]
|
||||||
|
@ -246,6 +246,8 @@ class UpstageLayoutAnalysisParser(BaseBlobParser):
|
|||||||
"id": elements["id"],
|
"id": elements["id"],
|
||||||
"type": self.output_type,
|
"type": self.output_type,
|
||||||
"split": self.split,
|
"split": self.split,
|
||||||
|
"bbox": elements["bounding_box"],
|
||||||
|
"category": elements["category"],
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user