mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-13 05:25:07 +00:00
Fixed source key name for docugami loader (#8598)
The Docugami loader was not returning the "source" metadata key. This triggered an exception when the loader was used with retrievers (see https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/schema/prompt_template.py#L193C1-L195C41 ). The fix is simple: it updates the metadata key name for the document each chunk is sourced from, from "name" to "source", as expected. I tested the change by running the Python notebook that contains an end-to-end scenario. Tagging DataLoader maintainers @rlancemartin @eyurtsev
This commit is contained in:
@@ -16,6 +16,7 @@ TABLE_NAME = "{http://www.w3.org/1999/xhtml}table"
|
||||
|
||||
XPATH_KEY = "xpath"
|
||||
DOCUMENT_ID_KEY = "id"
|
||||
DOCUMENT_SOURCE_KEY = "source"
|
||||
DOCUMENT_NAME_KEY = "name"
|
||||
STRUCTURE_KEY = "structure"
|
||||
TAG_KEY = "tag"
|
||||
@@ -143,8 +144,9 @@ class DocugamiLoader(BaseLoader, BaseModel):
|
||||
"""Create a Document from a node and text."""
|
||||
metadata = {
|
||||
XPATH_KEY: _xpath_for_chunk(node),
|
||||
DOCUMENT_ID_KEY: document["id"],
|
||||
DOCUMENT_NAME_KEY: document["name"],
|
||||
DOCUMENT_ID_KEY: document[DOCUMENT_ID_KEY],
|
||||
DOCUMENT_NAME_KEY: document[DOCUMENT_NAME_KEY],
|
||||
DOCUMENT_SOURCE_KEY: document[DOCUMENT_NAME_KEY],
|
||||
STRUCTURE_KEY: node.attrib.get("structure", ""),
|
||||
TAG_KEY: re.sub(r"\{.*\}", "", node.tag),
|
||||
}
|
||||
|
Reference in New Issue
Block a user