Fixed source key name for docugami loader (#8598)

The Docugami loader was not returning the "source" metadata key. This
triggered an exception when the loaded documents were used with retrievers; see
https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/schema/prompt_template.py#L193C1-L195C41

The fix is simple: each chunk's metadata now carries the name of the
document it is sourced from under the expected "source" key, in addition
to the existing "name" key.

I tested by running the Python notebook, which contains an end-to-end
scenario.

Tagging DataLoader maintainers @rlancemartin @eyurtsev
This commit is contained in:
Bagatur
2023-08-23 11:24:55 -07:00
committed by GitHub
2 changed files with 72 additions and 74 deletions

View File

@@ -16,6 +16,7 @@ TABLE_NAME = "{http://www.w3.org/1999/xhtml}table"
XPATH_KEY = "xpath"
DOCUMENT_ID_KEY = "id"
DOCUMENT_SOURCE_KEY = "source"
DOCUMENT_NAME_KEY = "name"
STRUCTURE_KEY = "structure"
TAG_KEY = "tag"
@@ -143,8 +144,9 @@ class DocugamiLoader(BaseLoader, BaseModel):
"""Create a Document from a node and text."""
metadata = {
XPATH_KEY: _xpath_for_chunk(node),
DOCUMENT_ID_KEY: document["id"],
DOCUMENT_NAME_KEY: document["name"],
DOCUMENT_ID_KEY: document[DOCUMENT_ID_KEY],
DOCUMENT_NAME_KEY: document[DOCUMENT_NAME_KEY],
DOCUMENT_SOURCE_KEY: document[DOCUMENT_NAME_KEY],
STRUCTURE_KEY: node.attrib.get("structure", ""),
TAG_KEY: re.sub(r"\{.*\}", "", node.tag),
}