langchain: preserve document metadata in FlashrankRerank (#19148)

**Description:** Preserves document metadata in `FlashrankRerank`
    - **Issue:** #19142
    - **Dependencies:** None
    - **Twitter handle:** n/a

---------

Co-authored-by: Simon Stone <simon.stone@dartmouth.edu>
This commit is contained in:
Simon Stone 2024-03-19 00:15:18 -04:00 committed by GitHub
parent bc648f6cfc
commit 58c7687174
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 12 additions and 3 deletions

View File

@ -53,7 +53,10 @@
"def pretty_print_docs(docs):\n",
" print(\n",
" f\"\\n{'-' * 100}\\n\".join(\n",
" [f\"Document {i+1}:\\n\\n\" + d.page_content for i, d in enumerate(docs)]\n",
" [\n",
" f\"Document {i+1}:\\n\\n{d.page_content}\\nMetadata: {d.metadata}\"\n",
" for i, d in enumerate(docs)\n",
" ]\n",
" )\n",
" )"
]
@ -316,6 +319,8 @@
").load()\n",
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n",
"texts = text_splitter.split_documents(documents)\n",
"for idx, text in enumerate(texts):\n",
" text.metadata[\"id\"] = idx\n",
"\n",
"embedding = OpenAIEmbeddings(model=\"text-embedding-ada-002\")\n",
"retriever = FAISS.from_documents(texts, embedding).as_retriever(search_kwargs={\"k\": 20})\n",

View File

@ -59,16 +59,20 @@ class FlashrankRerank(BaseDocumentCompressor):
callbacks: Optional[Callbacks] = None,
) -> Sequence[Document]:
passages = [
{"id": i, "text": doc.page_content} for i, doc in enumerate(documents)
{"id": i, "text": doc.page_content, "meta": doc.metadata}
for i, doc in enumerate(documents)
]
rerank_request = RerankRequest(query=query, passages=passages)
rerank_response = self.client.rerank(rerank_request)[: self.top_n]
final_results = []
for r in rerank_response:
metadata = r["meta"]
metadata["relevance_score"] = r["score"]
doc = Document(
page_content=r["text"],
metadata={"id": r["id"], "relevance_score": r["score"]},
metadata=metadata,
)
final_results.append(doc)
return final_results