From 9c3da1191072b7e28f73fb4319c9b37ced264368 Mon Sep 17 00:00:00 2001 From: Shikanime Deva Date: Fri, 19 Jul 2024 19:13:12 +0200 Subject: [PATCH] Fix MultiQueryRetriever breaking Embeddings with empty lines (#21093) Fix MultiQueryRetriever breaking Embeddings with empty lines ``` [chain/end] [1:chain:ConversationalRetrievalChain > 2:retriever:Retriever > 3:retriever:Retriever > 4:chain:LLMChain] [2.03s] Exiting Chain run with output: [outputs] > /workspaces/Sfeir/sncf/metabot-backend/.venv/lib/python3.11/site-packages/langchain/retrievers/multi_query.py(116)_aget_relevant_documents() -> if self.include_original: (Pdb) queries ['## Alternative questions for "Hello, tell me about phones?":', '', '1. **What are the latest trends in smartphone technology?** (Focuses on recent advancements)', '2. **How has the mobile phone industry evolved over the years?** (Historical perspective)', '3. **What are the different types of phones available in the market, and which one is best for me?** (Categorization and recommendation)'] ``` Example of failure on VertexAIEmbeddings ``` grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with: status = StatusCode.INVALID_ARGUMENT details = "The text content is empty." debug_error_string = "UNKNOWN:Error received from peer ipv4:142.250.184.234:443 {created_time:"2024-04-30T09:57:45.625698408+00:00", grpc_status:3, grpc_message:"The text content is empty."}" ``` Fixes: #15959 --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Harrison Chase Co-authored-by: Chester Curme --- docs/docs/how_to/MultiQueryRetriever.ipynb | 2 +- .../langchain/langchain/retrievers/multi_query.py | 2 +- .../unit_tests/retrievers/test_multi_query.py | 15 ++++++++++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/docs/docs/how_to/MultiQueryRetriever.ipynb b/docs/docs/how_to/MultiQueryRetriever.ipynb index f1377124aa9..1574eb1dd63 100644 --- a/docs/docs/how_to/MultiQueryRetriever.ipynb +++ b/docs/docs/how_to/MultiQueryRetriever.ipynb @@ -153,7 +153,7 @@ "\n", " def parse(self, text: str) -> List[str]:\n", " lines = text.strip().split(\"\\n\")\n", - " return lines\n", + " return list(filter(None, lines)) # Remove empty lines\n", "\n", "\n", "output_parser = LineListOutputParser()\n", diff --git a/libs/langchain/langchain/retrievers/multi_query.py b/libs/langchain/langchain/retrievers/multi_query.py index 3d1a36be476..23ba88e2a53 100644 --- a/libs/langchain/langchain/retrievers/multi_query.py +++ b/libs/langchain/langchain/retrievers/multi_query.py @@ -24,7 +24,7 @@ class LineListOutputParser(BaseOutputParser[List[str]]): def parse(self, text: str) -> List[str]: lines = text.strip().split("\n") - return lines + return list(filter(None, lines)) # Remove empty lines # Default prompt diff --git a/libs/langchain/tests/unit_tests/retrievers/test_multi_query.py b/libs/langchain/tests/unit_tests/retrievers/test_multi_query.py index 8f80e77e79b..d3529e8d97c 100644 --- a/libs/langchain/tests/unit_tests/retrievers/test_multi_query.py +++ b/libs/langchain/tests/unit_tests/retrievers/test_multi_query.py @@ -3,7 +3,7 @@ from typing import List import pytest as pytest from langchain_core.documents import Document -from langchain.retrievers.multi_query import _unique_documents +from langchain.retrievers.multi_query import LineListOutputParser, _unique_documents @pytest.mark.parametrize( @@ -38,3 +38,16 @@ from langchain.retrievers.multi_query import _unique_documents ) def test__unique_documents(documents: List[Document], expected: List[Document]) -> None: assert _unique_documents(documents) == expected + + +@pytest.mark.parametrize( + "text,expected", + [ + ("foo\nbar\nbaz", ["foo", "bar", "baz"]), + ("foo\nbar\nbaz\n", ["foo", "bar", "baz"]), + ("foo\n\nbar", ["foo", "bar"]), + ], +) +def test_line_list_output_parser(text: str, expected: List[str]) -> None: + parser = LineListOutputParser() + assert parser.parse(text) == expected