mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-05 12:48:12 +00:00
Improve pinecone hybrid search retriever adding metadata support (#5098)
# Improve pinecone hybrid search retriever adding metadata support I simply removed the hard-wired metadata from the existing implementation, so that a `metadatas` argument can be passed to `create_index` and `add_texts` and is returned on the documents produced by `get_relevant_documents`. I also added one missing pip install to the accompanying notebook (I am not adding dependencies; they were pre-existing). This is my first contribution, just hoping to help, so feel free to critique :) My Twitter username is `@andreliebschner`. While looking at hybrid search I noticed #3043 and #1743. I think the former can be closed, since following the example right now (even prior to my improvements) works just fine; the latter can, I think, also be closed safely, perhaps after pointing out the relevant classes and example. Should I reply to those issues and mention someone? @dev2049, @hwchase17 --------- Co-authored-by: Andreas Liebschner <a.liebschner@shopfully.com>
This commit is contained in:
parent
5cd12102be
commit
44dc959584
@ -24,7 +24,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"#!pip install pinecone-client"
|
"#!pip install pinecone-client pinecone-text"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -18,6 +18,7 @@ def create_index(
|
|||||||
embeddings: Embeddings,
|
embeddings: Embeddings,
|
||||||
sparse_encoder: Any,
|
sparse_encoder: Any,
|
||||||
ids: Optional[List[str]] = None,
|
ids: Optional[List[str]] = None,
|
||||||
|
metadatas: Optional[List[dict]] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
batch_size = 32
|
batch_size = 32
|
||||||
_iterator = range(0, len(contexts), batch_size)
|
_iterator = range(0, len(contexts), batch_size)
|
||||||
@ -38,8 +39,15 @@ def create_index(
|
|||||||
# extract batch
|
# extract batch
|
||||||
context_batch = contexts[i:i_end]
|
context_batch = contexts[i:i_end]
|
||||||
batch_ids = ids[i:i_end]
|
batch_ids = ids[i:i_end]
|
||||||
|
metadata_batch = (
|
||||||
|
metadatas[i:i_end] if metadatas else [{} for _ in context_batch]
|
||||||
|
)
|
||||||
# add context passages as metadata
|
# add context passages as metadata
|
||||||
meta = [{"context": context} for context in context_batch]
|
meta = [
|
||||||
|
{"context": context, **metadata}
|
||||||
|
for context, metadata in zip(context_batch, metadata_batch)
|
||||||
|
]
|
||||||
|
|
||||||
# create dense vectors
|
# create dense vectors
|
||||||
dense_embeds = embeddings.embed_documents(context_batch)
|
dense_embeds = embeddings.embed_documents(context_batch)
|
||||||
# create sparse vectors
|
# create sparse vectors
|
||||||
@ -78,8 +86,20 @@ class PineconeHybridSearchRetriever(BaseRetriever, BaseModel):
|
|||||||
extra = Extra.forbid
|
extra = Extra.forbid
|
||||||
arbitrary_types_allowed = True
|
arbitrary_types_allowed = True
|
||||||
|
|
||||||
def add_texts(self, texts: List[str], ids: Optional[List[str]] = None) -> None:
|
def add_texts(
|
||||||
create_index(texts, self.index, self.embeddings, self.sparse_encoder, ids=ids)
|
self,
|
||||||
|
texts: List[str],
|
||||||
|
ids: Optional[List[str]] = None,
|
||||||
|
metadatas: Optional[List[dict]] = None,
|
||||||
|
) -> None:
|
||||||
|
create_index(
|
||||||
|
texts,
|
||||||
|
self.index,
|
||||||
|
self.embeddings,
|
||||||
|
self.sparse_encoder,
|
||||||
|
ids=ids,
|
||||||
|
metadatas=metadatas,
|
||||||
|
)
|
||||||
|
|
||||||
@root_validator()
|
@root_validator()
|
||||||
def validate_environment(cls, values: Dict) -> Dict:
|
def validate_environment(cls, values: Dict) -> Dict:
|
||||||
@ -114,7 +134,10 @@ class PineconeHybridSearchRetriever(BaseRetriever, BaseModel):
|
|||||||
)
|
)
|
||||||
final_result = []
|
final_result = []
|
||||||
for res in result["matches"]:
|
for res in result["matches"]:
|
||||||
final_result.append(Document(page_content=res["metadata"]["context"]))
|
context = res["metadata"].pop("context")
|
||||||
|
final_result.append(
|
||||||
|
Document(page_content=context, metadata=res["metadata"])
|
||||||
|
)
|
||||||
# return search results as json
|
# return search results as json
|
||||||
return final_result
|
return final_result
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user