mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-10 05:20:39 +00:00
core[patch]: improve performance of InMemoryVectorStore (#27538)
**Description:** We improve the performance of the InMemoryVectorStore. **Issue:** Originally, similarity was computed document by document: ``` for doc in self.store.values(): vector = doc["vector"] similarity = float(cosine_similarity([embedding], [vector]).item(0)) ``` This is inefficient and does not make use of numpy vectorization. This PR computes the similarity in one vectorized go: ``` docs = list(self.store.values()) similarity = cosine_similarity([embedding], [doc["vector"] for doc in docs]) ``` **Dependencies:** None **Twitter handle:** @b12_consulting, @Vincent_Min --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
d5306899d3
commit
7bc4e320f1
@ -329,23 +329,38 @@ class InMemoryVectorStore(VectorStore):
|
|||||||
filter: Optional[Callable[[Document], bool]] = None,
|
filter: Optional[Callable[[Document], bool]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> list[tuple[Document, float, list[float]]]:
|
) -> list[tuple[Document, float, list[float]]]:
|
||||||
result = []
|
# get all docs with fixed order in list
|
||||||
for doc in self.store.values():
|
docs = list(self.store.values())
|
||||||
vector = doc["vector"]
|
|
||||||
similarity = float(cosine_similarity([embedding], [vector]).item(0))
|
|
||||||
result.append(
|
|
||||||
(
|
|
||||||
Document(
|
|
||||||
id=doc["id"], page_content=doc["text"], metadata=doc["metadata"]
|
|
||||||
),
|
|
||||||
similarity,
|
|
||||||
vector,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
result.sort(key=lambda x: x[1], reverse=True)
|
|
||||||
if filter is not None:
|
if filter is not None:
|
||||||
result = [r for r in result if filter(r[0])]
|
docs = [
|
||||||
return result[:k]
|
doc
|
||||||
|
for doc in docs
|
||||||
|
if filter(Document(page_content=doc["text"], metadata=doc["metadata"]))
|
||||||
|
]
|
||||||
|
|
||||||
|
if not docs:
|
||||||
|
return []
|
||||||
|
|
||||||
|
similarity = cosine_similarity([embedding], [doc["vector"] for doc in docs])[0]
|
||||||
|
|
||||||
|
# get the indices ordered by similarity score
|
||||||
|
top_k_idx = similarity.argsort()[::-1][:k]
|
||||||
|
|
||||||
|
return [
|
||||||
|
(
|
||||||
|
Document(
|
||||||
|
id=doc_dict["id"],
|
||||||
|
page_content=doc_dict["text"],
|
||||||
|
metadata=doc_dict["metadata"],
|
||||||
|
),
|
||||||
|
float(similarity[idx].item()),
|
||||||
|
doc_dict["vector"],
|
||||||
|
)
|
||||||
|
for idx in top_k_idx
|
||||||
|
# Assign using walrus operator to avoid multiple lookups
|
||||||
|
if (doc_dict := docs[idx])
|
||||||
|
]
|
||||||
|
|
||||||
def similarity_search_with_score_by_vector(
|
def similarity_search_with_score_by_vector(
|
||||||
self,
|
self,
|
||||||
|
Loading…
Reference in New Issue
Block a user