mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-01 17:13:22 +00:00
core[patch]: improve performance of InMemoryVectorStore (#27538)
**Description:** We improve the performance of the InMemoryVectorStore. **Issue:** Originally, similarity was computed document by document: ``` for doc in self.store.values(): vector = doc["vector"] similarity = float(cosine_similarity([embedding], [vector]).item(0)) ``` This is inefficient and does not make use of numpy vectorization. This PR computes the similarity in one vectorized go: ``` docs = list(self.store.values()) similarity = cosine_similarity([embedding], [doc["vector"] for doc in docs]) ``` **Dependencies:** None **Twitter handle:** @b12_consulting, @Vincent_Min --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
d5306899d3
commit
7bc4e320f1
@ -329,23 +329,38 @@ class InMemoryVectorStore(VectorStore):
|
||||
filter: Optional[Callable[[Document], bool]] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[tuple[Document, float, list[float]]]:
|
||||
result = []
|
||||
for doc in self.store.values():
|
||||
vector = doc["vector"]
|
||||
similarity = float(cosine_similarity([embedding], [vector]).item(0))
|
||||
result.append(
|
||||
(
|
||||
Document(
|
||||
id=doc["id"], page_content=doc["text"], metadata=doc["metadata"]
|
||||
),
|
||||
similarity,
|
||||
vector,
|
||||
)
|
||||
)
|
||||
result.sort(key=lambda x: x[1], reverse=True)
|
||||
# get all docs with fixed order in list
|
||||
docs = list(self.store.values())
|
||||
|
||||
if filter is not None:
|
||||
result = [r for r in result if filter(r[0])]
|
||||
return result[:k]
|
||||
docs = [
|
||||
doc
|
||||
for doc in docs
|
||||
if filter(Document(page_content=doc["text"], metadata=doc["metadata"]))
|
||||
]
|
||||
|
||||
if not docs:
|
||||
return []
|
||||
|
||||
similarity = cosine_similarity([embedding], [doc["vector"] for doc in docs])[0]
|
||||
|
||||
# get the indices ordered by similarity score
|
||||
top_k_idx = similarity.argsort()[::-1][:k]
|
||||
|
||||
return [
|
||||
(
|
||||
Document(
|
||||
id=doc_dict["id"],
|
||||
page_content=doc_dict["text"],
|
||||
metadata=doc_dict["metadata"],
|
||||
),
|
||||
float(similarity[idx].item()),
|
||||
doc_dict["vector"],
|
||||
)
|
||||
for idx in top_k_idx
|
||||
# Assign using walrus operator to avoid multiple lookups
|
||||
if (doc_dict := docs[idx])
|
||||
]
|
||||
|
||||
def similarity_search_with_score_by_vector(
|
||||
self,
|
||||
|
Loading…
Reference in New Issue
Block a user