From f369495fa0f93e1f31749ef2f40f9da0c86bbbe4 Mon Sep 17 00:00:00 2001 From: Jib Date: Tue, 14 May 2024 18:11:26 -0400 Subject: [PATCH] mongodb: [performance] Increase DEFAULT_INSERT_BATCH_SIZE to 100,000 and introduce sizing constraints (#19608) --- .../mongodb/langchain_mongodb/vectorstores.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/libs/partners/mongodb/langchain_mongodb/vectorstores.py b/libs/partners/mongodb/langchain_mongodb/vectorstores.py index 1977045ccbd..a703d59c510 100644 --- a/libs/partners/mongodb/langchain_mongodb/vectorstores.py +++ b/libs/partners/mongodb/langchain_mongodb/vectorstores.py @@ -32,7 +32,7 @@ VST = TypeVar("VST", bound=VectorStore) logger = logging.getLogger(__name__) -DEFAULT_INSERT_BATCH_SIZE = 100 +DEFAULT_INSERT_BATCH_SIZE = 100_000 class MongoDBAtlasVectorSearch(VectorStore): @@ -151,18 +151,24 @@ class MongoDBAtlasVectorSearch(VectorStore): """ batch_size = kwargs.get("batch_size", DEFAULT_INSERT_BATCH_SIZE) _metadatas: Union[List, Generator] = metadatas or ({} for _ in texts) - texts_batch = [] - metadatas_batch = [] + texts_batch = texts + metadatas_batch = _metadatas result_ids = [] - for i, (text, metadata) in enumerate(zip(texts, _metadatas)): - texts_batch.append(text) - metadatas_batch.append(metadata) - if (i + 1) % batch_size == 0: - result_ids.extend(self._insert_texts(texts_batch, metadatas_batch)) - texts_batch = [] - metadatas_batch = [] + if batch_size: + texts_batch = [] + metadatas_batch = [] + size = 0 + for i, (text, metadata) in enumerate(zip(texts, _metadatas)): + size += len(text) + len(metadata) + texts_batch.append(text) + metadatas_batch.append(metadata) + if (i + 1) % batch_size == 0 or size >= 47_000_000: + result_ids.extend(self._insert_texts(texts_batch, metadatas_batch)) + texts_batch = [] + metadatas_batch = [] + size = 0 if texts_batch: - result_ids.extend(self._insert_texts(texts_batch, metadatas_batch)) + result_ids.extend(self._insert_texts(texts_batch, metadatas_batch)) # type: ignore return result_ids def _insert_texts(self, texts: List[str], metadatas: List[Dict[str, Any]]) -> List: