community[minor]: [PebbloSafeLoader] Implement content-size-based batching (#24871)

- **Title:** [PebbloSafeLoader] Implement content-size-based batching in the classification flow (loader/doc API) - **Description:** - Implemented content-size-based batching in the loader/doc API. The batch size is set to 100 KB and is intentionally hard-coded, with no external configuration option, to prevent timeouts. - Removed the unused field (pb_id) from doc_metadata - **Issue:** NA - **Dependencies:** NA - **Add tests and docs:** Updated
2025-09-05 21:12:48 +00:00 · 2024-07-31 18:40:28 +05:30
parent 096b66db4a
commit a6add89bd4
3 changed files with 74 additions and 11 deletions
--- a/libs/community/langchain_community/utilities/pebblo.py
+++ b/libs/community/langchain_community/utilities/pebblo.py
@@ -4,7 +4,7 @@ import logging
 import os
 import pathlib
 import platform
-from typing import Optional, Tuple
+from typing import List, Optional, Tuple

 from langchain_core.documents import Document
 from langchain_core.env import get_runtime_environment
@@ -20,6 +20,7 @@ PEBBLO_CLOUD_URL = os.getenv("PEBBLO_CLOUD_URL", "https://api.daxa.ai")

 LOADER_DOC_URL = "/v1/loader/doc"
 APP_DISCOVER_URL = "/v1/app/discover"
+BATCH_SIZE_BYTES = 100 * 1024  # 100 KB

 # Supported loaders for Pebblo safe data loading
 file_loader = [
@@ -301,3 +302,43 @@ def get_ip() -> str:
    except Exception:
        public_ip = socket.gethostbyname("localhost")
    return public_ip
+
+
+def generate_size_based_batches(
+    docs: List[Document], max_batch_size: int = BATCH_SIZE_BYTES
+) -> List[List[Document]]:
+    """
+    Generate batches of documents based on the UTF-8 size of page_content.
+
+    Documents are packed greedily in input order. A document larger than
+    max_batch_size is emitted at once as a batch of its own, so it may come
+    before earlier, smaller documents that are still waiting in a batch.
+    Args:
+        docs: List of documents to be batched.
+        max_batch_size: Maximum batch size in bytes. Defaults to BATCH_SIZE_BYTES.
+    Returns:
+        List[List[Document]]: List of non-empty batches of documents.
+    """
+    batches: List[List[Document]] = []
+    pending: List[Document] = []
+    pending_size = 0
+
+    for doc in docs:
+        # Batches are limited by encoded bytes, not by character count.
+        doc_size = len(doc.page_content.encode("utf-8"))
+
+        if doc_size > max_batch_size:
+            # An oversized document can never share a batch; send it alone.
+            batches.append([doc])
+            continue
+
+        if pending_size + doc_size > max_batch_size:
+            # No room left: close the pending batch and start a fresh one.
+            batches.append(pending)
+            pending, pending_size = [], 0
+        pending.append(doc)
+        pending_size += doc_size
+
+    if pending:  # flush the final, partially filled batch
+        batches.append(pending)
+    return batches