community[minor]: [PebbloSafeLoader] Implement content-size-based batching (#24871)
- **Title:** [PebbloSafeLoader] Implement content-size-based batching in the classification flow (loader/doc API)
- **Description:**
  - Implemented content-size-based batching in the loader/doc API. The batch size is capped at 100 KB with no external configuration option; the limit is intentionally hard-coded to prevent timeouts.
  - Removed the unused field (pb_id) from doc_metadata.
- **Issue:** NA
- **Dependencies:** NA
- **Add tests and docs:** Updated
@@ -4,7 +4,7 @@ import logging
 import os
 import pathlib
 import platform
-from typing import Optional, Tuple
+from typing import List, Optional, Tuple
 
 from langchain_core.documents import Document
 from langchain_core.env import get_runtime_environment
@@ -20,6 +20,7 @@ PEBBLO_CLOUD_URL = os.getenv("PEBBLO_CLOUD_URL", "https://api.daxa.ai")
 
 LOADER_DOC_URL = "/v1/loader/doc"
 APP_DISCOVER_URL = "/v1/app/discover"
+BATCH_SIZE_BYTES = 100 * 1024  # 100 KB
 
 # Supported loaders for Pebblo safe data loading
 file_loader = [
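For context, here is a minimal sketch of how the loader-side classification flow can batch against this new constant. It assumes `generate_size_based_batches` (added in the next hunk) and `BATCH_SIZE_BYTES` are in scope; `_send_loader_doc` is a hypothetical stand-in for the real loader/doc request, which this diff does not show:

```python
from typing import List

from langchain_core.documents import Document


def _send_loader_doc(batch: List[Document]) -> None:
    # Hypothetical stand-in for the real loader/doc POST; just reports payload size.
    payload = sum(len(doc.page_content.encode("utf-8")) for doc in batch)
    print(f"posting {len(batch)} doc(s), {payload} bytes")


def classify(docs: List[Document]) -> None:
    # Batch against the hard-coded cap so each request body stays at or under ~100 KB
    # (except when a single oversized document must be sent alone).
    for batch in generate_size_based_batches(docs, max_batch_size=BATCH_SIZE_BYTES):
        _send_loader_doc(batch)
```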
@@ -301,3 +302,43 @@ def get_ip() -> str:
     except Exception:
         public_ip = socket.gethostbyname("localhost")
     return public_ip
+
+
+def generate_size_based_batches(
+    docs: List[Document], max_batch_size: int = 100 * 1024
+) -> List[List[Document]]:
+    """
+    Generate batches of documents based on page_content size.
+
+    Args:
+        docs: List of documents to be batched.
+        max_batch_size: Maximum size of each batch in bytes. Defaults to 100 * 1024 (100 KB).
+
+    Returns:
+        List[List[Document]]: List of batches of documents.
+    """
+    batches: List[List[Document]] = []
+    current_batch: List[Document] = []
+    current_batch_size: int = 0
+
+    for doc in docs:
+        # Calculate the size of the document in bytes
+        doc_size: int = len(doc.page_content.encode("utf-8"))
+
+        if doc_size > max_batch_size:
+            # If a single document exceeds the max batch size, send it as its own batch
+            batches.append([doc])
+        else:
+            if current_batch_size + doc_size > max_batch_size:
+                # If adding this document would exceed the max batch size, start a new batch
+                batches.append(current_batch)
+                current_batch = []
+                current_batch_size = 0
+
+            # Add the document to the current batch
+            current_batch.append(doc)
+            current_batch_size += doc_size
+
+    # Add the last batch if it has documents
+    if current_batch:
+        batches.append(current_batch)
+
+    return batches
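And a quick usage sketch of the new helper on its own. The import path below is an assumption inferred from the constants in this diff; adjust it to wherever the function actually lives:

```python
# Usage sketch; assumes a langchain_community install that includes this patch.
from langchain_core.documents import Document
from langchain_community.utilities.pebblo import generate_size_based_batches

docs = [
    Document(page_content="x" * 150 * 1024),  # 150 KB: exceeds the cap, shipped alone
    Document(page_content="a" * 40 * 1024),   # 40 KB
    Document(page_content="b" * 40 * 1024),   # 40 KB: still fits alongside `a`
    Document(page_content="c" * 40 * 1024),   # 40 KB: would push past 100 KB, new batch
]

batches = generate_size_based_batches(docs, max_batch_size=100 * 1024)
for i, batch in enumerate(batches):
    size = sum(len(doc.page_content.encode("utf-8")) for doc in batch)
    print(f"batch {i}: {len(batch)} doc(s), {size // 1024} KB")
# Expected: batch 0 -> 1 doc / 150 KB, batch 1 -> 2 docs / 80 KB, batch 2 -> 1 doc / 40 KB
```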