mirror of
https://github.com/hwchase17/langchain.git
synced 2025-04-30 04:45:23 +00:00
- [x] **PR title**: "community: add Needle retriever and document loader integration" - Where "package" is whichever of langchain, community, core, etc. is being modified. Use "docs: ..." for purely docs changes, "infra: ..." for CI changes. - Example: "community: add foobar LLM" - [x] **PR message**: ***Delete this entire checklist*** and replace with - **Description:** This PR adds a new integration for Needle, which includes: - **NeedleRetriever**: A retriever for fetching documents from Needle collections. - **NeedleLoader**: A document loader for managing and loading documents into Needle collections. - Example notebooks demonstrating usage have been added in: - `docs/docs/integrations/retrievers/needle.ipynb` - `docs/docs/integrations/document_loaders/needle.ipynb`. - **Dependencies:** The `needle-python` package is required as an external dependency for accessing Needle's API. It has been added to the extended testing dependencies list. - **Twitter handle:** Feel free to mention me if this PR gets announced: [needlexai](https://x.com/NeedlexAI). - [x] **Add tests and docs**: If you're adding a new integration, please include 1. Unit tests have been added for both `NeedleRetriever` and `NeedleLoader` in `libs/community/tests/unit_tests`. These tests mock API calls to avoid relying on network access. 2. Example notebooks have been added to `docs/docs/integrations/`, showcasing both retriever and loader functionality. - [x] **Lint and test**: Run `make format`, `make lint`, and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ - `make format`: Passed - `make lint`: Passed - `make test`: Passed (requires `needle-python` to be installed locally; this package is not added to LangChain dependencies). Additional guidelines: - [x] Optional dependencies are imported only within functions. 
- [x] No dependencies have been added to pyproject.toml files except for those required for unit tests. - [x] The PR does not touch more than one package. - [x] Changes are fully backwards compatible. - [x] Community additions are not re-imported into LangChain core. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
165 lines
5.2 KiB
Python
165 lines
5.2 KiB
Python
from typing import Dict, Iterator, List, Optional
|
|
|
|
from langchain_core.document_loaders.base import BaseLoader
|
|
from langchain_core.documents import Document
|
|
|
|
|
|
class NeedleLoader(BaseLoader):
    """
    NeedleLoader is a document loader for managing documents stored in a collection.

    Setup:
        Install the `needle-python` library and set your Needle API key.

        .. code-block:: bash

            pip install needle-python
            export NEEDLE_API_KEY="your-api-key"

    Key init args:
        - `needle_api_key` (Optional[str]): API key for authenticating with Needle.
          When omitted, the `NEEDLE_API_KEY` environment variable is used.
        - `collection_id` (str): Needle collection to load documents from.

    Usage:
        .. code-block:: python

            from langchain_community.document_loaders.needle import NeedleLoader

            loader = NeedleLoader(
                needle_api_key="your-api-key",
                collection_id="your-collection-id"
            )

            # Load documents
            documents = loader.load()
            for doc in documents:
                print(doc.metadata)

            # Lazy load documents
            for doc in loader.lazy_load():
                print(doc.metadata)
    """

    def __init__(
        self,
        needle_api_key: Optional[str] = None,
        collection_id: Optional[str] = None,
    ) -> None:
        """
        Initializes the NeedleLoader with API key and collection ID.

        Args:
            needle_api_key (Optional[str]): API key for authenticating with Needle.
                Falls back to the `NEEDLE_API_KEY` environment variable when
                not given. If neither is set, no client is created and the
                loader raises `ValueError` on first use.
            collection_id (Optional[str]): Identifier for the Needle collection.

        Raises:
            ImportError: If the `needle-python` library is not installed.
            ValueError: If the collection ID is not provided.
        """
        # Optional dependency: imported lazily so that importing this module
        # does not require `needle-python`.
        try:
            from needle.v1 import NeedleClient
        except ImportError as exc:
            raise ImportError(
                "Please install with `pip install needle-python` to use NeedleLoader."
            ) from exc

        import os

        super().__init__()
        # Honour the documented `export NEEDLE_API_KEY=...` setup: an explicit
        # argument wins, otherwise the environment variable is used.
        self.needle_api_key = needle_api_key or os.environ.get("NEEDLE_API_KEY")
        self.collection_id = collection_id
        self.client: Optional[NeedleClient] = None

        if self.needle_api_key:
            self.client = NeedleClient(api_key=self.needle_api_key)

        if not self.collection_id:
            raise ValueError("Collection ID must be provided.")

    def _get_collection(self) -> None:
        """
        Ensures the Needle collection is set and the client is initialized.

        This is a pure validation step; it returns nothing and performs no
        API call.

        Raises:
            ValueError: If the Needle client is not initialized or
                if the collection ID is missing.
        """
        if self.client is None:
            raise ValueError(
                "NeedleClient is not initialized. Provide a valid API key."
            )
        if not self.collection_id:
            raise ValueError("Collection ID must be provided.")

    def add_files(self, files: Dict[str, str]) -> None:
        """
        Adds files to the Needle collection.

        Args:
            files (Dict[str, str]): Dictionary where keys are file names and values
                are file URLs.

        Raises:
            ImportError: If the `needle-python` library is not installed.
            ValueError: If the collection is not properly initialized.
        """
        try:
            from needle.v1.models import FileToAdd
        except ImportError as exc:
            raise ImportError(
                "Please install with `pip install needle-python` to add files."
            ) from exc

        self._get_collection()
        # `_get_collection` has already raised if the client is missing; this
        # assert only narrows the Optional type for static type checkers.
        assert self.client is not None, "NeedleClient must be initialized."

        files_to_add = [FileToAdd(name=name, url=url) for name, url in files.items()]

        self.client.collections.files.add(
            collection_id=self.collection_id, files=files_to_add
        )

    def _fetch_documents(self) -> List[Document]:
        """
        Fetches metadata for documents from the Needle collection.

        Only files whose status is ``"indexed"`` are returned. Each document
        carries the file URL (``source``), file name (``title``) and, when the
        file object exposes it, ``size``.

        Returns:
            List[Document]: A list of documents with metadata. Content is excluded.

        Raises:
            ValueError: If the collection is not properly initialized.
        """
        self._get_collection()
        # `_get_collection` has already raised if the client is missing; this
        # assert only narrows the Optional type for static type checkers.
        assert self.client is not None, "NeedleClient must be initialized."

        files = self.client.collections.files.list(self.collection_id)
        docs = [
            Document(
                page_content="",  # Needle doesn't provide file content fetching
                metadata={
                    "source": file.url,
                    "title": file.name,
                    "size": getattr(file, "size", None),
                },
            )
            for file in files
            if file.status == "indexed"
        ]
        return docs

    def load(self) -> List[Document]:
        """
        Loads all documents from the Needle collection.

        Returns:
            List[Document]: A list of documents from the collection.

        Raises:
            ValueError: If the collection is not properly initialized.
        """
        return self._fetch_documents()

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily loads documents from the Needle collection.

        The file listing is fetched in full on first iteration; the documents
        are then yielded one at a time.

        Yields:
            Iterator[Document]: An iterator over the documents.

        Raises:
            ValueError: If the collection is not properly initialized.
        """
        yield from self._fetch_documents()
|