community[minor]: add Jina Reranker in retrievers module (#19406)

- **Description:** Adapt JinaEmbeddings to run with the new Jina AI Rerank API - **Twitter handle:** https://twitter.com/JinaAI_ - [ ] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [ ] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
2025-09-12 12:59:07 +00:00 · 2024-04-25 19:27:10 +02:00
parent 92969d49cb
commit baefbfb14e
4 changed files with 384 additions and 1 deletions
--- a/libs/community/langchain_community/document_compressors/__init__.py
+++ b/libs/community/langchain_community/document_compressors/__init__.py
@@ -2,6 +2,9 @@ import importlib
 from typing import TYPE_CHECKING, Any

 if TYPE_CHECKING:
+    from langchain_community.document_compressors.jina_rerank import (
+        JinaRerank,  # noqa: F401
+    )
    from langchain_community.document_compressors.llmlingua_filter import (
        LLMLinguaCompressor,  # noqa: F401
    )
@@ -14,6 +17,7 @@ __all__ = ["LLMLinguaCompressor", "OpenVINOReranker"]
 _module_lookup = {
    "LLMLinguaCompressor": "langchain_community.document_compressors.llmlingua_filter",
    "OpenVINOReranker": "langchain_community.document_compressors.openvino_rerank",
+    "JinaRerank": "langchain_community.document_compressors.jina_rerank",
 }


--- a/libs/community/langchain_community/document_compressors/jina_rerank.py
+++ b/libs/community/langchain_community/document_compressors/jina_rerank.py
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, Sequence, Union
+
+import requests
+from langchain_core.callbacks import Callbacks
+from langchain_core.documents import BaseDocumentCompressor, Document
+from langchain_core.pydantic_v1 import Extra, root_validator
+from langchain_core.utils import get_from_dict_or_env
+
+JINA_API_URL: str = "https://api.jina.ai/v1/rerank"
+
+
+class JinaRerank(BaseDocumentCompressor):
+    """Document compressor that uses `Jina Rerank API`.
+
+    Documents are sent to ``JINA_API_URL`` together with the query; the API
+    answers with the indices of the most relevant documents and a relevance
+    score for each of them.
+    """
+
+    session: Any = None
+    """Requests session to communicate with API."""
+    top_n: Optional[int] = 3
+    """Number of documents to return."""
+    model: str = "jina-reranker-v1-base-en"
+    """Model to use for reranking."""
+    jina_api_key: Optional[str] = None
+    """Jina API key. Must be specified directly or via environment variable
+    JINA_API_KEY."""
+    user_agent: str = "langchain"
+    """Identifier for the application making the request."""
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid
+        arbitrary_types_allowed = True
+
+    @root_validator(pre=True)
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Resolve the API key and build the authenticated HTTP session.
+
+        The key is looked up in ``values["jina_api_key"]`` first and in the
+        ``JINA_API_KEY`` environment variable otherwise (via
+        ``get_from_dict_or_env``).
+
+        Args:
+            values: Raw constructor arguments (the validator runs with
+                ``pre=True``, so field defaults are not applied yet).
+
+        Returns:
+            ``values`` with ``"session"`` set to a freshly created
+            ``requests.Session`` carrying the authorization headers. Any
+            ``session`` passed by the caller is replaced.
+        """
+        jina_api_key = get_from_dict_or_env(values, "jina_api_key", "JINA_API_KEY")
+        # Field defaults are not populated in a pre-validator, hence the
+        # explicit fallback to the default user agent.
+        user_agent = values.get("user_agent", "langchain")
+        session = requests.Session()
+        session.headers.update(
+            {
+                "Authorization": f"Bearer {jina_api_key}",
+                "Accept-Encoding": "identity",
+                "Content-type": "application/json",
+                "user-agent": user_agent,
+            }
+        )
+        values["session"] = session
+        return values
+
+    def rerank(
+        self,
+        documents: Sequence[Union[str, Document, dict]],
+        query: str,
+        *,
+        model: Optional[str] = None,
+        top_n: Optional[int] = -1,
+        max_chunks_per_doc: Optional[int] = None,
+    ) -> List[Dict[str, Any]]:
+        """Return the documents' indices ordered by relevance to the query.
+
+        Args:
+            documents: A sequence of documents to rerank. ``Document`` objects
+                are reduced to their ``page_content``; other items are sent
+                unchanged.
+            query: The query to use for reranking.
+            model: The model to use for re-ranking. Defaults to ``self.model``.
+            top_n: The number of results to return. ``None`` is forwarded to
+                the API as-is; any value that is not a positive integer
+                (including the default ``-1``) falls back to ``self.top_n``.
+            max_chunks_per_doc: The maximum number of chunks derived from a
+                document. Accepted for interface compatibility; it is
+                currently not included in the request payload.
+
+        Returns:
+            One dict per result, in the order returned by the API, with the
+            keys ``"index"`` (position in ``documents``) and
+            ``"relevance_score"``. Empty when ``documents`` is empty.
+
+        Raises:
+            RuntimeError: If the API response contains no ``"results"`` entry.
+                The message is the response's ``"detail"`` field when present,
+                otherwise a description of the unexpected payload.
+        """
+        if not documents:  # to avoid empty api call
+            return []
+        docs = [
+            doc.page_content if isinstance(doc, Document) else doc for doc in documents
+        ]
+        model = model or self.model
+        top_n = top_n if (top_n is None or top_n > 0) else self.top_n
+        data = {
+            "query": query,
+            "documents": docs,
+            "model": model,
+            "top_n": top_n,
+        }
+
+        resp = self.session.post(
+            JINA_API_URL,
+            json=data,
+        ).json()
+
+        if "results" not in resp:
+            # Error payloads are not guaranteed to carry a "detail" field (or
+            # even to be a dict); never mask the failure with a KeyError.
+            detail = resp.get("detail") if isinstance(resp, dict) else None
+            if detail is None:
+                detail = f"Unexpected response from Jina Rerank API: {resp!r}"
+            raise RuntimeError(detail)
+
+        return [
+            {"index": res["index"], "relevance_score": res["relevance_score"]}
+            for res in resp["results"]
+        ]
+
+    def compress_documents(
+        self,
+        documents: Sequence[Document],
+        query: str,
+        callbacks: Optional[Callbacks] = None,
+    ) -> Sequence[Document]:
+        """
+        Compress documents using Jina's Rerank API.
+
+        Args:
+            documents: A sequence of documents to compress.
+            query: The query to use for compressing the documents.
+            callbacks: Callbacks to run during the compression process.
+                Currently unused by this implementation.
+
+        Returns:
+            Copies of the selected documents, in the order returned by
+            ``rerank``, each with ``metadata["relevance_score"]`` set. The
+            input documents are not modified.
+        """
+        compressed = []
+        for res in self.rerank(documents, query):
+            doc = documents[res["index"]]
+            # Deep-copy the metadata so adding the score does not leak into
+            # the caller's original document.
+            doc_copy = Document(
+                page_content=doc.page_content, metadata=deepcopy(doc.metadata)
+            )
+            doc_copy.metadata["relevance_score"] = res["relevance_score"]
+            compressed.append(doc_copy)
+        return compressed
--- a/libs/community/tests/unit_tests/document_compressors/test_imports.py
+++ b/libs/community/tests/unit_tests/document_compressors/test_imports.py
@@ -1,6 +1,6 @@
 from langchain_community.document_compressors import __all__, _module_lookup

-EXPECTED_ALL = ["LLMLinguaCompressor", "OpenVINOReranker"]
+EXPECTED_ALL = ["LLMLinguaCompressor", "OpenVINOReranker", "JinaRerank"]


 def test_all_imports() -> None: