community[patch]: Added support for filter out AWS Kendra search by score confidence (#12920)

**Description:** It will add support for filter out kendra search by score confidence which will make result more accurate. For example ``` retriever = AmazonKendraRetriever( index_id=kendra_index_id, top_k=5, region_name=region, score_confidence="HIGH" ) ``` Result will not include the records which has score confidence "LOW" or "MEDIUM". Relevant docs https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/kendra/client/query.html https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/kendra/client/retrieve.html **Issue:** the issue # it resolve #11801 **twitter:** [@SmitCode](https://twitter.com/SmitCode)
2025-08-13 14:50:00 +00:00 · 2024-03-08 06:58:09 +05:30 · 2024-03-08 06:58:09 +05:30 · aed46cd6f2
commit aed46cd6f2
parent 390ef6abe3
1 changed files with 58 additions and 4 deletions
--- a/libs/community/langchain_community/retrievers/kendra.py
+++ b/libs/community/langchain_community/retrievers/kendra.py
@ -1,11 +1,27 @@
 import re
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Union
+from typing import (
    Any,
    Callable,
    Dict,
    List,
    Literal,
    Optional,
    Sequence,
    Union,
 )
 from langchain_core.callbacks import CallbackManagerForRetrieverRun
 from langchain_core.documents import Document
-from langchain_core.pydantic_v1 import BaseModel, Extra, root_validator, validator
+from langchain_core.pydantic_v1 import (
    BaseModel,
    Extra,
    Field,
    root_validator,
    validator,
 )
 from langchain_core.retrievers import BaseRetriever
 from typing_extensions import Annotated
 def clean_excerpt(excerpt: str) -> str:
@ -153,6 +169,8 @@ class ResultItem(BaseModel, ABC, extra=Extra.allow):  # type: ignore[call-arg]
    """The document URI."""
    DocumentAttributes: Optional[List[DocumentAttribute]] = []
    """The document attributes."""
    ScoreAttributes: Optional[dict]
    """The kendra score confidence"""
    @abstractmethod
    def get_title(self) -> str:
@ -178,6 +196,13 @@ class ResultItem(BaseModel, ABC, extra=Extra.allow):  # type: ignore[call-arg]
        """Document attributes dict."""
        return {attr.Key: attr.Value.value for attr in (self.DocumentAttributes or [])}
    def get_score_attribute(self) -> str:
        """Document Score Confidence"""
        if self.ScoreAttributes is not None:
            return self.ScoreAttributes["ScoreConfidence"]
        else:
            return "NOT_AVAILABLE"
    def to_doc(
        self, page_content_formatter: Callable[["ResultItem"], str] = combined_text
    ) -> Document:
@ -192,9 +217,9 @@ class ResultItem(BaseModel, ABC, extra=Extra.allow):  # type: ignore[call-arg]
                "title": self.get_title(),
                "excerpt": self.get_excerpt(),
                "document_attributes": self.get_document_attributes_dict(),
                "score": self.get_score_attribute(),
            }
        )
        return Document(page_content=page_content, metadata=metadata)
@ -290,6 +315,15 @@ class RetrieveResult(BaseModel, extra=Extra.allow):  # type: ignore[call-arg]
    """The result items."""
 KENDRA_CONFIDENCE_MAPPING = {
    "NOT_AVAILABLE": 0.0,
    "LOW": 0.25,
    "MEDIUM": 0.50,
    "HIGH": 0.75,
    "VERY_HIGH": 1.0,
 }
 class AmazonKendraRetriever(BaseRetriever):
    """`Amazon Kendra Index` retriever.
@ -336,6 +370,7 @@ class AmazonKendraRetriever(BaseRetriever):
    page_content_formatter: Callable[[ResultItem], str] = combined_text
    client: Any
    user_context: Optional[Dict] = None
    min_score_confidence: Annotated[Optional[float], Field(ge=0.0, le=1.0)]
    @validator("top_k")
    def validate_top_k(cls, value: int) -> int:
@ -406,6 +441,25 @@ class AmazonKendraRetriever(BaseRetriever):
        ]
        return top_docs
    def _filter_by_score_confidence(self, docs: List[Document]) -> List[Document]:
        """
        Filter out the records that have a score confidence
        greater than the required threshold.
        """
        if not self.min_score_confidence:
            return docs
        filtered_docs = [
            item
            for item in docs
            if (
                item.metadata.get("score") is not None
                and isinstance(item.metadata["score"], str)
                and KENDRA_CONFIDENCE_MAPPING.get(item.metadata["score"], 0.0)
                >= self.min_score_confidence
            )
        ]
        return filtered_docs
    def _get_relevant_documents(
        self,
        query: str,
@ -422,4 +476,4 @@ class AmazonKendraRetriever(BaseRetriever):
        """
        result_items = self._kendra_query(query)
        top_k_docs = self._get_top_k_docs(result_items)
-        return top_k_docs
+        return self._filter_by_score_confidence(top_k_docs)