# DB-GPT/dbgpt/rag/transformer/keyword_extractor.py

"""KeywordExtractor class."""
import logging
from typing import List, Optional

from dbgpt.core import LLMClient
from dbgpt.rag.transformer.llm_extractor import LLMExtractor

# Prompt template for LLM keyword extraction; ``{text}`` is the only
# placeholder. The reply format requested here (comma-separated items, with
# groups separated by a semicolon) is what
# ``KeywordExtractor._parse_response`` splits on.
KEYWORD_EXTRACT_PT = (
    "A question is provided below. Given the question, extract "
    "keywords from the text. Focus on extracting the keywords that we can use "
    "to best lookup answers to the question.\n"
    "Generate as many synonyms or aliases of the keywords as possible, "
    "considering possible cases of capitalization, pluralization, "
    "common expressions, etc.\n"
    "Avoid stopwords.\n"
    "Provide the keywords and synonyms in comma-separated format.\n"
    "Formatted keywords and synonyms text should be separated by a semicolon.\n"
    "---------------------\n"
    "Example:\n"
    "Text: Alice is Bob's mother.\n"
    "Keywords:\nAlice,mother,Bob;mummy\n"
    "Text: Philz is a coffee shop founded in Berkeley in 1982.\n"
    "Keywords:\nPhilz,coffee shop,Berkeley,1982;coffee bar,coffee house\n"
    "---------------------\n"
    "Text: {text}\n"
    "Keywords:\n"
)

logger = logging.getLogger(__name__)


class KeywordExtractor(LLMExtractor):
    """Extract keywords and their synonyms from text via an LLM prompt.

    The extractor is an ``LLMExtractor`` configured with
    ``KEYWORD_EXTRACT_PT``; ``_parse_response`` turns the model's
    ``a,b;c,d``-style reply into a flat, de-duplicated list of strings.
    """

    def __init__(self, llm_client: LLMClient, model_name: str):
        """Initialize the KeywordExtractor.

        Args:
            llm_client: Forwarded unchanged to ``LLMExtractor.__init__``.
            model_name: Forwarded unchanged to ``LLMExtractor.__init__``.
        """
        super().__init__(llm_client, model_name, KEYWORD_EXTRACT_PT)

    def _parse_response(self, text: str, limit: Optional[int] = None) -> List[str]:
        """Split an LLM reply into unique keywords.

        The reply is split on ``;`` and then on ``,``; each piece is stripped
        of surrounding whitespace and empty pieces are dropped.

        Args:
            text: Raw reply text to parse.
            limit: Maximum number of unique keywords to return. ``None`` or
                ``0`` means no limit.

        Returns:
            Unique keywords in the order they first appear in ``text``.
        """
        # A set alone would make the output order arbitrary (and vary between
        # interpreter runs); track first-seen order explicitly so the result
        # is deterministic and items before the semicolon stay in front.
        ordered: List[str] = []
        seen = set()

        for part in text.split(";"):
            for piece in part.split(","):
                keyword = piece.strip()
                if not keyword or keyword in seen:
                    continue
                seen.add(keyword)
                ordered.append(keyword)
                # Stop as soon as the cap is reached; a falsy limit disables it.
                if limit and len(ordered) >= limit:
                    return ordered

        return ordered