mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-07-25 04:53:36 +00:00
Co-authored-by: KingSkyLi <15566300566@163.com> Co-authored-by: aries_ckt <916701291@qq.com> Co-authored-by: Fangyin Cheng <staneyffer@gmail.com>
51 lines
1.8 KiB
Python
51 lines
1.8 KiB
Python
"""KeywordExtractor class."""
|
|
import logging
|
|
from typing import List, Optional
|
|
|
|
from dbgpt.core import LLMClient
|
|
from dbgpt.rag.transformer.llm_extractor import LLMExtractor
|
|
|
|
# Prompt template sent to the LLM for keyword extraction. ``{text}`` is the
# only placeholder. The reply format requested here (comma-separated items,
# with a semicolon between the keyword group and the synonym group) is the
# format that ``KeywordExtractor._parse_response`` splits on.
KEYWORD_EXTRACT_PT = (
    # No keyword count can be injected (the template only receives ``text``),
    # so the instruction does not promise an "up to N" bound.
    "A question is provided below. Given the question, extract "
    "keywords from the text. Focus on extracting the keywords that we can use "
    "to best lookup answers to the question.\n"
    "Generate as more as possible synonyms or alias of the keywords "
    "considering possible cases of capitalization, pluralization, "
    "common expressions, etc.\n"
    "Avoid stopwords.\n"
    # Trailing space keeps this sentence from fusing with the next one.
    "Provide the keywords and synonyms in comma-separated format. "
    "Formatted keywords and synonyms text should be separated by a semicolon.\n"
    "---------------------\n"
    "Example:\n"
    "Text: Alice is Bob's mother.\n"
    "Keywords:\nAlice,mother,Bob;mummy\n"
    "Text: Philz is a coffee shop founded in Berkeley in 1982.\n"
    "Keywords:\nPhilz,coffee shop,Berkeley,1982;coffee bar,coffee house\n"
    "---------------------\n"
    "Text: {text}\n"
    "Keywords:\n"
)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class KeywordExtractor(LLMExtractor):
    """Extract keywords and their synonyms from text with an LLM.

    The prompt ``KEYWORD_EXTRACT_PT`` asks the model for comma-separated
    items, with a semicolon between the keyword group and the synonym group;
    ``_parse_response`` turns such a reply into a flat list of strings.
    """

    def __init__(self, llm_client: LLMClient, model_name: str):
        """Initialize the KeywordExtractor.

        Args:
            llm_client: Client passed through to the ``LLMExtractor`` base.
            model_name: Model name passed through to the ``LLMExtractor`` base.
        """
        super().__init__(llm_client, model_name, KEYWORD_EXTRACT_PT)

    def _parse_response(self, text: str, limit: Optional[int] = None) -> List[str]:
        """Split an LLM reply into a de-duplicated list of keywords.

        The reply is split on ``;`` and then on ``,``; each piece is stripped
        of surrounding whitespace and empty pieces are dropped.

        Args:
            text: Raw reply text to parse.
            limit: Maximum number of distinct keywords to return. A falsy
                value (``None`` or ``0``) means no limit.

        Returns:
            The distinct keywords in the order they first appear in ``text``.
        """
        # Track seen keywords separately so the result keeps first-seen order;
        # returning ``list(set)`` would make the order vary between runs
        # because string hashing is randomised per process.
        ordered: List[str] = []
        seen = set()

        for group in text.split(";"):
            for raw in group.split(","):
                keyword = raw.strip()
                if not keyword or keyword in seen:
                    continue
                seen.add(keyword)
                ordered.append(keyword)
                # Stop as soon as enough distinct keywords were collected.
                if limit and len(ordered) >= limit:
                    return ordered

        return ordered
|