DB-GPT/dbgpt/rag/transformer/keyword_extractor.py
Florian a9087c3853
feat: add GraphRAG framework and integrate TuGraph (#1506)
Co-authored-by: KingSkyLi <15566300566@163.com>
Co-authored-by: aries_ckt <916701291@qq.com>
Co-authored-by: Fangyin Cheng <staneyffer@gmail.com>
2024-05-16 15:39:50 +08:00

51 lines
1.8 KiB
Python

"""KeywordExtractor class."""
import logging
from typing import List, Optional
from dbgpt.core import LLMClient
from dbgpt.rag.transformer.llm_extractor import LLMExtractor
KEYWORD_EXTRACT_PT = (
"A question is provided below. Given the question, extract up to "
"keywords from the text. Focus on extracting the keywords that we can use "
"to best lookup answers to the question.\n"
"Generate as more as possible synonyms or alias of the keywords "
"considering possible cases of capitalization, pluralization, "
"common expressions, etc.\n"
"Avoid stopwords.\n"
"Provide the keywords and synonyms in comma-separated format."
"Formatted keywords and synonyms text should be separated by a semicolon.\n"
"---------------------\n"
"Example:\n"
"Text: Alice is Bob's mother.\n"
"Keywords:\nAlice,mother,Bob;mummy\n"
"Text: Philz is a coffee shop founded in Berkeley in 1982.\n"
"Keywords:\nPhilz,coffee shop,Berkeley,1982;coffee bar,coffee house\n"
"---------------------\n"
"Text: {text}\n"
"Keywords:\n"
)
logger = logging.getLogger(__name__)
class KeywordExtractor(LLMExtractor):
"""KeywordExtractor class."""
def __init__(self, llm_client: LLMClient, model_name: str):
"""Initialize the KeywordExtractor."""
super().__init__(llm_client, model_name, KEYWORD_EXTRACT_PT)
def _parse_response(self, text: str, limit: Optional[int] = None) -> List[str]:
keywords = set()
for part in text.split(";"):
for s in part.strip().split(","):
keyword = s.strip()
if keyword:
keywords.add(keyword)
if limit and len(keywords) >= limit:
return list(keywords)
return list(keywords)