Mirror of https://github.com/csunny/DB-GPT.git, synced 2025-08-19 08:47:32 +00:00.
fix(RAG): fix url document rag mode (#2874)
Commit: 4e7070b6ee (parent: 26bb07f9d1)
@@ -9,6 +9,7 @@ This example shows how to use AWEL to build a simple rag summary example.
     ```
     export OPENAI_API_KEY={your_openai_key}
     export OPENAI_API_BASE={your_openai_base}
+    export MODEL_NAME={LLM_MODEL_NAME}
     ```
     or
     ```
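The first hunk adds `MODEL_NAME` alongside the existing OpenAI variables in the example's setup instructions. All three are resolved with `os.getenv` later in the diff; a minimal sketch of that pattern (the defaults are the placeholder strings from the example itself):

```python
import os

# Placeholders mirror the defaults used in the example code further down.
api_key = os.getenv("OPENAI_API_KEY", "your api key")
api_base = os.getenv("OPENAI_API_BASE", "your api base")
model_name = os.getenv("MODEL_NAME", "your model name")
print(f"model={model_name!r} base={api_base!r}")
```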
@@ -23,7 +24,7 @@ This example shows how to use AWEL to build a simple rag summary example.
 
     curl -X POST http://127.0.0.1:5555/api/v1/awel/trigger/examples/rag/summary \
     -H "Content-Type: application/json" -d '{
-    "url": "https://docs.dbgpt.site/docs/awel"
+    "url": "http://docs.dbgpt.cn/docs/awel/"
     }'
 """
 
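This hunk only swaps the sample URL in the docstring. The same trigger can be exercised from Python instead of curl; a minimal sketch, assuming the example DAG is running locally on port 5555:

```python
import requests

# POST the same JSON body the curl command above sends to the AWEL HTTP trigger.
resp = requests.post(
    "http://127.0.0.1:5555/api/v1/awel/trigger/examples/rag/summary",
    json={"url": "http://docs.dbgpt.cn/docs/awel/"},
    timeout=120,
)
resp.raise_for_status()
print(resp.text)  # the generated summary
```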
@@ -58,13 +59,17 @@ with DAG("dbgpt_awel_simple_rag_summary_example") as dag:
         "/examples/rag/summary", methods="POST", request_body=TriggerReqBody
     )
     request_handle_task = RequestHandleOperator()
-    path_operator = MapOperator(lambda request: request["url"])
+    path_operator = MapOperator(lambda request: {"source": request["url"]})
     # build knowledge operator
     knowledge_operator = KnowledgeOperator(knowledge_type=KnowledgeType.URL.name)
     # build summary assembler operator
     summary_operator = SummaryAssemblerOperator(
-        llm_client=OpenAILLMClient(api_key=os.getenv("OPENAI_API_KEY", "your api key")),
-        language="en",
+        llm_client=OpenAILLMClient(
+            api_key=os.getenv("OPENAI_API_KEY", "your api key"),
+            api_base=os.getenv("OPENAI_API_BASE", "your api base"),
+        ),
+        language="zh",
+        model_name=os.getenv("MODEL_NAME", "your model name"),
     )
     (
         trigger
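The `path_operator` change above is the core of the URL-mode fix: the downstream `KnowledgeOperator` now receives a dict keyed by `source` rather than a bare URL string. A dependency-free illustration of the payload shape change (plain Python, not the AWEL API):

```python
def old_map(request: dict) -> str:
    # Pre-fix behavior: forwarded the raw URL string.
    return request["url"]

def new_map(request: dict) -> dict:
    # Post-fix behavior: wraps the URL under the "source" key.
    return {"source": request["url"]}

req = {"url": "http://docs.dbgpt.cn/docs/awel/"}
assert old_map(req) == "http://docs.dbgpt.cn/docs/awel/"
assert new_map(req) == {"source": "http://docs.dbgpt.cn/docs/awel/"}
```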
@@ -37,16 +37,96 @@ class URLKnowledge(Knowledge):
         """Fetch URL document from loader."""
         if self._loader:
             documents = self._loader.load()
+            return [Document.langchain2doc(lc_document) for lc_document in documents]
         else:
-            from langchain.document_loaders import WebBaseLoader  # mypy: ignore
-
-            if self._path is not None:
-                web_reader = WebBaseLoader(web_path=self._path, encoding="utf8")
-                documents = web_reader.load()
-            else:
-                # Handle the case where self._path is None
+            if self._path is None:
                 raise ValueError("web_path cannot be None")
-        return [Document.langchain2doc(lc_document) for lc_document in documents]
+            return self._load_document_default()
+
+    def _load_document_default(self) -> List[Document]:
+        """Fetch URL document with requests and BeautifulSoup."""
+
+        import re
+        import unicodedata
+
+        import requests
+        from bs4 import BeautifulSoup
+
+        def clean_text(text: str) -> str:
+            """Clean text by removing special Unicode characters."""
+            if not text:
+                return ""
+
+            # Remove zero-width characters and other invisible Unicode characters
+            text = re.sub(r"[\u200b-\u200f\u2060\ufeff]", "", text)
+
+            # Remove control characters except newline, tab, and carriage return
+            text = re.sub(r"[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]", "", text)
+
+            # Normalize Unicode characters
+            text = unicodedata.normalize("NFKC", text)
+
+            # Clean up extra whitespace
+            text = " ".join(text.split())
+
+            return text.strip()
+
+        try:
+            # Set user agent to avoid being blocked
+            headers = {
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit"
+                "/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+            }
+
+            # Fetch the webpage content
+            response = requests.get(self._path, headers=headers, timeout=30)
+            response.raise_for_status()
+
+            # Determine encoding
+            if self._encoding is not None:
+                response.encoding = self._encoding
+            elif response.encoding == "ISO-8859-1":
+                response.encoding = response.apparent_encoding
+
+            # Parse HTML content
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            # Remove script and style elements
+            for script in soup(["script", "style"]):
+                script.decompose()
+
+            # Get text content
+            text_content = soup.get_text(strip=True)
+
+            # Clean up whitespace
+            lines = (line.strip() for line in text_content.splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+            text_content = " ".join(chunk for chunk in chunks if chunk)
+            text_content = clean_text(text_content)
+
+            # Get page title if available
+            title = (
+                soup.title.string.strip() if soup.title and soup.title.string else ""
+            )
+            title = clean_text(title)
+
+            description = soup.find("meta", attrs={"name": "description"})
+            desc_content = description["content"] if description else ""
+            desc_content = clean_text(desc_content)
+
+            # Create metadata
+            metadata = {
+                "source": self._path,
+                "title": title,
+                "encoding": response.encoding,
+                "description": desc_content,
+            }
+
+            document = Document(content=text_content, metadata=metadata)
+            return [document]
+        except Exception as e:
+            raise ValueError(f"Failed to parse URL content: {str(e)}")
+
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
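The `clean_text` helper in the new `_load_document_default` is self-contained and easy to sanity-check in isolation; a runnable sketch with an invented sample string (zero-width space, full-width letter, and a control character, as often found in scraped HTML):

```python
import re
import unicodedata

def clean_text(text: str) -> str:
    """Strip invisible Unicode and control characters, then normalize."""
    if not text:
        return ""
    text = re.sub(r"[\u200b-\u200f\u2060\ufeff]", "", text)  # zero-width chars
    text = re.sub(r"[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]", "", text)  # control chars
    text = unicodedata.normalize("NFKC", text)  # e.g. full-width letters -> ASCII
    return " ".join(text.split()).strip()  # collapse runs of whitespace

print(clean_text("DB\u200bGPT  \uff21WEL\x07 docs"))  # -> "DBGPT AWEL docs"
```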
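The encoding branch reflects a known `requests` behavior: for `text/*` responses without a declared charset, `response.encoding` defaults to ISO-8859-1, so the loader substitutes the detected `apparent_encoding` unless an explicit `self._encoding` was given. A standalone sketch (example.com is only a placeholder URL):

```python
import requests

resp = requests.get("https://example.com", timeout=30)
resp.raise_for_status()
# requests assumes ISO-8859-1 when no charset is declared; re-detect instead.
if resp.encoding == "ISO-8859-1":
    resp.encoding = resp.apparent_encoding
print(resp.encoding)
```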