From 4e7070b6eeb2235f5baef12107e8c7a27637c33b Mon Sep 17 00:00:00 2001
From: alanchen <40684202+chenliang15405@users.noreply.github.com>
Date: Tue, 5 Aug 2025 21:52:39 +0800
Subject: [PATCH] fix(RAG): fix url document rag mode (#2874)

---
 examples/awel/simple_rag_summary_example.py   | 13 ++-
 .../src/dbgpt_ext/rag/knowledge/url.py        | 96 +++++++++++++++++--
 2 files changed, 97 insertions(+), 12 deletions(-)

diff --git a/examples/awel/simple_rag_summary_example.py b/examples/awel/simple_rag_summary_example.py
index e84aa9d9c..cf638b7ff 100644
--- a/examples/awel/simple_rag_summary_example.py
+++ b/examples/awel/simple_rag_summary_example.py
@@ -9,6 +9,7 @@ This example shows how to use AWEL to build a simple rag summary example.
     ```
     export OPENAI_API_KEY={your_openai_key}
     export OPENAI_API_BASE={your_openai_base}
+    export MODEL_NAME={LLM_MODEL_NAME}
     ```
     or
     ```
@@ -23,7 +24,7 @@ This example shows how to use AWEL to build a simple rag summary example.
 
         curl -X POST http://127.0.0.1:5555/api/v1/awel/trigger/examples/rag/summary \
         -H "Content-Type: application/json" -d '{
-            "url": "https://docs.dbgpt.site/docs/awel"
+            "url": "http://docs.dbgpt.cn/docs/awel/"
         }'
 """
 
@@ -58,13 +59,17 @@ with DAG("dbgpt_awel_simple_rag_summary_example") as dag:
         "/examples/rag/summary", methods="POST", request_body=TriggerReqBody
     )
     request_handle_task = RequestHandleOperator()
-    path_operator = MapOperator(lambda request: request["url"])
+    path_operator = MapOperator(lambda request: {"source": request["url"]})
     # build knowledge operator
     knowledge_operator = KnowledgeOperator(knowledge_type=KnowledgeType.URL.name)
     # build summary assembler operator
     summary_operator = SummaryAssemblerOperator(
-        llm_client=OpenAILLMClient(api_key=os.getenv("OPENAI_API_KEY", "your api key")),
-        language="en",
+        llm_client=OpenAILLMClient(
+            api_key=os.getenv("OPENAI_API_KEY", "your api key"),
+            api_base=os.getenv("OPENAI_API_BASE", "your api base"),
+        ),
+        language="zh",
+        model_name=os.getenv("MODEL_NAME", "your model name"),
     )
     (
         trigger
diff --git a/packages/dbgpt-ext/src/dbgpt_ext/rag/knowledge/url.py b/packages/dbgpt-ext/src/dbgpt_ext/rag/knowledge/url.py
index f2d1e2371..363155094 100644
--- a/packages/dbgpt-ext/src/dbgpt_ext/rag/knowledge/url.py
+++ b/packages/dbgpt-ext/src/dbgpt_ext/rag/knowledge/url.py
@@ -37,16 +37,96 @@ class URLKnowledge(Knowledge):
         """Fetch URL document from loader."""
         if self._loader:
             documents = self._loader.load()
+            return [Document.langchain2doc(lc_document) for lc_document in documents]
         else:
-            from langchain.document_loaders import WebBaseLoader  # mypy: ignore
-
-            if self._path is not None:
-                web_reader = WebBaseLoader(web_path=self._path, encoding="utf8")
-                documents = web_reader.load()
-            else:
-                # Handle the case where self._path is None
+            if self._path is None:
                 raise ValueError("web_path cannot be None")
-        return [Document.langchain2doc(lc_document) for lc_document in documents]
+
+            return self._load_document_default()
+
+    def _load_document_default(self) -> List[Document]:
+        """Fetch URL document with requests and BeautifulSoup."""
+
+        import re
+        import unicodedata
+
+        import requests
+        from bs4 import BeautifulSoup
+
+        def clean_text(text: str) -> str:
+            """Clean text by removing special Unicode characters."""
+            if not text:
+                return ""
+
+            # Remove zero-width characters and other invisible Unicode characters
+            text = re.sub(r"[\u200b-\u200f\u2060\ufeff]", "", text)
+
+            # Remove control characters except newline, tab, and carriage return
+            text = re.sub(r"[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]", "", text)
+
+            # Normalize Unicode characters
+            text = unicodedata.normalize("NFKC", text)
+
+            # Clean up extra whitespace
+            text = " ".join(text.split())
+
+            return text.strip()
+
+        try:
+            # Set user agent to avoid being blocked
+            headers = {
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit"
+                "/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+            }
+
+            # Fetch the webpage content
+            response = requests.get(self._path, headers=headers, timeout=30)
+            response.raise_for_status()
+
+            # Determine encoding
+            if self._encoding is not None:
+                response.encoding = self._encoding
+            elif response.encoding == "ISO-8859-1":
+                response.encoding = response.apparent_encoding
+
+            # Parse HTML content
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            # Remove script and style elements
+            for script in soup(["script", "style"]):
+                script.decompose()
+
+            # Get text content (one text node per line so words don't run together)
+            text_content = soup.get_text(separator="\n", strip=True)
+
+            # Clean up whitespace
+            lines = (line.strip() for line in text_content.splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+            text_content = " ".join(chunk for chunk in chunks if chunk)
+            text_content = clean_text(text_content)
+
+            # Get page title if available
+            title = (
+                soup.title.string.strip() if soup.title and soup.title.string else ""
+            )
+            title = clean_text(title)
+
+            description = soup.find("meta", attrs={"name": "description"})
+            desc_content = description.get("content", "") if description else ""
+            desc_content = clean_text(desc_content)
+
+            # Create metadata
+            metadata = {
+                "source": self._path,
+                "title": title,
+                "encoding": response.encoding,
+                "description": desc_content,
+            }
+
+            document = Document(content=text_content, metadata=metadata)
+            return [document]
+        except Exception as e:
+            raise ValueError(f"Failed to parse URL content: {str(e)}") from e
 
     @classmethod
     def support_chunk_strategy(cls) -> List[ChunkStrategy]:
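
Note (not part of the patch): a minimal sketch for exercising the new fallback loader outside the AWEL DAG. It assumes a checkout where `dbgpt_ext` is importable, that `URLKnowledge` accepts the URL via its `url` constructor argument as in the upstream class, and that the method whose body is patched above is the private `_load`; the URL is the one from the example file.

```
from dbgpt_ext.rag.knowledge.url import URLKnowledge

# No custom loader is set, so _load() takes the new branch and calls
# _load_document_default(), which fetches the page with requests and
# extracts cleaned text via BeautifulSoup.
knowledge = URLKnowledge(url="http://docs.dbgpt.cn/docs/awel/")
docs = knowledge._load()

doc = docs[0]
# The fallback records source, title, encoding, and description in metadata.
print(doc.metadata["title"])
print(doc.metadata["encoding"])
print(doc.content[:200])
```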