feat: add document structure into GraphRAG (#2033)

Co-authored-by: Appointat <kuda.czk@antgroup.com>
Co-authored-by: tpoisonooo <khj.application@aliyun.com>
Co-authored-by: vritser <vritser@163.com>
2025-09-13 21:21:08 +00:00 · 2024-10-18 22:03:08 +08:00
parent 811ce63493
commit 88e3d12bd3
29 changed files with 1909 additions and 935 deletions
--- a/dbgpt/rag/transformer/graph_extractor.py
+++ b/dbgpt/rag/transformer/graph_extractor.py
@@ -65,7 +65,9 @@ class GraphExtractor(LLMExtractor):
                    match = re.match(r"\((.*?)#(.*?)\)", line)
                    if match:
                        name, summary = [part.strip() for part in match.groups()]
-                        graph.upsert_vertex(Vertex(name, description=summary))
+                        graph.upsert_vertex(
+                            Vertex(name, description=summary, vertex_type="entity")
+                        )
                elif current_section == "Relationships":
                    match = re.match(r"\((.*?)#(.*?)#(.*?)#(.*?)\)", line)
                    if match:
@@ -74,7 +76,13 @@ class GraphExtractor(LLMExtractor):
                        ]
                        edge_count += 1
                        graph.append_edge(
-                            Edge(source, target, name, description=summary)
+                            Edge(
+                                source,
+                                target,
+                                name,
+                                description=summary,
+                                edge_type="relation",
+                            )
                        )

            if limit and edge_count >= limit:
--- a/dbgpt/rag/transformer/keyword_extractor.py
+++ b/dbgpt/rag/transformer/keyword_extractor.py
@@ -1,4 +1,5 @@
 """KeywordExtractor class."""
+
 import logging
 from typing import List, Optional

@@ -39,12 +40,15 @@ class KeywordExtractor(LLMExtractor):
    def _parse_response(self, text: str, limit: Optional[int] = None) -> List[str]:
        keywords = set()

-        for part in text.split(";"):
-            for s in part.strip().split(","):
-                keyword = s.strip()
-                if keyword:
-                    keywords.add(keyword)
-                    if limit and len(keywords) >= limit:
-                        return list(keywords)
+        lines = text.replace(":", "\n").split("\n")
+
+        for line in lines:
+            for part in line.split(";"):
+                for s in part.strip().split(","):
+                    keyword = s.strip()
+                    if keyword:
+                        keywords.add(keyword)
+                        if limit and len(keywords) >= limit:
+                            return list(keywords)

        return list(keywords)