feat: add document structure into GraphRAG (#2033)

Co-authored-by: Appointat <kuda.czk@antgroup.com>
Co-authored-by: tpoisonooo <khj.application@aliyun.com>
Co-authored-by: vritser <vritser@163.com>
This commit is contained in:
lipengfei
2024-10-18 22:03:08 +08:00
committed by GitHub
parent 811ce63493
commit 88e3d12bd3
29 changed files with 1909 additions and 935 deletions

View File

@@ -65,7 +65,9 @@ class GraphExtractor(LLMExtractor):
match = re.match(r"\((.*?)#(.*?)\)", line)
if match:
name, summary = [part.strip() for part in match.groups()]
graph.upsert_vertex(Vertex(name, description=summary))
graph.upsert_vertex(
Vertex(name, description=summary, vertex_type="entity")
)
elif current_section == "Relationships":
match = re.match(r"\((.*?)#(.*?)#(.*?)#(.*?)\)", line)
if match:
@@ -74,7 +76,13 @@ class GraphExtractor(LLMExtractor):
]
edge_count += 1
graph.append_edge(
Edge(source, target, name, description=summary)
Edge(
source,
target,
name,
description=summary,
edge_type="relation",
)
)
if limit and edge_count >= limit:

View File

@@ -1,4 +1,5 @@
"""KeywordExtractor class."""
import logging
from typing import List, Optional
@@ -39,12 +40,15 @@ class KeywordExtractor(LLMExtractor):
def _parse_response(self, text: str, limit: Optional[int] = None) -> List[str]:
# Turn a raw response string into a list of distinct, non-empty keywords.
#
# text:  the string to split into keywords.
# limit: optional cap on the number of keywords; a falsy value (None or 0)
#        disables the cap because of the `if limit and ...` guard.
# Returns list(keywords); the keywords are collected in a set, so duplicates
# collapse and the order of the returned list is unspecified.
#
# NOTE(review): this span is rendered from a diff hunk with indentation and
# +/- markers stripped. The loop over text.split(";") directly below looks
# like the pre-change body, and the `lines = ...` loop after it looks like its
# replacement — confirm against the real file before relying on this listing.
keywords = set()
# Split on ";" first, then on "," inside each part.
for part in text.split(";"):
for s in part.strip().split(","):
keyword = s.strip()
# Skip pieces that are empty after stripping whitespace.
if keyword:
keywords.add(keyword)
# Stop early once the cap is reached.
if limit and len(keywords) >= limit:
return list(keywords)
# Treat ":" as a line break, then split on newlines, so ":" and "\n" act as
# separators in addition to ";" and ",".
lines = text.replace(":", "\n").split("\n")
for line in lines:
for part in line.split(";"):
for s in part.strip().split(","):
keyword = s.strip()
# Skip pieces that are empty after stripping whitespace.
if keyword:
keywords.add(keyword)
# Stop early once the cap is reached.
if limit and len(keywords) >= limit:
return list(keywords)
return list(keywords)