feat: Add Knowledge Process Workflow (#2210)

This commit is contained in:
Aries-ckt
2024-12-18 11:16:30 +08:00
committed by GitHub
parent 3745d7411d
commit b05febbf77
23 changed files with 7217 additions and 8 deletions

View File

@@ -6,22 +6,136 @@ import uuid
from typing import List, Optional, Tuple
from dbgpt._private.pydantic import ConfigDict, Field
from dbgpt.core import Chunk
from dbgpt.core import Chunk, LLMClient
from dbgpt.core.awel.flow import Parameter, ResourceCategory, register_resource
from dbgpt.rag.transformer.community_summarizer import CommunitySummarizer
from dbgpt.rag.transformer.graph_extractor import GraphExtractor
from dbgpt.storage.knowledge_graph.base import ParagraphChunk
from dbgpt.storage.knowledge_graph.community.community_store import CommunityStore
from dbgpt.storage.knowledge_graph.knowledge_graph import (
GRAPH_PARAMETERS,
BuiltinKnowledgeGraph,
BuiltinKnowledgeGraphConfig,
)
from dbgpt.storage.vector_store.base import VectorStoreConfig
from dbgpt.storage.vector_store.factory import VectorStoreFactory
from dbgpt.storage.vector_store.filters import MetadataFilters
from dbgpt.util.i18n_utils import _
logger = logging.getLogger(__name__)
# Registers the community-summary KG config as an AWEL flow resource so it can
# be assembled from the visual workflow editor.  Builds on the shared
# GRAPH_PARAMETERS and adds extraction / community-search tuning knobs.
@register_resource(
    _("Community Summary KG Config"),
    "community_summary_kg_config",
    category=ResourceCategory.KNOWLEDGE_GRAPH,
    # FIX: normalized capitalization of the user-facing description.
    description=_("Community Summary KG Config."),
    parameters=[
        *GRAPH_PARAMETERS,
        Parameter.build_from(
            _("Knowledge Graph Type"),
            "graph_store_type",
            str,
            description=_("graph store type."),
            optional=True,
            default="TuGraph",
        ),
        Parameter.build_from(
            _("LLM Client"),
            "llm_client",
            LLMClient,
            # Required parameter: no ``optional``/``default`` given.
            description=_("llm client for extract graph triplets."),
        ),
        Parameter.build_from(
            _("LLM Model Name"),
            "model_name",
            str,
            description=_("llm model name."),
            optional=True,
            default=None,
        ),
        Parameter.build_from(
            _("Vector Store Type"),
            "vector_store_type",
            str,
            description=_("vector store type."),
            optional=True,
            default="Chroma",
        ),
        Parameter.build_from(
            _("Topk of Knowledge Graph Extract"),
            "extract_topk",
            int,
            description=_("Topk of knowledge graph extract"),
            optional=True,
            default=5,
        ),
        Parameter.build_from(
            _("Recall Score of Knowledge Graph Extract"),
            "extract_score_threshold",
            float,
            description=_("Recall score of knowledge graph extract"),
            optional=True,
            default=0.3,
        ),
        # FIX: label/description previously said "Recall Score of Community
        # Search" (copy-paste from community_score_threshold below), but this
        # int parameter is the top-k limit of community search.
        Parameter.build_from(
            _("Topk of Community Search in Knowledge Graph"),
            "community_topk",
            int,
            description=_("Topk of community search in knowledge graph"),
            optional=True,
            default=50,
        ),
        Parameter.build_from(
            _("Recall Score of Community Search in Knowledge Graph"),
            "community_score_threshold",
            float,
            description=_("Recall score of community search in knowledge graph"),
            optional=True,
            default=0.0,
        ),
        # FIX: label previously copied from document_graph_enabled; this flag
        # controls the triplet graph search (see its description).
        Parameter.build_from(
            _("Enable the graph search for triplets"),
            "triplet_graph_enabled",
            bool,
            description=_("Enable the graph search for triplets"),
            optional=True,
            default=True,
        ),
        Parameter.build_from(
            _("Enable the graph search for documents and chunks"),
            "document_graph_enabled",
            bool,
            description=_("Enable the graph search for documents and chunks"),
            optional=True,
            default=True,
        ),
        Parameter.build_from(
            _("Top size of knowledge graph chunk search"),
            "knowledge_graph_chunk_search_top_size",
            int,
            description=_("Top size of knowledge graph chunk search"),
            optional=True,
            default=5,
        ),
        Parameter.build_from(
            _("Batch size of triplets extraction from the text"),
            "knowledge_graph_extraction_batch_size",
            int,
            description=_("Batch size of triplets extraction from the text"),
            optional=True,
            default=20,
        ),
        Parameter.build_from(
            _("Batch size of parallel community building process"),
            "community_summary_batch_size",
            int,
            # FIX: typo "TBatch size" -> "Batch size".
            description=_("Batch size of parallel community building process"),
            optional=True,
            default=20,
        ),
    ],
)
class CommunitySummaryKnowledgeGraphConfig(BuiltinKnowledgeGraphConfig):
    """Community summary knowledge graph config."""
@@ -80,6 +194,22 @@ class CommunitySummaryKnowledgeGraphConfig(BuiltinKnowledgeGraphConfig):
)
# Registers ``CommunitySummaryKnowledgeGraph`` as an AWEL flow resource taking
# a single optional ``config`` object.
# NOTE(review): the ``config`` parameter is typed as the base
# ``BuiltinKnowledgeGraphConfig`` rather than
# ``CommunitySummaryKnowledgeGraphConfig`` — confirm this widening is intended.
@register_resource(
    _("Community Summary Knowledge Graph"),
    "community_summary_knowledge_graph",
    category=ResourceCategory.KNOWLEDGE_GRAPH,
    description=_("Community Summary Knowledge Graph."),
    parameters=[
        Parameter.build_from(
            _("Community Summary Knowledge Graph Config."),
            "config",
            BuiltinKnowledgeGraphConfig,
            description=_("Community Summary Knowledge Graph Config."),
            optional=True,
            default=None,  # presumably a default config is built when None — confirm in __init__
        ),
    ],
)
class CommunitySummaryKnowledgeGraph(BuiltinKnowledgeGraph):
    """Community summary knowledge graph class."""

View File

@@ -6,7 +6,8 @@ import os
from typing import List, Optional
from dbgpt._private.pydantic import ConfigDict, Field
from dbgpt.core import Chunk, LLMClient
from dbgpt.core import Chunk, Embeddings, LLMClient
from dbgpt.core.awel.flow import Parameter, ResourceCategory, register_resource
from dbgpt.rag.transformer.keyword_extractor import KeywordExtractor
from dbgpt.rag.transformer.triplet_extractor import TripletExtractor
from dbgpt.storage.graph_store.base import GraphStoreBase, GraphStoreConfig
@@ -16,10 +17,87 @@ from dbgpt.storage.knowledge_graph.base import KnowledgeGraphBase, KnowledgeGrap
from dbgpt.storage.knowledge_graph.community.base import GraphStoreAdapter
from dbgpt.storage.knowledge_graph.community.factory import GraphStoreAdapterFactory
from dbgpt.storage.vector_store.filters import MetadataFilters
from dbgpt.util.i18n_utils import _
logger = logging.getLogger(__name__)
# Shared AWEL flow parameters common to graph-store backed knowledge graph
# configurations; spliced (``*GRAPH_PARAMETERS``) into the parameter lists of
# the registered config resources in this package.
GRAPH_PARAMETERS = [
    Parameter.build_from(
        _("Graph Name"),
        "name",
        str,
        description=_("The name of Graph, if not set, will use the default name."),
        optional=True,
        default="dbgpt_collection",
    ),
    Parameter.build_from(
        _("Embedding Function"),
        "embedding_fn",
        Embeddings,
        description=_(
            "The embedding function of vector store, if not set, will use "
            "the default embedding function."
        ),
        optional=True,
        default=None,
    ),
    Parameter.build_from(
        _("Max Chunks Once Load"),
        "max_chunks_once_load",
        int,
        description=_(
            "The max number of chunks to load at once. If your document is "
            "large, you can set this value to a larger number to speed up the loading "
            "process. Default is 10."
        ),
        optional=True,
        default=10,
    ),
    Parameter.build_from(
        _("Max Threads"),
        "max_threads",
        int,
        description=_(
            "The max number of threads to use. Default is 1. If you set "
            "this bigger than 1, please make sure your vector store is thread-safe."
        ),
        optional=True,
        default=1,
    ),
]
# Registers ``BuiltinKnowledgeGraphConfig`` as an AWEL flow resource so it can
# be configured from the visual workflow editor; extends the shared
# GRAPH_PARAMETERS with graph-store and LLM options.
@register_resource(
    _("Builtin Graph Config"),
    "knowledge_graph_config",
    category=ResourceCategory.KNOWLEDGE_GRAPH,
    description=_("knowledge graph config."),
    parameters=[
        *GRAPH_PARAMETERS,
        Parameter.build_from(
            _("Knowledge Graph Type"),
            "graph_store_type",
            str,
            description=_("graph store type."),
            optional=True,
            default="TuGraph",
        ),
        Parameter.build_from(
            _("LLM Client"),
            "llm_client",
            LLMClient,
            # Required parameter: no ``optional``/``default`` given.
            description=_("llm client for extract graph triplets."),
        ),
        Parameter.build_from(
            _("LLM Model Name"),
            "model_name",
            str,
            description=_("llm model name."),
            optional=True,
            default=None,
        ),
    ],
)
class BuiltinKnowledgeGraphConfig(KnowledgeGraphConfig):
    """Builtin knowledge graph config."""
@@ -34,6 +112,22 @@ class BuiltinKnowledgeGraphConfig(KnowledgeGraphConfig):
)
# Registers ``BuiltinKnowledgeGraph`` as an AWEL flow resource taking a single
# optional ``config`` object.
@register_resource(
    _("Builtin Knowledge Graph"),
    "builtin_knowledge_graph",
    category=ResourceCategory.KNOWLEDGE_GRAPH,
    description=_("Builtin Knowledge Graph."),
    parameters=[
        Parameter.build_from(
            _("Builtin Knowledge Graph Config."),
            "config",
            BuiltinKnowledgeGraphConfig,
            description=_("Builtin Knowledge Graph Config."),
            optional=True,
            default=None,  # presumably a default config is built when None — confirm in __init__
        ),
    ],
)
class BuiltinKnowledgeGraph(KnowledgeGraphBase):
    """Builtin knowledge graph class."""