mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-08-01 16:18:27 +00:00
Merge remote-tracking branch 'origin/dev_ty_06_end' into dev_ty_06_end
This commit is contained in:
commit
ad6900a578
@ -8,7 +8,7 @@ CREATE TABLE `knowledge_space` (
|
||||
`gmt_modified` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_name` (`name`) COMMENT 'index:idx_name'
|
||||
) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='knowledge space table';
|
||||
) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COMMENT='knowledge space table';
|
||||
|
||||
CREATE TABLE `knowledge_document` (
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT 'auto increment id',
|
||||
@ -25,7 +25,7 @@ CREATE TABLE `knowledge_document` (
|
||||
`gmt_modified` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_doc_name` (`doc_name`) COMMENT 'index:idx_doc_name'
|
||||
) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='knowledge document table';
|
||||
) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COMMENT='knowledge document table';
|
||||
|
||||
CREATE TABLE `document_chunk` (
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT 'auto increment id',
|
||||
@ -38,4 +38,4 @@ CREATE TABLE `document_chunk` (
|
||||
`gmt_modified` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_document_id` (`document_id`) COMMENT 'index:document_id'
|
||||
) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='knowledge document chunk detail'
|
||||
) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COMMENT='knowledge document chunk detail'
|
@ -6,7 +6,7 @@ from typing import List
|
||||
import markdown
|
||||
from bs4 import BeautifulSoup
|
||||
from langchain.schema import Document
|
||||
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter
|
||||
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
|
||||
|
||||
from pilot.configs.config import Config
|
||||
from pilot.embedding_engine import SourceEmbedding, register
|
||||
@ -37,11 +37,14 @@ class MarkdownEmbedding(SourceEmbedding):
|
||||
length_function=len,
|
||||
)
|
||||
else:
|
||||
text_splitter = SpacyTextSplitter(
|
||||
pipeline="zh_core_web_sm",
|
||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||
chunk_overlap=100,
|
||||
)
|
||||
try:
|
||||
text_splitter = SpacyTextSplitter(
|
||||
pipeline="zh_core_web_sm",
|
||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||
chunk_overlap=100,
|
||||
)
|
||||
except Exception:
|
||||
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
|
||||
return loader.load_and_split(text_splitter)
|
||||
|
||||
@register
|
||||
|
@ -4,7 +4,7 @@ from typing import List
|
||||
|
||||
from langchain.document_loaders import PyPDFLoader
|
||||
from langchain.schema import Document
|
||||
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter
|
||||
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
|
||||
|
||||
from pilot.configs.config import Config
|
||||
from pilot.embedding_engine import SourceEmbedding, register
|
||||
@ -40,11 +40,14 @@ class PDFEmbedding(SourceEmbedding):
|
||||
length_function=len,
|
||||
)
|
||||
else:
|
||||
text_splitter = SpacyTextSplitter(
|
||||
pipeline="zh_core_web_sm",
|
||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||
chunk_overlap=100,
|
||||
)
|
||||
try:
|
||||
text_splitter = SpacyTextSplitter(
|
||||
pipeline="zh_core_web_sm",
|
||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||
chunk_overlap=100,
|
||||
)
|
||||
except Exception:
|
||||
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
|
||||
return loader.load_and_split(text_splitter)
|
||||
|
||||
@register
|
||||
|
@ -4,7 +4,7 @@ from typing import List
|
||||
|
||||
from langchain.document_loaders import UnstructuredPowerPointLoader
|
||||
from langchain.schema import Document
|
||||
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter
|
||||
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
|
||||
|
||||
from pilot.configs.config import Config
|
||||
from pilot.embedding_engine import SourceEmbedding, register
|
||||
@ -38,11 +38,14 @@ class PPTEmbedding(SourceEmbedding):
|
||||
length_function=len,
|
||||
)
|
||||
else:
|
||||
text_splitter = SpacyTextSplitter(
|
||||
pipeline="zh_core_web_sm",
|
||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||
chunk_overlap=100,
|
||||
)
|
||||
try:
|
||||
text_splitter = SpacyTextSplitter(
|
||||
pipeline="zh_core_web_sm",
|
||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||
chunk_overlap=100,
|
||||
)
|
||||
except Exception:
|
||||
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
|
||||
return loader.load_and_split(text_splitter)
|
||||
|
||||
@register
|
||||
|
@ -59,7 +59,7 @@ class SourceEmbedding(ABC):
|
||||
self.vector_client = VectorStoreConnector(
|
||||
CFG.VECTOR_STORE_TYPE, self.vector_store_config
|
||||
)
|
||||
self.vector_client.load_document(docs)
|
||||
return self.vector_client.load_document(docs)
|
||||
|
||||
@register
|
||||
def similar_search(self, doc, topk):
|
||||
|
@ -3,7 +3,7 @@ from typing import List
|
||||
from bs4 import BeautifulSoup
|
||||
from langchain.document_loaders import WebBaseLoader
|
||||
from langchain.schema import Document
|
||||
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter
|
||||
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
|
||||
|
||||
from pilot.configs.config import Config
|
||||
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
|
||||
@ -33,11 +33,14 @@ class URLEmbedding(SourceEmbedding):
|
||||
length_function=len,
|
||||
)
|
||||
else:
|
||||
text_splitter = SpacyTextSplitter(
|
||||
pipeline="zh_core_web_sm",
|
||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||
chunk_overlap=100,
|
||||
)
|
||||
try:
|
||||
text_splitter = SpacyTextSplitter(
|
||||
pipeline="zh_core_web_sm",
|
||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||
chunk_overlap=100,
|
||||
)
|
||||
except Exception:
|
||||
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
|
||||
return loader.load_and_split(text_splitter)
|
||||
|
||||
@register
|
||||
|
@ -4,7 +4,7 @@ from typing import List
|
||||
|
||||
from langchain.document_loaders import UnstructuredWordDocumentLoader
|
||||
from langchain.schema import Document
|
||||
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter
|
||||
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
|
||||
|
||||
from pilot.configs.config import Config
|
||||
from pilot.embedding_engine import SourceEmbedding, register
|
||||
@ -32,11 +32,14 @@ class WordEmbedding(SourceEmbedding):
|
||||
length_function=len,
|
||||
)
|
||||
else:
|
||||
text_splitter = SpacyTextSplitter(
|
||||
pipeline="zh_core_web_sm",
|
||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||
chunk_overlap=100,
|
||||
)
|
||||
try:
|
||||
text_splitter = SpacyTextSplitter(
|
||||
pipeline="zh_core_web_sm",
|
||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||
chunk_overlap=100,
|
||||
)
|
||||
except Exception:
|
||||
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
|
||||
return loader.load_and_split(text_splitter)
|
||||
|
||||
@register
|
||||
|
@ -26,8 +26,8 @@ from pilot.openapi.api_v1.api_view_model import (
|
||||
ChatSceneVo,
|
||||
)
|
||||
from pilot.configs.config import Config
|
||||
from pilot.openapi.knowledge.knowledge_service import KnowledgeService
|
||||
from pilot.openapi.knowledge.request.knowledge_request import KnowledgeSpaceRequest
|
||||
from pilot.server.knowledge.service import KnowledgeService
|
||||
from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
|
||||
|
||||
from pilot.scene.base_chat import BaseChat
|
||||
from pilot.scene.base import ChatScene
|
||||
|
@ -23,7 +23,7 @@ from fastapi import FastAPI, applications
|
||||
from fastapi.openapi.docs import get_swagger_ui_html
|
||||
from fastapi.exceptions import RequestValidationError
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from pilot.openapi.knowledge.knowledge_controller import router as knowledge_router
|
||||
from pilot.server.knowledge.api import router as knowledge_router
|
||||
|
||||
|
||||
from pilot.openapi.api_v1.api_v1 import router as api_v1, validation_exception_handler
|
||||
|
@ -2,7 +2,7 @@ import os
|
||||
import shutil
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
from fastapi import APIRouter, File, UploadFile, Request, Form
|
||||
from fastapi import APIRouter, File, UploadFile, Form
|
||||
|
||||
from langchain.embeddings import HuggingFaceEmbeddings
|
||||
|
||||
@ -12,8 +12,8 @@ from pilot.configs.model_config import LLM_MODEL_CONFIG, KNOWLEDGE_UPLOAD_ROOT_P
|
||||
from pilot.openapi.api_v1.api_view_model import Result
|
||||
from pilot.embedding_engine.knowledge_embedding import KnowledgeEmbedding
|
||||
|
||||
from pilot.openapi.knowledge.knowledge_service import KnowledgeService
|
||||
from pilot.openapi.knowledge.request.knowledge_request import (
|
||||
from pilot.server.knowledge.service import KnowledgeService
|
||||
from pilot.server.knowledge.request.request import (
|
||||
KnowledgeQueryRequest,
|
||||
KnowledgeQueryResponse,
|
||||
KnowledgeDocumentRequest,
|
||||
@ -22,7 +22,7 @@ from pilot.openapi.knowledge.request.knowledge_request import (
|
||||
DocumentQueryRequest,
|
||||
)
|
||||
|
||||
from pilot.openapi.knowledge.request.knowledge_request import KnowledgeSpaceRequest
|
||||
from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
|
||||
|
||||
CFG = Config()
|
||||
router = APIRouter()
|
@ -21,3 +21,17 @@ class DocumentQueryResponse(BaseModel):
|
||||
total: int = None
|
||||
"""page: current page"""
|
||||
page: int = None
|
||||
|
||||
|
||||
class SpaceQueryResponse(BaseModel):
|
||||
"""data: data"""
|
||||
|
||||
name: str = None
|
||||
"""vector_type: vector type"""
|
||||
vector_type: str = None
|
||||
"""desc: description"""
|
||||
desc: str = None
|
||||
"""owner: owner"""
|
||||
owner: str = None
|
||||
"""doc_count: doc_count"""
|
||||
doc_count: int = None
|
@ -5,19 +5,19 @@ from pilot.configs.config import Config
|
||||
from pilot.configs.model_config import LLM_MODEL_CONFIG
|
||||
from pilot.embedding_engine.knowledge_embedding import KnowledgeEmbedding
|
||||
from pilot.logs import logger
|
||||
from pilot.openapi.knowledge.document_chunk_dao import (
|
||||
from pilot.server.knowledge.chunk_db import (
|
||||
DocumentChunkEntity,
|
||||
DocumentChunkDao,
|
||||
)
|
||||
from pilot.openapi.knowledge.knowledge_document_dao import (
|
||||
from pilot.server.knowledge.document_db import (
|
||||
KnowledgeDocumentDao,
|
||||
KnowledgeDocumentEntity,
|
||||
)
|
||||
from pilot.openapi.knowledge.knowledge_space_dao import (
|
||||
from pilot.server.knowledge.space_db import (
|
||||
KnowledgeSpaceDao,
|
||||
KnowledgeSpaceEntity,
|
||||
)
|
||||
from pilot.openapi.knowledge.request.knowledge_request import (
|
||||
from pilot.server.knowledge.request.request import (
|
||||
KnowledgeSpaceRequest,
|
||||
KnowledgeDocumentRequest,
|
||||
DocumentQueryRequest,
|
||||
@ -25,9 +25,9 @@ from pilot.openapi.knowledge.request.knowledge_request import (
|
||||
)
|
||||
from enum import Enum
|
||||
|
||||
from pilot.openapi.knowledge.request.knowledge_response import (
|
||||
from pilot.server.knowledge.request.response import (
|
||||
ChunkQueryResponse,
|
||||
DocumentQueryResponse,
|
||||
DocumentQueryResponse, SpaceQueryResponse,
|
||||
)
|
||||
|
||||
knowledge_space_dao = KnowledgeSpaceDao()
|
||||
@ -195,7 +195,8 @@ class KnowledgeService:
|
||||
vector_ids = client.knowledge_embedding_batch(chunk_docs)
|
||||
doc.status = SyncStatus.FINISHED.name
|
||||
doc.result = "document embedding success"
|
||||
doc.vector_ids = ",".join(vector_ids)
|
||||
if vector_ids is not None:
|
||||
doc.vector_ids = ",".join(vector_ids)
|
||||
logger.info(f"async document embedding, success:{doc.doc_name}")
|
||||
except Exception as e:
|
||||
doc.status = SyncStatus.FAILED.name
|
@ -5,7 +5,7 @@ from sqlalchemy.ext.declarative import declarative_base
|
||||
|
||||
from pilot.configs.config import Config
|
||||
|
||||
from pilot.openapi.knowledge.request.knowledge_request import KnowledgeSpaceRequest
|
||||
from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
CFG = Config()
|
@ -6,8 +6,8 @@ import sys
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
|
||||
|
||||
from pilot.embedding_engine.knowledge_type import KnowledgeType
|
||||
from pilot.openapi.knowledge.knowledge_service import KnowledgeService
|
||||
from pilot.openapi.knowledge.request.knowledge_request import KnowledgeSpaceRequest
|
||||
from pilot.server.knowledge.service import KnowledgeService
|
||||
from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
|
||||
|
||||
|
||||
from pilot.configs.config import Config
|
||||
@ -46,12 +46,18 @@ class LocalKnowledgeInit:
|
||||
docs.extend(doc)
|
||||
embedding_engine.index_to_store(docs)
|
||||
print(f"""begin create {self.vector_store_config["vector_store_name"]} space""")
|
||||
space = KnowledgeSpaceRequest
|
||||
space.name = self.vector_store_config["vector_store_name"]
|
||||
space.desc = "knowledge_init.py"
|
||||
space.vector_type = CFG.VECTOR_STORE_TYPE
|
||||
space.owner = "DB-GPT"
|
||||
knowledge_space_service.create_knowledge_space(space)
|
||||
try:
|
||||
space = KnowledgeSpaceRequest
|
||||
space.name = self.vector_store_config["vector_store_name"]
|
||||
space.desc = "knowledge_init.py"
|
||||
space.vector_type = CFG.VECTOR_STORE_TYPE
|
||||
space.owner = "DB-GPT"
|
||||
knowledge_space_service.create_knowledge_space(space)
|
||||
except Exception as e:
|
||||
if "have already named" in str(e):
|
||||
print(f"Warning: you have already named {space.name}")
|
||||
else:
|
||||
raise e
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
Loading…
Reference in New Issue
Block a user