Merge remote-tracking branch 'origin/dev_ty_06_end' into dev_ty_06_end

This commit is contained in:
tuyang.yhj 2023-07-05 11:10:26 +08:00
commit ad6900a578
19 changed files with 93 additions and 57 deletions

View File

@ -8,7 +8,7 @@ CREATE TABLE `knowledge_space` (
`gmt_modified` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time', `gmt_modified` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
PRIMARY KEY (`id`), PRIMARY KEY (`id`),
KEY `idx_name` (`name`) COMMENT 'index:idx_name' KEY `idx_name` (`name`) COMMENT 'index:idx_name'
) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='knowledge space table'; ) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COMMENT='knowledge space table';
CREATE TABLE `knowledge_document` ( CREATE TABLE `knowledge_document` (
`id` int NOT NULL AUTO_INCREMENT COMMENT 'auto increment id', `id` int NOT NULL AUTO_INCREMENT COMMENT 'auto increment id',
@ -25,7 +25,7 @@ CREATE TABLE `knowledge_document` (
`gmt_modified` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time', `gmt_modified` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
PRIMARY KEY (`id`), PRIMARY KEY (`id`),
KEY `idx_doc_name` (`doc_name`) COMMENT 'index:idx_doc_name' KEY `idx_doc_name` (`doc_name`) COMMENT 'index:idx_doc_name'
) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='knowledge document table'; ) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COMMENT='knowledge document table';
CREATE TABLE `document_chunk` ( CREATE TABLE `document_chunk` (
`id` int NOT NULL AUTO_INCREMENT COMMENT 'auto increment id', `id` int NOT NULL AUTO_INCREMENT COMMENT 'auto increment id',
@ -38,4 +38,4 @@ CREATE TABLE `document_chunk` (
`gmt_modified` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time', `gmt_modified` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
PRIMARY KEY (`id`), PRIMARY KEY (`id`),
KEY `idx_document_id` (`document_id`) COMMENT 'index:document_id' KEY `idx_document_id` (`document_id`) COMMENT 'index:document_id'
) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='knowledge document chunk detail' ) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COMMENT='knowledge document chunk detail'

View File

@ -6,7 +6,7 @@ from typing import List
import markdown import markdown
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from langchain.schema import Document from langchain.schema import Document
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
from pilot.configs.config import Config from pilot.configs.config import Config
from pilot.embedding_engine import SourceEmbedding, register from pilot.embedding_engine import SourceEmbedding, register
@ -37,11 +37,14 @@ class MarkdownEmbedding(SourceEmbedding):
length_function=len, length_function=len,
) )
else: else:
try:
text_splitter = SpacyTextSplitter( text_splitter = SpacyTextSplitter(
pipeline="zh_core_web_sm", pipeline="zh_core_web_sm",
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=100, chunk_overlap=100,
) )
except Exception:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
return loader.load_and_split(text_splitter) return loader.load_and_split(text_splitter)
@register @register

View File

@ -4,7 +4,7 @@ from typing import List
from langchain.document_loaders import PyPDFLoader from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document from langchain.schema import Document
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
from pilot.configs.config import Config from pilot.configs.config import Config
from pilot.embedding_engine import SourceEmbedding, register from pilot.embedding_engine import SourceEmbedding, register
@ -40,11 +40,14 @@ class PDFEmbedding(SourceEmbedding):
length_function=len, length_function=len,
) )
else: else:
try:
text_splitter = SpacyTextSplitter( text_splitter = SpacyTextSplitter(
pipeline="zh_core_web_sm", pipeline="zh_core_web_sm",
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=100, chunk_overlap=100,
) )
except Exception:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
return loader.load_and_split(text_splitter) return loader.load_and_split(text_splitter)
@register @register

View File

@ -4,7 +4,7 @@ from typing import List
from langchain.document_loaders import UnstructuredPowerPointLoader from langchain.document_loaders import UnstructuredPowerPointLoader
from langchain.schema import Document from langchain.schema import Document
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
from pilot.configs.config import Config from pilot.configs.config import Config
from pilot.embedding_engine import SourceEmbedding, register from pilot.embedding_engine import SourceEmbedding, register
@ -38,11 +38,14 @@ class PPTEmbedding(SourceEmbedding):
length_function=len, length_function=len,
) )
else: else:
try:
text_splitter = SpacyTextSplitter( text_splitter = SpacyTextSplitter(
pipeline="zh_core_web_sm", pipeline="zh_core_web_sm",
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=100, chunk_overlap=100,
) )
except Exception:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
return loader.load_and_split(text_splitter) return loader.load_and_split(text_splitter)
@register @register

View File

@ -59,7 +59,7 @@ class SourceEmbedding(ABC):
self.vector_client = VectorStoreConnector( self.vector_client = VectorStoreConnector(
CFG.VECTOR_STORE_TYPE, self.vector_store_config CFG.VECTOR_STORE_TYPE, self.vector_store_config
) )
self.vector_client.load_document(docs) return self.vector_client.load_document(docs)
@register @register
def similar_search(self, doc, topk): def similar_search(self, doc, topk):

View File

@ -3,7 +3,7 @@ from typing import List
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from langchain.document_loaders import WebBaseLoader from langchain.document_loaders import WebBaseLoader
from langchain.schema import Document from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
from pilot.configs.config import Config from pilot.configs.config import Config
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
@ -33,11 +33,14 @@ class URLEmbedding(SourceEmbedding):
length_function=len, length_function=len,
) )
else: else:
try:
text_splitter = SpacyTextSplitter( text_splitter = SpacyTextSplitter(
pipeline="zh_core_web_sm", pipeline="zh_core_web_sm",
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=100, chunk_overlap=100,
) )
except Exception:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
return loader.load_and_split(text_splitter) return loader.load_and_split(text_splitter)
@register @register

View File

@ -4,7 +4,7 @@ from typing import List
from langchain.document_loaders import UnstructuredWordDocumentLoader from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.schema import Document from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
from pilot.configs.config import Config from pilot.configs.config import Config
from pilot.embedding_engine import SourceEmbedding, register from pilot.embedding_engine import SourceEmbedding, register
@ -32,11 +32,14 @@ class WordEmbedding(SourceEmbedding):
length_function=len, length_function=len,
) )
else: else:
try:
text_splitter = SpacyTextSplitter( text_splitter = SpacyTextSplitter(
pipeline="zh_core_web_sm", pipeline="zh_core_web_sm",
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=100, chunk_overlap=100,
) )
except Exception:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
return loader.load_and_split(text_splitter) return loader.load_and_split(text_splitter)
@register @register

View File

@ -26,8 +26,8 @@ from pilot.openapi.api_v1.api_view_model import (
ChatSceneVo, ChatSceneVo,
) )
from pilot.configs.config import Config from pilot.configs.config import Config
from pilot.openapi.knowledge.knowledge_service import KnowledgeService from pilot.server.knowledge.service import KnowledgeService
from pilot.openapi.knowledge.request.knowledge_request import KnowledgeSpaceRequest from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
from pilot.scene.base_chat import BaseChat from pilot.scene.base_chat import BaseChat
from pilot.scene.base import ChatScene from pilot.scene.base import ChatScene

View File

@ -23,7 +23,7 @@ from fastapi import FastAPI, applications
from fastapi.openapi.docs import get_swagger_ui_html from fastapi.openapi.docs import get_swagger_ui_html
from fastapi.exceptions import RequestValidationError from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from pilot.openapi.knowledge.knowledge_controller import router as knowledge_router from pilot.server.knowledge.api import router as knowledge_router
from pilot.openapi.api_v1.api_v1 import router as api_v1, validation_exception_handler from pilot.openapi.api_v1.api_v1 import router as api_v1, validation_exception_handler

View File

@ -2,7 +2,7 @@ import os
import shutil import shutil
from tempfile import NamedTemporaryFile from tempfile import NamedTemporaryFile
from fastapi import APIRouter, File, UploadFile, Request, Form from fastapi import APIRouter, File, UploadFile, Form
from langchain.embeddings import HuggingFaceEmbeddings from langchain.embeddings import HuggingFaceEmbeddings
@ -12,8 +12,8 @@ from pilot.configs.model_config import LLM_MODEL_CONFIG, KNOWLEDGE_UPLOAD_ROOT_P
from pilot.openapi.api_v1.api_view_model import Result from pilot.openapi.api_v1.api_view_model import Result
from pilot.embedding_engine.knowledge_embedding import KnowledgeEmbedding from pilot.embedding_engine.knowledge_embedding import KnowledgeEmbedding
from pilot.openapi.knowledge.knowledge_service import KnowledgeService from pilot.server.knowledge.service import KnowledgeService
from pilot.openapi.knowledge.request.knowledge_request import ( from pilot.server.knowledge.request.request import (
KnowledgeQueryRequest, KnowledgeQueryRequest,
KnowledgeQueryResponse, KnowledgeQueryResponse,
KnowledgeDocumentRequest, KnowledgeDocumentRequest,
@ -22,7 +22,7 @@ from pilot.openapi.knowledge.request.knowledge_request import (
DocumentQueryRequest, DocumentQueryRequest,
) )
from pilot.openapi.knowledge.request.knowledge_request import KnowledgeSpaceRequest from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
CFG = Config() CFG = Config()
router = APIRouter() router = APIRouter()

View File

@ -21,3 +21,17 @@ class DocumentQueryResponse(BaseModel):
total: int = None total: int = None
"""page: current page""" """page: current page"""
page: int = None page: int = None
class SpaceQueryResponse(BaseModel):
    """Response model describing a single knowledge space.

    Every field defaults to ``None``.
    """

    # NOTE: the per-field notes are `#` comments on purpose. A bare string
    # placed first in the class body would become the class docstring.

    # name: space name
    name: str = None
    # vector_type: vector type
    vector_type: str = None
    # desc: description
    desc: str = None
    # owner: owner
    owner: str = None
    # doc_count: presumably the number of documents in the space -- TODO confirm
    doc_count: int = None

View File

@ -5,19 +5,19 @@ from pilot.configs.config import Config
from pilot.configs.model_config import LLM_MODEL_CONFIG from pilot.configs.model_config import LLM_MODEL_CONFIG
from pilot.embedding_engine.knowledge_embedding import KnowledgeEmbedding from pilot.embedding_engine.knowledge_embedding import KnowledgeEmbedding
from pilot.logs import logger from pilot.logs import logger
from pilot.openapi.knowledge.document_chunk_dao import ( from pilot.server.knowledge.chunk_db import (
DocumentChunkEntity, DocumentChunkEntity,
DocumentChunkDao, DocumentChunkDao,
) )
from pilot.openapi.knowledge.knowledge_document_dao import ( from pilot.server.knowledge.document_db import (
KnowledgeDocumentDao, KnowledgeDocumentDao,
KnowledgeDocumentEntity, KnowledgeDocumentEntity,
) )
from pilot.openapi.knowledge.knowledge_space_dao import ( from pilot.server.knowledge.space_db import (
KnowledgeSpaceDao, KnowledgeSpaceDao,
KnowledgeSpaceEntity, KnowledgeSpaceEntity,
) )
from pilot.openapi.knowledge.request.knowledge_request import ( from pilot.server.knowledge.request.request import (
KnowledgeSpaceRequest, KnowledgeSpaceRequest,
KnowledgeDocumentRequest, KnowledgeDocumentRequest,
DocumentQueryRequest, DocumentQueryRequest,
@ -25,9 +25,9 @@ from pilot.openapi.knowledge.request.knowledge_request import (
) )
from enum import Enum from enum import Enum
from pilot.openapi.knowledge.request.knowledge_response import ( from pilot.server.knowledge.request.response import (
ChunkQueryResponse, ChunkQueryResponse,
DocumentQueryResponse, DocumentQueryResponse, SpaceQueryResponse,
) )
knowledge_space_dao = KnowledgeSpaceDao() knowledge_space_dao = KnowledgeSpaceDao()
@ -195,6 +195,7 @@ class KnowledgeService:
vector_ids = client.knowledge_embedding_batch(chunk_docs) vector_ids = client.knowledge_embedding_batch(chunk_docs)
doc.status = SyncStatus.FINISHED.name doc.status = SyncStatus.FINISHED.name
doc.result = "document embedding success" doc.result = "document embedding success"
if vector_ids is not None:
doc.vector_ids = ",".join(vector_ids) doc.vector_ids = ",".join(vector_ids)
logger.info(f"async document embedding, success:{doc.doc_name}") logger.info(f"async document embedding, success:{doc.doc_name}")
except Exception as e: except Exception as e:

View File

@ -5,7 +5,7 @@ from sqlalchemy.ext.declarative import declarative_base
from pilot.configs.config import Config from pilot.configs.config import Config
from pilot.openapi.knowledge.request.knowledge_request import KnowledgeSpaceRequest from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
from sqlalchemy.orm import sessionmaker from sqlalchemy.orm import sessionmaker
CFG = Config() CFG = Config()

View File

@ -6,8 +6,8 @@ import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
from pilot.embedding_engine.knowledge_type import KnowledgeType from pilot.embedding_engine.knowledge_type import KnowledgeType
from pilot.openapi.knowledge.knowledge_service import KnowledgeService from pilot.server.knowledge.service import KnowledgeService
from pilot.openapi.knowledge.request.knowledge_request import KnowledgeSpaceRequest from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
from pilot.configs.config import Config from pilot.configs.config import Config
@ -46,12 +46,18 @@ class LocalKnowledgeInit:
docs.extend(doc) docs.extend(doc)
embedding_engine.index_to_store(docs) embedding_engine.index_to_store(docs)
print(f"""begin create {self.vector_store_config["vector_store_name"]} space""") print(f"""begin create {self.vector_store_config["vector_store_name"]} space""")
try:
space = KnowledgeSpaceRequest space = KnowledgeSpaceRequest
space.name = self.vector_store_config["vector_store_name"] space.name = self.vector_store_config["vector_store_name"]
space.desc = "knowledge_init.py" space.desc = "knowledge_init.py"
space.vector_type = CFG.VECTOR_STORE_TYPE space.vector_type = CFG.VECTOR_STORE_TYPE
space.owner = "DB-GPT" space.owner = "DB-GPT"
knowledge_space_service.create_knowledge_space(space) knowledge_space_service.create_knowledge_space(space)
except Exception as e:
if "have already named" in str(e):
print(f"Warning: you have already named {space.name}")
else:
raise e
if __name__ == "__main__": if __name__ == "__main__":