Merge remote-tracking branch 'origin/dev_ty_06_end' into dev_ty_06_end

This commit is contained in:
tuyang.yhj 2023-07-05 11:10:26 +08:00
commit ad6900a578
19 changed files with 93 additions and 57 deletions

View File

@ -8,7 +8,7 @@ CREATE TABLE `knowledge_space` (
`gmt_modified` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
PRIMARY KEY (`id`),
KEY `idx_name` (`name`) COMMENT 'index:idx_name'
) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='knowledge space table';
) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COMMENT='knowledge space table';
CREATE TABLE `knowledge_document` (
`id` int NOT NULL AUTO_INCREMENT COMMENT 'auto increment id',
@ -25,7 +25,7 @@ CREATE TABLE `knowledge_document` (
`gmt_modified` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
PRIMARY KEY (`id`),
KEY `idx_doc_name` (`doc_name`) COMMENT 'index:idx_doc_name'
) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='knowledge document table';
) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COMMENT='knowledge document table';
CREATE TABLE `document_chunk` (
`id` int NOT NULL AUTO_INCREMENT COMMENT 'auto increment id',
@ -38,4 +38,4 @@ CREATE TABLE `document_chunk` (
`gmt_modified` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
PRIMARY KEY (`id`),
KEY `idx_document_id` (`document_id`) COMMENT 'index:document_id'
) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='knowledge document chunk detail'
) ENGINE=InnoDB AUTO_INCREMENT=100001 DEFAULT CHARSET=utf8mb4 COMMENT='knowledge document chunk detail'

View File

@ -6,7 +6,7 @@ from typing import List
import markdown
from bs4 import BeautifulSoup
from langchain.schema import Document
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
from pilot.configs.config import Config
from pilot.embedding_engine import SourceEmbedding, register
@ -37,11 +37,14 @@ class MarkdownEmbedding(SourceEmbedding):
length_function=len,
)
else:
text_splitter = SpacyTextSplitter(
pipeline="zh_core_web_sm",
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=100,
)
try:
text_splitter = SpacyTextSplitter(
pipeline="zh_core_web_sm",
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=100,
)
except Exception:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
return loader.load_and_split(text_splitter)
@register

View File

@ -4,7 +4,7 @@ from typing import List
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
from pilot.configs.config import Config
from pilot.embedding_engine import SourceEmbedding, register
@ -40,11 +40,14 @@ class PDFEmbedding(SourceEmbedding):
length_function=len,
)
else:
text_splitter = SpacyTextSplitter(
pipeline="zh_core_web_sm",
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=100,
)
try:
text_splitter = SpacyTextSplitter(
pipeline="zh_core_web_sm",
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=100,
)
except Exception:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
return loader.load_and_split(text_splitter)
@register

View File

@ -4,7 +4,7 @@ from typing import List
from langchain.document_loaders import UnstructuredPowerPointLoader
from langchain.schema import Document
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
from pilot.configs.config import Config
from pilot.embedding_engine import SourceEmbedding, register
@ -38,11 +38,14 @@ class PPTEmbedding(SourceEmbedding):
length_function=len,
)
else:
text_splitter = SpacyTextSplitter(
pipeline="zh_core_web_sm",
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=100,
)
try:
text_splitter = SpacyTextSplitter(
pipeline="zh_core_web_sm",
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=100,
)
except Exception:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
return loader.load_and_split(text_splitter)
@register

View File

@ -59,7 +59,7 @@ class SourceEmbedding(ABC):
self.vector_client = VectorStoreConnector(
CFG.VECTOR_STORE_TYPE, self.vector_store_config
)
self.vector_client.load_document(docs)
return self.vector_client.load_document(docs)
@register
def similar_search(self, doc, topk):

View File

@ -3,7 +3,7 @@ from typing import List
from bs4 import BeautifulSoup
from langchain.document_loaders import WebBaseLoader
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
from pilot.configs.config import Config
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
@ -33,11 +33,14 @@ class URLEmbedding(SourceEmbedding):
length_function=len,
)
else:
text_splitter = SpacyTextSplitter(
pipeline="zh_core_web_sm",
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=100,
)
try:
text_splitter = SpacyTextSplitter(
pipeline="zh_core_web_sm",
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=100,
)
except Exception:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
return loader.load_and_split(text_splitter)
@register

View File

@ -4,7 +4,7 @@ from typing import List
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
from pilot.configs.config import Config
from pilot.embedding_engine import SourceEmbedding, register
@ -32,11 +32,14 @@ class WordEmbedding(SourceEmbedding):
length_function=len,
)
else:
text_splitter = SpacyTextSplitter(
pipeline="zh_core_web_sm",
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=100,
)
try:
text_splitter = SpacyTextSplitter(
pipeline="zh_core_web_sm",
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=100,
)
except Exception:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
return loader.load_and_split(text_splitter)
@register

View File

@ -26,8 +26,8 @@ from pilot.openapi.api_v1.api_view_model import (
ChatSceneVo,
)
from pilot.configs.config import Config
from pilot.openapi.knowledge.knowledge_service import KnowledgeService
from pilot.openapi.knowledge.request.knowledge_request import KnowledgeSpaceRequest
from pilot.server.knowledge.service import KnowledgeService
from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
from pilot.scene.base_chat import BaseChat
from pilot.scene.base import ChatScene

View File

@ -23,7 +23,7 @@ from fastapi import FastAPI, applications
from fastapi.openapi.docs import get_swagger_ui_html
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from pilot.openapi.knowledge.knowledge_controller import router as knowledge_router
from pilot.server.knowledge.api import router as knowledge_router
from pilot.openapi.api_v1.api_v1 import router as api_v1, validation_exception_handler

View File

@ -2,7 +2,7 @@ import os
import shutil
from tempfile import NamedTemporaryFile
from fastapi import APIRouter, File, UploadFile, Request, Form
from fastapi import APIRouter, File, UploadFile, Form
from langchain.embeddings import HuggingFaceEmbeddings
@ -12,8 +12,8 @@ from pilot.configs.model_config import LLM_MODEL_CONFIG, KNOWLEDGE_UPLOAD_ROOT_P
from pilot.openapi.api_v1.api_view_model import Result
from pilot.embedding_engine.knowledge_embedding import KnowledgeEmbedding
from pilot.openapi.knowledge.knowledge_service import KnowledgeService
from pilot.openapi.knowledge.request.knowledge_request import (
from pilot.server.knowledge.service import KnowledgeService
from pilot.server.knowledge.request.request import (
KnowledgeQueryRequest,
KnowledgeQueryResponse,
KnowledgeDocumentRequest,
@ -22,7 +22,7 @@ from pilot.openapi.knowledge.request.knowledge_request import (
DocumentQueryRequest,
)
from pilot.openapi.knowledge.request.knowledge_request import KnowledgeSpaceRequest
from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
CFG = Config()
router = APIRouter()

View File

@ -21,3 +21,17 @@ class DocumentQueryResponse(BaseModel):
total: int = None
"""page: current page"""
page: int = None
class SpaceQueryResponse(BaseModel):
    """Response model carrying the attributes of one space query result.

    Every field defaults to ``None``.
    """

    # name: space name (presumably the knowledge space name; confirm against
    # the service code that builds this response)
    name: str = None
    # vector_type: vector type
    vector_type: str = None
    # desc: description
    desc: str = None
    # owner: owner
    owner: str = None
    # doc_count: doc_count (presumably the number of documents in the space;
    # confirm against the service code that fills it in)
    doc_count: int = None

View File

@ -5,19 +5,19 @@ from pilot.configs.config import Config
from pilot.configs.model_config import LLM_MODEL_CONFIG
from pilot.embedding_engine.knowledge_embedding import KnowledgeEmbedding
from pilot.logs import logger
from pilot.openapi.knowledge.document_chunk_dao import (
from pilot.server.knowledge.chunk_db import (
DocumentChunkEntity,
DocumentChunkDao,
)
from pilot.openapi.knowledge.knowledge_document_dao import (
from pilot.server.knowledge.document_db import (
KnowledgeDocumentDao,
KnowledgeDocumentEntity,
)
from pilot.openapi.knowledge.knowledge_space_dao import (
from pilot.server.knowledge.space_db import (
KnowledgeSpaceDao,
KnowledgeSpaceEntity,
)
from pilot.openapi.knowledge.request.knowledge_request import (
from pilot.server.knowledge.request.request import (
KnowledgeSpaceRequest,
KnowledgeDocumentRequest,
DocumentQueryRequest,
@ -25,9 +25,9 @@ from pilot.openapi.knowledge.request.knowledge_request import (
)
from enum import Enum
from pilot.openapi.knowledge.request.knowledge_response import (
from pilot.server.knowledge.request.response import (
ChunkQueryResponse,
DocumentQueryResponse,
DocumentQueryResponse, SpaceQueryResponse,
)
knowledge_space_dao = KnowledgeSpaceDao()
@ -195,7 +195,8 @@ class KnowledgeService:
vector_ids = client.knowledge_embedding_batch(chunk_docs)
doc.status = SyncStatus.FINISHED.name
doc.result = "document embedding success"
doc.vector_ids = ",".join(vector_ids)
if vector_ids is not None:
doc.vector_ids = ",".join(vector_ids)
logger.info(f"async document embedding, success:{doc.doc_name}")
except Exception as e:
doc.status = SyncStatus.FAILED.name

View File

@ -5,7 +5,7 @@ from sqlalchemy.ext.declarative import declarative_base
from pilot.configs.config import Config
from pilot.openapi.knowledge.request.knowledge_request import KnowledgeSpaceRequest
from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
from sqlalchemy.orm import sessionmaker
CFG = Config()

View File

@ -6,8 +6,8 @@ import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
from pilot.embedding_engine.knowledge_type import KnowledgeType
from pilot.openapi.knowledge.knowledge_service import KnowledgeService
from pilot.openapi.knowledge.request.knowledge_request import KnowledgeSpaceRequest
from pilot.server.knowledge.service import KnowledgeService
from pilot.server.knowledge.request.request import KnowledgeSpaceRequest
from pilot.configs.config import Config
@ -46,12 +46,18 @@ class LocalKnowledgeInit:
docs.extend(doc)
embedding_engine.index_to_store(docs)
print(f"""begin create {self.vector_store_config["vector_store_name"]} space""")
space = KnowledgeSpaceRequest
space.name = self.vector_store_config["vector_store_name"]
space.desc = "knowledge_init.py"
space.vector_type = CFG.VECTOR_STORE_TYPE
space.owner = "DB-GPT"
knowledge_space_service.create_knowledge_space(space)
try:
space = KnowledgeSpaceRequest
space.name = self.vector_store_config["vector_store_name"]
space.desc = "knowledge_init.py"
space.vector_type = CFG.VECTOR_STORE_TYPE
space.owner = "DB-GPT"
knowledge_space_service.create_knowledge_space(space)
except Exception as e:
if "have already named" in str(e):
print(f"Warning: you have already named {space.name}")
else:
raise e
if __name__ == "__main__":