Merge branch 'zylon-ai:main' into update-ui-include-model-info-#1647
commit 97b8999933
@@ -10,5 +10,4 @@ services:
     environment:
       PORT: 8080
       PGPT_PROFILES: docker
-      PGPT_MODE: local
+      PGPT_MODE: llamacpp
-
@@ -8,14 +8,14 @@ The clients are kept up to date automatically, so we encourage you to use the la
 
 <Cards>
   <Card
-    title="Node.js/TypeScript"
+    title="Node.js/TypeScript - WIP"
     icon="fa-brands fa-node"
     href="https://github.com/imartinez/privateGPT-typescript"
   />
   <Card
-    title="Python"
+    title="Python - Ready!"
     icon="fa-brands fa-python"
-    href="https://github.com/imartinez/privateGPT-python"
+    href="https://github.com/imartinez/pgpt_python"
   />
   <br />
 </Cards>
@@ -24,12 +24,12 @@ The clients are kept up to date automatically, so we encourage you to use the la
 
 <Cards>
   <Card
-    title="Java"
+    title="Java - WIP"
     icon="fa-brands fa-java"
     href="https://github.com/imartinez/privateGPT-java"
   />
   <Card
-    title="Go"
+    title="Go - WIP"
     icon="fa-brands fa-golang"
     href="https://github.com/imartinez/privateGPT-go"
   />
@@ -62,6 +62,7 @@ The following ingestion mode exist:
 * `simple`: historic behavior, ingest one document at a time, sequentially
 * `batch`: read, parse, and embed multiple documents using batches (batch read, and then batch parse, and then batch embed)
 * `parallel`: read, parse, and embed multiple documents in parallel. This is the fastest ingestion mode for local setup.
+* `pipeline`: Alternative to parallel.
 To change the ingestion mode, you can use the `embedding.ingest_mode` configuration value. The default value is `simple`.
 
 To configure the number of workers used for parallel or batched ingestion, you can use
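The `pipeline` mode added here overlaps file parsing with embedding by pushing parsed documents through a bounded queue to embedding workers. A minimal standalone sketch of that producer/consumer pattern, with a single consumer thread for brevity (the real component adds a worker pool and a semaphore on top; `parse_file` and `embed_documents` are hypothetical stand-ins, not the PrivateGPT API):

```python
import threading
from queue import Queue

def parse_file(path: str) -> list[str]:
    return [f"document parsed from {path}"]  # stand-in for file parsing

def embed_documents(docs: list[str]) -> list[list[float]]:
    return [[0.0] * 8 for _ in docs]  # stand-in for the embedding model

# Bounded queue: the parser blocks when embedding falls behind, capping memory use.
doc_q: Queue[list[str] | None] = Queue(maxsize=20)

def embed_worker() -> None:
    while True:
        docs = doc_q.get()
        try:
            if docs is None:  # sentinel: no more files
                break
            embed_documents(docs)
        finally:
            doc_q.task_done()

threading.Thread(target=embed_worker, daemon=True).start()
for path in ["a.txt", "b.txt"]:
    doc_q.put(parse_file(path))  # producer: read and parse while the worker embeds
doc_q.put(None)
doc_q.join()  # wait until every queued batch has been processed
```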
@@ -1,4 +1,4 @@
 {
   "organization": "privategpt",
-  "version": "0.17.2"
+  "version": "0.19.10"
 }
@@ -6,6 +6,7 @@ import multiprocessing.pool
 import os
 import threading
 from pathlib import Path
+from queue import Queue
 from typing import Any
 
 from llama_index.core.data_structs import IndexDict
@@ -13,12 +14,13 @@ from llama_index.core.embeddings.utils import EmbedType
 from llama_index.core.indices import VectorStoreIndex, load_index_from_storage
 from llama_index.core.indices.base import BaseIndex
 from llama_index.core.ingestion import run_transformations
-from llama_index.core.schema import Document, TransformComponent
+from llama_index.core.schema import BaseNode, Document, TransformComponent
 from llama_index.core.storage import StorageContext
 
 from private_gpt.components.ingest.ingest_helper import IngestionHelper
 from private_gpt.paths import local_data_path
 from private_gpt.settings.settings import Settings
+from private_gpt.utils.eta import eta
 
 logger = logging.getLogger(__name__)
 
@@ -314,6 +316,170 @@ class ParallelizedIngestComponent(BaseIngestComponentWithIndex):
         self._file_to_documents_work_pool.terminate()
 
 
+class PipelineIngestComponent(BaseIngestComponentWithIndex):
+    """Pipeline ingestion - keeping the embedding worker pool as busy as possible.
+
+    This class implements a threaded ingestion pipeline, which comprises two threads
+    and two queues. The primary thread is responsible for reading and parsing files
+    into documents. These documents are then placed into a queue, which is
+    distributed to a pool of worker processes for embedding computation. After
+    embedding, the documents are transferred to another queue where they are
+    accumulated until a threshold is reached. Upon reaching this threshold, the
+    accumulated documents are flushed to the document store, index, and vector
+    store.
+
+    Exception handling ensures robustness against erroneous files. However, in the
+    pipelined design, one error can lead to the discarding of multiple files. Any
+    discarded files will be reported.
+    """
+
+    NODE_FLUSH_COUNT = 5000  # Save the index every # nodes.
+
+    def __init__(
+        self,
+        storage_context: StorageContext,
+        embed_model: EmbedType,
+        transformations: list[TransformComponent],
+        count_workers: int,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(storage_context, embed_model, transformations, *args, **kwargs)
+        self.count_workers = count_workers
+        assert (
+            len(self.transformations) >= 2
+        ), "Embeddings must be in the transformations"
+        assert count_workers > 0, "count_workers must be > 0"
+        self.count_workers = count_workers
+        # We are doing our own multiprocessing
+        # To do not collide with the multiprocessing of huggingface, we disable it
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+        # doc_q stores parsed files as Document chunks.
+        # Using a shallow queue causes the filesystem parser to block
+        # when it reaches capacity. This ensures it doesn't outpace the
+        # computationally intensive embeddings phase, avoiding unnecessary
+        # memory consumption. The semaphore is used to bound the async worker
+        # embedding computations to cause the doc Q to fill and block.
+        self.doc_semaphore = multiprocessing.Semaphore(
+            self.count_workers
+        )  # limit the doc queue to # items.
+        self.doc_q: Queue[tuple[str, str | None, list[Document] | None]] = Queue(20)
+        # node_q stores documents parsed into nodes (embeddings).
+        # Larger queue size so we don't block the embedding workers during a slow
+        # index update.
+        self.node_q: Queue[
+            tuple[str, str | None, list[Document] | None, list[BaseNode] | None]
+        ] = Queue(40)
+        threading.Thread(target=self._doc_to_node, daemon=True).start()
+        threading.Thread(target=self._write_nodes, daemon=True).start()
+
+    def _doc_to_node(self) -> None:
+        # Parse documents into nodes
+        with multiprocessing.pool.ThreadPool(processes=self.count_workers) as pool:
+            while True:
+                try:
+                    cmd, file_name, documents = self.doc_q.get(
+                        block=True
+                    )  # Documents for a file
+                    if cmd == "process":
+                        # Push CPU/GPU embedding work to the worker pool
+                        # Acquire semaphore to control access to worker pool
+                        self.doc_semaphore.acquire()
+                        pool.apply_async(
+                            self._doc_to_node_worker, (file_name, documents)
+                        )
+                    elif cmd == "quit":
+                        break
+                finally:
+                    if cmd != "process":
+                        self.doc_q.task_done()  # unblock Q joins
+
+    def _doc_to_node_worker(self, file_name: str, documents: list[Document]) -> None:
+        # CPU/GPU intensive work in its own process
+        try:
+            nodes = run_transformations(
+                documents,  # type: ignore[arg-type]
+                self.transformations,
+                show_progress=self.show_progress,
+            )
+            self.node_q.put(("process", file_name, documents, nodes))
+        finally:
+            self.doc_semaphore.release()
+            self.doc_q.task_done()  # unblock Q joins
+
+    def _save_docs(
+        self, files: list[str], documents: list[Document], nodes: list[BaseNode]
+    ) -> None:
+        try:
+            logger.info(
+                f"Saving {len(files)} files ({len(documents)} documents / {len(nodes)} nodes)"
+            )
+            self._index.insert_nodes(nodes)
+            for document in documents:
+                self._index.docstore.set_document_hash(
+                    document.get_doc_id(), document.hash
+                )
+            self._save_index()
+        except Exception:
+            # Tell the user so they can investigate these files
+            logger.exception(f"Processing files {files}")
+        finally:
+            # Clearing work, even on exception, maintains a clean state.
+            nodes.clear()
+            documents.clear()
+            files.clear()
+
+    def _write_nodes(self) -> None:
+        # Save nodes to index. I/O intensive.
+        node_stack: list[BaseNode] = []
+        doc_stack: list[Document] = []
+        file_stack: list[str] = []
+        while True:
+            try:
+                cmd, file_name, documents, nodes = self.node_q.get(block=True)
+                if cmd in ("flush", "quit"):
+                    if file_stack:
+                        self._save_docs(file_stack, doc_stack, node_stack)
+                    if cmd == "quit":
+                        break
+                elif cmd == "process":
+                    node_stack.extend(nodes)  # type: ignore[arg-type]
+                    doc_stack.extend(documents)  # type: ignore[arg-type]
+                    file_stack.append(file_name)  # type: ignore[arg-type]
+                    # Constant saving is heavy on I/O - accumulate to a threshold
+                    if len(node_stack) >= self.NODE_FLUSH_COUNT:
+                        self._save_docs(file_stack, doc_stack, node_stack)
+            finally:
+                self.node_q.task_done()
+
+    def _flush(self) -> None:
+        self.doc_q.put(("flush", None, None))
+        self.doc_q.join()
+        self.node_q.put(("flush", None, None, None))
+        self.node_q.join()
+
+    def ingest(self, file_name: str, file_data: Path) -> list[Document]:
+        documents = IngestionHelper.transform_file_into_documents(file_name, file_data)
+        self.doc_q.put(("process", file_name, documents))
+        self._flush()
+        return documents
+
+    def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[Document]:
+        docs = []
+        for file_name, file_data in eta(files):
+            try:
+                documents = IngestionHelper.transform_file_into_documents(
+                    file_name, file_data
+                )
+                self.doc_q.put(("process", file_name, documents))
+                docs.extend(documents)
+            except Exception:
+                logger.exception(f"Skipping {file_data.name}")
+        self._flush()
+        return docs
+
+
 def get_ingestion_component(
     storage_context: StorageContext,
     embed_model: EmbedType,
@@ -336,6 +502,13 @@ def get_ingestion_component(
             transformations=transformations,
             count_workers=settings.embedding.count_workers,
         )
+    elif ingest_mode == "pipeline":
+        return PipelineIngestComponent(
+            storage_context=storage_context,
+            embed_model=embed_model,
+            transformations=transformations,
+            count_workers=settings.embedding.count_workers,
+        )
     else:
         return SimpleIngestComponent(
             storage_context=storage_context,
@@ -131,6 +131,7 @@ class LLMComponent:
                     temperature=settings.llm.temperature,
                     context_window=settings.llm.context_window,
                     additional_kwargs=settings_kwargs,
+                    request_timeout=ollama_settings.request_timeout,
                 )
             case "azopenai":
                 try:
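The new `request_timeout` is forwarded to the llama-index Ollama client. A minimal sketch of the same call outside PrivateGPT (assuming the `llama-index-llms-ollama` package and a locally pulled model; the values are illustrative, PrivateGPT fills them from its `ollama` settings block):

```python
from llama_index.llms.ollama import Ollama

llm = Ollama(
    model="llama2",
    base_url="http://localhost:11434",
    request_timeout=120.0,  # seconds to wait before the HTTP request to Ollama is aborted
)
print(llm.complete("Reply with one short sentence."))
```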
@@ -8,6 +8,9 @@ from llama_index.core.chat_engine.types import (
 from llama_index.core.indices import VectorStoreIndex
 from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor
 from llama_index.core.llms import ChatMessage, MessageRole
+from llama_index.core.postprocessor import (
+    SimilarityPostprocessor,
+)
 from llama_index.core.storage import StorageContext
 from llama_index.core.types import TokenGen
 from pydantic import BaseModel
@@ -20,6 +23,7 @@ from private_gpt.components.vector_store.vector_store_component import (
 )
 from private_gpt.open_ai.extensions.context_filter import ContextFilter
 from private_gpt.server.chunks.chunks_service import Chunk
+from private_gpt.settings.settings import Settings
 
 
 class Completion(BaseModel):
@@ -68,14 +72,18 @@ class ChatEngineInput:
 
 @singleton
 class ChatService:
+    settings: Settings
+
     @inject
     def __init__(
         self,
+        settings: Settings,
         llm_component: LLMComponent,
         vector_store_component: VectorStoreComponent,
         embedding_component: EmbeddingComponent,
         node_store_component: NodeStoreComponent,
     ) -> None:
+        self.settings = settings
         self.llm_component = llm_component
         self.embedding_component = embedding_component
         self.vector_store_component = vector_store_component
@@ -98,9 +106,12 @@ class ChatService:
         use_context: bool = False,
         context_filter: ContextFilter | None = None,
     ) -> BaseChatEngine:
+        settings = self.settings
         if use_context:
             vector_index_retriever = self.vector_store_component.get_retriever(
-                index=self.index, context_filter=context_filter
+                index=self.index,
+                context_filter=context_filter,
+                similarity_top_k=self.settings.rag.similarity_top_k,
             )
             return ContextChatEngine.from_defaults(
                 system_prompt=system_prompt,
@@ -108,6 +119,9 @@ class ChatService:
                 llm=self.llm_component.llm,  # Takes no effect at the moment
                 node_postprocessors=[
                     MetadataReplacementPostProcessor(target_metadata_key="window"),
+                    SimilarityPostprocessor(
+                        similarity_cutoff=settings.rag.similarity_value
+                    ),
                 ],
             )
         else:
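Both postprocessors run over the retrieved nodes before they reach the LLM. A standalone sketch of the similarity cutoff on hypothetical nodes (not the chat-engine wiring above):

```python
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.schema import NodeWithScore, TextNode

nodes = [
    NodeWithScore(node=TextNode(text="closely related chunk"), score=0.82),
    NodeWithScore(node=TextNode(text="barely related chunk"), score=0.31),
]
# Nodes scoring below the cutoff are dropped from the context window.
kept = SimilarityPostprocessor(similarity_cutoff=0.45).postprocess_nodes(nodes)
print([node.score for node in kept])  # [0.82]
```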
@@ -1,7 +1,7 @@
 import logging
 import tempfile
 from pathlib import Path
-from typing import AnyStr, BinaryIO
+from typing import TYPE_CHECKING, AnyStr, BinaryIO
 
 from injector import inject, singleton
 from llama_index.core.node_parser import SentenceWindowNodeParser
@@ -17,6 +17,9 @@ from private_gpt.components.vector_store.vector_store_component import (
 from private_gpt.server.ingest.model import IngestedDoc
 from private_gpt.settings.settings import settings
 
+if TYPE_CHECKING:
+    from llama_index.core.storage.docstore.types import RefDocInfo
+
 logger = logging.getLogger(__name__)
 
 
@@ -86,17 +89,15 @@ class IngestService:
         return [IngestedDoc.from_document(document) for document in documents]
 
     def list_ingested(self) -> list[IngestedDoc]:
-        ingested_docs = []
+        ingested_docs: list[IngestedDoc] = []
         try:
             docstore = self.storage_context.docstore
-            ingested_docs_ids: set[str] = set()
+            ref_docs: dict[str, RefDocInfo] | None = docstore.get_all_ref_doc_info()
 
-            for node in docstore.docs.values():
-                if node.ref_doc_id is not None:
-                    ingested_docs_ids.add(node.ref_doc_id)
+            if not ref_docs:
+                return ingested_docs
 
-            for doc_id in ingested_docs_ids:
-                ref_doc_info = docstore.get_ref_doc_info(ref_doc_id=doc_id)
+            for doc_id, ref_doc_info in ref_docs.items():
                 doc_metadata = None
                 if ref_doc_info is not None and ref_doc_info.metadata is not None:
                     doc_metadata = IngestedDoc.curate_metadata(ref_doc_info.metadata)
@@ -155,13 +155,14 @@ class HuggingFaceSettings(BaseModel):
 
 class EmbeddingSettings(BaseModel):
     mode: Literal["huggingface", "openai", "azopenai", "sagemaker", "ollama", "mock"]
-    ingest_mode: Literal["simple", "batch", "parallel"] = Field(
+    ingest_mode: Literal["simple", "batch", "parallel", "pipeline"] = Field(
         "simple",
         description=(
             "The ingest mode to use for the embedding engine:\n"
             "If `simple` - ingest files sequentially and one by one. It is the historic behaviour.\n"
             "If `batch` - if multiple files, parse all the files in parallel, "
             "and send them in batch to the embedding model.\n"
+            "In `pipeline` - The Embedding engine is kept as busy as possible\n"
             "If `parallel` - parse the files in parallel using multiple cores, and embedd them in parallel.\n"
             "`parallel` is the fastest mode for local setup, as it parallelize IO RW in the index.\n"
             "For modes that leverage parallelization, you can specify the number of "
@@ -174,6 +175,7 @@ class EmbeddingSettings(BaseModel):
             "The number of workers to use for file ingestion.\n"
             "In `batch` mode, this is the number of workers used to parse the files.\n"
             "In `parallel` mode, this is the number of workers used to parse the files and embed them.\n"
+            "In `pipeline` mode, this is the number of workers that can perform embeddings.\n"
             "This is only used if `ingest_mode` is not `simple`.\n"
             "Do not go too high with this number, as it might cause memory issues. (especially in `parallel` mode)\n"
             "Do not set it higher than your number of threads of your CPU."
@@ -239,6 +241,10 @@ class OllamaSettings(BaseModel):
         1.1,
         description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
     )
+    request_timeout: float = Field(
+        120.0,
+        description="Time elapsed until ollama times out the request. Default is 120s. Format is float. ",
+    )
 
 
 class AzureOpenAISettings(BaseModel):
@@ -278,6 +284,17 @@ class UISettings(BaseModel):
     )
 
 
+class RagSettings(BaseModel):
+    similarity_top_k: int = Field(
+        2,
+        description="This value controls the number of documents returned by the RAG pipeline",
+    )
+    similarity_value: float = Field(
+        None,
+        description="If set, any documents retrieved from the RAG must meet a certain match score. Acceptable values are between 0 and 1.",
+    )
+
+
 class PostgresSettings(BaseModel):
     host: str = Field(
         "localhost",
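A minimal sketch of how these fields resolve once a profile is loaded (assuming pydantic v2; `similarity_value` is typed as optional here so the unset default is explicit, which the real model leaves implicit):

```python
from pydantic import BaseModel, Field

class RagSettings(BaseModel):
    similarity_top_k: int = Field(2, description="Documents returned by the RAG pipeline")
    similarity_value: float | None = Field(None, description="Optional minimum match score (0-1)")

# Only keys present in the YAML need to be supplied; the rest fall back to defaults.
rag = RagSettings(**{"similarity_top_k": 4})
print(rag.similarity_top_k, rag.similarity_value)  # 4 None
```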
@@ -373,6 +390,7 @@ class Settings(BaseModel):
     azopenai: AzureOpenAISettings
     vectorstore: VectorstoreSettings
     nodestore: NodeStoreSettings
+    rag: RagSettings
     qdrant: QdrantSettings | None = None
     postgres: PostgresSettings | None = None
 
private_gpt/utils/eta.py (new file, 122 lines)
@@ -0,0 +1,122 @@
+import datetime
+import logging
+import math
+import time
+from collections import deque
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+def human_time(*args: Any, **kwargs: Any) -> str:
+    def timedelta_total_seconds(timedelta: datetime.timedelta) -> float:
+        return (
+            timedelta.microseconds
+            + 0.0
+            + (timedelta.seconds + timedelta.days * 24 * 3600) * 10**6
+        ) / 10**6
+
+    secs = float(timedelta_total_seconds(datetime.timedelta(*args, **kwargs)))
+    # We want (ms) precision below 2 seconds
+    if secs < 2:
+        return f"{secs * 1000}ms"
+    units = [("y", 86400 * 365), ("d", 86400), ("h", 3600), ("m", 60), ("s", 1)]
+    parts = []
+    for unit, mul in units:
+        if secs / mul >= 1 or mul == 1:
+            if mul > 1:
+                n = int(math.floor(secs / mul))
+                secs -= n * mul
+            else:
+                # >2s we drop the (ms) component.
+                n = int(secs)
+            if n:
+                parts.append(f"{n}{unit}")
+    return " ".join(parts)
+
+
+def eta(iterator: list[Any]) -> Any:
+    """Report an ETA after 30s and every 60s thereafter."""
+    total = len(iterator)
+    _eta = ETA(total)
+    _eta.needReport(30)
+    for processed, data in enumerate(iterator, start=1):
+        yield data
+        _eta.update(processed)
+        if _eta.needReport(60):
+            logger.info(f"{processed}/{total} - ETA {_eta.human_time()}")
+
+
+class ETA:
+    """Predict how long something will take to complete."""
+
+    def __init__(self, total: int):
+        self.total: int = total  # Total expected records.
+        self.rate: float = 0.0  # per second
+        self._timing_data: deque[tuple[float, int]] = deque(maxlen=100)
+        self.secondsLeft: float = 0.0
+        self.nexttime: float = 0.0
+
+    def human_time(self) -> str:
+        if self._calc():
+            return f"{human_time(seconds=self.secondsLeft)} @ {int(self.rate * 60)}/min"
+        return "(computing)"
+
+    def update(self, count: int) -> None:
+        # count should be in the range 0 to self.total
+        assert count > 0
+        assert count <= self.total
+        self._timing_data.append((time.time(), count))  # (X,Y) for pearson
+
+    def needReport(self, whenSecs: int) -> bool:
+        now = time.time()
+        if now > self.nexttime:
+            self.nexttime = now + whenSecs
+            return True
+        return False
+
+    def _calc(self) -> bool:
+        # A sample before a prediction. Need two points to compute slope!
+        if len(self._timing_data) < 3:
+            return False
+
+        # http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
+        # Calculate means and standard deviations.
+        samples = len(self._timing_data)
+        # column wise sum of the timing tuples to compute their mean.
+        mean_x, mean_y = (
+            sum(i) / samples for i in zip(*self._timing_data, strict=False)
+        )
+        std_x = math.sqrt(
+            sum(pow(i[0] - mean_x, 2) for i in self._timing_data) / (samples - 1)
+        )
+        std_y = math.sqrt(
+            sum(pow(i[1] - mean_y, 2) for i in self._timing_data) / (samples - 1)
+        )
+
+        # Calculate coefficient.
+        sum_xy, sum_sq_v_x, sum_sq_v_y = 0.0, 0.0, 0
+        for x, y in self._timing_data:
+            x -= mean_x
+            y -= mean_y
+            sum_xy += x * y
+            sum_sq_v_x += pow(x, 2)
+            sum_sq_v_y += pow(y, 2)
+        pearson_r = sum_xy / math.sqrt(sum_sq_v_x * sum_sq_v_y)
+
+        # Calculate regression line.
+        # y = mx + b where m is the slope and b is the y-intercept.
+        m = self.rate = pearson_r * (std_y / std_x)
+        y = self.total
+        b = mean_y - m * mean_x
+        x = (y - b) / m
+
+        # Calculate fitted line (transformed/shifted regression line horizontally).
+        fitted_b = self._timing_data[-1][1] - (m * self._timing_data[-1][0])
+        fitted_x = (y - fitted_b) / m
+        _, count = self._timing_data[-1]  # adjust last data point progress count
+        adjusted_x = ((fitted_x - x) * (count / self.total)) + x
+        eta_epoch = adjusted_x
+
+        self.secondsLeft = max([eta_epoch - time.time(), 0])
+        return True

@@ -10,7 +10,7 @@ from private_gpt.settings.settings import settings
 
 resume_download = True
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(prog='Setup: Download models from huggingface')
+    parser = argparse.ArgumentParser(prog='Setup: Download models from Hugging Face')
     parser.add_argument('--resume', default=True, action=argparse.BooleanOptionalAction, help='Enable/Disable resume_download options to restart the download progress interrupted')
    args = parser.parse_args()
    resume_download = args.resume
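A short usage sketch of the `human_time` and `eta` helpers introduced in `private_gpt/utils/eta.py` above (standalone; the sleep stands in for per-file ingestion work):

```python
import time

from private_gpt.utils.eta import eta, human_time

print(human_time(seconds=4250))  # "1h 10m 50s"

for item in eta(list(range(200))):
    time.sleep(0.01)  # stand-in for parsing/embedding one file
# eta() yields items unchanged and, on long runs, logs "<done>/<total> - ETA ..." every 60s.
```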
@@ -2,9 +2,35 @@ import argparse
 import os
 import shutil
 
+from private_gpt.paths import local_data_path
+from private_gpt.settings.settings import settings
+
 
-def wipe():
-    path = "local_data"
+def wipe() -> None:
+    WIPE_MAP = {
+        "simple": wipe_simple,  # node store
+        "chroma": wipe_chroma,  # vector store
+        "postgres": wipe_postgres,  # node, index and vector store
+    }
+    for dbtype in ("nodestore", "vectorstore"):
+        database = getattr(settings(), dbtype).database
+        func = WIPE_MAP.get(database)
+        if func:
+            func(dbtype)
+        else:
+            print(f"Unable to wipe database '{database}' for '{dbtype}'")
+
+
+def wipe_file(file: str) -> None:
+    if os.path.isfile(file):
+        os.remove(file)
+        print(f" - Deleted {file}")
+
+
+def wipe_tree(path: str) -> None:
+    if not os.path.exists(path):
+        print(f"Warning: Path not found {path}")
+        return
     print(f"Wiping {path}...")
     all_files = os.listdir(path)
 
@@ -24,6 +50,54 @@ def wipe():
             continue
 
 
+def wipe_simple(dbtype: str) -> None:
+    assert dbtype == "nodestore"
+    from llama_index.core.storage.docstore.types import (
+        DEFAULT_PERSIST_FNAME as DOCSTORE,
+    )
+    from llama_index.core.storage.index_store.types import (
+        DEFAULT_PERSIST_FNAME as INDEXSTORE,
+    )
+
+    for store in (DOCSTORE, INDEXSTORE):
+        wipe_file(str((local_data_path / store).absolute()))
+
+
+def wipe_postgres(dbtype: str) -> None:
+    try:
+        import psycopg2
+    except ImportError as e:
+        raise ImportError("Postgres dependencies not found") from e
+
+    cur = conn = None
+    try:
+        tables = {
+            "nodestore": ["data_docstore", "data_indexstore"],
+            "vectorstore": ["data_embeddings"],
+        }[dbtype]
+        connection = settings().postgres.model_dump(exclude_none=True)
+        schema = connection.pop("schema_name")
+        conn = psycopg2.connect(**connection)
+        cur = conn.cursor()
+        for table in tables:
+            sql = f"DROP TABLE IF EXISTS {schema}.{table}"
+            cur.execute(sql)
+            print(f"Table {schema}.{table} dropped.")
+        conn.commit()
+    except psycopg2.Error as e:
+        print("Error:", e)
+    finally:
+        if cur:
+            cur.close()
+        if conn:
+            conn.close()
+
+
+def wipe_chroma(dbtype: str):
+    assert dbtype == "vectorstore"
+    wipe_tree(str((local_data_path / "chroma_db").absolute()))
+
+
 if __name__ == "__main__":
     commands = {
         "wipe": wipe,
@@ -1,3 +1,4 @@
+# poetry install --extras "ui llms-llama-cpp vector-stores-qdrant embeddings-huggingface"
 server:
   env_name: ${APP_ENV:local}
 
@@ -19,6 +19,7 @@ ollama:
   top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
   repeat_last_n: 64 # Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
   repeat_penalty: 1.2 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
+  request_timeout: 120.0 # Time elapsed until ollama times out the request. Default is 120s. Format is float.
 
 vectorstore:
   database: qdrant
@@ -42,6 +42,12 @@ llm:
   tokenizer: mistralai/Mistral-7B-Instruct-v0.2
   temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)
 
+rag:
+  similarity_top_k: 2
+  #This value controls how many "top" documents the RAG returns to use in the context.
+  #similarity_value: 0.45
+  #This value is disabled by default. If you enable this settings, the RAG will only use articles that meet a certain percentage score.
+
 llamacpp:
   prompt_style: "mistral"
   llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
@@ -89,6 +95,7 @@ ollama:
   llm_model: llama2
   embedding_model: nomic-embed-text
   api_base: http://localhost:11434
+  request_timeout: 120.0
 
 azopenai:
   api_key: ${AZ_OPENAI_API_KEY:}