diff --git a/docker-compose.yaml b/docker-compose.yaml
index f86d2380..7129b126 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -10,5 +10,4 @@ services:
environment:
PORT: 8080
PGPT_PROFILES: docker
- PGPT_MODE: local
-
+ PGPT_MODE: llamacpp
diff --git a/fern/docs/pages/api-reference/sdks.mdx b/fern/docs/pages/api-reference/sdks.mdx
index 30935b7f..0172d9d4 100644
--- a/fern/docs/pages/api-reference/sdks.mdx
+++ b/fern/docs/pages/api-reference/sdks.mdx
@@ -8,14 +8,14 @@ The clients are kept up to date automatically, so we encourage you to use the la
@@ -24,12 +24,12 @@ The clients are kept up to date automatically, so we encourage you to use the la
diff --git a/fern/docs/pages/manual/ingestion.mdx b/fern/docs/pages/manual/ingestion.mdx
index 39b83cc9..05336559 100644
--- a/fern/docs/pages/manual/ingestion.mdx
+++ b/fern/docs/pages/manual/ingestion.mdx
@@ -62,6 +62,7 @@ The following ingestion mode exist:
* `simple`: historic behavior, ingest one document at a time, sequentially
* `batch`: read, parse, and embed multiple documents using batches (batch read, and then batch parse, and then batch embed)
* `parallel`: read, parse, and embed multiple documents in parallel. This is the fastest ingestion mode for local setup.
+* `pipeline`: an alternative to `parallel`; parses and embeds documents in a pipelined fashion, keeping the embedding workers as busy as possible.
To change the ingestion mode, you can use the `embedding.ingest_mode` configuration value. The default value is `simple`.
To configure the number of workers used for parallel or batched ingestion, you can use
diff --git a/fern/fern.config.json b/fern/fern.config.json
index 4e60c037..2ed68270 100644
--- a/fern/fern.config.json
+++ b/fern/fern.config.json
@@ -1,4 +1,4 @@
{
"organization": "privategpt",
- "version": "0.17.2"
+ "version": "0.19.10"
}
\ No newline at end of file
diff --git a/private_gpt/components/ingest/ingest_component.py b/private_gpt/components/ingest/ingest_component.py
index e21b6c23..5ed03959 100644
--- a/private_gpt/components/ingest/ingest_component.py
+++ b/private_gpt/components/ingest/ingest_component.py
@@ -6,6 +6,7 @@ import multiprocessing.pool
import os
import threading
from pathlib import Path
+from queue import Queue
from typing import Any
from llama_index.core.data_structs import IndexDict
@@ -13,12 +14,13 @@ from llama_index.core.embeddings.utils import EmbedType
from llama_index.core.indices import VectorStoreIndex, load_index_from_storage
from llama_index.core.indices.base import BaseIndex
from llama_index.core.ingestion import run_transformations
-from llama_index.core.schema import Document, TransformComponent
+from llama_index.core.schema import BaseNode, Document, TransformComponent
from llama_index.core.storage import StorageContext
from private_gpt.components.ingest.ingest_helper import IngestionHelper
from private_gpt.paths import local_data_path
from private_gpt.settings.settings import Settings
+from private_gpt.utils.eta import eta
logger = logging.getLogger(__name__)
@@ -314,6 +316,170 @@ class ParallelizedIngestComponent(BaseIngestComponentWithIndex):
self._file_to_documents_work_pool.terminate()
+class PipelineIngestComponent(BaseIngestComponentWithIndex):
+ """Pipeline ingestion - keeping the embedding worker pool as busy as possible.
+
+ This class implements a threaded ingestion pipeline, which comprises two threads
+ and two queues. The primary thread is responsible for reading and parsing files
+ into documents. These documents are then placed into a queue, which is
+    distributed to a pool of worker threads for embedding computation. After
+ embedding, the documents are transferred to another queue where they are
+ accumulated until a threshold is reached. Upon reaching this threshold, the
+ accumulated documents are flushed to the document store, index, and vector
+ store.
+
+ Exception handling ensures robustness against erroneous files. However, in the
+ pipelined design, one error can lead to the discarding of multiple files. Any
+ discarded files will be reported.
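+
+    Data flow (sketch):
+        files -> [main thread: parse] -> doc_q -> [worker pool: embed] -> node_q
+              -> [writer thread: flush] -> docstore / index / vector store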
+ """
+
+    NODE_FLUSH_COUNT = 5000  # Save the index every NODE_FLUSH_COUNT nodes.
+
+ def __init__(
+ self,
+ storage_context: StorageContext,
+ embed_model: EmbedType,
+ transformations: list[TransformComponent],
+ count_workers: int,
+ *args: Any,
+ **kwargs: Any,
+ ) -> None:
+ super().__init__(storage_context, embed_model, transformations, *args, **kwargs)
+        assert (
+            len(self.transformations) >= 2
+        ), "Embeddings must be in the transformations"
+        assert count_workers > 0, "count_workers must be > 0"
+        self.count_workers = count_workers
+        # We do our own multiprocessing here, so disable the Hugging Face
+        # tokenizers' parallelism to avoid the two colliding.
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+        # doc_q stores parsed files as Document chunks.
+        # A shallow queue makes the filesystem parser block once it reaches
+        # capacity, so it cannot outpace the computationally intensive embedding
+        # phase and consume memory unnecessarily. The semaphore below bounds the
+        # async embedding work so that the doc queue actually fills and blocks.
+        self.doc_semaphore = multiprocessing.Semaphore(
+            self.count_workers
+        )  # Bound the number of in-flight embedding jobs to count_workers.
+ self.doc_q: Queue[tuple[str, str | None, list[Document] | None]] = Queue(20)
+        # node_q stores documents that have been parsed into nodes and embedded.
+ # Larger queue size so we don't block the embedding workers during a slow
+ # index update.
+ self.node_q: Queue[
+ tuple[str, str | None, list[Document] | None, list[BaseNode] | None]
+ ] = Queue(40)
+ threading.Thread(target=self._doc_to_node, daemon=True).start()
+ threading.Thread(target=self._write_nodes, daemon=True).start()
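+        # Both daemon threads run for the lifetime of the component; they exit on a
+        # "quit" command or when the process terminates.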
+
+ def _doc_to_node(self) -> None:
+ # Parse documents into nodes
+ with multiprocessing.pool.ThreadPool(processes=self.count_workers) as pool:
+ while True:
+ try:
+ cmd, file_name, documents = self.doc_q.get(
+ block=True
+ ) # Documents for a file
+ if cmd == "process":
+ # Push CPU/GPU embedding work to the worker pool
+ # Acquire semaphore to control access to worker pool
+ self.doc_semaphore.acquire()
+ pool.apply_async(
+ self._doc_to_node_worker, (file_name, documents)
+ )
+ elif cmd == "quit":
+ break
+ finally:
+ if cmd != "process":
+ self.doc_q.task_done() # unblock Q joins
+
+ def _doc_to_node_worker(self, file_name: str, documents: list[Document]) -> None:
+        # CPU/GPU intensive work, run on a thread from the worker pool
+ try:
+ nodes = run_transformations(
+ documents, # type: ignore[arg-type]
+ self.transformations,
+ show_progress=self.show_progress,
+ )
+ self.node_q.put(("process", file_name, documents, nodes))
+ finally:
+ self.doc_semaphore.release()
+ self.doc_q.task_done() # unblock Q joins
+
+ def _save_docs(
+ self, files: list[str], documents: list[Document], nodes: list[BaseNode]
+ ) -> None:
+ try:
+ logger.info(
+ f"Saving {len(files)} files ({len(documents)} documents / {len(nodes)} nodes)"
+ )
+ self._index.insert_nodes(nodes)
+ for document in documents:
+ self._index.docstore.set_document_hash(
+ document.get_doc_id(), document.hash
+ )
+ self._save_index()
+ except Exception:
+ # Tell the user so they can investigate these files
+ logger.exception(f"Processing files {files}")
+ finally:
+ # Clearing work, even on exception, maintains a clean state.
+ nodes.clear()
+ documents.clear()
+ files.clear()
+
+ def _write_nodes(self) -> None:
+ # Save nodes to index. I/O intensive.
+ node_stack: list[BaseNode] = []
+ doc_stack: list[Document] = []
+ file_stack: list[str] = []
+ while True:
+ try:
+ cmd, file_name, documents, nodes = self.node_q.get(block=True)
+ if cmd in ("flush", "quit"):
+ if file_stack:
+ self._save_docs(file_stack, doc_stack, node_stack)
+ if cmd == "quit":
+ break
+ elif cmd == "process":
+ node_stack.extend(nodes) # type: ignore[arg-type]
+ doc_stack.extend(documents) # type: ignore[arg-type]
+ file_stack.append(file_name) # type: ignore[arg-type]
+ # Constant saving is heavy on I/O - accumulate to a threshold
+ if len(node_stack) >= self.NODE_FLUSH_COUNT:
+ self._save_docs(file_stack, doc_stack, node_stack)
+ finally:
+ self.node_q.task_done()
+
+ def _flush(self) -> None:
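+        # Push a "flush" sentinel through both queues and join them, so that all
+        # queued documents are embedded and all accumulated nodes are persisted.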
+ self.doc_q.put(("flush", None, None))
+ self.doc_q.join()
+ self.node_q.put(("flush", None, None, None))
+ self.node_q.join()
+
+ def ingest(self, file_name: str, file_data: Path) -> list[Document]:
+ documents = IngestionHelper.transform_file_into_documents(file_name, file_data)
+ self.doc_q.put(("process", file_name, documents))
+ self._flush()
+ return documents
+
+ def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[Document]:
+ docs = []
+ for file_name, file_data in eta(files):
+ try:
+ documents = IngestionHelper.transform_file_into_documents(
+ file_name, file_data
+ )
+ self.doc_q.put(("process", file_name, documents))
+ docs.extend(documents)
+ except Exception:
+ logger.exception(f"Skipping {file_data.name}")
+ self._flush()
+ return docs
+
+
def get_ingestion_component(
storage_context: StorageContext,
embed_model: EmbedType,
@@ -336,6 +502,13 @@ def get_ingestion_component(
transformations=transformations,
count_workers=settings.embedding.count_workers,
)
+ elif ingest_mode == "pipeline":
+ return PipelineIngestComponent(
+ storage_context=storage_context,
+ embed_model=embed_model,
+ transformations=transformations,
+ count_workers=settings.embedding.count_workers,
+ )
else:
return SimpleIngestComponent(
storage_context=storage_context,
diff --git a/private_gpt/components/llm/llm_component.py b/private_gpt/components/llm/llm_component.py
index 953209a8..4e46c250 100644
--- a/private_gpt/components/llm/llm_component.py
+++ b/private_gpt/components/llm/llm_component.py
@@ -131,6 +131,7 @@ class LLMComponent:
temperature=settings.llm.temperature,
context_window=settings.llm.context_window,
additional_kwargs=settings_kwargs,
+ request_timeout=ollama_settings.request_timeout,
)
case "azopenai":
try:
diff --git a/private_gpt/server/chat/chat_service.py b/private_gpt/server/chat/chat_service.py
index 5369200b..ea57f2c0 100644
--- a/private_gpt/server/chat/chat_service.py
+++ b/private_gpt/server/chat/chat_service.py
@@ -8,6 +8,9 @@ from llama_index.core.chat_engine.types import (
from llama_index.core.indices import VectorStoreIndex
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.llms import ChatMessage, MessageRole
+from llama_index.core.postprocessor import (
+ SimilarityPostprocessor,
+)
from llama_index.core.storage import StorageContext
from llama_index.core.types import TokenGen
from pydantic import BaseModel
@@ -20,6 +23,7 @@ from private_gpt.components.vector_store.vector_store_component import (
)
from private_gpt.open_ai.extensions.context_filter import ContextFilter
from private_gpt.server.chunks.chunks_service import Chunk
+from private_gpt.settings.settings import Settings
class Completion(BaseModel):
@@ -68,14 +72,18 @@ class ChatEngineInput:
@singleton
class ChatService:
+ settings: Settings
+
@inject
def __init__(
self,
+ settings: Settings,
llm_component: LLMComponent,
vector_store_component: VectorStoreComponent,
embedding_component: EmbeddingComponent,
node_store_component: NodeStoreComponent,
) -> None:
+ self.settings = settings
self.llm_component = llm_component
self.embedding_component = embedding_component
self.vector_store_component = vector_store_component
@@ -98,9 +106,12 @@ class ChatService:
use_context: bool = False,
context_filter: ContextFilter | None = None,
) -> BaseChatEngine:
+ settings = self.settings
if use_context:
vector_index_retriever = self.vector_store_component.get_retriever(
- index=self.index, context_filter=context_filter
+ index=self.index,
+ context_filter=context_filter,
+ similarity_top_k=self.settings.rag.similarity_top_k,
)
return ContextChatEngine.from_defaults(
system_prompt=system_prompt,
@@ -108,6 +119,9 @@ class ChatService:
llm=self.llm_component.llm, # Takes no effect at the moment
node_postprocessors=[
MetadataReplacementPostProcessor(target_metadata_key="window"),
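+                    # Drop retrieved chunks whose score is below the configured similarity cutoff (if any).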
+ SimilarityPostprocessor(
+ similarity_cutoff=settings.rag.similarity_value
+ ),
],
)
else:
diff --git a/private_gpt/server/ingest/ingest_service.py b/private_gpt/server/ingest/ingest_service.py
index 1d6f5ba2..f9ae4728 100644
--- a/private_gpt/server/ingest/ingest_service.py
+++ b/private_gpt/server/ingest/ingest_service.py
@@ -1,7 +1,7 @@
import logging
import tempfile
from pathlib import Path
-from typing import AnyStr, BinaryIO
+from typing import TYPE_CHECKING, AnyStr, BinaryIO
from injector import inject, singleton
from llama_index.core.node_parser import SentenceWindowNodeParser
@@ -17,6 +17,9 @@ from private_gpt.components.vector_store.vector_store_component import (
from private_gpt.server.ingest.model import IngestedDoc
from private_gpt.settings.settings import settings
+if TYPE_CHECKING:
+ from llama_index.core.storage.docstore.types import RefDocInfo
+
logger = logging.getLogger(__name__)
@@ -86,17 +89,15 @@ class IngestService:
return [IngestedDoc.from_document(document) for document in documents]
def list_ingested(self) -> list[IngestedDoc]:
- ingested_docs = []
+ ingested_docs: list[IngestedDoc] = []
try:
docstore = self.storage_context.docstore
- ingested_docs_ids: set[str] = set()
+ ref_docs: dict[str, RefDocInfo] | None = docstore.get_all_ref_doc_info()
- for node in docstore.docs.values():
- if node.ref_doc_id is not None:
- ingested_docs_ids.add(node.ref_doc_id)
+ if not ref_docs:
+ return ingested_docs
- for doc_id in ingested_docs_ids:
- ref_doc_info = docstore.get_ref_doc_info(ref_doc_id=doc_id)
+ for doc_id, ref_doc_info in ref_docs.items():
doc_metadata = None
if ref_doc_info is not None and ref_doc_info.metadata is not None:
doc_metadata = IngestedDoc.curate_metadata(ref_doc_info.metadata)
diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
index 4c274384..5896f00d 100644
--- a/private_gpt/settings/settings.py
+++ b/private_gpt/settings/settings.py
@@ -155,13 +155,14 @@ class HuggingFaceSettings(BaseModel):
class EmbeddingSettings(BaseModel):
mode: Literal["huggingface", "openai", "azopenai", "sagemaker", "ollama", "mock"]
- ingest_mode: Literal["simple", "batch", "parallel"] = Field(
+ ingest_mode: Literal["simple", "batch", "parallel", "pipeline"] = Field(
"simple",
description=(
"The ingest mode to use for the embedding engine:\n"
"If `simple` - ingest files sequentially and one by one. It is the historic behaviour.\n"
"If `batch` - if multiple files, parse all the files in parallel, "
"and send them in batch to the embedding model.\n"
+            "If `pipeline` - the embedding engine is kept as busy as possible, embedding documents as soon as they are parsed.\n"
"If `parallel` - parse the files in parallel using multiple cores, and embedd them in parallel.\n"
"`parallel` is the fastest mode for local setup, as it parallelize IO RW in the index.\n"
"For modes that leverage parallelization, you can specify the number of "
@@ -174,6 +175,7 @@ class EmbeddingSettings(BaseModel):
"The number of workers to use for file ingestion.\n"
"In `batch` mode, this is the number of workers used to parse the files.\n"
"In `parallel` mode, this is the number of workers used to parse the files and embed them.\n"
+ "In `pipeline` mode, this is the number of workers that can perform embeddings.\n"
"This is only used if `ingest_mode` is not `simple`.\n"
"Do not go too high with this number, as it might cause memory issues. (especially in `parallel` mode)\n"
"Do not set it higher than your number of threads of your CPU."
@@ -239,6 +241,10 @@ class OllamaSettings(BaseModel):
1.1,
description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
)
+ request_timeout: float = Field(
+ 120.0,
+        description="Time elapsed until Ollama times out the request, in seconds. Default is 120.0.",
+ )
class AzureOpenAISettings(BaseModel):
@@ -278,6 +284,17 @@ class UISettings(BaseModel):
)
+class RagSettings(BaseModel):
+ similarity_top_k: int = Field(
+ 2,
+ description="This value controls the number of documents returned by the RAG pipeline",
+ )
+    similarity_value: float | None = Field(
+        None,
+        description="If set, any documents retrieved from the RAG must meet this minimum similarity score. Acceptable values are between 0 and 1.",
+ )
+
+
class PostgresSettings(BaseModel):
host: str = Field(
"localhost",
@@ -373,6 +390,7 @@ class Settings(BaseModel):
azopenai: AzureOpenAISettings
vectorstore: VectorstoreSettings
nodestore: NodeStoreSettings
+ rag: RagSettings
qdrant: QdrantSettings | None = None
postgres: PostgresSettings | None = None
diff --git a/private_gpt/utils/eta.py b/private_gpt/utils/eta.py
new file mode 100644
index 00000000..9315334f
--- /dev/null
+++ b/private_gpt/utils/eta.py
@@ -0,0 +1,122 @@
+import datetime
+import logging
+import math
+import time
+from collections import deque
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+def human_time(*args: Any, **kwargs: Any) -> str:
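+    """Return a human-readable duration, e.g. human_time(seconds=3723) -> '1h 2m 3s'."""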
+ def timedelta_total_seconds(timedelta: datetime.timedelta) -> float:
+ return (
+ timedelta.microseconds
+ + 0.0
+ + (timedelta.seconds + timedelta.days * 24 * 3600) * 10**6
+ ) / 10**6
+
+ secs = float(timedelta_total_seconds(datetime.timedelta(*args, **kwargs)))
+ # We want (ms) precision below 2 seconds
+ if secs < 2:
+        return f"{secs * 1000:.0f}ms"
+ units = [("y", 86400 * 365), ("d", 86400), ("h", 3600), ("m", 60), ("s", 1)]
+ parts = []
+ for unit, mul in units:
+ if secs / mul >= 1 or mul == 1:
+ if mul > 1:
+ n = int(math.floor(secs / mul))
+ secs -= n * mul
+ else:
+ # >2s we drop the (ms) component.
+ n = int(secs)
+ if n:
+ parts.append(f"{n}{unit}")
+ return " ".join(parts)
+
+
+def eta(iterator: list[Any]) -> Any:
+ """Report an ETA after 30s and every 60s thereafter."""
+ total = len(iterator)
+ _eta = ETA(total)
+ _eta.needReport(30)
+ for processed, data in enumerate(iterator, start=1):
+ yield data
+ _eta.update(processed)
+ if _eta.needReport(60):
+ logger.info(f"{processed}/{total} - ETA {_eta.human_time()}")
+
+
+class ETA:
+ """Predict how long something will take to complete."""
+
+ def __init__(self, total: int):
+ self.total: int = total # Total expected records.
+ self.rate: float = 0.0 # per second
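+        # _timing_data holds (timestamp, processed_count) samples; a linear fit over
+        # the most recent samples predicts when processing will complete.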
+ self._timing_data: deque[tuple[float, int]] = deque(maxlen=100)
+ self.secondsLeft: float = 0.0
+ self.nexttime: float = 0.0
+
+ def human_time(self) -> str:
+ if self._calc():
+ return f"{human_time(seconds=self.secondsLeft)} @ {int(self.rate * 60)}/min"
+ return "(computing)"
+
+ def update(self, count: int) -> None:
+        # count should be in the range 1 to self.total
+ assert count > 0
+ assert count <= self.total
+ self._timing_data.append((time.time(), count)) # (X,Y) for pearson
+
+ def needReport(self, whenSecs: int) -> bool:
+ now = time.time()
+ if now > self.nexttime:
+ self.nexttime = now + whenSecs
+ return True
+ return False
+
+ def _calc(self) -> bool:
+        # Need a few samples (at least 3) before making a prediction.
+ if len(self._timing_data) < 3:
+ return False
+
+ # http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
+ # Calculate means and standard deviations.
+ samples = len(self._timing_data)
+ # column wise sum of the timing tuples to compute their mean.
+ mean_x, mean_y = (
+ sum(i) / samples for i in zip(*self._timing_data, strict=False)
+ )
+ std_x = math.sqrt(
+ sum(pow(i[0] - mean_x, 2) for i in self._timing_data) / (samples - 1)
+ )
+ std_y = math.sqrt(
+ sum(pow(i[1] - mean_y, 2) for i in self._timing_data) / (samples - 1)
+ )
+
+ # Calculate coefficient.
+        sum_xy, sum_sq_v_x, sum_sq_v_y = 0.0, 0.0, 0.0
+ for x, y in self._timing_data:
+ x -= mean_x
+ y -= mean_y
+ sum_xy += x * y
+ sum_sq_v_x += pow(x, 2)
+ sum_sq_v_y += pow(y, 2)
+ pearson_r = sum_xy / math.sqrt(sum_sq_v_x * sum_sq_v_y)
+
+ # Calculate regression line.
+ # y = mx + b where m is the slope and b is the y-intercept.
+ m = self.rate = pearson_r * (std_y / std_x)
+ y = self.total
+ b = mean_y - m * mean_x
+ x = (y - b) / m
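+        # x is the regression-line estimate of the epoch time at which self.total records are done.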
+
+ # Calculate fitted line (transformed/shifted regression line horizontally).
+ fitted_b = self._timing_data[-1][1] - (m * self._timing_data[-1][0])
+ fitted_x = (y - fitted_b) / m
+        _, count = self._timing_data[-1]  # progress count of the most recent sample
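+        # Interpolate between the regression estimate (x) and the fitted estimate
+        # (fitted_x), weighting the fitted line more heavily as progress nears 100%.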
+ adjusted_x = ((fitted_x - x) * (count / self.total)) + x
+ eta_epoch = adjusted_x
+
+ self.secondsLeft = max([eta_epoch - time.time(), 0])
+ return True
diff --git a/scripts/setup b/scripts/setup
index 3e02e641..edba1049 100755
--- a/scripts/setup
+++ b/scripts/setup
@@ -10,7 +10,7 @@ from private_gpt.settings.settings import settings
resume_download = True
if __name__ == '__main__':
- parser = argparse.ArgumentParser(prog='Setup: Download models from huggingface')
+ parser = argparse.ArgumentParser(prog='Setup: Download models from Hugging Face')
parser.add_argument('--resume', default=True, action=argparse.BooleanOptionalAction, help='Enable/Disable resume_download options to restart the download progress interrupted')
args = parser.parse_args()
resume_download = args.resume
diff --git a/scripts/utils.py b/scripts/utils.py
index 6f5006c4..48068789 100644
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -2,9 +2,35 @@ import argparse
import os
import shutil
+from private_gpt.paths import local_data_path
+from private_gpt.settings.settings import settings
-def wipe():
- path = "local_data"
+
+def wipe() -> None:
+ WIPE_MAP = {
+ "simple": wipe_simple, # node store
+ "chroma": wipe_chroma, # vector store
+ "postgres": wipe_postgres, # node, index and vector store
+ }
+ for dbtype in ("nodestore", "vectorstore"):
+ database = getattr(settings(), dbtype).database
+ func = WIPE_MAP.get(database)
+ if func:
+ func(dbtype)
+ else:
+ print(f"Unable to wipe database '{database}' for '{dbtype}'")
+
+
+def wipe_file(file: str) -> None:
+ if os.path.isfile(file):
+ os.remove(file)
+ print(f" - Deleted {file}")
+
+
+def wipe_tree(path: str) -> None:
+ if not os.path.exists(path):
+ print(f"Warning: Path not found {path}")
+ return
print(f"Wiping {path}...")
all_files = os.listdir(path)
@@ -24,6 +50,54 @@ def wipe():
continue
+def wipe_simple(dbtype: str) -> None:
+ assert dbtype == "nodestore"
+ from llama_index.core.storage.docstore.types import (
+ DEFAULT_PERSIST_FNAME as DOCSTORE,
+ )
+ from llama_index.core.storage.index_store.types import (
+ DEFAULT_PERSIST_FNAME as INDEXSTORE,
+ )
+
+ for store in (DOCSTORE, INDEXSTORE):
+ wipe_file(str((local_data_path / store).absolute()))
+
+
+def wipe_postgres(dbtype: str) -> None:
+ try:
+ import psycopg2
+ except ImportError as e:
+ raise ImportError("Postgres dependencies not found") from e
+
+ cur = conn = None
+ try:
+ tables = {
+ "nodestore": ["data_docstore", "data_indexstore"],
+ "vectorstore": ["data_embeddings"],
+ }[dbtype]
+ connection = settings().postgres.model_dump(exclude_none=True)
+ schema = connection.pop("schema_name")
+ conn = psycopg2.connect(**connection)
+ cur = conn.cursor()
+ for table in tables:
+ sql = f"DROP TABLE IF EXISTS {schema}.{table}"
+ cur.execute(sql)
+ print(f"Table {schema}.{table} dropped.")
+ conn.commit()
+ except psycopg2.Error as e:
+ print("Error:", e)
+ finally:
+ if cur:
+ cur.close()
+ if conn:
+ conn.close()
+
+
+def wipe_chroma(dbtype: str) -> None:
+ assert dbtype == "vectorstore"
+ wipe_tree(str((local_data_path / "chroma_db").absolute()))
+
+
if __name__ == "__main__":
commands = {
"wipe": wipe,
diff --git a/settings-local.yaml b/settings-local.yaml
index 2c1995bc..c9d02742 100644
--- a/settings-local.yaml
+++ b/settings-local.yaml
@@ -1,3 +1,4 @@
+# poetry install --extras "ui llms-llama-cpp vector-stores-qdrant embeddings-huggingface"
server:
env_name: ${APP_ENV:local}
diff --git a/settings-ollama.yaml b/settings-ollama.yaml
index 9a0aaed0..d7e1a12c 100644
--- a/settings-ollama.yaml
+++ b/settings-ollama.yaml
@@ -14,11 +14,12 @@ ollama:
llm_model: mistral
embedding_model: nomic-embed-text
api_base: http://localhost:11434
- tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
- top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
- top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
- repeat_last_n: 64 # Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
- repeat_penalty: 1.2 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
+ tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
+ top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
+ top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
+ repeat_last_n: 64 # Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
+ repeat_penalty: 1.2 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
+ request_timeout: 120.0 # Time elapsed until ollama times out the request. Default is 120s. Format is float.
vectorstore:
database: qdrant
diff --git a/settings.yaml b/settings.yaml
index 0b4cb341..87a63ef4 100644
--- a/settings.yaml
+++ b/settings.yaml
@@ -42,6 +42,12 @@ llm:
tokenizer: mistralai/Mistral-7B-Instruct-v0.2
temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)
+rag:
+ similarity_top_k: 2
+  # This value controls how many "top" documents the RAG returns to use in the context.
+  #similarity_value: 0.45
+  # Disabled by default. If you enable this setting, the RAG will only use documents that meet this minimum similarity score.
+
llamacpp:
prompt_style: "mistral"
llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
@@ -89,6 +95,7 @@ ollama:
llm_model: llama2
embedding_model: nomic-embed-text
api_base: http://localhost:11434
+ request_timeout: 120.0
azopenai:
api_key: ${AZ_OPENAI_API_KEY:}