Merge branch 'zylon-ai:main' into update-ui-include-model-info-#1647

Ingrid Stevens 2024-03-27 12:25:36 +01:00 committed by GitHub
commit 97b8999933
15 changed files with 439 additions and 27 deletions

View File

@ -10,5 +10,4 @@ services:
environment:
PORT: 8080
PGPT_PROFILES: docker
PGPT_MODE: local
PGPT_MODE: llamacpp

View File

@ -8,14 +8,14 @@ The clients are kept up to date automatically, so we encourage you to use the la
<Cards>
<Card
title="Node.js/TypeScript"
title="Node.js/TypeScript - WIP"
icon="fa-brands fa-node"
href="https://github.com/imartinez/privateGPT-typescript"
/>
<Card
title="Python"
title="Python - Ready!"
icon="fa-brands fa-python"
href="https://github.com/imartinez/privateGPT-python"
href="https://github.com/imartinez/pgpt_python"
/>
<br />
</Cards>
@ -24,12 +24,12 @@ The clients are kept up to date automatically, so we encourage you to use the la
<Cards>
<Card
title="Java"
title="Java - WIP"
icon="fa-brands fa-java"
href="https://github.com/imartinez/privateGPT-java"
/>
<Card
title="Go"
title="Go - WIP"
icon="fa-brands fa-golang"
href="https://github.com/imartinez/privateGPT-go"
/>

View File

@ -62,6 +62,7 @@ The following ingestion modes exist:
* `simple`: historic behavior, ingest one document at a time, sequentially
* `batch`: read, parse, and embed multiple documents using batches (batch read, and then batch parse, and then batch embed)
* `parallel`: read, parse, and embed multiple documents in parallel. This is the fastest ingestion mode for local setup.
* `pipeline`: alternative to `parallel`; it keeps the embedding workers as busy as possible by streaming parsed documents to them.
To change the ingestion mode, you can use the `embedding.ingest_mode` configuration value. The default value is `simple`.
To configure the number of workers used for parallel or batched ingestion, you can use
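For illustration only, a minimal sketch of reading these values at runtime, assuming the `settings()` accessor and the `embedding` settings group used elsewhere in this change set:

from private_gpt.settings.settings import settings

embedding_settings = settings().embedding
print(embedding_settings.ingest_mode)    # "simple" by default; "pipeline" selects the new mode
print(embedding_settings.count_workers)  # workers shared by the batch, parallel and pipeline modes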

View File

@ -1,4 +1,4 @@
{
"organization": "privategpt",
"version": "0.17.2"
"version": "0.19.10"
}

View File

@ -6,6 +6,7 @@ import multiprocessing.pool
import os
import threading
from pathlib import Path
from queue import Queue
from typing import Any
from llama_index.core.data_structs import IndexDict
@ -13,12 +14,13 @@ from llama_index.core.embeddings.utils import EmbedType
from llama_index.core.indices import VectorStoreIndex, load_index_from_storage
from llama_index.core.indices.base import BaseIndex
from llama_index.core.ingestion import run_transformations
from llama_index.core.schema import Document, TransformComponent
from llama_index.core.schema import BaseNode, Document, TransformComponent
from llama_index.core.storage import StorageContext
from private_gpt.components.ingest.ingest_helper import IngestionHelper
from private_gpt.paths import local_data_path
from private_gpt.settings.settings import Settings
from private_gpt.utils.eta import eta
logger = logging.getLogger(__name__)
@ -314,6 +316,170 @@ class ParallelizedIngestComponent(BaseIngestComponentWithIndex):
self._file_to_documents_work_pool.terminate()
class PipelineIngestComponent(BaseIngestComponentWithIndex):
"""Pipeline ingestion - keeping the embedding worker pool as busy as possible.
This class implements a threaded ingestion pipeline, which comprises two threads
and two queues. The primary thread is responsible for reading and parsing files
into documents. These documents are then placed into a queue, which is
distributed to a pool of worker processes for embedding computation. After
embedding, the documents are transferred to another queue where they are
accumulated until a threshold is reached. Upon reaching this threshold, the
accumulated documents are flushed to the document store, index, and vector
store.
Exception handling ensures robustness against erroneous files. However, in the
pipelined design, one error can lead to the discarding of multiple files. Any
discarded files will be reported.
"""
NODE_FLUSH_COUNT = 5000 # Save the index after this many nodes have accumulated.
def __init__(
self,
storage_context: StorageContext,
embed_model: EmbedType,
transformations: list[TransformComponent],
count_workers: int,
*args: Any,
**kwargs: Any,
) -> None:
super().__init__(storage_context, embed_model, transformations, *args, **kwargs)
self.count_workers = count_workers
assert (
len(self.transformations) >= 2
), "Embeddings must be in the transformations"
assert count_workers > 0, "count_workers must be > 0"
# We are doing our own multiprocessing, so disable the Hugging Face
# tokenizers' parallelism to avoid colliding with it
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# doc_q stores parsed files as Document chunks.
# Using a shallow queue causes the filesystem parser to block
# when it reaches capacity. This ensures it doesn't outpace the
# computationally intensive embeddings phase, avoiding unnecessary
# memory consumption. The semaphore is used to bound the async worker
# embedding computations to cause the doc Q to fill and block.
self.doc_semaphore = multiprocessing.Semaphore(
self.count_workers
) # bound the number of concurrent embedding tasks to count_workers
self.doc_q: Queue[tuple[str, str | None, list[Document] | None]] = Queue(20)
# node_q stores documents parsed into nodes (embeddings).
# Larger queue size so we don't block the embedding workers during a slow
# index update.
self.node_q: Queue[
tuple[str, str | None, list[Document] | None, list[BaseNode] | None]
] = Queue(40)
threading.Thread(target=self._doc_to_node, daemon=True).start()
threading.Thread(target=self._write_nodes, daemon=True).start()
def _doc_to_node(self) -> None:
# Parse documents into nodes
with multiprocessing.pool.ThreadPool(processes=self.count_workers) as pool:
while True:
try:
cmd, file_name, documents = self.doc_q.get(
block=True
) # Documents for a file
if cmd == "process":
# Push CPU/GPU embedding work to the worker pool
# Acquire semaphore to control access to worker pool
self.doc_semaphore.acquire()
pool.apply_async(
self._doc_to_node_worker, (file_name, documents)
)
elif cmd == "quit":
break
finally:
if cmd != "process":
self.doc_q.task_done() # unblock Q joins
def _doc_to_node_worker(self, file_name: str, documents: list[Document]) -> None:
# CPU/GPU intensive work, dispatched to the worker thread pool
try:
nodes = run_transformations(
documents, # type: ignore[arg-type]
self.transformations,
show_progress=self.show_progress,
)
self.node_q.put(("process", file_name, documents, nodes))
finally:
self.doc_semaphore.release()
self.doc_q.task_done() # unblock Q joins
def _save_docs(
self, files: list[str], documents: list[Document], nodes: list[BaseNode]
) -> None:
try:
logger.info(
f"Saving {len(files)} files ({len(documents)} documents / {len(nodes)} nodes)"
)
self._index.insert_nodes(nodes)
for document in documents:
self._index.docstore.set_document_hash(
document.get_doc_id(), document.hash
)
self._save_index()
except Exception:
# Tell the user so they can investigate these files
logger.exception(f"Processing files {files}")
finally:
# Clearing work, even on exception, maintains a clean state.
nodes.clear()
documents.clear()
files.clear()
def _write_nodes(self) -> None:
# Save nodes to index. I/O intensive.
node_stack: list[BaseNode] = []
doc_stack: list[Document] = []
file_stack: list[str] = []
while True:
try:
cmd, file_name, documents, nodes = self.node_q.get(block=True)
if cmd in ("flush", "quit"):
if file_stack:
self._save_docs(file_stack, doc_stack, node_stack)
if cmd == "quit":
break
elif cmd == "process":
node_stack.extend(nodes) # type: ignore[arg-type]
doc_stack.extend(documents) # type: ignore[arg-type]
file_stack.append(file_name) # type: ignore[arg-type]
# Constant saving is heavy on I/O - accumulate to a threshold
if len(node_stack) >= self.NODE_FLUSH_COUNT:
self._save_docs(file_stack, doc_stack, node_stack)
finally:
self.node_q.task_done()
def _flush(self) -> None:
self.doc_q.put(("flush", None, None))
self.doc_q.join()
self.node_q.put(("flush", None, None, None))
self.node_q.join()
def ingest(self, file_name: str, file_data: Path) -> list[Document]:
documents = IngestionHelper.transform_file_into_documents(file_name, file_data)
self.doc_q.put(("process", file_name, documents))
self._flush()
return documents
def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[Document]:
docs = []
for file_name, file_data in eta(files):
try:
documents = IngestionHelper.transform_file_into_documents(
file_name, file_data
)
self.doc_q.put(("process", file_name, documents))
docs.extend(documents)
except Exception:
logger.exception(f"Skipping {file_data.name}")
self._flush()
return docs
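As a rough, standalone sketch of the backpressure pattern the class docstring above describes (a shallow bounded queue plus a semaphore keep a small worker pool busy without letting the producer run ahead), using only hypothetical names:

import threading
from multiprocessing.pool import ThreadPool
from queue import Queue

WORKERS = 2
work_q: Queue[int | None] = Queue(maxsize=4)  # shallow queue: the producer blocks when it is full
slots = threading.Semaphore(WORKERS)          # bounds the number of in-flight worker tasks

def expensive(item: int) -> None:
    try:
        print(f"embedding-like work on {item}")
    finally:
        slots.release()     # free a worker slot
        work_q.task_done()  # unblock work_q.join()

def dispatcher() -> None:
    with ThreadPool(processes=WORKERS) as pool:
        while True:
            item = work_q.get(block=True)
            if item is None:  # sentinel: stop dispatching
                work_q.task_done()
                break
            slots.acquire()   # wait until a worker slot is free
            pool.apply_async(expensive, (item,))
        pool.close()
        pool.join()

threading.Thread(target=dispatcher, daemon=True).start()
for i in range(10):
    work_q.put(i)  # blocks while the queue is at capacity
work_q.put(None)
work_q.join()      # returns once every item (and the sentinel) is processed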
def get_ingestion_component(
storage_context: StorageContext,
embed_model: EmbedType,
@ -336,6 +502,13 @@ def get_ingestion_component(
transformations=transformations,
count_workers=settings.embedding.count_workers,
)
elif ingest_mode == "pipeline":
return PipelineIngestComponent(
storage_context=storage_context,
embed_model=embed_model,
transformations=transformations,
count_workers=settings.embedding.count_workers,
)
else:
return SimpleIngestComponent(
storage_context=storage_context,

View File

@ -131,6 +131,7 @@ class LLMComponent:
temperature=settings.llm.temperature,
context_window=settings.llm.context_window,
additional_kwargs=settings_kwargs,
request_timeout=ollama_settings.request_timeout,
)
case "azopenai":
try:

View File

@ -8,6 +8,9 @@ from llama_index.core.chat_engine.types import (
from llama_index.core.indices import VectorStoreIndex
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core.postprocessor import (
SimilarityPostprocessor,
)
from llama_index.core.storage import StorageContext
from llama_index.core.types import TokenGen
from pydantic import BaseModel
@ -20,6 +23,7 @@ from private_gpt.components.vector_store.vector_store_component import (
)
from private_gpt.open_ai.extensions.context_filter import ContextFilter
from private_gpt.server.chunks.chunks_service import Chunk
from private_gpt.settings.settings import Settings
class Completion(BaseModel):
@ -68,14 +72,18 @@ class ChatEngineInput:
@singleton
class ChatService:
settings: Settings
@inject
def __init__(
self,
settings: Settings,
llm_component: LLMComponent,
vector_store_component: VectorStoreComponent,
embedding_component: EmbeddingComponent,
node_store_component: NodeStoreComponent,
) -> None:
self.settings = settings
self.llm_component = llm_component
self.embedding_component = embedding_component
self.vector_store_component = vector_store_component
@ -98,9 +106,12 @@ class ChatService:
use_context: bool = False,
context_filter: ContextFilter | None = None,
) -> BaseChatEngine:
settings = self.settings
if use_context:
vector_index_retriever = self.vector_store_component.get_retriever(
index=self.index, context_filter=context_filter
index=self.index,
context_filter=context_filter,
similarity_top_k=self.settings.rag.similarity_top_k,
)
return ContextChatEngine.from_defaults(
system_prompt=system_prompt,
@ -108,6 +119,9 @@ class ChatService:
llm=self.llm_component.llm, # Has no effect at the moment
node_postprocessors=[
MetadataReplacementPostProcessor(target_metadata_key="window"),
SimilarityPostprocessor(
similarity_cutoff=settings.rag.similarity_value
),
],
)
else:
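A hedged illustration of the new `similarity_value` cutoff, using llama_index's `SimilarityPostprocessor` directly with made-up scores; chunks below the cutoff never reach the LLM context:

from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.schema import NodeWithScore, TextNode

retrieved = [
    NodeWithScore(node=TextNode(text="clearly relevant chunk"), score=0.72),
    NodeWithScore(node=TextNode(text="barely related chunk"), score=0.31),
]
kept = SimilarityPostprocessor(similarity_cutoff=0.45).postprocess_nodes(retrieved)
print([round(n.score, 2) for n in kept])  # [0.72] - the low-scoring chunk is filtered out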

View File

@ -1,7 +1,7 @@
import logging
import tempfile
from pathlib import Path
from typing import AnyStr, BinaryIO
from typing import TYPE_CHECKING, AnyStr, BinaryIO
from injector import inject, singleton
from llama_index.core.node_parser import SentenceWindowNodeParser
@ -17,6 +17,9 @@ from private_gpt.components.vector_store.vector_store_component import (
from private_gpt.server.ingest.model import IngestedDoc
from private_gpt.settings.settings import settings
if TYPE_CHECKING:
from llama_index.core.storage.docstore.types import RefDocInfo
logger = logging.getLogger(__name__)
@ -86,17 +89,15 @@ class IngestService:
return [IngestedDoc.from_document(document) for document in documents]
def list_ingested(self) -> list[IngestedDoc]:
ingested_docs = []
ingested_docs: list[IngestedDoc] = []
try:
docstore = self.storage_context.docstore
ingested_docs_ids: set[str] = set()
ref_docs: dict[str, RefDocInfo] | None = docstore.get_all_ref_doc_info()
for node in docstore.docs.values():
if node.ref_doc_id is not None:
ingested_docs_ids.add(node.ref_doc_id)
if not ref_docs:
return ingested_docs
for doc_id in ingested_docs_ids:
ref_doc_info = docstore.get_ref_doc_info(ref_doc_id=doc_id)
for doc_id, ref_doc_info in ref_docs.items():
doc_metadata = None
if ref_doc_info is not None and ref_doc_info.metadata is not None:
doc_metadata = IngestedDoc.curate_metadata(ref_doc_info.metadata)

View File

@ -155,13 +155,14 @@ class HuggingFaceSettings(BaseModel):
class EmbeddingSettings(BaseModel):
mode: Literal["huggingface", "openai", "azopenai", "sagemaker", "ollama", "mock"]
ingest_mode: Literal["simple", "batch", "parallel"] = Field(
ingest_mode: Literal["simple", "batch", "parallel", "pipeline"] = Field(
"simple",
description=(
"The ingest mode to use for the embedding engine:\n"
"If `simple` - ingest files sequentially and one by one. It is the historic behaviour.\n"
"If `batch` - if multiple files, parse all the files in parallel, "
"and send them in batch to the embedding model.\n"
"In `pipeline` - The Embedding engine is kept as busy as possible\n"
"If `parallel` - parse the files in parallel using multiple cores, and embedd them in parallel.\n"
"`parallel` is the fastest mode for local setup, as it parallelize IO RW in the index.\n"
"For modes that leverage parallelization, you can specify the number of "
@ -174,6 +175,7 @@ class EmbeddingSettings(BaseModel):
"The number of workers to use for file ingestion.\n"
"In `batch` mode, this is the number of workers used to parse the files.\n"
"In `parallel` mode, this is the number of workers used to parse the files and embed them.\n"
"In `pipeline` mode, this is the number of workers that can perform embeddings.\n"
"This is only used if `ingest_mode` is not `simple`.\n"
"Do not go too high with this number, as it might cause memory issues. (especially in `parallel` mode)\n"
"Do not set it higher than your number of threads of your CPU."
@ -239,6 +241,10 @@ class OllamaSettings(BaseModel):
1.1,
description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
)
request_timeout: float = Field(
120.0,
description="Time elapsed until ollama times out the request. Default is 120s. Format is float. ",
)
class AzureOpenAISettings(BaseModel):
@ -278,6 +284,17 @@ class UISettings(BaseModel):
)
class RagSettings(BaseModel):
similarity_top_k: int = Field(
2,
description="This value controls the number of documents returned by the RAG pipeline",
)
similarity_value: float | None = Field(
None,
description="If set, any documents retrieved from the RAG must meet a certain match score. Acceptable values are between 0 and 1.",
)
class PostgresSettings(BaseModel):
host: str = Field(
"localhost",
@ -373,6 +390,7 @@ class Settings(BaseModel):
azopenai: AzureOpenAISettings
vectorstore: VectorstoreSettings
nodestore: NodeStoreSettings
rag: RagSettings
qdrant: QdrantSettings | None = None
postgres: PostgresSettings | None = None

private_gpt/utils/eta.py (new file, 122 lines added)
View File

@ -0,0 +1,122 @@
import datetime
import logging
import math
import time
from collections import deque
from typing import Any
logger = logging.getLogger(__name__)
def human_time(*args: Any, **kwargs: Any) -> str:
def timedelta_total_seconds(timedelta: datetime.timedelta) -> float:
return (
timedelta.microseconds
+ 0.0
+ (timedelta.seconds + timedelta.days * 24 * 3600) * 10**6
) / 10**6
secs = float(timedelta_total_seconds(datetime.timedelta(*args, **kwargs)))
# We want (ms) precision below 2 seconds
if secs < 2:
return f"{secs * 1000}ms"
units = [("y", 86400 * 365), ("d", 86400), ("h", 3600), ("m", 60), ("s", 1)]
parts = []
for unit, mul in units:
if secs / mul >= 1 or mul == 1:
if mul > 1:
n = int(math.floor(secs / mul))
secs -= n * mul
else:
# >2s we drop the (ms) component.
n = int(secs)
if n:
parts.append(f"{n}{unit}")
return " ".join(parts)
def eta(iterator: list[Any]) -> Any:
"""Report an ETA after 30s and every 60s thereafter."""
total = len(iterator)
_eta = ETA(total)
_eta.needReport(30)  # prime the timer so the first report arrives after ~30s
for processed, data in enumerate(iterator, start=1):
yield data
_eta.update(processed)
if _eta.needReport(60):
logger.info(f"{processed}/{total} - ETA {_eta.human_time()}")
class ETA:
"""Predict how long something will take to complete."""
def __init__(self, total: int):
self.total: int = total # Total expected records.
self.rate: float = 0.0 # per second
self._timing_data: deque[tuple[float, int]] = deque(maxlen=100)
self.secondsLeft: float = 0.0
self.nexttime: float = 0.0
def human_time(self) -> str:
if self._calc():
return f"{human_time(seconds=self.secondsLeft)} @ {int(self.rate * 60)}/min"
return "(computing)"
def update(self, count: int) -> None:
# count should be in the range 1 to self.total
assert count > 0
assert count <= self.total
self._timing_data.append((time.time(), count)) # (X,Y) for pearson
def needReport(self, whenSecs: int) -> bool:
now = time.time()
if now > self.nexttime:
self.nexttime = now + whenSecs
return True
return False
def _calc(self) -> bool:
# Need a few samples before a prediction; require at least three points to compute a slope.
if len(self._timing_data) < 3:
return False
# http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
# Calculate means and standard deviations.
samples = len(self._timing_data)
# column wise sum of the timing tuples to compute their mean.
mean_x, mean_y = (
sum(i) / samples for i in zip(*self._timing_data, strict=False)
)
std_x = math.sqrt(
sum(pow(i[0] - mean_x, 2) for i in self._timing_data) / (samples - 1)
)
std_y = math.sqrt(
sum(pow(i[1] - mean_y, 2) for i in self._timing_data) / (samples - 1)
)
# Calculate coefficient.
sum_xy, sum_sq_v_x, sum_sq_v_y = 0.0, 0.0, 0
for x, y in self._timing_data:
x -= mean_x
y -= mean_y
sum_xy += x * y
sum_sq_v_x += pow(x, 2)
sum_sq_v_y += pow(y, 2)
pearson_r = sum_xy / math.sqrt(sum_sq_v_x * sum_sq_v_y)
# Calculate regression line.
# y = mx + b where m is the slope and b is the y-intercept.
m = self.rate = pearson_r * (std_y / std_x)
y = self.total
b = mean_y - m * mean_x
x = (y - b) / m
# Calculate the fitted line (the regression line shifted horizontally through the latest sample).
fitted_b = self._timing_data[-1][1] - (m * self._timing_data[-1][0])
fitted_x = (y - fitted_b) / m
_, count = self._timing_data[-1] # adjust last data point progress count
adjusted_x = ((fitted_x - x) * (count / self.total)) + x
eta_epoch = adjusted_x
self.secondsLeft = max([eta_epoch - time.time(), 0])
return True
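A minimal usage sketch of the new helper; the item count, sleep, and example log line below are illustrative:

import logging
import time

from private_gpt.utils.eta import eta, human_time

logging.basicConfig(level=logging.INFO)

items = list(range(200))
for item in eta(items):  # yields each item unchanged while tracking progress
    time.sleep(0.5)      # stand-in for per-file ingestion work
# After roughly 30s the logger emits lines such as "63/200 - ETA 1m 8s @ 120/min".

print(human_time(seconds=4000))  # -> "1h 6m 40s"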

View File

@ -10,7 +10,7 @@ from private_gpt.settings.settings import settings
resume_download = True
if __name__ == '__main__':
parser = argparse.ArgumentParser(prog='Setup: Download models from huggingface')
parser = argparse.ArgumentParser(prog='Setup: Download models from Hugging Face')
parser.add_argument('--resume', default=True, action=argparse.BooleanOptionalAction, help='Enable/disable resume_download to resume a previously interrupted download')
args = parser.parse_args()
resume_download = args.resume

View File

@ -2,9 +2,35 @@ import argparse
import os
import shutil
from private_gpt.paths import local_data_path
from private_gpt.settings.settings import settings
def wipe():
path = "local_data"
def wipe() -> None:
WIPE_MAP = {
"simple": wipe_simple, # node store
"chroma": wipe_chroma, # vector store
"postgres": wipe_postgres, # node, index and vector store
}
for dbtype in ("nodestore", "vectorstore"):
database = getattr(settings(), dbtype).database
func = WIPE_MAP.get(database)
if func:
func(dbtype)
else:
print(f"Unable to wipe database '{database}' for '{dbtype}'")
def wipe_file(file: str) -> None:
if os.path.isfile(file):
os.remove(file)
print(f" - Deleted {file}")
def wipe_tree(path: str) -> None:
if not os.path.exists(path):
print(f"Warning: Path not found {path}")
return
print(f"Wiping {path}...")
all_files = os.listdir(path)
@ -24,6 +50,54 @@ def wipe():
continue
def wipe_simple(dbtype: str) -> None:
assert dbtype == "nodestore"
from llama_index.core.storage.docstore.types import (
DEFAULT_PERSIST_FNAME as DOCSTORE,
)
from llama_index.core.storage.index_store.types import (
DEFAULT_PERSIST_FNAME as INDEXSTORE,
)
for store in (DOCSTORE, INDEXSTORE):
wipe_file(str((local_data_path / store).absolute()))
def wipe_postgres(dbtype: str) -> None:
try:
import psycopg2
except ImportError as e:
raise ImportError("Postgres dependencies not found") from e
cur = conn = None
try:
tables = {
"nodestore": ["data_docstore", "data_indexstore"],
"vectorstore": ["data_embeddings"],
}[dbtype]
connection = settings().postgres.model_dump(exclude_none=True)
schema = connection.pop("schema_name")
conn = psycopg2.connect(**connection)
cur = conn.cursor()
for table in tables:
sql = f"DROP TABLE IF EXISTS {schema}.{table}"
cur.execute(sql)
print(f"Table {schema}.{table} dropped.")
conn.commit()
except psycopg2.Error as e:
print("Error:", e)
finally:
if cur:
cur.close()
if conn:
conn.close()
def wipe_chroma(dbtype: str) -> None:
assert dbtype == "vectorstore"
wipe_tree(str((local_data_path / "chroma_db").absolute()))
if __name__ == "__main__":
commands = {
"wipe": wipe,

View File

@ -1,3 +1,4 @@
# poetry install --extras "ui llms-llama-cpp vector-stores-qdrant embeddings-huggingface"
server:
env_name: ${APP_ENV:local}

View File

@ -14,11 +14,12 @@ ollama:
llm_model: mistral
embedding_model: nomic-embed-text
api_base: http://localhost:11434
tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
repeat_last_n: 64 # Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
repeat_penalty: 1.2 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
repeat_last_n: 64 # Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
repeat_penalty: 1.2 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
request_timeout: 120.0 # Time elapsed until ollama times out the request. Default is 120s. Format is float.
vectorstore:
database: qdrant
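For orientation, a minimal sketch of where `request_timeout` ends up, assuming the llama_index Ollama wrapper that LLMComponent configures above; the model name is just this profile's default:

from llama_index.llms.ollama import Ollama

llm = Ollama(
    model="mistral",
    base_url="http://localhost:11434",
    request_timeout=120.0,  # seconds; raise this if a large local model needs more than two minutes per request
)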

View File

@ -42,6 +42,12 @@ llm:
tokenizer: mistralai/Mistral-7B-Instruct-v0.2
temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)
rag:
similarity_top_k: 2
# This value controls how many "top" documents the RAG returns to use in the context.
#similarity_value: 0.45
# This value is disabled by default. If you enable this setting, the RAG will only use documents that meet a certain match score.
llamacpp:
prompt_style: "mistral"
llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
@ -89,6 +95,7 @@ ollama:
llm_model: llama2
embedding_model: nomic-embed-text
api_base: http://localhost:11434
request_timeout: 120.0
azopenai:
api_key: ${AZ_OPENAI_API_KEY:}