Merge branch 'zylon-ai:main' into update-ui-include-model-info-#1647
commit 97b8999933
@@ -10,5 +10,4 @@ services:
     environment:
       PORT: 8080
       PGPT_PROFILES: docker
-      PGPT_MODE: local
+      PGPT_MODE: llamacpp
-
@@ -8,14 +8,14 @@ The clients are kept up to date automatically, so we encourage you to use the la
 
 <Cards>
   <Card
-    title="Node.js/TypeScript"
+    title="Node.js/TypeScript - WIP"
     icon="fa-brands fa-node"
     href="https://github.com/imartinez/privateGPT-typescript"
   />
   <Card
-    title="Python"
+    title="Python - Ready!"
     icon="fa-brands fa-python"
-    href="https://github.com/imartinez/privateGPT-python"
+    href="https://github.com/imartinez/pgpt_python"
   />
   <br />
 </Cards>
@@ -24,12 +24,12 @@ The clients are kept up to date automatically, so we encourage you to use the la
 
 <Cards>
   <Card
-    title="Java"
+    title="Java - WIP"
     icon="fa-brands fa-java"
     href="https://github.com/imartinez/privateGPT-java"
   />
   <Card
-    title="Go"
+    title="Go - WIP"
     icon="fa-brands fa-golang"
     href="https://github.com/imartinez/privateGPT-go"
   />
@@ -62,6 +62,7 @@ The following ingestion mode exist:
 * `simple`: historic behavior, ingest one document at a time, sequentially
 * `batch`: read, parse, and embed multiple documents using batches (batch read, and then batch parse, and then batch embed)
 * `parallel`: read, parse, and embed multiple documents in parallel. This is the fastest ingestion mode for local setup.
+* `pipeline`: Alternative to parallel.
 To change the ingestion mode, you can use the `embedding.ingest_mode` configuration value. The default value is `simple`.
 
 To configure the number of workers used for parallel or batched ingestion, you can use
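The `pipeline` mode added here overlaps file parsing with embedding by pushing parsed documents through a bounded queue to embedding workers. A minimal standalone sketch of that producer/consumer pattern, with a single consumer thread for brevity (the real component adds a worker pool and a semaphore on top; `parse_file` and `embed_documents` are hypothetical stand-ins, not the PrivateGPT API):

```python
import threading
from queue import Queue

def parse_file(path: str) -> list[str]:
    return [f"document parsed from {path}"]  # stand-in for file parsing

def embed_documents(docs: list[str]) -> list[list[float]]:
    return [[0.0] * 8 for _ in docs]  # stand-in for the embedding model

# Bounded queue: the parser blocks when embedding falls behind, capping memory use.
doc_q: Queue[list[str] | None] = Queue(maxsize=20)

def embed_worker() -> None:
    while True:
        docs = doc_q.get()
        try:
            if docs is None:  # sentinel: no more files
                break
            embed_documents(docs)
        finally:
            doc_q.task_done()

threading.Thread(target=embed_worker, daemon=True).start()
for path in ["a.txt", "b.txt"]:
    doc_q.put(parse_file(path))  # producer: read and parse while the worker embeds
doc_q.put(None)
doc_q.join()  # wait until every queued batch has been processed
```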
@@ -1,4 +1,4 @@
 {
   "organization": "privategpt",
-  "version": "0.17.2"
+  "version": "0.19.10"
 }
@@ -6,6 +6,7 @@ import multiprocessing.pool
 import os
 import threading
 from pathlib import Path
+from queue import Queue
 from typing import Any
 
 from llama_index.core.data_structs import IndexDict
@@ -13,12 +14,13 @@ from llama_index.core.embeddings.utils import EmbedType
 from llama_index.core.indices import VectorStoreIndex, load_index_from_storage
 from llama_index.core.indices.base import BaseIndex
 from llama_index.core.ingestion import run_transformations
-from llama_index.core.schema import Document, TransformComponent
+from llama_index.core.schema import BaseNode, Document, TransformComponent
 from llama_index.core.storage import StorageContext
 
 from private_gpt.components.ingest.ingest_helper import IngestionHelper
 from private_gpt.paths import local_data_path
 from private_gpt.settings.settings import Settings
+from private_gpt.utils.eta import eta
 
 logger = logging.getLogger(__name__)
 
@@ -314,6 +316,170 @@ class ParallelizedIngestComponent(BaseIngestComponentWithIndex):
         self._file_to_documents_work_pool.terminate()
 
 
+class PipelineIngestComponent(BaseIngestComponentWithIndex):
+    """Pipeline ingestion - keeping the embedding worker pool as busy as possible.
+
+    This class implements a threaded ingestion pipeline, which comprises two threads
+    and two queues. The primary thread is responsible for reading and parsing files
+    into documents. These documents are then placed into a queue, which is
+    distributed to a pool of worker processes for embedding computation. After
+    embedding, the documents are transferred to another queue where they are
+    accumulated until a threshold is reached. Upon reaching this threshold, the
+    accumulated documents are flushed to the document store, index, and vector
+    store.
+
+    Exception handling ensures robustness against erroneous files. However, in the
+    pipelined design, one error can lead to the discarding of multiple files. Any
+    discarded files will be reported.
+    """
+
+    NODE_FLUSH_COUNT = 5000  # Save the index every # nodes.
+
+    def __init__(
+        self,
+        storage_context: StorageContext,
+        embed_model: EmbedType,
+        transformations: list[TransformComponent],
+        count_workers: int,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(storage_context, embed_model, transformations, *args, **kwargs)
+        self.count_workers = count_workers
+        assert (
+            len(self.transformations) >= 2
+        ), "Embeddings must be in the transformations"
+        assert count_workers > 0, "count_workers must be > 0"
+        self.count_workers = count_workers
+        # We are doing our own multiprocessing
+        # To do not collide with the multiprocessing of huggingface, we disable it
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+        # doc_q stores parsed files as Document chunks.
+        # Using a shallow queue causes the filesystem parser to block
+        # when it reaches capacity. This ensures it doesn't outpace the
+        # computationally intensive embeddings phase, avoiding unnecessary
+        # memory consumption. The semaphore is used to bound the async worker
+        # embedding computations to cause the doc Q to fill and block.
+        self.doc_semaphore = multiprocessing.Semaphore(
+            self.count_workers
+        )  # limit the doc queue to # items.
+        self.doc_q: Queue[tuple[str, str | None, list[Document] | None]] = Queue(20)
+        # node_q stores documents parsed into nodes (embeddings).
+        # Larger queue size so we don't block the embedding workers during a slow
+        # index update.
+        self.node_q: Queue[
+            tuple[str, str | None, list[Document] | None, list[BaseNode] | None]
+        ] = Queue(40)
+        threading.Thread(target=self._doc_to_node, daemon=True).start()
+        threading.Thread(target=self._write_nodes, daemon=True).start()
+
+    def _doc_to_node(self) -> None:
+        # Parse documents into nodes
+        with multiprocessing.pool.ThreadPool(processes=self.count_workers) as pool:
+            while True:
+                try:
+                    cmd, file_name, documents = self.doc_q.get(
+                        block=True
+                    )  # Documents for a file
+                    if cmd == "process":
+                        # Push CPU/GPU embedding work to the worker pool
+                        # Acquire semaphore to control access to worker pool
+                        self.doc_semaphore.acquire()
+                        pool.apply_async(
+                            self._doc_to_node_worker, (file_name, documents)
+                        )
+                    elif cmd == "quit":
+                        break
+                finally:
+                    if cmd != "process":
+                        self.doc_q.task_done()  # unblock Q joins
+
+    def _doc_to_node_worker(self, file_name: str, documents: list[Document]) -> None:
+        # CPU/GPU intensive work in its own process
+        try:
+            nodes = run_transformations(
+                documents,  # type: ignore[arg-type]
+                self.transformations,
+                show_progress=self.show_progress,
+            )
+            self.node_q.put(("process", file_name, documents, nodes))
+        finally:
+            self.doc_semaphore.release()
+            self.doc_q.task_done()  # unblock Q joins
+
+    def _save_docs(
+        self, files: list[str], documents: list[Document], nodes: list[BaseNode]
+    ) -> None:
+        try:
+            logger.info(
+                f"Saving {len(files)} files ({len(documents)} documents / {len(nodes)} nodes)"
+            )
+            self._index.insert_nodes(nodes)
+            for document in documents:
+                self._index.docstore.set_document_hash(
+                    document.get_doc_id(), document.hash
+                )
+            self._save_index()
+        except Exception:
+            # Tell the user so they can investigate these files
+            logger.exception(f"Processing files {files}")
+        finally:
+            # Clearing work, even on exception, maintains a clean state.
+            nodes.clear()
+            documents.clear()
+            files.clear()
+
+    def _write_nodes(self) -> None:
+        # Save nodes to index. I/O intensive.
+        node_stack: list[BaseNode] = []
+        doc_stack: list[Document] = []
+        file_stack: list[str] = []
+        while True:
+            try:
+                cmd, file_name, documents, nodes = self.node_q.get(block=True)
+                if cmd in ("flush", "quit"):
+                    if file_stack:
+                        self._save_docs(file_stack, doc_stack, node_stack)
+                    if cmd == "quit":
+                        break
+                elif cmd == "process":
+                    node_stack.extend(nodes)  # type: ignore[arg-type]
+                    doc_stack.extend(documents)  # type: ignore[arg-type]
+                    file_stack.append(file_name)  # type: ignore[arg-type]
+                    # Constant saving is heavy on I/O - accumulate to a threshold
+                    if len(node_stack) >= self.NODE_FLUSH_COUNT:
+                        self._save_docs(file_stack, doc_stack, node_stack)
+            finally:
+                self.node_q.task_done()
+
+    def _flush(self) -> None:
+        self.doc_q.put(("flush", None, None))
+        self.doc_q.join()
+        self.node_q.put(("flush", None, None, None))
+        self.node_q.join()
+
+    def ingest(self, file_name: str, file_data: Path) -> list[Document]:
+        documents = IngestionHelper.transform_file_into_documents(file_name, file_data)
+        self.doc_q.put(("process", file_name, documents))
+        self._flush()
+        return documents
+
+    def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[Document]:
+        docs = []
+        for file_name, file_data in eta(files):
+            try:
+                documents = IngestionHelper.transform_file_into_documents(
+                    file_name, file_data
+                )
+                self.doc_q.put(("process", file_name, documents))
+                docs.extend(documents)
+            except Exception:
+                logger.exception(f"Skipping {file_data.name}")
+        self._flush()
+        return docs
+
+
 def get_ingestion_component(
     storage_context: StorageContext,
     embed_model: EmbedType,
@@ -336,6 +502,13 @@ def get_ingestion_component(
             transformations=transformations,
             count_workers=settings.embedding.count_workers,
         )
+    elif ingest_mode == "pipeline":
+        return PipelineIngestComponent(
+            storage_context=storage_context,
+            embed_model=embed_model,
+            transformations=transformations,
+            count_workers=settings.embedding.count_workers,
+        )
     else:
         return SimpleIngestComponent(
             storage_context=storage_context,
@@ -131,6 +131,7 @@ class LLMComponent:
                     temperature=settings.llm.temperature,
                     context_window=settings.llm.context_window,
                     additional_kwargs=settings_kwargs,
+                    request_timeout=ollama_settings.request_timeout,
                 )
             case "azopenai":
                 try:
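The new `request_timeout` is forwarded to the llama-index Ollama client. A minimal sketch of the same call outside PrivateGPT (assuming the `llama-index-llms-ollama` package and a locally pulled model; the values are illustrative, PrivateGPT fills them from its `ollama` settings block):

```python
from llama_index.llms.ollama import Ollama

llm = Ollama(
    model="llama2",
    base_url="http://localhost:11434",
    request_timeout=120.0,  # seconds to wait before the HTTP request to Ollama is aborted
)
print(llm.complete("Reply with one short sentence."))
```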
@@ -8,6 +8,9 @@ from llama_index.core.chat_engine.types import (
 from llama_index.core.indices import VectorStoreIndex
 from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor
 from llama_index.core.llms import ChatMessage, MessageRole
+from llama_index.core.postprocessor import (
+    SimilarityPostprocessor,
+)
 from llama_index.core.storage import StorageContext
 from llama_index.core.types import TokenGen
 from pydantic import BaseModel
@@ -20,6 +23,7 @@ from private_gpt.components.vector_store.vector_store_component import (
 )
 from private_gpt.open_ai.extensions.context_filter import ContextFilter
 from private_gpt.server.chunks.chunks_service import Chunk
+from private_gpt.settings.settings import Settings
 
 
 class Completion(BaseModel):
@@ -68,14 +72,18 @@ class ChatEngineInput:
 
 @singleton
 class ChatService:
+    settings: Settings
+
     @inject
     def __init__(
         self,
+        settings: Settings,
         llm_component: LLMComponent,
         vector_store_component: VectorStoreComponent,
         embedding_component: EmbeddingComponent,
         node_store_component: NodeStoreComponent,
     ) -> None:
+        self.settings = settings
         self.llm_component = llm_component
         self.embedding_component = embedding_component
         self.vector_store_component = vector_store_component
@@ -98,9 +106,12 @@ class ChatService:
         use_context: bool = False,
         context_filter: ContextFilter | None = None,
     ) -> BaseChatEngine:
+        settings = self.settings
         if use_context:
             vector_index_retriever = self.vector_store_component.get_retriever(
-                index=self.index, context_filter=context_filter
+                index=self.index,
+                context_filter=context_filter,
+                similarity_top_k=self.settings.rag.similarity_top_k,
             )
             return ContextChatEngine.from_defaults(
                 system_prompt=system_prompt,
@@ -108,6 +119,9 @@ class ChatService:
                 llm=self.llm_component.llm,  # Takes no effect at the moment
                 node_postprocessors=[
                     MetadataReplacementPostProcessor(target_metadata_key="window"),
+                    SimilarityPostprocessor(
+                        similarity_cutoff=settings.rag.similarity_value
+                    ),
                 ],
             )
         else:
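Both postprocessors run over the retrieved nodes before they reach the LLM. A standalone sketch of the similarity cutoff on hypothetical nodes (not the chat-engine wiring above):

```python
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.schema import NodeWithScore, TextNode

nodes = [
    NodeWithScore(node=TextNode(text="closely related chunk"), score=0.82),
    NodeWithScore(node=TextNode(text="barely related chunk"), score=0.31),
]
# Nodes scoring below the cutoff are dropped from the context window.
kept = SimilarityPostprocessor(similarity_cutoff=0.45).postprocess_nodes(nodes)
print([node.score for node in kept])  # [0.82]
```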
@@ -1,7 +1,7 @@
 import logging
 import tempfile
 from pathlib import Path
-from typing import AnyStr, BinaryIO
+from typing import TYPE_CHECKING, AnyStr, BinaryIO
 
 from injector import inject, singleton
 from llama_index.core.node_parser import SentenceWindowNodeParser
@@ -17,6 +17,9 @@ from private_gpt.components.vector_store.vector_store_component import (
 from private_gpt.server.ingest.model import IngestedDoc
 from private_gpt.settings.settings import settings
 
+if TYPE_CHECKING:
+    from llama_index.core.storage.docstore.types import RefDocInfo
+
 logger = logging.getLogger(__name__)
 
 
@@ -86,17 +89,15 @@ class IngestService:
         return [IngestedDoc.from_document(document) for document in documents]
 
     def list_ingested(self) -> list[IngestedDoc]:
-        ingested_docs = []
+        ingested_docs: list[IngestedDoc] = []
         try:
             docstore = self.storage_context.docstore
-            ingested_docs_ids: set[str] = set()
+            ref_docs: dict[str, RefDocInfo] | None = docstore.get_all_ref_doc_info()
 
-            for node in docstore.docs.values():
-                if node.ref_doc_id is not None:
-                    ingested_docs_ids.add(node.ref_doc_id)
+            if not ref_docs:
+                return ingested_docs
 
-            for doc_id in ingested_docs_ids:
-                ref_doc_info = docstore.get_ref_doc_info(ref_doc_id=doc_id)
+            for doc_id, ref_doc_info in ref_docs.items():
                 doc_metadata = None
                 if ref_doc_info is not None and ref_doc_info.metadata is not None:
                     doc_metadata = IngestedDoc.curate_metadata(ref_doc_info.metadata)
@@ -155,13 +155,14 @@ class HuggingFaceSettings(BaseModel):
 
 class EmbeddingSettings(BaseModel):
     mode: Literal["huggingface", "openai", "azopenai", "sagemaker", "ollama", "mock"]
-    ingest_mode: Literal["simple", "batch", "parallel"] = Field(
+    ingest_mode: Literal["simple", "batch", "parallel", "pipeline"] = Field(
         "simple",
         description=(
             "The ingest mode to use for the embedding engine:\n"
             "If `simple` - ingest files sequentially and one by one. It is the historic behaviour.\n"
             "If `batch` - if multiple files, parse all the files in parallel, "
             "and send them in batch to the embedding model.\n"
+            "In `pipeline` - The Embedding engine is kept as busy as possible\n"
             "If `parallel` - parse the files in parallel using multiple cores, and embedd them in parallel.\n"
             "`parallel` is the fastest mode for local setup, as it parallelize IO RW in the index.\n"
             "For modes that leverage parallelization, you can specify the number of "
@@ -174,6 +175,7 @@ class EmbeddingSettings(BaseModel):
             "The number of workers to use for file ingestion.\n"
             "In `batch` mode, this is the number of workers used to parse the files.\n"
             "In `parallel` mode, this is the number of workers used to parse the files and embed them.\n"
+            "In `pipeline` mode, this is the number of workers that can perform embeddings.\n"
             "This is only used if `ingest_mode` is not `simple`.\n"
             "Do not go too high with this number, as it might cause memory issues. (especially in `parallel` mode)\n"
             "Do not set it higher than your number of threads of your CPU."
@@ -239,6 +241,10 @@ class OllamaSettings(BaseModel):
         1.1,
         description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
     )
+    request_timeout: float = Field(
+        120.0,
+        description="Time elapsed until ollama times out the request. Default is 120s. Format is float. ",
+    )
 
 
 class AzureOpenAISettings(BaseModel):
@@ -278,6 +284,17 @@ class UISettings(BaseModel):
     )
 
 
+class RagSettings(BaseModel):
+    similarity_top_k: int = Field(
+        2,
+        description="This value controls the number of documents returned by the RAG pipeline",
+    )
+    similarity_value: float = Field(
+        None,
+        description="If set, any documents retrieved from the RAG must meet a certain match score. Acceptable values are between 0 and 1.",
+    )
+
+
 class PostgresSettings(BaseModel):
     host: str = Field(
         "localhost",
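A minimal sketch of how these fields resolve once a profile is loaded (assuming pydantic v2; `similarity_value` is typed as optional here so the unset default is explicit, which the real model leaves implicit):

```python
from pydantic import BaseModel, Field

class RagSettings(BaseModel):
    similarity_top_k: int = Field(2, description="Documents returned by the RAG pipeline")
    similarity_value: float | None = Field(None, description="Optional minimum match score (0-1)")

# Only keys present in the YAML need to be supplied; the rest fall back to defaults.
rag = RagSettings(**{"similarity_top_k": 4})
print(rag.similarity_top_k, rag.similarity_value)  # 4 None
```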
@@ -373,6 +390,7 @@ class Settings(BaseModel):
     azopenai: AzureOpenAISettings
     vectorstore: VectorstoreSettings
     nodestore: NodeStoreSettings
+    rag: RagSettings
     qdrant: QdrantSettings | None = None
     postgres: PostgresSettings | None = None
 
private_gpt/utils/eta.py (new file, 122 lines)
@@ -0,0 +1,122 @@
+import datetime
+import logging
+import math
+import time
+from collections import deque
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+def human_time(*args: Any, **kwargs: Any) -> str:
+    def timedelta_total_seconds(timedelta: datetime.timedelta) -> float:
+        return (
+            timedelta.microseconds
+            + 0.0
+            + (timedelta.seconds + timedelta.days * 24 * 3600) * 10**6
+        ) / 10**6
+
+    secs = float(timedelta_total_seconds(datetime.timedelta(*args, **kwargs)))
+    # We want (ms) precision below 2 seconds
+    if secs < 2:
+        return f"{secs * 1000}ms"
+    units = [("y", 86400 * 365), ("d", 86400), ("h", 3600), ("m", 60), ("s", 1)]
+    parts = []
+    for unit, mul in units:
+        if secs / mul >= 1 or mul == 1:
+            if mul > 1:
+                n = int(math.floor(secs / mul))
+                secs -= n * mul
+            else:
+                # >2s we drop the (ms) component.
+                n = int(secs)
+            if n:
+                parts.append(f"{n}{unit}")
+    return " ".join(parts)
+
+
+def eta(iterator: list[Any]) -> Any:
+    """Report an ETA after 30s and every 60s thereafter."""
+    total = len(iterator)
+    _eta = ETA(total)
+    _eta.needReport(30)
+    for processed, data in enumerate(iterator, start=1):
+        yield data
+        _eta.update(processed)
+        if _eta.needReport(60):
+            logger.info(f"{processed}/{total} - ETA {_eta.human_time()}")
+
+
+class ETA:
+    """Predict how long something will take to complete."""
+
+    def __init__(self, total: int):
+        self.total: int = total  # Total expected records.
+        self.rate: float = 0.0  # per second
+        self._timing_data: deque[tuple[float, int]] = deque(maxlen=100)
+        self.secondsLeft: float = 0.0
+        self.nexttime: float = 0.0
+
+    def human_time(self) -> str:
+        if self._calc():
+            return f"{human_time(seconds=self.secondsLeft)} @ {int(self.rate * 60)}/min"
+        return "(computing)"
+
+    def update(self, count: int) -> None:
+        # count should be in the range 0 to self.total
+        assert count > 0
+        assert count <= self.total
+        self._timing_data.append((time.time(), count))  # (X,Y) for pearson
+
+    def needReport(self, whenSecs: int) -> bool:
+        now = time.time()
+        if now > self.nexttime:
+            self.nexttime = now + whenSecs
+            return True
+        return False
+
+    def _calc(self) -> bool:
+        # A sample before a prediction. Need two points to compute slope!
+        if len(self._timing_data) < 3:
+            return False
+
+        # http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
+        # Calculate means and standard deviations.
+        samples = len(self._timing_data)
+        # column wise sum of the timing tuples to compute their mean.
+        mean_x, mean_y = (
+            sum(i) / samples for i in zip(*self._timing_data, strict=False)
+        )
+        std_x = math.sqrt(
+            sum(pow(i[0] - mean_x, 2) for i in self._timing_data) / (samples - 1)
+        )
+        std_y = math.sqrt(
+            sum(pow(i[1] - mean_y, 2) for i in self._timing_data) / (samples - 1)
+        )
+
+        # Calculate coefficient.
+        sum_xy, sum_sq_v_x, sum_sq_v_y = 0.0, 0.0, 0
+        for x, y in self._timing_data:
+            x -= mean_x
+            y -= mean_y
+            sum_xy += x * y
+            sum_sq_v_x += pow(x, 2)
+            sum_sq_v_y += pow(y, 2)
+        pearson_r = sum_xy / math.sqrt(sum_sq_v_x * sum_sq_v_y)
+
+        # Calculate regression line.
+        # y = mx + b where m is the slope and b is the y-intercept.
+        m = self.rate = pearson_r * (std_y / std_x)
+        y = self.total
+        b = mean_y - m * mean_x
+        x = (y - b) / m
+
+        # Calculate fitted line (transformed/shifted regression line horizontally).
+        fitted_b = self._timing_data[-1][1] - (m * self._timing_data[-1][0])
+        fitted_x = (y - fitted_b) / m
+        _, count = self._timing_data[-1]  # adjust last data point progress count
+        adjusted_x = ((fitted_x - x) * (count / self.total)) + x
+        eta_epoch = adjusted_x
+
+        self.secondsLeft = max([eta_epoch - time.time(), 0])
+        return True

@@ -10,7 +10,7 @@ from private_gpt.settings.settings import settings
 
 resume_download = True
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(prog='Setup: Download models from huggingface')
+    parser = argparse.ArgumentParser(prog='Setup: Download models from Hugging Face')
     parser.add_argument('--resume', default=True, action=argparse.BooleanOptionalAction, help='Enable/Disable resume_download options to restart the download progress interrupted')
    args = parser.parse_args()
    resume_download = args.resume
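A short usage sketch of the `human_time` and `eta` helpers introduced in `private_gpt/utils/eta.py` above (standalone; the sleep stands in for per-file ingestion work):

```python
import time

from private_gpt.utils.eta import eta, human_time

print(human_time(seconds=4250))  # "1h 10m 50s"

for item in eta(list(range(200))):
    time.sleep(0.01)  # stand-in for parsing/embedding one file
# eta() yields items unchanged and, on long runs, logs "<done>/<total> - ETA ..." every 60s.
```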
@@ -2,9 +2,35 @@ import argparse
 import os
 import shutil
 
+from private_gpt.paths import local_data_path
+from private_gpt.settings.settings import settings
+
 
-def wipe():
-    path = "local_data"
+def wipe() -> None:
+    WIPE_MAP = {
+        "simple": wipe_simple,  # node store
+        "chroma": wipe_chroma,  # vector store
+        "postgres": wipe_postgres,  # node, index and vector store
+    }
+    for dbtype in ("nodestore", "vectorstore"):
+        database = getattr(settings(), dbtype).database
+        func = WIPE_MAP.get(database)
+        if func:
+            func(dbtype)
+        else:
+            print(f"Unable to wipe database '{database}' for '{dbtype}'")
+
+
+def wipe_file(file: str) -> None:
+    if os.path.isfile(file):
+        os.remove(file)
+        print(f" - Deleted {file}")
+
+
+def wipe_tree(path: str) -> None:
+    if not os.path.exists(path):
+        print(f"Warning: Path not found {path}")
+        return
     print(f"Wiping {path}...")
     all_files = os.listdir(path)
 
@@ -24,6 +50,54 @@ def wipe():
             continue
 
 
+def wipe_simple(dbtype: str) -> None:
+    assert dbtype == "nodestore"
+    from llama_index.core.storage.docstore.types import (
+        DEFAULT_PERSIST_FNAME as DOCSTORE,
+    )
+    from llama_index.core.storage.index_store.types import (
+        DEFAULT_PERSIST_FNAME as INDEXSTORE,
+    )
+
+    for store in (DOCSTORE, INDEXSTORE):
+        wipe_file(str((local_data_path / store).absolute()))
+
+
+def wipe_postgres(dbtype: str) -> None:
+    try:
+        import psycopg2
+    except ImportError as e:
+        raise ImportError("Postgres dependencies not found") from e
+
+    cur = conn = None
+    try:
+        tables = {
+            "nodestore": ["data_docstore", "data_indexstore"],
+            "vectorstore": ["data_embeddings"],
+        }[dbtype]
+        connection = settings().postgres.model_dump(exclude_none=True)
+        schema = connection.pop("schema_name")
+        conn = psycopg2.connect(**connection)
+        cur = conn.cursor()
+        for table in tables:
+            sql = f"DROP TABLE IF EXISTS {schema}.{table}"
+            cur.execute(sql)
+            print(f"Table {schema}.{table} dropped.")
+        conn.commit()
+    except psycopg2.Error as e:
+        print("Error:", e)
+    finally:
+        if cur:
+            cur.close()
+        if conn:
+            conn.close()
+
+
+def wipe_chroma(dbtype: str):
+    assert dbtype == "vectorstore"
+    wipe_tree(str((local_data_path / "chroma_db").absolute()))
+
+
 if __name__ == "__main__":
     commands = {
         "wipe": wipe,
@@ -1,3 +1,4 @@
+# poetry install --extras "ui llms-llama-cpp vector-stores-qdrant embeddings-huggingface"
 server:
   env_name: ${APP_ENV:local}
 
@@ -19,6 +19,7 @@ ollama:
   top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
   repeat_last_n: 64 # Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
   repeat_penalty: 1.2 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
+  request_timeout: 120.0 # Time elapsed until ollama times out the request. Default is 120s. Format is float.
 
 vectorstore:
   database: qdrant
@@ -42,6 +42,12 @@ llm:
   tokenizer: mistralai/Mistral-7B-Instruct-v0.2
   temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)
 
+rag:
+  similarity_top_k: 2
+  #This value controls how many "top" documents the RAG returns to use in the context.
+  #similarity_value: 0.45
+  #This value is disabled by default. If you enable this settings, the RAG will only use articles that meet a certain percentage score.
+
 llamacpp:
   prompt_style: "mistral"
   llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
@@ -89,6 +95,7 @@ ollama:
   llm_model: llama2
   embedding_model: nomic-embed-text
   api_base: http://localhost:11434
+  request_timeout: 120.0
 
 azopenai:
   api_key: ${AZ_OPENAI_API_KEY:}