Merge branch 'zylon-ai:main' into update-ui-include-model-info-#1647

Ingrid Stevens 2024-03-27 12:25:36 +01:00 committed by GitHub
commit 97b8999933
15 changed files with 439 additions and 27 deletions

View File

@ -10,5 +10,4 @@ services:
environment:
PORT: 8080
PGPT_PROFILES: docker
PGPT_MODE: local
PGPT_MODE: llamacpp

View File

@ -8,14 +8,14 @@ The clients are kept up to date automatically, so we encourage you to use the la
<Cards>
<Card
title="Node.js/TypeScript"
title="Node.js/TypeScript - WIP"
icon="fa-brands fa-node"
href="https://github.com/imartinez/privateGPT-typescript"
/>
<Card
title="Python"
title="Python - Ready!"
icon="fa-brands fa-python"
href="https://github.com/imartinez/privateGPT-python"
href="https://github.com/imartinez/pgpt_python"
/>
<br />
</Cards>
@ -24,12 +24,12 @@ The clients are kept up to date automatically, so we encourage you to use the la
<Cards>
<Card
title="Java"
title="Java - WIP"
icon="fa-brands fa-java"
href="https://github.com/imartinez/privateGPT-java"
/>
<Card
title="Go"
title="Go - WIP"
icon="fa-brands fa-golang"
href="https://github.com/imartinez/privateGPT-go"
/>

View File

@ -62,6 +62,7 @@ The following ingestion modes exist:
* `simple`: historic behavior, ingest one document at a time, sequentially
* `batch`: read, parse, and embed multiple documents using batches (batch read, and then batch parse, and then batch embed)
* `parallel`: read, parse, and embed multiple documents in parallel. This is the fastest ingestion mode for local setup.
* `pipeline`: alternative to `parallel`; it keeps the embedding workers as busy as possible by streaming parsed documents to them.
To change the ingestion mode, you can use the `embedding.ingest_mode` configuration value. The default value is `simple`.
To configure the number of workers used for parallel or batched ingestion, you can use
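For illustration only, a minimal sketch of reading these values at runtime, assuming the `settings()` accessor and the `embedding` settings group used elsewhere in this change set:

from private_gpt.settings.settings import settings

embedding_settings = settings().embedding
print(embedding_settings.ingest_mode)    # "simple" by default; "pipeline" selects the new mode
print(embedding_settings.count_workers)  # workers shared by the batch, parallel and pipeline modes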

View File

@ -1,4 +1,4 @@
{
"organization": "privategpt",
"version": "0.17.2"
"version": "0.19.10"
}

View File

@ -6,6 +6,7 @@ import multiprocessing.pool
import os
import threading
from pathlib import Path
from queue import Queue
from typing import Any
from llama_index.core.data_structs import IndexDict
@ -13,12 +14,13 @@ from llama_index.core.embeddings.utils import EmbedType
from llama_index.core.indices import VectorStoreIndex, load_index_from_storage
from llama_index.core.indices.base import BaseIndex
from llama_index.core.ingestion import run_transformations
from llama_index.core.schema import Document, TransformComponent
from llama_index.core.schema import BaseNode, Document, TransformComponent
from llama_index.core.storage import StorageContext
from private_gpt.components.ingest.ingest_helper import IngestionHelper
from private_gpt.paths import local_data_path
from private_gpt.settings.settings import Settings
from private_gpt.utils.eta import eta
logger = logging.getLogger(__name__)
@ -314,6 +316,170 @@ class ParallelizedIngestComponent(BaseIngestComponentWithIndex):
self._file_to_documents_work_pool.terminate()
class PipelineIngestComponent(BaseIngestComponentWithIndex):
"""Pipeline ingestion - keeping the embedding worker pool as busy as possible.
This class implements a threaded ingestion pipeline, which comprises two threads
and two queues. The primary thread is responsible for reading and parsing files
into documents. These documents are then placed into a queue, which is
distributed to a pool of worker processes for embedding computation. After
embedding, the documents are transferred to another queue where they are
accumulated until a threshold is reached. Upon reaching this threshold, the
accumulated documents are flushed to the document store, index, and vector
store.
Exception handling ensures robustness against erroneous files. However, in the
pipelined design, one error can lead to the discarding of multiple files. Any
discarded files will be reported.
"""
NODE_FLUSH_COUNT = 5000 # Save the index after this many nodes have accumulated.
def __init__(
self,
storage_context: StorageContext,
embed_model: EmbedType,
transformations: list[TransformComponent],
count_workers: int,
*args: Any,
**kwargs: Any,
) -> None:
super().__init__(storage_context, embed_model, transformations, *args, **kwargs)
self.count_workers = count_workers
assert (
len(self.transformations) >= 2
), "Embeddings must be in the transformations"
assert count_workers > 0, "count_workers must be > 0"
# We are doing our own multiprocessing, so disable the Hugging Face
# tokenizers' parallelism to avoid colliding with it
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# doc_q stores parsed files as Document chunks.
# Using a shallow queue causes the filesystem parser to block
# when it reaches capacity. This ensures it doesn't outpace the
# computationally intensive embeddings phase, avoiding unnecessary
# memory consumption. The semaphore is used to bound the async worker
# embedding computations to cause the doc Q to fill and block.
self.doc_semaphore = multiprocessing.Semaphore(
self.count_workers
) # bound the number of concurrent embedding tasks to count_workers
self.doc_q: Queue[tuple[str, str | None, list[Document] | None]] = Queue(20)
# node_q stores documents parsed into nodes (embeddings).
# Larger queue size so we don't block the embedding workers during a slow
# index update.
self.node_q: Queue[
tuple[str, str | None, list[Document] | None, list[BaseNode] | None]
] = Queue(40)
threading.Thread(target=self._doc_to_node, daemon=True).start()
threading.Thread(target=self._write_nodes, daemon=True).start()
def _doc_to_node(self) -> None:
# Parse documents into nodes
with multiprocessing.pool.ThreadPool(processes=self.count_workers) as pool:
while True:
try:
cmd, file_name, documents = self.doc_q.get(
block=True
) # Documents for a file
if cmd == "process":
# Push CPU/GPU embedding work to the worker pool
# Acquire semaphore to control access to worker pool
self.doc_semaphore.acquire()
pool.apply_async(
self._doc_to_node_worker, (file_name, documents)
)
elif cmd == "quit":
break
finally:
if cmd != "process":
self.doc_q.task_done() # unblock Q joins
def _doc_to_node_worker(self, file_name: str, documents: list[Document]) -> None:
# CPU/GPU intensive work, dispatched to the worker thread pool
try:
nodes = run_transformations(
documents, # type: ignore[arg-type]
self.transformations,
show_progress=self.show_progress,
)
self.node_q.put(("process", file_name, documents, nodes))
finally:
self.doc_semaphore.release()
self.doc_q.task_done() # unblock Q joins
def _save_docs(
self, files: list[str], documents: list[Document], nodes: list[BaseNode]
) -> None:
try:
logger.info(
f"Saving {len(files)} files ({len(documents)} documents / {len(nodes)} nodes)"
)
self._index.insert_nodes(nodes)
for document in documents:
self._index.docstore.set_document_hash(
document.get_doc_id(), document.hash
)
self._save_index()
except Exception:
# Tell the user so they can investigate these files
logger.exception(f"Processing files {files}")
finally:
# Clearing work, even on exception, maintains a clean state.
nodes.clear()
documents.clear()
files.clear()
def _write_nodes(self) -> None:
# Save nodes to index. I/O intensive.
node_stack: list[BaseNode] = []
doc_stack: list[Document] = []
file_stack: list[str] = []
while True:
try:
cmd, file_name, documents, nodes = self.node_q.get(block=True)
if cmd in ("flush", "quit"):
if file_stack:
self._save_docs(file_stack, doc_stack, node_stack)
if cmd == "quit":
break
elif cmd == "process":
node_stack.extend(nodes) # type: ignore[arg-type]
doc_stack.extend(documents) # type: ignore[arg-type]
file_stack.append(file_name) # type: ignore[arg-type]
# Constant saving is heavy on I/O - accumulate to a threshold
if len(node_stack) >= self.NODE_FLUSH_COUNT:
self._save_docs(file_stack, doc_stack, node_stack)
finally:
self.node_q.task_done()
def _flush(self) -> None:
self.doc_q.put(("flush", None, None))
self.doc_q.join()
self.node_q.put(("flush", None, None, None))
self.node_q.join()
def ingest(self, file_name: str, file_data: Path) -> list[Document]:
documents = IngestionHelper.transform_file_into_documents(file_name, file_data)
self.doc_q.put(("process", file_name, documents))
self._flush()
return documents
def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[Document]:
docs = []
for file_name, file_data in eta(files):
try:
documents = IngestionHelper.transform_file_into_documents(
file_name, file_data
)
self.doc_q.put(("process", file_name, documents))
docs.extend(documents)
except Exception:
logger.exception(f"Skipping {file_data.name}")
self._flush()
return docs
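As a rough, standalone sketch of the backpressure pattern the class docstring above describes (a shallow bounded queue plus a semaphore keep a small worker pool busy without letting the producer run ahead), using only hypothetical names:

import threading
from multiprocessing.pool import ThreadPool
from queue import Queue

WORKERS = 2
work_q: Queue[int | None] = Queue(maxsize=4)  # shallow queue: the producer blocks when it is full
slots = threading.Semaphore(WORKERS)          # bounds the number of in-flight worker tasks

def expensive(item: int) -> None:
    try:
        print(f"embedding-like work on {item}")
    finally:
        slots.release()     # free a worker slot
        work_q.task_done()  # unblock work_q.join()

def dispatcher() -> None:
    with ThreadPool(processes=WORKERS) as pool:
        while True:
            item = work_q.get(block=True)
            if item is None:  # sentinel: stop dispatching
                work_q.task_done()
                break
            slots.acquire()   # wait until a worker slot is free
            pool.apply_async(expensive, (item,))
        pool.close()
        pool.join()

threading.Thread(target=dispatcher, daemon=True).start()
for i in range(10):
    work_q.put(i)  # blocks while the queue is at capacity
work_q.put(None)
work_q.join()      # returns once every item (and the sentinel) is processed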
def get_ingestion_component(
storage_context: StorageContext,
embed_model: EmbedType,
@ -336,6 +502,13 @@ def get_ingestion_component(
transformations=transformations,
count_workers=settings.embedding.count_workers,
)
elif ingest_mode == "pipeline":
return PipelineIngestComponent(
storage_context=storage_context,
embed_model=embed_model,
transformations=transformations,
count_workers=settings.embedding.count_workers,
)
else:
return SimpleIngestComponent(
storage_context=storage_context,

View File

@ -131,6 +131,7 @@ class LLMComponent:
temperature=settings.llm.temperature,
context_window=settings.llm.context_window,
additional_kwargs=settings_kwargs,
request_timeout=ollama_settings.request_timeout,
)
case "azopenai":
try:

View File

@ -8,6 +8,9 @@ from llama_index.core.chat_engine.types import (
from llama_index.core.indices import VectorStoreIndex
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core.postprocessor import (
SimilarityPostprocessor,
)
from llama_index.core.storage import StorageContext
from llama_index.core.types import TokenGen
from pydantic import BaseModel
@ -20,6 +23,7 @@ from private_gpt.components.vector_store.vector_store_component import (
)
from private_gpt.open_ai.extensions.context_filter import ContextFilter
from private_gpt.server.chunks.chunks_service import Chunk
from private_gpt.settings.settings import Settings
class Completion(BaseModel):
@ -68,14 +72,18 @@ class ChatEngineInput:
@singleton
class ChatService:
settings: Settings
@inject
def __init__(
self,
settings: Settings,
llm_component: LLMComponent,
vector_store_component: VectorStoreComponent,
embedding_component: EmbeddingComponent,
node_store_component: NodeStoreComponent,
) -> None:
self.settings = settings
self.llm_component = llm_component
self.embedding_component = embedding_component
self.vector_store_component = vector_store_component
@ -98,9 +106,12 @@ class ChatService:
use_context: bool = False,
context_filter: ContextFilter | None = None,
) -> BaseChatEngine:
settings = self.settings
if use_context:
vector_index_retriever = self.vector_store_component.get_retriever(
index=self.index, context_filter=context_filter
index=self.index,
context_filter=context_filter,
similarity_top_k=self.settings.rag.similarity_top_k,
)
return ContextChatEngine.from_defaults(
system_prompt=system_prompt,
@ -108,6 +119,9 @@ class ChatService:
llm=self.llm_component.llm, # Has no effect at the moment
node_postprocessors=[
MetadataReplacementPostProcessor(target_metadata_key="window"),
SimilarityPostprocessor(
similarity_cutoff=settings.rag.similarity_value
),
],
)
else:
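A hedged illustration of the new `similarity_value` cutoff, using llama_index's `SimilarityPostprocessor` directly with made-up scores; chunks below the cutoff never reach the LLM context:

from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.schema import NodeWithScore, TextNode

retrieved = [
    NodeWithScore(node=TextNode(text="clearly relevant chunk"), score=0.72),
    NodeWithScore(node=TextNode(text="barely related chunk"), score=0.31),
]
kept = SimilarityPostprocessor(similarity_cutoff=0.45).postprocess_nodes(retrieved)
print([round(n.score, 2) for n in kept])  # [0.72] - the low-scoring chunk is filtered out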

View File

@ -1,7 +1,7 @@
import logging
import tempfile
from pathlib import Path
from typing import AnyStr, BinaryIO
from typing import TYPE_CHECKING, AnyStr, BinaryIO
from injector import inject, singleton
from llama_index.core.node_parser import SentenceWindowNodeParser
@ -17,6 +17,9 @@ from private_gpt.components.vector_store.vector_store_component import (
from private_gpt.server.ingest.model import IngestedDoc
from private_gpt.settings.settings import settings
if TYPE_CHECKING:
from llama_index.core.storage.docstore.types import RefDocInfo
logger = logging.getLogger(__name__)
@ -86,17 +89,15 @@ class IngestService:
return [IngestedDoc.from_document(document) for document in documents]
def list_ingested(self) -> list[IngestedDoc]:
ingested_docs = []
ingested_docs: list[IngestedDoc] = []
try:
docstore = self.storage_context.docstore
ingested_docs_ids: set[str] = set()
ref_docs: dict[str, RefDocInfo] | None = docstore.get_all_ref_doc_info()
for node in docstore.docs.values():
if node.ref_doc_id is not None:
ingested_docs_ids.add(node.ref_doc_id)
if not ref_docs:
return ingested_docs
for doc_id in ingested_docs_ids:
ref_doc_info = docstore.get_ref_doc_info(ref_doc_id=doc_id)
for doc_id, ref_doc_info in ref_docs.items():
doc_metadata = None
if ref_doc_info is not None and ref_doc_info.metadata is not None:
doc_metadata = IngestedDoc.curate_metadata(ref_doc_info.metadata)

View File

@ -155,13 +155,14 @@ class HuggingFaceSettings(BaseModel):
class EmbeddingSettings(BaseModel):
mode: Literal["huggingface", "openai", "azopenai", "sagemaker", "ollama", "mock"]
ingest_mode: Literal["simple", "batch", "parallel"] = Field(
ingest_mode: Literal["simple", "batch", "parallel", "pipeline"] = Field(
"simple",
description=(
"The ingest mode to use for the embedding engine:\n"
"If `simple` - ingest files sequentially and one by one. It is the historic behaviour.\n"
"If `batch` - if multiple files, parse all the files in parallel, "
"and send them in batch to the embedding model.\n"
"In `pipeline` - The Embedding engine is kept as busy as possible\n"
"If `parallel` - parse the files in parallel using multiple cores, and embedd them in parallel.\n"
"`parallel` is the fastest mode for local setup, as it parallelize IO RW in the index.\n"
"For modes that leverage parallelization, you can specify the number of "
@ -174,6 +175,7 @@ class EmbeddingSettings(BaseModel):
"The number of workers to use for file ingestion.\n"
"In `batch` mode, this is the number of workers used to parse the files.\n"
"In `parallel` mode, this is the number of workers used to parse the files and embed them.\n"
"In `pipeline` mode, this is the number of workers that can perform embeddings.\n"
"This is only used if `ingest_mode` is not `simple`.\n"
"Do not go too high with this number, as it might cause memory issues. (especially in `parallel` mode)\n"
"Do not set it higher than your number of threads of your CPU."
@ -239,6 +241,10 @@ class OllamaSettings(BaseModel):
1.1,
description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
)
request_timeout: float = Field(
120.0,
description="Time elapsed until ollama times out the request. Default is 120s. Format is float. ",
)
class AzureOpenAISettings(BaseModel):
@ -278,6 +284,17 @@ class UISettings(BaseModel):
)
class RagSettings(BaseModel):
similarity_top_k: int = Field(
2,
description="This value controls the number of documents returned by the RAG pipeline",
)
similarity_value: float | None = Field(
None,
description="If set, any documents retrieved from the RAG must meet a certain match score. Acceptable values are between 0 and 1.",
)
class PostgresSettings(BaseModel):
host: str = Field(
"localhost",
@ -373,6 +390,7 @@ class Settings(BaseModel):
azopenai: AzureOpenAISettings
vectorstore: VectorstoreSettings
nodestore: NodeStoreSettings
rag: RagSettings
qdrant: QdrantSettings | None = None
postgres: PostgresSettings | None = None

private_gpt/utils/eta.py (new file, 122 lines added)
View File

@ -0,0 +1,122 @@
import datetime
import logging
import math
import time
from collections import deque
from typing import Any
logger = logging.getLogger(__name__)
def human_time(*args: Any, **kwargs: Any) -> str:
def timedelta_total_seconds(timedelta: datetime.timedelta) -> float:
return (
timedelta.microseconds
+ 0.0
+ (timedelta.seconds + timedelta.days * 24 * 3600) * 10**6
) / 10**6
secs = float(timedelta_total_seconds(datetime.timedelta(*args, **kwargs)))
# We want (ms) precision below 2 seconds
if secs < 2:
return f"{secs * 1000}ms"
units = [("y", 86400 * 365), ("d", 86400), ("h", 3600), ("m", 60), ("s", 1)]
parts = []
for unit, mul in units:
if secs / mul >= 1 or mul == 1:
if mul > 1:
n = int(math.floor(secs / mul))
secs -= n * mul
else:
# >2s we drop the (ms) component.
n = int(secs)
if n:
parts.append(f"{n}{unit}")
return " ".join(parts)
def eta(iterator: list[Any]) -> Any:
"""Report an ETA after 30s and every 60s thereafter."""
total = len(iterator)
_eta = ETA(total)
_eta.needReport(30)  # prime the timer so the first report arrives after ~30s
for processed, data in enumerate(iterator, start=1):
yield data
_eta.update(processed)
if _eta.needReport(60):
logger.info(f"{processed}/{total} - ETA {_eta.human_time()}")
class ETA:
"""Predict how long something will take to complete."""
def __init__(self, total: int):
self.total: int = total # Total expected records.
self.rate: float = 0.0 # per second
self._timing_data: deque[tuple[float, int]] = deque(maxlen=100)
self.secondsLeft: float = 0.0
self.nexttime: float = 0.0
def human_time(self) -> str:
if self._calc():
return f"{human_time(seconds=self.secondsLeft)} @ {int(self.rate * 60)}/min"
return "(computing)"
def update(self, count: int) -> None:
# count should be in the range 1 to self.total
assert count > 0
assert count <= self.total
self._timing_data.append((time.time(), count)) # (X,Y) for pearson
def needReport(self, whenSecs: int) -> bool:
now = time.time()
if now > self.nexttime:
self.nexttime = now + whenSecs
return True
return False
def _calc(self) -> bool:
# Need a few samples before a prediction; require at least three points to compute a slope.
if len(self._timing_data) < 3:
return False
# http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
# Calculate means and standard deviations.
samples = len(self._timing_data)
# column wise sum of the timing tuples to compute their mean.
mean_x, mean_y = (
sum(i) / samples for i in zip(*self._timing_data, strict=False)
)
std_x = math.sqrt(
sum(pow(i[0] - mean_x, 2) for i in self._timing_data) / (samples - 1)
)
std_y = math.sqrt(
sum(pow(i[1] - mean_y, 2) for i in self._timing_data) / (samples - 1)
)
# Calculate coefficient.
sum_xy, sum_sq_v_x, sum_sq_v_y = 0.0, 0.0, 0
for x, y in self._timing_data:
x -= mean_x
y -= mean_y
sum_xy += x * y
sum_sq_v_x += pow(x, 2)
sum_sq_v_y += pow(y, 2)
pearson_r = sum_xy / math.sqrt(sum_sq_v_x * sum_sq_v_y)
# Calculate regression line.
# y = mx + b where m is the slope and b is the y-intercept.
m = self.rate = pearson_r * (std_y / std_x)
y = self.total
b = mean_y - m * mean_x
x = (y - b) / m
# Calculate the fitted line (the regression line shifted horizontally through the latest sample).
fitted_b = self._timing_data[-1][1] - (m * self._timing_data[-1][0])
fitted_x = (y - fitted_b) / m
_, count = self._timing_data[-1] # adjust last data point progress count
adjusted_x = ((fitted_x - x) * (count / self.total)) + x
eta_epoch = adjusted_x
self.secondsLeft = max([eta_epoch - time.time(), 0])
return True
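A minimal usage sketch of the new helper; the item count, sleep, and example log line below are illustrative:

import logging
import time

from private_gpt.utils.eta import eta, human_time

logging.basicConfig(level=logging.INFO)

items = list(range(200))
for item in eta(items):  # yields each item unchanged while tracking progress
    time.sleep(0.5)      # stand-in for per-file ingestion work
# After roughly 30s the logger emits lines such as "63/200 - ETA 1m 8s @ 120/min".

print(human_time(seconds=4000))  # -> "1h 6m 40s"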

View File

@ -10,7 +10,7 @@ from private_gpt.settings.settings import settings
resume_download = True
if __name__ == '__main__':
parser = argparse.ArgumentParser(prog='Setup: Download models from huggingface')
parser = argparse.ArgumentParser(prog='Setup: Download models from Hugging Face')
parser.add_argument('--resume', default=True, action=argparse.BooleanOptionalAction, help='Enable/disable resume_download to resume a previously interrupted download')
args = parser.parse_args()
resume_download = args.resume

View File

@ -2,9 +2,35 @@ import argparse
import os
import shutil
from private_gpt.paths import local_data_path
from private_gpt.settings.settings import settings
def wipe():
path = "local_data"
def wipe() -> None:
WIPE_MAP = {
"simple": wipe_simple, # node store
"chroma": wipe_chroma, # vector store
"postgres": wipe_postgres, # node, index and vector store
}
for dbtype in ("nodestore", "vectorstore"):
database = getattr(settings(), dbtype).database
func = WIPE_MAP.get(database)
if func:
func(dbtype)
else:
print(f"Unable to wipe database '{database}' for '{dbtype}'")
def wipe_file(file: str) -> None:
if os.path.isfile(file):
os.remove(file)
print(f" - Deleted {file}")
def wipe_tree(path: str) -> None:
if not os.path.exists(path):
print(f"Warning: Path not found {path}")
return
print(f"Wiping {path}...")
all_files = os.listdir(path)
@ -24,6 +50,54 @@ def wipe():
continue
def wipe_simple(dbtype: str) -> None:
assert dbtype == "nodestore"
from llama_index.core.storage.docstore.types import (
DEFAULT_PERSIST_FNAME as DOCSTORE,
)
from llama_index.core.storage.index_store.types import (
DEFAULT_PERSIST_FNAME as INDEXSTORE,
)
for store in (DOCSTORE, INDEXSTORE):
wipe_file(str((local_data_path / store).absolute()))
def wipe_postgres(dbtype: str) -> None:
try:
import psycopg2
except ImportError as e:
raise ImportError("Postgres dependencies not found") from e
cur = conn = None
try:
tables = {
"nodestore": ["data_docstore", "data_indexstore"],
"vectorstore": ["data_embeddings"],
}[dbtype]
connection = settings().postgres.model_dump(exclude_none=True)
schema = connection.pop("schema_name")
conn = psycopg2.connect(**connection)
cur = conn.cursor()
for table in tables:
sql = f"DROP TABLE IF EXISTS {schema}.{table}"
cur.execute(sql)
print(f"Table {schema}.{table} dropped.")
conn.commit()
except psycopg2.Error as e:
print("Error:", e)
finally:
if cur:
cur.close()
if conn:
conn.close()
def wipe_chroma(dbtype: str) -> None:
assert dbtype == "vectorstore"
wipe_tree(str((local_data_path / "chroma_db").absolute()))
if __name__ == "__main__":
commands = {
"wipe": wipe,

View File

@ -1,3 +1,4 @@
# poetry install --extras "ui llms-llama-cpp vector-stores-qdrant embeddings-huggingface"
server:
env_name: ${APP_ENV:local}

View File

@ -14,11 +14,12 @@ ollama:
llm_model: mistral
embedding_model: nomic-embed-text
api_base: http://localhost:11434
tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
repeat_last_n: 64 # Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
repeat_penalty: 1.2 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
repeat_last_n: 64 # Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
repeat_penalty: 1.2 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
request_timeout: 120.0 # Time elapsed until ollama times out the request. Default is 120s. Format is float.
vectorstore:
database: qdrant
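For orientation, a minimal sketch of where `request_timeout` ends up, assuming the llama_index Ollama wrapper that LLMComponent configures above; the model name is just this profile's default:

from llama_index.llms.ollama import Ollama

llm = Ollama(
    model="mistral",
    base_url="http://localhost:11434",
    request_timeout=120.0,  # seconds; raise this if a large local model needs more than two minutes per request
)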

View File

@ -42,6 +42,12 @@ llm:
tokenizer: mistralai/Mistral-7B-Instruct-v0.2
temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)
rag:
similarity_top_k: 2
# This value controls how many "top" documents the RAG returns to use in the context.
#similarity_value: 0.45
# This value is disabled by default. If you enable this setting, the RAG will only use documents that meet a certain match score.
llamacpp:
prompt_style: "mistral"
llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
@ -89,6 +95,7 @@ ollama:
llm_model: llama2
embedding_model: nomic-embed-text
api_base: http://localhost:11434
request_timeout: 120.0
azopenai:
api_key: ${AZ_OPENAI_API_KEY:}