diff --git a/docker-compose.yaml b/docker-compose.yaml index f86d2380..7129b126 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -10,5 +10,4 @@ services: environment: PORT: 8080 PGPT_PROFILES: docker - PGPT_MODE: local - + PGPT_MODE: llamacpp diff --git a/fern/docs/pages/api-reference/sdks.mdx b/fern/docs/pages/api-reference/sdks.mdx index 30935b7f..0172d9d4 100644 --- a/fern/docs/pages/api-reference/sdks.mdx +++ b/fern/docs/pages/api-reference/sdks.mdx @@ -8,14 +8,14 @@ The clients are kept up to date automatically, so we encourage you to use the la
@@ -24,12 +24,12 @@ The clients are kept up to date automatically, so we encourage you to use the la
diff --git a/fern/docs/pages/manual/ingestion.mdx b/fern/docs/pages/manual/ingestion.mdx
index 39b83cc9..05336559 100644
--- a/fern/docs/pages/manual/ingestion.mdx
+++ b/fern/docs/pages/manual/ingestion.mdx
@@ -62,6 +62,7 @@ The following ingestion mode exist:
* `simple`: historic behavior, ingest one document at a time, sequentially
* `batch`: read, parse, and embed multiple documents using batches (batch read, and then batch parse, and then batch embed)
* `parallel`: read, parse, and embed multiple documents in parallel. This is the fastest ingestion mode for local setup.
+* `pipeline`: an alternative to `parallel` that keeps the embedding workers as busy as possible by overlapping file parsing with embedding.
To change the ingestion mode, you can use the `embedding.ingest_mode` configuration value. The default value is `simple`.
To configure the number of workers used for parallel or batched ingestion, you can use
diff --git a/fern/fern.config.json b/fern/fern.config.json
index 4e60c037..2ed68270 100644
--- a/fern/fern.config.json
+++ b/fern/fern.config.json
@@ -1,4 +1,4 @@
{
  "organization": "privategpt",
-  "version": "0.17.2"
+  "version": "0.19.10"
}
\ No newline at end of file
diff --git a/private_gpt/components/ingest/ingest_component.py b/private_gpt/components/ingest/ingest_component.py
index e21b6c23..5ed03959 100644
--- a/private_gpt/components/ingest/ingest_component.py
+++ b/private_gpt/components/ingest/ingest_component.py
@@ -6,6 +6,7 @@ import multiprocessing.pool
import os
import threading
from pathlib import Path
+from queue import Queue
from typing import Any
from llama_index.core.data_structs import IndexDict
@@ -13,12 +14,13 @@ from llama_index.core.embeddings.utils import EmbedType
from llama_index.core.indices import VectorStoreIndex, load_index_from_storage
from llama_index.core.indices.base import BaseIndex
from llama_index.core.ingestion import run_transformations
-from llama_index.core.schema import Document, TransformComponent
+from llama_index.core.schema import BaseNode, Document, TransformComponent
from llama_index.core.storage import StorageContext
from private_gpt.components.ingest.ingest_helper import IngestionHelper
from private_gpt.paths import local_data_path
from private_gpt.settings.settings import Settings
+from private_gpt.utils.eta import eta
logger = logging.getLogger(__name__)
@@ -314,6 +316,170 @@ class ParallelizedIngestComponent(BaseIngestComponentWithIndex):
        self._file_to_documents_work_pool.terminate()
+class PipelineIngestComponent(BaseIngestComponentWithIndex):
+    """Pipeline ingestion - keeping the embedding worker pool as busy as possible.
+
+    This class implements a threaded ingestion pipeline, which comprises two threads
+    and two queues. The primary thread is responsible for reading and parsing files
+    into documents. These documents are placed into a queue, whose items are
+    handed off to a pool of worker threads for embedding computation. After
+    embedding, the documents are transferred to another queue where they are
+    accumulated until a threshold is reached. Upon reaching this threshold, the
+    accumulated documents are flushed to the document store, index, and vector
+    store.
+
+    Exception handling ensures robustness against erroneous files. However, in the
+    pipelined design, one error can lead to the discarding of multiple files. Any
+    discarded files will be reported.
+    """
+
+    NODE_FLUSH_COUNT = 5000  # Flush to the index every NODE_FLUSH_COUNT nodes.
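As an aside before the implementation details: the docstring above describes a bounded producer/consumer pipeline. The following is a simplified, self-contained sketch of that pattern only — the names, sizes, and "work" are illustrative and are not the component's actual API — showing how a shallow queue applies back-pressure to the parser, a worker performs the expensive step, and results are flushed in batches to limit I/O.

```python
# Simplified sketch: a bounded queue throttles the producer, the worker stands
# in for embedding, and results are flushed in batches (like NODE_FLUSH_COUNT).
import threading
from queue import Queue

doc_q: Queue = Queue(maxsize=20)  # shallow queue: producer blocks if the worker lags
results: list[int] = []
FLUSH_COUNT = 4

def worker() -> None:
    while True:
        item = doc_q.get()
        if item is None:              # sentinel, analogous to the "quit" command
            doc_q.task_done()
            break
        results.append(item * item)   # stand-in for the embedding transformation
        if len(results) >= FLUSH_COUNT:
            print("flush", results)   # stand-in for saving to the index
            results.clear()
        doc_q.task_done()

threading.Thread(target=worker, daemon=True).start()
for doc in range(10):                 # stand-in for parsed documents
    doc_q.put(doc)
doc_q.put(None)
doc_q.join()
if results:                           # final flush, analogous to _flush()
    print("flush", results)
```

In the actual component the worker side is a `multiprocessing.pool.ThreadPool` bounded by a semaphore, and the flush threshold is `NODE_FLUSH_COUNT`, as the code below shows.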
+ + def __init__( + self, + storage_context: StorageContext, + embed_model: EmbedType, + transformations: list[TransformComponent], + count_workers: int, + *args: Any, + **kwargs: Any, + ) -> None: + super().__init__(storage_context, embed_model, transformations, *args, **kwargs) + self.count_workers = count_workers + assert ( + len(self.transformations) >= 2 + ), "Embeddings must be in the transformations" + assert count_workers > 0, "count_workers must be > 0" + self.count_workers = count_workers + # We are doing our own multiprocessing + # To do not collide with the multiprocessing of huggingface, we disable it + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + # doc_q stores parsed files as Document chunks. + # Using a shallow queue causes the filesystem parser to block + # when it reaches capacity. This ensures it doesn't outpace the + # computationally intensive embeddings phase, avoiding unnecessary + # memory consumption. The semaphore is used to bound the async worker + # embedding computations to cause the doc Q to fill and block. + self.doc_semaphore = multiprocessing.Semaphore( + self.count_workers + ) # limit the doc queue to # items. + self.doc_q: Queue[tuple[str, str | None, list[Document] | None]] = Queue(20) + # node_q stores documents parsed into nodes (embeddings). + # Larger queue size so we don't block the embedding workers during a slow + # index update. + self.node_q: Queue[ + tuple[str, str | None, list[Document] | None, list[BaseNode] | None] + ] = Queue(40) + threading.Thread(target=self._doc_to_node, daemon=True).start() + threading.Thread(target=self._write_nodes, daemon=True).start() + + def _doc_to_node(self) -> None: + # Parse documents into nodes + with multiprocessing.pool.ThreadPool(processes=self.count_workers) as pool: + while True: + try: + cmd, file_name, documents = self.doc_q.get( + block=True + ) # Documents for a file + if cmd == "process": + # Push CPU/GPU embedding work to the worker pool + # Acquire semaphore to control access to worker pool + self.doc_semaphore.acquire() + pool.apply_async( + self._doc_to_node_worker, (file_name, documents) + ) + elif cmd == "quit": + break + finally: + if cmd != "process": + self.doc_q.task_done() # unblock Q joins + + def _doc_to_node_worker(self, file_name: str, documents: list[Document]) -> None: + # CPU/GPU intensive work in its own process + try: + nodes = run_transformations( + documents, # type: ignore[arg-type] + self.transformations, + show_progress=self.show_progress, + ) + self.node_q.put(("process", file_name, documents, nodes)) + finally: + self.doc_semaphore.release() + self.doc_q.task_done() # unblock Q joins + + def _save_docs( + self, files: list[str], documents: list[Document], nodes: list[BaseNode] + ) -> None: + try: + logger.info( + f"Saving {len(files)} files ({len(documents)} documents / {len(nodes)} nodes)" + ) + self._index.insert_nodes(nodes) + for document in documents: + self._index.docstore.set_document_hash( + document.get_doc_id(), document.hash + ) + self._save_index() + except Exception: + # Tell the user so they can investigate these files + logger.exception(f"Processing files {files}") + finally: + # Clearing work, even on exception, maintains a clean state. + nodes.clear() + documents.clear() + files.clear() + + def _write_nodes(self) -> None: + # Save nodes to index. I/O intensive. 
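+        # Commands on the queue: "process" appends nodes/documents/files to the
+        # stacks below, "flush" and "quit" persist whatever has accumulated, and
+        # "quit" additionally stops this thread.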
+ node_stack: list[BaseNode] = [] + doc_stack: list[Document] = [] + file_stack: list[str] = [] + while True: + try: + cmd, file_name, documents, nodes = self.node_q.get(block=True) + if cmd in ("flush", "quit"): + if file_stack: + self._save_docs(file_stack, doc_stack, node_stack) + if cmd == "quit": + break + elif cmd == "process": + node_stack.extend(nodes) # type: ignore[arg-type] + doc_stack.extend(documents) # type: ignore[arg-type] + file_stack.append(file_name) # type: ignore[arg-type] + # Constant saving is heavy on I/O - accumulate to a threshold + if len(node_stack) >= self.NODE_FLUSH_COUNT: + self._save_docs(file_stack, doc_stack, node_stack) + finally: + self.node_q.task_done() + + def _flush(self) -> None: + self.doc_q.put(("flush", None, None)) + self.doc_q.join() + self.node_q.put(("flush", None, None, None)) + self.node_q.join() + + def ingest(self, file_name: str, file_data: Path) -> list[Document]: + documents = IngestionHelper.transform_file_into_documents(file_name, file_data) + self.doc_q.put(("process", file_name, documents)) + self._flush() + return documents + + def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[Document]: + docs = [] + for file_name, file_data in eta(files): + try: + documents = IngestionHelper.transform_file_into_documents( + file_name, file_data + ) + self.doc_q.put(("process", file_name, documents)) + docs.extend(documents) + except Exception: + logger.exception(f"Skipping {file_data.name}") + self._flush() + return docs + + def get_ingestion_component( storage_context: StorageContext, embed_model: EmbedType, @@ -336,6 +502,13 @@ def get_ingestion_component( transformations=transformations, count_workers=settings.embedding.count_workers, ) + elif ingest_mode == "pipeline": + return PipelineIngestComponent( + storage_context=storage_context, + embed_model=embed_model, + transformations=transformations, + count_workers=settings.embedding.count_workers, + ) else: return SimpleIngestComponent( storage_context=storage_context, diff --git a/private_gpt/components/llm/llm_component.py b/private_gpt/components/llm/llm_component.py index 953209a8..4e46c250 100644 --- a/private_gpt/components/llm/llm_component.py +++ b/private_gpt/components/llm/llm_component.py @@ -131,6 +131,7 @@ class LLMComponent: temperature=settings.llm.temperature, context_window=settings.llm.context_window, additional_kwargs=settings_kwargs, + request_timeout=ollama_settings.request_timeout, ) case "azopenai": try: diff --git a/private_gpt/server/chat/chat_service.py b/private_gpt/server/chat/chat_service.py index 5369200b..ea57f2c0 100644 --- a/private_gpt/server/chat/chat_service.py +++ b/private_gpt/server/chat/chat_service.py @@ -8,6 +8,9 @@ from llama_index.core.chat_engine.types import ( from llama_index.core.indices import VectorStoreIndex from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor from llama_index.core.llms import ChatMessage, MessageRole +from llama_index.core.postprocessor import ( + SimilarityPostprocessor, +) from llama_index.core.storage import StorageContext from llama_index.core.types import TokenGen from pydantic import BaseModel @@ -20,6 +23,7 @@ from private_gpt.components.vector_store.vector_store_component import ( ) from private_gpt.open_ai.extensions.context_filter import ContextFilter from private_gpt.server.chunks.chunks_service import Chunk +from private_gpt.settings.settings import Settings class Completion(BaseModel): @@ -68,14 +72,18 @@ class ChatEngineInput: @singleton class ChatService: + settings: 
Settings + @inject def __init__( self, + settings: Settings, llm_component: LLMComponent, vector_store_component: VectorStoreComponent, embedding_component: EmbeddingComponent, node_store_component: NodeStoreComponent, ) -> None: + self.settings = settings self.llm_component = llm_component self.embedding_component = embedding_component self.vector_store_component = vector_store_component @@ -98,9 +106,12 @@ class ChatService: use_context: bool = False, context_filter: ContextFilter | None = None, ) -> BaseChatEngine: + settings = self.settings if use_context: vector_index_retriever = self.vector_store_component.get_retriever( - index=self.index, context_filter=context_filter + index=self.index, + context_filter=context_filter, + similarity_top_k=self.settings.rag.similarity_top_k, ) return ContextChatEngine.from_defaults( system_prompt=system_prompt, @@ -108,6 +119,9 @@ class ChatService: llm=self.llm_component.llm, # Takes no effect at the moment node_postprocessors=[ MetadataReplacementPostProcessor(target_metadata_key="window"), + SimilarityPostprocessor( + similarity_cutoff=settings.rag.similarity_value + ), ], ) else: diff --git a/private_gpt/server/ingest/ingest_service.py b/private_gpt/server/ingest/ingest_service.py index 1d6f5ba2..f9ae4728 100644 --- a/private_gpt/server/ingest/ingest_service.py +++ b/private_gpt/server/ingest/ingest_service.py @@ -1,7 +1,7 @@ import logging import tempfile from pathlib import Path -from typing import AnyStr, BinaryIO +from typing import TYPE_CHECKING, AnyStr, BinaryIO from injector import inject, singleton from llama_index.core.node_parser import SentenceWindowNodeParser @@ -17,6 +17,9 @@ from private_gpt.components.vector_store.vector_store_component import ( from private_gpt.server.ingest.model import IngestedDoc from private_gpt.settings.settings import settings +if TYPE_CHECKING: + from llama_index.core.storage.docstore.types import RefDocInfo + logger = logging.getLogger(__name__) @@ -86,17 +89,15 @@ class IngestService: return [IngestedDoc.from_document(document) for document in documents] def list_ingested(self) -> list[IngestedDoc]: - ingested_docs = [] + ingested_docs: list[IngestedDoc] = [] try: docstore = self.storage_context.docstore - ingested_docs_ids: set[str] = set() + ref_docs: dict[str, RefDocInfo] | None = docstore.get_all_ref_doc_info() - for node in docstore.docs.values(): - if node.ref_doc_id is not None: - ingested_docs_ids.add(node.ref_doc_id) + if not ref_docs: + return ingested_docs - for doc_id in ingested_docs_ids: - ref_doc_info = docstore.get_ref_doc_info(ref_doc_id=doc_id) + for doc_id, ref_doc_info in ref_docs.items(): doc_metadata = None if ref_doc_info is not None and ref_doc_info.metadata is not None: doc_metadata = IngestedDoc.curate_metadata(ref_doc_info.metadata) diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py index 4c274384..5896f00d 100644 --- a/private_gpt/settings/settings.py +++ b/private_gpt/settings/settings.py @@ -155,13 +155,14 @@ class HuggingFaceSettings(BaseModel): class EmbeddingSettings(BaseModel): mode: Literal["huggingface", "openai", "azopenai", "sagemaker", "ollama", "mock"] - ingest_mode: Literal["simple", "batch", "parallel"] = Field( + ingest_mode: Literal["simple", "batch", "parallel", "pipeline"] = Field( "simple", description=( "The ingest mode to use for the embedding engine:\n" "If `simple` - ingest files sequentially and one by one. 
It is the historic behaviour.\n" "If `batch` - if multiple files, parse all the files in parallel, " "and send them in batch to the embedding model.\n" + "In `pipeline` - The Embedding engine is kept as busy as possible\n" "If `parallel` - parse the files in parallel using multiple cores, and embedd them in parallel.\n" "`parallel` is the fastest mode for local setup, as it parallelize IO RW in the index.\n" "For modes that leverage parallelization, you can specify the number of " @@ -174,6 +175,7 @@ class EmbeddingSettings(BaseModel): "The number of workers to use for file ingestion.\n" "In `batch` mode, this is the number of workers used to parse the files.\n" "In `parallel` mode, this is the number of workers used to parse the files and embed them.\n" + "In `pipeline` mode, this is the number of workers that can perform embeddings.\n" "This is only used if `ingest_mode` is not `simple`.\n" "Do not go too high with this number, as it might cause memory issues. (especially in `parallel` mode)\n" "Do not set it higher than your number of threads of your CPU." @@ -239,6 +241,10 @@ class OllamaSettings(BaseModel): 1.1, description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)", ) + request_timeout: float = Field( + 120.0, + description="Time elapsed until ollama times out the request. Default is 120s. Format is float. ", + ) class AzureOpenAISettings(BaseModel): @@ -278,6 +284,17 @@ class UISettings(BaseModel): ) +class RagSettings(BaseModel): + similarity_top_k: int = Field( + 2, + description="This value controls the number of documents returned by the RAG pipeline", + ) + similarity_value: float = Field( + None, + description="If set, any documents retrieved from the RAG must meet a certain match score. Acceptable values are between 0 and 1.", + ) + + class PostgresSettings(BaseModel): host: str = Field( "localhost", @@ -373,6 +390,7 @@ class Settings(BaseModel): azopenai: AzureOpenAISettings vectorstore: VectorstoreSettings nodestore: NodeStoreSettings + rag: RagSettings qdrant: QdrantSettings | None = None postgres: PostgresSettings | None = None diff --git a/private_gpt/utils/eta.py b/private_gpt/utils/eta.py new file mode 100644 index 00000000..9315334f --- /dev/null +++ b/private_gpt/utils/eta.py @@ -0,0 +1,122 @@ +import datetime +import logging +import math +import time +from collections import deque +from typing import Any + +logger = logging.getLogger(__name__) + + +def human_time(*args: Any, **kwargs: Any) -> str: + def timedelta_total_seconds(timedelta: datetime.timedelta) -> float: + return ( + timedelta.microseconds + + 0.0 + + (timedelta.seconds + timedelta.days * 24 * 3600) * 10**6 + ) / 10**6 + + secs = float(timedelta_total_seconds(datetime.timedelta(*args, **kwargs))) + # We want (ms) precision below 2 seconds + if secs < 2: + return f"{secs * 1000}ms" + units = [("y", 86400 * 365), ("d", 86400), ("h", 3600), ("m", 60), ("s", 1)] + parts = [] + for unit, mul in units: + if secs / mul >= 1 or mul == 1: + if mul > 1: + n = int(math.floor(secs / mul)) + secs -= n * mul + else: + # >2s we drop the (ms) component. 
+ n = int(secs) + if n: + parts.append(f"{n}{unit}") + return " ".join(parts) + + +def eta(iterator: list[Any]) -> Any: + """Report an ETA after 30s and every 60s thereafter.""" + total = len(iterator) + _eta = ETA(total) + _eta.needReport(30) + for processed, data in enumerate(iterator, start=1): + yield data + _eta.update(processed) + if _eta.needReport(60): + logger.info(f"{processed}/{total} - ETA {_eta.human_time()}") + + +class ETA: + """Predict how long something will take to complete.""" + + def __init__(self, total: int): + self.total: int = total # Total expected records. + self.rate: float = 0.0 # per second + self._timing_data: deque[tuple[float, int]] = deque(maxlen=100) + self.secondsLeft: float = 0.0 + self.nexttime: float = 0.0 + + def human_time(self) -> str: + if self._calc(): + return f"{human_time(seconds=self.secondsLeft)} @ {int(self.rate * 60)}/min" + return "(computing)" + + def update(self, count: int) -> None: + # count should be in the range 0 to self.total + assert count > 0 + assert count <= self.total + self._timing_data.append((time.time(), count)) # (X,Y) for pearson + + def needReport(self, whenSecs: int) -> bool: + now = time.time() + if now > self.nexttime: + self.nexttime = now + whenSecs + return True + return False + + def _calc(self) -> bool: + # A sample before a prediction. Need two points to compute slope! + if len(self._timing_data) < 3: + return False + + # http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient + # Calculate means and standard deviations. + samples = len(self._timing_data) + # column wise sum of the timing tuples to compute their mean. + mean_x, mean_y = ( + sum(i) / samples for i in zip(*self._timing_data, strict=False) + ) + std_x = math.sqrt( + sum(pow(i[0] - mean_x, 2) for i in self._timing_data) / (samples - 1) + ) + std_y = math.sqrt( + sum(pow(i[1] - mean_y, 2) for i in self._timing_data) / (samples - 1) + ) + + # Calculate coefficient. + sum_xy, sum_sq_v_x, sum_sq_v_y = 0.0, 0.0, 0 + for x, y in self._timing_data: + x -= mean_x + y -= mean_y + sum_xy += x * y + sum_sq_v_x += pow(x, 2) + sum_sq_v_y += pow(y, 2) + pearson_r = sum_xy / math.sqrt(sum_sq_v_x * sum_sq_v_y) + + # Calculate regression line. + # y = mx + b where m is the slope and b is the y-intercept. + m = self.rate = pearson_r * (std_y / std_x) + y = self.total + b = mean_y - m * mean_x + x = (y - b) / m + + # Calculate fitted line (transformed/shifted regression line horizontally). 
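+        # Solving y = m*x + b for x at y = self.total gives the wall-clock time
+        # at which the regression predicts completion. The "fitted" line below
+        # keeps the same slope m but passes through the latest sample, and the
+        # two estimates are blended according to current progress (count/total).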
+ fitted_b = self._timing_data[-1][1] - (m * self._timing_data[-1][0]) + fitted_x = (y - fitted_b) / m + _, count = self._timing_data[-1] # adjust last data point progress count + adjusted_x = ((fitted_x - x) * (count / self.total)) + x + eta_epoch = adjusted_x + + self.secondsLeft = max([eta_epoch - time.time(), 0]) + return True diff --git a/scripts/setup b/scripts/setup index 3e02e641..edba1049 100755 --- a/scripts/setup +++ b/scripts/setup @@ -10,7 +10,7 @@ from private_gpt.settings.settings import settings resume_download = True if __name__ == '__main__': - parser = argparse.ArgumentParser(prog='Setup: Download models from huggingface') + parser = argparse.ArgumentParser(prog='Setup: Download models from Hugging Face') parser.add_argument('--resume', default=True, action=argparse.BooleanOptionalAction, help='Enable/Disable resume_download options to restart the download progress interrupted') args = parser.parse_args() resume_download = args.resume diff --git a/scripts/utils.py b/scripts/utils.py index 6f5006c4..48068789 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -2,9 +2,35 @@ import argparse import os import shutil +from private_gpt.paths import local_data_path +from private_gpt.settings.settings import settings -def wipe(): - path = "local_data" + +def wipe() -> None: + WIPE_MAP = { + "simple": wipe_simple, # node store + "chroma": wipe_chroma, # vector store + "postgres": wipe_postgres, # node, index and vector store + } + for dbtype in ("nodestore", "vectorstore"): + database = getattr(settings(), dbtype).database + func = WIPE_MAP.get(database) + if func: + func(dbtype) + else: + print(f"Unable to wipe database '{database}' for '{dbtype}'") + + +def wipe_file(file: str) -> None: + if os.path.isfile(file): + os.remove(file) + print(f" - Deleted {file}") + + +def wipe_tree(path: str) -> None: + if not os.path.exists(path): + print(f"Warning: Path not found {path}") + return print(f"Wiping {path}...") all_files = os.listdir(path) @@ -24,6 +50,54 @@ def wipe(): continue +def wipe_simple(dbtype: str) -> None: + assert dbtype == "nodestore" + from llama_index.core.storage.docstore.types import ( + DEFAULT_PERSIST_FNAME as DOCSTORE, + ) + from llama_index.core.storage.index_store.types import ( + DEFAULT_PERSIST_FNAME as INDEXSTORE, + ) + + for store in (DOCSTORE, INDEXSTORE): + wipe_file(str((local_data_path / store).absolute())) + + +def wipe_postgres(dbtype: str) -> None: + try: + import psycopg2 + except ImportError as e: + raise ImportError("Postgres dependencies not found") from e + + cur = conn = None + try: + tables = { + "nodestore": ["data_docstore", "data_indexstore"], + "vectorstore": ["data_embeddings"], + }[dbtype] + connection = settings().postgres.model_dump(exclude_none=True) + schema = connection.pop("schema_name") + conn = psycopg2.connect(**connection) + cur = conn.cursor() + for table in tables: + sql = f"DROP TABLE IF EXISTS {schema}.{table}" + cur.execute(sql) + print(f"Table {schema}.{table} dropped.") + conn.commit() + except psycopg2.Error as e: + print("Error:", e) + finally: + if cur: + cur.close() + if conn: + conn.close() + + +def wipe_chroma(dbtype: str): + assert dbtype == "vectorstore" + wipe_tree(str((local_data_path / "chroma_db").absolute())) + + if __name__ == "__main__": commands = { "wipe": wipe, diff --git a/settings-local.yaml b/settings-local.yaml index 2c1995bc..c9d02742 100644 --- a/settings-local.yaml +++ b/settings-local.yaml @@ -1,3 +1,4 @@ +# poetry install --extras "ui llms-llama-cpp vector-stores-qdrant 
embeddings-huggingface" server: env_name: ${APP_ENV:local} diff --git a/settings-ollama.yaml b/settings-ollama.yaml index 9a0aaed0..d7e1a12c 100644 --- a/settings-ollama.yaml +++ b/settings-ollama.yaml @@ -14,11 +14,12 @@ ollama: llm_model: mistral embedding_model: nomic-embed-text api_base: http://localhost:11434 - tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. - top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) - top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) - repeat_last_n: 64 # Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) - repeat_penalty: 1.2 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) + tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. + top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) + top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) + repeat_last_n: 64 # Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) + repeat_penalty: 1.2 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) + request_timeout: 120.0 # Time elapsed until ollama times out the request. Default is 120s. Format is float. vectorstore: database: qdrant diff --git a/settings.yaml b/settings.yaml index 0b4cb341..87a63ef4 100644 --- a/settings.yaml +++ b/settings.yaml @@ -42,6 +42,12 @@ llm: tokenizer: mistralai/Mistral-7B-Instruct-v0.2 temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1) +rag: + similarity_top_k: 2 + #This value controls how many "top" documents the RAG returns to use in the context. + #similarity_value: 0.45 + #This value is disabled by default. If you enable this settings, the RAG will only use articles that meet a certain percentage score. + llamacpp: prompt_style: "mistral" llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF @@ -89,6 +95,7 @@ ollama: llm_model: llama2 embedding_model: nomic-embed-text api_base: http://localhost:11434 + request_timeout: 120.0 azopenai: api_key: ${AZ_OPENAI_API_KEY:}
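
A note on the new `rag` block above: `similarity_top_k` bounds how many chunks the retriever returns, and the optional `similarity_value` is passed to llama-index's `SimilarityPostprocessor` in `ChatService`, so low-scoring chunks never reach the prompt. A minimal sketch of the cutoff's effect, using made-up nodes and scores:

```python
# Minimal sketch of the `similarity_value` cutoff; nodes and scores are made up.
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.schema import NodeWithScore, TextNode

nodes = [
    NodeWithScore(node=TextNode(text="relevant chunk"), score=0.72),
    NodeWithScore(node=TextNode(text="weak match"), score=0.31),
]
kept = SimilarityPostprocessor(similarity_cutoff=0.45).postprocess_nodes(nodes)
print([n.score for n in kept])  # only the 0.72 node survives the 0.45 cutoff
```

When `similarity_value` is left unset, the cutoff is effectively disabled and only `similarity_top_k` limits the context, matching the previous behavior.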
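The `request_timeout` addition is plumbed from `OllamaSettings` through `LLMComponent` into the Ollama client, so long generations on slow hardware are no longer cut off at the client's default timeout. A minimal sketch of where the value ends up, assuming the llama-index Ollama integration and the model/URL from `settings-ollama.yaml`:

```python
# Sketch only: model name and URL mirror settings-ollama.yaml; the point is that
# request_timeout is now configurable rather than left at the client default.
from llama_index.llms.ollama import Ollama

llm = Ollama(
    model="mistral",
    base_url="http://localhost:11434",
    request_timeout=120.0,  # ollama.request_timeout from settings
)
print(llm.complete("Say hi in one word."))  # requires a running local Ollama server
```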