chore(main): release 0.1.0 (#1094)

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Fix the parallel ingestion mode, and make it available through conf (#1336)
2025-10-12 10:07:58 +00:00 · 2023-12-01 14:45:54 +01:00 · 2023-11-30 11:41:55 +01:00 · 2023-11-29 20:56:37 +01:00 · 2023-11-29 20:54:22 +01:00 · 2023-11-29 16:46:40 +01:00
149 changed files with 12598 additions and 3249 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1,12 @@
+.venv
+models
+.github
+.vscode
+.DS_Store
+.mypy_cache
+.ruff_cache
+local_data
+terraform
+tests
+Dockerfile
+Dockerfile.*
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -1,24 +0,0 @@
----
-name: Bug report
-about: Create a report to help us improve
-title: ''
-labels: bug
-assignees: ''
-
----
-
-Note: if you'd like to *ask a question* or *open a discussion*, head over to the [Discussions](https://github.com/imartinez/privateGPT/discussions) section and post it there.
-
-**Describe the bug and how to reproduce it**
-A clear and concise description of what the bug is and the steps to reproduce the behavior.
-
-**Expected behavior**
-A clear and concise description of what you expected to happen.
-
-**Environment (please complete the following information):**
- - OS / hardware: [e.g. macOS 12.6 / M1]
- - Python version [e.g. 3.11.3]
- Other relevant information
-
-**Additional context**
-Add any other context about the problem here.
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -1,22 +0,0 @@
----
-name: Feature request
-about: Suggest an idea for this project
-title: ''
-labels: enhancement
-assignees: ''
-
----
-
-Note: if you'd like to *ask a question* or *open a discussion*, head over to the [Discussions](https://github.com/imartinez/privateGPT/discussions) section and post it there.
-
-**Is your feature request related to a problem? Please describe.**
-A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
-
-**Describe the solution you'd like**
-A clear and concise description of what you want to happen.
-
-**Describe alternatives you've considered**
-A clear and concise description of any alternative solutions or features you've considered.
-
-**Additional context**
-Add any other context or screenshots about the feature request here.
--- a/.github/workflows/actions/install_dependencies/action.yml
+++ b/.github/workflows/actions/install_dependencies/action.yml
@@ -0,0 +1,30 @@
+name: "Install Dependencies"
+description: "Action to build the project dependencies from the main versions"
+inputs:
+  python_version:
+    description: "Python version passed to actions/setup-python"
+    required: false
+    default: "3.11.4"
+  poetry_version:
+    description: "Poetry version passed to snok/install-poetry"
+    required: false
+    default: "1.5.1"
+
+runs:
+  using: composite
+  steps:
+    - name: Install Poetry
+      uses: snok/install-poetry@v1
+      with:
+        version: ${{ inputs.poetry_version }}
+        virtualenvs-create: true
+        virtualenvs-in-project: false
+        installer-parallel: true
+    - uses: actions/setup-python@v4
+      with:
+        python-version: ${{ inputs.python_version }}
+        cache: "poetry"
+    - name: Install Dependencies
+      run: poetry install --with ui --no-root
+      shell: bash
+
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -0,0 +1,45 @@
+name: docker
+
+on:
+  release:
+    types: [ published ]
+  workflow_dispatch:
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+
+jobs:
+  build-and-push-image:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Log in to the Container registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          tags: |
+            type=ref,event=branch
+            type=ref,event=pr
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=sha
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: Dockerfile.external
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
--- a/.github/workflows/fern-check.yml
+++ b/.github/workflows/fern-check.yml
@@ -0,0 +1,21 @@
+name: fern check
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "fern/**"
+      
+jobs:
+  fern-check:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+
+      - name: Install Fern
+        run:  npm install -g fern-api
+
+      - name: Check Fern API is valid
+        run: fern check
--- a/.github/workflows/preview-docs.yml
+++ b/.github/workflows/preview-docs.yml
@@ -0,0 +1,48 @@
+name: deploy preview docs
+
+on:
+  pull_request_target:
+    branches:
+      - main
+    paths:
+      - "fern/**"
+
+jobs:
+  preview-docs:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Node.js 
+        uses: actions/setup-node@v4
+        with:
+          node-version: "18"
+
+      - name: Install Fern
+        run: npm install -g fern-api
+
+      - name: Generate Documentation Preview with Fern
+        id: generate_docs
+        env:
+          FERN_TOKEN: ${{ secrets.FERN_TOKEN }}
+        run: |
+          output=$(fern generate --docs --preview --log-level debug)
+          echo "$output"
+          # Extract the URL that follows "Published docs to " in the Fern output
+          preview_url=$(echo "$output" | grep -oP '(?<=Published docs to )https://[^\s]*')
+          # Expose the URL as the step output `preview_url` (the `::set-output` command is deprecated)
+          echo "preview_url=$preview_url" >> "$GITHUB_OUTPUT"
+      - name: Comment PR with URL using github-actions bot
+        uses: actions/github-script@v4
+        if: ${{ steps.generate_docs.outputs.preview_url }}
+        with:
+          script: |
+            const preview_url = '${{ steps.generate_docs.outputs.preview_url }}';
+            const issue_number = context.issue.number;
+            github.issues.createComment({
+              ...context.repo,
+              issue_number: issue_number,
+              body: `Published docs preview URL: ${preview_url}`
+            })
--- a/.github/workflows/publish-docs.yml
+++ b/.github/workflows/publish-docs.yml
@@ -0,0 +1,26 @@
+name: publish docs
+
+on: 
+  push: 
+    branches: 
+      - main
+    paths:
+      - "fern/**"
+
+jobs:
+  publish-docs:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+
+      - name: Setup node
+        uses: actions/setup-node@v3
+
+      - name: Download Fern
+        run: npm install -g fern-api
+
+      - name: Generate and Publish Docs
+        env:
+          FERN_TOKEN: ${{ secrets.FERN_TOKEN }}
+        run: fern generate --docs --log-level debug
--- a/.github/workflows/release-please.yml
+++ b/.github/workflows/release-please.yml
@@ -0,0 +1,19 @@
+name: release-please
+
+on:
+  push:
+    branches:
+      - main
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  release-please:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: google-github-actions/release-please-action@v3
+        with:
+          release-type: simple
+          version-file: version.txt
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -0,0 +1,30 @@
+# This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time.
+#
+# You can adjust the behavior by modifying this file.
+# For more information, see:
+# https://github.com/actions/stale
+name: Mark stale issues and pull requests
+
+on:
+  schedule:
+  - cron: '42 5 * * *'
+
+jobs:
+  stale:
+
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+
+    steps:
+    - uses: actions/stale@v8
+      with:
+        repo-token: ${{ secrets.GITHUB_TOKEN }}
+        days-before-stale: 15
+        stale-issue-message: 'Stale issue'
+        stale-pr-message: 'Stale pull request'
+        stale-issue-label: 'stale'
+        stale-pr-label: 'stale'
+        exempt-issue-labels: 'autorelease: pending'
+        exempt-pr-labels: 'autorelease: pending'
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,67 @@
+name: tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.head_ref || github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  setup:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: ./.github/workflows/actions/install_dependencies
+
+  checks:
+    needs: setup
+    runs-on: ubuntu-latest
+    name: ${{ matrix.quality-command }}
+    strategy:
+      matrix:
+        quality-command:
+          - black
+          - ruff
+          - mypy
+    steps:
+      - uses: actions/checkout@v3
+      - uses: ./.github/workflows/actions/install_dependencies
+      - name: run ${{ matrix.quality-command }}
+        run: make ${{ matrix.quality-command }}
+
+  test:
+    needs: setup
+    runs-on: ubuntu-latest
+    name: test
+    steps:
+      - uses: actions/checkout@v3
+      - uses: ./.github/workflows/actions/install_dependencies
+      - name: run test
+        run: make test-coverage
+      # Run even if make test fails for coverage reports
+      # TODO: select a better xml results displayer
+      - name: Archive test results coverage results
+        uses: actions/upload-artifact@v3
+        if: always()
+        with:
+          name: test_results
+          path: tests-results.xml
+      - name: Archive code coverage results
+        uses: actions/upload-artifact@v3
+        if: always()
+        with:
+          name: code-coverage-report
+          path: htmlcov/
+
+  all_checks_passed:
+    # Used to easily force requirements checks in GitHub
+    needs:
+      - checks
+      - test
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "All checks passed"
--- a/.gitignore
+++ b/.gitignore
@@ -1,174 +1,29 @@
-# OSX
-.DS_STORE
+.venv

-# Models
-models/
+settings-me.yaml

-# Local Chroma db
-.chroma/
-db/
-persist_directory/chroma.sqlite
+.ruff_cache
+.pytest_cache
+.mypy_cache

-# Byte-compiled / optimized / DLL files
+# byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
-*$py.class

-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
+# unit tests / coverage reports
+/tests-results.xml
+/.coverage
+/coverage.xml
+/htmlcov/

 # pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
+/.python-version

-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
+# IDE
+.idea/
+.vscode/
+/.run/
+.fleet/

-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
-
-# vscode
-.vscode/launch.json
+# macOS
+.DS_Store
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,44 +1,43 @@
----
-files: ^(.*\.(py|json|md|sh|yaml|cfg|txt))$
-exclude: ^(\.[^/]*cache/.*|.*/_user.py|source_documents/)$
+default_install_hook_types:
+# Mandatory to install both pre-commit and pre-push hooks (see https://pre-commit.com/#top_level-default_install_hook_types)
+# Add new hook types here to ensure automatic installation when running `pre-commit install`
+- pre-commit
+- pre-push
 repos:
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
-    hooks:
-      #- id: no-commit-to-branch
-      #  args: [--branch, main]
-      - id: check-yaml
-        args: [--unsafe]
-      # - id: debug-statements
-      - id: end-of-file-fixer
-      - id: trailing-whitespace
-        exclude-files: \.md$
-      - id: check-json
-      - id: mixed-line-ending
-      # - id: check-builtin-literals
-      # - id: check-ast
-      - id: check-merge-conflict
-      - id: check-executables-have-shebangs
-      - id: check-shebang-scripts-are-executable
-      - id: check-docstring-first
-      - id: fix-byte-order-marker
-      - id: check-case-conflict
-      # - id: check-toml
-  - repo: https://github.com/adrienverge/yamllint.git
-    rev: v1.29.0
-    hooks:
-      - id: yamllint
-        args:
-          - --no-warnings
-          - -d
-          - '{extends: relaxed, rules: {line-length: {max: 90}}}'
-  - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.2
-    hooks:
-      - id: codespell
-        args:
-          # - --builtin=clear,rare,informal,usage,code,names,en-GB_to_en-US
-          - --builtin=clear,rare,informal,usage,code,names
-          - --ignore-words-list=hass,master
-          - --skip="./.*"
-          - --quiet-level=2
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.3.0
+  hooks:
+  - id: trailing-whitespace
+  - id: end-of-file-fixer
+  - id: check-yaml
+  - id: check-json
+  - id: check-added-large-files
+
+- repo: local
+  hooks:
+  - id: black
+    name: Formatting (black)
+    entry: black
+    language: system
+    types: [python]
+    stages: [commit]
+  - id: ruff
+    name: Linter (ruff)
+    entry: ruff
+    language: system
+    types: [python]
+    stages: [commit]
+  - id: mypy
+    name: Type checking (mypy)
+    entry: make mypy
+    pass_filenames: false
+    language: system
+    types: [python]
+    stages: [commit]
+  - id: test
+    name: Unit tests (pytest)
+    entry: make test
+    pass_filenames: false
+    language: system
+    types: [python]
+    stages: [push]
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -0,0 +1,36 @@
+# Changelog
+
+## [0.1.0](https://github.com/imartinez/privateGPT/compare/v0.0.2...v0.1.0) (2023-11-30)
+
+
+### Features
+
+* Disable Gradio Analytics ([#1165](https://github.com/imartinez/privateGPT/issues/1165)) ([6583dc8](https://github.com/imartinez/privateGPT/commit/6583dc84c082773443fc3973b1cdf8095fa3fec3))
+* Drop loguru and use builtin `logging` ([#1133](https://github.com/imartinez/privateGPT/issues/1133)) ([64c5ae2](https://github.com/imartinez/privateGPT/commit/64c5ae214a9520151c9c2d52ece535867d799367))
+* enable resume download for hf_hub_download ([#1249](https://github.com/imartinez/privateGPT/issues/1249)) ([4197ada](https://github.com/imartinez/privateGPT/commit/4197ada6267c822f32c1d7ba2be6e7ce145a3404))
+* move torch and transformers to local group ([#1172](https://github.com/imartinez/privateGPT/issues/1172)) ([0d677e1](https://github.com/imartinez/privateGPT/commit/0d677e10b970aec222ec04837d0f08f1631b6d4a))
+* Qdrant support ([#1228](https://github.com/imartinez/privateGPT/issues/1228)) ([03d1ae6](https://github.com/imartinez/privateGPT/commit/03d1ae6d70dffdd2411f0d4e92f65080fff5a6e2))
+
+
+### Bug Fixes
+
+* Docker and sagemaker setup ([#1118](https://github.com/imartinez/privateGPT/issues/1118)) ([895588b](https://github.com/imartinez/privateGPT/commit/895588b82a06c2bc71a9e22fb840c7f6442a3b5b))
+* fix pytorch version to avoid wheel bug ([#1123](https://github.com/imartinez/privateGPT/issues/1123)) ([24cfddd](https://github.com/imartinez/privateGPT/commit/24cfddd60f74aadd2dade4c63f6012a2489938a1))
+* Remove global state ([#1216](https://github.com/imartinez/privateGPT/issues/1216)) ([022bd71](https://github.com/imartinez/privateGPT/commit/022bd718e3dfc197027b1e24fb97e5525b186db4))
+* sagemaker config and chat methods ([#1142](https://github.com/imartinez/privateGPT/issues/1142)) ([a517a58](https://github.com/imartinez/privateGPT/commit/a517a588c4927aa5c5c2a93e4f82a58f0599d251))
+* typo in README.md ([#1091](https://github.com/imartinez/privateGPT/issues/1091)) ([ba23443](https://github.com/imartinez/privateGPT/commit/ba23443a70d323cd4f9a242b33fd9dce1bacd2db))
+* Windows 11 failing to auto-delete tmp file ([#1260](https://github.com/imartinez/privateGPT/issues/1260)) ([0d52002](https://github.com/imartinez/privateGPT/commit/0d520026a3d5b08a9b8487be992d3095b21e710c))
+* Windows permission error on ingest service tmp files ([#1280](https://github.com/imartinez/privateGPT/issues/1280)) ([f1cbff0](https://github.com/imartinez/privateGPT/commit/f1cbff0fb7059432d9e71473cbdd039032dab60d))
+
+## [0.0.2](https://github.com/imartinez/privateGPT/compare/v0.0.1...v0.0.2) (2023-10-20)
+
+
+### Bug Fixes
+
+* chromadb max batch size ([#1087](https://github.com/imartinez/privateGPT/issues/1087)) ([f5a9bf4](https://github.com/imartinez/privateGPT/commit/f5a9bf4e374b2d4c76438cf8a97cccf222ec8e6f))
+
+## 0.0.1 (2023-10-20)
+
+### Miscellaneous Chores
+
+* Initial version ([490d93f](https://github.com/imartinez/privateGPT/commit/490d93fdc1977443c92f6c42e57a1c585aa59430))
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -0,0 +1,25 @@
+# This CITATION.cff file was generated with cffinit.
+# Visit https://bit.ly/cffinit to generate yours today!
+
+cff-version: 1.2.0
+title: PrivateGPT
+message: >-
+  If you use this software, please cite it using the
+  metadata from this file.
+type: software
+authors:
+  - given-names: Iván
+    family-names: Martínez Toro
+    email: ivanmartit@gmail.com
+    orcid: 'https://orcid.org/0009-0004-5065-2311'
+  - family-names: Gallego Vico
+    given-names: Daniel
+    email: danielgallegovico@gmail.com
+    orcid: 'https://orcid.org/0009-0006-8582-4384'
+  - given-names: Pablo
+    family-names: Orgaz
+    email: pabloogc+gh@gmail.com
+    orcid: 'https://orcid.org/0009-0008-0080-1437'
+repository-code: 'https://github.com/imartinez/privateGPT'
+license: Apache-2.0
+date-released: '2023-05-02'
--- a/Dockerfile.external
+++ b/Dockerfile.external
@@ -0,0 +1,36 @@
+FROM python:3.11.6-slim-bookworm as base
+
+# Install poetry
+RUN pip install pipx
+RUN python3 -m pipx ensurepath
+RUN pipx install poetry
+ENV PATH="/root/.local/bin:$PATH"
+
+# https://python-poetry.org/docs/configuration/#virtualenvsin-project
+ENV POETRY_VIRTUALENVS_IN_PROJECT=true
+
+FROM base as dependencies
+WORKDIR /home/worker/app
+COPY pyproject.toml poetry.lock ./
+
+RUN poetry install --with ui
+
+FROM base as app
+
+ENV PYTHONUNBUFFERED=1
+ENV PORT=8080
+EXPOSE 8080
+
+# Prepare a non-root user
+RUN adduser --system worker
+WORKDIR /home/worker/app
+
+RUN mkdir local_data; chown worker local_data
+RUN mkdir models; chown worker models
+COPY --chown=worker --from=dependencies /home/worker/app/.venv/ .venv
+COPY --chown=worker private_gpt/ private_gpt
+COPY --chown=worker docs/ docs
+COPY --chown=worker *.yaml *.md ./
+
+USER worker
+ENTRYPOINT ["/home/worker/app/.venv/bin/python", "-m", "private_gpt"]
--- a/Dockerfile.local
+++ b/Dockerfile.local
@@ -0,0 +1,47 @@
+### IMPORTANT, THIS IMAGE CAN ONLY BE RUN IN LINUX DOCKER
+### You will run into a segfault in mac
+FROM python:3.11.6-slim-bookworm as base
+
+# Install poetry
+RUN pip install pipx
+RUN python3 -m pipx ensurepath
+RUN pipx install poetry
+ENV PATH="/root/.local/bin:$PATH"
+
+# Dependencies to build llama-cpp
+RUN apt update && apt install -y \
+  libopenblas-dev\
+  ninja-build\
+  build-essential\
+  pkg-config\
+  wget
+
+# https://python-poetry.org/docs/configuration/#virtualenvsin-project
+ENV POETRY_VIRTUALENVS_IN_PROJECT=true
+
+FROM base as dependencies
+WORKDIR /home/worker/app
+COPY pyproject.toml poetry.lock ./
+
+RUN poetry install --with local
+RUN poetry install --with ui
+
+FROM base as app
+
+ENV PYTHONUNBUFFERED=1
+ENV PORT=8080
+EXPOSE 8080
+
+# Prepare a non-root user
+RUN adduser --system worker
+WORKDIR /home/worker/app
+
+RUN mkdir local_data; chown worker local_data
+RUN mkdir models; chown worker models
+COPY --chown=worker --from=dependencies /home/worker/app/.venv/ .venv
+COPY --chown=worker private_gpt/ private_gpt
+COPY --chown=worker docs/ docs
+COPY --chown=worker *.yaml *.md ./
+
+USER worker
+ENTRYPOINT ["/home/worker/app/.venv/bin/python", "-m", "private_gpt"]
--- a/Makefile
+++ b/Makefile
@@ -0,0 +1,55 @@
+# Any args passed to the make script, use with $(call args, default_value)
+args = `arg="$(filter-out $@,$(MAKECMDGOALS))" && echo $${arg:-${1}}`
+
+########################################################################################################################
+# Quality checks
+########################################################################################################################
+
+test:
+	PYTHONPATH=. poetry run pytest tests
+
+test-coverage:
+	PYTHONPATH=. poetry run pytest tests --cov private_gpt --cov-report term --cov-report=html --cov-report xml --junit-xml=tests-results.xml
+
+black:
+	poetry run black . --check
+
+ruff:
+	poetry run ruff check private_gpt tests
+
+format:
+	poetry run black .
+	poetry run ruff check private_gpt tests --fix
+
+mypy:
+	poetry run mypy private_gpt
+
+check:
+	make format
+	make mypy
+
+########################################################################################################################
+# Run
+########################################################################################################################
+
+run:
+	poetry run python -m private_gpt
+
+dev-windows:
+	(set PGPT_PROFILES=local & poetry run python -m uvicorn private_gpt.main:app --reload --port 8001)
+
+dev:
+	PYTHONUNBUFFERED=1 PGPT_PROFILES=local poetry run python -m uvicorn private_gpt.main:app --reload --port 8001
+
+########################################################################################################################
+# Misc
+########################################################################################################################
+
+api-docs:
+	PGPT_PROFILES=mock poetry run python scripts/extract_openapi.py private_gpt.main:app --out fern/openapi/openapi.json
+
+ingest:
+	@poetry run python scripts/ingest_folder.py $(call args)
+
+wipe:
+	poetry run python scripts/utils.py wipe
--- a/README.md
+++ b/README.md
@@ -1,151 +1,161 @@
-# privateGPT
-Ask questions to your documents without an internet connection, using the power of LLMs. 100% private, no data leaves your execution environment at any point. You can ingest documents and ask questions without an internet connection!
+# 🔒 PrivateGPT 📑

-> :warning: **This branch is frozen and won't be updated; use main branch instead.** This branch contains the primordial version of PrivateGPT, which was launched in May 2023 as a novel approach to address AI privacy concerns by using LLMs in a complete offline way. That version, which rapidly became a go-to project for privacy-sensitive setups and served as the seed for thousands of local-focused generative AI projects, was the foundation of what PrivateGPT is becoming nowadays. Check the latest updates in main branch.
-<img width="902" alt="demo" src="https://user-images.githubusercontent.com/721666/236942256-985801c9-25b9-48ef-80be-3acbb4575164.png">
+[![Tests](https://github.com/imartinez/privateGPT/actions/workflows/tests.yml/badge.svg)](https://github.com/imartinez/privateGPT/actions/workflows/tests.yml?query=branch%3Amain)
+[![Website](https://img.shields.io/website?up_message=check%20it&down_message=down&url=https%3A%2F%2Fdocs.privategpt.dev%2F&label=Documentation)](https://docs.privategpt.dev/)

-Built with [LangChain](https://github.com/hwchase17/langchain), [LlamaIndex](https://www.llamaindex.ai/), [GPT4All](https://github.com/nomic-ai/gpt4all), [LlamaCpp](https://github.com/ggerganov/llama.cpp), [Chroma](https://www.trychroma.com/) and [SentenceTransformers](https://www.sbert.net/).
+[![Discord](https://img.shields.io/discord/1164200432894234644?logo=discord&label=PrivateGPT)](https://discord.gg/bK6mRVpErU)
+[![X (formerly Twitter) Follow](https://img.shields.io/twitter/follow/PrivateGPT_AI)](https://twitter.com/PrivateGPT_AI)

-# Environment Setup
-In order to set your environment up to run the code here, first install all requirements:

-```shell
-pip3 install -r requirements.txt
+> Install & usage docs: https://docs.privategpt.dev/
+> 
+> Join the community: [Twitter](https://twitter.com/PrivateGPT_AI) & [Discord](https://discord.gg/bK6mRVpErU)
+
+![Gradio UI](/fern/docs/assets/ui.png?raw=true)
+
+PrivateGPT is a production-ready AI project that allows you to ask questions about your documents using the power
+of Large Language Models (LLMs), even in scenarios without an Internet connection. 100% private, no data leaves your
+execution environment at any point.
+
+The project provides an API offering all the primitives required to build private, context-aware AI applications.
+It follows and extends the [OpenAI API standard](https://openai.com/blog/openai-api),
+and supports both normal and streaming responses.
+
+The API is divided into two logical blocks:
+
+**High-level API**, which abstracts all the complexity of a RAG (Retrieval Augmented Generation)
+pipeline implementation:
+- Ingestion of documents: internally managing document parsing,
+splitting, metadata extraction, embedding generation and storage.
+- Chat & Completions using context from ingested documents:
+abstracting the retrieval of context, the prompt engineering and the response generation.
+
+**Low-level API**, which allows advanced users to implement their own complex pipelines:
+- Embeddings generation: based on a piece of text.
+- Contextual chunks retrieval: given a query, returns the most relevant chunks of text from the ingested documents.
+
+In addition to this, a working [Gradio UI](https://www.gradio.app/)
+client is provided to test the API, together with a set of useful tools such as bulk model
+download script, ingestion script, documents folder watch, etc.
+
+> 👂 **Need help applying PrivateGPT to your specific use case?**
+> [Let us know more about it](https://forms.gle/4cSDmH13RZBHV9at7)
+> and we'll try to help! We are refining PrivateGPT through your feedback.
+
+## 🎞️ Overview
+DISCLAIMER: This README is not updated as frequently as the [documentation](https://docs.privategpt.dev/).
+Please check it out for the latest updates!
+
+### Motivation behind PrivateGPT
+Generative AI is a game changer for our society, but adoption in companies of all sizes and data-sensitive
+domains like healthcare or legal is limited by a clear concern: **privacy**.
+Not being able to ensure that your data is fully under your control when using third-party AI tools
+is a risk those industries cannot take.
+
+### Primordial version
+The first version of PrivateGPT was launched in May 2023 as a novel approach to address the privacy
+concerns by using LLMs in a complete offline way.
+
+That version, which rapidly became a go-to project for privacy-sensitive setups and served as the seed
+for thousands of local-focused generative AI projects, was the foundation of what PrivateGPT is becoming nowadays.
+It remains a simpler and more educational implementation for understanding the basic concepts required
+to build a fully local (and therefore private) ChatGPT-like tool.
+
+If you want to keep experimenting with it, we have saved it in the
+[primordial branch](https://github.com/imartinez/privateGPT/tree/primordial) of the project.
+
+> It is strongly recommended to do a clean clone and install of this new version of
+PrivateGPT if you come from the previous, primordial version.
+
+### Present and Future of PrivateGPT
+PrivateGPT is now evolving towards becoming a gateway to generative AI models and primitives, including
+completions, document ingestion, RAG pipelines and other low-level building blocks.
+We want to make it easier for any developer to build AI applications and experiences, as well as provide
+a suitable extensive architecture for the community to keep contributing.
+
+Stay tuned to our [releases](https://github.com/imartinez/privateGPT/releases) to check out all the new features and changes included.
+
+## 📄 Documentation
+Full documentation on installation, dependencies, configuration, running the server, deployment options,
+ingesting local documents, API details and UI features can be found here: https://docs.privategpt.dev/
+
+## 🧩 Architecture
+Conceptually, PrivateGPT is an API that wraps a RAG pipeline and exposes its
+primitives.
+* The API is built using [FastAPI](https://fastapi.tiangolo.com/) and follows
+  [OpenAI's API scheme](https://platform.openai.com/docs/api-reference).
+* The RAG pipeline is based on [LlamaIndex](https://www.llamaindex.ai/).
+
+The design of PrivateGPT makes it easy to extend and adapt both the API and the
+RAG implementation. Some key architectural decisions are:
+* Dependency Injection, decoupling the different components and layers.
+* Usage of LlamaIndex abstractions such as `LLM`, `BaseEmbedding` or `VectorStore`,
+  making it immediate to change the actual implementations of those abstractions.
+* Simplicity, adding as few layers and new abstractions as possible.
+* Ready to use, providing a full implementation of the API and RAG
+  pipeline.
+
+Main building blocks:
+* APIs are defined in `private_gpt:server:<api>`. Each package contains an
+  `<api>_router.py` (FastAPI layer) and an `<api>_service.py` (the
+  service implementation). Each *Service* uses LlamaIndex base abstractions instead
+  of specific implementations,
+  decoupling the actual implementation from its usage.
+* Components are placed in
+  `private_gpt:components:<component>`. Each *Component* is in charge of providing
+  actual implementations to the base abstractions used in the Services - for example
+  `LLMComponent` is in charge of providing an actual implementation of an `LLM`
+  (for example `LlamaCPP` or `OpenAI`).
+
+## 💡 Contributing
+Contributions are welcomed! To ensure code quality we have enabled several format and
+typing checks, just run `make check` before committing to make sure your code is ok.
+Remember to test your code! You'll find a tests folder with helpers, and you can run
+tests using `make test` command.
+
+Don't know what to contribute? Here is the public 
+[Project Board](https://github.com/users/imartinez/projects/3) with several ideas. 
+
+Head over to the #contributors channel on Discord
+and ask for write permissions on that GitHub project.
+
+## 💬 Community
+Join the conversation around PrivateGPT on our:
+- [Twitter (aka X)](https://twitter.com/PrivateGPT_AI)
+- [Discord](https://discord.gg/bK6mRVpErU)
+
+## 📖 Citation
+If you use PrivateGPT in a paper, check out the [Citation file](CITATION.cff) for the correct citation.  
+You can also use the "Cite this repository" button in this repo to get the citation in different formats.
+
+Here are a couple of examples:
+
+#### BibTeX
+```bibtex
+@software{Martinez_Toro_PrivateGPT_2023,
+author = {Martínez Toro, Iván and Gallego Vico, Daniel and Orgaz, Pablo},
+license = {Apache-2.0},
+month = may,
+title = {{PrivateGPT}},
+url = {https://github.com/imartinez/privateGPT},
+year = {2023}
+}
 ```

-*Alternative requirements installation with poetry*
-1. Install [poetry](https://python-poetry.org/docs/#installation)
-
-2. Run this commands
-```shell
-cd privateGPT
-poetry install
-poetry shell
+#### APA
+```
+Martínez Toro, I., Gallego Vico, D., & Orgaz, P. (2023). PrivateGPT [Computer software]. https://github.com/imartinez/privateGPT
 ```

-Then, download the LLM model and place it in a directory of your choice:
- LLM: default to [ggml-gpt4all-j-v1.3-groovy.bin](https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin). If you prefer a different GPT4All-J compatible model, just download it and reference it in your `.env` file.
+## 🤗 Partners & Supporters
+PrivateGPT is actively supported by the teams behind:
+* [Qdrant](https://qdrant.tech/), providing the default vector database
+* [Fern](https://buildwithfern.com/), providing Documentation and SDKs
+* [LlamaIndex](https://www.llamaindex.ai/), providing the base RAG framework and abstractions

-Copy the `example.env` template into `.env`
-```shell
-cp example.env .env
-```
-
-and edit the variables appropriately in the `.env` file.
-```
-MODEL_TYPE: supports LlamaCpp or GPT4All
-PERSIST_DIRECTORY: is the folder you want your vectorstore in
-MODEL_PATH: Path to your GPT4All or LlamaCpp supported LLM
-MODEL_N_CTX: Maximum token limit for the LLM model
-MODEL_N_BATCH: Number of tokens in the prompt that are fed into the model at a time. Optimal value differs a lot depending on the model (8 works well for GPT4All, and 1024 is better for LlamaCpp)
-EMBEDDINGS_MODEL_NAME: SentenceTransformers embeddings model name (see https://www.sbert.net/docs/pretrained_models.html)
-TARGET_SOURCE_CHUNKS: The amount of chunks (sources) that will be used to answer a question
-```
-
-Note: because of the way `langchain` loads the `SentenceTransformers` embeddings, the first time you run the script it will require internet connection to download the embeddings model itself.
-
-## Test dataset
-This repo uses a [state of the union transcript](https://github.com/imartinez/privateGPT/blob/main/source_documents/state_of_the_union.txt) as an example.
-
-## Instructions for ingesting your own dataset
-
-Put any and all your files into the `source_documents` directory
-
-The supported extensions are:
-
-   - `.csv`: CSV,
-   - `.docx`: Word Document,
-   - `.doc`: Word Document,
-   - `.enex`: EverNote,
-   - `.eml`: Email,
-   - `.epub`: EPub,
-   - `.html`: HTML File,
-   - `.md`: Markdown,
-   - `.msg`: Outlook Message,
-   - `.odt`: Open Document Text,
-   - `.pdf`: Portable Document Format (PDF),
-   - `.pptx` : PowerPoint Document,
-   - `.ppt` : PowerPoint Document,
-   - `.txt`: Text file (UTF-8),
-
-Run the following command to ingest all the data.
-
-```shell
-python ingest.py
-```
-
-Output should look like this:
-
-```shell
-Creating new vectorstore
-Loading documents from source_documents
-Loading new documents: 100%|██████████████████████| 1/1 [00:01<00:00,  1.73s/it]
-Loaded 1 new documents from source_documents
-Split into 90 chunks of text (max. 500 tokens each)
-Creating embeddings. May take some minutes...
-Using embedded DuckDB with persistence: data will be stored in: db
-Ingestion complete! You can now run privateGPT.py to query your documents
-```
-
-It will create a `db` folder containing the local vectorstore. Will take 20-30 seconds per document, depending on the size of the document.
-You can ingest as many documents as you want, and all will be accumulated in the local embeddings database.
-If you want to start from an empty database, delete the `db` folder.
-
-Note: during the ingest process no data leaves your local environment. You could ingest without an internet connection, except for the first time you run the ingest script, when the embeddings model is downloaded.
-
-## Ask questions to your documents, locally!
-In order to ask a question, run a command like:
-
-```shell
-python privateGPT.py
-```
-
-And wait for the script to require your input.
-
-```plaintext
-> Enter a query:
-```
-
-Hit enter. You'll need to wait 20-30 seconds (depending on your machine) while the LLM model consumes the prompt and prepares the answer. Once done, it will print the answer and the 4 sources it used as context from your documents; you can then ask another question without re-running the script, just wait for the prompt again.
-
-Note: you could turn off your internet connection, and the script inference would still work. No data gets out of your local environment.
-
-Type `exit` to finish the script.
-
-
-### CLI
-The script also supports optional command-line arguments to modify its behavior. You can see a full list of these arguments by running the command ```python privateGPT.py --help``` in your terminal.
-
-
-# How does it work?
-Selecting the right local models and the power of `LangChain` you can run the entire pipeline locally, without any data leaving your environment, and with reasonable performance.
-
- `ingest.py` uses `LangChain` tools to parse the document and create embeddings locally using `HuggingFaceEmbeddings` (`SentenceTransformers`). It then stores the result in a local vector database using `Chroma` vector store.
- `privateGPT.py` uses a local LLM based on `GPT4All-J` or `LlamaCpp` to understand questions and create answers. The context for the answers is extracted from the local vector store using a similarity search to locate the right piece of context from the docs.
- `GPT4All-J` wrapper was introduced in LangChain 0.0.162.
-
-# System Requirements
-
-## Python Version
-To use this software, you must have Python 3.10 or later installed. Earlier versions of Python will not compile.
-
-## C++ Compiler
-If you encounter an error while building a wheel during the `pip install` process, you may need to install a C++ compiler on your computer.
-
-### For Windows 10/11
-To install a C++ compiler on Windows 10/11, follow these steps:
-
-1. Install Visual Studio 2022.
-2. Make sure the following components are selected:
-   * Universal Windows Platform development
-   * C++ CMake tools for Windows
-3. Download the MinGW installer from the [MinGW website](https://sourceforge.net/projects/mingw/).
-4. Run the installer and select the `gcc` component.
-
-## Mac Running Intel
-When running a Mac with Intel hardware (not M1), you may run into _clang: error: the clang compiler does not support '-march=native'_ during pip install.
-
-If so set your archflags during pip install. eg: _ARCHFLAGS="-arch x86_64" pip3 install -r requirements.txt_
-
-# Disclaimer
-This is a test project to validate the feasibility of a fully private solution for question answering using LLMs and Vector embeddings. It is not production ready, and it is not meant to be used in production. The models selection is not optimized for performance, but for privacy; but it is possible to use different models and vectorstores to improve performance.
+This project has been strongly influenced and supported by other amazing projects like 
+[LangChain](https://github.com/hwchase17/langchain),
+[GPT4All](https://github.com/nomic-ai/gpt4all),
+[LlamaCpp](https://github.com/ggerganov/llama.cpp),
+[Chroma](https://www.trychroma.com/)
+and [SentenceTransformers](https://www.sbert.net/).
--- a/constants.py
+++ b/constants.py
@@ -1,16 +0,0 @@
-import os
-from dotenv import load_dotenv
-from chromadb.config import Settings
-
-load_dotenv()
-
-# Define the folder for storing database
-PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY')
-if PERSIST_DIRECTORY is None:
-    raise Exception("Please set the PERSIST_DIRECTORY environment variable")
-
-# Define the Chroma settings
-CHROMA_SETTINGS = Settings(
-        persist_directory=PERSIST_DIRECTORY,
-        anonymized_telemetry=False
-)
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -0,0 +1,14 @@
+services:
+  private-gpt:
+    build:
+      dockerfile: Dockerfile.local
+    volumes:
+      - ./local_data/:/home/worker/app/local_data
+      - ./models/:/home/worker/app/models
+    ports:
+      - 8001:8080
+    environment:
+      PORT: 8080
+      PGPT_PROFILES: docker
+      PGPT_MODE: local
+
--- a/docs/.nojekyll
+++ b/docs/.nojekyll
--- a/docs/description.md
+++ b/docs/description.md
@@ -0,0 +1,474 @@
+## Introduction
+
+PrivateGPT provides an **API** containing all the building blocks required to build
+**private, context-aware AI applications**. The API follows and extends OpenAI API standard, and supports
+both normal and streaming responses.
+
+The API is divided into two logical blocks:
+
+- High-level API, abstracting all the complexity of a RAG (Retrieval Augmented Generation) pipeline implementation:
+    - Ingestion of documents: internally managing document parsing, splitting, metadata extraction,
+      embedding generation and storage.
+    - Chat & Completions using context from ingested documents: abstracting the retrieval of context, the prompt
+      engineering and the response generation.
+- Low-level API, allowing advanced users to implement their own complex pipelines:
+    - Embeddings generation: based on a piece of text.
+    - Contextual chunks retrieval: given a query, returns the most relevant chunks of text from the ingested
+      documents.
+
+> A working **Gradio UI client** is provided to test the API, together with a set of
+> useful tools such as bulk model download script, ingestion script, documents folder
+> watch, etc.
+
+## Quick Local Installation steps
+
+The steps in `Installation and Settings` section are better explained and cover more
+setup scenarios. But if you are looking for a quick setup guide, here it is:
+
+```
+# Clone the repo
+git clone https://github.com/imartinez/privateGPT
+cd privateGPT
+
+# Install Python 3.11
+pyenv install 3.11
+pyenv local 3.11
+
+# Install dependencies
+poetry install --with ui,local
+
+# Download Embedding and LLM models
+poetry run python scripts/setup
+
+# (Optional) For Mac with Metal GPU, enable it. Check the Installation and Settings section
+# to know how to enable GPU on other platforms
+CMAKE_ARGS="-DLLAMA_METAL=on" pip install --force-reinstall --no-cache-dir llama-cpp-python
+
+# Run the local server  
+PGPT_PROFILES=local make run
+
+# Note: on Mac with Metal you should see a ggml_metal_add_buffer log, stating that the GPU is
+# being used
+
+# Navigate to the UI and try it out:
+# http://localhost:8001/
+```
+
+## Installation and Settings
+
+### Base requirements to run PrivateGPT
+
+* Git clone PrivateGPT repository, and navigate to it:
+
+```
+  git clone https://github.com/imartinez/privateGPT
+  cd privateGPT
+```
+
+* Install Python 3.11. Ideally through a python version manager like `pyenv`.
+  Python 3.12
+  should work too. Earlier python versions are not supported.
+    * osx/linux: [pyenv](https://github.com/pyenv/pyenv)
+    * windows: [pyenv-win](https://github.com/pyenv-win/pyenv-win)
+
+```  
+pyenv install 3.11
+pyenv local 3.11
+```
+
+* Install [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) for dependency management.
+
+* Have a valid C++ compiler like gcc. See [Troubleshooting: C++ Compiler](#troubleshooting-c-compiler) for more details.
+
+* Install `make` for scripts:
+    * osx: (Using homebrew): `brew install make`
+    * windows: (Using chocolatey) `choco install make`
+
+### Install dependencies
+
+Install the dependencies:
+
+```bash
+poetry install --with ui
+```
+
+Verify everything is working by running `make run` (or `poetry run python -m private_gpt`) and navigate to
+http://localhost:8001. You should see a [Gradio UI](https://gradio.app/) **configured with a mock LLM** that will
+echo back the input. Later we'll see how to configure a real LLM.
+
+### Settings
+
+> Note: the default settings of PrivateGPT work out-of-the-box for a 100% local setup. Skip this section if you just
+> want to test PrivateGPT locally, and come back later to learn about more configuration options.
+
+PrivateGPT is configured through *profiles* that are defined using yaml files, and selected through env variables.
+The full list of configurable properties can be found in `settings.yaml`.
+
+#### env var `PGPT_SETTINGS_FOLDER`
+
+The location of the settings folder. Defaults to the root of the project.
+Should contain the default `settings.yaml` and any other `settings-{profile}.yaml`.
+
+#### env var `PGPT_PROFILES`
+
+By default, the profile definition in `settings.yaml` is loaded.
+Using this env var you can load additional profiles; format is a comma separated list of profile names.
+This will merge `settings-{profile}.yaml` on top of the base settings file.
+
+For example:
+`PGPT_PROFILES=local,cuda` will load `settings-local.yaml`
+and `settings-cuda.yaml`. Their contents will be merged, with properties from
+later profiles overriding values from earlier ones, including the base `settings.yaml`.
+
+During testing, the `test` profile will be active along with the default, therefore `settings-test.yaml`
+file is required.
+
+#### Environment variables expansion
+
+Configuration files can contain environment variables,
+they will be expanded at runtime.
+
+Expansion must follow the pattern `${VARIABLE_NAME:default_value}`.
+
+For example, the following configuration will use the value of the `PORT`
+environment variable or `8001` if it's not set.
+Missing variables with no default will produce an error.
+
+```yaml
+server:
+  port: ${PORT:8001}
+```
+
+### Local LLM requirements
+
+Install extra dependencies for local execution:
+
+```bash
+poetry install --with local
+```
+
+For PrivateGPT to run fully locally, GPU acceleration is required
+(CPU execution is possible, but very slow). However,
+typical MacBook laptops or Windows desktops with mid-range GPUs lack the VRAM to run
+even the smallest LLMs. For that reason,
+**local execution is only supported for models compatible with [llama.cpp](https://github.com/ggerganov/llama.cpp)**.
+
+These two models are known to work well:
+
+* https://huggingface.co/TheBloke/Llama-2-7B-chat-GGUF
+* https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF (recommended)
+
+To ease the installation process, use the `setup` script that will download both
+the embedding and the LLM model and place them in the correct location (under `models` folder):
+
+```bash
+poetry run python scripts/setup
+```
+
+If you are ok with CPU execution, you can skip the rest of this section.
+
+As stated before, llama.cpp is required and in
+particular [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
+is used.
+
+> It's highly encouraged that you fully read llama-cpp and llama-cpp-python documentation relevant to your platform.
+> Running into installation issues is very likely, and you'll need to troubleshoot them yourself.
+
+#### Customizing low level parameters
+
+Currently, not all the parameters of llama-cpp and llama-cpp-python are available in PrivateGPT's `settings.yaml` file. In case you need to customize parameters such as the number of layers loaded into the GPU, you can change them in the `private_gpt/components/llm/llm_component.py` file. If you are getting an out-of-memory error, you might also try a smaller model or stick to the recommended models instead of custom-tuning the parameters.
+
+#### OSX GPU support
+
+You will need to build [llama.cpp](https://github.com/ggerganov/llama.cpp) with
+metal support. To do that run:
+
+```bash
+CMAKE_ARGS="-DLLAMA_METAL=on" pip install --force-reinstall --no-cache-dir llama-cpp-python
+```
+
+#### Windows NVIDIA GPU support
+
+Windows GPU support is done through CUDA.
+Follow the instructions on the original [llama.cpp](https://github.com/ggerganov/llama.cpp) repo to install the required
+dependencies.
+
+Some tips to get it working with an NVIDIA card and CUDA (Tested on Windows 10 with CUDA 11.5 RTX 3070):
+
+* Install latest VS2022 (and build tools) https://visualstudio.microsoft.com/vs/community/
+* Install CUDA toolkit https://developer.nvidia.com/cuda-downloads
+* Verify your installation is correct by running `nvcc --version` and `nvidia-smi`, ensure your CUDA version is up to
+  date and your GPU is detected.
+* [Optional] Install CMake to troubleshoot building issues by compiling llama.cpp directly https://cmake.org/download/
+
+If you have all required dependencies properly configured running the
+following powershell command should succeed.
+
+```powershell
+$env:CMAKE_ARGS='-DLLAMA_CUBLAS=on'; poetry run pip install --force-reinstall --no-cache-dir llama-cpp-python
+```
+
+If your installation was correct, the next time you start the server you should see a message
+similar to the following, containing `BLAS = 1`:
+
+```
+llama_new_context_with_model: total VRAM used: 4857.93 MB (model: 4095.05 MB, context: 762.87 MB)
+AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 
+```
+
+Note that llama.cpp offloads matrix calculations to the GPU but the performance is
+still hit heavily due to latency between CPU and GPU communication. You might need to tweak
+batch sizes and other parameters to get the best performance for your particular system.
+
+#### Linux NVIDIA GPU support and Windows-WSL
+
+Linux GPU support is done through CUDA.
+Follow the instructions on the original [llama.cpp](https://github.com/ggerganov/llama.cpp) repo to install the required
+external
+dependencies.
+
+Some tips:
+
+* Make sure you have an up-to-date C++ compiler
+* Install CUDA toolkit https://developer.nvidia.com/cuda-downloads
+* Verify your installation is correct by running `nvcc --version` and `nvidia-smi`, ensure your CUDA version is up to
+  date and your GPU is detected.
+
+After that running the following command in the repository will install llama.cpp with GPU support:
+
+```bash
+CMAKE_ARGS='-DLLAMA_CUBLAS=on' poetry run pip install --force-reinstall --no-cache-dir llama-cpp-python
+```
+
+If your installation was correct, the next time you start the server you should see a message
+similar to the following, containing `BLAS = 1`:
+
+```
+llama_new_context_with_model: total VRAM used: 4857.93 MB (model: 4095.05 MB, context: 762.87 MB)
+AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 
+```
+
+#### Vectorstores
+PrivateGPT supports [Chroma](https://www.trychroma.com/) and [Qdrant](https://qdrant.tech/) as vectorstore providers, with Chroma being the default.
+
+To enable Qdrant, set the `vectorstore.database` property in the `settings.yaml` file to `qdrant` and install the `qdrant` extra.
+
+```bash
+poetry install --extras qdrant
+```
+
+By default Qdrant tries to connect to an instance at `http://localhost:6333`.
+
+Qdrant settings can be configured by setting values to the `qdrant` property in the `settings.yaml` file.
+
+The available configuration options are:
+| Field        | Description |
+|--------------|-------------|
+| location     | If `:memory:` - use in-memory Qdrant instance.<br>If `str` - use it as a `url` parameter.|
+| url          | Either host or str of 'Optional[scheme], host, Optional[port], Optional[prefix]'.<br> Eg. `http://localhost:6333` |
+| port         | Port of the REST API interface. Default: `6333` |
+| grpc_port    | Port of the gRPC interface. Default: `6334` |
+| prefer_grpc  | If `true` - use gRPC interface whenever possible in custom methods. |
+| https        | If `true` - use HTTPS(SSL) protocol.|
+| api_key      | API key for authentication in Qdrant Cloud.|
+| prefix       | If set, add `prefix` to the REST URL path.<br>Example: `service/v1` will result in `http://localhost:6333/service/v1/{qdrant-endpoint}` for REST API.|
+| timeout      | Timeout for REST and gRPC API requests.<br>Default: 5.0 seconds for REST and unlimited for gRPC |
+| host         | Host name of Qdrant service. If url and host are not set, defaults to 'localhost'.|
+| path         | Persistence path for QdrantLocal. Eg. `local_data/private_gpt/qdrant`|
+| force_disable_check_same_thread         | Force disable check_same_thread for QdrantLocal sqlite connection.|
+
+#### Known issues and Troubleshooting
+
+Execution of LLMs locally still has a lot of sharp edges, especially when running on non-Linux platforms.
+You might encounter several issues:
+
+* Performance: RAM or VRAM usage is very high, your computer might experience slowdowns or even crashes.
+* GPU Virtualization on Windows and OSX: Simply not possible with docker desktop, you have to run the server directly on
+  the host.
+* Building errors: Some of PrivateGPT dependencies need to build native code, and they might fail on some platforms.
+  Most likely you are missing some dev tools in your machine (updated C++ compiler, CUDA is not on PATH, etc.).
+  If you encounter any of these issues, please open an issue and we'll try to help.
+
+#### Troubleshooting: C++ Compiler
+
+If you encounter an error while building a wheel during the `pip install` process, you may need to install a C++
+compiler on your computer.
+
+**For Windows 10/11**
+
+To install a C++ compiler on Windows 10/11, follow these steps:
+
+1. Install Visual Studio 2022.
+2. Make sure the following components are selected:
+    * Universal Windows Platform development
+    * C++ CMake tools for Windows
+3. Download the MinGW installer from the [MinGW website](https://sourceforge.net/projects/mingw/).
+4. Run the installer and select the `gcc` component.
+
+**For OSX**
+
+1. Check if you have a C++ compiler installed (Xcode might have done it for you), for example by running `gcc`.
+2. If not, you can install clang or gcc with homebrew `brew install gcc`
+
+#### Troubleshooting: Mac Running Intel
+
+When running a Mac with Intel hardware (not M1), you may run into
+_clang: error: the clang compiler does not support '-march=native'_ during pip install.
+
+If so, set your archflags during pip install, e.g. `ARCHFLAGS="-arch x86_64" pip3 install -r requirements.txt`
+
+## Running the Server
+
+After following the installation steps you should be ready to go. Here are some common run setups:
+
+### Running 100% locally
+
+Make sure you have followed the *Local LLM requirements* section before moving on.
+
+This command will start PrivateGPT using the `settings.yaml` (default profile) together with the `settings-local.yaml`
+configuration files. By default, it will enable both the API and the Gradio UI. Run:
+
+```
+PGPT_PROFILES=local make run
+``` 
+
+or
+
+```
+PGPT_PROFILES=local poetry run python -m private_gpt
+```
+
+When the server is started it will print a log *Application startup complete*.
+Navigate to http://localhost:8001 to use the Gradio UI or to http://localhost:8001/docs (API section) to try the API
+using Swagger UI.
+
+### Local server using OpenAI as LLM
+
+If you cannot run a local model (because you don't have a GPU, for example) or for testing purposes, you may
+decide to run PrivateGPT using OpenAI as the LLM.
+
+In order to do so, create a profile `settings-openai.yaml` with the following contents:
+
+```yaml
+llm:
+  mode: openai
+
+openai:
+  api_key: <your_openai_api_key>  # You could skip this configuration and use the OPENAI_API_KEY env var instead
+```
+
+And run PrivateGPT loading that profile you just created:
+
+```PGPT_PROFILES=openai make run```
+
+or
+
+```PGPT_PROFILES=openai poetry run python -m private_gpt```
+
+> Note this will still use the local Embeddings model, as it is ok to use it on a CPU.
+> We'll support using OpenAI embeddings in a future release.
+
+When the server is started it will print a log *Application startup complete*.
+Navigate to http://localhost:8001 to use the Gradio UI or to http://localhost:8001/docs (API section) to try the API.
+You'll notice the speed and quality of response is higher, given you are using OpenAI's servers for the heavy
+computations.
+
+### Use AWS's Sagemaker
+
+🚧 Under construction 🚧
+
+## Gradio UI user manual
+
+Gradio UI is a ready to use way of testing most of PrivateGPT API functionalities.
+
+![Gradio PrivateGPT](https://lh3.googleusercontent.com/drive-viewer/AK7aPaD_Hc-A8A9ooMe-hPgm_eImgsbxAjb__8nFYj8b_WwzvL1Gy90oAnp1DfhPaN6yGiEHCOXs0r77W1bYHtPzlVwbV7fMsA=s1600)
+
+### Execution Modes
+
+It has 3 modes of execution (you can select in the top-left):
+
+* Query Docs: uses the context from the
+  ingested documents to answer the questions posted in the chat. It also takes
+  into account previous chat messages as context.
+    * Makes use of `/chat/completions` API with `use_context=true` and no
+      `context_filter`.
+* Search in Docs: fast search that returns the 4 most related text
+  chunks, together with their source document and page.
+    * Makes use of `/chunks` API with no `context_filter`, `limit=4` and
+      `prev_next_chunks=0`.
+* LLM Chat: simple, non-contextual chat with the LLM. The ingested documents won't
+  be taken into account, only the previous messages.
+    * Makes use of `/chat/completions` API with `use_context=false`.
+
+### Document Ingestion
+
+Ingest documents by using the `Upload a File` button. You can check the progress of
+the ingestion in the console logs of the server.
+
+The list of ingested files is shown below the button.
+
+If you want to delete the ingested documents, refer to *Reset Local documents
+database* section in the documentation.
+
+### Chat
+
+Normal chat interface, self-explanatory ;)
+
+You can check the actual prompt being passed to the LLM by looking at the logs of
+the server. We'll add better observability in future releases.
+
+## Deployment options
+
+🚧 We are working on Dockerized deployment guidelines 🚧
+
+## Observability
+
+Basic logs are enabled using LlamaIndex
+basic logging (for example ingestion progress or LLM prompts and answers).
+
+🚧 We are working on improved Observability. 🚧
+
+## Ingesting & Managing Documents
+
+🚧 Document Update and Delete are still WIP. 🚧
+
+The ingestion of documents can be done in different ways:
+
+* Using the `/ingest` API
+* Using the Gradio UI
+* Using the Bulk Local Ingestion functionality (check next section)
+
+### Bulk Local Ingestion
+
+When you are running PrivateGPT in a fully local setup, you can ingest a complete folder for convenience (containing
+pdf, text files, etc.)
+and optionally watch changes on it with the command:
+
+```bash
+make ingest /path/to/folder -- --watch
+```
+
+To log the processed and failed files to an additional file, use:
+
+```bash
+make ingest /path/to/folder -- --watch --log-file /path/to/log/file.log
+```
+
+After ingestion is complete, you should be able to chat with your documents
+by navigating to http://localhost:8001 and using the option `Query Docs`,
+or using the completions / chat API.
+
+### Reset Local documents database
+
+When running in a local setup, you can remove all ingested documents by simply
+deleting all contents of `local_data` folder (except .gitignore).
+
+To simplify this process, you can use the command:
+```bash
+make wipe
+```
+
+## API
+
+As explained in the introduction, the API contains high level APIs (ingestion and chat/completions) and low level APIs
+(embeddings and chunk retrieval). In this section the different specific API calls are explained.
--- a/docs/index.html
+++ b/docs/index.html
@@ -0,0 +1,22 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>PrivateGPT Docs</title>
+    <!-- needed for adaptive design -->
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <link href="https://fonts.googleapis.com/css?family=Montserrat:300,400,700|Roboto:300,400,700" rel="stylesheet">
+    <link rel="shortcut icon" href="https://fastapi.tiangolo.com/img/favicon.png">
+    <!-- ReDoc doesn't change outer page styles -->
+    <style>
+      body {
+        margin: 0;
+        padding: 0;
+      }
+    </style>
+</head>
+<body>
+    <noscript> ReDoc requires Javascript to function. Please enable it to browse the documentation. </noscript>
+    <redoc spec-url="/openapi.json"></redoc>
+    <script src="https://cdn.jsdelivr.net/npm/redoc@next/bundles/redoc.standalone.js"></script>
+</body>
--- a/docs/logo.png
+++ b/docs/logo.png
--- a/docs/openapi.json
+++ b/docs/openapi.json
--- a/example.env
+++ b/example.env
@@ -1,7 +0,0 @@
-PERSIST_DIRECTORY=db
-MODEL_TYPE=GPT4All
-MODEL_PATH=models/ggml-gpt4all-j-v1.3-groovy.bin
-EMBEDDINGS_MODEL_NAME=all-MiniLM-L6-v2
-MODEL_N_CTX=1000
-MODEL_N_BATCH=8
-TARGET_SOURCE_CHUNKS=4
--- a/fern/README.md
+++ b/fern/README.md
@@ -0,0 +1,39 @@
+# Documentation of privateGPT
+
+The documentation of this project is being rendered thanks to [fern](https://github.com/fern-api/fern).
+
+Fern is basically transforming your `.md` and `.mdx` files into a static website: your documentation.
+
+The configuration of your documentation is done in the `./docs.yml` file.
+There, you can configure the navbar, tabs, sections and pages being rendered.
+
+The documentation of fern (and the syntax of its configuration `docs.yml`) is 
+available at [docs.buildwithfern.com](https://docs.buildwithfern.com/).
+
+## How to run fern
+
+**You cannot render your documentation locally without fern credentials.**
+
+To see what your documentation looks like, you **have to** use the CICD of this
+repository (by opening a PR, CICD job will be executed, and a preview of 
+your PR's documentation will be deployed in vercel automatically, through fern).
+
+The only thing you can do locally is to run `fern check`, which checks the syntax of
+your `docs.yml` file.
+
+## How to add a new page
+Add in the `docs.yml` a new `page`, with the following syntax:
+
+```yml
+navigation:
+  # ...
+  - tab: my-existing-tab
+    layout:
+      # ...
+      - section: My Existing Section
+        contents:
+          # ...
+          - page: My new page display name
+            # The path of the page, relative to `fern/`
+            path: ./docs/pages/my-existing-tab/new-page-content.mdx
+```
--- a/fern/docs.yml
+++ b/fern/docs.yml
@@ -0,0 +1,111 @@
+# Main Fern configuration file
+instances:
+  - url: privategpt.docs.buildwithfern.com
+    custom-domain: docs.privategpt.dev
+
+title: PrivateGPT | Docs
+
+# The tabs definition, in the top left corner
+tabs:
+  overview:
+    display-name: Overview
+    icon: "fa-solid fa-home"
+  installation:
+    display-name: Installation
+    icon: "fa-solid fa-download"
+  manual:
+    display-name: Manual
+    icon: "fa-solid fa-book"
+  recipes:
+    display-name: Recipes
+    icon: "fa-solid fa-flask"
+  api-reference:
+    display-name: API Reference
+    icon: "fa-solid fa-file-contract"
+
+# Definition of tabs contents, will be displayed on the left side of the page, below all tabs
+navigation:
+  # The default tab
+  - tab: overview
+    layout:
+      - section: Welcome
+        contents:
+          - page: Welcome
+            path: ./docs/pages/overview/welcome.mdx
+          - page: Quickstart
+            path: ./docs/pages/overview/quickstart.mdx
+  # How to install privateGPT, with FAQ and troubleshooting
+  - tab: installation
+    layout:
+      - section: Getting started
+        contents:
+          - page: Installation
+            path: ./docs/pages/installation/installation.mdx
+  # Manual of privateGPT: how to use it and configure it
+  - tab: manual
+    layout:
+      - section: General configuration
+        contents:
+          - page: Configuration
+            path: ./docs/pages/manual/settings.mdx
+      - section: Document management
+        contents:
+          - page: Ingestion
+            path: ./docs/pages/manual/ingestion.mdx
+          - page: Deletion
+            path: ./docs/pages/manual/ingestion-reset.mdx
+      - section: Storage
+        contents:
+          - page: Vector Stores
+            path: ./docs/pages/manual/vectordb.mdx
+      - section: Advanced Setup
+        contents:
+          - page: LLM Backends
+            path: ./docs/pages/manual/llms.mdx
+      - section: User Interface
+        contents:
+          - page: User interface (Gradio) Manual
+            path: ./docs/pages/manual/ui.mdx
+  # Small code snippet or example of usage to help users
+  - tab: recipes
+    layout:
+      - section: Choice of LLM
+        contents:
+          # TODO: add recipes
+          - page: List of LLMs
+            path: ./docs/pages/recipes/list-llm.mdx
+  # More advanced usage of privateGPT, by API
+  - tab: api-reference
+    layout:
+      - section: Overview
+        contents:
+          - page: API Reference overview
+            path: ./docs/pages/api-reference/api-reference.mdx
+          - page: SDKs
+            path: ./docs/pages/api-reference/sdks.mdx
+      - api: API Reference
+
+# Definition of the navbar, will be displayed in the top right corner.
+# `type:primary` is always displayed at the most right side of the navbar
+navbar-links:
+  - type: secondary
+    text: Github
+    url: "https://github.com/imartinez/privateGPT"
+  - type: secondary
+    text: Contact us
+    url: "mailto:hello@zylon.ai"
+  - type: primary
+    text: Join the Discord
+    url: https://discord.com/invite/bK6mRVpErU
+
+colors:
+  accentPrimary:
+    dark: "#C6BBFF"
+    light: "#756E98"
+
+logo:
+  dark: ./docs/assets/logo_light.png
+  light: ./docs/assets/logo_dark.png
+  height: 50
+
+favicon: ./docs/assets/favicon.ico
--- a/fern/docs/assets/favicon.ico
+++ b/fern/docs/assets/favicon.ico
--- a/fern/docs/assets/header.jpeg
+++ b/fern/docs/assets/header.jpeg
--- a/fern/docs/assets/logo_dark.png
+++ b/fern/docs/assets/logo_dark.png
--- a/fern/docs/assets/logo_light.png
+++ b/fern/docs/assets/logo_light.png
--- a/fern/docs/assets/ui.png
+++ b/fern/docs/assets/ui.png
--- a/fern/docs/pages/api-reference/api-reference.mdx
+++ b/fern/docs/pages/api-reference/api-reference.mdx
@@ -0,0 +1 @@
+# API Reference
--- a/fern/docs/pages/api-reference/sdks.mdx
+++ b/fern/docs/pages/api-reference/sdks.mdx
@@ -0,0 +1,38 @@
+We use [Fern](https://www.buildwithfern.com) to offer API clients for Node.js, Python, Go, and Java.
+We recommend using these clients to interact with our endpoints.
+The clients are kept up to date automatically, so we encourage you to use the latest version.
+
+## SDKs
+
+*Coming soon!*
+
+<Cards>
+  <Card
+    title="Node.js/TypeScript"
+    icon="fa-brands fa-node"
+    href="https://github.com/imartinez/privateGPT-typescript"
+  />
+  <Card
+    title="Python"
+    icon="fa-brands fa-python"
+    href="https://github.com/imartinez/privateGPT-python"
+  />
+  <br />
+</Cards>
+
+<br />
+
+<Cards>
+  <Card
+    title="Java"
+    icon="fa-brands fa-java"
+    href="https://github.com/imartinez/privateGPT-java"
+  />
+  <Card
+    title="Go"
+    icon="fa-brands fa-golang"
+    href="https://github.com/imartinez/privateGPT-go"
+  />
+</Cards>
+
+<br />
--- a/fern/docs/pages/installation/installation.mdx
+++ b/fern/docs/pages/installation/installation.mdx
@@ -0,0 +1,235 @@
+## Installation and Settings
+
+### Base requirements to run PrivateGPT
+
+* Git clone PrivateGPT repository, and navigate to it:
+
+```bash
+  git clone https://github.com/imartinez/privateGPT
+  cd privateGPT
+```
+
+* Install Python `3.11` (*if you do not have it already*). Ideally through a python version manager like `pyenv`.
+  Python 3.12 should work too. Earlier python versions are not supported.
+    * osx/linux: [pyenv](https://github.com/pyenv/pyenv)
+    * windows: [pyenv-win](https://github.com/pyenv-win/pyenv-win)
+
+```bash
+pyenv install 3.11
+pyenv local 3.11
+```
+
+* Install [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) for dependency management.
+
+* Have a valid C++ compiler like gcc. See [Troubleshooting: C++ Compiler](#troubleshooting-c-compiler) for more details.
+
+* Install `make` for scripts:
+    * osx: (Using homebrew): `brew install make`
+    * windows: (Using chocolatey) `choco install make`
+
+### Install dependencies
+
+Install the dependencies:
+
+```bash
+poetry install --with ui
+```
+
+Verify everything is working by running `make run` (or `poetry run python -m private_gpt`) and navigate to
+http://localhost:8001. You should see a [Gradio UI](https://gradio.app/) **configured with a mock LLM** that will
+echo back the input. Below we'll see how to configure a real LLM.
+
+### Settings
+
+<Callout intent="info">
+The default settings of PrivateGPT should work out-of-the-box for a 100% local setup. **However**, as is, it runs exclusively on your CPU.
+Skip this section if you just want to test PrivateGPT locally, and come back later to learn about more configuration options (and get better performance).
+</Callout>
+
+<br />
+
+### Local LLM requirements
+
+Install extra dependencies for local execution:
+
+```bash
+poetry install --with local
+```
+
+For PrivateGPT to run fully locally, GPU acceleration is required
+(CPU execution is possible, but very slow). However,
+typical MacBook laptops or Windows desktops with mid-range GPUs lack the VRAM to run
+even the smallest LLMs. For that reason,
+**local execution is only supported for models compatible with [llama.cpp](https://github.com/ggerganov/llama.cpp)**.
+
+These two models are known to work well:
+
+* https://huggingface.co/TheBloke/Llama-2-7B-chat-GGUF
+* https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF (recommended)
+
+To ease the installation process, use the `setup` script that will download both
+the embedding and the LLM model and place them in the correct location (under `models` folder):
+
+```bash
+poetry run python scripts/setup
+```
+
+If you are ok with CPU execution, you can skip the rest of this section.
+
+As stated before, llama.cpp is required and in
+particular [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
+is used.
+
+> It's highly encouraged that you fully read llama-cpp and llama-cpp-python documentation relevant to your platform.
+> Running into installation issues is very likely, and you'll need to troubleshoot them yourself.
+
+#### Customizing low level parameters
+
+Currently, not all the parameters of `llama.cpp` and `llama-cpp-python` are available at PrivateGPT's `settings.yaml` file.
+In case you need to customize parameters such as the number of layers loaded into the GPU, you might change
+these at the `llm_component.py` file under the `private_gpt/components/llm/llm_component.py`.
+
+##### Available LLM config options
+
+The `llm` section of the settings allows for the following configurations:
+
+- `mode`: how to run your llm
+- `max_new_tokens`: this lets you configure the number of new tokens the LLM will generate and add to the context window (by default Llama.cpp uses `256`)
+
+Example:
+
+```yaml
+llm:
+  mode: local
+  max_new_tokens: 256
+```
+
+If you are getting an out of memory error, you might also try a smaller model or stick to the proposed
+recommended models, instead of custom tuning the parameters.
+
+#### OSX GPU support
+
+You will need to build [llama.cpp](https://github.com/ggerganov/llama.cpp) with metal support.
+
+To do that, you need to install the Python binding of `llama.cpp`, `llama-cpp-python`, through pip, with the compilation flag
+that activates `METAL`: you have to pass `-DLLAMA_METAL=on` to the CMake command that `pip` runs for you (see below).
+
+In other words, one should simply run:
+```bash
+CMAKE_ARGS="-DLLAMA_METAL=on" pip install --force-reinstall --no-cache-dir llama-cpp-python
+```
+
+The above command will force the re-installation of `llama-cpp-python` with `METAL` support by compiling
+`llama.cpp` locally with your `METAL` libraries (shipped by default with your macOS).
+
+More information is available in the documentation of the libraries themselves:
+* [llama-cpp-python](https://github.com/abetlen/llama-cpp-python#installation-with-hardware-acceleration)
+* [llama-cpp-python's documentation](https://llama-cpp-python.readthedocs.io/en/latest/#installation-with-hardware-acceleration)
+* [llama.cpp](https://github.com/ggerganov/llama.cpp#build)
+
+#### Windows NVIDIA GPU support
+
+Windows GPU support is done through CUDA.
+Follow the instructions on the original [llama.cpp](https://github.com/ggerganov/llama.cpp) repo to install the required
+dependencies.
+
+Some tips to get it working with an NVIDIA card and CUDA (Tested on Windows 10 with CUDA 11.5 RTX 3070):
+
+* Install latest VS2022 (and build tools) https://visualstudio.microsoft.com/vs/community/
+* Install CUDA toolkit https://developer.nvidia.com/cuda-downloads
+* Verify your installation is correct by running `nvcc --version` and `nvidia-smi`, ensure your CUDA version is up to
+  date and your GPU is detected.
+* [Optional] Install CMake to troubleshoot building issues by compiling llama.cpp directly https://cmake.org/download/
+
+If you have all required dependencies properly configured running the
+following powershell command should succeed.
+
+```powershell
+$env:CMAKE_ARGS='-DLLAMA_CUBLAS=on'; poetry run pip install --force-reinstall --no-cache-dir llama-cpp-python
+```
+
+If your installation was correct, the next time you start the server you should see a message
+similar to the following, containing `BLAS = 1`.
+
+```console
+llama_new_context_with_model: total VRAM used: 4857.93 MB (model: 4095.05 MB, context: 762.87 MB)
+AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 |
+```
+
+Note that llama.cpp offloads matrix calculations to the GPU but the performance is
+still hit heavily due to latency between CPU and GPU communication. You might need to tweak
+batch sizes and other parameters to get the best performance for your particular system.
+
+#### Linux NVIDIA GPU support and Windows-WSL
+
+Linux GPU support is done through CUDA.
+Follow the instructions on the original [llama.cpp](https://github.com/ggerganov/llama.cpp) repo to install the required
+external
+dependencies.
+
+Some tips:
+
+* Make sure you have an up-to-date C++ compiler
+* Install CUDA toolkit https://developer.nvidia.com/cuda-downloads
+* Verify your installation is correct by running `nvcc --version` and `nvidia-smi`, ensure your CUDA version is up to
+  date and your GPU is detected.
+
+After that running the following command in the repository will install llama.cpp with GPU support:
+
+```bash
+CMAKE_ARGS='-DLLAMA_CUBLAS=on' poetry run pip install --force-reinstall --no-cache-dir llama-cpp-python
+```
+
+If your installation was correct, the next time you start the server you should see a message
+similar to the following, containing `BLAS = 1`.
+
+```
+llama_new_context_with_model: total VRAM used: 4857.93 MB (model: 4095.05 MB, context: 762.87 MB)
+AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 |
+```
+
+### Known issues and Troubleshooting
+
+Execution of LLMs locally still has a lot of sharp edges, especially when running on non-Linux platforms.
+You might encounter several issues:
+
+* Performance: RAM or VRAM usage is very high, your computer might experience slowdowns or even crashes.
+* GPU Virtualization on Windows and OSX: Simply not possible with docker desktop, you have to run the server directly on
+  the host.
+* Building errors: Some of PrivateGPT dependencies need to build native code, and they might fail on some platforms.
+  Most likely you are missing some dev tools in your machine (updated C++ compiler, CUDA is not on PATH, etc.).
+  If you encounter any of these issues, please open an issue and we'll try to help.
+
+One of the first reflexes to adopt is: get more information.
+If, during your installation, something does not go as planned, retry in *verbose* mode, and see what goes wrong.
+
+For example, when installing packages with `pip install`, you can add the option `-vvv` to show the details of the installation.
+
+#### Troubleshooting: C++ Compiler
+
+If you encounter an error while building a wheel during the `pip install` process, you may need to install a C++
+compiler on your computer.
+
+**For Windows 10/11**
+
+To install a C++ compiler on Windows 10/11, follow these steps:
+
+1. Install Visual Studio 2022.
+2. Make sure the following components are selected:
+    * Universal Windows Platform development
+    * C++ CMake tools for Windows
+3. Download the MinGW installer from the [MinGW website](https://sourceforge.net/projects/mingw/).
+4. Run the installer and select the `gcc` component.
+
+**For OSX**
+
+1. Check if you have a C++ compiler installed, `Xcode` should have done it for you. To install Xcode, go to the App
+   Store and search for Xcode and install it. **Or** you can install the command line tools by running `xcode-select --install`.
+2. If not, you can install clang or gcc with homebrew `brew install gcc`
+
+#### Troubleshooting: Mac Running Intel
+
+When running a Mac with Intel hardware (not M1), you may run into _clang: error: the clang compiler does not support '
+-march=native'_ during pip install.
+
+If so set your archflags during pip install. eg: _ARCHFLAGS="-arch x86_64" pip3 install -r requirements.txt_
--- a/fern/docs/pages/manual/ingestion-reset.mdx
+++ b/fern/docs/pages/manual/ingestion-reset.mdx
@@ -0,0 +1,14 @@
+# Reset Local documents database
+
+When running in a local setup, you can remove all ingested documents by simply
+deleting all contents of `local_data` folder (except .gitignore).
+
+To simplify this process, you can use the command:
+```bash
+make wipe
+```
+
+# Advanced usage
+
+You can actually delete your documents from your storage by using the
+API endpoint `DELETE` in the Ingestion API.
--- a/fern/docs/pages/manual/ingestion.mdx
+++ b/fern/docs/pages/manual/ingestion.mdx
@@ -0,0 +1,124 @@
+# Ingesting & Managing Documents
+
+The ingestion of documents can be done in different ways:
+
+* Using the `/ingest` API
+* Using the Gradio UI
+* Using the Bulk Local Ingestion functionality (check next section)
+
+## Bulk Local Ingestion
+
+When you are running PrivateGPT in a fully local setup, you can ingest a complete folder for convenience (containing
+pdf, text files, etc.)
+and optionally watch changes on it with the command:
+
+```bash
+make ingest /path/to/folder -- --watch
+```
+
+To log the processed and failed files to an additional file, use:
+
+```bash
+make ingest /path/to/folder -- --watch --log-file /path/to/log/file.log
+```
+
+**Note for Windows Users:** Depending on your Windows version and whether you are using PowerShell to execute
+PrivateGPT API calls, you may need to include the parameter name before passing the folder path for consumption:
+
+```bash
+make ingest arg=/path/to/folder -- --watch --log-file /path/to/log/file.log
+```
+
+After ingestion is complete, you should be able to chat with your documents
+by navigating to http://localhost:8001 and using the option `Query documents`,
+or using the completions / chat API.
+
+## Ingestion troubleshooting
+
+### Running out of memory
+
+To avoid running out of memory, you should ingest your documents without the LLM loaded in your (video) memory.
+To do so, you should change your configuration to set `llm.mode: mock`.
+
+You can also use the existing `PGPT_PROFILES=mock` that will set the following configuration for you:
+
+```yaml
+llm:
+  mode: mock
+embedding:
+  mode: local
+```
+
+This configuration allows you to use hardware acceleration for creating embeddings while avoiding loading the full LLM into (video) memory.
+
+Once your documents are ingested, you can set the `llm.mode` value back to `local` (or your previous custom value).
+
+### Ingestion speed
+
+The ingestion speed depends on the number of documents you are ingesting, and the size of each document.
+To speed up the ingestion, you can change the ingestion mode in configuration.
+
+The following ingestion modes exist:
+* `simple`: historic behavior, ingest one document at a time, sequentially
+* `batch`: read, parse, and embed multiple documents using batches (batch read, and then batch parse, and then batch embed)
+* `parallel`: read, parse, and embed multiple documents in parallel. This is the fastest ingestion mode for local setup.
+To change the ingestion mode, you can use the `embedding.ingest_mode` configuration value. The default value is `simple`.
+
+To configure the number of workers used for parallel or batched ingestion, you can use
+the `embedding.count_workers` configuration value. If you set this value too high, you might run out of
+memory, so be mindful when setting this value. The default value is `2`.
+For `batch` mode, you can easily set this value to your number of threads available on your CPU without
+running out of memory. For `parallel` mode, you should be more careful, and set this value to a lower value.
+
+The configuration below should be enough for users who want to push their hardware harder:
+```yaml
+embedding:
+  ingest_mode: parallel
+  count_workers: 4
+```
+
+If your hardware is powerful enough, and you are loading heavy documents, you can increase the number of workers.
+It is recommended to do your own tests to find the optimal value for your hardware.
+
+If you have a `bash` shell, you can use this set of commands to run your own benchmark:
+
+```bash
+# Wipe your local data, to put yourself in a clean state
+# This will delete all your ingested documents
+make wipe
+
+time PGPT_PROFILES=mock python ./scripts/ingest_folder.py ~/my-dir/to-ingest/
+```
+
+## Supported file formats
+
+privateGPT by default supports all the file formats that contain clear text (for example, `.txt` files, `.html`, etc.).
+However, these text-based file formats are only considered as text files, and are not pre-processed in any other way.
+
+It also supports the following file formats:
+* `.hwp`
+* `.pdf`
+* `.docx`
+* `.pptx`
+* `.ppt`
+* `.pptm`
+* `.jpg`
+* `.png`
+* `.jpeg`
+* `.mp3`
+* `.mp4`
+* `.csv`
+* `.epub`
+* `.md`
+* `.mbox`
+* `.ipynb`
+* `.json`
+
+**Please note the following nuance**: while `privateGPT` supports these file formats, it **might** require additional
+dependencies to be installed in your python's virtual environment.
+For example, if you try to ingest `.epub` files, `privateGPT` might fail to do it, and will instead display an
+explanatory error asking you to install the dependencies needed to support this file format.
+
+
+**Other file formats might work**, but they will be considered as plain text
+files (in other words, they will be ingested as `.txt` files).
--- a/fern/docs/pages/manual/llms.mdx
+++ b/fern/docs/pages/manual/llms.mdx
@@ -0,0 +1,83 @@
+## Running the Server
+
+PrivateGPT supports running with different LLMs & setups.
+
+### Local models
+
+Both the LLM and the Embeddings model will run locally.
+
+Make sure you have followed the *Local LLM requirements* section before moving on.
+
+This command will start PrivateGPT using the `settings.yaml` (default profile) together with the `settings-local.yaml`
+configuration files. By default, it will enable both the API and the Gradio UI. Run:
+
+```bash
+PGPT_PROFILES=local make run
+```
+
+or
+
+```bash
+PGPT_PROFILES=local poetry run python -m private_gpt
+```
+
+When the server is started it will print a log *Application startup complete*.
+Navigate to http://localhost:8001 to use the Gradio UI or to http://localhost:8001/docs (API section) to try the API
+using Swagger UI.
+
+### Using OpenAI
+
+If you cannot run a local model (because you don't have a GPU, for example) or for testing purposes, you may
+decide to run PrivateGPT using OpenAI as the LLM and Embeddings model.
+
+In order to do so, create a profile `settings-openai.yaml` with the following contents:
+
+```yaml
+llm:
+  mode: openai
+
+openai:
+  api_key: <your_openai_api_key>  # You could skip this configuration and use the OPENAI_API_KEY env var instead
+```
+
+And run PrivateGPT loading that profile you just created:
+
+`PGPT_PROFILES=openai make run`
+
+or
+
+`PGPT_PROFILES=openai poetry run python -m private_gpt`
+
+When the server is started it will print a log *Application startup complete*.
+Navigate to http://localhost:8001 to use the Gradio UI or to http://localhost:8001/docs (API section) to try the API.
+You'll notice the speed and quality of response is higher, given you are using OpenAI's servers for the heavy
+computations.
+
+### Using AWS Sagemaker
+
+For a fully private & performant setup, you can choose to have both your LLM and Embeddings model deployed using Sagemaker.
+
+Note: how to deploy models on Sagemaker is out of the scope of this documentation.
+
+In order to do so, create a profile `settings-sagemaker.yaml` with the following contents (remember to
+update the values of the llm_endpoint_name and embedding_endpoint_name to yours):
+
+```yaml
+llm:
+  mode: sagemaker
+
+sagemaker:
+  llm_endpoint_name: huggingface-pytorch-tgi-inference-2023-09-25-19-53-32-140
+  embedding_endpoint_name: huggingface-pytorch-inference-2023-11-03-07-41-36-479
+```
+
+And run PrivateGPT loading that profile you just created:
+
+`PGPT_PROFILES=sagemaker make run`
+
+or
+
+`PGPT_PROFILES=sagemaker poetry run python -m private_gpt`
+
+When the server is started it will print a log *Application startup complete*.
+Navigate to http://localhost:8001 to use the Gradio UI or to http://localhost:8001/docs (API section) to try the API.
--- a/fern/docs/pages/manual/settings.mdx
+++ b/fern/docs/pages/manual/settings.mdx
@@ -0,0 +1,80 @@
+# Settings and profiles for your private GPT
+
+The configuration of your private GPT server is done thanks to `settings` files (more precisely `settings.yaml`).
+These text files are written using the [YAML](https://en.wikipedia.org/wiki/YAML) syntax.
+
+While privateGPT is distributing safe and universal configuration files, you might want to quickly customize your
+privateGPT, and this can be done using the `settings` files.
+
+This project defines the concept of **profiles** (or configuration profiles).
+This mechanism, driven by your environment variables, gives you the ability to easily switch between
+the configurations you've made.
+
+A typical use case for profiles is to easily switch between different LLMs and embedding models.
+To be a bit more precise, you can change the language (to French, Spanish, Italian, English, etc) by simply changing
+the profile you've selected; no code changes required!
+
+PrivateGPT is configured through *profiles* that are defined using yaml files, and selected through env variables.
+The full list of properties configurable can be found in `settings.yaml`.
+
+## How to know which profiles exist
+Given that a profile `foo_bar` points to the file `settings-foo_bar.yaml` and vice-versa, you simply have to look
+at the files starting with `settings` and ending in `.yaml`.
+
+## How to use an existing profile
+**Please note that the syntax to set the value of an environment variable depends on your OS**.
+You have to set environment variable `PGPT_PROFILES` to the name of the profile you want to use.
+
+For example, on **linux and macOS**, this gives:
+```bash
+export PGPT_PROFILES=my_profile_name_here
+```
+
+Windows terminals use a different syntax. In Command Prompt (cmd), for example:
+```shell
+set PGPT_PROFILES=my_profile_name_here
+```
+If the above is not working (PowerShell, for instance, uses `$env:PGPT_PROFILES="my_profile_name_here"`), try the syntax your Windows terminal expects for setting an env variable.
+
+---
+
+Once you've set this environment variable to the desired profile, you can simply launch your privateGPT,
+and it will run using your profile on top of the default configuration.
+
+## Reference
+Additional details on the profiles are described in this section
+
+### Environment variable `PGPT_SETTINGS_FOLDER`
+
+The location of the settings folder. Defaults to the root of the project.
+Should contain the default `settings.yaml` and any other `settings-{profile}.yaml`.
+
+### Environment variable `PGPT_PROFILES`
+
+By default, the profile definition in `settings.yaml` is loaded.
+Using this env var you can load additional profiles; format is a comma separated list of profile names.
+This will merge `settings-{profile}.yaml` on top of the base settings file.
+
+For example:
+`PGPT_PROFILES=local,cuda` will load `settings-local.yaml`
+and `settings-cuda.yaml`; their contents will be merged, with
+later profiles' properties overriding the values of earlier ones, including those of the base `settings.yaml`.
+
+During testing, the `test` profile will be active along with the default, therefore `settings-test.yaml`
+file is required.
+
+### Environment variables expansion
+
+Configuration files can contain environment variables,
+they will be expanded at runtime.
+
+Expansion must follow the pattern `${VARIABLE_NAME:default_value}`.
+
+For example, the following configuration will use the value of the `PORT`
+environment variable or `8001` if it's not set.
+Missing variables with no default will produce an error.
+
+```yaml
+server:
+  port: ${PORT:8001}
+```
--- a/fern/docs/pages/manual/ui.mdx
+++ b/fern/docs/pages/manual/ui.mdx
@@ -0,0 +1,39 @@
+## Gradio UI user manual
+
+Gradio UI is a ready to use way of testing most of PrivateGPT API functionalities.
+
+![Gradio PrivateGPT](https://lh3.googleusercontent.com/drive-viewer/AK7aPaD_Hc-A8A9ooMe-hPgm_eImgsbxAjb__8nFYj8b_WwzvL1Gy90oAnp1DfhPaN6yGiEHCOXs0r77W1bYHtPzlVwbV7fMsA=s1600)
+
+### Execution Modes
+
+It has 3 modes of execution (you can select in the top-left):
+
+* Query Docs: uses the context from the
+  ingested documents to answer the questions posted in the chat. It also takes
+  into account previous chat messages as context.
+    * Makes use of `/chat/completions` API with `use_context=true` and no
+      `context_filter`.
+* Search in Docs: fast search that returns the 4 most related text
+  chunks, together with their source document and page.
+    * Makes use of `/chunks` API with no `context_filter`, `limit=4` and
+      `prev_next_chunks=0`.
+* LLM Chat: simple, non-contextual chat with the LLM. The ingested documents won't
+  be taken into account, only the previous messages.
+    * Makes use of `/chat/completions` API with `use_context=false`.
+
+### Document Ingestion
+
+Ingest documents by using the `Upload a File` button. You can check the progress of
+the ingestion in the console logs of the server.
+
+The list of ingested files is shown below the button.
+
+If you want to delete the ingested documents, refer to *Reset Local documents
+database* section in the documentation.
+
+### Chat
+
+Normal chat interface, self-explanatory ;)
+
+You can check the actual prompt being passed to the LLM by looking at the logs of
+the server. We'll add better observability in future releases.
--- a/fern/docs/pages/manual/vectordb.mdx
+++ b/fern/docs/pages/manual/vectordb.mdx
@@ -0,0 +1,50 @@
+## Vectorstores
+PrivateGPT supports [Qdrant](https://qdrant.tech/) and [Chroma](https://www.trychroma.com/) as vectorstore providers, with Qdrant being the default.
+
+In order to select one or the other, set the `vectorstore.database` property in the `settings.yaml` file to `qdrant` or `chroma`.
+
+```yaml
+vectorstore:
+  database: qdrant
+```
+
+### Qdrant configuration
+
+To enable Qdrant, set the `vectorstore.database` property in the `settings.yaml` file to `qdrant`.
+
+Qdrant settings can be configured by setting values to the `qdrant` property in the `settings.yaml` file.
+
+The available configuration options are:
+| Field        | Description |
+|--------------|-------------|
+| location     | If `:memory:` - use in-memory Qdrant instance. If `str` - use it as a `url` parameter.|
+| url          | Either host or str of 'Optional[scheme], host, Optional[port], Optional[prefix]'. Eg. `http://localhost:6333` |
+| port         | Port of the REST API interface. Default: `6333` |
+| grpc_port    | Port of the gRPC interface. Default: `6334` |
+| prefer_grpc  | If `true` - use gRPC interface whenever possible in custom methods. |
+| https        | If `true` - use HTTPS(SSL) protocol.|
+| api_key      | API key for authentication in Qdrant Cloud.|
+| prefix       | If set, add `prefix` to the REST URL path. Example: `service/v1` will result in `http://localhost:6333/service/v1/{qdrant-endpoint}` for REST API.|
+| timeout      | Timeout for REST and gRPC API requests. Default: 5.0 seconds for REST and unlimited for gRPC |
+| host         | Host name of Qdrant service. If url and host are not set, defaults to 'localhost'.|
+| path         | Persistence path for QdrantLocal. Eg. `local_data/private_gpt/qdrant`|
+| force_disable_check_same_thread         | Force disable check_same_thread for QdrantLocal sqlite connection, defaults to True.|
+
+By default Qdrant tries to connect to an instance of Qdrant server at `http://localhost:6333`.
+
+To obtain a local setup (disk-based database) without running a Qdrant server, configure the `qdrant.path` value in settings.yaml:
+
+```yaml
+qdrant:
+  path: local_data/private_gpt/qdrant
+```
+
+### Chroma configuration
+
+To enable Chroma, set the `vectorstore.database` property in the `settings.yaml` file to `chroma` and install the `chroma` extra.
+
+```bash
+poetry install --extras chroma
+```
+
+By default `chroma` will use a disk-based database stored in `local_data_path / "chroma_db"` (where `local_data_path` is defined in `settings.yaml`).
--- a/fern/docs/pages/overview/quickstart.mdx
+++ b/fern/docs/pages/overview/quickstart.mdx
@@ -0,0 +1,21 @@
+## Local Installation steps
+
+The steps in the [Installation](/installation) section are better explained and cover more
+setup scenarios (macOS, Windows, Linux).
+But if you like one-liners, have python3.11 installed, and you are running a UNIX (macOS or Linux)
+system, you can get up and running on CPU in a few lines:
+
+```bash
+git clone https://github.com/imartinez/privateGPT && cd privateGPT && \
+python3.11 -m venv .venv && source .venv/bin/activate && \
+pip install --upgrade pip poetry && poetry install --with ui,local && ./scripts/setup
+
+# Launch the privateGPT API server **and** the gradio UI
+python3.11 -m private_gpt
+
+# In another terminal, create a new browser window on your private GPT!
+open http://127.0.0.1:8001/
+```
+
+Is the above not working, or is it too slow, so **you want to run it on GPU(s)**?
+Please check the more detailed [installation guide](/installation).
--- a/fern/docs/pages/overview/welcome.mdx
+++ b/fern/docs/pages/overview/welcome.mdx
@@ -0,0 +1,53 @@
+## Introduction 👋
+
+PrivateGPT provides an **API** containing all the building blocks required to
+build **private, context-aware AI applications**.
+The API follows and extends OpenAI API standard, and supports both normal and streaming responses.
+That means that, if you can use OpenAI API in one of your tools, you can use your own PrivateGPT API instead,
+with no code changes, **and for free** if you are running privateGPT in `local` mode.
+
+Looking for the installation quickstart? [Quickstart installation guide for Linux and macOS](/overview/welcome/quickstart).
+
+Do you want to install it on Windows? Or do you want to take full advantage of your hardware for better performance?
+The installation guide will help you in the [Installation section](/installation).
+
+
+## Frequently Visited Resources
+
+<Cards>
+  <Card
+    title="API Reference"
+    icon="fa-solid fa-code"
+    href="/api-reference"
+  />
+  <Card
+    title="Twitter"
+    icon="fa-brands fa-twitter"
+    href="https://twitter.com/PrivateGPT_AI"
+  />
+  <Card
+    title="Discord Server"
+    icon="fa-brands fa-discord"
+    href="https://discord.gg/bK6mRVpErU"
+  />
+</Cards>
+
+## API Organization 
+
+The API is divided into two logical blocks:
+
+1. High-level API, abstracting all the complexity of a RAG (Retrieval Augmented Generation) pipeline implementation:
+    - Ingestion of documents: internally managing document parsing, splitting, metadata extraction,
+      embedding generation and storage.
+    - Chat & Completions using context from ingested documents: abstracting the retrieval of context, the prompt
+      engineering and the response generation.
+
+2. Low-level API, allowing advanced users to implement their own complex pipelines:
+    - Embeddings generation: based on a piece of text.
+    - Contextual chunks retrieval: given a query, returns the most relevant chunks of text from the ingested
+      documents.
+
+<Callout intent = "info">
+A working **Gradio UI client** is provided to test the API, together with a set of useful tools such as bulk
+model download script, ingestion script, documents folder watch, etc.
+</Callout>
--- a/fern/docs/pages/recipes/list-llm.mdx
+++ b/fern/docs/pages/recipes/list-llm.mdx
@@ -0,0 +1,95 @@
+# List of working LLM
+
+**Do you have any working combination of LLM and embeddings?**
+Please open a PR to add it to the list, and come to our Discord to tell us about it!
+
+## Prompt style
+
+LLMs might have been trained with different prompt styles.
+The prompt style is the way the prompt is written, and how the system message is injected in the prompt.
+
+For example, `llama2` looks like this:
+```text
+<s>[INST] <<SYS>>
+{{ system_prompt }}
+<</SYS>>
+
+{{ user_message }} [/INST]
+```
+
+While `default` (the `llama_index` default) looks like this:
+```text
+system: {{ system_prompt }}
+user: {{ user_message }}
+assistant: {{ assistant_message }}
+```
+
+And the "`tag`" style looks like this:
+
+```text
+<|system|>: {{ system_prompt }}
+<|user|>: {{ user_message }}
+<|assistant|>: {{ assistant_message }}
+```
+
+Some LLMs will not understand this prompt style, and will not work (returning nothing).
+You can try to change the prompt style to `default` (or `tag`) in the settings, and it will
+change the way the messages are formatted to be passed to the LLM.
+
+## Example of configuration
+
+You might want to change the prompt depending on the language and model you are using.
+
+### English, with instructions
+
+`settings-en.yaml`:
+```yml
+local:
+  llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.1-GGUF
+  llm_hf_model_file: mistral-7b-instruct-v0.1.Q4_K_M.gguf
+  embedding_hf_model_name: BAAI/bge-small-en-v1.5
+  prompt_style: "llama2"
+```
+
+### French, with instructions
+
+`settings-fr.yaml`:
+```yml
+local:
+  llm_hf_repo_id: TheBloke/Vigogne-2-7B-Instruct-GGUF
+  llm_hf_model_file: vigogne-2-7b-instruct.Q4_K_M.gguf
+  embedding_hf_model_name: dangvantuan/sentence-camembert-base
+  prompt_style: "default"
+  # prompt_style: "tag" # also works
+  # The default system prompt is injected only when the `prompt_style` != default, and there is no system message in the discussion
+  # default_system_prompt: Vous êtes un assistant IA qui répond à la question posée à la fin en utilisant le contexte suivant. Si vous ne connaissez pas la réponse, dites simplement que vous ne savez pas, n'essayez pas d'inventer une réponse. Veuillez répondre exclusivement en français.
+```
+
+You might want to change the prompt as the one above might not directly answer your question.
+You can read online about how to write a good prompt, but in a nutshell, make it (extremely) directive.
+
+You can try and troubleshoot your prompt by writing multiline requests in the UI, while
+writing your interaction with the model, for example:
+
+```text
+Tu es un programmeur senior qui programme en python et utilise le framework fastapi. Ecrit moi un serveur qui retourne "hello world".
+```
+
+Another example:
+```text
+Context: None
+Situation: tu es au milieu d'un champ.
+Tache: va a la rivière, en bas du champ.
+Décrit comment aller a la rivière.
+```
+
+### Optimised Models
+GodziLLa2-70B LLM (English, rank 2 on HuggingFace OpenLLM Leaderboard), bge large Embedding Model (rank 1 on HuggingFace MTEB Leaderboard)
+`settings-optimised.yaml`:
+```yml
+local:
+  llm_hf_repo_id: TheBloke/GodziLLa2-70B-GGUF
+  llm_hf_model_file: godzilla2-70b.Q4_K_M.gguf
+  embedding_hf_model_name: BAAI/bge-large-en
+  prompt_style: "llama2"
+```
--- a/fern/fern.config.json
+++ b/fern/fern.config.json
@@ -0,0 +1,4 @@
+{
+  "organization": "privategpt",
+  "version": "0.15.3"
+}
--- a/fern/generators.yml
+++ b/fern/generators.yml
@@ -0,0 +1,8 @@
+groups:
+  public:
+    generators:
+      - name: fernapi/fern-python-sdk
+        version: 0.6.2
+        output:
+          location: local-file-system
+          path: ../../pgpt-sdk/python
--- a/fern/openapi/openapi.json
+++ b/fern/openapi/openapi.json
--- a/ingest.py
+++ b/ingest.py
@@ -1,185 +0,0 @@
-#!/usr/bin/env python3
-import os
-import glob
-from typing import List
-from dotenv import load_dotenv
-from multiprocessing import Pool
-from tqdm import tqdm
-
-from langchain.document_loaders import (
-    CSVLoader,
-    EverNoteLoader,
-    PyMuPDFLoader,
-    TextLoader,
-    UnstructuredEmailLoader,
-    UnstructuredEPubLoader,
-    UnstructuredHTMLLoader,
-    UnstructuredMarkdownLoader,
-    UnstructuredODTLoader,
-    UnstructuredPowerPointLoader,
-    UnstructuredWordDocumentLoader,
-)
-
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.vectorstores import Chroma
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.docstore.document import Document
-
-if not load_dotenv():
-    print("Could not load .env file or it is empty. Please check if it exists and is readable.")
-    exit(1)
-
-from constants import CHROMA_SETTINGS
-import chromadb
-from chromadb.api.segment import API
-
-# Load environment variables
-persist_directory = os.environ.get('PERSIST_DIRECTORY')
-source_directory = os.environ.get('SOURCE_DIRECTORY', 'source_documents')
-embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')
-chunk_size = 500
-chunk_overlap = 50
-
-
-# Custom document loaders
-class MyElmLoader(UnstructuredEmailLoader):
-    """Wrapper to fallback to text/plain when default does not work"""
-
-    def load(self) -> List[Document]:
-        """Wrapper adding fallback for elm without html"""
-        try:
-            try:
-                doc = UnstructuredEmailLoader.load(self)
-            except ValueError as e:
-                if 'text/html content not found in email' in str(e):
-                    # Try plain text
-                    self.unstructured_kwargs["content_source"]="text/plain"
-                    doc = UnstructuredEmailLoader.load(self)
-                else:
-                    raise
-        except Exception as e:
-            # Add file_path to exception message
-            raise type(e)(f"{self.file_path}: {e}") from e
-
-        return doc
-
-
-# Map file extensions to document loaders and their arguments
-LOADER_MAPPING = {
-    ".csv": (CSVLoader, {}),
-    # ".docx": (Docx2txtLoader, {}),
-    ".doc": (UnstructuredWordDocumentLoader, {}),
-    ".docx": (UnstructuredWordDocumentLoader, {}),
-    ".enex": (EverNoteLoader, {}),
-    ".eml": (MyElmLoader, {}),
-    ".epub": (UnstructuredEPubLoader, {}),
-    ".html": (UnstructuredHTMLLoader, {}),
-    ".md": (UnstructuredMarkdownLoader, {}),
-    ".odt": (UnstructuredODTLoader, {}),
-    ".pdf": (PyMuPDFLoader, {}),
-    ".ppt": (UnstructuredPowerPointLoader, {}),
-    ".pptx": (UnstructuredPowerPointLoader, {}),
-    ".txt": (TextLoader, {"encoding": "utf8"}),
-    # Add more mappings for other file extensions and loaders as needed
-}
-
-
-def load_single_document(file_path: str) -> List[Document]:
-    ext = "." + file_path.rsplit(".", 1)[-1].lower()
-    if ext in LOADER_MAPPING:
-        loader_class, loader_args = LOADER_MAPPING[ext]
-        loader = loader_class(file_path, **loader_args)
-        return loader.load()
-
-    raise ValueError(f"Unsupported file extension '{ext}'")
-
-def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
-    """
-    Loads all documents from the source documents directory, ignoring specified files
-    """
-    all_files = []
-    for ext in LOADER_MAPPING:
-        all_files.extend(
-            glob.glob(os.path.join(source_dir, f"**/*{ext.lower()}"), recursive=True)
-        )
-        all_files.extend(
-            glob.glob(os.path.join(source_dir, f"**/*{ext.upper()}"), recursive=True)
-        )
-    filtered_files = [file_path for file_path in all_files if file_path not in ignored_files]
-
-    with Pool(processes=os.cpu_count()) as pool:
-        results = []
-        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
-            for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
-                results.extend(docs)
-                pbar.update()
-
-    return results
-
-def process_documents(ignored_files: List[str] = []) -> List[Document]:
-    """
-    Load documents and split in chunks
-    """
-    print(f"Loading documents from {source_directory}")
-    documents = load_documents(source_directory, ignored_files)
-    if not documents:
-        print("No new documents to load")
-        exit(0)
-    print(f"Loaded {len(documents)} new documents from {source_directory}")
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-    documents = text_splitter.split_documents(documents)
-    print(f"Split into {len(documents)} chunks of text (max. {chunk_size} tokens each)")
-    return documents
-
-def batch_chromadb_insertions(chroma_client: API, documents: List[Document]) -> List[Document]:
-    """
-    Split the total documents to be inserted into batches of documents that the local chroma client can process
-    """
-    # Get max batch size.
-    max_batch_size = chroma_client.max_batch_size
-    for i in range(0, len(documents), max_batch_size):
-        yield documents[i:i + max_batch_size]
-
-
-def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool:
-    """
-    Checks if vectorstore exists
-    """
-    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
-    if not db.get()['documents']:
-        return False
-    return True
-
-def main():
-    # Create embeddings
-    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
-    # Chroma client
-    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS , path=persist_directory)
-
-    if does_vectorstore_exist(persist_directory, embeddings):
-        # Update and store locally vectorstore
-        print(f"Appending to existing vectorstore at {persist_directory}")
-        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
-        collection = db.get()
-        documents = process_documents([metadata['source'] for metadata in collection['metadatas']])
-        print(f"Creating embeddings. May take some minutes...")
-        for batched_chromadb_insertion in batch_chromadb_insertions(chroma_client, documents):
-            db.add_documents(batched_chromadb_insertion)
-    else:
-        # Create and store locally vectorstore
-        print("Creating new vectorstore")
-        documents = process_documents()
-        print(f"Creating embeddings. May take some minutes...")
-        # Create the db with the first batch of documents to insert
-        batched_chromadb_insertions = batch_chromadb_insertions(chroma_client, documents)
-        first_insertion = next(batched_chromadb_insertions)
-        db = Chroma.from_documents(first_insertion, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client)
-        # Add the rest of batches of documents
-        for batched_chromadb_insertion in batched_chromadb_insertions:
-            db.add_documents(batched_chromadb_insertion)
-
-    print(f"Ingestion complete! You can now run privateGPT.py to query your documents")
-
-
-if __name__ == "__main__":
-    main()
--- a/local_data/.gitignore
+++ b/local_data/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
--- a/models/.gitignore
+++ b/models/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
--- a/poetry.lock
+++ b/poetry.lock
--- a/privateGPT.py
+++ b/privateGPT.py
@@ -1,87 +0,0 @@
-#!/usr/bin/env python3
-from dotenv import load_dotenv
-from langchain.chains import RetrievalQA
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
-from langchain.vectorstores import Chroma
-from langchain.llms import GPT4All, LlamaCpp
-import chromadb
-import os
-import argparse
-import time
-
-if not load_dotenv():
-    print("Could not load .env file or it is empty. Please check if it exists and is readable.")
-    exit(1)
-
-embeddings_model_name = os.environ.get("EMBEDDINGS_MODEL_NAME")
-persist_directory = os.environ.get('PERSIST_DIRECTORY')
-
-model_type = os.environ.get('MODEL_TYPE')
-model_path = os.environ.get('MODEL_PATH')
-model_n_ctx = os.environ.get('MODEL_N_CTX')
-model_n_batch = int(os.environ.get('MODEL_N_BATCH',8))
-target_source_chunks = int(os.environ.get('TARGET_SOURCE_CHUNKS',4))
-
-from constants import CHROMA_SETTINGS
-
-def main():
-    # Parse the command line arguments
-    args = parse_arguments()
-    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
-    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS , path=persist_directory)
-    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
-    retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})
-    # activate/deactivate the streaming StdOut callback for LLMs
-    callbacks = [] if args.mute_stream else [StreamingStdOutCallbackHandler()]
-    # Prepare the LLM
-    match model_type:
-        case "LlamaCpp":
-            llm = LlamaCpp(model_path=model_path, max_tokens=model_n_ctx, n_batch=model_n_batch, callbacks=callbacks, verbose=False)
-        case "GPT4All":
-            llm = GPT4All(model=model_path, max_tokens=model_n_ctx, backend='gptj', n_batch=model_n_batch, callbacks=callbacks, verbose=False)
-        case _default:
-            # raise exception if model_type is not supported
-            raise Exception(f"Model type {model_type} is not supported. Please choose one of the following: LlamaCpp, GPT4All")
-
-    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents= not args.hide_source)
-    # Interactive questions and answers
-    while True:
-        query = input("\nEnter a query: ")
-        if query == "exit":
-            break
-        if query.strip() == "":
-            continue
-
-        # Get the answer from the chain
-        start = time.time()
-        res = qa(query)
-        answer, docs = res['result'], [] if args.hide_source else res['source_documents']
-        end = time.time()
-
-        # Print the result
-        print("\n\n> Question:")
-        print(query)
-        print(f"\n> Answer (took {round(end - start, 2)} s.):")
-        print(answer)
-
-        # Print the relevant sources used for the answer
-        for document in docs:
-            print("\n> " + document.metadata["source"] + ":")
-            print(document.page_content)
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(description='privateGPT: Ask questions to your documents without an internet connection, '
-                                                 'using the power of LLMs.')
-    parser.add_argument("--hide-source", "-S", action='store_true',
-                        help='Use this flag to disable printing of source documents used for answers.')
-
-    parser.add_argument("--mute-stream", "-M",
-                        action='store_true',
-                        help='Use this flag to disable the streaming StdOut callback for LLMs.')
-
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    main()
--- a/private_gpt/init.py
+++ b/private_gpt/init.py
@@ -0,0 +1,23 @@
+"""private-gpt."""
+import logging
+import os
+
+# Set to 'DEBUG' to have extensive logging turned on, even for libraries
+ROOT_LOG_LEVEL = "INFO"
+
+# Record layout: time with milliseconds, level name left-justified in 8
+# characters, logger name in a 25-character column, then the message.
+PRETTY_LOG_FORMAT = (
+    "%(asctime)s.%(msecs)03d [%(levelname)-8s] %(name)+25s - %(message)s"
+)
+# Configure the root logger as a side effect of importing this module.
+logging.basicConfig(level=ROOT_LOG_LEVEL, format=PRETTY_LOG_FORMAT, datefmt="%H:%M:%S")
+# Redirect messages issued through the `warnings` module into logging too.
+logging.captureWarnings(True)
+
+# Disable gradio analytics
+# This is done this way because gradio does not solely rely on what values are
+# passed to gr.Blocks(enable_analytics=...) but also on the environment
+# variable GRADIO_ANALYTICS_ENABLED. `gradio.strings` actually reads this env
+# directly, so to fully disable gradio analytics we need to set this env var.
+os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
+
+# Disable chromaDB telemetry
+# It is already disabled, see PR#1144
+# os.environ["ANONYMIZED_TELEMETRY"] = "False"
--- a/private_gpt/main.py
+++ b/private_gpt/main.py
@@ -0,0 +1,11 @@
+# Start a FastAPI server with uvicorn.
+
+import uvicorn
+
+from private_gpt.main import app
+from private_gpt.settings.settings import settings
+
+# Pass log_config=None so that uvicorn does not install its own logging
+# configuration and ours is used instead. For reference, see below:
+# https://github.com/tiangolo/fastapi/discussions/7457#discussioncomment-5141108
+# The server listens on all interfaces ("0.0.0.0"); the port is read from settings.
+uvicorn.run(app, host="0.0.0.0", port=settings().server.port, log_config=None)
--- a/private_gpt/components/init.py
+++ b/private_gpt/components/init.py
--- a/private_gpt/components/embedding/init.py
+++ b/private_gpt/components/embedding/init.py
--- a/private_gpt/components/embedding/custom/init.py
+++ b/private_gpt/components/embedding/custom/init.py
--- a/private_gpt/components/embedding/custom/sagemaker.py
+++ b/private_gpt/components/embedding/custom/sagemaker.py
@@ -0,0 +1,82 @@
+# mypy: ignore-errors
+import json
+from typing import Any
+
+import boto3
+from llama_index.embeddings.base import BaseEmbedding
+from pydantic import Field, PrivateAttr
+
+
+class SagemakerEmbedding(BaseEmbedding):
+    """Sagemaker Embedding Endpoint.
+
+    To use, you must supply the endpoint name from your deployed
+    Sagemaker embedding model. The class exposes no region or credential
+    profile field: the boto3 client is created without arguments, so the
+    region and credentials come from boto3's default resolution.
+
+    To authenticate, the AWS client uses the following methods to
+    automatically load credentials:
+    https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html
+
+    Make sure the credentials / roles used have the required policies to
+    access the Sagemaker endpoint.
+    See: https://docs.aws.amazon.com/IAM/latest/UserGuide/access_policies.html
+    """
+
+    # Name of the deployed Sagemaker endpoint invoked by `_embed`.
+    endpoint_name: str = Field(description="")
+
+    # NOTE(review): this client is built while the class body executes, i.e.
+    # as soon as this module is imported, and it is declared at class level
+    # rather than per instance. Importing the module therefore requires a
+    # usable boto3 configuration -- confirm this is intended.
+    _boto_client: Any = boto3.client(
+        "sagemaker-runtime",
+    )  # TODO make it an optional field
+
+    # Set to True once the "async not available" notice has been printed.
+    _async_not_implemented_warned: bool = PrivateAttr(default=False)
+
+    @classmethod
+    def class_name(cls) -> str:
+        """Return the fixed identifier "SagemakerEmbedding"."""
+        return "SagemakerEmbedding"
+
+    def _async_not_implemented_warn_once(self) -> None:
+        """Print the sync-fallback notice the first time it is called."""
+        if not self._async_not_implemented_warned:
+            print("Async embedding not available, falling back to sync method.")
+            self._async_not_implemented_warned = True
+
+    def _embed(self, sentences: list[str]) -> list[list[float]]:
+        """Invoke the endpoint with `sentences` and return the embeddings.
+
+        The request body is the JSON object ``{"inputs": sentences}``; the
+        response body is decoded as UTF-8 JSON and its ``"vectors"`` entry
+        is returned. A response without that key raises ``KeyError``.
+        """
+        request_params = {
+            "inputs": sentences,
+        }
+
+        resp = self._boto_client.invoke_endpoint(
+            EndpointName=self.endpoint_name,
+            Body=json.dumps(request_params),
+            ContentType="application/json",
+        )
+
+        response_body = resp["Body"]
+        response_str = response_body.read().decode("utf-8")
+        response_json = json.loads(response_str)
+
+        return response_json["vectors"]
+
+    def _get_query_embedding(self, query: str) -> list[float]:
+        """Get query embedding."""
+        # Single-item batch: take the first (only) vector returned.
+        return self._embed([query])[0]
+
+    async def _aget_query_embedding(self, query: str) -> list[float]:
+        """Async entry point that delegates to the blocking sync method."""
+        # Warn the user that sync is being used
+        self._async_not_implemented_warn_once()
+        return self._get_query_embedding(query)
+
+    async def _aget_text_embedding(self, text: str) -> list[float]:
+        """Async entry point that delegates to the blocking sync method."""
+        # Warn the user that sync is being used
+        self._async_not_implemented_warn_once()
+        return self._get_text_embedding(text)
+
+    def _get_text_embedding(self, text: str) -> list[float]:
+        """Get text embedding."""
+        # Single-item batch: take the first (only) vector returned.
+        return self._embed([text])[0]
+
+    def _get_text_embeddings(self, texts: list[str]) -> list[list[float]]:
+        """Get text embeddings."""
+        # All texts are sent to the endpoint in one request.
+        return self._embed(texts)
--- a/private_gpt/components/embedding/embedding_component.py
+++ b/private_gpt/components/embedding/embedding_component.py
@@ -0,0 +1,46 @@
+import logging
+
+from injector import inject, singleton
+from llama_index import MockEmbedding
+from llama_index.embeddings.base import BaseEmbedding
+
+from private_gpt.paths import models_cache_path
+from private_gpt.settings.settings import Settings
+
+logger = logging.getLogger(__name__)
+
+
+@singleton
+class EmbeddingComponent:
+    """Build the embedding model selected by ``settings.embedding.mode``.
+
+    Supported modes are "local", "sagemaker", "openai" and "mock". The
+    resulting model is exposed as ``embedding_model``.
+    """
+
+    embedding_model: BaseEmbedding
+
+    @inject
+    def __init__(self, settings: Settings) -> None:
+        """Instantiate the embedding model for the configured mode.
+
+        Raises:
+            ValueError: if the configured mode is not one of the supported
+                values, instead of leaving ``embedding_model`` unset.
+        """
+        embedding_mode = settings.embedding.mode
+        logger.info("Initializing the embedding model in mode=%s", embedding_mode)
+        match embedding_mode:
+            case "local":
+                # Imported lazily so the dependency is only loaded when used.
+                from llama_index.embeddings import HuggingFaceEmbedding
+
+                self.embedding_model = HuggingFaceEmbedding(
+                    model_name=settings.local.embedding_hf_model_name,
+                    cache_folder=str(models_cache_path),
+                )
+            case "sagemaker":
+                from private_gpt.components.embedding.custom.sagemaker import (
+                    SagemakerEmbedding,
+                )
+
+                self.embedding_model = SagemakerEmbedding(
+                    endpoint_name=settings.sagemaker.embedding_endpoint_name,
+                )
+            case "openai":
+                from llama_index import OpenAIEmbedding
+
+                openai_api_key = settings.openai.api_key
+                self.embedding_model = OpenAIEmbedding(api_key=openai_api_key)
+            case "mock":
+                # Not a random number, is the dimensionality used by
+                # the default embedding model
+                self.embedding_model = MockEmbedding(384)
+            case _:
+                # Fail fast: without this branch the attribute would stay
+                # unset and surface later as an unrelated AttributeError.
+                raise ValueError(f"Unsupported embedding mode: {embedding_mode!r}")
--- a/private_gpt/components/ingest/init.py
+++ b/private_gpt/components/ingest/init.py
--- a/private_gpt/components/ingest/ingest_component.py
+++ b/private_gpt/components/ingest/ingest_component.py
@@ -0,0 +1,328 @@
+import abc
+import itertools
+import logging
+import multiprocessing
+import multiprocessing.pool
+import os
+import threading
+from pathlib import Path
+from typing import Any
+
+from llama_index import (
+    Document,
+    ServiceContext,
+    StorageContext,
+    VectorStoreIndex,
+    load_index_from_storage,
+)
+from llama_index.data_structs import IndexDict
+from llama_index.indices.base import BaseIndex
+from llama_index.ingestion import run_transformations
+
+from private_gpt.components.ingest.ingest_helper import IngestionHelper
+from private_gpt.paths import local_data_path
+from private_gpt.settings.settings import Settings
+
+logger = logging.getLogger(__name__)
+
+
+class BaseIngestComponent(abc.ABC):
+    def __init__(
+        self,
+        storage_context: StorageContext,
+        service_context: ServiceContext,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        logger.debug("Initializing base ingest component type=%s", type(self).__name__)
+        self.storage_context = storage_context
+        self.service_context = service_context
+
+    @abc.abstractmethod
+    def ingest(self, file_name: str, file_data: Path) -> list[Document]:
+        pass
+
+    @abc.abstractmethod
+    def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[Document]:
+        pass
+
+    @abc.abstractmethod
+    def delete(self, doc_id: str) -> None:
+        pass
+
+
+class BaseIngestComponentWithIndex(BaseIngestComponent, abc.ABC):
+    def __init__(
+        self,
+        storage_context: StorageContext,
+        service_context: ServiceContext,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(storage_context, service_context, *args, **kwargs)
+
+        self.show_progress = True
+        self._index_thread_lock = (
+            threading.Lock()
+        )  # Thread lock! Not Multiprocessing lock
+        self._index = self._initialize_index()
+
+    def _initialize_index(self) -> BaseIndex[IndexDict]:
+        """Load the index from the storage context, or create an empty one."""
+        try:
+            # Load with store_nodes_override=True so that the nodes can be deleted later
+            index = load_index_from_storage(
+                storage_context=self.storage_context,
+                service_context=self.service_context,
+                store_nodes_override=True,  # Force store nodes in index and document stores
+                show_progress=self.show_progress,
+            )
+        except ValueError:
+            # There is no index in the storage context yet: create a new, empty one
+            logger.info("Creating a new vector store index")
+            index = VectorStoreIndex.from_documents(
+                [],
+                storage_context=self.storage_context,
+                service_context=self.service_context,
+                store_nodes_override=True,  # Force store nodes in index and document stores
+                show_progress=self.show_progress,
+            )
+            index.storage_context.persist(persist_dir=local_data_path)
+        return index
+
+    def _save_index(self) -> None:
+        self._index.storage_context.persist(persist_dir=local_data_path)
+
+    def delete(self, doc_id: str) -> None:
+        with self._index_thread_lock:
+            # Delete the document from the index
+            self._index.delete_ref_doc(doc_id, delete_from_docstore=True)
+
+            # Save the index
+            self._save_index()
+
+
+class SimpleIngestComponent(BaseIngestComponentWithIndex):
+    def __init__(
+        self,
+        storage_context: StorageContext,
+        service_context: ServiceContext,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(storage_context, service_context, *args, **kwargs)
+
+    def ingest(self, file_name: str, file_data: Path) -> list[Document]:
+        logger.info("Ingesting file_name=%s", file_name)
+        documents = IngestionHelper.transform_file_into_documents(file_name, file_data)
+        logger.info(
+            "Transformed file=%s into count=%s documents", file_name, len(documents)
+        )
+        logger.debug("Saving the documents in the index and doc store")
+        return self._save_docs(documents)
+
+    def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[Document]:
+        saved_documents = []
+        for file_name, file_data in files:
+            documents = IngestionHelper.transform_file_into_documents(
+                file_name, file_data
+            )
+            saved_documents.extend(self._save_docs(documents))
+        return saved_documents
+
+    def _save_docs(self, documents: list[Document]) -> list[Document]:
+        logger.debug("Transforming count=%s documents into nodes", len(documents))
+        with self._index_thread_lock:
+            for document in documents:
+                self._index.insert(document, show_progress=True)
+            logger.debug("Persisting the index and nodes")
+            # persist the index and nodes
+            self._save_index()
+            logger.debug("Persisted the index and nodes")
+        return documents
+
+
+class BatchIngestComponent(BaseIngestComponentWithIndex):
+    """Parallelize the file reading and parsing on multiple CPU core.
+
+    This also makes the embeddings to be computed in batches (on GPU or CPU).
+    """
+
+    def __init__(
+        self,
+        storage_context: StorageContext,
+        service_context: ServiceContext,
+        count_workers: int,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(storage_context, service_context, *args, **kwargs)
+        # Make an efficient use of the CPU and GPU, the embedding
+        # must be in the transformations
+        assert (
+            len(self.service_context.transformations) >= 2
+        ), "Embeddings must be in the transformations"
+        assert count_workers > 0, "count_workers must be > 0"
+        self.count_workers = count_workers
+
+        self._file_to_documents_work_pool = multiprocessing.Pool(
+            processes=self.count_workers
+        )
+
+    def ingest(self, file_name: str, file_data: Path) -> list[Document]:
+        logger.info("Ingesting file_name=%s", file_name)
+        documents = IngestionHelper.transform_file_into_documents(file_name, file_data)
+        logger.info(
+            "Transformed file=%s into count=%s documents", file_name, len(documents)
+        )
+        logger.debug("Saving the documents in the index and doc store")
+        return self._save_docs(documents)
+
+    def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[Document]:
+        documents = list(
+            itertools.chain.from_iterable(
+                self._file_to_documents_work_pool.starmap(
+                    IngestionHelper.transform_file_into_documents, files
+                )
+            )
+        )
+        logger.info(
+            "Transformed count=%s files into count=%s documents",
+            len(files),
+            len(documents),
+        )
+        return self._save_docs(documents)
+
+    def _save_docs(self, documents: list[Document]) -> list[Document]:
+        logger.debug("Transforming count=%s documents into nodes", len(documents))
+        nodes = run_transformations(
+            documents,  # type: ignore[arg-type]
+            self.service_context.transformations,
+            show_progress=self.show_progress,
+        )
+        # Locking the index to avoid concurrent writes
+        with self._index_thread_lock:
+            logger.info("Inserting count=%s nodes in the index", len(nodes))
+            self._index.insert_nodes(nodes, show_progress=True)
+            for document in documents:
+                self._index.docstore.set_document_hash(
+                    document.get_doc_id(), document.hash
+                )
+            logger.debug("Persisting the index and nodes")
+            # persist the index and nodes
+            self._save_index()
+            logger.debug("Persisted the index and nodes")
+        return documents
+
+
+class ParallelizedIngestComponent(BaseIngestComponentWithIndex):
+    """Parallelize the file ingestion (file reading, embeddings, and index insertion).
+
+    This uses the CPU and GPU in parallel (both running at the same time), and
+    reduces memory pressure by not loading all the files in memory at the same time.
+    """
+
+    def __init__(
+        self,
+        storage_context: StorageContext,
+        service_context: ServiceContext,
+        count_workers: int,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(storage_context, service_context, *args, **kwargs)
+        # To make an efficient use of the CPU and GPU, the embeddings
+        # must be in the transformations (to be computed in batches)
+        assert (
+            len(self.service_context.transformations) >= 2
+        ), "Embeddings must be in the transformations"
+        assert count_workers > 0, "count_workers must be > 0"
+        self.count_workers = count_workers
+        # We are doing our own multiprocessing here. To avoid colliding with the
+        # parallelism of the huggingface tokenizers, disable it (process-wide).
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+        self._ingest_work_pool = multiprocessing.pool.ThreadPool(
+            processes=self.count_workers
+        )
+
+        self._file_to_documents_work_pool = multiprocessing.Pool(
+            processes=self.count_workers
+        )
+
+    def ingest(self, file_name: str, file_data: Path) -> list[Document]:
+        logger.info("Ingesting file_name=%s", file_name)
+        # Running in a single (1) process to release the current
+        # thread, and take a dedicated CPU core for computation
+        documents = self._file_to_documents_work_pool.apply(
+            IngestionHelper.transform_file_into_documents, (file_name, file_data)
+        )
+        logger.info(
+            "Transformed file=%s into count=%s documents", file_name, len(documents)
+        )
+        logger.debug("Saving the documents in the index and doc store")
+        return self._save_docs(documents)
+
+    def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[Document]:
+        # Lightweight threads, used to parallelize the underlying IO calls
+        # made during ingestion: each thread runs `ingest` for one file
+
+        documents = list(
+            itertools.chain.from_iterable(
+                self._ingest_work_pool.starmap(self.ingest, files)
+            )
+        )
+        return documents
+
+    def _save_docs(self, documents: list[Document]) -> list[Document]:
+        logger.debug("Transforming count=%s documents into nodes", len(documents))
+        nodes = run_transformations(
+            documents,  # type: ignore[arg-type]
+            self.service_context.transformations,
+            show_progress=self.show_progress,
+        )
+        # Lock only the index writes; the transformations above run outside the lock
+        with self._index_thread_lock:
+            logger.info("Inserting count=%s nodes in the index", len(nodes))
+            self._index.insert_nodes(nodes, show_progress=True)
+            for document in documents:
+                self._index.docstore.set_document_hash(
+                    document.get_doc_id(), document.hash
+                )
+            logger.debug("Persisting the index and nodes")
+            # Persist the index and nodes while still holding the lock
+            self._save_index()
+            logger.debug("Persisted the index and nodes")
+        return documents
+
+    def __del__(self) -> None:
+        # Clean up both multiprocessing pools when the object is deleted.
+        # The root logger is used so that logging here does not depend on the
+        # module-level logger, which could be deleted before the pools.
+        logging.debug("Closing the ingest work pool")
+        self._ingest_work_pool.close()
+        self._ingest_work_pool.join()
+        self._ingest_work_pool.terminate()
+        logging.debug("Closing the file to documents work pool")
+        self._file_to_documents_work_pool.close()
+        self._file_to_documents_work_pool.join()
+        self._file_to_documents_work_pool.terminate()
+
+
+def get_ingestion_component(
+    storage_context: StorageContext,
+    service_context: ServiceContext,
+    settings: Settings,
+) -> BaseIngestComponent:
+    """Get the ingestion component for the given configuration."""
+    ingest_mode = settings.embedding.ingest_mode
+    if ingest_mode == "batch":
+        return BatchIngestComponent(
+            storage_context, service_context, settings.embedding.count_workers
+        )
+    elif ingest_mode == "parallel":
+        return ParallelizedIngestComponent(
+            storage_context, service_context, settings.embedding.count_workers
+        )
+    else:
+        return SimpleIngestComponent(storage_context, service_context)
--- a/private_gpt/components/ingest/ingest_helper.py
+++ b/private_gpt/components/ingest/ingest_helper.py
@@ -0,0 +1,61 @@
+import logging
+from pathlib import Path
+
+from llama_index import Document
+from llama_index.readers import JSONReader, StringIterableReader
+from llama_index.readers.file.base import DEFAULT_FILE_READER_CLS
+
+logger = logging.getLogger(__name__)
+
+# Patching the default file reader to support other file types
+FILE_READER_CLS = DEFAULT_FILE_READER_CLS.copy()
+FILE_READER_CLS.update(
+    {
+        ".json": JSONReader,
+    }
+)
+
+
+class IngestionHelper:
+    """Helper class to transform a file into a list of documents.
+
+    This class should be used to transform a file into a list of documents.
+    These methods are thread-safe (and multiprocessing-safe).
+    """
+
+    @staticmethod
+    def transform_file_into_documents(
+        file_name: str, file_data: Path
+    ) -> list[Document]:
+        documents = IngestionHelper._load_file_to_documents(file_name, file_data)
+        for document in documents:
+            document.metadata["file_name"] = file_name
+        IngestionHelper._exclude_metadata(documents)
+        return documents
+
+    @staticmethod
+    def _load_file_to_documents(file_name: str, file_data: Path) -> list[Document]:
+        logger.debug("Transforming file_name=%s into documents", file_name)
+        extension = Path(file_name).suffix
+        reader_cls = FILE_READER_CLS.get(extension)
+        if reader_cls is None:
+            logger.debug(
+                "No reader found for extension=%s, using default string reader",
+                extension,
+            )
+            # Read as a plain text
+            string_reader = StringIterableReader()
+            return string_reader.load_data([file_data.read_text()])
+
+        logger.debug("Specific reader found for extension=%s", extension)
+        return reader_cls().load_data(file_data)
+
+    @staticmethod
+    def _exclude_metadata(documents: list[Document]) -> None:
+        logger.debug("Excluding metadata from count=%s documents", len(documents))
+        for document in documents:
+            document.metadata["doc_id"] = document.doc_id
+            # We don't want the Embeddings search to receive this metadata
+            document.excluded_embed_metadata_keys = ["doc_id"]
+            # We don't want the LLM to receive these metadata in the context
+            document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page_label"]
--- a/private_gpt/components/llm/init.py
+++ b/private_gpt/components/llm/init.py
@@ -0,0 +1 @@
+"""LLM implementations."""
--- a/private_gpt/components/llm/custom/init.py
+++ b/private_gpt/components/llm/custom/init.py
--- a/private_gpt/components/llm/custom/sagemaker.py
+++ b/private_gpt/components/llm/custom/sagemaker.py
@@ -0,0 +1,275 @@
+# mypy: ignore-errors
+from __future__ import annotations
+
+import io
+import json
+import logging
+from typing import TYPE_CHECKING, Any
+
+import boto3  # type: ignore
+from llama_index.bridge.pydantic import Field
+from llama_index.llms import (
+    CompletionResponse,
+    CustomLLM,
+    LLMMetadata,
+)
+from llama_index.llms.base import (
+    llm_chat_callback,
+    llm_completion_callback,
+)
+from llama_index.llms.generic_utils import (
+    completion_response_to_chat_response,
+    stream_completion_response_to_chat_response,
+)
+from llama_index.llms.llama_utils import (
+    completion_to_prompt as generic_completion_to_prompt,
+)
+from llama_index.llms.llama_utils import (
+    messages_to_prompt as generic_messages_to_prompt,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    from llama_index.callbacks import CallbackManager
+    from llama_index.llms import (
+        ChatMessage,
+        ChatResponse,
+        ChatResponseGen,
+        CompletionResponseGen,
+    )
+
+logger = logging.getLogger(__name__)
+
+
+class LineIterator:
+    r"""A helper class for parsing the byte stream input from TGI container.
+
+    The output of the model will be in the following format:
+    ```
+    b'data:{"token": {"text": " a"}}\n\n'
+    b'data:{"token": {"text": " challenging"}}\n\n'
+    b'data:{"token": {"text": " problem"
+    b'}}'
+    ...
+    ```
+
+    While usually each PayloadPart event from the event stream will contain a byte array
+    with a full json, this is not guaranteed and some of the json objects may be split
+    across PayloadPart events. For example:
+    ```
+    {'PayloadPart': {'Bytes': b'{"outputs": '}}
+    {'PayloadPart': {'Bytes': b'[" problem"]}\n'}}
+    ```
+
+    This class accounts for this by appending every received payload to an internal
+    buffer and returning, on each iteration, the next complete line found in it
+    (without its trailing '\n'). It tracks the last read position so that bytes
+    already returned are not exposed again. A line that does not yet end with '\n'
+    stays buffered until more bytes arrive, so split payloads are concatenated; once
+    the stream is exhausted, any unterminated tail is returned as a final line.
+    """
+
+    def __init__(self, stream: Any) -> None:
+        """Line iterator initializer."""
+        self.byte_iterator = iter(stream)
+        self.buffer = io.BytesIO()
+        self.read_pos = 0  # offset of the first buffered byte not yet returned
+
+    def __iter__(self) -> Any:
+        """Self iterator."""
+        return self
+
+    def __next__(self) -> Any:
+        """Return the next buffered line, reading more events when needed."""
+        while True:
+            self.buffer.seek(self.read_pos)
+            line = self.buffer.readline()
+            if line and line[-1] == ord("\n"):
+                self.read_pos += len(line)
+                return line[:-1]
+            try:
+                chunk = next(self.byte_iterator)
+            except StopIteration:
+                if not line:
+                    raise
+                self.read_pos += len(line)  # unterminated tail: emit it only once
+                return line
+            if "PayloadPart" not in chunk:
+                logger.warning("Unknown event type=%s", chunk)
+                continue
+            self.buffer.seek(0, io.SEEK_END)
+            self.buffer.write(chunk["PayloadPart"]["Bytes"])
+
+
+class SagemakerLLM(CustomLLM):
+    """Sagemaker Inference Endpoint models.
+
+    To use, you must supply the endpoint name from your deployed
+    Sagemaker model & the region where it is deployed.
+
+    To authenticate, the AWS client uses the following methods to
+    automatically load credentials:
+    https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html
+
+    If a specific credential profile should be used, you must pass
+    the name of the profile from the ~/.aws/credentials file that is to be used.
+
+    Make sure the credentials / roles used have the required policies to
+    access the Sagemaker endpoint.
+    See: https://docs.aws.amazon.com/IAM/latest/UserGuide/access_policies.html
+    """
+
+    endpoint_name: str = Field(description="")
+    temperature: float = Field(description="The temperature to use for sampling.")
+    max_new_tokens: int = Field(description="The maximum number of tokens to generate.")
+    context_window: int = Field(
+        description="The maximum number of context tokens for the model."
+    )
+    messages_to_prompt: Any = Field(
+        description="The function to convert messages to a prompt.", exclude=True
+    )
+    completion_to_prompt: Any = Field(
+        description="The function to convert a completion to a prompt.", exclude=True
+    )
+    generate_kwargs: dict[str, Any] = Field(
+        default_factory=dict, description="Kwargs used for generation."
+    )
+    model_kwargs: dict[str, Any] = Field(
+        default_factory=dict, description="Kwargs used for model initialization."
+    )
+    verbose: bool = Field(description="Whether to print verbose output.")
+
+    _boto_client: Any = boto3.client(
+        "sagemaker-runtime",
+    )  # TODO make it an optional field
+
+    def __init__(
+        self,
+        endpoint_name: str | None = "",
+        temperature: float = 0.1,
+        max_new_tokens: int = 512,  # to review defaults
+        context_window: int = 2048,  # to review defaults
+        messages_to_prompt: Any = None,
+        completion_to_prompt: Any = None,
+        callback_manager: CallbackManager | None = None,
+        generate_kwargs: dict[str, Any] | None = None,
+        model_kwargs: dict[str, Any] | None = None,
+        verbose: bool = True,
+    ) -> None:
+        """SagemakerLLM initializer."""
+        model_kwargs = model_kwargs or {}
+        model_kwargs.update({"n_ctx": context_window, "verbose": verbose})
+
+        messages_to_prompt = messages_to_prompt or generic_messages_to_prompt
+        completion_to_prompt = completion_to_prompt or generic_completion_to_prompt
+
+        generate_kwargs = generate_kwargs or {}
+        generate_kwargs.update(
+            {"temperature": temperature, "max_tokens": max_new_tokens}
+        )
+
+        super().__init__(
+            endpoint_name=endpoint_name,
+            temperature=temperature,
+            context_window=context_window,
+            max_new_tokens=max_new_tokens,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            callback_manager=callback_manager,
+            generate_kwargs=generate_kwargs,
+            model_kwargs=model_kwargs,
+            verbose=verbose,
+        )
+
+    @property
+    def inference_params(self):
+        # TODO expose the rest of params
+        return {
+            "do_sample": True,
+            "top_p": 0.7,
+            "temperature": self.temperature,
+            "top_k": 50,
+            "max_new_tokens": self.max_new_tokens,
+        }
+
+    @property
+    def metadata(self) -> LLMMetadata:
+        """Get LLM metadata."""
+        return LLMMetadata(
+            context_window=self.context_window,
+            num_output=self.max_new_tokens,
+            model_name="Sagemaker LLama 2",
+        )
+
+    @llm_completion_callback()
+    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
+        """Run a blocking completion of `prompt` against the Sagemaker endpoint."""
+        self.generate_kwargs.update({"stream": False})
+        is_formatted = kwargs.pop("formatted", False)
+        if not is_formatted:
+            prompt = self.completion_to_prompt(prompt)
+
+        request_params = {
+            "inputs": prompt,
+            "stream": False,
+            "parameters": self.inference_params,
+        }
+
+        resp = self._boto_client.invoke_endpoint(
+            EndpointName=self.endpoint_name,
+            Body=json.dumps(request_params),
+            ContentType="application/json",
+        )
+
+        response_body = resp["Body"]
+        response_str = response_body.read().decode("utf-8")
+        response_dict = json.loads(response_str)  # parse as data, never eval()
+
+        return CompletionResponse(
+            text=response_dict[0]["generated_text"][len(prompt) :], raw=resp
+        )
+
+    @llm_completion_callback()
+    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
+        def get_stream():
+            text = ""
+
+            request_params = {
+                "inputs": prompt,
+                "stream": True,
+                "parameters": self.inference_params,
+            }
+            resp = self._boto_client.invoke_endpoint_with_response_stream(
+                EndpointName=self.endpoint_name,
+                Body=json.dumps(request_params),
+                ContentType="application/json",
+            )
+
+            event_stream = resp["Body"]
+            start_json = b"{"
+            stop_token = "<|endoftext|>"
+
+            for line in LineIterator(event_stream):
+                if line != b"" and start_json in line:
+                    data = json.loads(line[line.find(start_json) :].decode("utf-8"))
+                    if data["token"]["text"] != stop_token:
+                        delta = data["token"]["text"]
+                        text += delta
+                        yield CompletionResponse(delta=delta, text=text, raw=data)
+
+        return get_stream()
+
+    @llm_chat_callback()
+    def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
+        prompt = self.messages_to_prompt(messages)
+        completion_response = self.complete(prompt, formatted=True, **kwargs)
+        return completion_response_to_chat_response(completion_response)
+
+    @llm_chat_callback()
+    def stream_chat(
+        self, messages: Sequence[ChatMessage], **kwargs: Any
+    ) -> ChatResponseGen:
+        prompt = self.messages_to_prompt(messages)
+        completion_response = self.stream_complete(prompt, formatted=True, **kwargs)
+        return stream_completion_response_to_chat_response(completion_response)
--- a/private_gpt/components/llm/llm_component.py
+++ b/private_gpt/components/llm/llm_component.py
@@ -0,0 +1,59 @@
+import logging
+
+from injector import inject, singleton
+from llama_index.llms import MockLLM
+from llama_index.llms.base import LLM
+
+from private_gpt.components.llm.prompt_helper import get_prompt_style
+from private_gpt.paths import models_path
+from private_gpt.settings.settings import Settings
+
+logger = logging.getLogger(__name__)
+
+
+@singleton
+class LLMComponent:
+    llm: LLM
+
+    @inject
+    def __init__(self, settings: Settings) -> None:
+        llm_mode = settings.llm.mode
+        logger.info("Initializing the LLM in mode=%s", llm_mode)
+        match settings.llm.mode:
+            case "local":
+                from llama_index.llms import LlamaCPP
+
+                prompt_style_cls = get_prompt_style(settings.local.prompt_style)
+                prompt_style = prompt_style_cls(
+                    default_system_prompt=settings.local.default_system_prompt
+                )
+
+                self.llm = LlamaCPP(
+                    model_path=str(models_path / settings.local.llm_hf_model_file),
+                    temperature=0.1,
+                    max_new_tokens=settings.llm.max_new_tokens,
+                    # llama2 has a context window of 4096 tokens,
+                    # but we set it lower to allow for some wiggle room
+                    context_window=3900,
+                    generate_kwargs={},
+                    # All to GPU
+                    model_kwargs={"n_gpu_layers": -1},
+                    # transform inputs into Llama2 format
+                    messages_to_prompt=prompt_style.messages_to_prompt,
+                    completion_to_prompt=prompt_style.completion_to_prompt,
+                    verbose=True,
+                )
+
+            case "sagemaker":
+                from private_gpt.components.llm.custom.sagemaker import SagemakerLLM
+
+                self.llm = SagemakerLLM(
+                    endpoint_name=settings.sagemaker.llm_endpoint_name,
+                )
+            case "openai":
+                from llama_index.llms import OpenAI
+
+                openai_settings = settings.openai.api_key
+                self.llm = OpenAI(api_key=openai_settings)
+            case "mock":
+                self.llm = MockLLM()
--- a/private_gpt/components/llm/prompt_helper.py
+++ b/private_gpt/components/llm/prompt_helper.py
@@ -0,0 +1,179 @@
+import abc
+import logging
+from collections.abc import Sequence
+from typing import Any, Literal
+
+from llama_index.llms import ChatMessage, MessageRole
+from llama_index.llms.llama_utils import (
+    DEFAULT_SYSTEM_PROMPT,
+    completion_to_prompt,
+    messages_to_prompt,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class AbstractPromptStyle(abc.ABC):
+    """Abstract class for prompt styles.
+
+    This class is used to format a series of messages into a prompt that can be
+    understood by the models. A series of messages represents the interaction(s)
+    between a user and an assistant. This series of messages can be considered as a
+    session between a user X and an assistant Y.This session holds, through the
+    messages, the state of the conversation. This session, to be understood by the
+    model, needs to be formatted into a prompt (i.e. a string that the models
+    can understand). Prompts can be formatted in different ways,
+    depending on the model.
+
+    The implementations of this class represent the different ways to format a
+    series of messages into a prompt.
+    """
+
+    @abc.abstractmethod
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        logger.debug("Initializing prompt_style=%s", self.__class__.__name__)
+
+    @abc.abstractmethod
+    def _messages_to_prompt(self, messages: Sequence[ChatMessage]) -> str:
+        pass
+
+    @abc.abstractmethod
+    def _completion_to_prompt(self, completion: str) -> str:
+        pass
+
+    def messages_to_prompt(self, messages: Sequence[ChatMessage]) -> str:
+        prompt = self._messages_to_prompt(messages)
+        logger.debug("Got for messages='%s' the prompt='%s'", messages, prompt)
+        return prompt
+
+    def completion_to_prompt(self, completion: str) -> str:
+        prompt = self._completion_to_prompt(completion)
+        logger.debug("Got for completion='%s' the prompt='%s'", completion, prompt)
+        return prompt
+
+
+class AbstractPromptStyleWithSystemPrompt(AbstractPromptStyle, abc.ABC):
+    _DEFAULT_SYSTEM_PROMPT = DEFAULT_SYSTEM_PROMPT
+
+    def __init__(self, default_system_prompt: str | None) -> None:
+        super().__init__()
+        logger.debug("Got default_system_prompt='%s'", default_system_prompt)
+        self.default_system_prompt = default_system_prompt
+
+
+class DefaultPromptStyle(AbstractPromptStyle):
+    """Default prompt style that uses the defaults from llama_utils.
+
+    It basically passes None to the LLM, indicating it should use
+    the default functions.
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+
+        # Hacky way to override the functions
+        # Override the functions to be None, and pass None to the LLM.
+        self.messages_to_prompt = None  # type: ignore[method-assign, assignment]
+        self.completion_to_prompt = None  # type: ignore[method-assign, assignment]
+
+    def _messages_to_prompt(self, messages: Sequence[ChatMessage]) -> str:
+        return ""
+
+    def _completion_to_prompt(self, completion: str) -> str:
+        return ""
+
+
+class Llama2PromptStyle(AbstractPromptStyleWithSystemPrompt):
+    """Simple prompt style that just uses the default llama_utils functions.
+
+    It transforms the sequence of messages into a prompt that should look like:
+    ```text
+    <s> [INST] <<SYS>> your system prompt here. <</SYS>>
+
+    user message here [/INST] assistant (model) response here </s>
+    ```
+    """
+
+    def __init__(self, default_system_prompt: str | None = None) -> None:
+        # If no system prompt is given, the default one of the implementation is used.
+        super().__init__(default_system_prompt=default_system_prompt)
+
+    def _messages_to_prompt(self, messages: Sequence[ChatMessage]) -> str:
+        return messages_to_prompt(messages, self.default_system_prompt)
+
+    def _completion_to_prompt(self, completion: str) -> str:
+        return completion_to_prompt(completion, self.default_system_prompt)
+
+
+class TagPromptStyle(AbstractPromptStyleWithSystemPrompt):
+    """Tag prompt style (used by Vigogne) that uses the prompt style `<|ROLE|>`.
+
+    It transforms the sequence of messages into a prompt that should look like:
+    ```text
+    <|system|>: your system prompt here.
+    <|user|>: user message here
+    (possibly with context and question)
+    <|assistant|>: assistant (model) response here.
+    ```
+
+    FIXME: should we add surrounding `<s>` and `</s>` tags, like in llama2?
+    """
+
+    def __init__(self, default_system_prompt: str | None = None) -> None:
+        # We have to define a default system prompt here as the LLM will not
+        # use the default llama_utils functions.
+        default_system_prompt = default_system_prompt or self._DEFAULT_SYSTEM_PROMPT
+        super().__init__(default_system_prompt)
+        self.system_prompt: str = default_system_prompt
+
+    def _messages_to_prompt(self, messages: Sequence[ChatMessage]) -> str:
+        """Format messages, prepending the default system prompt when none leads."""
+        messages = list(messages)
+        # `not messages` guards the [0] lookup: an empty session would raise IndexError
+        if not messages or messages[0].role != MessageRole.SYSTEM:
+            logger.info(
+                "Adding system_promt='%s' to the given messages as there are none given in the session",
+                self.system_prompt,
+            )
+            system = ChatMessage(content=self.system_prompt, role=MessageRole.SYSTEM)
+            messages.insert(0, system)
+        return self._format_messages_to_prompt(messages)
+
+    def _completion_to_prompt(self, completion: str) -> str:
+        return (
+            f"<|system|>: {self.system_prompt.strip()}\n"
+            f"<|user|>: {completion.strip()}\n"
+            "<|assistant|>: "
+        )
+
+    @staticmethod
+    def _format_messages_to_prompt(messages: list[ChatMessage]) -> str:
+        """Format message to prompt with `<|ROLE|>: MSG` style."""
+        assert messages[0].role == MessageRole.SYSTEM
+        prompt = ""
+        for message in messages:
+            role = message.role
+            content = message.content or ""
+            message_from_user = f"<|{role.lower()}|>: {content.strip()}"
+            message_from_user += "\n"
+            prompt += message_from_user
+        # we are missing the last <|assistant|> tag that will trigger a completion
+        prompt += "<|assistant|>: "
+        return prompt
+
+
+def get_prompt_style(
+    prompt_style: Literal["default", "llama2", "tag"] | None
+) -> type[AbstractPromptStyle]:
+    """Get the prompt style to use from the given string.
+
+    :param prompt_style: The prompt style to use.
+    :return: The prompt style to use.
+    """
+    if prompt_style is None or prompt_style == "default":
+        return DefaultPromptStyle
+    elif prompt_style == "llama2":
+        return Llama2PromptStyle
+    elif prompt_style == "tag":
+        return TagPromptStyle
+    raise ValueError(f"Unknown prompt_style='{prompt_style}'")
--- a/private_gpt/components/node_store/init.py
+++ b/private_gpt/components/node_store/init.py
--- a/private_gpt/components/node_store/node_store_component.py
+++ b/private_gpt/components/node_store/node_store_component.py
@@ -0,0 +1,34 @@
+import logging
+
+from injector import inject, singleton
+from llama_index.storage.docstore import BaseDocumentStore, SimpleDocumentStore
+from llama_index.storage.index_store import SimpleIndexStore
+from llama_index.storage.index_store.types import BaseIndexStore
+
+from private_gpt.paths import local_data_path
+
+logger = logging.getLogger(__name__)
+
+
+@singleton
+class NodeStoreComponent:
+    """Holds the index store and the document store used by the application.
+
+    Each store is loaded from `local_data_path`; when loading raises
+    `FileNotFoundError`, a new empty store is created instead.
+    """
+
+    index_store: BaseIndexStore
+    doc_store: BaseDocumentStore
+
+    @inject
+    def __init__(self) -> None:
+        persist_dir = str(local_data_path)
+
+        try:
+            index_store = SimpleIndexStore.from_persist_dir(persist_dir=persist_dir)
+        except FileNotFoundError:
+            logger.debug("Local index store not found, creating a new one")
+            index_store = SimpleIndexStore()
+        self.index_store = index_store
+
+        try:
+            doc_store = SimpleDocumentStore.from_persist_dir(persist_dir=persist_dir)
+        except FileNotFoundError:
+            logger.debug("Local document store not found, creating a new one")
+            doc_store = SimpleDocumentStore()
+        self.doc_store = doc_store
--- a/private_gpt/components/vector_store/init.py
+++ b/private_gpt/components/vector_store/init.py
--- a/private_gpt/components/vector_store/batched_chroma.py
+++ b/private_gpt/components/vector_store/batched_chroma.py
@@ -0,0 +1,87 @@
+from typing import Any
+
+from llama_index.schema import BaseNode, MetadataMode
+from llama_index.vector_stores import ChromaVectorStore
+from llama_index.vector_stores.chroma import chunk_list
+from llama_index.vector_stores.utils import node_to_metadata_dict
+
+
+class BatchedChromaVectorStore(ChromaVectorStore):
+    """Chroma vector store that splits insertions into client-sized batches.
+
+    Embeddings are stored within a ChromaDB collection, and at query time the
+    index asks ChromaDB for the top k most similar nodes. The only difference
+    from the parent store is `add`, which inserts nodes in chunks so the
+    client's max batch limit is not reached.
+
+    Args:
+        chroma_client (from chromadb.api.API):
+            API instance
+        chroma_collection (chromadb.api.models.Collection.Collection):
+            ChromaDB collection instance
+
+    """
+
+    chroma_client: Any | None
+
+    def __init__(
+        self,
+        chroma_client: Any,
+        chroma_collection: Any,
+        host: str | None = None,
+        port: str | None = None,
+        ssl: bool = False,
+        headers: dict[str, str] | None = None,
+        collection_kwargs: dict[Any, Any] | None = None,
+    ) -> None:
+        """Initialise the parent store, then keep a reference to the client."""
+        parent_kwargs: dict[str, Any] = {
+            "chroma_collection": chroma_collection,
+            "host": host,
+            "port": port,
+            "ssl": ssl,
+            "headers": headers,
+            "collection_kwargs": collection_kwargs or {},
+        }
+        super().__init__(**parent_kwargs)
+        self.chroma_client = chroma_client
+
+    def add(self, nodes: list[BaseNode], **add_kwargs: Any) -> list[str]:
+        """Insert nodes into the collection one batch at a time.
+
+        Args:
+            nodes: List[BaseNode]: list of nodes with embeddings
+            add_kwargs: accepted for interface compatibility; not used here
+
+        Returns:
+            The node ids of every inserted node, in insertion order.
+
+        Raises:
+            ValueError: If the client or the collection is not initialized.
+        """
+        if not self.chroma_client:
+            raise ValueError("Client not initialized")
+        if not self._collection:
+            raise ValueError("Collection not initialized")
+
+        inserted_ids: list[str] = []
+        # The client reports the largest batch it accepts in a single call.
+        batch_size = self.chroma_client.max_batch_size
+        for batch in chunk_list(nodes, batch_size):
+            batch_embeddings = [node.get_embedding() for node in batch]
+            batch_metadatas = [
+                node_to_metadata_dict(
+                    node, remove_text=True, flat_metadata=self.flat_metadata
+                )
+                for node in batch
+            ]
+            batch_ids = [node.node_id for node in batch]
+            batch_documents = [
+                node.get_content(metadata_mode=MetadataMode.NONE) for node in batch
+            ]
+
+            self._collection.add(
+                embeddings=batch_embeddings,
+                ids=batch_ids,
+                metadatas=batch_metadatas,
+                documents=batch_documents,
+            )
+            inserted_ids.extend(batch_ids)
+
+        return inserted_ids
--- a/private_gpt/components/vector_store/vector_store_component.py
+++ b/private_gpt/components/vector_store/vector_store_component.py
@@ -0,0 +1,118 @@
+import logging
+import typing
+
+from injector import inject, singleton
+from llama_index import VectorStoreIndex
+from llama_index.indices.vector_store import VectorIndexRetriever
+from llama_index.vector_stores.types import VectorStore
+
+from private_gpt.components.vector_store.batched_chroma import BatchedChromaVectorStore
+from private_gpt.open_ai.extensions.context_filter import ContextFilter
+from private_gpt.paths import local_data_path
+from private_gpt.settings.settings import Settings
+
+logger = logging.getLogger(__name__)
+
+
+@typing.no_type_check
+def _chromadb_doc_id_metadata_filter(
+    context_filter: ContextFilter | None,
+) -> dict | None:
+    """Translate a context filter into a Chroma `where` clause on `doc_id`.
+
+    Returns an empty dict when there is nothing to filter on, a clause that
+    matches the placeholder id "-" for an empty id list, a plain equality
+    clause for a single id, and an `$or` of equality clauses for several ids.
+    """
+    doc_ids = None if context_filter is None else context_filter.docs_ids
+    if doc_ids is None:
+        return {}  # No filter
+    if len(doc_ids) < 1:
+        return {"doc_id": "-"}  # Effectively filtering out all docs
+    if len(doc_ids) == 1:
+        return {"doc_id": doc_ids[0]}
+    return {"$or": [{"doc_id": doc_id} for doc_id in doc_ids]}
+
+
+@singleton
+class VectorStoreComponent:
+    """Creates the vector store selected by `settings.vectorstore.database`.
+
+    Supported values are "chroma" (a persistent client stored under the local
+    data folder, wrapped in `BatchedChromaVectorStore`) and "qdrant" (a
+    `QdrantClient` built from `settings.qdrant`, or with its defaults when that
+    section is missing).
+    """
+
+    vector_store: VectorStore
+
+    @inject
+    def __init__(self, settings: Settings) -> None:
+        """Build `self.vector_store` for the configured database.
+
+        :param settings: Application settings; `vectorstore.database` and
+            `qdrant` are read.
+        :raises ImportError: If "chroma" is selected but `chromadb` is missing.
+        :raises ValueError: If the configured database is not supported.
+        """
+        match settings.vectorstore.database:
+            case "chroma":
+                try:
+                    import chromadb  # type: ignore
+                    from chromadb.config import (  # type: ignore
+                        Settings as ChromaSettings,
+                    )
+                except ImportError as e:
+                    # Adjacent literals are concatenated verbatim, so each part
+                    # carries its own trailing space.
+                    raise ImportError(
+                        "'chromadb' is not installed. "
+                        "To use PrivateGPT with Chroma, install the 'chroma' extra: "
+                        "`poetry install --extras chroma`"
+                    ) from e
+
+                chroma_settings = ChromaSettings(anonymized_telemetry=False)
+                chroma_client = chromadb.PersistentClient(
+                    path=str((local_data_path / "chroma_db").absolute()),
+                    settings=chroma_settings,
+                )
+                chroma_collection = chroma_client.get_or_create_collection(
+                    "make_this_parameterizable_per_api_call"
+                )  # TODO
+
+                self.vector_store = typing.cast(
+                    VectorStore,
+                    BatchedChromaVectorStore(
+                        chroma_client=chroma_client, chroma_collection=chroma_collection
+                    ),
+                )
+
+            case "qdrant":
+                from llama_index.vector_stores.qdrant import QdrantVectorStore
+                from qdrant_client import QdrantClient
+
+                if settings.qdrant is None:
+                    logger.info(
+                        "Qdrant config not found. Using default settings. "
+                        "Trying to connect to Qdrant at localhost:6333."
+                    )
+                    client = QdrantClient()
+                else:
+                    # Only explicitly configured values are forwarded.
+                    client = QdrantClient(
+                        **settings.qdrant.model_dump(exclude_none=True)
+                    )
+                self.vector_store = typing.cast(
+                    VectorStore,
+                    QdrantVectorStore(
+                        client=client,
+                        collection_name="make_this_parameterizable_per_api_call",
+                    ),  # TODO
+                )
+            case _:
+                # Should be unreachable
+                # The settings validator should have caught this
+                raise ValueError(
+                    f"Vectorstore database {settings.vectorstore.database} not supported"
+                )
+
+    @staticmethod
+    def get_retriever(
+        index: VectorStoreIndex,
+        context_filter: ContextFilter | None = None,
+        similarity_top_k: int = 2,
+    ) -> VectorIndexRetriever:
+        """Create a retriever over `index`, optionally restricted to some documents.
+
+        :param index: The vector store index to query.
+        :param context_filter: Optional filter carrying the document ids to use.
+        :param similarity_top_k: Number of most similar nodes to retrieve.
+        :return: The configured `VectorIndexRetriever`.
+        """
+        # This way we support qdrant (using doc_ids) and chroma (using where clause)
+        return VectorIndexRetriever(
+            index=index,
+            similarity_top_k=similarity_top_k,
+            doc_ids=context_filter.docs_ids if context_filter else None,
+            vector_store_kwargs={
+                "where": _chromadb_doc_id_metadata_filter(context_filter)
+            },
+        )
+
+    def close(self) -> None:
+        """Close the underlying client if it exposes a `close` method."""
+        if hasattr(self.vector_store.client, "close"):
+            self.vector_store.client.close()
--- a/private_gpt/constants.py
+++ b/private_gpt/constants.py
@@ -0,0 +1,3 @@
+from pathlib import Path
+
+PROJECT_ROOT_PATH: Path = Path(__file__).parents[1]
--- a/private_gpt/di.py
+++ b/private_gpt/di.py
@@ -0,0 +1,19 @@
+from injector import Injector
+
+from private_gpt.settings.settings import Settings, unsafe_typed_settings
+
+
+def create_application_injector() -> Injector:
+    """Build an auto-binding injector with `Settings` bound to `unsafe_typed_settings`."""
+    injector = Injector(auto_bind=True)
+    injector.binder.bind(Settings, to=unsafe_typed_settings)
+    return injector
+
+
+"""
+Global injector for the application.
+
+Avoid using this reference, it will make your code harder to test.
+
+Instead, use the `request.state.injector` reference, which is bound to every request
+"""
+global_injector: Injector = create_application_injector()
--- a/private_gpt/launcher.py
+++ b/private_gpt/launcher.py
@@ -0,0 +1,128 @@
+"""FastAPI app creation, logger configuration and main API routes."""
+import logging
+from typing import Any
+
+from fastapi import Depends, FastAPI, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.openapi.utils import get_openapi
+from injector import Injector
+
+from private_gpt.paths import docs_path
+from private_gpt.server.chat.chat_router import chat_router
+from private_gpt.server.chunks.chunks_router import chunks_router
+from private_gpt.server.completions.completions_router import completions_router
+from private_gpt.server.embeddings.embeddings_router import embeddings_router
+from private_gpt.server.health.health_router import health_router
+from private_gpt.server.ingest.ingest_router import ingest_router
+from private_gpt.settings.settings import Settings
+
+logger = logging.getLogger(__name__)
+
+
+def create_app(root_injector: Injector) -> FastAPI:
+    """Create the FastAPI application wired to `root_injector`.
+
+    Registers the API routers, installs a custom OpenAPI schema and, depending
+    on the settings, adds the CORS middleware and mounts the UI.
+
+    :param root_injector: Injector exposed to handlers as `request.state.injector`
+        and used here to resolve `Settings` (and the UI when it is enabled).
+    :return: The configured FastAPI application.
+    """
+    # Read the description up front so the file is closed before the app is built.
+    description = (docs_path / "description.md").read_text(encoding="utf-8")
+
+    # Adjacent string literals are concatenated verbatim, so every fragment that
+    # is followed by another one ends with an explicit space.
+    tags_metadata = [
+        {
+            "name": "Ingestion",
+            "description": "High-level APIs covering document ingestion -internally "
+            "managing document parsing, splitting, "
+            "metadata extraction, embedding generation and storage- and ingested "
+            "documents CRUD. "
+            "Each ingested document is identified by an ID that can be used to filter the "
+            "context "
+            "used in *Contextual Completions* and *Context Chunks* APIs.",
+        },
+        {
+            "name": "Contextual Completions",
+            "description": "High-level APIs covering contextual Chat and Completions. They "
+            "follow OpenAI's format, extending it to "
+            "allow using the context coming from ingested documents to create the "
+            "response. Internally "
+            "manage context retrieval, prompt engineering and the response generation.",
+        },
+        {
+            "name": "Context Chunks",
+            "description": "Low-level API that given a query return relevant chunks of "
+            "text coming from the ingested "
+            "documents.",
+        },
+        {
+            "name": "Embeddings",
+            "description": "Low-level API to obtain the vector representation of a given "
+            "text, using an Embeddings model. "
+            "Follows OpenAI's embeddings API format.",
+        },
+        {
+            "name": "Health",
+            "description": "Simple health API to make sure the server is up and running.",
+        },
+    ]
+
+    async def bind_injector_to_request(request: Request) -> None:
+        # Runs as an app-wide dependency, giving every handler the injector.
+        request.state.injector = root_injector
+
+    app = FastAPI(dependencies=[Depends(bind_injector_to_request)])
+
+    def custom_openapi() -> dict[str, Any]:
+        # Build the schema once, then serve the cached copy.
+        if app.openapi_schema:
+            return app.openapi_schema
+        openapi_schema = get_openapi(
+            title="PrivateGPT",
+            description=description,
+            version="0.1.0",
+            summary="PrivateGPT is a production-ready AI project that allows you to "
+            "ask questions to your documents using the power of Large Language "
+            "Models (LLMs), even in scenarios without Internet connection. "
+            "100% private, no data leaves your execution environment at any point.",
+            contact={
+                "url": "https://github.com/imartinez/privateGPT",
+            },
+            license_info={
+                "name": "Apache 2.0",
+                "url": "https://www.apache.org/licenses/LICENSE-2.0.html",
+            },
+            routes=app.routes,
+            tags=tags_metadata,
+        )
+        openapi_schema["info"]["x-logo"] = {
+            "url": "https://lh3.googleusercontent.com/drive-viewer"
+            "/AK7aPaD_iNlMoTquOBsw4boh4tIYxyEuhz6EtEs8nzq3yNkNAK00xGj"
+            "E1KUCmPJSk3TYOjcs6tReG6w_cLu1S7L_gPgT9z52iw=s2560"
+        }
+
+        app.openapi_schema = openapi_schema
+        return app.openapi_schema
+
+    app.openapi = custom_openapi  # type: ignore[method-assign]
+
+    app.include_router(completions_router)
+    app.include_router(chat_router)
+    app.include_router(chunks_router)
+    app.include_router(ingest_router)
+    app.include_router(embeddings_router)
+    app.include_router(health_router)
+
+    settings = root_injector.get(Settings)
+    if settings.server.cors.enabled:
+        logger.debug("Setting up CORS middleware")
+        app.add_middleware(
+            CORSMiddleware,
+            allow_credentials=settings.server.cors.allow_credentials,
+            allow_origins=settings.server.cors.allow_origins,
+            allow_origin_regex=settings.server.cors.allow_origin_regex,
+            allow_methods=settings.server.cors.allow_methods,
+            allow_headers=settings.server.cors.allow_headers,
+        )
+
+    if settings.ui.enabled:
+        logger.debug("Importing the UI module")
+        # Imported here so the UI module is only loaded when the UI is enabled.
+        from private_gpt.ui.ui import PrivateGptUi
+
+        ui = root_injector.get(PrivateGptUi)
+        ui.mount_in_app(app, settings.ui.path)
+
+    return app
--- a/private_gpt/main.py
+++ b/private_gpt/main.py
@@ -0,0 +1,11 @@
+"""Application entry point: builds the FastAPI `app` from the global injector."""
+
+import llama_index
+
+from private_gpt.di import global_injector
+from private_gpt.launcher import create_app
+
+# Add LlamaIndex simple observability
+llama_index.set_global_handler("simple")
+
+# Module-level FastAPI app, created when this module is imported.
+app = create_app(global_injector)
--- a/private_gpt/open_ai/init.py
+++ b/private_gpt/open_ai/init.py
@@ -0,0 +1 @@
+"""OpenAI compatibility utilities."""
--- a/private_gpt/open_ai/extensions/init.py
+++ b/private_gpt/open_ai/extensions/init.py
@@ -0,0 +1 @@
+"""OpenAI API extensions."""
--- a/private_gpt/open_ai/extensions/context_filter.py
+++ b/private_gpt/open_ai/extensions/context_filter.py
@@ -0,0 +1,7 @@
+from pydantic import BaseModel, Field
+
+
+class ContextFilter(BaseModel):
+    # IDs of the ingested documents to use as context.
+    # NOTE(review): no default is passed to `Field`; under pydantic v2 that makes
+    # the key required even though null is allowed — confirm this is intended.
+    docs_ids: list[str] | None = Field(
+        examples=[["c202d5e6-7b69-4869-81cc-dd574ee8ee11"]]
+    )
--- a/private_gpt/open_ai/openai_models.py
+++ b/private_gpt/open_ai/openai_models.py
@@ -0,0 +1,122 @@
+import time
+import uuid
+from collections.abc import Iterator
+from typing import Literal
+
+from llama_index.llms import ChatResponse, CompletionResponse
+from pydantic import BaseModel, Field
+
+from private_gpt.server.chunks.chunks_service import Chunk
+
+
+class OpenAIDelta(BaseModel):
+    """A piece of completion that needs to be concatenated to get the full message."""
+
+    # Text fragment carried by this chunk; may be null.
+    content: str | None
+
+
+class OpenAIMessage(BaseModel):
+    """Inference result, with the source of the message.
+
+    Role could be the assistant or system
+    (providing a default response, not AI generated).
+    """
+
+    # NOTE(review): the docstring above mentions only assistant/system, but
+    # "user" is also accepted and is the default role.
+    role: Literal["assistant", "system", "user"] = Field(default="user")
+    # Message text; may be null.
+    content: str | None
+
+
+class OpenAIChoice(BaseModel):
+    """Response from AI.
+
+    Either the delta or the message will be present, but never both.
+    Sources used will be returned in case context retrieval was enabled.
+    """
+
+    # No default is given, so callers always pass it explicitly (possibly None).
+    finish_reason: str | None = Field(examples=["stop"])
+    # Set by `OpenAICompletion.json_from_delta` (streaming chunks).
+    delta: OpenAIDelta | None = None
+    # Set by `OpenAICompletion.from_text` (complete responses).
+    message: OpenAIMessage | None = None
+    sources: list[Chunk] | None = None
+    index: int = 0
+
+
+class OpenAICompletion(BaseModel):
+    """Clone of OpenAI Completion model.
+
+    For more information see: https://platform.openai.com/docs/api-reference/chat/object
+    """
+
+    id: str
+    object: Literal["completion", "completion.chunk"] = Field(default="completion")
+    created: int = Field(..., examples=[1623340000])
+    model: Literal["private-gpt"]
+    choices: list[OpenAIChoice]
+
+    @classmethod
+    def from_text(
+        cls,
+        text: str | None,
+        finish_reason: str | None = None,
+        sources: list[Chunk] | None = None,
+    ) -> "OpenAICompletion":
+        """Build a complete (non-streamed) completion with one assistant message.
+
+        :param text: Content of the assistant message.
+        :param finish_reason: Value stored on the single choice.
+        :param sources: Chunks to attach to the single choice, if any.
+        :return: A completion with a random UUID id and the current timestamp.
+        """
+        # Instantiate through `cls` so subclasses get instances of themselves.
+        return cls(
+            id=str(uuid.uuid4()),
+            object="completion",
+            created=int(time.time()),
+            model="private-gpt",
+            choices=[
+                OpenAIChoice(
+                    message=OpenAIMessage(role="assistant", content=text),
+                    finish_reason=finish_reason,
+                    sources=sources,
+                )
+            ],
+        )
+
+    @classmethod
+    def json_from_delta(
+        cls,
+        *,
+        text: str | None,
+        finish_reason: str | None = None,
+        sources: list[Chunk] | None = None,
+    ) -> str:
+        """Serialize one streaming chunk ("completion.chunk") to JSON.
+
+        :param text: Content of the delta carried by this chunk.
+        :param finish_reason: Value stored on the single choice.
+        :param sources: Chunks to attach to the single choice, if any.
+        :return: The JSON dump of the chunk.
+        """
+        chunk = cls(
+            id=str(uuid.uuid4()),
+            object="completion.chunk",
+            created=int(time.time()),
+            model="private-gpt",
+            choices=[
+                OpenAIChoice(
+                    delta=OpenAIDelta(content=text),
+                    finish_reason=finish_reason,
+                    sources=sources,
+                )
+            ],
+        )
+
+        return chunk.model_dump_json()
+
+
+def to_openai_response(
+    response: str | ChatResponse, sources: list[Chunk] | None = None
+) -> OpenAICompletion:
+    """Wrap a finished answer in an `OpenAICompletion` with finish reason "stop".
+
+    A plain string becomes the message text and `sources` is attached to it.
+    For a `ChatResponse`, its `delta` is used as the message text.
+    """
+    if not isinstance(response, ChatResponse):
+        return OpenAICompletion.from_text(
+            response, finish_reason="stop", sources=sources
+        )
+    # NOTE(review): `sources` is not forwarded on this path — confirm intended.
+    return OpenAICompletion.from_text(response.delta, finish_reason="stop")
+
+
+def to_openai_sse_stream(
+    response_generator: Iterator[str | CompletionResponse | ChatResponse],
+    sources: list[Chunk] | None = None,
+) -> Iterator[str]:
+    """Turn a stream of response pieces into `data: ...` server-sent event lines.
+
+    Plain strings are emitted with `sources` attached; response objects
+    contribute their `delta` only. After the input is exhausted, a chunk with
+    finish reason "stop" and a final `[DONE]` marker are emitted.
+    """
+    for item in response_generator:
+        if isinstance(item, CompletionResponse | ChatResponse):
+            payload = OpenAICompletion.json_from_delta(text=item.delta)
+        else:
+            payload = OpenAICompletion.json_from_delta(text=item, sources=sources)
+        yield f"data: {payload}\n\n"
+    closing_payload = OpenAICompletion.json_from_delta(text=None, finish_reason="stop")
+    yield f"data: {closing_payload}\n\n"
+    yield "data: [DONE]\n\n"
--- a/private_gpt/paths.py
+++ b/private_gpt/paths.py
@@ -0,0 +1,18 @@
+from pathlib import Path
+
+from private_gpt.constants import PROJECT_ROOT_PATH
+from private_gpt.settings.settings import settings
+
+
+def _absolute_or_from_project_root(path: str) -> Path:
+    """Return `path` as given when it is absolute, else place it under the project root.
+
+    :param path: Absolute path, or path relative to `PROJECT_ROOT_PATH`.
+    :return: The resulting `Path`.
+    """
+    candidate = Path(path)
+    # `is_absolute()` also recognises drive-qualified Windows paths, which a bare
+    # "/" prefix test misses; the prefix test is kept so rooted "/..." paths are
+    # still returned unchanged on every platform.
+    if candidate.is_absolute() or path.startswith("/"):
+        return candidate
+    return PROJECT_ROOT_PATH / path
+
+
+models_path: Path = PROJECT_ROOT_PATH / "models"
+models_cache_path: Path = models_path / "cache"
+docs_path: Path = PROJECT_ROOT_PATH / "docs"
+local_data_path: Path = _absolute_or_from_project_root(
+    settings().data.local_data_folder
+)
--- a/private_gpt/server/init.py
+++ b/private_gpt/server/init.py
@@ -0,0 +1 @@
+"""private-gpt server."""
--- a/private_gpt/server/chat/init.py
+++ b/private_gpt/server/chat/init.py
--- a/private_gpt/server/chat/chat_router.py
+++ b/private_gpt/server/chat/chat_router.py
@@ -0,0 +1,108 @@
+from fastapi import APIRouter, Depends, Request
+from llama_index.llms import ChatMessage, MessageRole
+from pydantic import BaseModel
+from starlette.responses import StreamingResponse
+
+from private_gpt.open_ai.extensions.context_filter import ContextFilter
+from private_gpt.open_ai.openai_models import (
+    OpenAICompletion,
+    OpenAIMessage,
+    to_openai_response,
+    to_openai_sse_stream,
+)
+from private_gpt.server.chat.chat_service import ChatService
+from private_gpt.server.utils.auth import authenticated
+
+chat_router = APIRouter(prefix="/v1", dependencies=[Depends(authenticated)])
+
+
+class ChatBody(BaseModel):
+    # Request body of the `/chat/completions` endpoint.
+    # `#` comments are used instead of a docstring so the model's JSON schema
+    # is left untouched.
+
+    # The conversation so far, optionally starting with a `role: system` message.
+    messages: list[OpenAIMessage]
+    # When true, context from the ingested documents is used for the response.
+    use_context: bool = False
+    # Optional restriction of the context to the given document IDs.
+    context_filter: ContextFilter | None = None
+    # When true, the source chunks used for the response are returned.
+    include_sources: bool = True
+    # When true, the response is returned as a stream of data chunks.
+    stream: bool = False
+
+    # Example request shown in the generated API schema.
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "messages": [
+                        {
+                            "role": "system",
+                            "content": "You are a rapper. Always answer with a rap.",
+                        },
+                        {
+                            "role": "user",
+                            "content": "How do you fry an egg?",
+                        },
+                    ],
+                    "stream": False,
+                    "use_context": True,
+                    "include_sources": True,
+                    "context_filter": {
+                        "docs_ids": ["c202d5e6-7b69-4869-81cc-dd574ee8ee11"]
+                    },
+                }
+            ]
+        }
+    }
+
+
+@chat_router.post(
+    "/chat/completions",
+    response_model=None,
+    responses={200: {"model": OpenAICompletion}},
+    tags=["Contextual Completions"],
+)
+def chat_completion(
+    request: Request, body: ChatBody
+) -> OpenAICompletion | StreamingResponse:
+    """Given a list of messages comprising a conversation, return a response.
+
+    Optionally include an initial `role: system` message to influence the way
+    the LLM answers.
+
+    If `use_context` is set to `true`, the model will use context coming
+    from the ingested documents to create the response. The documents being used can
+    be filtered using the `context_filter` and passing the document IDs to be used.
+    Ingested documents IDs can be found using `/ingest/list` endpoint. If you want
+    all ingested documents to be used, remove `context_filter` altogether.
+
+    When using `'include_sources': true`, the API will return the source Chunks used
+    to create the response, which come from the context provided.
+
+    When using `'stream': true`, the API will return data chunks following [OpenAI's
+    streaming model](https://platform.openai.com/docs/api-reference/chat/streaming):
+    ```
+    {"id":"12345","object":"completion.chunk","created":1694268190,
+    "model":"private-gpt","choices":[{"index":0,"delta":{"content":"Hello"},
+    "finish_reason":null}]}
+    ```
+    """
+    chat_service = request.state.injector.get(ChatService)
+    chat_messages = [
+        ChatMessage(content=msg.content, role=MessageRole(msg.role))
+        for msg in body.messages
+    ]
+
+    # Non-streaming path: run the chat to completion and wrap the full answer.
+    if not body.stream:
+        completion = chat_service.chat(
+            messages=chat_messages,
+            use_context=body.use_context,
+            context_filter=body.context_filter,
+        )
+        sources = completion.sources if body.include_sources else None
+        return to_openai_response(completion.response, sources)
+
+    # Streaming path: forward tokens as server-sent events.
+    completion_gen = chat_service.stream_chat(
+        messages=chat_messages,
+        use_context=body.use_context,
+        context_filter=body.context_filter,
+    )
+    sources = completion_gen.sources if body.include_sources else None
+    return StreamingResponse(
+        to_openai_sse_stream(completion_gen.response, sources),
+        media_type="text/event-stream",
+    )
--- a/private_gpt/server/chat/chat_service.py
+++ b/private_gpt/server/chat/chat_service.py
@@ -0,0 +1,187 @@
+from dataclasses import dataclass
+
+from injector import inject, singleton
+from llama_index import ServiceContext, StorageContext, VectorStoreIndex
+from llama_index.chat_engine import ContextChatEngine, SimpleChatEngine
+from llama_index.chat_engine.types import (
+    BaseChatEngine,
+)
+from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
+from llama_index.llms import ChatMessage, MessageRole
+from llama_index.types import TokenGen
+from pydantic import BaseModel
+
+from private_gpt.components.embedding.embedding_component import EmbeddingComponent
+from private_gpt.components.llm.llm_component import LLMComponent
+from private_gpt.components.node_store.node_store_component import NodeStoreComponent
+from private_gpt.components.vector_store.vector_store_component import (
+    VectorStoreComponent,
+)
+from private_gpt.open_ai.extensions.context_filter import ContextFilter
+from private_gpt.server.chunks.chunks_service import Chunk
+
+
+class Completion(BaseModel):
+    # Result of a non-streaming chat: the full response text plus the chunks
+    # built from the response's source nodes.
+    response: str
+    sources: list[Chunk] | None = None
+
+
+class CompletionGen(BaseModel):
+    # Result of a streaming chat: a token generator for the response plus the
+    # chunks built from the response's source nodes.
+    response: TokenGen
+    sources: list[Chunk] | None = None
+
+
+@dataclass
+class ChatEngineInput:
+    """A conversation split into the pieces a chat engine call needs.
+
+    `system_message` is the leading system message (if any), `last_message` is
+    the trailing user message (if any) and `chat_history` is whatever lies in
+    between, or `None` when nothing is left.
+    """
+
+    system_message: ChatMessage | None = None
+    last_message: ChatMessage | None = None
+    chat_history: list[ChatMessage] | None = None
+
+    @classmethod
+    def from_messages(cls, messages: list[ChatMessage]) -> "ChatEngineInput":
+        """Split `messages` into system message, last user message and history.
+
+        The input list is left untouched; the history is a new list.
+
+        :param messages: The conversation, oldest message first.
+        :return: The split conversation.
+        """
+        system_message = None
+        last_message = None
+        history_start, history_end = 0, len(messages)
+
+        # A system message only counts when it opens the conversation.
+        if messages and messages[0].role == MessageRole.SYSTEM:
+            system_message = messages[0]
+            history_start = 1
+        # The message to answer only counts when it is a trailing user message.
+        if messages and messages[-1].role == MessageRole.USER:
+            last_message = messages[-1]
+            history_end -= 1
+
+        # Slice instead of popping so the caller's list is not mutated.
+        chat_history = messages[history_start:history_end]
+
+        return cls(
+            system_message=system_message,
+            last_message=last_message,
+            chat_history=chat_history or None,
+        )
+
+
+@singleton
+class ChatService:
+    """Answers chat conversations, optionally using ingested documents as context.
+
+    A vector store index is built once at construction; every `chat` or
+    `stream_chat` call creates a new chat engine on top of it.
+    """
+
+    @inject
+    def __init__(
+        self,
+        llm_component: LLMComponent,
+        vector_store_component: VectorStoreComponent,
+        embedding_component: EmbeddingComponent,
+        node_store_component: NodeStoreComponent,
+    ) -> None:
+        """Assemble the storage context, service context and index from the components."""
+        self.llm_service = llm_component
+        self.vector_store_component = vector_store_component
+        self.storage_context = StorageContext.from_defaults(
+            vector_store=vector_store_component.vector_store,
+            docstore=node_store_component.doc_store,
+            index_store=node_store_component.index_store,
+        )
+        self.service_context = ServiceContext.from_defaults(
+            llm=llm_component.llm, embed_model=embedding_component.embedding_model
+        )
+        self.index = VectorStoreIndex.from_vector_store(
+            vector_store_component.vector_store,
+            storage_context=self.storage_context,
+            service_context=self.service_context,
+            show_progress=True,
+        )
+
+    def _chat_engine(
+        self,
+        system_prompt: str | None = None,
+        use_context: bool = False,
+        context_filter: ContextFilter | None = None,
+    ) -> BaseChatEngine:
+        """Create a chat engine, context-aware when `use_context` is true.
+
+        :param system_prompt: System prompt passed to the engine, if any.
+        :param use_context: Whether to retrieve context from the index.
+        :param context_filter: Optional filter for the retriever; only used
+            when `use_context` is true.
+        :return: A `ContextChatEngine` or a `SimpleChatEngine`.
+        """
+        if use_context:
+            vector_index_retriever = self.vector_store_component.get_retriever(
+                index=self.index, context_filter=context_filter
+            )
+            return ContextChatEngine.from_defaults(
+                system_prompt=system_prompt,
+                retriever=vector_index_retriever,
+                service_context=self.service_context,
+                node_postprocessors=[
+                    MetadataReplacementPostProcessor(target_metadata_key="window"),
+                ],
+            )
+        else:
+            return SimpleChatEngine.from_defaults(
+                system_prompt=system_prompt,
+                service_context=self.service_context,
+            )
+
+    def _prepare_chat(
+        self,
+        messages: list[ChatMessage],
+        use_context: bool,
+        context_filter: ContextFilter | None,
+    ) -> tuple[BaseChatEngine, str, list[ChatMessage] | None]:
+        """Split `messages` and build the engine shared by `chat` and `stream_chat`.
+
+        :return: The chat engine, the text of the message to answer (an empty
+            string when there is none) and the chat history (or `None`).
+        """
+        chat_engine_input = ChatEngineInput.from_messages(messages)
+        system_message = chat_engine_input.system_message
+        last_message = chat_engine_input.last_message
+
+        system_prompt = system_message.content if system_message else None
+        message = last_message.content if last_message else None
+
+        chat_engine = self._chat_engine(
+            system_prompt=system_prompt,
+            use_context=use_context,
+            context_filter=context_filter,
+        )
+        return (
+            chat_engine,
+            message if message is not None else "",
+            chat_engine_input.chat_history or None,
+        )
+
+    def stream_chat(
+        self,
+        messages: list[ChatMessage],
+        use_context: bool = False,
+        context_filter: ContextFilter | None = None,
+    ) -> CompletionGen:
+        """Answer the conversation as a token stream.
+
+        :param messages: The conversation, oldest message first.
+        :param use_context: Whether to use ingested documents as context.
+        :param context_filter: Optional restriction of the documents used.
+        :return: The token generator together with the source chunks.
+        """
+        chat_engine, message, chat_history = self._prepare_chat(
+            messages, use_context, context_filter
+        )
+        streaming_response = chat_engine.stream_chat(
+            message=message,
+            chat_history=chat_history,
+        )
+        sources = [Chunk.from_node(node) for node in streaming_response.source_nodes]
+        return CompletionGen(response=streaming_response.response_gen, sources=sources)
+
+    def chat(
+        self,
+        messages: list[ChatMessage],
+        use_context: bool = False,
+        context_filter: ContextFilter | None = None,
+    ) -> Completion:
+        """Answer the conversation and return the complete response.
+
+        :param messages: The conversation, oldest message first.
+        :param use_context: Whether to use ingested documents as context.
+        :param context_filter: Optional restriction of the documents used.
+        :return: The response text together with the source chunks.
+        """
+        chat_engine, message, chat_history = self._prepare_chat(
+            messages, use_context, context_filter
+        )
+        wrapped_response = chat_engine.chat(
+            message=message,
+            chat_history=chat_history,
+        )
+        sources = [Chunk.from_node(node) for node in wrapped_response.source_nodes]
+        return Completion(response=wrapped_response.response, sources=sources)
--- a/private_gpt/server/chunks/init.py
+++ b/private_gpt/server/chunks/init.py
--- a/private_gpt/server/chunks/chunks_router.py
+++ b/private_gpt/server/chunks/chunks_router.py
@@ -0,0 +1,55 @@
+from typing import Literal
+
+from fastapi import APIRouter, Depends, Request
+from pydantic import BaseModel, Field
+
+from private_gpt.open_ai.extensions.context_filter import ContextFilter
+from private_gpt.server.chunks.chunks_service import Chunk, ChunksService
+from private_gpt.server.utils.auth import authenticated
+
+# Router for the context-chunks endpoints: every route is prefixed with `/v1` and
+# declares the `authenticated` dependency.
+chunks_router = APIRouter(prefix="/v1", dependencies=[Depends(authenticated)])
+
+
+# Request payload accepted by the `/chunks` endpoint.
+class ChunksBody(BaseModel):
+    # Query text the returned chunks should be relevant to.
+    text: str = Field(examples=["Q3 2023 sales"])
+    # Optional document filter handed to the retriever; omitted by default.
+    context_filter: ContextFilter | None = None
+    # Maximum number of chunks to return.
+    limit: int = 10
+    # How many neighbouring chunks to fetch on each side of every result.
+    prev_next_chunks: int = Field(default=0, examples=[2])
+
+
+# Response envelope returned by the `/chunks` endpoint.
+class ChunksResponse(BaseModel):
+    # Constant discriminators: only the literal values below are accepted.
+    object: Literal["list"]
+    model: Literal["private-gpt"]
+    # The retrieved chunks, in the order produced by the service.
+    data: list[Chunk]
+
+
+@chunks_router.post("/chunks", tags=["Context Chunks"])
+def chunks_retrieval(request: Request, body: ChunksBody) -> ChunksResponse:
+    """Given a `text`, returns the most relevant chunks from the ingested documents.
+
+    The returned information can be used to generate prompts that can be
+    passed to `/completions` or `/chat/completions` APIs. Note: it is usually a very
+    fast API, because only the Embeddings model is involved, not the LLM. The
+    returned information contains the relevant chunk `text` together with the source
+    `document` it is coming from. It also contains a score that can be used to
+    compare different results.
+
+    The max number of chunks to be returned is set using the `limit` param.
+
+    Previous and next chunks (pieces of text that appear right before or after in the
+    document) can be fetched by using the `prev_next_chunks` field.
+
+    The documents being used can be filtered using the `context_filter` and passing
+    the document IDs to be used. Ingested documents IDs can be found using
+    `/ingest/list` endpoint. If you want all ingested documents to be used,
+    remove `context_filter` altogether.
+    """
+    # The service instance is resolved from the injector stored on the request.
+    chunks_service = request.state.injector.get(ChunksService)
+    relevant_chunks = chunks_service.retrieve_relevant(
+        text=body.text,
+        context_filter=body.context_filter,
+        limit=body.limit,
+        prev_next_chunks=body.prev_next_chunks,
+    )
+    return ChunksResponse(object="list", model="private-gpt", data=relevant_chunks)
--- a/private_gpt/server/chunks/chunks_service.py
+++ b/private_gpt/server/chunks/chunks_service.py
@@ -0,0 +1,124 @@
+from typing import TYPE_CHECKING, Literal
+
+from injector import inject, singleton
+from llama_index import ServiceContext, StorageContext, VectorStoreIndex
+from llama_index.schema import NodeWithScore
+from pydantic import BaseModel, Field
+
+from private_gpt.components.embedding.embedding_component import EmbeddingComponent
+from private_gpt.components.llm.llm_component import LLMComponent
+from private_gpt.components.node_store.node_store_component import NodeStoreComponent
+from private_gpt.components.vector_store.vector_store_component import (
+    VectorStoreComponent,
+)
+from private_gpt.open_ai.extensions.context_filter import ContextFilter
+from private_gpt.server.ingest.model import IngestedDoc
+
+if TYPE_CHECKING:
+    from llama_index.schema import RelatedNodeInfo
+
+
+# A retrieved piece of context together with its score and source document.
+class Chunk(BaseModel):
+    object: Literal["context.chunk"]
+    score: float = Field(examples=[0.023])
+    document: IngestedDoc
+    text: str = Field(examples=["Outbound sales increased 20%, driven by new leads."])
+    # Sibling texts are optional and default to None; `from_node` does not set them.
+    previous_texts: list[str] | None = Field(
+        default=None,
+        examples=[["SALES REPORT 2023", "Inbound didn't show major changes."]],
+    )
+    next_texts: list[str] | None = Field(
+        default=None,
+        examples=[
+            [
+                "New leads came from Google Ads campaign.",
+                "The campaign was run by the Marketing Department",
+            ]
+        ],
+    )
+
+    @classmethod
+    def from_node(cls: type["Chunk"], node: NodeWithScore) -> "Chunk":
+        """Build a chunk from a scored node.
+
+        A missing reference document id is reported as "-" and a missing (or
+        zero) score as 0.0. The node metadata is passed through unchanged.
+        """
+        ref_doc_id = node.node.ref_doc_id
+        # NOTE(review): metadata is forwarded as-is here, whereas the ingest
+        # listing runs it through `IngestedDoc.curate_metadata` - confirm intent.
+        source_document = IngestedDoc(
+            object="ingest.document",
+            doc_id="-" if ref_doc_id is None else ref_doc_id,
+            doc_metadata=node.metadata,
+        )
+        return cls(
+            object="context.chunk",
+            score=node.score or 0.0,
+            document=source_document,
+            text=node.get_content(),
+        )
+
+
+@singleton
+class ChunksService:
+    """Retrieve the ingested chunks that are most relevant to a piece of text."""
+
+    @inject
+    def __init__(
+        self,
+        llm_component: LLMComponent,
+        vector_store_component: VectorStoreComponent,
+        embedding_component: EmbeddingComponent,
+        node_store_component: NodeStoreComponent,
+    ) -> None:
+        """Build the storage and service contexts from the injected components."""
+        self.vector_store_component = vector_store_component
+        self.storage_context = StorageContext.from_defaults(
+            vector_store=vector_store_component.vector_store,
+            docstore=node_store_component.doc_store,
+            index_store=node_store_component.index_store,
+        )
+        self.query_service_context = ServiceContext.from_defaults(
+            llm=llm_component.llm,
+            embed_model=embedding_component.embedding_model,
+        )
+
+    def _get_sibling_nodes_text(
+        self, node_with_score: NodeWithScore, related_number: int, forward: bool = True
+    ) -> list[str]:
+        """Collect the text of up to `related_number` neighbouring nodes.
+
+        Walks the `next_node` links when `forward` is true and the `prev_node`
+        links otherwise, stopping early when a link is missing. Texts are
+        returned in walking order, i.e. nearest neighbour first.
+        """
+        sibling_texts: list[str] = []
+        cursor = node_with_score.node
+        for _ in range(related_number):
+            link: RelatedNodeInfo | None = (
+                cursor.next_node if forward else cursor.prev_node
+            )
+            if link is None:
+                break
+
+            # Links only carry an id; the full node is loaded from the docstore.
+            cursor = self.storage_context.docstore.get_node(link.node_id)
+            sibling_texts.append(cursor.get_content())
+
+        return sibling_texts
+
+    def retrieve_relevant(
+        self,
+        text: str,
+        context_filter: ContextFilter | None = None,
+        limit: int = 10,
+        prev_next_chunks: int = 0,
+    ) -> list[Chunk]:
+        """Return up to `limit` chunks relevant to `text`, highest score first.
+
+        Each chunk also carries the texts of `prev_next_chunks` neighbours on
+        both sides (empty lists when `prev_next_chunks` is 0).
+        """
+        index = VectorStoreIndex.from_vector_store(
+            self.vector_store_component.vector_store,
+            storage_context=self.storage_context,
+            service_context=self.query_service_context,
+            show_progress=True,
+        )
+        retriever = self.vector_store_component.get_retriever(
+            index=index, context_filter=context_filter, similarity_top_k=limit
+        )
+        # Nodes without a score are ranked as if they scored 0.0.
+        ranked_nodes = sorted(
+            retriever.retrieve(text),
+            key=lambda scored: scored.score or 0.0,
+            reverse=True,
+        )
+
+        chunks: list[Chunk] = []
+        for scored_node in ranked_nodes:
+            chunk = Chunk.from_node(scored_node)
+            chunk.previous_texts = self._get_sibling_nodes_text(
+                scored_node, prev_next_chunks, forward=False
+            )
+            chunk.next_texts = self._get_sibling_nodes_text(
+                scored_node, prev_next_chunks, forward=True
+            )
+            chunks.append(chunk)
+
+        return chunks
--- a/private_gpt/server/completions/init.py
+++ b/private_gpt/server/completions/init.py
@@ -0,0 +1 @@
+"""Deprecated Openai compatibility endpoint."""
--- a/private_gpt/server/completions/completions_router.py
+++ b/private_gpt/server/completions/completions_router.py
@@ -0,0 +1,85 @@
+from fastapi import APIRouter, Depends, Request
+from pydantic import BaseModel
+from starlette.responses import StreamingResponse
+
+from private_gpt.open_ai.extensions.context_filter import ContextFilter
+from private_gpt.open_ai.openai_models import (
+    OpenAICompletion,
+    OpenAIMessage,
+)
+from private_gpt.server.chat.chat_router import ChatBody, chat_completion
+from private_gpt.server.utils.auth import authenticated
+
+# Router for the completions endpoint: every route is prefixed with `/v1` and
+# declares the `authenticated` dependency.
+completions_router = APIRouter(prefix="/v1", dependencies=[Depends(authenticated)])
+
+
+# Request payload accepted by the `/completions` endpoint.
+class CompletionsBody(BaseModel):
+    # Text the model should complete; sent to the chat endpoint as a user message.
+    prompt: str
+    # Optional instruction; when set it is sent as a leading system message.
+    system_prompt: str | None = None
+    # Whether ingested documents should be used as context for the answer.
+    use_context: bool = False
+    # Optional document filter; omitted by default.
+    context_filter: ContextFilter | None = None
+    # Whether the source chunks are returned along with the answer.
+    include_sources: bool = True
+    # Whether the answer is streamed instead of returned in one piece.
+    stream: bool = False
+
+    # Example payload attached to the generated JSON schema.
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "prompt": "How do you fry an egg?",
+                    "system_prompt": "You are a rapper. Always answer with a rap.",
+                    "stream": False,
+                    "use_context": False,
+                    "include_sources": False,
+                }
+            ]
+        }
+    }
+
+
+@completions_router.post(
+    "/completions",
+    response_model=None,
+    summary="Completion",
+    responses={200: {"model": OpenAICompletion}},
+    tags=["Contextual Completions"],
+)
+def prompt_completion(
+    request: Request, body: CompletionsBody
+) -> OpenAICompletion | StreamingResponse:
+    """We recommend most users use our Chat completions API.
+
+    Given a prompt, the model will return one predicted completion.
+
+    Optionally include a `system_prompt` to influence the way the LLM answers.
+
+    If `use_context`
+    is set to `true`, the model will use context coming from the ingested documents
+    to create the response. The documents being used can be filtered using the
+    `context_filter` and passing the document IDs to be used. Ingested documents IDs
+    can be found using `/ingest/list` endpoint. If you want all ingested documents to
+    be used, remove `context_filter` altogether.
+
+    When using `'include_sources': true`, the API will return the source Chunks used
+    to create the response, which come from the context provided.
+
+    When using `'stream': true`, the API will return data chunks following [OpenAI's
+    streaming model](https://platform.openai.com/docs/api-reference/chat/streaming):
+    ```
+    {"id":"12345","object":"completion.chunk","created":1694268190,
+    "model":"private-gpt","choices":[{"index":0,"delta":{"content":"Hello"},
+    "finish_reason":null}]}
+    ```
+    """
+    # The prompt becomes a one-message conversation; a non-empty system prompt
+    # is placed in front of it as a synthetic system message.
+    user_message = OpenAIMessage(content=body.prompt, role="user")
+    if body.system_prompt:
+        system_message = OpenAIMessage(content=body.system_prompt, role="system")
+        conversation = [system_message, user_message]
+    else:
+        conversation = [user_message]
+
+    # Delegate to the chat endpoint so both APIs share one implementation.
+    return chat_completion(
+        request,
+        ChatBody(
+            messages=conversation,
+            use_context=body.use_context,
+            stream=body.stream,
+            include_sources=body.include_sources,
+            context_filter=body.context_filter,
+        ),
+    )
--- a/private_gpt/server/embeddings/init.py
+++ b/private_gpt/server/embeddings/init.py
--- a/private_gpt/server/embeddings/embeddings_router.py
+++ b/private_gpt/server/embeddings/embeddings_router.py
@@ -0,0 +1,35 @@
+from typing import Literal
+
+from fastapi import APIRouter, Depends, Request
+from pydantic import BaseModel
+
+from private_gpt.server.embeddings.embeddings_service import (
+    Embedding,
+    EmbeddingsService,
+)
+from private_gpt.server.utils.auth import authenticated
+
+# Router for the embeddings endpoint: every route is prefixed with `/v1` and
+# declares the `authenticated` dependency.
+embeddings_router = APIRouter(prefix="/v1", dependencies=[Depends(authenticated)])
+
+
+# Request payload accepted by the `/embeddings` endpoint.
+class EmbeddingsBody(BaseModel):
+    # Either a single text or a list of texts to embed.
+    input: str | list[str]
+
+
+# Response envelope returned by the `/embeddings` endpoint.
+class EmbeddingsResponse(BaseModel):
+    # Constant discriminators: only the literal values below are accepted.
+    object: Literal["list"]
+    model: Literal["private-gpt"]
+    # One entry per embedded input text.
+    data: list[Embedding]
+
+
+@embeddings_router.post("/embeddings", tags=["Embeddings"])
+def embeddings_generation(request: Request, body: EmbeddingsBody) -> EmbeddingsResponse:
+    """Get a vector representation of a given input.
+
+    That vector representation can be easily consumed
+    by machine learning models and algorithms.
+    """
+    embeddings_service = request.state.injector.get(EmbeddingsService)
+    # The service works on a list, so a single string is wrapped into one.
+    if isinstance(body.input, list):
+        texts = body.input
+    else:
+        texts = [body.input]
+    return EmbeddingsResponse(
+        object="list",
+        model="private-gpt",
+        data=embeddings_service.texts_embeddings(texts),
+    )
--- a/private_gpt/server/embeddings/embeddings_service.py
+++ b/private_gpt/server/embeddings/embeddings_service.py
@@ -0,0 +1,30 @@
+from typing import Literal
+
+from injector import inject, singleton
+from pydantic import BaseModel, Field
+
+from private_gpt.components.embedding.embedding_component import EmbeddingComponent
+
+
+# A single embedding vector as returned by the embeddings service.
+class Embedding(BaseModel):
+    # Index assigned by the service when building the response list.
+    index: int
+    # Constant discriminator: only the literal value below is accepted.
+    object: Literal["embedding"]
+    # The embedding vector itself.
+    embedding: list[float] = Field(examples=[[0.0023064255, -0.009327292]])
+
+
+@singleton
+class EmbeddingsService:
+    """Compute embedding vectors for texts with the configured embedding model."""
+
+    @inject
+    def __init__(self, embedding_component: EmbeddingComponent) -> None:
+        """Keep a reference to the embedding model of the injected component."""
+        self.embedding_model = embedding_component.embedding_model
+
+    def texts_embeddings(self, texts: list[str]) -> list[Embedding]:
+        """Embed `texts` in one batch and wrap each vector in an `Embedding`.
+
+        :param texts: the texts to embed
+        :return: one `Embedding` per vector returned by the model, whose `index`
+            is the position of that vector in the batch result
+        """
+        texts_embeddings = self.embedding_model.get_text_embedding_batch(texts)
+        # Positions come from `enumerate`: looking each vector up with
+        # `list.index` returns the first equal vector, so duplicated input
+        # texts would all report the same index (and the lookup is quadratic).
+        return [
+            Embedding(
+                index=position,
+                object="embedding",
+                embedding=embedding,
+            )
+            for position, embedding in enumerate(texts_embeddings)
+        ]
--- a/private_gpt/server/health/init.py
+++ b/private_gpt/server/health/init.py
--- a/private_gpt/server/health/health_router.py
+++ b/private_gpt/server/health/health_router.py
@@ -0,0 +1,17 @@
+from typing import Literal
+
+from fastapi import APIRouter
+from pydantic import BaseModel, Field
+
+# No authentication or authorization is required to get the health status.
+health_router = APIRouter()
+
+
+# Response body of the `/health` endpoint.
+class HealthResponse(BaseModel):
+    # The only accepted value is "ok", which is also the default.
+    status: Literal["ok"] = Field(default="ok")
+
+
+@health_router.get("/health", tags=["Health"])
+def health() -> HealthResponse:
+    """Return ok if the system is up."""
+    # Nothing is probed: being able to answer at all is the health signal.
+    ok_response = HealthResponse(status="ok")
+    return ok_response
--- a/private_gpt/server/ingest/init.py
+++ b/private_gpt/server/ingest/init.py
--- a/private_gpt/server/ingest/ingest_router.py
+++ b/private_gpt/server/ingest/ingest_router.py
@@ -0,0 +1,63 @@
+from typing import Literal
+
+from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile
+from pydantic import BaseModel
+
+from private_gpt.server.ingest.ingest_service import IngestService
+from private_gpt.server.ingest.model import IngestedDoc
+from private_gpt.server.utils.auth import authenticated
+
+# Router for the ingestion endpoints: every route is prefixed with `/v1` and
+# declares the `authenticated` dependency.
+ingest_router = APIRouter(prefix="/v1", dependencies=[Depends(authenticated)])
+
+
+# Response envelope shared by the ingest and ingest-list endpoints.
+class IngestResponse(BaseModel):
+    # Constant discriminators: only the literal values below are accepted.
+    object: Literal["list"]
+    model: Literal["private-gpt"]
+    # The ingested documents being reported.
+    data: list[IngestedDoc]
+
+
+@ingest_router.post("/ingest", tags=["Ingestion"])
+def ingest(request: Request, file: UploadFile) -> IngestResponse:
+    """Ingests and processes a file, storing its chunks to be used as context.
+
+    The context obtained from files is later used in
+    `/chat/completions`, `/completions`, and `/chunks` APIs.
+
+    Most common document
+    formats are supported, but you may be prompted to install an extra dependency to
+    manage a specific file type.
+
+    A file can generate different Documents (for example a PDF generates one Document
+    per page). All Documents IDs are returned in the response, together with the
+    extracted Metadata (which is later used to improve context retrieval). Those IDs
+    can be used to filter the context used to create responses in
+    `/chat/completions`, `/completions`, and `/chunks` APIs.
+    """
+    ingest_service = request.state.injector.get(IngestService)
+    uploaded_name = file.filename
+    # Uploads without a file name are rejected with HTTP 400.
+    if uploaded_name is None:
+        raise HTTPException(400, "No file name provided")
+    return IngestResponse(
+        object="list",
+        model="private-gpt",
+        data=ingest_service.ingest_bin_data(uploaded_name, file.file),
+    )
+
+
+@ingest_router.get("/ingest/list", tags=["Ingestion"])
+def list_ingested(request: Request) -> IngestResponse:
+    """Lists already ingested Documents including their Document ID and metadata.
+
+    Those IDs can be used to filter the context used to create responses
+    in `/chat/completions`, `/completions`, and `/chunks` APIs.
+    """
+    ingest_service = request.state.injector.get(IngestService)
+    return IngestResponse(
+        object="list",
+        model="private-gpt",
+        data=ingest_service.list_ingested(),
+    )
+
+
+@ingest_router.delete("/ingest/{doc_id}", tags=["Ingestion"])
+def delete_ingested(request: Request, doc_id: str) -> None:
+    """Delete the specified ingested Document.
+
+    The `doc_id` can be obtained from the `GET /ingest/list` endpoint.
+    The document will be effectively deleted from your storage context.
+    """
+    # Resolve the service from the request-scoped injector and delegate to it.
+    ingest_service = request.state.injector.get(IngestService)
+    ingest_service.delete(doc_id)
--- a/private_gpt/server/ingest/ingest_service.py
+++ b/private_gpt/server/ingest/ingest_service.py
@@ -0,0 +1,123 @@
+import logging
+import tempfile
+from pathlib import Path
+from typing import BinaryIO
+
+from injector import inject, singleton
+from llama_index import (
+    ServiceContext,
+    StorageContext,
+)
+from llama_index.node_parser import SentenceWindowNodeParser
+
+from private_gpt.components.embedding.embedding_component import EmbeddingComponent
+from private_gpt.components.ingest.ingest_component import get_ingestion_component
+from private_gpt.components.llm.llm_component import LLMComponent
+from private_gpt.components.node_store.node_store_component import NodeStoreComponent
+from private_gpt.components.vector_store.vector_store_component import (
+    VectorStoreComponent,
+)
+from private_gpt.server.ingest.model import IngestedDoc
+from private_gpt.settings.settings import settings
+
+logger = logging.getLogger(__name__)
+
+
+@singleton
+class IngestService:
+    """Ingest, list and delete documents through the configured ingest component."""
+
+    @inject
+    def __init__(
+        self,
+        llm_component: LLMComponent,
+        vector_store_component: VectorStoreComponent,
+        embedding_component: EmbeddingComponent,
+        node_store_component: NodeStoreComponent,
+    ) -> None:
+        """Build the storage/service contexts and the ingestion component."""
+        self.llm_service = llm_component
+        self.storage_context = StorageContext.from_defaults(
+            vector_store=vector_store_component.vector_store,
+            docstore=node_store_component.doc_store,
+            index_store=node_store_component.index_store,
+        )
+        sentence_window_parser = SentenceWindowNodeParser.from_defaults()
+        embed_model = embedding_component.embedding_model
+        self.ingest_service_context = ServiceContext.from_defaults(
+            llm=self.llm_service.llm,
+            embed_model=embed_model,
+            node_parser=sentence_window_parser,
+            # The embedding model runs as a transformation right after node
+            # parsing, so embeddings are computed early in the pipeline.
+            transformations=[sentence_window_parser, embed_model],
+        )
+
+        self.ingest_component = get_ingestion_component(
+            self.storage_context, self.ingest_service_context, settings=settings()
+        )
+
+    def ingest(self, file_name: str, file_data: Path) -> list[IngestedDoc]:
+        """Ingest the file at `file_data` and describe the resulting documents."""
+        logger.info("Ingesting file_name=%s", file_name)
+        return [
+            IngestedDoc.from_document(doc)
+            for doc in self.ingest_component.ingest(file_name, file_data)
+        ]
+
+    def ingest_bin_data(
+        self, file_name: str, raw_file_data: BinaryIO
+    ) -> list[IngestedDoc]:
+        """Ingest an in-memory stream by spooling it to a temporary file first.
+
+        The whole stream is read into memory, written to a temporary file,
+        ingested from that path, and the temporary file is removed afterwards.
+        """
+        logger.debug("Ingesting binary data with file_name=%s", file_name)
+        payload = raw_file_data.read()
+        logger.debug("Got file data of size=%s to ingest", len(payload))
+        # llama-index mainly reads from files, hence the temporary file.
+        # delete=False avoids a Windows 11 permission error; the file is
+        # unlinked explicitly in the `finally` clause instead.
+        with tempfile.NamedTemporaryFile(delete=False) as tmp:
+            try:
+                tmp_path = Path(tmp.name)
+                if not isinstance(payload, bytes):
+                    tmp_path.write_text(str(payload))
+                else:
+                    tmp_path.write_bytes(payload)
+                return self.ingest(file_name, tmp_path)
+            finally:
+                tmp.close()
+                tmp_path.unlink()
+
+    def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[IngestedDoc]:
+        """Ingest several `(file_name, path)` pairs in one call."""
+        logger.info("Ingesting file_names=%s", [f[0] for f in files])
+        return [
+            IngestedDoc.from_document(doc)
+            for doc in self.ingest_component.bulk_ingest(files)
+        ]
+
+    def list_ingested(self) -> list[IngestedDoc]:
+        """List the distinct reference documents found in the docstore.
+
+        A `ValueError` raised while reading the docstore is logged and the
+        documents collected so far are returned.
+        """
+        ingested_docs: list[IngestedDoc] = []
+        try:
+            docstore = self.storage_context.docstore
+            # Several nodes can point to the same document; keep each id once.
+            ref_doc_ids: set[str] = {
+                node.ref_doc_id
+                for node in docstore.docs.values()
+                if node.ref_doc_id is not None
+            }
+
+            for doc_id in ref_doc_ids:
+                ref_doc_info = docstore.get_ref_doc_info(ref_doc_id=doc_id)
+                if ref_doc_info is None or ref_doc_info.metadata is None:
+                    doc_metadata = None
+                else:
+                    doc_metadata = IngestedDoc.curate_metadata(ref_doc_info.metadata)
+                ingested_docs.append(
+                    IngestedDoc(
+                        object="ingest.document",
+                        doc_id=doc_id,
+                        doc_metadata=doc_metadata,
+                    )
+                )
+        except ValueError:
+            logger.warning("Got an exception when getting list of docs", exc_info=True)
+        logger.debug("Found count=%s ingested documents", len(ingested_docs))
+        return ingested_docs
+
+    def delete(self, doc_id: str) -> None:
+        """Delete an ingested document.
+
+        :raises ValueError: if the document does not exist
+        """
+        logger.info(
+            "Deleting the ingested document=%s in the doc and index store", doc_id
+        )
+        self.ingest_component.delete(doc_id)
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`"""Deprecated Openai compatibility endpoint."""`