From 7e2f93c5644e05a6ffd2f0961cd99c9a066ad217 Mon Sep 17 00:00:00 2001 From: Steven Linn Date: Wed, 1 May 2024 08:04:54 -0600 Subject: [PATCH] Make more updateS --- README.md | 14 ++++++++++- docker-compose.yaml | 23 ++++++++++--------- .../components/ingest/ingest_helper.py | 19 +++++++-------- scripts/question.txt | 2 +- settings.yaml | 2 +- 5 files changed, 37 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 8884dc7b..d5176def 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,17 @@ # Notes -poetry run pip install bs4 openpyxl flask transformers python-pptx Pillow cryptography +sudo apt-get install build-essential +sudo apt-get install git gcc make openssl libssl-dev libbz2-dev libreadline-dev +sudo apt-get install lzma +sudo apt-get install liblzma-dev + +curl -sSL https://install.python-poetry.org | python3 - + +cd privateGPT +poetry install --extras "ui llms-llama-cpp" +CMAKE_ARGS='-DLLAMA_CUBLAS=on' poetry run pip install --force-reinstall --no-cache-dir llama-cpp-python + +poetry run python scripts/setup +poetry run pip install bs4 openpyxl flask transformers python-pptx Pillow cryptography llama_index.vector_stores.postgres llama_index.embeddings.huggingface sentencepiece apt-get install antiword # 🔒 PrivateGPT 📑 diff --git a/docker-compose.yaml b/docker-compose.yaml index 0cabd53f..3ae4963d 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -12,7 +12,7 @@ services: # PGPT_PROFILES: docker # PGPT_MODE: local postgres: - image: "postgres" + image: "ankane/pgvector:latest" environment: POSTGRES_USER: "postgres" POSTGRES_PASSWORD: "postgres" @@ -21,14 +21,15 @@ services: - "5432:5432" volumes: - ./postgres_data:/var/lib/postgresql/data + - ./init.sql:/docker-entrypoint-initdb.d/init.sql restart: always - phppgadmin: - image: "dockage/phppgadmin" - environment: - PHP_PG_ADMIN_SERVER_HOST: "postgres" - PHP_PG_ADMIN_SERVER_PORT: "5432" - PHP_PG_ADMIN_SERVER_DEFAULT_DB: "postgres" - PHP_PG_ADMIN_OWNED_ONLY: "false" - ports: - - "8080:80" - restart: always + # phppgadmin: + # image: "dockage/phppgadmin" + # environment: + # PHP_PG_ADMIN_SERVER_HOST: "postgres" + # PHP_PG_ADMIN_SERVER_PORT: "5432" + # PHP_PG_ADMIN_SERVER_DEFAULT_DB: "postgres" + # PHP_PG_ADMIN_OWNED_ONLY: "false" + # ports: + # - "8080:80" + # restart: always diff --git a/private_gpt/components/ingest/ingest_helper.py b/private_gpt/components/ingest/ingest_helper.py index 1ed3ec53..bf916453 100644 --- a/private_gpt/components/ingest/ingest_helper.py +++ b/private_gpt/components/ingest/ingest_helper.py @@ -18,7 +18,8 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]: PDFReader, ) from llama_index.readers.file.epub import EpubReader # type: ignore - from llama_index.readers.file.image import ImageReader # type: ignore + #from llama_index.readers.file.image import ImageReader # type: ignore + from llama_index.readers.file.image_vision_llm import ImageVisionLLMReader # type: ignore from llama_index.readers.file.ipynb import IPYNBReader # type: ignore from llama_index.readers.file.markdown import MarkdownReader # type: ignore from llama_index.readers.file.mbox import MboxReader # type: ignore @@ -41,9 +42,9 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]: ".pptx": PptxReader, ".ppt": PptxReader, ".pptm": PptxReader, - ".jpg": ImageReader, - ".png": ImageReader, - ".jpeg": ImageReader, + ".jpg": ImageVisionLLMReader, + ".png": ImageVisionLLMReader, + ".jpeg": ImageVisionLLMReader, ".mp3": VideoAudioReader, ".mp4": VideoAudioReader, ".csv": PandasCSVReader, @@ -57,9 +58,9 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]: ".htm": HTMLParser, ".xlsx": XLSXParser, ".xml": XMLReader, - ".eps": ImageReader, - ".tif": ImageReader, - ".gif": ImageReader, + ".eps": ImageVisionLLMReader, + ".tif": ImageVisionLLMReader, + ".gif": ImageVisionLLMReader, ".doc": DOCParser, } return default_file_reader_cls @@ -119,11 +120,11 @@ class IngestionHelper: ) return [] string_reader = StringIterableReader() - return string_reader.load_data([file_content]) + return string_reader.load_data([file_content.replace("\x00", "")]) logger.debug("Specific reader found for extension=%s", extension) try: - return reader_cls().load_data(file_data) + return reader_cls().load_data(str(file_data).replace("\x00", "")) except Exception as e: logger.debug( "Failed to read file_name=%s e=%s", diff --git a/scripts/question.txt b/scripts/question.txt index 8dba0046..eb91c250 100644 --- a/scripts/question.txt +++ b/scripts/question.txt @@ -1 +1 @@ -Which files are in the context provided? \ No newline at end of file +Which files in the context are responsible for reading Solidworks DWG or DWF formats? \ No newline at end of file diff --git a/settings.yaml b/settings.yaml index c0a18015..8208d96c 100644 --- a/settings.yaml +++ b/settings.yaml @@ -67,7 +67,7 @@ huggingface: embedding_hf_model_name: BAAI/bge-small-en-v1.5 vectorstore: - database: qdrant + database: postgres nodestore: database: postgres