Make updates

2025-08-16 22:46:58 +00:00 · 2024-03-23 00:10:58 -06:00 · 2024-03-23 00:10:58 -06:00 · b7f1aaa587
commit b7f1aaa587
parent 087cb0b7b7
14 changed files with 223 additions and 16 deletions
--- a/.gitignore
+++ b/.gitignore
@ -29,3 +29,7 @@ __pycache__/

 # macOS
 .DS_Store
+
+# other
+proj
+postgres_data*
--- a/README.md
+++ b/README.md
@ -1,3 +1,7 @@
+# Notes
+- Extra Python packages: `poetry run pip install bs4 openpyxl flask transformers python-pptx Pillow cryptography`
+- `.doc` ingestion shells out to antiword: `apt-get install antiword`
+
 # 🔒 PrivateGPT 📑

 [![Tests](https://github.com/imartinez/privateGPT/actions/workflows/tests.yml/badge.svg)](https://github.com/imartinez/privateGPT/actions/workflows/tests.yml?query=branch%3Amain)
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@ -1,13 +1,34 @@
 services:
-  private-gpt:
-    build:
-      dockerfile: Dockerfile.local
-    volumes:
-      - ./local_data/:/home/worker/app/local_data
-      - ./models/:/home/worker/app/models
-    ports:
-      - 8001:8080
+  # private-gpt:
+  #   build:
+  #     dockerfile: Dockerfile.local
+  #   volumes:
+  #     - ./local_data/:/home/worker/app/local_data
+  #     - ./models/:/home/worker/app/models
+  #   ports:
+  #     - 8001:8080
+  #   environment:
+  #     PORT: 8080
+  #     PGPT_PROFILES: docker
+  #     PGPT_MODE: local
+  postgres:
+    image: "postgres"
    environment:
-      PORT: 8080
-      PGPT_PROFILES: docker
-      PGPT_MODE: llamacpp
+      POSTGRES_USER: "postgres"
+      POSTGRES_PASSWORD: "postgres"
+      POSTGRES_DB: "postgres"
+    ports:
+      - "5432:5432"
+    volumes:
+      - ./postgres_data:/var/lib/postgresql/data
+    restart: always
+  phppgadmin:
+    image: "dockage/phppgadmin"
+    environment:
+      PHP_PG_ADMIN_SERVER_HOST: "postgres"
+      PHP_PG_ADMIN_SERVER_PORT: "5432"
+      PHP_PG_ADMIN_SERVER_DEFAULT_DB: "postgres"
+      PHP_PG_ADMIN_OWNED_ONLY: "false"
+    ports:
+      - "8080:80"
+    restart: always
--- a/private_gpt/init.py
+++ b/private_gpt/init.py
@ -4,7 +4,7 @@ import logging
 import os

 # Set to 'DEBUG' to have extensive logging turned on, even for libraries
-ROOT_LOG_LEVEL = "INFO"
+ROOT_LOG_LEVEL = "DEBUG"

 PRETTY_LOG_FORMAT = (
    "%(asctime)s.%(msecs)03d [%(levelname)-8s] %(name)+25s - %(message)s"
--- a/private_gpt/components/ingest/ingest_helper.py
+++ b/private_gpt/components/ingest/ingest_helper.py
@ -27,6 +27,10 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
        from llama_index.readers.file.video_audio import (  # type: ignore
            VideoAudioReader,
        )
+        from llama_index.readers.file.xml import XMLReader  # type: ignore
+        from private_gpt.components.ingest.readers.xlsx_parser import XLSXParser  # type: ignore
+        from private_gpt.components.ingest.readers.html_parser import HTMLParser  # type: ignore
+        from private_gpt.components.ingest.readers.doc_parser import DOCParser  # type: ignore
    except ImportError as e:
        raise ImportError("`llama-index-readers-file` package not found") from e

@ -47,6 +51,16 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
        ".md": MarkdownReader,
        ".mbox": MboxReader,
        ".ipynb": IPYNBReader,
+        # NOTE: delete desktop.ini and *.eps files.
+        ".markdown": MarkdownReader,
+        ".html": HTMLParser,
+        ".htm": HTMLParser,
+        ".xlsx": XLSXParser,
+        ".xml": XMLReader,
+        ".eps": ImageReader,
+        ".tif": ImageReader,
+        ".gif": ImageReader,
+        ".doc": DOCParser,
    }
    return default_file_reader_cls

@ -87,12 +101,36 @@ class IngestionHelper:
                "No reader found for extension=%s, using default string reader",
                extension,
            )
-            # Read as a plain text
+            # Attempt to read as a plain text with UTF-8 encoding, if fails read as ASCII
+            try:
+                file_content = file_data.read_text(encoding='utf-8')
+            except UnicodeDecodeError:
+                try:
+                    logger.debug(
+                        "Failed to read file_name=%s as UTF-8 encoded text, trying with ASCII",
+                        file_name,
+                    )
+                    # Fallback to ASCII decoding
+                    file_content = file_data.read_text(encoding='ascii')
+                except UnicodeDecodeError:
+                    logger.debug(
+                        "Failed to read file_name=%s as ASCII encoded text, returning empty document",
+                        file_name,
+                    )
+                    return []
            string_reader = StringIterableReader()
-            return string_reader.load_data([file_data.read_text()])
+            return string_reader.load_data([file_content])

        logger.debug("Specific reader found for extension=%s", extension)
-        return reader_cls().load_data(file_data)
+        try:
+            return reader_cls().load_data(file_data)
+        except Exception as e:
+            logger.debug(
+                "Failed to read file_name=%s e=%s",
+                file_name,
+                e
+            )
+            return []

    @staticmethod
    def _exclude_metadata(documents: list[Document]) -> None:
--- a/private_gpt/components/ingest/readers/doc_parser.py
+++ b/private_gpt/components/ingest/readers/doc_parser.py
@ -0,0 +1,51 @@
+import logging
+import subprocess
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class DOCParser(BaseReader):
+    """Reader for legacy Word ``.doc`` files.
+
+    Text is extracted by running the external ``antiword`` command.
+    """
+
+    def __init__(self) -> None:
+        """Init parser."""
+        super().__init__()
+
+    def load_data(self, input_file: Path, extra_info: Optional[Dict] = None) -> List[Document]:
+        """Convert a ``.doc`` file to plain text and wrap it in a Document.
+
+        Args:
+            input_file: Path of the ``.doc`` file handed to ``antiword``.
+            extra_info: Optional metadata attached to the returned document.
+
+        Returns:
+            A one-element list holding the extracted text, or an empty list
+            when ``antiword`` cannot be run, fails, or produces no output.
+        """
+        try:
+            completed = subprocess.run(
+                ["antiword", str(input_file)],
+                capture_output=True,
+                text=True,
+                check=True,
+            )
+        except (subprocess.CalledProcessError, OSError) as e:
+            # OSError covers a missing or non-executable antiword binary.
+            logger.warning("An error occurred while processing %s: %s", input_file, e)
+            return []
+
+        text = completed.stdout
+        if not text:
+            return []
+
+        # None default avoids sharing one mutable dict between calls.
+        metadata = extra_info if extra_info is not None else {}
+        return [Document(text=text, metadata=metadata)]
--- a/private_gpt/components/ingest/readers/html_parser.py
+++ b/private_gpt/components/ingest/readers/html_parser.py
@ -0,0 +1,35 @@
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from bs4 import BeautifulSoup
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+
+
+class HTMLParser(BaseReader):
+    """Reader that returns the text content of an HTML file."""
+
+    def __init__(self) -> None:
+        """Init parser."""
+        super().__init__()
+
+    def load_data(self, input_file: Path, extra_info: Optional[Dict] = None) -> List[Document]:
+        """Parse an HTML file into a single Document.
+
+        Args:
+            input_file: Path of the HTML file to read.
+            extra_info: Optional metadata attached to the returned document.
+
+        Returns:
+            A one-element list whose document holds the stripped text of the
+            page (an empty string when the page contains no text).
+        """
+        # Raw bytes are handed to BeautifulSoup so it can work out the encoding.
+        with open(input_file, "rb") as fp:
+            soup = BeautifulSoup(fp, "html.parser")
+
+        text = (soup.get_text() or "").strip()
+
+        # None default avoids sharing one mutable dict between calls.
+        metadata = extra_info if extra_info is not None else {}
+        return [Document(text=text, metadata=metadata)]
--- a/private_gpt/components/ingest/readers/xlsx_parser.py
+++ b/private_gpt/components/ingest/readers/xlsx_parser.py
@ -0,0 +1,64 @@
+import json
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+from openpyxl import load_workbook
+
+
+class XLSXParser(BaseReader):
+    """Reader that turns every worksheet of an ``.xlsx`` file into a JSON document."""
+
+    def __init__(self) -> None:
+        """Init parser."""
+        super().__init__()
+
+    def load_data(self, input_file: Path, extra_info: Optional[Dict] = None) -> List[Document]:
+        """Parse a workbook into one Document per worksheet.
+
+        Args:
+            input_file: Path of the workbook to read.
+            extra_info: Optional metadata attached to every returned document.
+
+        Returns:
+            One Document per worksheet; its text is a JSON array with one
+            object per data row, keyed by the header row's cell values.
+        """
+        # None default avoids sharing one mutable dict between calls.
+        metadata = extra_info if extra_info is not None else {}
+        documents = []
+
+        wb = load_workbook(filename=str(input_file), read_only=True)
+        try:
+            for sheet in wb:
+                rows = self._sheet_rows(sheet)
+                documents.append(
+                    Document(text=json.dumps(rows, ensure_ascii=False), metadata=metadata)
+                )
+        finally:
+            # Read-only workbooks keep the file open until closed explicitly.
+            wb.close()
+
+        return documents
+
+    @staticmethod
+    def _sheet_rows(sheet) -> List[Dict[str, str]]:
+        """Return the sheet's data rows as dicts keyed by the header row."""
+        keys = None
+        records = []
+        for row in sheet.iter_rows(values_only=True):
+            # Skip empty rows
+            if all(cell is None for cell in row):
+                continue
+
+            cells = ["" if cell is None else str(cell) for cell in row]
+
+            # The first non-empty row is the header, even when blank rows precede it.
+            if keys is None:
+                keys = cells
+                continue
+
+            # zip() drops cells beyond the header width; duplicate headers keep the last value.
+            records.append(dict(zip(keys, cells)))
+        return records
--- a/scripts/longquestion.py
+++ b/scripts/longquestion.py
@ -0,0 +1,19 @@
+"""Send the text of question.txt to the Gradio app at http://localhost:8001/ and print the reply."""
+from pathlib import Path
+
+from gradio_client import Client
+
+# Both prompt files are resolved against the current working directory.
+# UTF-8 is requested explicitly so decoding does not depend on the locale.
+content = Path("question.txt").read_text(encoding="utf-8")
+systemcontent = Path("system.txt").read_text(encoding="utf-8")
+
+client = Client("http://localhost:8001/")
+result = client.predict(
+    content,  # str  in 'Message' Textbox component
+    "Query Files",  # Literal['Query Files', 'Search Files', 'LLM Chat (no context from files)']  in 'Mode' Radio component
+    [],  # List[filepath]  in 'Upload File(s)' Uploadbutton component
+    systemcontent,  # str  in 'System Prompt' Textbox component
+    api_name="/chat",
+)
+print(result)
--- a/scripts/question.py
+++ b/scripts/question.py
@ -0,0 +1,18 @@
+"""Ask the Gradio app at http://localhost:8001/ the question given on the command line."""
+import sys
+
+from gradio_client import Client
+
+if len(sys.argv) < 2:
+    # Without this guard a missing argument surfaces as a bare IndexError.
+    sys.exit(f"usage: {sys.argv[0]} QUESTION")
+
+client = Client("http://localhost:8001/")
+result = client.predict(
+    sys.argv[1],  # str  in 'Message' Textbox component
+    "Query Files",  # Literal['Query Files', 'Search Files', 'LLM Chat (no context from files)']  in 'Mode' Radio component
+    [],  # List[filepath]  in 'Upload File(s)' Uploadbutton component
+    "You can only answer questions about the provided context. If you know the answer but it is not based in the provided context, don't provide the answer, just state the answer is not in the context provided.",  # str  in 'System Prompt' Textbox component
+    api_name="/chat",
+)
+print(result)
--- a/scripts/question.txt
+++ b/scripts/question.txt
@ -0,0 +1 @@
+Which files are in the context provided?
--- a/scripts/system.txt
+++ b/scripts/system.txt
@ -0,0 +1 @@
+You can only answer questions about the provided context. If you know the answer but it is not based in the provided context, don't provide the answer, just state the answer is not in the context provided.
--- a/settings.yaml
+++ b/settings.yaml
@ -70,7 +70,7 @@ vectorstore:
  database: qdrant

 nodestore:
-  database: simple
+  database: postgres

 qdrant:
  path: local_data/private_gpt/qdrant
--- a/test.log
+++ b/test.log
@ -0,0 +1 @@
+You can only answer questions about the provided context. If you know the answer but it is not based in the provided context, don't provide the answer, just state the answer is not in the context provided.