Make updates

This commit is contained in:
Steven Linn 2024-03-23 00:10:58 -06:00
parent 087cb0b7b7
commit b7f1aaa587
14 changed files with 223 additions and 16 deletions

4
.gitignore vendored
View File

@ -29,3 +29,7 @@ __pycache__/
# macOS
.DS_Store
# other
proj
postgres_data*

View File

@ -1,3 +1,7 @@
# Notes
poetry run pip install bs4 openpyxl flask transformers python-pptx Pillow cryptography
apt-get install antiword
# 🔒 PrivateGPT 📑
[![Tests](https://github.com/imartinez/privateGPT/actions/workflows/tests.yml/badge.svg)](https://github.com/imartinez/privateGPT/actions/workflows/tests.yml?query=branch%3Amain)

View File

@ -1,13 +1,34 @@
services:
private-gpt:
build:
dockerfile: Dockerfile.local
volumes:
- ./local_data/:/home/worker/app/local_data
- ./models/:/home/worker/app/models
ports:
- 8001:8080
# private-gpt:
# build:
# dockerfile: Dockerfile.local
# volumes:
# - ./local_data/:/home/worker/app/local_data
# - ./models/:/home/worker/app/models
# ports:
# - 8001:8080
# environment:
# PORT: 8080
# PGPT_PROFILES: docker
# PGPT_MODE: local
postgres:
image: "postgres"
environment:
PORT: 8080
PGPT_PROFILES: docker
PGPT_MODE: llamacpp
POSTGRES_USER: "postgres"
POSTGRES_PASSWORD: "postgres"
POSTGRES_DB: "postgres"
ports:
- "5432:5432"
volumes:
- ./postgres_data:/var/lib/postgresql/data
restart: always
phppgadmin:
image: "dockage/phppgadmin"
environment:
PHP_PG_ADMIN_SERVER_HOST: "postgres"
PHP_PG_ADMIN_SERVER_PORT: "5432"
PHP_PG_ADMIN_SERVER_DEFAULT_DB: "postgres"
PHP_PG_ADMIN_OWNED_ONLY: "false"
ports:
- "8080:80"
restart: always

View File

@ -4,7 +4,7 @@ import logging
import os
# Set to 'DEBUG' to have extensive logging turned on, even for libraries
ROOT_LOG_LEVEL = "INFO"
ROOT_LOG_LEVEL = "DEBUG"
PRETTY_LOG_FORMAT = (
"%(asctime)s.%(msecs)03d [%(levelname)-8s] %(name)+25s - %(message)s"

View File

@ -27,6 +27,10 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
from llama_index.readers.file.video_audio import ( # type: ignore
VideoAudioReader,
)
from llama_index.readers.file.xml import XMLReader # type: ignore
from private_gpt.components.ingest.readers.xlsx_parser import XLSXParser # type: ignore
from private_gpt.components.ingest.readers.html_parser import HTMLParser # type: ignore
from private_gpt.components.ingest.readers.doc_parser import DOCParser # type: ignore
except ImportError as e:
raise ImportError("`llama-index-readers-file` package not found") from e
@ -47,6 +51,16 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
".md": MarkdownReader,
".mbox": MboxReader,
".ipynb": IPYNBReader,
# // delete desktop.ini and *.eps files.
".markdown": MarkdownReader,
".html": HTMLParser,
".htm": HTMLParser,
".xlsx": XLSXParser,
".xml": XMLReader,
".eps": ImageReader,
".tif": ImageReader,
".gif": ImageReader,
".doc": DOCParser,
}
return default_file_reader_cls
@ -87,12 +101,36 @@ class IngestionHelper:
"No reader found for extension=%s, using default string reader",
extension,
)
# Read as a plain text
# Attempt to read as a plain text with UTF-8 encoding, if fails read as ASCII
try:
file_content = file_data.read_text(encoding='utf-8')
except UnicodeDecodeError:
try:
logger.debug(
"Failed to read file_name=%s as UTF-8 encoded text, trying with ASCII",
file_name,
)
# Fallback to ASCII decoding
file_content = file_data.read_text(encoding='ascii')
except UnicodeDecodeError:
logger.debug(
"Failed to read file_name=%s as ASCII encoded text, returning empty document",
file_name,
)
return []
string_reader = StringIterableReader()
return string_reader.load_data([file_data.read_text()])
return string_reader.load_data([file_content])
logger.debug("Specific reader found for extension=%s", extension)
return reader_cls().load_data(file_data)
try:
return reader_cls().load_data(file_data)
except Exception as e:
logger.debug(
"Failed to read file_name=%s e=%s",
file_name,
e
)
return []
@staticmethod
def _exclude_metadata(documents: list[Document]) -> None:

View File

@ -0,0 +1,31 @@
import logging
import subprocess
from pathlib import Path
from typing import Dict, List, Optional

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
logger = logging.getLogger(__name__)


class DOCParser(BaseReader):
    """Reader for ``.doc`` files that shells out to the ``antiword`` tool."""

    def __init__(self) -> None:
        """Init parser."""
        super().__init__()

    def load_data(
        self, input_file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Convert a ``.doc`` file to plain text and wrap it in a Document.

        Args:
            input_file: Path of the ``.doc`` file handed to ``antiword``.
            extra_info: Optional metadata attached to the resulting document.
                Defaults to a new empty dict on every call.

        Returns:
            A one-element list with the extracted text, or an empty list when
            ``antiword`` exits with a non-zero status or prints nothing.

        Raises:
            OSError: Propagated from ``subprocess.run`` when the ``antiword``
                executable cannot be started (e.g. it is not installed).
        """
        # Build the dict per call; a ``{}`` default would be one shared object.
        metadata = {} if extra_info is None else extra_info

        try:
            completed = subprocess.run(
                ["antiword", str(input_file)],
                capture_output=True,
                text=True,
                check=True,
            )
        except subprocess.CalledProcessError as e:
            # Best effort: a file antiword cannot convert yields no documents.
            logger.warning(
                "An error occurred while processing %s: %s", input_file, e
            )
            return []

        text = completed.stdout
        if not text:
            return []
        return [Document(text=text, metadata=metadata)]

View File

@ -0,0 +1,28 @@
from pathlib import Path
from typing import List, Optional, Dict
from bs4 import BeautifulSoup
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
class HTMLParser(BaseReader):
    """Reader that extracts the visible text of an HTML file."""

    def __init__(self) -> None:
        """Init parser."""
        super().__init__()

    def load_data(
        self, input_file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse an HTML file into a single text Document.

        Args:
            input_file: Path of the HTML file to read.
            extra_info: Optional metadata attached to the resulting document.
                Defaults to a new empty dict on every call.

        Returns:
            A one-element list whose document holds the stripped text of the
            page; the text is an empty string when the page has no text.
        """
        # Build the dict per call; a ``{}`` default would be one shared object.
        metadata = {} if extra_info is None else extra_info

        # Opened in binary mode so BeautifulSoup receives the raw bytes.
        with open(input_file, "rb") as fp:
            soup = BeautifulSoup(fp, 'html.parser')

        text = soup.get_text()
        text = text.strip() if text else ''
        return [Document(text=text, metadata=metadata)]

View File

@ -0,0 +1,42 @@
import json
from pathlib import Path
from typing import List, Optional, Dict
from openpyxl import load_workbook
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
class XLSXParser(BaseReader):
    """Reader that turns every sheet of an ``.xlsx`` workbook into JSON text."""

    def __init__(self) -> None:
        """Init parser."""
        super().__init__()

    def load_data(
        self, input_file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse a workbook into one Document per sheet.

        The first non-empty row of a sheet is used as the header. Every later
        non-empty row becomes a ``{header: value}`` dict, with cells converted
        to strings and ``None`` mapped to ``""``. Cells beyond the header width
        are dropped. The list of row dicts is serialized with ``json.dumps``.

        Args:
            input_file: Path of the ``.xlsx`` file to read.
            extra_info: Optional metadata attached to every resulting document.
                Defaults to a new empty dict on every call.

        Returns:
            One Document per sheet, in workbook order. A sheet without data
            rows yields a document whose text is ``"[]"``.
        """
        # Build the dict per call; a ``{}`` default would be one shared object.
        metadata = {} if extra_info is None else extra_info
        documents = []

        wb = load_workbook(filename=str(input_file), read_only=True)
        try:
            for sheet in wb:
                keys = None  # Header names; set from the first non-empty row.
                sheet_data = []
                for row in sheet.iter_rows(values_only=True):
                    # Skip empty rows
                    if all(cell is None for cell in row):
                        continue
                    cells = ["" if cell is None else str(cell) for cell in row]
                    if keys is None:
                        # Leading blank rows were skipped above, so this is
                        # the header even when it is not physical row 0.
                        keys = cells
                        continue
                    # zip() stops at the shorter side, so cells without a
                    # matching header column are ignored.
                    sheet_data.append(dict(zip(keys, cells)))
                documents.append(
                    Document(
                        text=json.dumps(sheet_data, ensure_ascii=False),
                        metadata=metadata,
                    )
                )
        finally:
            # Close the workbook explicitly so the file handle opened for
            # read-only mode is released.
            wb.close()
        return documents

19
scripts/longquestion.py Normal file
View File

@ -0,0 +1,19 @@
from gradio_client import Client
def _read_text(path: str) -> str:
    """Return the full content of the UTF-8 text file at *path*."""
    # Explicit encoding keeps the prompt text independent of the OS locale.
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()


def main() -> None:
    """Send the question and system prompt files to the chat endpoint.

    Reads ``question.txt`` and ``system.txt`` from the current working
    directory, submits them to the Gradio app at ``http://localhost:8001/``
    in "Query Files" mode and prints the result.
    """
    content = _read_text('question.txt')
    systemcontent = _read_text('system.txt')

    client = Client("http://localhost:8001/")
    result = client.predict(
        content,  # str in 'Message' Textbox component
        "Query Files",  # Literal['Query Files', 'Search Files', 'LLM Chat (no context from files)'] in 'Mode' Radio component
        [],  # List[filepath] in 'Upload File(s)' Uploadbutton component
        systemcontent,  # str in 'System Prompt' Textbox component
        api_name="/chat",
    )
    print(result)


if __name__ == "__main__":
    main()

12
scripts/question.py Normal file
View File

@ -0,0 +1,12 @@
import sys
from gradio_client import Client
def main() -> None:
    """Send the question given on the command line to the chat endpoint.

    The first command-line argument is submitted to the Gradio app at
    ``http://localhost:8001/`` in "Query Files" mode with a fixed system
    prompt, and the result is printed. Exits with a usage message when no
    question is supplied.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit(f"usage: {sys.argv[0]} QUESTION")

    client = Client("http://localhost:8001/")
    result = client.predict(
        sys.argv[1],  # str in 'Message' Textbox component
        "Query Files",  # Literal['Query Files', 'Search Files', 'LLM Chat (no context from files)'] in 'Mode' Radio component
        [],  # List[filepath] in 'Upload File(s)' Uploadbutton component
        "You can only answer questions about the provided context. If you know the answer but it is not based in the provided context, don't provide the answer, just state the answer is not in the context provided.",  # str in 'System Prompt' Textbox component
        api_name="/chat",
    )
    print(result)


if __name__ == "__main__":
    main()

1
scripts/question.txt Normal file
View File

@ -0,0 +1 @@
Which files are in the context provided?

1
scripts/system.txt Normal file
View File

@ -0,0 +1 @@
You can only answer questions about the provided context. If you know the answer but it is not based in the provided context, don't provide the answer, just state the answer is not in the context provided.

View File

@ -70,7 +70,7 @@ vectorstore:
database: qdrant
nodestore:
database: simple
database: postgres
qdrant:
path: local_data/private_gpt/qdrant

6
test.log Normal file

File diff suppressed because one or more lines are too long