Make updates
parent 087cb0b7b7
commit b7f1aaa587
.gitignore (vendored, 4 lines changed)
@@ -29,3 +29,7 @@ __pycache__/

# macOS
.DS_Store

# other
proj
postgres_data*
@@ -1,3 +1,7 @@
# Notes
poetry run pip install bs4 openpyxl flask transformers python-pptx Pillow cryptography
apt-get install antiword

# 🔒 PrivateGPT 📑

[](https://github.com/imartinez/privateGPT/actions/workflows/tests.yml?query=branch%3Amain)
@@ -1,13 +1,34 @@
services:
  private-gpt:
    build:
      dockerfile: Dockerfile.local
    volumes:
      - ./local_data/:/home/worker/app/local_data
      - ./models/:/home/worker/app/models
    ports:
      - 8001:8080
# private-gpt:
#   build:
#     dockerfile: Dockerfile.local
#   volumes:
#     - ./local_data/:/home/worker/app/local_data
#     - ./models/:/home/worker/app/models
#   ports:
#     - 8001:8080
#   environment:
#     PORT: 8080
#     PGPT_PROFILES: docker
#     PGPT_MODE: local
  postgres:
    image: "postgres"
    environment:
      PORT: 8080
      PGPT_PROFILES: docker
      PGPT_MODE: llamacpp
      POSTGRES_USER: "postgres"
      POSTGRES_PASSWORD: "postgres"
      POSTGRES_DB: "postgres"
    ports:
      - "5432:5432"
    volumes:
      - ./postgres_data:/var/lib/postgresql/data
    restart: always
  phppgadmin:
    image: "dockage/phppgadmin"
    environment:
      PHP_PG_ADMIN_SERVER_HOST: "postgres"
      PHP_PG_ADMIN_SERVER_PORT: "5432"
      PHP_PG_ADMIN_SERVER_DEFAULT_DB: "postgres"
      PHP_PG_ADMIN_OWNED_ONLY: "false"
    ports:
      - "8080:80"
    restart: always
@@ -4,7 +4,7 @@ import logging
import os

# Set to 'DEBUG' to have extensive logging turned on, even for libraries
-ROOT_LOG_LEVEL = "INFO"
+ROOT_LOG_LEVEL = "DEBUG"

PRETTY_LOG_FORMAT = (
    "%(asctime)s.%(msecs)03d [%(levelname)-8s] %(name)+25s - %(message)s"
@@ -27,6 +27,10 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
        from llama_index.readers.file.video_audio import (  # type: ignore
            VideoAudioReader,
        )
        from llama_index.readers.file.xml import XMLReader  # type: ignore
        from private_gpt.components.ingest.readers.xlsx_parser import XLSXParser  # type: ignore
        from private_gpt.components.ingest.readers.html_parser import HTMLParser  # type: ignore
        from private_gpt.components.ingest.readers.doc_parser import DOCParser  # type: ignore
    except ImportError as e:
        raise ImportError("`llama-index-readers-file` package not found") from e
@@ -47,6 +51,16 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
        ".md": MarkdownReader,
        ".mbox": MboxReader,
        ".ipynb": IPYNBReader,
        # // delete desktop.ini and *.eps files.
        ".markdown": MarkdownReader,
        ".html": HTMLParser,
        ".htm": HTMLParser,
        ".xlsx": XLSXParser,
        ".xml": XMLReader,
        ".eps": ImageReader,
        ".tif": ImageReader,
        ".gif": ImageReader,
        ".doc": DOCParser,
    }
    return default_file_reader_cls
@@ -87,12 +101,36 @@ class IngestionHelper:
                "No reader found for extension=%s, using default string reader",
                extension,
            )
-            # Read as a plain text
+            # Attempt to read as a plain text with UTF-8 encoding, if fails read as ASCII
+            try:
+                file_content = file_data.read_text(encoding='utf-8')
+            except UnicodeDecodeError:
+                try:
+                    logger.debug(
+                        "Failed to read file_name=%s as UTF-8 encoded text, trying with ASCII",
+                        file_name,
+                    )
+                    # Fallback to ASCII decoding
+                    file_content = file_data.read_text(encoding='ascii')
+                except UnicodeDecodeError:
+                    logger.debug(
+                        "Failed to read file_name=%s as ASCII encoded text, returning empty document",
+                        file_name,
+                    )
+                    return []
            string_reader = StringIterableReader()
-            return string_reader.load_data([file_data.read_text()])
+            return string_reader.load_data([file_content])

        logger.debug("Specific reader found for extension=%s", extension)
-        return reader_cls().load_data(file_data)
+        try:
+            return reader_cls().load_data(file_data)
+        except Exception as e:
+            logger.debug(
+                "Failed to read file_name=%s e=%s",
+                file_name,
+                e
+            )
+            return []

    @staticmethod
    def _exclude_metadata(documents: list[Document]) -> None:
private_gpt/components/ingest/readers/doc_parser.py (new file, 31 lines)
@@ -0,0 +1,31 @@
import subprocess
from pathlib import Path
from typing import List, Optional, Dict

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

class DOCParser(BaseReader):
    """DOC parser."""

    def __init__(self) -> None:
        """Init parser."""
        super().__init__()

    def load_data(self, input_file: Path, extra_info: Optional[Dict] = {}) -> List[Document]:
        """Parse file."""
        documents = []  # This will contain the list of Document objects

        # Call antiword to convert the .doc file to plain text
        try:
            text = subprocess.run(['antiword', input_file], capture_output=True, text=True, check=True).stdout
        except subprocess.CalledProcessError as e:
            print(f"An error occurred while processing {input_file}: {e}")
            text = ""

        # Create a Document object with the extracted text and any extra metadata
        if text:
            document = Document(text=text, metadata=extra_info)
            documents.append(document)

        return documents
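For reference, a minimal usage sketch of the new DOC reader (not part of the commit), assuming antiword is installed as noted in the README and that "legacy.doc" is a hypothetical input file:

# Hypothetical example: exercise DOCParser directly on a local .doc file.
from pathlib import Path

from private_gpt.components.ingest.readers.doc_parser import DOCParser

docs = DOCParser().load_data(Path("legacy.doc"))  # "legacy.doc" is a placeholder path
# A failed antiword call leaves the list empty; otherwise one Document holds the extracted text.
print(docs[0].text if docs else "no text extracted")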
private_gpt/components/ingest/readers/html_parser.py (new file, 28 lines)
@@ -0,0 +1,28 @@
from pathlib import Path
from typing import List, Optional, Dict

from bs4 import BeautifulSoup
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

class HTMLParser(BaseReader):
    """HTML parser."""

    def __init__(self) -> None:
        """Init parser."""
        super().__init__()

    def load_data(self, input_file: Path, extra_info: Optional[Dict] = {}) -> List[Document]:
        """Parse file."""
        documents = []  # This will contain the list of Document objects

        with open(input_file, "rb") as fp:
            soup = BeautifulSoup(fp, 'html.parser')
            text = soup.get_text()
            text = text.strip() if text else ''

        # Create a Document object with the extracted text and any extra metadata
        document = Document(text=text, metadata=extra_info)
        documents.append(document)

        return documents
private_gpt/components/ingest/readers/xlsx_parser.py (new file, 42 lines)
@@ -0,0 +1,42 @@
import json
from pathlib import Path
from typing import List, Optional, Dict
from openpyxl import load_workbook

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

class XLSXParser(BaseReader):
    """XLSX parser."""

    def __init__(self) -> None:
        """Init parser."""
        super().__init__()

    def load_data(self, input_file: Path, extra_info: Optional[Dict] = {}) -> List[Document]:
        """Parse file."""
        documents = []  # This will contain the list of Document objects
        wb = load_workbook(filename=str(input_file), read_only=True)
        # loop over all sheets
        for sheet in wb:
            sheet_data = []  # This will contain the list of rows for the current sheet
            keys = []
            for row_index, row in enumerate(sheet.iter_rows(values_only=True)):
                # Skip empty rows
                if all(cell is None for cell in row):
                    continue

                # Initialize keys with the first row (header)
                if row_index == 0:
                    keys = [str(cell) if cell is not None else "" for cell in row]
                    continue

                # Ensure each row has the same number of columns as the header
                row_data = [(keys[i], str(cell) if cell is not None else "") for i, cell in enumerate(row) if i < len(keys)]
                sheet_data.append(dict(row_data))

            # Create a Document object with the sheet data and any extra metadata
            document = Document(text=json.dumps(sheet_data, ensure_ascii=False), metadata=extra_info)
            documents.append(document)

        return documents
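Similarly, a minimal sketch of how the new XLSX reader could be driven (not part of the commit), assuming "report.xlsx" is a hypothetical workbook on disk:

# Hypothetical example: each worksheet becomes one Document whose text is a JSON
# list of row dicts keyed by the header row.
from pathlib import Path

from private_gpt.components.ingest.readers.xlsx_parser import XLSXParser

docs = XLSXParser().load_data(Path("report.xlsx"))  # "report.xlsx" is a placeholder path
for doc in docs:
    print(doc.text[:200])  # preview the serialized rows for each sheet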
scripts/longquestion.py (new file, 19 lines)
@@ -0,0 +1,19 @@
from gradio_client import Client

with open('question.txt', 'r') as file:
    # Read the content of the file
    content = file.read()

with open('system.txt', 'r') as file:
    # Read the content of the file
    systemcontent = file.read()

client = Client("http://localhost:8001/")
result = client.predict(
    content,  # str in 'Message' Textbox component
    "Query Files",  # Literal['Query Files', 'Search Files', 'LLM Chat (no context from files)'] in 'Mode' Radio component
    [],  # List[filepath] in 'Upload File(s)' Uploadbutton component
    systemcontent,  # str in 'System Prompt' Textbox component
    api_name="/chat"
)
print(result)
scripts/question.py (new file, 12 lines)
@@ -0,0 +1,12 @@
import sys
from gradio_client import Client

client = Client("http://localhost:8001/")
result = client.predict(
    sys.argv[1],  # str in 'Message' Textbox component
    "Query Files",  # Literal['Query Files', 'Search Files', 'LLM Chat (no context from files)'] in 'Mode' Radio component
    [],  # List[filepath] in 'Upload File(s)' Uploadbutton component
    "You can only answer questions about the provided context. If you know the answer but it is not based in the provided context, don't provide the answer, just state the answer is not in the context provided.",  # str in 'System Prompt' Textbox component
    api_name="/chat"
)
print(result)
scripts/question.txt (new file, 1 line)
@@ -0,0 +1 @@
Which files are in the context provided?
scripts/system.txt (new file, 1 line)
@@ -0,0 +1 @@
You can only answer questions about the provided context. If you know the answer but it is not based in the provided context, don't provide the answer, just state the answer is not in the context provided.
@@ -70,7 +70,7 @@ vectorstore:
  database: qdrant

nodestore:
-  database: simple
+  database: postgres

qdrant:
  path: local_data/private_gpt/qdrant