From 9fb7f07e3cd5782b6d58e6d13f042cac9a26dbcf Mon Sep 17 00:00:00 2001
From: abhiruka
Date: Fri, 19 May 2023 23:18:31 +0800
Subject: [PATCH 1/5] Refactor main to accept hide_source and mute_stream
 options

Refactored the main function to take hide_source and mute_stream
parameters for controlling output, and added argparse for command-line
argument parsing. The StreamingStdOutCallbackHandler and the source
document display are now optional, based on user input. Introduced a
parse_arguments function to handle command-line arguments, and updated
README.md to reflect these changes.
---
 README.md     | 23 +++++++++++++++++++++++
 privateGPT.py | 39 ++++++++++++++++++++++++++++-----------
 2 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 46ee119d..eae10884 100644
--- a/README.md
+++ b/README.md
@@ -81,6 +81,29 @@ Note: you could turn off your internet connection, and the script inference would
 
 Type `exit` to finish the script.
 
+
+### Script Arguments
+The script also supports optional command-line arguments to modify its behavior:
+
+- `--hide-source` or `-S`: Use this flag to disable printing of the source documents used for answers. By default, the source documents are printed.
+
+```shell
+python privateGPT.py --hide-source
+```
+
+- `--mute-stream` or `-M`: Use this flag to disable the LLM's streaming response to standard output, which by default prints progress to the console.
+
+```shell
+python privateGPT.py --mute-stream
+```
+
+You can combine these options if needed:
+
+```shell
+python privateGPT.py --hide-source --mute-stream
+```
+
+
 # How does it work?
 Selecting the right local models and the power of `LangChain` you can run the entire pipeline locally, without any data leaving your environment, and with reasonable performance.
diff --git a/privateGPT.py b/privateGPT.py
index ae08bb93..743caa9a 100644
--- a/privateGPT.py
+++ b/privateGPT.py
@@ -5,6 +5,7 @@ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain.vectorstores import Chroma
 from langchain.llms import GPT4All, LlamaCpp
 import os
+import argparse
 
 load_dotenv()
 
@@ -17,12 +18,13 @@ model_n_ctx = os.environ.get('MODEL_N_CTX')
 
 from constants import CHROMA_SETTINGS
 
-def main():
+def main(hide_source=False, mute_stream=False):
     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
     db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
     retriever = db.as_retriever()
+    # activate/deactivate the streaming StdOut callback for LLMs
+    callbacks = [] if mute_stream else [StreamingStdOutCallbackHandler()]
     # Prepare the LLM
-    callbacks = [StreamingStdOutCallbackHandler()]
     match model_type:
         case "LlamaCpp":
             llm = LlamaCpp(model_path=model_path, n_ctx=model_n_ctx, callbacks=callbacks, verbose=False)
@@ -31,7 +33,7 @@ def main(hide_source=False, mute_stream=False):
         case _default:
             print(f"Model {model_type} not supported!")
             exit;
-    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
+    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents= not hide_source)
     # Interactive questions and answers
     while True:
         query = input("\nEnter a query: ")
@@ -39,19 +41,34 @@ def main(hide_source=False, mute_stream=False):
             break
 
         # Get the answer from the chain
-        res = qa(query)
-        answer, docs = res['result'], res['source_documents']
+        res = qa(query)
+        answer, docs = res['result'], None if hide_source else res['source_documents']
 
         # Print the result
         print("\n\n> Question:")
         print(query)
         print("\n> Answer:")
         print(answer)
-
-        # Print the relevant sources used for the answer
-        for document in docs:
-            print("\n> " + document.metadata["source"] + ":")
-            print(document.page_content)
+
+        # Print the relevant sources used for the answer, if source is True
+        if not hide_source and docs:
+            for document in docs:
+                print("\n> " + document.metadata["source"] + ":")
+                print(document.page_content)
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--hide-source", "-S", action='store_true',
+                        help='Use this flag to disable printing of source documents used for answers.')
+
+    parser.add_argument("--mute-stream", "-M",
+                        action='store_true',
+                        help='Use this flag to disable the streaming StdOut callback for LLMs.')
+
+    return parser.parse_args()
+
 
 if __name__ == "__main__":
-    main()
+    # Parse the command line arguments
+    args = parse_arguments()
+    main(hide_source=args.hide_source, mute_stream=args.mute_stream)

From f8805c80f8faa4dee546bf404332a8e7ab86dabd Mon Sep 17 00:00:00 2001
From: abhiruka
Date: Sat, 20 May 2023 07:40:05 +0800
Subject: [PATCH 2/5] Update as per the feedback

- Moved the argument parser inside main.
- Assigned an empty list to docs when sources are hidden.
- Updated README.md.
---
 README.md     | 20 +-------------------
 privateGPT.py | 28 ++++++++++++++--------------
 2 files changed, 15 insertions(+), 33 deletions(-)

diff --git a/README.md b/README.md
index eae10884..11cb2632 100644
--- a/README.md
+++ b/README.md
@@ -83,25 +83,7 @@ Type `exit` to finish the script.
 
 
 ### Script Arguments
-The script also supports optional command-line arguments to modify its behavior:
-
-- `--hide-source` or `-S`: Use this flag to disable printing of the source documents used for answers. By default, the source documents are printed.
-
-```shell
-python privateGPT.py --hide-source
-```
-
-- `--mute-stream` or `-M`: Use this flag to disable the LLM's streaming response to standard output, which by default prints progress to the console.
-
-```shell
-python privateGPT.py --mute-stream
-```
-
-You can combine these options if needed:
-
-```shell
-python privateGPT.py --hide-source --mute-stream
-```
+The script also supports optional command-line arguments to modify its behavior. You can see a full list of these arguments by running `python privateGPT.py --help` in your terminal.
 
 
 # How does it work?
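A note on why `Pool.map` in the next patches is a safe drop-in for the original list comprehension: it preserves input order, so the returned list still lines up with `all_files`, and it blocks until every worker finishes. The one constraint multiprocessing adds is that the worker must be a picklable top-level function, which `load_single_document` already is. A minimal standalone sketch of the pattern, with a hypothetical `square` worker standing in for `load_single_document`:

```python
import os
from multiprocessing import Pool


def square(x: int) -> int:
    # Stand-in worker: must be defined at module top level so child
    # processes can pickle and import it.
    return x * x


if __name__ == "__main__":
    with Pool(processes=os.cpu_count()) as pool:
        # map() blocks until every input is processed and returns
        # results in input order.
        results = pool.map(square, range(8))
    print(results)  # [0, 1, 4, 9, 16, 25, 36, 49]
```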
diff --git a/privateGPT.py b/privateGPT.py
index 743caa9a..fd6e276b 100644
--- a/privateGPT.py
+++ b/privateGPT.py
@@ -18,12 +18,14 @@ model_n_ctx = os.environ.get('MODEL_N_CTX')
 
 from constants import CHROMA_SETTINGS
 
-def main(hide_source=False, mute_stream=False):
+def main():
+    # Parse the command line arguments
+    args = parse_arguments()
     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
     db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
     retriever = db.as_retriever()
     # activate/deactivate the streaming StdOut callback for LLMs
-    callbacks = [] if mute_stream else [StreamingStdOutCallbackHandler()]
+    callbacks = [] if args.mute_stream else [StreamingStdOutCallbackHandler()]
     # Prepare the LLM
     match model_type:
         case "LlamaCpp":
@@ -33,16 +35,16 @@ def main():
         case _default:
             print(f"Model {model_type} not supported!")
             exit;
-    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents= not hide_source)
+    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents= not args.hide_source)
     # Interactive questions and answers
     while True:
         query = input("\nEnter a query: ")
         if query == "exit":
             break
-
+
         # Get the answer from the chain
         res = qa(query)
-        answer, docs = res['result'], None if hide_source else res['source_documents']
+        answer, docs = res['result'], [] if args.hide_source else res['source_documents']
 
         # Print the result
         print("\n\n> Question:")
@@ -50,14 +52,14 @@ def main():
         print("\n> Answer:")
         print(answer)
 
-        # Print the relevant sources used for the answer, if source is True
-        if not hide_source and docs:
-            for document in docs:
-                print("\n> " + document.metadata["source"] + ":")
-                print(document.page_content)
+        # Print the relevant sources used for the answer
+        for document in docs:
+            print("\n> " + document.metadata["source"] + ":")
+            print(document.page_content)
 
 def parse_arguments():
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(description='privateGPT: Ask questions to your documents without an internet connection, '
+                                                 'using the power of LLMs.')
     parser.add_argument("--hide-source", "-S", action='store_true',
                         help='Use this flag to disable printing of source documents used for answers.')
 
@@ -69,6 +71,4 @@ def parse_arguments():
 
 
 if __name__ == "__main__":
-    # Parse the command line arguments
-    args = parse_arguments()
-    main(hide_source=args.hide_source, mute_stream=args.mute_stream)
+    main()
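With the parser defined as above, `python privateGPT.py --help` now documents both flags. On Python 3.10+ (which the `match` statement in this script already requires), argparse should print roughly the following; the exact line wrapping and the `options:` heading vary by Python version, so treat this as an approximation derived from the argparse definitions in the patch:

```shell
$ python privateGPT.py --help
usage: privateGPT.py [-h] [--hide-source] [--mute-stream]

privateGPT: Ask questions to your documents without an internet connection,
using the power of LLMs.

options:
  -h, --help         show this help message and exit
  --hide-source, -S  Use this flag to disable printing of source documents
                     used for answers.
  --mute-stream, -M  Use this flag to disable the streaming StdOut callback
                     for LLMs.
```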
From 04f6706bbbc50117813f74aab4c72ad261db6d3f Mon Sep 17 00:00:00 2001
From: MDW
Date: Thu, 18 May 2023 02:08:52 +0200
Subject: [PATCH 3/5] Make scripts executable, add basic pre-commit setup

---
 .pre-commit-config.yaml | 44 +++++++++++++++++++++++++++++++++++++++++
 ingest.py               |  3 ++-
 privateGPT.py           |  1 +
 3 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 .pre-commit-config.yaml
 mode change 100644 => 100755 ingest.py
 mode change 100644 => 100755 privateGPT.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..8826ee1b
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,44 @@
+---
+files: ^(.*\.(py|json|md|sh|yaml|cfg|txt))$
+exclude: ^(\.[^/]*cache/.*|.*/_user.py|source_documents/)$
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      #- id: no-commit-to-branch
+      #  args: [--branch, main]
+      - id: check-yaml
+        args:
+          [--unsafe]
+      # - id: debug-statements
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+        exclude: \.md$
+      - id: check-json
+      - id: mixed-line-ending
+      # - id: check-builtin-literals
+      # - id: check-ast
+      - id: check-merge-conflict
+      - id: check-executables-have-shebangs
+      - id: check-shebang-scripts-are-executable
+      - id: check-docstring-first
+      - id: fix-byte-order-marker
+      - id: check-case-conflict
+      # - id: check-toml
+  - repo: https://github.com/adrienverge/yamllint.git
+    rev: v1.29.0
+    hooks:
+      - id: yamllint
+        args:
+          - --no-warnings
+          - -d
+          - '{extends: relaxed, rules: {line-length: {max: 90}}}'
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.2.2
+    hooks:
+      - id: codespell
+        args:
+          # - --builtin=clear,rare,informal,usage,code,names,en-GB_to_en-US
+          - --builtin=clear,rare,informal,usage,code,names
+          - --ignore-words-list=hass,master
+          - --skip="./.*"
+          - --quiet-level=2
diff --git a/ingest.py b/ingest.py
old mode 100644
new mode 100755
index 12049abc..271e80d9
--- a/ingest.py
+++ b/ingest.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import os
 import glob
 from typing import List
@@ -107,7 +108,7 @@ def main():
 
     # Create embeddings
     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
-
+
     # Create and store locally vectorstore
     db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
     db.persist()
diff --git a/privateGPT.py b/privateGPT.py
old mode 100644
new mode 100755
index fd6e276b..7adab52d
--- a/privateGPT.py
+++ b/privateGPT.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 from dotenv import load_dotenv
 from langchain.chains import RetrievalQA
 from langchain.embeddings import HuggingFaceEmbeddings
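To actually run these hooks, contributors need the `pre-commit` tool installed locally; the standard workflow is:

```shell
pip install pre-commit       # install the tool
pre-commit install           # register the git hook for this clone
pre-commit run --all-files   # one-off check of the entire tree
```

After `pre-commit install`, every `git commit` automatically runs the configured hooks against the staged files, matching the `files`/`exclude` patterns at the top of the config.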
From e3b769d33a18242a5b01388435cbba0e0e47c873 Mon Sep 17 00:00:00 2001
From: jiangzhuo
Date: Fri, 19 May 2023 02:35:20 +0900
Subject: [PATCH 4/5] Optimize load_documents function with multiprocessing

---
 ingest.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ingest.py b/ingest.py
index 271e80d9..ee97e7c0 100755
--- a/ingest.py
+++ b/ingest.py
@@ -3,6 +3,7 @@ import os
 import glob
 from typing import List
 from dotenv import load_dotenv
+from multiprocessing import Pool
 
 from langchain.document_loaders import (
     CSVLoader,
@@ -87,7 +88,9 @@ def load_documents(source_dir: str) -> List[Document]:
         all_files.extend(
             glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
         )
-    return [load_single_document(file_path) for file_path in all_files]
+    with Pool(processes=os.cpu_count()) as pool:
+        documents = pool.map(load_single_document, all_files)
+    return documents
 
 
 def main():

From cb7c96b31d60a3dcc0e2430bdf452fb657affde1 Mon Sep 17 00:00:00 2001
From: jiangzhuo
Date: Fri, 19 May 2023 03:18:41 +0900
Subject: [PATCH 5/5] Add progress bar to load_documents function

Enhanced the load_documents function with a tqdm progress bar, giving
real-time feedback while documents load. This is especially useful when
ingesting a large number of documents.
---
 ingest.py        | 12 +++++++++---
 requirements.txt |  3 ++-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/ingest.py b/ingest.py
index ee97e7c0..f609c760 100755
--- a/ingest.py
+++ b/ingest.py
@@ -4,6 +4,7 @@ import glob
 from typing import List
 from dotenv import load_dotenv
 from multiprocessing import Pool
+from tqdm import tqdm
 
 from langchain.document_loaders import (
     CSVLoader,
@@ -80,7 +81,6 @@ def load_single_document(file_path: str) -> Document:
         raise ValueError(f"Unsupported file extension '{ext}'")
 
 
-
 def load_documents(source_dir: str) -> List[Document]:
     # Loads all documents from source documents directory
     all_files = []
@@ -88,9 +88,15 @@ def load_documents(source_dir: str) -> List[Document]:
         all_files.extend(
             glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
         )
+
     with Pool(processes=os.cpu_count()) as pool:
-        documents = pool.map(load_single_document, all_files)
-    return documents
+        results = []
+        with tqdm(total=len(all_files), desc='Loading documents', ncols=80) as pbar:
+            for doc in pool.imap_unordered(load_single_document, all_files):
+                results.append(doc)
+                pbar.update()
+
+    return results
 
 
 def main():
diff --git a/requirements.txt b/requirements.txt
index 21740bcd..204b77ce 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,5 @@ unstructured==0.6.6
 extract-msg==0.41.1
 tabulate==0.9.0
 pandoc==2.3
-pypandoc==1.11
\ No newline at end of file
+pypandoc==1.11
+tqdm==4.65.0
\ No newline at end of file
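One behavioral note on this last change: `imap_unordered` yields each result as soon as any worker finishes, which is what keeps the progress bar ticking smoothly, but it also means the loaded documents are no longer guaranteed to come back in `all_files` order (plain `imap` preserves input order at the cost of a less responsive bar). A standalone sketch of the tqdm-plus-pool pattern; `slow_square` is a hypothetical stand-in for `load_single_document`:

```python
import os
import random
import time
from multiprocessing import Pool

from tqdm import tqdm


def slow_square(x: int) -> int:
    # Stand-in worker with variable per-item cost, like real document loading.
    time.sleep(random.uniform(0.01, 0.05))
    return x * x


if __name__ == "__main__":
    items = list(range(50))
    results = []
    with Pool(processes=os.cpu_count()) as pool:
        with tqdm(total=len(items), desc="Processing", ncols=80) as pbar:
            # Results arrive in completion order, not input order.
            for res in pool.imap_unordered(slow_square, items):
                results.append(res)
                pbar.update()
    print(sorted(results) == [x * x for x in items])  # True
```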