From 9fb7f07e3cd5782b6d58e6d13f042cac9a26dbcf Mon Sep 17 00:00:00 2001
From: abhiruka
Date: Fri, 19 May 2023 23:18:31 +0800
Subject: [PATCH 1/5] Refactor main to accept hide_source and mute_stream
 options

Refactored the main function to take hide_source and mute_stream
parameters for controlling output, and added argparse for command-line
argument parsing. The StreamingStdOutCallbackHandler and the source
document display are now optional, based on user input. Introduced a
parse_arguments function to handle command-line arguments, and updated
README.md to reflect these changes.
---
 README.md     | 23 +++++++++++++++++++++++
 privateGPT.py | 39 ++++++++++++++++++++++++++++-----------
 2 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 46ee119d..eae10884 100644
--- a/README.md
+++ b/README.md
@@ -81,6 +81,29 @@ Note: you could turn off your internet connection, and the script inference would
 
 Type `exit` to finish the script.
 
+
+### Script Arguments
+The script also supports optional command-line arguments to modify its behavior:
+
+- `--hide-source` or `-S`: Use this flag to disable printing of the source documents used for answers. By default, the source documents are printed.
+
+```shell
+python privateGPT.py --hide-source
+```
+
+- `--mute-stream` or `-M`: Use this flag to disable the LLM's streaming response to standard output, which by default prints progress to the console.
+
+```shell
+python privateGPT.py --mute-stream
+```
+
+You can combine these options if needed:
+
+```shell
+python privateGPT.py --hide-source --mute-stream
+```
+
+
 # How does it work?
 Selecting the right local models and the power of `LangChain` you can run the entire pipeline locally, without any data leaving your environment, and with reasonable performance.
diff --git a/privateGPT.py b/privateGPT.py
index ae08bb93..743caa9a 100644
--- a/privateGPT.py
+++ b/privateGPT.py
@@ -5,6 +5,7 @@ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain.vectorstores import Chroma
 from langchain.llms import GPT4All, LlamaCpp
 import os
+import argparse
 
 load_dotenv()
 
@@ -17,12 +18,13 @@ model_n_ctx = os.environ.get('MODEL_N_CTX')
 
 from constants import CHROMA_SETTINGS
 
-def main():
+def main(hide_source=False, mute_stream=False):
     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
     db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
     retriever = db.as_retriever()
+    # activate/deactivate the streaming StdOut callback for LLMs
+    callbacks = [] if mute_stream else [StreamingStdOutCallbackHandler()]
     # Prepare the LLM
-    callbacks = [StreamingStdOutCallbackHandler()]
     match model_type:
         case "LlamaCpp":
             llm = LlamaCpp(model_path=model_path, n_ctx=model_n_ctx, callbacks=callbacks, verbose=False)
@@ -31,7 +33,7 @@ def main(hide_source=False, mute_stream=False):
         case _default:
             print(f"Model {model_type} not supported!")
             exit;
-    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
+    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents= not hide_source)
     # Interactive questions and answers
     while True:
         query = input("\nEnter a query: ")
@@ -39,19 +41,34 @@ def main(hide_source=False, mute_stream=False):
             break
 
         # Get the answer from the chain
-        res = qa(query)
-        answer, docs = res['result'], res['source_documents']
+        res = qa(query)
+        answer, docs = res['result'], None if hide_source else res['source_documents']
 
         # Print the result
         print("\n\n> Question:")
         print(query)
         print("\n> Answer:")
         print(answer)
-
-        # Print the relevant sources used for the answer
-        for document in docs:
-            print("\n> " + document.metadata["source"] + ":")
-            print(document.page_content)
+
+        # Print the relevant sources used for the answer, if source is True
+        if not hide_source and docs:
+            for document in docs:
+                print("\n> " + document.metadata["source"] + ":")
+                print(document.page_content)
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--hide-source", "-S", action='store_true',
+                        help='Use this flag to disable printing of source documents used for answers.')
+
+    parser.add_argument("--mute-stream", "-M",
+                        action='store_true',
+                        help='Use this flag to disable the streaming StdOut callback for LLMs.')
+
+    return parser.parse_args()
+
 
 if __name__ == "__main__":
-    main()
+    # Parse the command line arguments
+    args = parse_arguments()
+    main(hide_source=args.hide_source, mute_stream=args.mute_stream)

From f8805c80f8faa4dee546bf404332a8e7ab86dabd Mon Sep 17 00:00:00 2001
From: abhiruka
Date: Sat, 20 May 2023 07:40:05 +0800
Subject: [PATCH 2/5] Update as per the feedback

- Moved the argument parser inside main.
- Assigned an empty list to docs when sources are hidden.
- Updated README.md.
---
 README.md     | 20 +-------------------
 privateGPT.py | 28 ++++++++++++++--------------
 2 files changed, 15 insertions(+), 33 deletions(-)

diff --git a/README.md b/README.md
index eae10884..11cb2632 100644
--- a/README.md
+++ b/README.md
@@ -83,25 +83,7 @@ Type `exit` to finish the script.
 
 
 ### Script Arguments
-The script also supports optional command-line arguments to modify its behavior:
-
-- `--hide-source` or `-S`: Use this flag to disable printing of the source documents used for answers. By default, the source documents are printed.
-
-```shell
-python privateGPT.py --hide-source
-```
-
-- `--mute-stream` or `-M`: Use this flag to disable the LLM's streaming response to standard output, which by default prints progress to the console.
-
-```shell
-python privateGPT.py --mute-stream
-```
-
-You can combine these options if needed:
-
-```shell
-python privateGPT.py --hide-source --mute-stream
-```
+The script also supports optional command-line arguments to modify its behavior. You can see a full list of these arguments by running `python privateGPT.py --help` in your terminal.
 
 
 # How does it work?
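A note on why `Pool.map` in the next patches is a safe drop-in for the original list comprehension: it preserves input order, so the returned list still lines up with `all_files`, and it blocks until every worker finishes. The one constraint multiprocessing adds is that the worker must be a picklable top-level function, which `load_single_document` already is. A minimal standalone sketch of the pattern, with a hypothetical `square` worker standing in for `load_single_document`:

```python
import os
from multiprocessing import Pool


def square(x: int) -> int:
    # Stand-in worker: must be defined at module top level so child
    # processes can pickle and import it.
    return x * x


if __name__ == "__main__":
    with Pool(processes=os.cpu_count()) as pool:
        # map() blocks until every input is processed and returns
        # results in input order.
        results = pool.map(square, range(8))
    print(results)  # [0, 1, 4, 9, 16, 25, 36, 49]
```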
diff --git a/privateGPT.py b/privateGPT.py
index 743caa9a..fd6e276b 100644
--- a/privateGPT.py
+++ b/privateGPT.py
@@ -18,12 +18,14 @@ model_n_ctx = os.environ.get('MODEL_N_CTX')
 
 from constants import CHROMA_SETTINGS
 
-def main(hide_source=False, mute_stream=False):
+def main():
+    # Parse the command line arguments
+    args = parse_arguments()
     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
     db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
     retriever = db.as_retriever()
     # activate/deactivate the streaming StdOut callback for LLMs
-    callbacks = [] if mute_stream else [StreamingStdOutCallbackHandler()]
+    callbacks = [] if args.mute_stream else [StreamingStdOutCallbackHandler()]
     # Prepare the LLM
     match model_type:
         case "LlamaCpp":
@@ -33,16 +35,16 @@ def main():
         case _default:
             print(f"Model {model_type} not supported!")
             exit;
-    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents= not hide_source)
+    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents= not args.hide_source)
     # Interactive questions and answers
     while True:
         query = input("\nEnter a query: ")
         if query == "exit":
             break
-
+
         # Get the answer from the chain
         res = qa(query)
-        answer, docs = res['result'], None if hide_source else res['source_documents']
+        answer, docs = res['result'], [] if args.hide_source else res['source_documents']
 
         # Print the result
         print("\n\n> Question:")
@@ -50,14 +52,14 @@ def main():
         print("\n> Answer:")
         print(answer)
 
-        # Print the relevant sources used for the answer, if source is True
-        if not hide_source and docs:
-            for document in docs:
-                print("\n> " + document.metadata["source"] + ":")
-                print(document.page_content)
+        # Print the relevant sources used for the answer
+        for document in docs:
+            print("\n> " + document.metadata["source"] + ":")
+            print(document.page_content)
 
 def parse_arguments():
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(description='privateGPT: Ask questions to your documents without an internet connection, '
+                                                 'using the power of LLMs.')
     parser.add_argument("--hide-source", "-S", action='store_true',
                         help='Use this flag to disable printing of source documents used for answers.')
 
@@ -69,6 +71,4 @@ def parse_arguments():
 
 
 if __name__ == "__main__":
-    # Parse the command line arguments
-    args = parse_arguments()
-    main(hide_source=args.hide_source, mute_stream=args.mute_stream)
+    main()
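With the parser defined as above, `python privateGPT.py --help` now documents both flags. On Python 3.10+ (which the `match` statement in this script already requires), argparse should print roughly the following; the exact line wrapping and the `options:` heading vary by Python version, so treat this as an approximation derived from the argparse definitions in the patch:

```shell
$ python privateGPT.py --help
usage: privateGPT.py [-h] [--hide-source] [--mute-stream]

privateGPT: Ask questions to your documents without an internet connection,
using the power of LLMs.

options:
  -h, --help         show this help message and exit
  --hide-source, -S  Use this flag to disable printing of source documents
                     used for answers.
  --mute-stream, -M  Use this flag to disable the streaming StdOut callback
                     for LLMs.
```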
From 04f6706bbbc50117813f74aab4c72ad261db6d3f Mon Sep 17 00:00:00 2001
From: MDW
Date: Thu, 18 May 2023 02:08:52 +0200
Subject: [PATCH 3/5] Make scripts executable, add basic pre-commit setup

---
 .pre-commit-config.yaml | 44 +++++++++++++++++++++++++++++++++++++++++
 ingest.py               |  3 ++-
 privateGPT.py           |  1 +
 3 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 .pre-commit-config.yaml
 mode change 100644 => 100755 ingest.py
 mode change 100644 => 100755 privateGPT.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..8826ee1b
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,44 @@
+---
+files: ^(.*\.(py|json|md|sh|yaml|cfg|txt))$
+exclude: ^(\.[^/]*cache/.*|.*/_user.py|source_documents/)$
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      #- id: no-commit-to-branch
+      #  args: [--branch, main]
+      - id: check-yaml
+        args:
+          [--unsafe]
+      # - id: debug-statements
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+        exclude: \.md$
+      - id: check-json
+      - id: mixed-line-ending
+      # - id: check-builtin-literals
+      # - id: check-ast
+      - id: check-merge-conflict
+      - id: check-executables-have-shebangs
+      - id: check-shebang-scripts-are-executable
+      - id: check-docstring-first
+      - id: fix-byte-order-marker
+      - id: check-case-conflict
+      # - id: check-toml
+  - repo: https://github.com/adrienverge/yamllint.git
+    rev: v1.29.0
+    hooks:
+      - id: yamllint
+        args:
+          - --no-warnings
+          - -d
+          - '{extends: relaxed, rules: {line-length: {max: 90}}}'
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.2.2
+    hooks:
+      - id: codespell
+        args:
+          # - --builtin=clear,rare,informal,usage,code,names,en-GB_to_en-US
+          - --builtin=clear,rare,informal,usage,code,names
+          - --ignore-words-list=hass,master
+          - --skip="./.*"
+          - --quiet-level=2
diff --git a/ingest.py b/ingest.py
old mode 100644
new mode 100755
index 12049abc..271e80d9
--- a/ingest.py
+++ b/ingest.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import os
 import glob
 from typing import List
@@ -107,7 +108,7 @@ def main():
 
     # Create embeddings
     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
-
+
     # Create and store locally vectorstore
     db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
     db.persist()
diff --git a/privateGPT.py b/privateGPT.py
old mode 100644
new mode 100755
index fd6e276b..7adab52d
--- a/privateGPT.py
+++ b/privateGPT.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 from dotenv import load_dotenv
 from langchain.chains import RetrievalQA
 from langchain.embeddings import HuggingFaceEmbeddings
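To actually run these hooks, contributors need the `pre-commit` tool installed locally; the standard workflow is:

```shell
pip install pre-commit       # install the tool
pre-commit install           # register the git hook for this clone
pre-commit run --all-files   # one-off check of the entire tree
```

After `pre-commit install`, every `git commit` automatically runs the configured hooks against the staged files, matching the `files`/`exclude` patterns at the top of the config.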
From e3b769d33a18242a5b01388435cbba0e0e47c873 Mon Sep 17 00:00:00 2001
From: jiangzhuo
Date: Fri, 19 May 2023 02:35:20 +0900
Subject: [PATCH 4/5] Optimize load_documents function with multiprocessing

---
 ingest.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ingest.py b/ingest.py
index 271e80d9..ee97e7c0 100755
--- a/ingest.py
+++ b/ingest.py
@@ -3,6 +3,7 @@ import os
 import glob
 from typing import List
 from dotenv import load_dotenv
+from multiprocessing import Pool
 
 from langchain.document_loaders import (
     CSVLoader,
@@ -87,7 +88,9 @@ def load_documents(source_dir: str) -> List[Document]:
         all_files.extend(
             glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
         )
-    return [load_single_document(file_path) for file_path in all_files]
+    with Pool(processes=os.cpu_count()) as pool:
+        documents = pool.map(load_single_document, all_files)
+    return documents
 
 
 def main():

From cb7c96b31d60a3dcc0e2430bdf452fb657affde1 Mon Sep 17 00:00:00 2001
From: jiangzhuo
Date: Fri, 19 May 2023 03:18:41 +0900
Subject: [PATCH 5/5] Add progress bar to load_documents function

Enhanced the load_documents function with a tqdm progress bar, giving
real-time feedback while documents load. This is especially useful when
ingesting a large number of documents.
---
 ingest.py        | 12 +++++++++---
 requirements.txt |  3 ++-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/ingest.py b/ingest.py
index ee97e7c0..f609c760 100755
--- a/ingest.py
+++ b/ingest.py
@@ -4,6 +4,7 @@ import glob
 from typing import List
 from dotenv import load_dotenv
 from multiprocessing import Pool
+from tqdm import tqdm
 
 from langchain.document_loaders import (
     CSVLoader,
@@ -80,7 +81,6 @@ def load_single_document(file_path: str) -> Document:
         raise ValueError(f"Unsupported file extension '{ext}'")
 
 
-
 def load_documents(source_dir: str) -> List[Document]:
     # Loads all documents from source documents directory
     all_files = []
@@ -88,9 +88,15 @@ def load_documents(source_dir: str) -> List[Document]:
         all_files.extend(
             glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
         )
+
     with Pool(processes=os.cpu_count()) as pool:
-        documents = pool.map(load_single_document, all_files)
-    return documents
+        results = []
+        with tqdm(total=len(all_files), desc='Loading documents', ncols=80) as pbar:
+            for doc in pool.imap_unordered(load_single_document, all_files):
+                results.append(doc)
+                pbar.update()
+
+    return results
 
 
 def main():
diff --git a/requirements.txt b/requirements.txt
index 21740bcd..204b77ce 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,5 @@ unstructured==0.6.6
 extract-msg==0.41.1
 tabulate==0.9.0
 pandoc==2.3
-pypandoc==1.11
\ No newline at end of file
+pypandoc==1.11
+tqdm==4.65.0
\ No newline at end of file
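One behavioral note on this last change: `imap_unordered` yields each result as soon as any worker finishes, which is what keeps the progress bar ticking smoothly, but it also means the loaded documents are no longer guaranteed to come back in `all_files` order (plain `imap` preserves input order at the cost of a less responsive bar). A standalone sketch of the tqdm-plus-pool pattern; `slow_square` is a hypothetical stand-in for `load_single_document`:

```python
import os
import random
import time
from multiprocessing import Pool

from tqdm import tqdm


def slow_square(x: int) -> int:
    # Stand-in worker with variable per-item cost, like real document loading.
    time.sleep(random.uniform(0.01, 0.05))
    return x * x


if __name__ == "__main__":
    items = list(range(50))
    results = []
    with Pool(processes=os.cpu_count()) as pool:
        with tqdm(total=len(items), desc="Processing", ncols=80) as pbar:
            # Results arrive in completion order, not input order.
            for res in pool.imap_unordered(slow_square, items):
                results.append(res)
                pbar.update()
    print(sorted(results) == [x * x for x in items])  # True
```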