diff --git a/.gitignore b/.gitignore index 5fadbd31..be27e4e7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +# ignore knn index +gpt4all/index/**/ .DS_Store *.pkl ckpts* diff --git a/gpt4all/index/README.md b/gpt4all/index/README.md new file mode 100644 index 00000000..0ba876d9 --- /dev/null +++ b/gpt4all/index/README.md @@ -0,0 +1,27 @@ +# How to Tokenize and Embed + +Split text into chunks +``` +python tokenize_texts.py +``` + +Embed Texts + +``` +torchrun --master_port=29085 --nproc-per-node 8 embed_texts.py --ds_path=tokenized --batch_size=2048 +``` + + +Combine Embeddings and Build Index +``` +python build_index.py --ds_path=wiki_sample_tokenized --embed_folder=wiki_sample_embedded +``` + +To use the Index + +``` +import hnswlib + +index = hnswlib.Index(space='l2', dim=384) +index.load_index() +``` diff --git a/gpt4all/index/__init__.py b/gpt4all/index/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/requirements.txt b/requirements.txt index b38ab36c..1b0400ac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,7 @@ sentencepiece jsonlines nomic scikit-learn -matplotlib \ No newline at end of file +matplotlib +apache_beam +mwparserfromhell +hnswlib \ No newline at end of file