mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2025-07-16 16:31:30 +00:00
chore: reqs, ignore, readme
This commit is contained in:
parent
4eeab60306
commit
8cb3c8de35
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,3 +1,5 @@
|
||||
# ignore knn index
|
||||
gpt4all/index/**/
|
||||
.DS_Store
|
||||
*.pkl
|
||||
ckpts*
|
||||
|
27
gpt4all/index/README.md
Normal file
27
gpt4all/index/README.md
Normal file
@ -0,0 +1,27 @@
|
||||
# How to Tokenize and Embed
|
||||
|
||||
Split text into chunks
|
||||
```
|
||||
python tokenize_texts.py
|
||||
```
|
||||
|
||||
Embed Texts
|
||||
|
||||
```
|
||||
torchrun --master_port=29085 --nproc-per-node 8 embed_texts.py --ds_path=tokenized --batch_size=2048
|
||||
```
|
||||
|
||||
|
||||
Combine Embeddings and Build Index
|
||||
```
|
||||
python build_index.py --ds_path=wiki_sample_tokenized --embed_folder=wiki_sample_embedded
|
||||
```
|
||||
|
||||
To use the Index
|
||||
|
||||
```
|
||||
import hnswlib
|
||||
|
||||
index = hnswlib.Index(space='l2', dim=384)
|
||||
index.load_index(<path to index>)
|
||||
```
|
0
gpt4all/index/__init__.py
Normal file
0
gpt4all/index/__init__.py
Normal file
@ -12,4 +12,7 @@ sentencepiece
|
||||
jsonlines
|
||||
nomic
|
||||
scikit-learn
|
||||
matplotlib
|
||||
matplotlib
|
||||
apache_beam
|
||||
mwparserfromhell
|
||||
hnswlib
|
Loading…
Reference in New Issue
Block a user