mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2025-07-18 09:21:29 +00:00
chore: reqs, ignore, readme
This commit is contained in:
parent
4eeab60306
commit
8cb3c8de35
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,3 +1,5 @@
|
|||||||
|
# ignore knn index
|
||||||
|
gpt4all/index/**/
|
||||||
.DS_Store
|
.DS_Store
|
||||||
*.pkl
|
*.pkl
|
||||||
ckpts*
|
ckpts*
|
||||||
|
27
gpt4all/index/README.md
Normal file
27
gpt4all/index/README.md
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
# How to Tokenize and Embed
|
||||||
|
|
||||||
|
Split text into chunks
|
||||||
|
```
|
||||||
|
python tokenize_texts.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Embed Texts
|
||||||
|
|
||||||
|
```
|
||||||
|
torchrun --master_port=29085 --nproc-per-node 8 embed_texts.py --ds_path=tokenized --batch_size=2048
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
Combine Embeddings and Build Index
|
||||||
|
```
|
||||||
|
python build_index.py --ds_path=wiki_sample_tokenized --embed_folder=wiki_sample_embedded
|
||||||
|
```
|
||||||
|
|
||||||
|
To use the Index
|
||||||
|
|
||||||
|
```
|
||||||
|
import hnswlib
|
||||||
|
|
||||||
|
index = hnswlib.Index(space='l2', dim=384)
|
||||||
|
index.load_index(<path to index>)
|
||||||
|
```
|
0
gpt4all/index/__init__.py
Normal file
0
gpt4all/index/__init__.py
Normal file
@ -13,3 +13,6 @@ jsonlines
|
|||||||
nomic
|
nomic
|
||||||
scikit-learn
|
scikit-learn
|
||||||
matplotlib
|
matplotlib
|
||||||
|
apache_beam
|
||||||
|
mwparserfromhell
|
||||||
|
hnswlib
|
Loading…
Reference in New Issue
Block a user