Mirror of https://github.com/nomic-ai/gpt4all.git, synced 2025-07-06 20:09:58 +00:00.
feat: add options to specify different datasets
This commit is contained in:
parent
8cb3c8de35
commit
9bc88fb33d
@ -5,7 +5,12 @@ from gpt4all.index.embed import Embedder
|
|||||||
|
|
||||||
def parse_args():
    """Build and parse the command-line options for dataset chunking.

    Returns:
        argparse.Namespace with string attributes:
        tokenized_save_path, ds_name, ds_version, sbert_model.
    """
    parser = ArgumentParser()
    # fmt: off
    # Table-driven registration: every option is a plain string with a default.
    for flag, default in (
        ("--tokenized_save_path", "tokenized"),
        ("--ds_name", "wikipedia"),
        ("--ds_version", "20220301.simple"),
        ("--sbert_model", "sentence-transformers/all-MiniLM-L6-v2"),
    ):
        parser.add_argument(flag, type=str, default=default)
    # fmt: on
    return parser.parse_args()
@ -59,4 +64,9 @@ def chunk_dataset(
|
|||||||
|
|
||||||
if __name__ == "__main__":
    # Script entry point: read the CLI options and run the chunking pipeline.
    args = parse_args()
    # chunk_dataset is defined earlier in this file (signature not fully
    # visible here); it tokenizes/chunks the requested HF dataset and writes
    # the result under save_path. NOTE(review): `version` here is the dataset
    # config/version string (e.g. "20220301.simple" for wikipedia) — confirm
    # against chunk_dataset's parameter semantics.
    chunked_dataset = chunk_dataset(
        ds_name=args.ds_name,
        version=args.ds_version,
        sbert_model=args.sbert_model,
        save_path=args.tokenized_save_path,
    )
Loading…
Reference in New Issue
Block a user