mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-03 10:06:44 +00:00
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format
This commit is contained in:
@@ -11,12 +11,11 @@ from colossalai.legacy.registry import DATASETS
|
||||
|
||||
@DATASETS.register_module
|
||||
class WebtextDataset(Dataset):
|
||||
|
||||
def __init__(self, path: Optional[str] = None, seq_len=1024) -> None:
|
||||
super().__init__()
|
||||
if path is not None:
|
||||
root = os.path.dirname(path)
|
||||
encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt')
|
||||
encoded_data_cache_path = os.path.join(root, f"gpt_webtext_{seq_len}.pt")
|
||||
if os.path.isfile(encoded_data_cache_path):
|
||||
seq_len_, data, attention_mask = torch.load(encoded_data_cache_path)
|
||||
if seq_len_ == seq_len:
|
||||
@@ -26,12 +25,12 @@ class WebtextDataset(Dataset):
|
||||
raw_data = []
|
||||
with open(path) as f:
|
||||
for line in f.readlines():
|
||||
raw_data.append(json.loads(line)['text'])
|
||||
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||
raw_data.append(json.loads(line)["text"])
|
||||
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
|
||||
tokenizer.pad_token = tokenizer.unk_token
|
||||
encoded_data = tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt')
|
||||
self.data = encoded_data['input_ids']
|
||||
self.attention_mask = encoded_data['attention_mask']
|
||||
encoded_data = tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors="pt")
|
||||
self.data = encoded_data["input_ids"]
|
||||
self.attention_mask = encoded_data["attention_mask"]
|
||||
else:
|
||||
self.data = torch.randint(0, 50257, (10240, seq_len))
|
||||
self.attention_mask = torch.ones_like(self.data)
|
||||
@@ -40,4 +39,4 @@ class WebtextDataset(Dataset):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, index):
|
||||
return {'input_ids': self.data[index], 'attention_mask': self.attention_mask[index]}, self.data[index]
|
||||
return {"input_ids": self.data[index], "attention_mask": self.attention_mask[index]}, self.data[index]
|
||||
|
Reference in New Issue
Block a user