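Changed chat dataset from yahma/alpaca-cleaned to HuggingFaceTB/smoltalk (subset "all"); data.py now builds the chat text from each sample's 'messages' list (role/content) instead of the instruction/input/output fields.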
This commit is contained in:
9
data.py
9
data.py
@@ -20,10 +20,11 @@ def dataWorker(q, dataset, encoding, batch_size, block_size,chat):
|
|||||||
for text in dataset:
|
for text in dataset:
|
||||||
tokens = None
|
tokens = None
|
||||||
if(chat):
|
if(chat):
|
||||||
txt = f"<|user|>{text['instruction']}"
|
txt=""
|
||||||
if(text["input"] != None):
|
for msg in text['messages']:
|
||||||
txt += f"\n{text['input']}"
|
role = msg['role']
|
||||||
txt = txt + f"<|end|>\n<|assistant|>{text['output']}<|end|>"
|
content = msg['content']
|
||||||
|
txt = txt + f"<|{role}|>{content}<|end|>"
|
||||||
tokens = [encoding.bos_token_id]+encoding.encode(txt)
|
tokens = [encoding.bos_token_id]+encoding.encode(txt)
|
||||||
else:
|
else:
|
||||||
tokens = [encoding.bos_token_id]+encoding.encode(text["text"])
|
tokens = [encoding.bos_token_id]+encoding.encode(text["text"])
|
||||||
|
|||||||
4
train.py
4
train.py
@@ -23,8 +23,8 @@ hypr = {
|
|||||||
"encoding": "gpt2",
|
"encoding": "gpt2",
|
||||||
"dataset": "HuggingFaceTB/smollm-corpus",
|
"dataset": "HuggingFaceTB/smollm-corpus",
|
||||||
"subset": "cosmopedia-v2",
|
"subset": "cosmopedia-v2",
|
||||||
"chat_dataset": "yahma/alpaca-cleaned",
|
"chat_dataset": "HuggingFaceTB/smoltalk",
|
||||||
"chat_subset": None,
|
"chat_subset": "all",
|
||||||
"half": False,
|
"half": False,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user