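Changed chat dataset from yahma/alpaca-cleaned to HuggingFaceTB/smoltalk; chat samples are now built from each record's `messages` list.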
This commit is contained in:
parent
c78a31362a
commit
0537a5df64
2 changed files with 7 additions and 6 deletions
9
data.py
9
data.py
|
|
@ -20,10 +20,11 @@ def dataWorker(q, dataset, encoding, batch_size, block_size,chat):
|
|||
for text in dataset:
|
||||
tokens = None
|
||||
if(chat):
|
||||
txt = f"<|user|>{text['instruction']}"
|
||||
if(text["input"] != None):
|
||||
txt += f"\n{text['input']}"
|
||||
txt = txt + f"<|end|>\n<|assistant|>{text['output']}<|end|>"
|
||||
txt=""
|
||||
for msg in text['messages']:
|
||||
role = msg['role']
|
||||
content = msg['content']
|
||||
txt = txt + f"<|{role}|>{content}<|end|>"
|
||||
tokens = [encoding.bos_token_id]+encoding.encode(txt)
|
||||
else:
|
||||
tokens = [encoding.bos_token_id]+encoding.encode(text["text"])
|
||||
|
|
|
|||
4
train.py
4
train.py
|
|
@ -23,8 +23,8 @@ hypr = {
|
|||
"encoding": "gpt2",
|
||||
"dataset": "HuggingFaceTB/smollm-corpus",
|
||||
"subset": "cosmopedia-v2",
|
||||
"chat_dataset": "yahma/alpaca-cleaned",
|
||||
"chat_subset": None,
|
||||
"chat_dataset": "HuggingFaceTB/smoltalk",
|
||||
"chat_subset": "all",
|
||||
"half": False,
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue