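Changed chat dataset from yahma/alpaca-cleaned to HuggingFaceTB/smoltalk and updated the chat prompt builder to iterate over each example's messages.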

This commit is contained in:
k
2026-01-09 17:30:34 -05:00
parent c78a31362a
commit 0537a5df64
2 changed files with 7 additions and 6 deletions

View File

@@ -20,10 +20,11 @@ def dataWorker(q, dataset, encoding, batch_size, block_size,chat):
for text in dataset:
tokens = None
if(chat):
txt = f"<|user|>{text['instruction']}"
if(text["input"] != None):
txt += f"\n{text['input']}"
txt = txt + f"<|end|>\n<|assistant|>{text['output']}<|end|>"
txt=""
for msg in text['messages']:
role = msg['role']
content = msg['content']
txt = txt + f"<|{role}|>{content}<|end|>"
tokens = [encoding.bos_token_id]+encoding.encode(txt)
else:
tokens = [encoding.bos_token_id]+encoding.encode(text["text"])

View File

@@ -23,8 +23,8 @@ hypr = {
"encoding": "gpt2",
"dataset": "HuggingFaceTB/smollm-corpus",
"subset": "cosmopedia-v2",
"chat_dataset": "yahma/alpaca-cleaned",
"chat_subset": None,
"chat_dataset": "HuggingFaceTB/smoltalk",
"chat_subset": "all",
"half": False,
}