Changed chat dataset from yahma/alpaca-cleaned to HuggingFaceTB/smoltalk.

This commit is contained in:
k
2026-01-09 17:30:34 -05:00
parent c78a31362a
commit 0537a5df64
2 changed files with 7 additions and 6 deletions

View File

@@ -20,10 +20,11 @@ def dataWorker(q, dataset, encoding, batch_size, block_size,chat):
for text in dataset: for text in dataset:
tokens = None tokens = None
if(chat): if(chat):
txt = f"<|user|>{text['instruction']}" txt=""
if(text["input"] != None): for msg in text['messages']:
txt += f"\n{text['input']}" role = msg['role']
txt = txt + f"<|end|>\n<|assistant|>{text['output']}<|end|>" content = msg['content']
txt = txt + f"<|{role}|>{content}<|end|>"
tokens = [encoding.bos_token_id]+encoding.encode(txt) tokens = [encoding.bos_token_id]+encoding.encode(txt)
else: else:
tokens = [encoding.bos_token_id]+encoding.encode(text["text"]) tokens = [encoding.bos_token_id]+encoding.encode(text["text"])

View File

@@ -23,8 +23,8 @@ hypr = {
"encoding": "gpt2", "encoding": "gpt2",
"dataset": "HuggingFaceTB/smollm-corpus", "dataset": "HuggingFaceTB/smollm-corpus",
"subset": "cosmopedia-v2", "subset": "cosmopedia-v2",
"chat_dataset": "yahma/alpaca-cleaned", "chat_dataset": "HuggingFaceTB/smoltalk",
"chat_subset": None, "chat_subset": "all",
"half": False, "half": False,
} }