Changed chat dataset from yahma/alpaca-cleaned to HuggingFaceTB/smoltalk.

2026-01-09 17:30:34 -05:00 · 2026-01-09 17:30:34 -05:00 · 0537a5df64
commit 0537a5df64
parent c78a31362a
2 changed files with 7 additions and 6 deletions
--- a/data.py
+++ b/data.py
@@ -20,10 +20,11 @@ def dataWorker(q, dataset, encoding, batch_size, block_size,chat):
        for text in dataset:
            tokens = None
            if(chat):
-                txt = f"<|user|>{text['instruction']}"
+                txt=""
-                if(text["input"] != None):
+                for msg in text['messages']:
-                    txt += f"\n{text['input']}"
+                    role = msg['role']
-                txt = txt + f"<|end|>\n<|assistant|>{text['output']}<|end|>"
+                    content = msg['content']
+                    txt = txt + f"<|{role}|>{content}<|end|>"
                tokens = [encoding.bos_token_id]+encoding.encode(txt)
            else:
                tokens = [encoding.bos_token_id]+encoding.encode(text["text"])
--- a/train.py
+++ b/train.py
@@ -23,8 +23,8 @@ hypr = {
    "encoding":    "gpt2",
    "dataset":     "HuggingFaceTB/smollm-corpus",
    "subset":      "cosmopedia-v2",
-    "chat_dataset": "yahma/alpaca-cleaned",
+    "chat_dataset": "HuggingFaceTB/smoltalk",
-    "chat_subset":  None,
+    "chat_subset":  "all",
    "half":         False,
 }