Fixed BOS token being prepended twice: stop manually prepending encoding.bos_token_id to the output of encoding.encode()

2026-02-27 09:10:34 -05:00 · 2026-02-27 09:10:34 -05:00 · dc231ae703
commit dc231ae703
parent a0cd98876c
1 changed file with 8 additions and 9 deletions
--- a/data.py
+++ b/data.py
@@ -18,27 +18,26 @@ def dataWorker(q, dataset, encoding, batch_size, block_size,chat):
    batch_x, batch_y = [], []
    while True:
        for text in dataset:
-            tokens = None
+            tokens = []
            if(chat):
                txt=""
                for msg in text['messages']:
                    role = msg['role']
                    content = msg['content']
-                    txt = txt + f"<|{role}|>{content}<|end|>"
+                    txt = f"<|{role}|>{content}<|end|> "
-                tokens = [encoding.bos_token_id]+encoding.encode(txt)
+                    tokens += encoding.encode(txt) + [encoding.eos_token_id]
            else:
-                tokens = [encoding.bos_token_id]+encoding.encode(text["text"])
+                tokens = encoding.encode(text["text"])
            for i in range(0, len(tokens)-block_size+1,block_size):
                x = tokens[i:i+block_size]
                y = tokens[i+1:i+block_size+1]
                if len(x) < block_size:
-                    pad = len(x)-(block_size-1)
+                    pad = len(x)-(block_size)
-                    x = x + [encoding.eos_token_id] + [encoding.pad_token_id] * pad
+                    x = x + [encoding.eos_token_id] * pad
                if len(y) < block_size:
-                    pad = len(y)-(block_size-1)
+                    pad = len(y)-(block_size)
-                    y = y + [encoding.eos_token_id] + [encoding.pad_token_id] * pad
+                    y = y + [encoding.eos_token_id] * pad
                batch_x.append(x)
                batch_y.append(y)