From dc231ae7038e04536f7132200f1a5af9583ba3e0 Mon Sep 17 00:00:00 2001
From: k
Date: Fri, 27 Feb 2026 09:10:34 -0500
Subject: [PATCH] fixed bos token being prepended twice

---
 data.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/data.py b/data.py
index 8bef979..d3414a5 100644
--- a/data.py
+++ b/data.py
@@ -18,27 +18,26 @@ def dataWorker(q, dataset, encoding, batch_size, block_size,chat):
     batch_x, batch_y = [], []
     while True:
         for text in dataset:
-            tokens = None
+            tokens = []
             if(chat):
-                txt=""
                 for msg in text['messages']:
                     role = msg['role']
                     content = msg['content']
-                    txt = txt + f"<|{role}|>{content}<|end|>"
-                tokens = [encoding.bos_token_id]+encoding.encode(txt)
+                    txt = f"<|{role}|>{content}<|end|> "
+                    tokens += encoding.encode(txt) + [encoding.eos_token_id]
             else:
-                tokens = [encoding.bos_token_id]+encoding.encode(text["text"])
+                tokens = encoding.encode(text["text"])
 
             for i in range(0, len(tokens)-block_size+1,block_size):
                 x = tokens[i:i+block_size]
                 y = tokens[i+1:i+block_size+1]
 
                 if len(x) < block_size:
-                    pad = len(x)-(block_size-1)
-                    x = x + [encoding.eos_token_id] + [encoding.pad_token_id] * pad
+                    pad = len(x)-(block_size)
+                    x = x + [encoding.eos_token_id] * pad
                 if len(y) < block_size:
-                    pad = len(y)-(block_size-1)
-                    y = y + [encoding.eos_token_id] + [encoding.pad_token_id] * pad
+                    pad = len(y)-(block_size)
+                    y = y + [encoding.eos_token_id] * pad
                 batch_x.append(x)
                 batch_y.append(y)
 