Fixed BOS token being prepended twice

This commit is contained in:
k
2026-02-27 09:10:34 -05:00
parent a0cd98876c
commit dc231ae703

17
data.py
View File

@@ -18,27 +18,26 @@ def dataWorker(q, dataset, encoding, batch_size, block_size,chat):
     batch_x, batch_y = [], []
     while True:
         for text in dataset:
-            tokens = None
+            tokens = []
             if(chat):
-                txt=""
                 for msg in text['messages']:
                     role = msg['role']
                     content = msg['content']
-                    txt = txt + f"<|{role}|>{content}<|end|>"
-                tokens = [encoding.bos_token_id]+encoding.encode(txt)
+                    txt = f"<|{role}|>{content}<|end|> "
+                    tokens += encoding.encode(txt) + [encoding.eos_token_id]
             else:
-                tokens = [encoding.bos_token_id]+encoding.encode(text["text"])
+                tokens = encoding.encode(text["text"])
             for i in range(0, len(tokens)-block_size+1,block_size):
                 x = tokens[i:i+block_size]
                 y = tokens[i+1:i+block_size+1]
                 if len(x) < block_size:
-                    pad = len(x)-(block_size-1)
-                    x = x + [encoding.eos_token_id] + [encoding.pad_token_id] * pad
+                    pad = len(x)-(block_size)
+                    x = x + [encoding.eos_token_id] * pad
                 if len(y) < block_size:
-                    pad = len(y)-(block_size-1)
-                    y = y + [encoding.eos_token_id] + [encoding.pad_token_id] * pad
+                    pad = len(y)-(block_size)
+                    y = y + [encoding.eos_token_id] * pad
                 batch_x.append(x)
                 batch_y.append(y)