Fixed BOS token being prepended twice
This commit is contained in:
17
data.py
17
data.py
@@ -18,27 +18,26 @@ def dataWorker(q, dataset, encoding, batch_size, block_size,chat):
|
|||||||
batch_x, batch_y = [], []
|
batch_x, batch_y = [], []
|
||||||
while True:
|
while True:
|
||||||
for text in dataset:
|
for text in dataset:
|
||||||
tokens = None
|
tokens = []
|
||||||
if(chat):
|
if(chat):
|
||||||
txt=""
|
|
||||||
for msg in text['messages']:
|
for msg in text['messages']:
|
||||||
role = msg['role']
|
role = msg['role']
|
||||||
content = msg['content']
|
content = msg['content']
|
||||||
txt = txt + f"<|{role}|>{content}<|end|>"
|
txt = f"<|{role}|>{content}<|end|> "
|
||||||
tokens = [encoding.bos_token_id]+encoding.encode(txt)
|
tokens += encoding.encode(txt) + [encoding.eos_token_id]
|
||||||
else:
|
else:
|
||||||
tokens = [encoding.bos_token_id]+encoding.encode(text["text"])
|
tokens = encoding.encode(text["text"])
|
||||||
for i in range(0, len(tokens)-block_size+1,block_size):
|
for i in range(0, len(tokens)-block_size+1,block_size):
|
||||||
x = tokens[i:i+block_size]
|
x = tokens[i:i+block_size]
|
||||||
y = tokens[i+1:i+block_size+1]
|
y = tokens[i+1:i+block_size+1]
|
||||||
|
|
||||||
if len(x) < block_size:
|
if len(x) < block_size:
|
||||||
pad = len(x)-(block_size-1)
|
pad = len(x)-(block_size)
|
||||||
x = x + [encoding.eos_token_id] + [encoding.pad_token_id] * pad
|
x = x + [encoding.eos_token_id] * pad
|
||||||
|
|
||||||
if len(y) < block_size:
|
if len(y) < block_size:
|
||||||
pad = len(y)-(block_size-1)
|
pad = len(y)-(block_size)
|
||||||
y = y + [encoding.eos_token_id] + [encoding.pad_token_id] * pad
|
y = y + [encoding.eos_token_id] * pad
|
||||||
|
|
||||||
batch_x.append(x)
|
batch_x.append(x)
|
||||||
batch_y.append(y)
|
batch_y.append(y)
|
||||||
|
|||||||
Reference in New Issue
Block a user