diff --git a/data.py b/data.py
new file mode 100644
index 0000000..938ab87
--- /dev/null
+++ b/data.py
@@ -0,0 +1,41 @@
+import numpy as np
+import threading
+import queue
+
+def startDataWorker(dataset,encoding,batch_size,block_size):
+  # spawn a daemon worker that tokenizes in the background, then yield batches from its queue
+  data_q = queue.Queue(maxsize=100)
+  t = threading.Thread(target=dataWorker, args=(data_q, dataset, encoding, batch_size, block_size), daemon=True)
+  t.start()
+  while True:
+    try:
+      bx, by = data_q.get(timeout=30)
+    except queue.Empty:
+      continue
+    yield (bx,by)
+
+def dataWorker(q, dataset, encoding, batch_size, block_size):
+  batch_x, batch_y = [], []
+  while True:  # cycle over the dataset indefinitely
+    for text in dataset["text"]:
+      tokens = encoding.encode(text)
+      # step through the text in block_size chunks; the final partial chunk is padded below
+      for i in range(0, len(tokens)-1, block_size):
+        x = [encoding.bos_token_id] + tokens[i:i+block_size-1]
+        y = tokens[i:i+block_size]
+
+        if len(x) < block_size:
+          pad = block_size - len(x) - 1
+          x = x + [encoding.eos_token_id] + [encoding.pad_token_id] * pad
+
+        if len(y) < block_size:
+          pad = block_size - len(y) - 1
+          y = y + [encoding.eos_token_id] + [encoding.pad_token_id] * pad
+
+        batch_x.append(x)
+        batch_y.append(y)
+
+        if len(batch_x) == batch_size:
+          q.put((np.array(batch_x, dtype=np.int32),
+                 np.array(batch_y, dtype=np.int32)))
+          batch_x, batch_y = [], []
diff --git a/log.py b/log.py
new file mode 100644
index 0000000..cdd56b9
--- /dev/null
+++ b/log.py
@@ -0,0 +1,17 @@
+from tinygrad.nn.state import safe_save
+import csv
+import os
+
+def logLoss(step, loss):
+  # append one (step, loss) row per call; write the header only when the file is first created
+  path = "loss.csv"
+  exists = os.path.isfile(path)
+  with open(path, mode='a', newline='') as f:
+    writer = csv.writer(f)
+    if not exists:
+      writer.writerow(['step', 'loss'])
+    writer.writerow([step, float(loss)])
+
+def logModel(step, stateDict):
+  # checkpoint the weights as a safetensors file tagged with the step number
+  safe_save(stateDict, f"gpt_{step}.safetensors")
diff --git a/model.py b/model.py
index 655d623..a70025c 100644
--- a/model.py
+++ b/model.py
@@ -58,13 +58,18 @@ class Block:
     return self
 
 class Transformer():
-  def __init__(self,vocab_size,embed_size,n_heads,n_blocks):
+  def __init__(self,vocab_size,embed_size,n_heads,n_blocks,max_len):
     self.tok_embed = nn.Embedding(vocab_size,embed_size)
+    self.pos_embed = nn.Embedding(max_len,embed_size)  # learned position embeddings, one per position up to max_len
+    self.pos_idx = Tensor.arange(max_len, requires_grad=False)
+
     self.blocks = [Block(embed_size,n_heads) for _ in range(n_blocks)]
     self.norm = nn.RMSNorm(embed_size)
     self.output = nn.Linear(embed_size,vocab_size,bias=False)
   def __call__(self,x):
-    x = self.tok_embed(x)
+    B,T = x.shape
+    pos_embeds = self.pos_embed(self.pos_idx[:T])
+    x = self.tok_embed(x) + pos_embeds  # token embeddings plus position embeddings
     x = x.sequential(self.blocks)
     x = self.norm(x)
     return self.output(x)
diff --git a/optm.py b/optm.py
new file mode 100644
index 0000000..4a99ea0
--- /dev/null
+++ b/optm.py
@@ -0,0 +1,33 @@
+from tinygrad import nn
+import math
+
+class CosineLR:
+  def __init__(self,optm,totalSteps,maxlr,minlr):
+    self.optm = optm
+    self.maxlr = maxlr
+    self.minlr = minlr
+    self.totalSteps = totalSteps
+    self.steps = 0
+
+  def step(self):
+    # cosine decay from maxlr down to minlr over totalSteps
+    new_lr = self.minlr + 0.5 * (self.maxlr - self.minlr) * (1 + math.cos((self.steps / self.totalSteps) * math.pi))
+    # an OptimizerGroup has no lr of its own, so push the new rate to every member optimizer
+    for o in getattr(self.optm, "optimizers", [self.optm]):
+      o.lr.assign(o.lr.full_like(new_lr))
+    self.optm.step()
+    self.steps += 1
+
+  def zero_grad(self):
+    self.optm.zero_grad()
+
+
+def llmOptimizer(params,steps,starting_lr,minlr):
+  # Muon for the matrix-shaped weights, AdamW for 1-D params (norm gains, biases)
+  muon_params = [p for p in params if len(p.shape) >= 2]
+  adamw_params = [p for p in params if len(p.shape) < 2]
+
+  o1 = nn.optim.Muon(muon_params, lr=starting_lr)
+  o2 = nn.optim.AdamW(adamw_params, lr=starting_lr)
+  optimizer = nn.optim.OptimizerGroup(o1, o2)
+  return CosineLR(optimizer,steps,starting_lr,minlr)
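
A quick illustration of the (x, y) layout the new dataWorker produces: each target y[t] is the token that should follow input x[t], so the model sees BOS plus the first block_size-1 tokens of a chunk and predicts that chunk shifted by one. The ToyTok tokenizer below is hypothetical, just enough to drive the worker:

    import queue, threading
    from data import dataWorker

    class ToyTok:  # hypothetical tokenizer exposing only what dataWorker needs
      bos_token_id, eos_token_id, pad_token_id = 0, 1, 2
      def encode(self, s): return [int(c) + 10 for c in s]

    q = queue.Queue(maxsize=1)
    threading.Thread(target=dataWorker, args=(q, {"text": ["0123456789"]}, ToyTok(), 1, 4), daemon=True).start()
    bx, by = q.get(timeout=5)
    print(bx[0], by[0])  # [ 0 10 11 12] [10 11 12 13] -> BOS-shifted input vs. next-token targets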
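
The schedule in CosineLR is a standard cosine decay, lr(s) = minlr + 0.5*(maxlr - minlr)*(1 + cos(pi*s/totalSteps)): it starts at maxlr, crosses the midpoint (maxlr + minlr)/2 halfway through, and lands on minlr. A standalone check with arbitrary example rates:

    import math

    def cosine_lr(s, total, maxlr, minlr):
      return minlr + 0.5 * (maxlr - minlr) * (1 + math.cos((s / total) * math.pi))

    assert abs(cosine_lr(0,   100, 3e-4, 3e-5) - 3e-4)    < 1e-12  # start: maxlr
    assert abs(cosine_lr(50,  100, 3e-4, 3e-5) - 1.65e-4) < 1e-12  # halfway: (maxlr+minlr)/2
    assert abs(cosine_lr(100, 100, 3e-4, 3e-5) - 3e-5)    < 1e-12  # end: minlr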
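
The PR doesn't include the training script itself, so here is a minimal end-to-end sketch of how the four new modules might be wired together; the ByteTokenizer, the placeholder corpus, and every hyperparameter value are illustrative assumptions, not code from this change:

    from tinygrad import Tensor
    from tinygrad.nn.state import get_parameters, get_state_dict

    from data import startDataWorker
    from log import logLoss, logModel
    from model import Transformer
    from optm import llmOptimizer

    class ByteTokenizer:  # toy stand-in so the sketch runs end to end
      bos_token_id, eos_token_id, pad_token_id = 256, 257, 258
      def encode(self, s): return list(s.encode("utf-8"))

    batch_size, block_size, steps = 8, 64, 1000
    encoding = ByteTokenizer()
    dataset = {"text": ["hello tinygrad " * 200]}  # placeholder corpus

    model = Transformer(259, 128, 4, 2, block_size)  # vocab = 256 bytes + BOS/EOS/PAD
    sched = llmOptimizer(get_parameters(model), steps, starting_lr=3e-4, minlr=3e-5)

    batches = startDataWorker(dataset, encoding, batch_size, block_size)
    with Tensor.train():
      for step, (bx, by) in zip(range(steps), batches):
        logits = model(Tensor(bx))
        loss = logits.sparse_categorical_crossentropy(Tensor(by), ignore_index=encoding.pad_token_id)
        sched.zero_grad()
        loss.backward()
        sched.step()
        logLoss(step, loss.item())
        if (step + 1) % 500 == 0: logModel(step + 1, get_state_dict(model))

Pad positions are excluded from the loss via ignore_index, which pairs with the pad_token_id padding dataWorker applies to the final partial chunk of each text.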