From 7f25dff1d1a1c196d64badc44c801bc4096edd73 Mon Sep 17 00:00:00 2001
From: k
Date: Wed, 7 Jan 2026 02:13:08 -0500
Subject: [PATCH] Fix data worker, positional embedding, and LR scheduler bugs

- data.py: point the thread target at dataWorker (the function is named
  dataWorker, not data_worker), print a notice when the queue read times
  out, and fix the indentation of the batch flush in dataWorker.
- model.py: take block_size instead of the unused max_len so pos_embed and
  pos_idx are sized consistently.
- optm.py: import nn, pass maxlr explicitly instead of reading optm.lr or an
  undefined hypr dict, compute the schedule from self.steps, apply the new
  lr to every optimizer in the group, and build OptimizerGroup with varargs.
---
 data.py  | 13 +++++++------
 model.py |  4 ++--
 optm.py  | 20 +++++++++++---------
 3 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/data.py b/data.py
index 938ab87..56a893a 100644
--- a/data.py
+++ b/data.py
@@ -4,18 +4,19 @@ import queue
 
 def startDataWorker(dataset,encoding,batch_size,block_size):
     data_q = queue.Queue(maxsize=100)
-    t = threading.Thread(target=data_worker, args=(data_q, dataset, encoding, batch_size, block_size), daemon=True)
+    t = threading.Thread(target=dataWorker, args=(data_q, dataset, encoding, batch_size, block_size), daemon=True)
     t.start()
     while (1):
         try:
             bx, by = data_q.get(timeout=30)
         except queue.Empty:
+            print("queue empty ...")
             continue
         yield (bx,by)
 
 def dataWorker(q, dataset, encoding, batch_size, block_size):
     batch_x, batch_y = [], []
-    while(1):
+    while True:
         for text in dataset["text"]:
             tokens = encoding.encode(text)
             for i in range(0, len(tokens)-block_size-1,block_size):
@@ -33,7 +34,7 @@ def dataWorker(q, dataset, encoding, batch_size, block_size):
                 batch_x.append(x)
                 batch_y.append(y)
 
-            if len(batch_x) == batch_size:
-                q.put((np.array(batch_x, dtype=np.int32),
-                       np.array(batch_y, dtype=np.int32)))
-                batch_x, batch_y = [], []
+                if len(batch_x) == batch_size:
+                    q.put((np.array(batch_x, dtype=np.int32),
+                           np.array(batch_y, dtype=np.int32)))
+                    batch_x, batch_y = [], []
diff --git a/model.py b/model.py
index a70025c..8df1c46 100644
--- a/model.py
+++ b/model.py
@@ -58,10 +58,10 @@ class Block:
         return self
 
 class Transformer():
-    def __init__(self,vocab_size,embed_size,n_heads,n_blocks,max_len):
+    def __init__(self,vocab_size,embed_size,n_heads,n_blocks,block_size):
         self.tok_embed = nn.Embedding(vocab_size,embed_size)
         self.pos_embed = nn.Embedding(block_size,embed_size)
-        self.pos_idx = Tensor.arange(max_len, requires_grad=False)
+        self.pos_idx = Tensor.arange(block_size, requires_grad=False)
         self.blocks = [Block(embed_size,n_heads) for _ in range(n_blocks)]
         self.norm = nn.RMSNorm(embed_size)
 
diff --git a/optm.py b/optm.py
index 4a99ea0..1397ab4 100644
--- a/optm.py
+++ b/optm.py
@@ -1,16 +1,18 @@
-from tinygrad import Tensor
+from tinygrad import Tensor,nn
 import math
 
 class CosineLR:
-    def __init__(self,optm,totalSteps,minlr):
+    def __init__(self,optm,totalSteps,maxlr,minlr):
         self.optm = optm
-        self.maxlr = optm.lr
+        self.maxlr = maxlr
         self.minlr = minlr
         self.totalSteps = totalSteps
         self.steps = 0
 
     def step(self):
-        self.optm.lr = self.minlr + 0.5 * (self.maxlr - self.minlr) * (1 + math.cos((step / self.totalSteps) * math.pi))
+        lr = self.minlr + 0.5 * (self.maxlr - self.minlr) * (1 + math.cos((self.steps / self.totalSteps) * math.pi))
+        for o in self.optm:
+            o.lr = lr
         self.optm.step()
         self.steps += 1
 
@@ -18,11 +20,11 @@ class CosineLR:
         self.optm.zero_grad()
 
 
-def llmOptimizer(params,steps,minlr):
+def llmOptimizer(params,steps,maxlr,minlr):
     muon_params = [p for p in params if len(p.shape) >= 2]
     adamw_params = [p for p in params if len(p.shape) < 2]
-    o1 = nn.optim.Muon(muon_params, lr=hypr["starting_lr"])
-    o2 = nn.optim.AdamW(adamw_params, lr=hypr["starting_lr"])
-    optimizer = nn.optim.OptimizerGroup([o1,o2])
-    return CosineLR(optimizer,steps,minlr)
+    o1 = nn.optim.Muon(muon_params, lr=maxlr)
+    o2 = nn.optim.AdamW(adamw_params, lr=maxlr)
+    optimizer = nn.optim.OptimizerGroup(o1,o2)
+    return CosineLR(optimizer,steps,maxlr,minlr)
 
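
A note on the optm.py change: CosineLR.step() now derives the learning rate
from self.steps rather than the previously undefined name step, so the
schedule decays from maxlr toward minlr over totalSteps calls. The following
dependency-free sketch evaluates just that expression at a few points; the
totalSteps/maxlr/minlr values are made up for illustration and are not taken
from the patch.

import math

def cosine_lr(step, total_steps, maxlr, minlr):
    # Same expression as the patched line in CosineLR.step(), with the step
    # counter passed in explicitly instead of read from self.steps.
    return minlr + 0.5 * (maxlr - minlr) * (1 + math.cos((step / total_steps) * math.pi))

total_steps, maxlr, minlr = 1000, 3e-4, 3e-5            # illustrative values only
print(cosine_lr(0, total_steps, maxlr, minlr))          # 3e-04: starts at maxlr
print(cosine_lr(500, total_steps, maxlr, minlr))        # 1.65e-04: midpoint of the two
print(cosine_lr(1000, total_steps, maxlr, minlr))       # 3e-05: ends at minlr

Because step() reads self.steps before incrementing it, the first optimizer
step runs at maxlr and the rate only reaches minlr once steps equals
totalSteps; stepping past totalSteps would climb back up the cosine curve.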
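
For context on how the three patched files are meant to interact, here is a
rough training-loop sketch. It is only an illustration: the hyperparameter
values, the byte-level stand-in tokenizer and toy dataset, and the assumption
that Transformer is callable on a (batch, block) array of token ids and
returns per-token logits are all mine, not part of the patch; the real
project presumably supplies its own dataset, encoding, and loss.

from tinygrad import Tensor, nn

from data import startDataWorker       # patched generator
from model import Transformer          # patched constructor signature
from optm import llmOptimizer          # returns a CosineLR wrapping Muon + AdamW

class ByteEncoding:                     # stand-in tokenizer, not from the patch
    def encode(self, text): return list(text.encode("utf-8"))

dataset = {"text": ["a tiny stand-in corpus for illustration. " * 40]}
batch_size, block_size, steps = 4, 64, 100          # illustrative values only

model = Transformer(256, 128, 4, 2, block_size)     # vocab of 256 byte values
sched = llmOptimizer(nn.state.get_parameters(model), steps, maxlr=3e-4, minlr=3e-5)
batches = startDataWorker(dataset, ByteEncoding(), batch_size, block_size)

with Tensor.train():                    # tinygrad optimizers require training mode
    for _, (bx, by) in zip(range(steps), batches):
        sched.zero_grad()
        logits = model(Tensor(bx))      # assumed to return (batch, block, vocab) logits
        loss = logits.reshape(-1, logits.shape[-1]).sparse_categorical_crossentropy(
            Tensor(by).reshape(-1))
        loss.backward()
        sched.step()                    # sets the cosine lr on both optimizers, then steps the group

Note also that the patched step() writes the decayed rate onto each optimizer
in the group rather than onto the OptimizerGroup wrapper, so both Muon and
AdamW actually see the schedule.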