From 478010c8cccd11c441137a4dbeb8f946206a6633 Mon Sep 17 00:00:00 2001
From: k
Date: Tue, 6 Jan 2026 21:38:12 -0500
Subject: [PATCH 1/4] added Positional encodings

---
 model.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/model.py b/model.py
index 655d623..a70025c 100644
--- a/model.py
+++ b/model.py
@@ -58,13 +58,18 @@ class Block:
         return self
 
 class Transformer():
-    def __init__(self,vocab_size,embed_size,n_heads,n_blocks):
+    def __init__(self,vocab_size,embed_size,n_heads,n_blocks,max_len):
         self.tok_embed = nn.Embedding(vocab_size,embed_size)
+        self.pos_embed = nn.Embedding(max_len,embed_size)
+        self.pos_idx = Tensor.arange(max_len, requires_grad=False)
+
         self.blocks = [Block(embed_size,n_heads) for _ in range(n_blocks)]
         self.norm = nn.RMSNorm(embed_size)
         self.output = nn.Linear(embed_size,vocab_size,bias=False)
     def __call__(self,x):
-        x = self.tok_embed(x)
+        B,T = x.shape
+        pos_embeds = self.pos_embed(self.pos_idx[:T])
+        x = self.tok_embed(x) + pos_embeds
         x = x.sequential(self.blocks)
         x = self.norm(x)
         return self.output(x)
From 229c564811dce0e064e0356e3697ea81e6d44a91 Mon Sep 17 00:00:00 2001
From: k
Date: Wed, 7 Jan 2026 00:26:04 -0500
Subject: [PATCH 2/4] CosineAnnealing with optimizer Group

---
 optm.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 optm.py

diff --git a/optm.py b/optm.py
new file mode 100644
index 0000000..4a99ea0
--- /dev/null
+++ b/optm.py
@@ -0,0 +1,34 @@
+from tinygrad import Tensor, nn
+import math
+
+class CosineLR:
+    """Cosine-annealing LR schedule wrapping a single optimizer or an OptimizerGroup."""
+    def __init__(self,optm,totalSteps,minlr):
+        self.optm = optm
+        # OptimizerGroup has no .lr of its own; the leaf optimizers do
+        self.opts = list(getattr(optm, "optimizers", [optm]))
+        self.maxlr = float(self.opts[0].lr.item())
+        self.minlr = minlr
+        self.totalSteps = totalSteps
+        self.steps = 0
+
+    def step(self):
+        lr = self.minlr + 0.5 * (self.maxlr - self.minlr) * (1 + math.cos((self.steps / self.totalSteps) * math.pi))
+        for o in self.opts:
+            o.lr.assign(Tensor([lr], device=o.lr.device, dtype=o.lr.dtype))
+        self.optm.step()
+        self.steps += 1
+
+    def zero_grad(self):
+        self.optm.zero_grad()
+
+
+def llmOptimizer(params,steps,minlr,startlr=0.02):
+    # Muon for matrix-shaped weights, AdamW for vectors/scalars (biases, norms)
+    muon_params = [p for p in params if len(p.shape) >= 2]
+    adamw_params = [p for p in params if len(p.shape) < 2]
+
+    o1 = nn.optim.Muon(muon_params, lr=startlr)
+    o2 = nn.optim.AdamW(adamw_params, lr=startlr)
+    optimizer = nn.optim.OptimizerGroup(o1, o2)
+    return CosineLR(optimizer,steps,minlr)
From 6daa8ec46cf3f792abdd8047a1d0e73113cb8aba Mon Sep 17 00:00:00 2001
From: k
Date: Wed, 7 Jan 2026 01:15:18 -0500
Subject: [PATCH 3/4] Added code to generate training batches

---
 data.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 data.py

diff --git a/data.py b/data.py
new file mode 100644
index 0000000..938ab87
--- /dev/null
+++ b/data.py
@@ -0,0 +1,39 @@
+import numpy as np
+import threading
+import queue
+
+def startDataWorker(dataset,encoding,batch_size,block_size):
+    data_q = queue.Queue(maxsize=100)
+    t = threading.Thread(target=dataWorker, args=(data_q, dataset, encoding, batch_size, block_size), daemon=True)
+    t.start()
+    while (1):
+        try:
+            bx, by = data_q.get(timeout=30)
+        except queue.Empty:
+            continue
+        yield (bx,by)
+
+def dataWorker(q, dataset, encoding, batch_size, block_size):
+    batch_x, batch_y = [], []
+    while(1):
+        for text in dataset["text"]:
+            tokens = encoding.encode(text)
+            for i in range(0, len(tokens)-block_size-1,block_size):
+                x = [encoding.bos_token_id] + tokens[i:i+block_size-1]
+                y = tokens[i:i+block_size]
+
+                if len(x) < block_size:
+                    pad = (block_size-1) - len(x)
+                    x = x + [encoding.eos_token_id] + [encoding.pad_token_id] * pad
+
+                if len(y) < block_size:
+                    pad = (block_size-1) - len(y)
+                    y = y + [encoding.eos_token_id] + [encoding.pad_token_id] * pad
+
+                batch_x.append(x)
+                batch_y.append(y)
+
+                if len(batch_x) == batch_size:
+                    q.put((np.array(batch_x, dtype=np.int32),
+                           np.array(batch_y, dtype=np.int32)))
+                    batch_x, batch_y = [], []
From 007c96e91bef03b7f7e5edafb262c6d0cf1df6e8 Mon Sep 17 00:00:00 2001
From: k
Date: Wed, 7 Jan 2026 01:25:47 -0500
Subject: [PATCH 4/4] Simple log functions

---
 log.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 log.py

diff --git a/log.py b/log.py
new file mode 100644
index 0000000..cdd56b9
--- /dev/null
+++ b/log.py
@@ -0,0 +1,15 @@
+from tinygrad.nn.state import safe_save
+import csv
+import os
+
+def logLoss(step, loss):
+    path = "loss.csv"
+    exists = os.path.isfile(path)
+    with open(path, mode='a', newline='') as f:
+        writer = csv.writer(f)
+        if not exists:
+            writer.writerow(['step', 'loss'])
+        writer.writerow([step, float(loss)])
+
+def logModel(step,stateDict):
+    safe_save(stateDict, f"gpt_{step}.safetensors")