diff --git a/data.py b/data.py
deleted file mode 100644
index 938ab87..0000000
--- a/data.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import numpy as np
-import threading
-import queue
-
-def startDataWorker(dataset,encoding,batch_size,block_size):
-  data_q = queue.Queue(maxsize=100)
-  t = threading.Thread(target=dataWorker, args=(data_q, dataset, encoding, batch_size, block_size), daemon=True)
-  t.start()
-  while True:
-    try:
-      bx, by = data_q.get(timeout=30)
-    except queue.Empty:
-      continue
-    yield (bx, by)
-
-def dataWorker(q, dataset, encoding, batch_size, block_size):
-  batch_x, batch_y = [], []
-  while True:
-    for text in dataset["text"]:
-      tokens = encoding.encode(text)
-      for i in range(0, len(tokens)-block_size-1, block_size):
-        x = [encoding.bos_token_id] + tokens[i:i+block_size-1]
-        y = tokens[i:i+block_size]
-
-        if len(x) < block_size:
-          pad = (block_size-1) - len(x)
-          x = x + [encoding.eos_token_id] + [encoding.pad_token_id] * pad
-
-        if len(y) < block_size:
-          pad = (block_size-1) - len(y)
-          y = y + [encoding.eos_token_id] + [encoding.pad_token_id] * pad
-
-        batch_x.append(x)
-        batch_y.append(y)
-
-        if len(batch_x) == batch_size:
-          q.put((np.array(batch_x, dtype=np.int32),
-                 np.array(batch_y, dtype=np.int32)))
-          batch_x, batch_y = [], []
diff --git a/log.py b/log.py
deleted file mode 100644
index cdd56b9..0000000
--- a/log.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from tinygrad.nn.state import safe_save
-import csv
-import os
-
-def logLoss(step, loss):
-  path = "loss.csv"
-  exists = os.path.isfile(path)
-  with open(path, mode='a', newline='') as f:
-    writer = csv.writer(f)
-    if not exists:
-      writer.writerow(['step', 'loss'])
-    writer.writerow([step, float(loss)])
-
-def logModel(step,stateDict):
-  safe_save(stateDict, f"gpt_{step}.safetensors")
diff --git a/model.py b/model.py
index a70025c..655d623 100644
--- a/model.py
+++ b/model.py
@@ -58,18 +58,13 @@ class Block:
     return self
 class Transformer():
-  def __init__(self,vocab_size,embed_size,n_heads,n_blocks,max_len):
+  def __init__(self,vocab_size,embed_size,n_heads,n_blocks):
     self.tok_embed = nn.Embedding(vocab_size,embed_size)
-    self.pos_embed = nn.Embedding(block_size,embed_size)
-    self.pos_idx = Tensor.arange(max_len, requires_grad=False)
-
     self.blocks = [Block(embed_size,n_heads) for _ in range(n_blocks)]
     self.norm = nn.RMSNorm(embed_size)
     self.output = nn.Linear(embed_size,vocab_size,bias=False)
 
   def __call__(self,x):
-    B,T = x.shape
-    pos_embeds = self.pos_embed(self.pos_idx[:T])
-    x = self.tok_embed(x) + pos_embeds
+    x = self.tok_embed(x)
     x = x.sequential(self.blocks)
     x = self.norm(x)
     return self.output(x)
diff --git a/optm.py b/optm.py
deleted file mode 100644
index 4a99ea0..0000000
--- a/optm.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from tinygrad import nn
-import math
-
-class CosineLR:
-  def __init__(self,optm,totalSteps,minlr):
-    self.optm = optm
-    self.maxlr = optm.lr
-    self.minlr = minlr
-    self.totalSteps = totalSteps
-    self.steps = 0
-
-  def step(self):
-    self.optm.lr = self.minlr + 0.5 * (self.maxlr - self.minlr) * (1 + math.cos((self.steps / self.totalSteps) * math.pi))
-    self.optm.step()
-    self.steps += 1
-
-  def zero_grad(self):
-    self.optm.zero_grad()
-
-
-def llmOptimizer(params,steps,lr,minlr):
-  muon_params = [p for p in params if len(p.shape) >= 2]
-  adamw_params = [p for p in params if len(p.shape) < 2]
-
-  o1 = nn.optim.Muon(muon_params, lr=lr)
-  o2 = nn.optim.AdamW(adamw_params, lr=lr)
-  optimizer = nn.optim.OptimizerGroup(o1, o2)
-  return CosineLR(optimizer,steps,minlr)
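Note on the model.py hunk: with the learned positional embedding table removed, Transformer injects no position signal at the embedding layer, so token order must be handled elsewhere (e.g., inside Block's attention) if the model is to be position-aware. A minimal smoke test of the new constructor signature, with illustrative hyperparameters (not taken from this repo):

    from tinygrad import Tensor
    from model import Transformer

    # max_len is gone along with the positional embedding table
    model = Transformer(vocab_size=512, embed_size=64, n_heads=4, n_blocks=2)
    tokens = Tensor([[1, 2, 3, 4]])  # (batch, seq_len) of token ids
    logits = model(tokens)           # -> (batch, seq_len, vocab_size)
    print(logits.shape)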