diff --git a/data.py b/data.py
index 56a893a..938ab87 100644
--- a/data.py
+++ b/data.py
@@ -4,19 +4,18 @@ import queue
 
 def startDataWorker(dataset,encoding,batch_size,block_size):
     data_q = queue.Queue(maxsize=100)
-    t = threading.Thread(target=dataWorker, args=(data_q, dataset, encoding, batch_size, block_size), daemon=True)
+    t = threading.Thread(target=data_worker, args=(data_q, dataset, encoding, batch_size, block_size), daemon=True)
     t.start()
     while (1):
         try:
             bx, by = data_q.get(timeout=30)
         except queue.Empty:
-            print("queue empty ...")
             continue
         yield (bx,by)
 
-def dataWorker(q, dataset, encoding, batch_size, block_size):
+def data_worker(q, dataset, encoding, batch_size, block_size):
     batch_x, batch_y = [], []
-    while True:
+    while(1):
         for text in dataset["text"]:
             tokens = encoding.encode(text)
             for i in range(0, len(tokens)-block_size-1,block_size):
@@ -34,7 +33,7 @@ def dataWorker(q, dataset, encoding, batch_size, block_size):
                 batch_x.append(x)
                 batch_y.append(y)
 
-            if len(batch_x) == batch_size:
-                q.put((np.array(batch_x, dtype=np.int32),
-                       np.array(batch_y, dtype=np.int32)))
-                batch_x, batch_y = [], []
+                if len(batch_x) == batch_size:
+                    q.put((np.array(batch_x, dtype=np.int32),
+                           np.array(batch_y, dtype=np.int32)))
+                    batch_x, batch_y = [], []
diff --git a/model.py b/model.py
index 8df1c46..a70025c 100644
--- a/model.py
+++ b/model.py
@@ -58,10 +58,10 @@ class Block:
         return self
 
 class Transformer():
-    def __init__(self,vocab_size,embed_size,n_heads,n_blocks,block_size):
+    def __init__(self,vocab_size,embed_size,n_heads,n_blocks,max_len):
         self.tok_embed = nn.Embedding(vocab_size,embed_size)
-        self.pos_embed = nn.Embedding(block_size,embed_size)
-        self.pos_idx = Tensor.arange(block_size, requires_grad=False)
+        self.pos_embed = nn.Embedding(max_len,embed_size)
+        self.pos_idx = Tensor.arange(max_len, requires_grad=False)
         self.blocks = [Block(embed_size,n_heads) for _ in range(n_blocks)]
         self.norm = nn.RMSNorm(embed_size)
 
diff --git a/optm.py b/optm.py
index 1397ab6..4a99ea0 100644
--- a/optm.py
+++ b/optm.py
@@ -1,18 +1,16 @@
 from tinygrad import Tensor,nn
 import math
 
 class CosineLR:
-    def __init__(self,optm,totalSteps,maxlr,minlr):
+    def __init__(self,optm,totalSteps,minlr):
         self.optm = optm
-        self.maxlr = maxlr
+        self.maxlr = optm.lr
         self.minlr = minlr
         self.totalSteps = totalSteps
         self.steps = 0
 
     def step(self):
-        lr = self.minlr + 0.5 * (self.maxlr - self.minlr) * (1 + math.cos((self.steps / self.totalSteps) * math.pi))
-        for o in self.optm:
-            o.lr = lr
+        self.optm.lr = self.minlr + 0.5 * (self.maxlr - self.minlr) * (1 + math.cos((self.steps / self.totalSteps) * math.pi))
         self.optm.step()
         self.steps += 1
 
@@ -20,11 +18,11 @@ class CosineLR:
         self.optm.zero_grad()
 
 
 def llmOptimizer(params,steps,maxlr,minlr):
     muon_params = [p for p in params if len(p.shape) >= 2]
     adamw_params = [p for p in params if len(p.shape) < 2]
     o1 = nn.optim.Muon(muon_params, lr=maxlr)
     o2 = nn.optim.AdamW(adamw_params, lr=maxlr)
     optimizer = nn.optim.OptimizerGroup(o1,o2)
-    return CosineLR(optimizer,steps,maxlr,minlr)
+    return CosineLR(optimizer,steps,minlr)
 
diff --git a/train.py b/train.py
deleted file mode 100644
index f231764..0000000
--- a/train.py
+++ /dev/null
@@ -1,77 +0,0 @@
-from concurrent.futures import ThreadPoolExecutor
-from tinygrad import Tensor,TinyJit,Device,nn
-from tinygrad.nn.state import get_state_dict
-from model import Transformer
-from transformers import AutoTokenizer
-from datasets import load_dataset
-from tqdm import tqdm
-import optm
-import data
-import log
-
-hypr = {
-    "embed_size": 256,
-    "n_heads": 4,
-    "n_blocks": 4,
-    "block_size": 256,
-    "batch_size": 16,
-    "starting_lr": 3e-4,
-    "minimum_lr": 3e-5,
-    "warmup": 1_000,
-    "steps": 5_000,
-    "encoding": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    "dataset": "HuggingFaceTB/smollm-corpus",
-    "subset": "cosmopedia-v2",
-}
-
-print(Device.DEFAULT)
-
-#for loging
-loger = ThreadPoolExecutor(max_workers=2)
-
-dataset = load_dataset(hypr["dataset"],
-                       hypr["subset"],
-                       split="train",
-                       streaming=True)
-encoding = AutoTokenizer.from_pretrained(hypr["encoding"])
-hypr["vocab_size"] = encoding.vocab_size
-model = Transformer(hypr["vocab_size"],hypr["embed_size"],hypr["n_heads"],hypr["n_blocks"],hypr["block_size"])
-batch = data.startDataWorker(dataset,encoding,hypr["batch_size"],hypr["block_size"])
-
-params = nn.state.get_parameters(model)
-optimizer = optm.llmOptimizer(params,hypr["steps"],hypr["starting_lr"],hypr["minimum_lr"])
-
-@TinyJit
-def step(x,y):
-    optimizer.zero_grad()
-
-    logits = model(x)
-    B,T,C = logits.shape
-    logits = logits.view(B*T,C)
-    y = y.view(B*T)
-    loss = logits.cross_entropy(y)
-
-    loss.backward()
-    optimizer.step()
-    return loss
-
-Tensor.training=True
-bar = tqdm(range(hypr["steps"]))
-
-for steps in bar:
-    nx, ny = next(batch)
-    x = Tensor(nx, device=Device.DEFAULT).realize()
-    y = Tensor(ny, device=Device.DEFAULT).realize()
-    loss = step(x, y)
-    if steps % 10 == 0:
-        l = loss.numpy()
-        loger.submit(log.logLoss, steps, l)
-        bar.set_postfix(loss= f"{l:.4f}")
-    if steps % 500 == 0:
-        loss.realize()
-        m = get_state_dict(model)
-        log.logModel(steps,m)
-        #TODO non sycronus safetensor loging
-        #loger.submit(log.logModel,steps,m)
-
-loger.shutdown(wait=True)