from tinygrad import Tensor, nn, TinyJit


class MultiHeadAttention:
  def __init__(self): pass  # TODO
  def __call__(self): pass  # TODO
  def cast(self): pass  # TODO


class FeedForwardNetwork:
  def __init__(self, embedding_size, ratio=(8/3)):
    hidden_size = int(embedding_size * ratio)
    self.norm = nn.RMSNorm(embedding_size)
    self.gate = nn.Linear(embedding_size, hidden_size, bias=False)
    self.up = nn.Linear(embedding_size, hidden_size, bias=False)
    self.down = nn.Linear(hidden_size, embedding_size, bias=False)

  def __call__(self, x):
    # pre-norm SwiGLU feed-forward: down(silu(gate(x)) * up(x))
    x = self.norm(x)
    return self.down(self.gate(x).silu() * self.up(x))

  def cast(self, dtype):
    self.gate.weight = self.gate.weight.cast(dtype)
    self.up.weight = self.up.weight.cast(dtype)
    self.down.weight = self.down.weight.cast(dtype)


class Block:
  def __init__(self): pass  # TODO
  def __call__(self): pass  # TODO
  def cast(self): pass  # TODO


class Transformer:
  def __init__(self): pass  # TODO
  def __call__(self): pass  # TODO
  def cast(self): pass  # TODO
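

# A minimal sanity check of FeedForwardNetwork; the shapes used here
# (batch=2, seq_len=16, embedding_size=64) are arbitrary assumptions
# chosen only for illustration, not values taken from the model above.
if __name__ == "__main__":
  x = Tensor.randn(2, 16, 64)   # (batch, seq_len, embedding_size)
  ffn = FeedForwardNetwork(64)
  out = ffn(x)
  print(out.shape)              # expected: (2, 16, 64), same shape as the input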