From 64e66260ecb933dfdeae969899e6ebc596f4a1ab Mon Sep 17 00:00:00 2001 From: k Date: Wed, 12 Nov 2025 12:10:52 -0500 Subject: [PATCH 1/6] simple vae style model --- data.py | 9 --------- dataInit.py | 8 ++++---- model.py | 28 ++++++++++++++-------------- train.py | 34 +++++++++++++++------------------- 4 files changed, 33 insertions(+), 46 deletions(-) diff --git a/data.py b/data.py index 137ecef..4c4f3c8 100644 --- a/data.py +++ b/data.py @@ -22,15 +22,9 @@ def process_file(file_path): end = start_pos + size if end <= sample_len: chunk = y[start_pos:end] - chunk = librosa.feature.melspectrogram(y=chunk, sr=SAMPLE_RATE) - chunk = ((librosa.amplitude_to_db(chunk,ref=np.max)+80)/80) - - #chunk = librosa.feature.melspectrogram(y=chunk,sr=SAMPLE_RATE) - #chunk = ((librosa.amplitude_to_db(chunk,ref=np.max)+40)/40) file_chunks.append(chunk) return file_chunks -#@mlflow.trace def load(): """ Load 10 second chunks of songs. @@ -44,9 +38,6 @@ def load(): audio.extend(l) return audio - - -##DEP def audio_split(audio): """ Split 10 seconds of audio to 2 5 second clips diff --git a/dataInit.py b/dataInit.py index 8eb3a6e..1776e4b 100644 --- a/dataInit.py +++ b/dataInit.py @@ -1,14 +1,14 @@ import data import numpy as np -x = data.load() +x,y = data.dataset(data.load()) size=len(x) print(size) x_np = np.stack(x) x_np = np.expand_dims(x_np, axis=1) -#y_np = np.stack(y) -#y_np = np.expand_dims(y_np, axis=1) +y_np = np.stack(y) +y_np = np.expand_dims(y_np, axis=1) -np.savez_compressed("data",x_np) +np.savez_compressed("data",x_np,y_np) diff --git a/model.py b/model.py index 93add80..dd18bd2 100644 --- a/model.py +++ b/model.py @@ -1,40 +1,39 @@ from tinygrad import Tensor, nn class gen: - def __init__(self, input_channels=1, height=128, width=431, latent_dim=64): + def __init__(self, input_channels=1, height=128, width=216, latent_dim=1024): self.height = height self.width = width self.latent_dim = latent_dim - self.w = width // 4 - self.h = height // 4 - self.h = 32 # Output 
height after 2 strides - self.w = 108 # Output width after 2 strides - self.flattened_size = 128 * self.h * self.w + self.w = width // 8 + self.h = height // 8 + self.flattened_size = 256 * self.h * self.w self.e1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1) self.e2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1) + self.e3 = nn.Conv2d(128,256, kernel_size=3,stride=2,padding=1) + self.el = nn.Linear(self.flattened_size, self.latent_dim) - self.q = nn.Linear(self.latent_dim,self.latent_dim) - self.k = nn.Linear(self.latent_dim,self.latent_dim) - self.v = nn.Linear(self.latent_dim,self.latent_dim) self.dl = nn.Linear(self.latent_dim, self.flattened_size) - self.d1 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1) - self.d2 = nn.ConvTranspose2d(64, input_channels, kernel_size=3, stride=2, padding=1, output_padding=1) + self.d1 = nn.ConvTranspose2d(256,128,kernel_size=3,stride=2,padding=1,output_padding=1) + self.d2 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1) + self.d3 = nn.ConvTranspose2d(64, input_channels, kernel_size=3, stride=2, padding=1, output_padding=1) def __call__(self, x: Tensor) -> Tensor: y, shape = self.encode(x) - z = self.atten(y) + z = y#self.atten(y) return self.decode(z, shape) def encode(self, x: Tensor): x = self.e1(x).leakyrelu() x = self.e2(x).leakyrelu() + x = self.e3(x).leakyrelu() b, c, h, w = x.shape flattened_size = c * h * w @@ -52,9 +51,10 @@ class gen: def decode(self, z: Tensor, shape): x = self.dl(z).leakyrelu() - x = x.reshape(shape=(-1, 128, self.h, self.w)) + x = x.reshape(shape=(-1, 256, self.h, self.w)) x = self.d1(x).leakyrelu() - x = self.d2(x).sigmoid() + x = self.d2(x).leakyrelu() + x = self.d3(x).sigmoid() # Crop or pad to match input size out_h, out_w = x.shape[2], x.shape[3] diff --git a/train.py b/train.py index 14f6de3..e28c5ef 100644 --- a/train.py +++ b/train.py @@ -8,27 +8,25 @@ from model import gen BATCH_SIZE = 16 
EPOCHS = 100 -LEARNING_RATE = 1e-5 +LEARNING_RATE = 3e-4 print(Device.DEFAULT) mdl = gen() opt = nn.optim.AdamW(nn.state.get_parameters(mdl), lr=LEARNING_RATE) -volume = 0.1 def spec_loss(pred, target, eps=1e-6): # spectral convergence sc = ((target - pred).square().sum()) ** 0.5 / ((target.square().sum()) ** 0.5 + eps) # log magnitude difference log_mag = ((target.abs() + eps).log() - (pred.abs() + eps).log()).abs().mean() - return sc + log_mag + return 0.1*sc + 1.0*log_mag + 0.1*(pred - target).abs().mean() @TinyJit -def step_gen(x): +def step_gen(x,y): Tensor.training = True - noise = Tensor.rand_like(x).tanh() - y = x+(noise*volume) - y = y.clamp(0,1) - loss = spec_loss(mdl(y),x) + z = mdl(x) + loss = spec_loss(z,y) + #loss = (y - z).abs().mean() opt.zero_grad() loss.backward() opt.step() @@ -36,8 +34,8 @@ def step_gen(x): print("loading") x = np.load("data.npz")["arr_0"] -#x= x[0:64] -run_name = f"tinygrad_autoencoder_{int(time.time())}" +y = np.load("data.npz")["arr_1"] +run_name = f"vae_{int(time.time())}" mlflow.set_tracking_uri("http://127.0.0.1:5000") mlflow.start_run() mlflow.log_params({"batch_size": BATCH_SIZE, "epochs": EPOCHS, "lr": LEARNING_RATE, "data size":len(x)}) @@ -52,20 +50,18 @@ for epoch in range(0,EPOCHS): loss=0 for i in range(0,len(x),BATCH_SIZE): tx=Tensor(x[i:i+BATCH_SIZE]) + ty=Tensor(y[i:i+BATCH_SIZE]) if(tx.shape != eshape): continue - loss += step_gen(tx) + loss += step_gen(tx,ty) loss /= (len(x)/BATCH_SIZE) if epoch%5==0: - noise = Tensor.rand_like(Tensor(x[0:1])).tanh() - y = Tensor(x[0:1]) + (noise*volume) - show.logSpec(mdl(y).numpy()[0][0],epoch) - if(pl - loss < 0.03 and epoch > 25): - show.logSpec(y.numpy()[0][0],f"volume_{volume}") - volume *= 2 - pl = loss + show.logSpec(mdl(Tensor(x[0:1])).numpy()[0][0],epoch) + if epoch%15==0: + state_dict = get_state_dict(mdl) + safe_save(state_dict, f"model_{epoch}.safetensors") + show.logSpec(mdl(mdl(mdl(Tensor(y[0:1])))).numpy()[0][0],f"deep_{epoch}") - mlflow.log_metric("volume", 
volume, step=epoch) mlflow.log_metric("loss", loss, step=epoch) print(f"loss of {loss}") From 579b37cd70e89dba4861baec5234347e34b9ae48 Mon Sep 17 00:00:00 2001 From: k Date: Wed, 12 Nov 2025 12:11:57 -0500 Subject: [PATCH 2/6] add player script and fix bug --- data.py | 10 +++++++++- run.py | 43 +++++++++++++++++++++++++++++++++++++++++++ train.py | 8 ++++++-- 3 files changed, 58 insertions(+), 3 deletions(-) create mode 100644 run.py diff --git a/data.py b/data.py index 4c4f3c8..6c83a51 100644 --- a/data.py +++ b/data.py @@ -6,7 +6,15 @@ import mlflow SAMPLE_RATE = 22050 -#@mlflow.trace +def spec_to_audio(spec): + """ + Convert a normalized mel-spectrogram back to audio. + """ + spec = (spec * 80) - 80 + spec = librosa.db_to_amplitude(spec)*80 + audio = librosa.feature.inverse.mel_to_audio(spec,sr=SAMPLE_RATE) + return audio + def process_file(file_path): """ Load 10 second chunks single song. diff --git a/run.py b/run.py new file mode 100644 index 0000000..eef1d5c --- /dev/null +++ b/run.py @@ -0,0 +1,43 @@ +import numpy as np +import random +import time +from tinygrad import Tensor, nn +from tinygrad.nn.state import safe_load, load_state_dict +import librosa +import sounddevice as sd +from model import gen +from data import spec_to_audio + +SAMPLE_RATE = 22050 + +def load_model(filepath="model.safetensors"): + """Loads the model structure and weights.""" + model = gen() + state_dict = safe_load(filepath) + load_state_dict(model, state_dict) + return model + +def load_data(filepath="data.npz"): + """Loads the pre-processed spectrogram data.""" + print(f"Loading data from {filepath}...") + data = np.load(filepath) + x = data["arr_0"] + return x + +def play_spec(spec,i): + """Converts a spectrogram numpy array to audio and plays it.""" + audio = spec_to_audio(spec) + sd.wait() + print(f"chunk:{i}") + sd.play(audio, samplerate=SAMPLE_RATE) + +def run_prediction_loop(model, data_x): + current_spect = data_x[0:1] + for i in range(10): + 
play_spec(current_spect[0][0],i) + current_spect = model(Tensor(current_spect)).numpy() + +if __name__ == "__main__": + model = load_model() + data_x = load_data() + run_prediction_loop(model, data_x) diff --git a/train.py b/train.py index e28c5ef..e912da5 100644 --- a/train.py +++ b/train.py @@ -1,6 +1,7 @@ import mlflow import numpy as np from tinygrad import Device,Tensor,nn,TinyJit +from tinygrad.nn.state import safe_save, get_state_dict import matplotlib.pyplot as plt import time import show @@ -43,8 +44,7 @@ mlflow.log_params({"batch_size": BATCH_SIZE, "epochs": EPOCHS, "lr": LEARNING_RA show.logSpec(Tensor(x[0:1]).numpy()[0][0],"default") print("training") -pl = 0 -eshape = (BATCH_SIZE, 1, 128, 431) +eshape = (BATCH_SIZE, 1, 128, 216) for epoch in range(0,EPOCHS): print(f"\n--- Starting Epoch {epoch} ---\n") loss=0 @@ -65,3 +65,7 @@ for epoch in range(0,EPOCHS): mlflow.log_metric("loss", loss, step=epoch) print(f"loss of {loss}") + +show.logSpec(mdl(Tensor(x[0:1])).numpy()[0][0],EPOCHS) +state_dict = get_state_dict(mdl) +safe_save(state_dict, "model.safetensors") From b076a0d12311911355cdabbb9e23b852db5016c8 Mon Sep 17 00:00:00 2001 From: k Date: Wed, 12 Nov 2025 12:12:26 -0500 Subject: [PATCH 3/6] add status bar for epoch progress --- train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/train.py b/train.py index e912da5..43f91e7 100644 --- a/train.py +++ b/train.py @@ -6,6 +6,7 @@ import matplotlib.pyplot as plt import time import show from model import gen +from tqdm import tqdm BATCH_SIZE = 16 EPOCHS = 100 @@ -48,7 +49,7 @@ eshape = (BATCH_SIZE, 1, 128, 216) for epoch in range(0,EPOCHS): print(f"\n--- Starting Epoch {epoch} ---\n") loss=0 - for i in range(0,len(x),BATCH_SIZE): + for i in tqdm(range(0,len(x),BATCH_SIZE)): tx=Tensor(x[i:i+BATCH_SIZE]) ty=Tensor(y[i:i+BATCH_SIZE]) if(tx.shape != eshape): From 6e0b3882bcc452b3152f40f898410c26da3fd4dd Mon Sep 17 00:00:00 2001 From: k Date: Wed, 12 Nov 2025 12:13:03 -0500 Subject: [PATCH 
4/6] added transformer block in latent space --- model.py | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/model.py b/model.py index dd18bd2..623d9ec 100644 --- a/model.py +++ b/model.py @@ -10,6 +10,9 @@ class gen: self.h = height // 8 self.flattened_size = 256 * self.h * self.w + self.num_tokens = 16 + self.dim_per_token = self.latent_dim // self.num_tokens + self.e1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1) self.e2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1) @@ -18,6 +21,15 @@ class gen: self.el = nn.Linear(self.flattened_size, self.latent_dim) + self.q = nn.Linear(self.dim_per_token,self.dim_per_token) + self.k = nn.Linear(self.dim_per_token,self.dim_per_token) + self.v = nn.Linear(self.dim_per_token,self.dim_per_token) + self.norm1 = nn.LayerNorm(self.dim_per_token) + + ffn_dim = self.dim_per_token * 4 + self.ffn1 = nn.Linear(self.dim_per_token, ffn_dim) + self.ffn2 = nn.Linear(ffn_dim, self.dim_per_token) + self.norm2 = nn.LayerNorm(self.dim_per_token) self.dl = nn.Linear(self.latent_dim, self.flattened_size) @@ -27,7 +39,7 @@ class gen: def __call__(self, x: Tensor) -> Tensor: y, shape = self.encode(x) - z = y#self.atten(y) + z = self.atten(y) return self.decode(z, shape) def encode(self, x: Tensor): @@ -37,19 +49,28 @@ class gen: b, c, h, w = x.shape flattened_size = c * h * w - - x = x.reshape(shape=(b, flattened_size)) z = self.el(x) + + # reshape to multi-token: (batch, num_tokens, dim_per_token) + z = z.reshape(shape=(b, self.num_tokens, self.dim_per_token)) return z, (c, h, w) def atten(self, x: Tensor): - q = self.q(x).relu() - k = self.k(x).relu() - v = self.v(x).relu() - return q.scaled_dot_product_attention(k,v) + q = self.q(x) + k = self.k(x) + v = self.v(x) + attn = q.scaled_dot_product_attention(k, v) + x = self.norm1(x+attn) + + ffn = self.ffn1(x).relu() + ffn = self.ffn2(ffn) + x = self.norm2(x+ffn) + + return x def decode(self, z: Tensor, shape): + 
z = z.reshape(shape=(z.shape[0], -1)) x = self.dl(z).leakyrelu() x = x.reshape(shape=(-1, 256, self.h, self.w)) x = self.d1(x).leakyrelu() From 43b64e6ca393cf0f057696cc3057065c8602eb91 Mon Sep 17 00:00:00 2001 From: k Date: Wed, 12 Nov 2025 12:15:41 -0500 Subject: [PATCH 5/6] add shell.nix --- shell.nix | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 shell.nix diff --git a/shell.nix b/shell.nix new file mode 100644 index 0000000..d74d11c --- /dev/null +++ b/shell.nix @@ -0,0 +1,8 @@ +{pkgs ? import {}}: +with pkgs; + mkShell rec { + packages = [python3 jupyter-all python3Packages.librosa python3Packages.tinygrad python3Packages.numpy python3Packages.mlflow python3Packages.tqdm python3Packages.sounddevice]; + nativeBuildInputs = []; + buildInputs = []; + LD_LIBRARY_PATH = lib.makeLibraryPath buildInputs; + } From bfdcc8311fd3f71d46f5e78639de0b64ffddfd35 Mon Sep 17 00:00:00 2001 From: k Date: Wed, 12 Nov 2025 12:15:59 -0500 Subject: [PATCH 6/6] ignore safetensors --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index afb89a2..3202c71 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ /data/ /music.safetensors /data.npz +*.safetensors