diff --git a/.gitignore b/.gitignore index 3202c71..afb89a2 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,3 @@ /data/ /music.safetensors /data.npz -*.safetensors diff --git a/data.py b/data.py index 6c83a51..137ecef 100644 --- a/data.py +++ b/data.py @@ -6,15 +6,7 @@ import mlflow SAMPLE_RATE = 22050 -def spec_to_audio(spec): - """ - Convert a normalized mel-spectrogram back to audio. - """ - spec = (spec * 80) - 80 - spec = librosa.db_to_amplitude(spec)*80 - audio = librosa.feature.inverse.mel_to_audio(spec,sr=SAMPLE_RATE) - return audio - +#@mlflow.trace def process_file(file_path): """ Load 10 second chunks single song. @@ -30,9 +22,15 @@ def process_file(file_path): end = start_pos + size if end <= sample_len: chunk = y[start_pos:end] + chunk = librosa.feature.melspectrogram(y=chunk, sr=SAMPLE_RATE) + chunk = ((librosa.amplitude_to_db(chunk,ref=np.max)+80)/80) + + #chunk = librosa.feature.melspectrogram(y=chunk,sr=SAMPLE_RATE) + #chunk = ((librosa.amplitude_to_db(chunk,ref=np.max)+40)/40) file_chunks.append(chunk) return file_chunks +#@mlflow.trace def load(): """ Load 10 second chunks of songs. @@ -46,6 +44,9 @@ def load(): audio.extend(l) return audio + + +##DEP def audio_split(audio): """ Split 10 seconds of audio to 2 5 second clips diff --git a/dataInit.py b/dataInit.py index 1776e4b..8eb3a6e 100644 --- a/dataInit.py +++ b/dataInit.py @@ -1,14 +1,14 @@ import data import numpy as np -x,y = data.dataset(data.load()) +x = data.load() size=len(x) print(size) x_np = np.stack(x) x_np = np.expand_dims(x_np, axis=1) -y_np = np.stack(y) -y_np = np.expand_dims(y_np, axis=1) +#y_np = np.stack(y) +#y_np = np.expand_dims(y_np, axis=1) -np.savez_compressed("data",x_np,y_np) +np.savez_compressed("data",x_np) diff --git a/model.py b/model.py index 623d9ec..93add80 100644 --- a/model.py +++ b/model.py @@ -1,41 +1,31 @@ from tinygrad import Tensor, nn class gen: - def __init__(self, input_channels=1, height=128, width=216, latent_dim=1024): + def __init__(self, input_channels=1, height=128, width=431, latent_dim=64): self.height = height self.width = width self.latent_dim = latent_dim - self.w = width // 8 - self.h = height // 8 - self.flattened_size = 256 * self.h * self.w - - self.num_tokens = 16 - self.dim_per_token = self.latent_dim // self.num_tokens + self.w = width // 4 + self.h = height // 4 + self.h = 32 # Output height after 2 strides + self.w = 108 # Output width after 2 strides + self.flattened_size = 128 * self.h * self.w self.e1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1) self.e2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1) - self.e3 = nn.Conv2d(128,256, kernel_size=3,stride=2,padding=1) - self.el = nn.Linear(self.flattened_size, self.latent_dim) - self.q = nn.Linear(self.dim_per_token,self.dim_per_token) - self.k = nn.Linear(self.dim_per_token,self.dim_per_token) - self.v = nn.Linear(self.dim_per_token,self.dim_per_token) - self.norm1 = nn.LayerNorm(self.dim_per_token) - - ffn_dim = self.dim_per_token * 4 - self.ffn1 = nn.Linear(self.dim_per_token, ffn_dim) - self.ffn2 = nn.Linear(ffn_dim, self.dim_per_token) - self.norm2 = nn.LayerNorm(self.dim_per_token) + self.q = nn.Linear(self.latent_dim,self.latent_dim) + self.k = nn.Linear(self.latent_dim,self.latent_dim) + self.v = nn.Linear(self.latent_dim,self.latent_dim) self.dl = nn.Linear(self.latent_dim, self.flattened_size) - self.d1 = nn.ConvTranspose2d(256,128,kernel_size=3,stride=2,padding=1,output_padding=1) - self.d2 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1) - self.d3 = nn.ConvTranspose2d(64, input_channels, kernel_size=3, stride=2, padding=1, output_padding=1) + self.d1 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1) + self.d2 = nn.ConvTranspose2d(64, input_channels, kernel_size=3, stride=2, padding=1, output_padding=1) def __call__(self, x: Tensor) -> Tensor: y, shape = self.encode(x) @@ -45,37 +35,26 @@ class gen: def encode(self, x: Tensor): x = self.e1(x).leakyrelu() x = self.e2(x).leakyrelu() - x = self.e3(x).leakyrelu() b, c, h, w = x.shape flattened_size = c * h * w + + x = x.reshape(shape=(b, flattened_size)) z = self.el(x) - - # reshape to multi-token: (batch, num_tokens, dim_per_token) - z = z.reshape(shape=(b, self.num_tokens, self.dim_per_token)) return z, (c, h, w) def atten(self, x: Tensor): - q = self.q(x) - k = self.k(x) - v = self.v(x) - attn = q.scaled_dot_product_attention(k, v) - x = self.norm1(x+attn) - - ffn = self.ffn1(x).relu() - ffn = self.ffn2(ffn) - x = self.norm2(x+ffn) - - return x + q = self.q(x).relu() + k = self.k(x).relu() + v = self.v(x).relu() + return q.scaled_dot_product_attention(k,v) def decode(self, z: Tensor, shape): - z = z.reshape(shape=(z.shape[0], -1)) x = self.dl(z).leakyrelu() - x = x.reshape(shape=(-1, 256, self.h, self.w)) + x = x.reshape(shape=(-1, 128, self.h, self.w)) x = self.d1(x).leakyrelu() - x = self.d2(x).leakyrelu() - x = self.d3(x).sigmoid() + x = self.d2(x).sigmoid() # Crop or pad to match input size out_h, out_w = x.shape[2], x.shape[3] diff --git a/run.py b/run.py deleted file mode 100644 index eef1d5c..0000000 --- a/run.py +++ /dev/null @@ -1,43 +0,0 @@ -import numpy as np -import random -import time -from tinygrad import Tensor, nn -from tinygrad.nn.state import safe_load, load_state_dict -import librosa -import sounddevice as sd -from model import gen -from data import spec_to_audio - -SAMPLE_RATE = 22050 - -def load_model(filepath="model.safetensors"): - """Loads the model structure and weights.""" - model = gen() - state_dict = safe_load(filepath) - load_state_dict(model, state_dict) - return model - -def load_data(filepath="data.npz"): - """Loads the pre-processed spectrogram data.""" - print(f"Loading data from {filepath}...") - data = np.load(filepath) - x = data["arr_0"] - return x - -def play_spec(spec,i): - """Converts a spectrogram numpy array to audio and plays it.""" - audio = spec_to_audio(spec) - sd.wait() - print(f"chunk:{i}") - sd.play(audio, samplerate=SAMPLE_RATE) - -def run_prediction_loop(model, data_x): - current_spect = data_x[0:1] - for i in range(10): - play_spec(current_spect[0][0],i) - current_spect = model(Tensor(current_spect)).numpy() - -if __name__ == "__main__": - model = load_model() - data_x = load_data() - run_prediction_loop(model, data_x) diff --git a/shell.nix b/shell.nix deleted file mode 100644 index d74d11c..0000000 --- a/shell.nix +++ /dev/null @@ -1,8 +0,0 @@ -{pkgs ? import {}}: -with pkgs; - mkShell rec { - packages = [python3 jupyter-all python3Packages.librosa python3Packages.tinygrad python3Packages.numpy python3Packages.mlflow python3Packages.tqdm python3Packages.sounddevice]; - nativeBuildInputs = []; - buildInputs = []; - LD_LIBRARY_PATH = lib.makeLibraryPath buildInputs; - } diff --git a/train.py b/train.py index 43f91e7..14f6de3 100644 --- a/train.py +++ b/train.py @@ -1,34 +1,34 @@ import mlflow import numpy as np from tinygrad import Device,Tensor,nn,TinyJit -from tinygrad.nn.state import safe_save, get_state_dict import matplotlib.pyplot as plt import time import show from model import gen -from tqdm import tqdm BATCH_SIZE = 16 EPOCHS = 100 -LEARNING_RATE = 3e-4 +LEARNING_RATE = 1e-5 print(Device.DEFAULT) mdl = gen() opt = nn.optim.AdamW(nn.state.get_parameters(mdl), lr=LEARNING_RATE) +volume = 0.1 def spec_loss(pred, target, eps=1e-6): # spectral convergence sc = ((target - pred).square().sum()) ** 0.5 / ((target.square().sum()) ** 0.5 + eps) # log magnitude difference log_mag = ((target.abs() + eps).log() - (pred.abs() + eps).log()).abs().mean() - return 0.1*sc + 1.0*log_mag + 0.1*(pred - target).abs().mean() + return sc + log_mag @TinyJit -def step_gen(x,y): +def step_gen(x): Tensor.training = True - z = mdl(x) - loss = spec_loss(z,y) - #loss = (y - z).abs().mean() + noise = Tensor.rand_like(x).tanh() + y = x+(noise*volume) + y = y.clamp(0,1) + loss = spec_loss(mdl(y),x) opt.zero_grad() loss.backward() opt.step() @@ -36,8 +36,8 @@ def step_gen(x,y): print("loading") x = np.load("data.npz")["arr_0"] -y = np.load("data.npz")["arr_1"] -run_name = f"vae_{int(time.time())}" +#x= x[0:64] +run_name = f"tinygrad_autoencoder_{int(time.time())}" mlflow.set_tracking_uri("http://127.0.0.1:5000") mlflow.start_run() mlflow.log_params({"batch_size": BATCH_SIZE, "epochs": EPOCHS, "lr": LEARNING_RATE, "data size":len(x)}) @@ -45,28 +45,27 @@ mlflow.log_params({"batch_size": BATCH_SIZE, "epochs": EPOCHS, "lr": LEARNING_RA show.logSpec(Tensor(x[0:1]).numpy()[0][0],"default") print("training") -eshape = (BATCH_SIZE, 1, 128, 216) +pl = 0 +eshape = (BATCH_SIZE, 1, 128, 431) for epoch in range(0,EPOCHS): print(f"\n--- Starting Epoch {epoch} ---\n") loss=0 - for i in tqdm(range(0,len(x),BATCH_SIZE)): + for i in range(0,len(x),BATCH_SIZE): tx=Tensor(x[i:i+BATCH_SIZE]) - ty=Tensor(y[i:i+BATCH_SIZE]) if(tx.shape != eshape): continue - loss += step_gen(tx,ty) + loss += step_gen(tx) loss /= (len(x)/BATCH_SIZE) if epoch%5==0: - show.logSpec(mdl(Tensor(x[0:1])).numpy()[0][0],epoch) - if epoch%15==0: - state_dict = get_state_dict(mdl) - safe_save(state_dict, f"model_{epoch}.safetensors") - show.logSpec(mdl(mdl(mdl(Tensor(y[0:1])))).numpy()[0][0],f"deep_{epoch}") + noise = Tensor.rand_like(Tensor(x[0:1])).tanh() + y = Tensor(x[0:1]) + (noise*volume) + show.logSpec(mdl(y).numpy()[0][0],epoch) + if(pl - loss < 0.03 and epoch > 25): + show.logSpec(y.numpy()[0][0],f"volume_{volume}") + volume *= 2 + pl = loss + mlflow.log_metric("volume", volume, step=epoch) mlflow.log_metric("loss", loss, step=epoch) print(f"loss of {loss}") - -show.logSpec(mdl(Tensor(x[0:1])).numpy()[0][0],EPOCHS) -state_dict = get_state_dict(mdl) -safe_save(state_dict, "model.safetensors")