diff --git a/data.py b/data.py index 137ecef..4c4f3c8 100644 --- a/data.py +++ b/data.py @@ -22,15 +22,9 @@ def process_file(file_path): end = start_pos + size if end <= sample_len: chunk = y[start_pos:end] - chunk = librosa.feature.melspectrogram(y=chunk, sr=SAMPLE_RATE) - chunk = ((librosa.amplitude_to_db(chunk,ref=np.max)+80)/80) - - #chunk = librosa.feature.melspectrogram(y=chunk,sr=SAMPLE_RATE) - #chunk = ((librosa.amplitude_to_db(chunk,ref=np.max)+40)/40) file_chunks.append(chunk) return file_chunks -#@mlflow.trace def load(): """ Load 10 second chunks of songs. @@ -44,9 +38,6 @@ def load(): audio.extend(l) return audio - - -##DEP def audio_split(audio): """ Split 10 seconds of audio to 2 5 second clips diff --git a/dataInit.py b/dataInit.py index 8eb3a6e..1776e4b 100644 --- a/dataInit.py +++ b/dataInit.py @@ -1,14 +1,14 @@ import data import numpy as np -x = data.load() +x,y = data.dataset(data.load()) size=len(x) print(size) x_np = np.stack(x) x_np = np.expand_dims(x_np, axis=1) -#y_np = np.stack(y) -#y_np = np.expand_dims(y_np, axis=1) +y_np = np.stack(y) +y_np = np.expand_dims(y_np, axis=1) -np.savez_compressed("data",x_np) +np.savez_compressed("data",x_np,y_np) diff --git a/model.py b/model.py index 93add80..dd18bd2 100644 --- a/model.py +++ b/model.py @@ -1,40 +1,39 @@ from tinygrad import Tensor, nn class gen: - def __init__(self, input_channels=1, height=128, width=431, latent_dim=64): + def __init__(self, input_channels=1, height=128, width=216, latent_dim=1024): self.height = height self.width = width self.latent_dim = latent_dim - self.w = width // 4 - self.h = height // 4 - self.h = 32 # Output height after 2 strides - self.w = 108 # Output width after 2 strides - self.flattened_size = 128 * self.h * self.w + self.w = width // 8 + self.h = height // 8 + self.flattened_size = 256 * self.h * self.w self.e1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1) self.e2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1) + self.e3 = nn.Conv2d(128,256, kernel_size=3,stride=2,padding=1) + self.el = nn.Linear(self.flattened_size, self.latent_dim) - self.q = nn.Linear(self.latent_dim,self.latent_dim) - self.k = nn.Linear(self.latent_dim,self.latent_dim) - self.v = nn.Linear(self.latent_dim,self.latent_dim) self.dl = nn.Linear(self.latent_dim, self.flattened_size) - self.d1 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1) - self.d2 = nn.ConvTranspose2d(64, input_channels, kernel_size=3, stride=2, padding=1, output_padding=1) + self.d1 = nn.ConvTranspose2d(256,128,kernel_size=3,stride=2,padding=1,output_padding=1) + self.d2 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1) + self.d3 = nn.ConvTranspose2d(64, input_channels, kernel_size=3, stride=2, padding=1, output_padding=1) def __call__(self, x: Tensor) -> Tensor: y, shape = self.encode(x) - z = self.atten(y) + z = y#self.atten(y) return self.decode(z, shape) def encode(self, x: Tensor): x = self.e1(x).leakyrelu() x = self.e2(x).leakyrelu() + x = self.e3(x).leakyrelu() b, c, h, w = x.shape flattened_size = c * h * w @@ -52,9 +51,10 @@ class gen: def decode(self, z: Tensor, shape): x = self.dl(z).leakyrelu() - x = x.reshape(shape=(-1, 128, self.h, self.w)) + x = x.reshape(shape=(-1, 256, self.h, self.w)) x = self.d1(x).leakyrelu() - x = self.d2(x).sigmoid() + x = self.d2(x).leakyrelu() + x = self.d3(x).sigmoid() # Crop or pad to match input size out_h, out_w = x.shape[2], x.shape[3] diff --git a/train.py b/train.py index 14f6de3..e28c5ef 100644 --- a/train.py +++ b/train.py @@ -8,27 +8,25 @@ from model import gen BATCH_SIZE = 16 EPOCHS = 100 -LEARNING_RATE = 1e-5 +LEARNING_RATE = 3e-4 print(Device.DEFAULT) mdl = gen() opt = nn.optim.AdamW(nn.state.get_parameters(mdl), lr=LEARNING_RATE) -volume = 0.1 def spec_loss(pred, target, eps=1e-6): # spectral convergence sc = ((target - pred).square().sum()) ** 0.5 / ((target.square().sum()) ** 0.5 + eps) # log magnitude difference log_mag = ((target.abs() + eps).log() - (pred.abs() + eps).log()).abs().mean() - return sc + log_mag + return 0.1*sc + 1.0*log_mag + 0.1*(pred - target).abs().mean() @TinyJit -def step_gen(x): +def step_gen(x,y): Tensor.training = True - noise = Tensor.rand_like(x).tanh() - y = x+(noise*volume) - y = y.clamp(0,1) - loss = spec_loss(mdl(y),x) + z = mdl(x) + loss = spec_loss(z,y) + #loss = (y - z).abs().mean() opt.zero_grad() loss.backward() opt.step() @@ -36,8 +34,8 @@ def step_gen(x): print("loading") x = np.load("data.npz")["arr_0"] -#x= x[0:64] -run_name = f"tinygrad_autoencoder_{int(time.time())}" +y = np.load("data.npz")["arr_1"] +run_name = f"vae_{int(time.time())}" mlflow.set_tracking_uri("http://127.0.0.1:5000") mlflow.start_run() mlflow.log_params({"batch_size": BATCH_SIZE, "epochs": EPOCHS, "lr": LEARNING_RATE, "data size":len(x)}) @@ -52,20 +50,18 @@ for epoch in range(0,EPOCHS): loss=0 for i in range(0,len(x),BATCH_SIZE): tx=Tensor(x[i:i+BATCH_SIZE]) + ty=Tensor(y[i:i+BATCH_SIZE]) if(tx.shape != eshape): continue - loss += step_gen(tx) + loss += step_gen(tx,ty) loss /= (len(x)/BATCH_SIZE) if epoch%5==0: - noise = Tensor.rand_like(Tensor(x[0:1])).tanh() - y = Tensor(x[0:1]) + (noise*volume) - show.logSpec(mdl(y).numpy()[0][0],epoch) - if(pl - loss < 0.03 and epoch > 25): - show.logSpec(y.numpy()[0][0],f"volume_{volume}") - volume *= 2 - pl = loss + show.logSpec(mdl(Tensor(x[0:1])).numpy()[0][0],epoch) + if epoch%15==0: + state_dict = get_state_dict(mdl) + safe_save(state_dict, f"model_{epoch}.safetensors") + show.logSpec(mdl(mdl(mdl(Tensor(y[0:1])))).numpy()[0][0],f"deep_{epoch}") - mlflow.log_metric("volume", volume, step=epoch) mlflow.log_metric("loss", loss, step=epoch) print(f"loss of {loss}")