From ccc3fa3ed451cd0e08a9cf2aeb3fc88c068e6556 Mon Sep 17 00:00:00 2001
From: k
Date: Fri, 5 Sep 2025 14:38:45 -0400
Subject: [PATCH] Switch from VAE to GAN and move training into train.py

---
 data.py  |  14 ++++++---
 model.py | 102 ++++++++++++++---------------------------------------
 show.py  |  13 +++----
 train.py | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 152 insertions(+), 85 deletions(-)
 create mode 100644 train.py

diff --git a/data.py b/data.py
index 94faec5..b2cee4a 100644
--- a/data.py
+++ b/data.py
@@ -2,15 +2,17 @@ import librosa
 import numpy as np
 from pathlib import Path
 from multiprocessing import Pool, cpu_count
+import mlflow
 
 SAMPLE_RATE = 22050
 
+@mlflow.trace
 def process_file(file_path):
     """
-    Load 10 second chunks single song.
+    Load 5-second mel-spectrogram chunks from a single song.
     """
     y, sr = librosa.load(file_path, mono=True, sr=SAMPLE_RATE)
-    size = int(SAMPLE_RATE * 10)
+    size = int(SAMPLE_RATE * 5)
 
     sample_len = len(y)
     file_chunks = []
@@ -18,9 +20,12 @@ def process_file(file_path):
         end = start_pos + size
         if end <= sample_len:
             chunk = y[start_pos:end]
+            chunk = librosa.feature.melspectrogram(y=chunk, sr=SAMPLE_RATE)
+            chunk = (librosa.power_to_db(chunk, ref=np.max) + 40) / 40  # power spectrogram to dB in [-80, 0], scaled to [-1, 1]
             file_chunks.append(chunk)
     return file_chunks
+@mlflow.trace
 def load():
     """
-    Load 10 second chunks of songs.
+    Load 5-second mel-spectrogram chunks of all songs.
     """
@@ -33,6 +38,9 @@ def load():
         audio.extend(l)
 
     return audio
+
+
+## DEPRECATED: unused now that process_file returns 5-second chunks directly
 def audio_split(audio):
     """
     Split 10 seconds of audio to 2 5 second clips
diff --git a/model.py b/model.py
index 532fdec..1769c1e 100644
--- a/model.py
+++ b/model.py
@@ -1,84 +1,34 @@
 from tinygrad import Tensor, nn
-import numpy as np
 
 class Gen:
-    def __init__(self, input_channels=1, height=128, width=216, latent_dim=32):
-        self.w = width // 8
-        self.h = height // 8
-        self.flattened_size = 256 * self.h * self.w
-
-        # Encoder
-        self.e1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1)
+    def __init__(self, height=128, width=216, latent_dim=128):
+        self.w = width // 4
+        self.h = height // 4
+        self.flat = 128 * self.h * self.w
+        self.ld = latent_dim
+        self.d1 = nn.Linear(latent_dim, self.flat)
+        self.d2 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)
+        self.d3 = nn.ConvTranspose2d(64, 1, kernel_size=3, stride=2, padding=1, output_padding=1)
+
+    def __call__(self, noise: Tensor) -> Tensor:
+        x = self.d1(noise).relu()
+        x = x.reshape(noise.shape[0], 128, self.h, self.w)
+        x = self.d2(x).relu()
+        x = self.d3(x)
+        return x.tanh()  # matches the [-1, 1] normalization used in data.py
+
+
+class Check:
+    def __init__(self, height=128, width=216):
+        self.w = width // 4
+        self.h = height // 4
+        self.flat = 128 * self.h * self.w
+        self.e1 = nn.Conv2d(1, 64, kernel_size=3, stride=2, padding=1)
         self.e2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
-        self.e3 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)
+        self.out = nn.Linear(self.flat, 2)
 
-        # VAE Latent Space
-        self.fc_mu = nn.Linear(self.flattened_size, latent_dim)
-        self.fc_logvar = nn.Linear(self.flattened_size, latent_dim)
-
-        # Decoder
-        self.dl = nn.Linear(latent_dim, self.flattened_size)
-        self.d1 = nn.ConvTranspose2d(256, 128, kernel_size=3, stride=2, padding=1,output_padding=1)
-        self.d2 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1,output_padding=1)
-        self.d3 = nn.ConvTranspose2d(64, input_channels, kernel_size=3, stride=2, padding=1,output_padding=1)
-
     def __call__(self, x: Tensor) -> Tensor:
-        mu, log_var = self.encode(x)
-        x = self.reparameterize(mu, log_var)
-        return self.decode(x)
-
-    def __Lcall__(self, inp: Tensor, otp:Tensor, epoch) -> (Tensor, Tensor):
-        mu, log_var = self.encode(inp)
-        z = self.reparameterize(mu, log_var)
-        recon = self.decode(z)
-
-        # Normalized MSE (per-pixel)
-        recon_loss = (recon - otp).abs().mean()
-
-        # Stabilized KL
-        kl_div = -0.5 * (1 + log_var - mu.pow(2) - log_var.exp()).mean()
-
-
-        # Weighted loss
-        total_loss = recon_loss + min(0.1, 0.01 * epoch) * kl_div
-        return recon, total_loss
-
-    def encode(self, x: Tensor) -> (Tensor, Tensor):
         x = self.e1(x).relu()
         x = self.e2(x).relu()
-        x = self.e3(x).relu()
-        x = x.reshape(shape=(-1, self.flattened_size))
-        return self.fc_mu(x), self.fc_logvar(x)
-
-    def reparameterize(self, mu: Tensor, log_var: Tensor) -> Tensor:
-        log_var = log_var.clip(-10, 10)
-        std = (log_var * 0.5).exp()
-        eps = Tensor.randn(mu.shape)
-        return mu + std * eps
-
-    def decode(self, x: Tensor) -> Tensor:
-        x = self.dl(x).relu()
-        x = x.reshape(shape=(-1, 256, self.h, self.w))
-        x = self.d1(x).relu()
-        x = self.d2(x).relu()
-        x = self.d3(x).sigmoid()
-        return x
-
-class Check():
-    def __init__(self, input_channels=1, height=128, width=216):
-        self.w = width // 8
-        self.h = height // 8
-        self.flattened_size = 256 * self.h * self.w
-
-        self.d1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1)
-        self.d2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
-        self.d3 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)
-        self.fc = nn.Linear(self.flattened_size, 1)
-
-
-    def __call__(self, x: Tensor) -> Tensor:
-        x = self.d1(x).leakyrelu(0.2)
-        x = self.d2(x).leakyrelu(0.2)
-        x = self.d3(x).leakyrelu(0.2)
-        x = x.reshape(shape=(-1, self.flattened_size))
-        return self.fc(x)
\ No newline at end of file
+        x = x.reshape(x.shape[0], -1)
+        return self.out(x)  # raw logits; train.py applies log_softmax
diff --git a/show.py b/show.py
index 086ffe4..ffab934 100644
--- a/show.py
+++ b/show.py
@@ -1,18 +1,21 @@
 import matplotlib.pyplot as plt
-import IPython.display as ipd
 import librosa
+import mlflow
 
 SAMPLE_RATE = 22050
 
-def showSpec(spec):
+def logSpec(spec, e):
+    #spec = ((spec*40)-40)
+    #spec = librosa.db_to_power(spec)
     plt.figure(figsize=(10, 4))
     librosa.display.specshow(spec, sr=SAMPLE_RATE, x_axis='time', y_axis='mel', cmap='viridis')
     plt.colorbar(format='%+2.0f dB')
     plt.title('Mel spectrogram')
-    plt.show()
+    mlflow.log_figure(plt.gcf(), f"output_{e}.png")
+    plt.close()  # close the figure so per-epoch logging does not leak memory
 
 def playSpec(spec):
     S = librosa.feature.inverse.mel_to_stft(spec, sr=SAMPLE_RATE)
@@ -21,6 +24,4 @@ def playSpec(spec):
     plt.figure(figsize=(12,4))
     plt.plot(audio)
    plt.title('waveform')
-    plt.show()
-
-    display(ipd.Audio(audio,rate=SAMPLE_RATE))
+    plt.close()
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..511c9a7
--- /dev/null
+++ b/train.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+# coding: utf-8
+import data
+import model
+import show
+import mlflow
+import numpy as np
+from tinygrad import nn, TinyJit, Tensor
+
+mlflow.set_tracking_uri("http://127.0.0.1:5000")
+mlflow.start_run(experiment_id="804883409598823668")
+
+# hyperparameters
+BATCH_SIZE = 32
+glr = 1e-3
+dlr = 1e-3
+epochs = 100
+
+
+# dataset
+x = data.load()
+size = len(x)
+x_np = np.stack(x)
+x_np = np.expand_dims(x_np, axis=1)  # add a channel axis: (N, 1, 128, 216)
+permutation = np.random.permutation(size)
+x_np = x_np[permutation]
+
+train = x_np[30:]
+test = x_np[0:30]
+
+print(f"Train: {len(train)}")
+print(f"Test: {len(test)}")
+
+
+# model
+gen = model.Gen()
+dif = model.Check()
+genOpt = nn.optim.AdamW(nn.state.get_parameters(gen), lr=glr)
+difOpt = nn.optim.AdamW(nn.state.get_parameters(dif), lr=dlr)
+
+
+# train
+
+@TinyJit
+def step_dis(x: Tensor) -> Tensor:
+    Tensor.training = True
+    real = Tensor([0] * BATCH_SIZE)  # nll_loss expects class indices: 0 = real
+    fake = Tensor([1] * BATCH_SIZE)  # 1 = fake
+    noise = Tensor.randn(BATCH_SIZE, gen.ld)
+    fake_data = gen(noise).detach()  # detach: only the discriminator updates here
+    fake_loss = dif(fake_data).log_softmax().nll_loss(fake)
+    real_loss = dif(x).log_softmax().nll_loss(real)
+    loss = (fake_loss + real_loss) / 2
+    difOpt.zero_grad()
+    loss.backward()
+    difOpt.step()
+    return loss.realize()  # jitted functions should return Tensors, not numpy
+
+@TinyJit
+def step_gen() -> Tensor:
+    Tensor.training = True
+    real = Tensor([0] * BATCH_SIZE)  # the generator wants fakes scored as real
+    noise = Tensor.randn(BATCH_SIZE, gen.ld)
+    fake_data = gen(noise)  # no detach: gradients must flow back to the generator
+    loss = dif(fake_data).log_softmax().nll_loss(real)
+    genOpt.zero_grad()
+    loss.backward()
+    genOpt.step()
+    return loss.realize()
+
+
+eshape = (BATCH_SIZE, 1, 128, 216)
+
+mlflow.log_param("generator_learning_rate", glr)
+mlflow.log_param("discriminator_learning_rate", dlr)
+mlflow.log_param("epochs", epochs)
+mlflow.log_param("train_size", len(train))
+mlflow.log_param("test_size", len(test))
+for e in range(epochs):
+    print(f"\n--- Starting Epoch {e} ---\n")
+    dl = 0.0
+    gl = 0.0
+
+    for i in range(0, len(train), BATCH_SIZE):
+        tx = Tensor(train[i:i + BATCH_SIZE])
+        if tx.shape != eshape:
+            continue  # skip the partial last batch; TinyJit needs fixed shapes
+        # steps
+        dl += step_dis(tx).item()
+        gl += step_gen().item()
+
+    dl /= len(train) // BATCH_SIZE
+    gl /= len(train) // BATCH_SIZE
+    if e % 4 == 0:
+        noise = Tensor.randn(BATCH_SIZE, gen.ld)
+        show.logSpec(gen(noise).numpy()[0][0], e)
+    # TODO: evaluate on the held-out test split
+    mlflow.log_metric("gen_loss", gl, step=e)
+    mlflow.log_metric("dis_loss", dl, step=e)
+    print(f"loss of gen:{gl} dis:{dl}")
+
+
+# save
+noise = Tensor.randn(BATCH_SIZE, gen.ld)
+show.logSpec(gen(noise).numpy()[0][0], epochs)
+from tinygrad.nn.state import safe_save, get_state_dict
+safe_save(get_state_dict(gen), "music.safetensors")
+mlflow.log_artifact("music.safetensors")
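
-- 
Note (not part of the patch): a minimal sketch of how the saved generator
might be sampled and turned back into audio offline. It assumes the
(dB + 40) / 40 normalization from data.py and the music.safetensors file
written by train.py; the sample.py name and the soundfile dependency are
illustrative additions, and since the per-file ref=np.max scaling is lost
during normalization, a unit dB reference is assumed.

    # sample.py (hypothetical): load the trained generator and render audio
    import librosa
    import soundfile as sf
    from tinygrad import Tensor
    from tinygrad.nn.state import safe_load, load_state_dict
    import model

    SAMPLE_RATE = 22050

    gen = model.Gen()
    load_state_dict(gen, safe_load("music.safetensors"))

    noise = Tensor.randn(1, gen.ld)
    spec = gen(noise).numpy()[0][0]  # (128, 216) normalized mel spectrogram in [-1, 1]

    db = spec * 40.0 - 40.0          # undo the (dB + 40) / 40 normalization
    power = librosa.db_to_power(db)  # back to a power mel spectrogram
    audio = librosa.feature.inverse.mel_to_audio(power, sr=SAMPLE_RATE)  # Griffin-Lim inversion
    sf.write("sample.wav", audio, SAMPLE_RATE)

Griffin-Lim only estimates phase, so expect audible artifacts; this is a
quick sanity check of what the generator has learned, not a rendering path.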