Switched the model from a VAE to a GAN and added train.py.

2025-09-05 14:38:45 -04:00
parent 5df1e5df7e
commit ccc3fa3ed4
4 changed files with 147 additions and 83 deletions
--- a/data.py
+++ b/data.py
@@ -2,15 +2,17 @@ import librosa
 import numpy as np
 from pathlib import Path
 from multiprocessing import Pool, cpu_count
+import mlflow

 SAMPLE_RATE = 22050

+@mlflow.trace
 def process_file(file_path):
    """
    Load 10 second chunks single song.
    """
    y, sr = librosa.load(file_path, mono=True, sr=SAMPLE_RATE)
-    size = int(SAMPLE_RATE * 10)
+    size = int(SAMPLE_RATE * 5)
    sample_len = len(y)

    file_chunks = []
@@ -18,9 +20,12 @@ def process_file(file_path):
        end = start_pos + size
        if end <= sample_len:
            chunk = y[start_pos:end]
+            chunk = librosa.feature.melspectrogram(y=chunk,sr=SAMPLE_RATE)
+            chunk = ((librosa.amplitude_to_db(chunk,ref=np.max)+40)/40)
            file_chunks.append(chunk)
    return file_chunks

+@mlflow.trace
 def load():
    """
    Load 10 second chunks of songs.
@@ -33,6 +38,9 @@ def load():
        audio.extend(l)
    return audio

+
+
+##DEP
 def audio_split(audio):
    """
    Split 10 seconds of audio to 2 5 second clips
--- a/model.py
+++ b/model.py
@@ -1,84 +1,34 @@
 from tinygrad import Tensor, nn
-import numpy as np

 class Gen:
-    def __init__(self, input_channels=1, height=128, width=216, latent_dim=32):
-        self.w = width // 8
-        self.h = height // 8
-        self.flattened_size = 256 * self.h * self.w
-        
-        # Encoder
-        self.e1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1)
+    def __init__(self, height=128, width=216, latent_dim=128):
+        self.w = width // 4
+        self.h = height // 4
+        self.flat = 128 * self.h * self.w
+        self.ld = latent_dim
+        self.d1 = nn.Linear(latent_dim, self.flat)
+        self.d2 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)
+        self.d3 = nn.ConvTranspose2d(64, 1, kernel_size=3, stride=2, padding=1, output_padding=1)
+
+    def __call__(self, noise: Tensor) -> Tensor:
+        x = self.d1(noise).relu()
+        x = x.reshape(noise.shape[0], 128, self.h, self.w)
+        x = self.d2(x).relu()
+        x = self.d3(x)
+        return x.tanh()
+
+
+class Check:
+    def __init__(self, height=128, width=216):
+        self.w = width // 4
+        self.h = height // 4
+        self.flat = 128 * self.h * self.w
+        self.e1 = nn.Conv2d(1, 64, kernel_size=3, stride=2, padding=1)
        self.e2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
-        self.e3 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)
+        self.out = nn.Linear(self.flat, 2)

-        # VAE Latent Space
-        self.fc_mu = nn.Linear(self.flattened_size, latent_dim)
-        self.fc_logvar = nn.Linear(self.flattened_size, latent_dim)
-
-        # Decoder
-        self.dl = nn.Linear(latent_dim, self.flattened_size)
-        self.d1 = nn.ConvTranspose2d(256, 128, kernel_size=3, stride=2, padding=1,output_padding=1)
-        self.d2 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1,output_padding=1)
-        self.d3 = nn.ConvTranspose2d(64, input_channels, kernel_size=3, stride=2, padding=1,output_padding=1)
-        
    def __call__(self, x: Tensor) -> Tensor:
-        mu, log_var = self.encode(x)
-        x = self.reparameterize(mu, log_var)
-        return self.decode(x)
-
-    def __Lcall__(self, inp: Tensor, otp:Tensor, epoch) -> (Tensor, Tensor):
-        mu, log_var = self.encode(inp)
-        z = self.reparameterize(mu, log_var)
-        recon = self.decode(z)
-
-        # Normalized MSE (per-pixel)
-        recon_loss = (recon - otp).abs().mean()
-    
-        # Stabilized KL
-        kl_div = -0.5 * (1 + log_var - mu.pow(2) - log_var.exp()).mean()
-        
-    
-        # Weighted loss
-        total_loss = recon_loss + min(0.1, 0.01 * epoch) * kl_div
-        return recon, total_loss
-
-    def encode(self, x: Tensor) -> (Tensor, Tensor):
        x = self.e1(x).relu()
        x = self.e2(x).relu()
-        x = self.e3(x).relu()
-        x = x.reshape(shape=(-1, self.flattened_size))
-        return self.fc_mu(x), self.fc_logvar(x)
-
-    def reparameterize(self, mu: Tensor, log_var: Tensor) -> Tensor:
-        log_var = log_var.clip(-10, 10)
-        std = (log_var * 0.5).exp()
-        eps = Tensor.randn(mu.shape)
-        return mu + std * eps
-
-    def decode(self, x: Tensor) -> Tensor:
-        x = self.dl(x).relu()
-        x = x.reshape(shape=(-1, 256, self.h, self.w))
-        x = self.d1(x).relu()
-        x = self.d2(x).relu()
-        x = self.d3(x).sigmoid()
-        return x
-
-class Check():
-    def __init__(self, input_channels=1, height=128, width=216):
-        self.w = width // 8
-        self.h = height // 8
-        self.flattened_size = 256 * self.h * self.w
-        
-        self.d1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1)
-        self.d2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
-        self.d3 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)
-        self.fc = nn.Linear(self.flattened_size, 1)
-
-        
-    def __call__(self, x: Tensor) -> Tensor:
-        x = self.d1(x).leakyrelu(0.2)
-        x = self.d2(x).leakyrelu(0.2)
-        x = self.d3(x).leakyrelu(0.2)
-        x = x.reshape(shape=(-1, self.flattened_size))
-        return self.fc(x)
+        x = x.reshape(x.shape[0], -1)
+        return self.out(x).sigmoid()
--- a/show.py
+++ b/show.py
@@ -1,18 +1,21 @@
 import matplotlib.pyplot as plt
-import IPython.display as ipd
 import librosa
+import mlflow


 SAMPLE_RATE = 22050

-def showSpec(spec):
+def logSpec(spec,e):
+    #spec = ((spec*40)-40)
+    #spec = librosa.db_to_amplitude(spec)
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(spec, sr=SAMPLE_RATE,
                             x_axis='time', y_axis='mel',
                             cmap='viridis')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel spectrogram')
-    plt.show()
+    mlflow.log_figure(plt.gcf(), f"output_{e}.png")
+    #plt.close()

 def playSpec(spec):
    S = librosa.feature.inverse.mel_to_stft(spec, sr=SAMPLE_RATE)
@@ -21,6 +24,4 @@ def playSpec(spec):
    plt.figure(figsize=(12,4))
    plt.plot(audio)
    plt.title('waveform')
-    plt.show()
-
-    display(ipd.Audio(audio,rate=SAMPLE_RATE))
+    plt.close()
--- a/train.py
+++ b/train.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+# coding: utf-8
+import data
+import model as model
+import show
+import mlflow
+import numpy as np
+from tinygrad import nn,TinyJit,Tensor
+
+mlflow.set_tracking_uri("http://127.0.0.1:5000")
+mlflow.start_run(experiment_id=804883409598823668)
+# hyperparameters
+BACH_SIZE=32
+glr=1e-3
+dlr=1e-3
+epochs=100
+
+
+#dataset
+x = data.load()
+size=len(x)
+x_np = np.stack(x)
+x_np = np.expand_dims(x_np, axis=1)
+permutation = np.random.permutation(size)
+x_np = x_np[permutation]
+
+train = x_np[30:]
+test = x_np[0:30]
+
+print("Train:"+str(len(train)))
+print("Test:"+str(len(test)))
+
+
+#model
+gen = model.Gen()
+dif = model.Check()
+genOpt = nn.optim.AdamW(nn.state.get_parameters(gen), lr=glr)
+difOpt = nn.optim.AdamW(nn.state.get_parameters(dif), lr=dlr)
+
+
+#train
+
+@TinyJit
+def step_dis(x:Tensor):
+    Tensor.training = True
+    real = Tensor([1,0])
+    fake = Tensor([0,1])
+    noise = Tensor.randn(BACH_SIZE, gen.ld)
+    fake_data = gen(noise).detach()
+    fake_loss = dif(fake_data).log_softmax().nll_loss(fake)
+    real_loss = dif(x).log_softmax().nll_loss(real)
+    loss = (fake_loss + real_loss)/2
+    loss.backward()
+    difOpt.step()
+    return loss.numpy()
+
+@TinyJit
+def step_gen():
+    Tensor.training = True
+    real = Tensor([1,0])
+    noise = Tensor.randn(BACH_SIZE, gen.ld)
+    fake_data = gen(noise).detach()
+    loss = dif(fake_data).log_softmax().nll_loss(real)
+    loss.backward()
+    genOpt.step()
+    return loss.numpy()
+
+
+eshape = (BACH_SIZE, 1, 128, 216)
+
+mlflow.log_param("generator_learning_rate", glr)
+mlflow.log_param("discim_learning_rate", dlr)
+mlflow.log_param("epochs", epochs)
+mlflow.log_param("train size", len(train))
+mlflow.log_param("test size", len(test))
+for e in range(0,epochs):
+    print(f"\n--- Starting Epoch {e} ---\n")
+    dl=0
+    gl=0
+
+    for i in range(0,size,BACH_SIZE):
+        tx=Tensor(train[i:i+BACH_SIZE])
+        if(tx.shape != eshape):
+            continue
+        #steps
+        dl+=step_dis(tx)
+        gl+=step_gen()
+
+    dl /= (size/BACH_SIZE)
+    gl /= (size/BACH_SIZE)
+    if e%4==0:
+        noise = Tensor.randn(BACH_SIZE, gen.ld)
+        show.logSpec(gen(noise).numpy()[0][0],e)
+    # TODO: evaluate on the held-out test split (`test` is never used yet)
+    mlflow.log_metric("gen_loss", gl, step=e)
+    mlflow.log_metric("dis_loss", dl, step=e)
+    print(f"loss of gen:{gl} dis:{dl}")
+
+
+#save
+noise = Tensor.randn(BACH_SIZE, gen.ld)
+show.logSpec(gen(noise).numpy()[0][0],epochs)
+from tinygrad.nn.state import safe_save, get_state_dict
+safe_save(get_state_dict(gen),"music.safetensors")
+mlflow.log_artifact("music.safetensors")