Compare commits

..

8 Commits

Author SHA1 Message Date
k
bfdcc8311f ignore safetensors 2025-11-12 12:15:59 -05:00
k
43b64e6ca3 add shell.nix 2025-11-12 12:15:41 -05:00
k
6e0b3882bc added transformer block in latenent space 2025-11-12 12:13:03 -05:00
k
b076a0d123 add status bar for epoch progress 2025-11-12 12:12:26 -05:00
k
579b37cd70 add player script and fix bug 2025-11-12 12:11:57 -05:00
k
64e66260ec simple vae style model 2025-11-12 12:10:52 -05:00
k
df4cdc8e25 playing with denoiseing 2025-11-10 22:34:17 -05:00
k
c84c100cb8 updated data 2025-11-08 00:10:50 -05:00
7 changed files with 209 additions and 128 deletions

1
.gitignore vendored
View File

@@ -3,3 +3,4 @@
/data/ /data/
/music.safetensors /music.safetensors
/data.npz /data.npz
*.safetensors

16
data.py
View File

@@ -6,7 +6,15 @@ import mlflow
SAMPLE_RATE = 22050 SAMPLE_RATE = 22050
#@mlflow.trace def spec_to_audio(spec):
"""
Convert a normalized mel-spectrogram back to audio.
"""
spec = (spec * 80) - 80
spec = librosa.db_to_amplitude(spec)*80
audio = librosa.feature.inverse.mel_to_audio(spec,sr=SAMPLE_RATE)
return audio
def process_file(file_path): def process_file(file_path):
""" """
Load 10 second chunks single song. Load 10 second chunks single song.
@@ -22,12 +30,9 @@ def process_file(file_path):
end = start_pos + size end = start_pos + size
if end <= sample_len: if end <= sample_len:
chunk = y[start_pos:end] chunk = y[start_pos:end]
#chunk = librosa.feature.melspectrogram(y=chunk,sr=SAMPLE_RATE)
#chunk = ((librosa.amplitude_to_db(chunk,ref=np.max)+40)/40)
file_chunks.append(chunk) file_chunks.append(chunk)
return file_chunks return file_chunks
#@mlflow.trace
def load(): def load():
""" """
Load 10 second chunks of songs. Load 10 second chunks of songs.
@@ -41,9 +46,6 @@ def load():
audio.extend(l) audio.extend(l)
return audio return audio
##DEP
def audio_split(audio): def audio_split(audio):
""" """
Split 10 seconds of audio to 2 5 second clips Split 10 seconds of audio to 2 5 second clips

View File

@@ -3,6 +3,7 @@ import numpy as np
x,y = data.dataset(data.load()) x,y = data.dataset(data.load())
size=len(x) size=len(x)
print(size)
x_np = np.stack(x) x_np = np.stack(x)
x_np = np.expand_dims(x_np, axis=1) x_np = np.expand_dims(x_np, axis=1)

112
model.py
View File

@@ -1,34 +1,94 @@
from tinygrad import Tensor, nn from tinygrad import Tensor, nn
class Gen: class gen:
def __init__(self, height=128, width=216, latent_dim=128): def __init__(self, input_channels=1, height=128, width=216, latent_dim=1024):
self.w = width // 4 self.height = height
self.h = height // 4 self.width = width
self.flat = 128 * self.h * self.w self.latent_dim = latent_dim
self.ld = latent_dim
self.d1 = nn.Linear(latent_dim, self.flat)
self.d2 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)
self.d3 = nn.ConvTranspose2d(64, 1, kernel_size=3, stride=2, padding=1, output_padding=1)
def __call__(self, noise: Tensor) -> Tensor: self.w = width // 8
x = self.d1(noise).relu() self.h = height // 8
x = x.reshape(noise.shape[0], 128, self.h, self.w) self.flattened_size = 256 * self.h * self.w
x = self.d2(x).relu()
x = self.d3(x) self.num_tokens = 16
return x.tanh() self.dim_per_token = self.latent_dim // self.num_tokens
class Check: self.e1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1)
def __init__(self, height=128, width=216):
self.w = width // 4
self.h = height // 4
self.flat = 128 * self.h * self.w
self.e1 = nn.Conv2d(1, 64, kernel_size=3, stride=2, padding=1)
self.e2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1) self.e2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
self.out = nn.Linear(self.flat, 1) self.e3 = nn.Conv2d(128,256, kernel_size=3,stride=2,padding=1)
self.el = nn.Linear(self.flattened_size, self.latent_dim)
self.q = nn.Linear(self.dim_per_token,self.dim_per_token)
self.k = nn.Linear(self.dim_per_token,self.dim_per_token)
self.v = nn.Linear(self.dim_per_token,self.dim_per_token)
self.norm1 = nn.LayerNorm(self.dim_per_token)
ffn_dim = self.dim_per_token * 4
self.ffn1 = nn.Linear(self.dim_per_token, ffn_dim)
self.ffn2 = nn.Linear(ffn_dim, self.dim_per_token)
self.norm2 = nn.LayerNorm(self.dim_per_token)
self.dl = nn.Linear(self.latent_dim, self.flattened_size)
self.d1 = nn.ConvTranspose2d(256,128,kernel_size=3,stride=2,padding=1,output_padding=1)
self.d2 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)
self.d3 = nn.ConvTranspose2d(64, input_channels, kernel_size=3, stride=2, padding=1, output_padding=1)
def __call__(self, x: Tensor) -> Tensor: def __call__(self, x: Tensor) -> Tensor:
x = self.e1(x).relu() y, shape = self.encode(x)
x = self.e2(x).relu() z = self.atten(y)
x = x.reshape(x.shape[0], -1) return self.decode(z, shape)
return self.out(x)#.sigmoid()
def encode(self, x: Tensor):
    """
    Encode an input batch into a sequence of latent tokens.

    The input is passed through the three encoder convolutions (each
    followed by a leaky ReLU), flattened, projected by ``self.el`` and
    reshaped to ``(batch, num_tokens, dim_per_token)``.

    Returns a tuple ``(tokens, (channels, height, width))`` where the
    second element is the shape of the last convolutional feature map.
    """
    features = x
    for conv in (self.e1, self.e2, self.e3):
        features = conv(features).leakyrelu()

    batch, channels, feat_h, feat_w = features.shape
    # Flatten everything except the batch axis for the linear projection.
    flat = features.reshape(shape=(batch, channels * feat_h * feat_w))
    latent = self.el(flat)

    # Split the latent vector into tokens: (batch, num_tokens, dim_per_token)
    tokens = latent.reshape(shape=(batch, self.num_tokens, self.dim_per_token))
    return tokens, (channels, feat_h, feat_w)
def atten(self, x: Tensor):
    """
    Apply one self-attention + feed-forward block to the latent tokens.

    Queries, keys and values are all projected from ``x``. Both the
    attention output and the feed-forward output are added back to their
    input and layer-normalised (``norm1`` / ``norm2``).
    """
    queries, keys, values = self.q(x), self.k(x), self.v(x)
    attended = queries.scaled_dot_product_attention(keys, values)
    x = self.norm1(x + attended)

    hidden = self.ffn1(x).relu()
    x = self.norm2(x + self.ffn2(hidden))
    return x
def decode(self, z: Tensor, shape):
    """
    Decode latent tokens back into an image-shaped tensor.

    The tokens are flattened per sample, projected by ``self.dl``,
    reshaped to ``(-1, 256, self.h, self.w)`` and passed through the three
    transposed convolutions; the last one is followed by a sigmoid.
    The result is then cropped or padded so its last two axes equal
    ``(self.height, self.width)``.

    NOTE(review): ``shape`` is accepted but never read here; the reshape
    uses the hard-coded 256 channels and ``self.h`` / ``self.w`` instead.
    """
    flat_tokens = z.reshape(shape=(z.shape[0], -1))
    x = self.dl(flat_tokens).leakyrelu()
    x = x.reshape(shape=(-1, 256, self.h, self.w))

    for deconv in (self.d1, self.d2):
        x = deconv(x).leakyrelu()
    x = self.d3(x).sigmoid()

    # Crop or pad to match input size
    out_h, out_w = x.shape[2], x.shape[3]

    missing_h = self.height - out_h
    if missing_h < 0:
        x = x[:, :, :self.height, :]
    elif missing_h > 0:
        x = x.pad2d((0, 0, 0, missing_h))

    missing_w = self.width - out_w
    if missing_w < 0:
        x = x[:, :, :, :self.width]
    elif missing_w > 0:
        x = x.pad2d((0, missing_w, 0, 0))

    return x

43
run.py Normal file
View File

@@ -0,0 +1,43 @@
import numpy as np
import random
import time
from tinygrad import Tensor, nn
from tinygrad.nn.state import safe_load, load_state_dict
import librosa
import sounddevice as sd
from model import gen
from data import spec_to_audio
SAMPLE_RATE = 22050
def load_model(filepath="model.safetensors"):
    """Build a fresh `gen` model and fill it with the weights stored at *filepath*."""
    network = gen()
    load_state_dict(network, safe_load(filepath))
    return network
def load_data(filepath="data.npz"):
    """
    Load the pre-processed spectrogram data.

    Reads the ``arr_0`` entry of the ``.npz`` archive at *filepath* and
    returns it as a NumPy array. The archive is opened in a ``with`` block
    so its underlying file handle is closed before returning (the array
    itself is fully read into memory on access, so it stays valid).
    """
    print(f"Loading data from {filepath}...")
    with np.load(filepath) as archive:
        return archive["arr_0"]
def play_spec(spec, i):
    """
    Convert a spectrogram array to audio and start playing it.

    The conversion happens first; the function then waits for any playback
    already in progress, prints the chunk index and starts the new
    playback at SAMPLE_RATE.
    """
    waveform = spec_to_audio(spec)
    # Let the previously started chunk finish before starting this one.
    sd.wait()
    print(f"chunk:{i}")
    sd.play(waveform, samplerate=SAMPLE_RATE)
def run_prediction_loop(model, data_x, num_chunks=10):
    """
    Play a chain of spectrograms, each one predicted from the previous.

    Starts from the first sample of *data_x*, plays it, feeds it through
    *model* to obtain the next chunk, and repeats for *num_chunks* chunks
    (default 10, the previously hard-coded value).

    `play_spec` starts playback without blocking, so the function waits
    for the final chunk before returning; otherwise the script would exit
    and cut the last chunk short.
    """
    current_spect = data_x[0:1]
    for i in range(num_chunks):
        play_spec(current_spect[0][0], i)
        # The prediction after the final chunk would never be played.
        if i < num_chunks - 1:
            current_spect = model(Tensor(current_spect)).numpy()
    # Block until the last started chunk has finished playing.
    sd.wait()
if __name__ == "__main__":
    # Load the trained weights first, then the spectrogram data, and start playback.
    model, data_x = load_model(), load_data()
    run_prediction_loop(model, data_x)

8
shell.nix Normal file
View File

@@ -0,0 +1,8 @@
# Development shell: Python, Jupyter and the Python packages listed in `packages`.
# `pkgs` defaults to the <nixpkgs> channel when the caller does not supply one.
{pkgs ? import <nixpkgs> {}}:
with pkgs;
# `rec` is required so LD_LIBRARY_PATH can refer to `buildInputs` from the same set.
mkShell rec {
packages = [python3 jupyter-all python3Packages.librosa python3Packages.tinygrad python3Packages.numpy python3Packages.mlflow python3Packages.tqdm python3Packages.sounddevice];
nativeBuildInputs = [];
buildInputs = [];
# Library search path derived from buildInputs (currently an empty list).
LD_LIBRARY_PATH = lib.makeLibraryPath buildInputs;
}

156
train.py
View File

@@ -1,106 +1,72 @@
#!/usr/bin/env python
# coding: utf-8
import data
import model as model
import show
import mlflow import mlflow
import numpy as np import numpy as np
from tinygrad import nn,TinyJit,Tensor from tinygrad import Device,Tensor,nn,TinyJit
from tinygrad.nn.state import safe_save, get_state_dict
import matplotlib.pyplot as plt
import time
import show
from model import gen
from tqdm import tqdm
BATCH_SIZE = 16
EPOCHS = 100
LEARNING_RATE = 3e-4
print(Device.DEFAULT)
mdl = gen()
opt = nn.optim.AdamW(nn.state.get_parameters(mdl), lr=LEARNING_RATE)
def spec_loss(pred, target, eps=1e-6):
    """
    Combined spectrogram reconstruction loss.

    Weighted sum of three terms:
      * 0.1 x spectral convergence: norm of (target - pred) divided by the
        norm of target (plus *eps*),
      * 1.0 x mean absolute difference of the log magnitudes,
      * 0.1 x mean absolute error.

    *eps* keeps the division and the logarithms away from zero.
    """
    diff = target - pred

    # spectral convergence
    error_norm = diff.square().sum() ** 0.5
    target_norm = target.square().sum() ** 0.5
    spectral_convergence = error_norm / (target_norm + eps)

    # log magnitude difference
    log_target = (target.abs() + eps).log()
    log_pred = (pred.abs() + eps).log()
    log_magnitude = (log_target - log_pred).abs().mean()

    # plain L1 term; |target - pred| equals |pred - target|
    l1 = diff.abs().mean()

    return 0.1 * spectral_convergence + 1.0 * log_magnitude + 0.1 * l1
@TinyJit
def _step_gen_jit(x, y):
    """JIT-compiled optimisation step; returns the loss as a realized Tensor."""
    Tensor.training = True
    z = mdl(x)
    loss = spec_loss(z, y)
    opt.zero_grad()
    loss.backward()
    opt.step()
    return loss.realize()


def step_gen(x, y):
    """
    Run one training step on the batch (x, y) and return the loss as NumPy.

    The forward pass, loss, backward pass and optimizer update run inside
    the jitted helper above. The conversion to NumPy is done here, outside
    the JIT: a TinyJit function replays its captured kernels and hands back
    the return object recorded at capture time, so a NumPy value produced
    inside it would stay frozen at the captured step's loss instead of
    tracking the current one.
    """
    return _step_gen_jit(x, y).numpy()
print("loading")
x = np.load("data.npz")["arr_0"]
y = np.load("data.npz")["arr_1"]
run_name = f"vae_{int(time.time())}"
mlflow.set_tracking_uri("http://127.0.0.1:5000") mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.start_run(experiment_id=804883409598823668) mlflow.start_run()
#hyper mlflow.log_params({"batch_size": BATCH_SIZE, "epochs": EPOCHS, "lr": LEARNING_RATE, "data size":len(x)})
BACH_SIZE=32
BATCH_SIZE=BACH_SIZE
glr=2e-4
dlr=1e-5
epochs=100
show.logSpec(Tensor(x[0:1]).numpy()[0][0],"default")
#dataset print("training")
x = data.load() eshape = (BATCH_SIZE, 1, 128, 216)
size=len(x) for epoch in range(0,EPOCHS):
x_np = np.stack(x) print(f"\n--- Starting Epoch {epoch} ---\n")
x_np = np.expand_dims(x_np, axis=1) loss=0
permutation = np.random.permutation(size) for i in tqdm(range(0,len(x),BATCH_SIZE)):
x_np = x_np[permutation] tx=Tensor(x[i:i+BATCH_SIZE])
ty=Tensor(y[i:i+BATCH_SIZE])
train = x_np[30:]
test = x_np[0:30]
print("Train:"+str(len(train)))
print("Test:"+str(len(test)))
#model
gen = model.Gen()
dif = model.Check()
genOpt = nn.optim.AdamW(nn.state.get_parameters(gen), lr=glr)
difOpt = nn.optim.AdamW(nn.state.get_parameters(dif), lr=dlr)
#train
@TinyJit
def step_dis(x:Tensor):
Tensor.training = True
real = Tensor.ones((BATCH_SIZE,1))
fake = Tensor.zeros((BACH_SIZE,1))
noise = Tensor.randn(BACH_SIZE, gen.ld)
fake_data = gen(noise).detach()
fake_loss = dif(fake_data).binary_crossentropy_logits(fake)
real_loss = dif(x).binary_crossentropy_logits(real)
loss = (fake_loss + real_loss)/2
loss.backward()
difOpt.step()
return loss.numpy()
@TinyJit
def step_gen():
Tensor.training = True
real = Tensor.ones((BATCH_SIZE,1))
noise = Tensor.randn(BACH_SIZE, gen.ld)
fake_data = gen(noise).detach()
loss = dif(fake_data).binary_crossentropy_logits(real)
loss.backward()
genOpt.step()
return loss.numpy()
eshape = (BACH_SIZE, 1, 128, 216)
mlflow.log_param("generator_learning_rate", glr)
mlflow.log_param("discim_learning_rate", dlr)
mlflow.log_param("epochs", epochs)
mlflow.log_param("train size", len(train))
mlflow.log_param("test size", len(test))
for e in range(0,epochs):
print(f"\n--- Starting Epoch {e} ---\n")
dl=0
gl=0
for i in range(0,size,BACH_SIZE):
tx=Tensor(train[i:i+BACH_SIZE])
if(tx.shape != eshape): if(tx.shape != eshape):
continue continue
#steps loss += step_gen(tx,ty)
dl+=step_dis(tx)
gl+=step_gen()
dl /= (size/BACH_SIZE) loss /= (len(x)/BATCH_SIZE)
gl /= (size/BACH_SIZE) if epoch%5==0:
if e%5==0: show.logSpec(mdl(Tensor(x[0:1])).numpy()[0][0],epoch)
noise = Tensor.randn(BACH_SIZE, gen.ld) if epoch%15==0:
show.logSpec(gen(noise).numpy()[0][0],e) state_dict = get_state_dict(mdl)
#todo test on test data safe_save(state_dict, f"model_{epoch}.safetensors")
mlflow.log_metric("gen_loss", gl, step=e) show.logSpec(mdl(mdl(mdl(Tensor(y[0:1])))).numpy()[0][0],f"deep_{epoch}")
mlflow.log_metric("dis_loss", dl, step=e)
print(f"loss of gen:{gl} dis:{dl}")
mlflow.log_metric("loss", loss, step=epoch)
print(f"loss of {loss}")
#save show.logSpec(mdl(Tensor(x[0:1])).numpy()[0][0],EPOCHS)
noise = Tensor.randn(BACH_SIZE, gen.ld) state_dict = get_state_dict(mdl)
show.logSpec(gen(noise).numpy()[0][0],epochs) safe_save(state_dict, "model.safetensors")
from tinygrad.nn.state import safe_save, get_state_dict
safe_save(get_state_dict(gen),"music.safetensors")
mlflow.log_artifact("music.safetensors")