Compare commits

...

6 Commits

Author SHA1 Message Date
k
bfdcc8311f ignore safetensors 2025-11-12 12:15:59 -05:00
k
43b64e6ca3 add shell.nix 2025-11-12 12:15:41 -05:00
k
6e0b3882bc added transformer block in latent space 2025-11-12 12:13:03 -05:00
k
b076a0d123 add status bar for epoch progress 2025-11-12 12:12:26 -05:00
k
579b37cd70 add player script and fix bug 2025-11-12 12:11:57 -05:00
k
64e66260ec simple vae style model 2025-11-12 12:10:52 -05:00
7 changed files with 128 additions and 55 deletions

1
.gitignore vendored
View File

@ -3,3 +3,4 @@
/data/ /data/
/music.safetensors /music.safetensors
/data.npz /data.npz
*.safetensors

19
data.py
View File

@ -6,7 +6,15 @@ import mlflow
SAMPLE_RATE = 22050 SAMPLE_RATE = 22050
def spec_to_audio(spec):
    """
    Turn a normalized mel-spectrogram back into an audio signal.

    The normalized values are first mapped to a dB scale with
    ``spec * 80 - 80``, converted from dB to amplitude, scaled by 80, and
    finally inverted to audio at ``SAMPLE_RATE``.
    """
    decibels = spec * 80 - 80
    magnitude = librosa.db_to_amplitude(decibels) * 80
    return librosa.feature.inverse.mel_to_audio(magnitude, sr=SAMPLE_RATE)
def process_file(file_path): def process_file(file_path):
""" """
Load 10 second chunks single song. Load 10 second chunks single song.
@ -22,15 +30,9 @@ def process_file(file_path):
end = start_pos + size end = start_pos + size
if end <= sample_len: if end <= sample_len:
chunk = y[start_pos:end] chunk = y[start_pos:end]
chunk = librosa.feature.melspectrogram(y=chunk, sr=SAMPLE_RATE)
chunk = ((librosa.amplitude_to_db(chunk,ref=np.max)+80)/80)
#chunk = librosa.feature.melspectrogram(y=chunk,sr=SAMPLE_RATE)
#chunk = ((librosa.amplitude_to_db(chunk,ref=np.max)+40)/40)
file_chunks.append(chunk) file_chunks.append(chunk)
return file_chunks return file_chunks
#@mlflow.trace
def load(): def load():
""" """
Load 10 second chunks of songs. Load 10 second chunks of songs.
@ -44,9 +46,6 @@ def load():
audio.extend(l) audio.extend(l)
return audio return audio
##DEP
def audio_split(audio): def audio_split(audio):
""" """
Split 10 seconds of audio to 2 5 second clips Split 10 seconds of audio to 2 5 second clips

View File

@ -1,14 +1,14 @@
import data import data
import numpy as np import numpy as np
x = data.load() x,y = data.dataset(data.load())
size=len(x) size=len(x)
print(size) print(size)
x_np = np.stack(x) x_np = np.stack(x)
x_np = np.expand_dims(x_np, axis=1) x_np = np.expand_dims(x_np, axis=1)
#y_np = np.stack(y) y_np = np.stack(y)
#y_np = np.expand_dims(y_np, axis=1) y_np = np.expand_dims(y_np, axis=1)
np.savez_compressed("data",x_np) np.savez_compressed("data",x_np,y_np)

View File

@ -1,31 +1,41 @@
from tinygrad import Tensor, nn from tinygrad import Tensor, nn
class gen: class gen:
def __init__(self, input_channels=1, height=128, width=431, latent_dim=64): def __init__(self, input_channels=1, height=128, width=216, latent_dim=1024):
self.height = height self.height = height
self.width = width self.width = width
self.latent_dim = latent_dim self.latent_dim = latent_dim
self.w = width // 4 self.w = width // 8
self.h = height // 4 self.h = height // 8
self.h = 32 # Output height after 2 strides self.flattened_size = 256 * self.h * self.w
self.w = 108 # Output width after 2 strides
self.flattened_size = 128 * self.h * self.w self.num_tokens = 16
self.dim_per_token = self.latent_dim // self.num_tokens
self.e1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1) self.e1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1)
self.e2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1) self.e2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
self.e3 = nn.Conv2d(128,256, kernel_size=3,stride=2,padding=1)
self.el = nn.Linear(self.flattened_size, self.latent_dim) self.el = nn.Linear(self.flattened_size, self.latent_dim)
self.q = nn.Linear(self.latent_dim,self.latent_dim) self.q = nn.Linear(self.dim_per_token,self.dim_per_token)
self.k = nn.Linear(self.latent_dim,self.latent_dim) self.k = nn.Linear(self.dim_per_token,self.dim_per_token)
self.v = nn.Linear(self.latent_dim,self.latent_dim) self.v = nn.Linear(self.dim_per_token,self.dim_per_token)
self.norm1 = nn.LayerNorm(self.dim_per_token)
ffn_dim = self.dim_per_token * 4
self.ffn1 = nn.Linear(self.dim_per_token, ffn_dim)
self.ffn2 = nn.Linear(ffn_dim, self.dim_per_token)
self.norm2 = nn.LayerNorm(self.dim_per_token)
self.dl = nn.Linear(self.latent_dim, self.flattened_size) self.dl = nn.Linear(self.latent_dim, self.flattened_size)
self.d1 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1) self.d1 = nn.ConvTranspose2d(256,128,kernel_size=3,stride=2,padding=1,output_padding=1)
self.d2 = nn.ConvTranspose2d(64, input_channels, kernel_size=3, stride=2, padding=1, output_padding=1) self.d2 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)
self.d3 = nn.ConvTranspose2d(64, input_channels, kernel_size=3, stride=2, padding=1, output_padding=1)
def __call__(self, x: Tensor) -> Tensor: def __call__(self, x: Tensor) -> Tensor:
y, shape = self.encode(x) y, shape = self.encode(x)
@ -35,26 +45,37 @@ class gen:
def encode(self, x: Tensor): def encode(self, x: Tensor):
x = self.e1(x).leakyrelu() x = self.e1(x).leakyrelu()
x = self.e2(x).leakyrelu() x = self.e2(x).leakyrelu()
x = self.e3(x).leakyrelu()
b, c, h, w = x.shape b, c, h, w = x.shape
flattened_size = c * h * w flattened_size = c * h * w
x = x.reshape(shape=(b, flattened_size)) x = x.reshape(shape=(b, flattened_size))
z = self.el(x) z = self.el(x)
# reshape to multi-token: (batch, num_tokens, dim_per_token)
z = z.reshape(shape=(b, self.num_tokens, self.dim_per_token))
return z, (c, h, w) return z, (c, h, w)
def atten(self, x: Tensor): def atten(self, x: Tensor):
q = self.q(x).relu() q = self.q(x)
k = self.k(x).relu() k = self.k(x)
v = self.v(x).relu() v = self.v(x)
return q.scaled_dot_product_attention(k,v) attn = q.scaled_dot_product_attention(k, v)
x = self.norm1(x+attn)
ffn = self.ffn1(x).relu()
ffn = self.ffn2(ffn)
x = self.norm2(x+ffn)
return x
def decode(self, z: Tensor, shape): def decode(self, z: Tensor, shape):
z = z.reshape(shape=(z.shape[0], -1))
x = self.dl(z).leakyrelu() x = self.dl(z).leakyrelu()
x = x.reshape(shape=(-1, 128, self.h, self.w)) x = x.reshape(shape=(-1, 256, self.h, self.w))
x = self.d1(x).leakyrelu() x = self.d1(x).leakyrelu()
x = self.d2(x).sigmoid() x = self.d2(x).leakyrelu()
x = self.d3(x).sigmoid()
# Crop or pad to match input size # Crop or pad to match input size
out_h, out_w = x.shape[2], x.shape[3] out_h, out_w = x.shape[2], x.shape[3]

43
run.py Normal file
View File

@ -0,0 +1,43 @@
import numpy as np
import random
import time
from tinygrad import Tensor, nn
from tinygrad.nn.state import safe_load, load_state_dict
import librosa
import sounddevice as sd
from model import gen
from data import spec_to_audio
SAMPLE_RATE = 22050
def load_model(filepath="model.safetensors"):
    """Build a ``gen`` model and fill it with the weights stored at *filepath*.

    Returns the model instance after its state dict has been loaded.
    """
    network = gen()
    # Read the safetensors file and copy its tensors into the fresh model.
    load_state_dict(network, safe_load(filepath))
    return network
def load_data(filepath="data.npz"):
    """Load the pre-processed spectrogram array from an ``.npz`` archive.

    Args:
        filepath: Path to the ``.npz`` file; the array is read from its
            first positional entry (``"arr_0"``).

    Returns:
        The array stored under ``"arr_0"``.
    """
    print(f"Loading data from {filepath}...")
    # np.load on an .npz returns a lazy NpzFile that keeps the archive open;
    # the context manager closes it once the array has been read into memory.
    with np.load(filepath) as data:
        return data["arr_0"]
def play_spec(spec,i):
    """Convert one spectrogram array to audio and start playing it.

    ``i`` is only used to label the chunk in the console output.
    """
    waveform = spec_to_audio(spec)
    # Let any clip that is still playing finish before starting this one.
    sd.wait()
    print(f"chunk:{i}")
    sd.play(waveform, samplerate=SAMPLE_RATE)
def run_prediction_loop(model, data_x, steps=10):
    """Play the first spectrogram, then keep feeding the model its own output.

    Each iteration plays the current spectrogram and replaces it with the
    model's prediction for it, so later chunks are generated from earlier
    generated chunks.

    Args:
        model: Callable taking a ``Tensor`` and returning a ``Tensor``.
        data_x: Array of spectrograms; only the first entry is used as the
            seed (presumably shaped (batch, channel, height, width) -
            confirm against the data-prep script).
        steps: Number of chunks to play (defaults to the original 10).
    """
    # Slice (rather than index) so the leading batch axis is kept.
    current_spect = data_x[0:1]
    for i in range(steps):
        play_spec(current_spect[0][0], i)
        current_spect = model(Tensor(current_spect)).numpy()
    # sd.play() returns immediately; without this wait the caller could exit
    # while the final chunk is still playing and cut it off.
    sd.wait()
if __name__ == "__main__":
    # Script entry point: restore the trained model, load the spectrogram
    # data, then start the playback/prediction loop.
    model, data_x = load_model(), load_data()
    run_prediction_loop(model, data_x)

8
shell.nix Normal file
View File

@ -0,0 +1,8 @@
# Development shell: Python 3 plus the libraries the project scripts import.
{pkgs ? import <nixpkgs> {}}:
pkgs.mkShell rec {
  packages = with pkgs; [
    python3
    jupyter-all
    python3Packages.librosa
    python3Packages.tinygrad
    python3Packages.numpy
    python3Packages.mlflow
    python3Packages.tqdm
    python3Packages.sounddevice
  ];
  nativeBuildInputs = [];
  buildInputs = [];
  # Built from buildInputs (currently empty), so this is a placeholder for
  # native libraries added there later.
  LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath buildInputs;
}

View File

@ -1,34 +1,34 @@
import mlflow import mlflow
import numpy as np import numpy as np
from tinygrad import Device,Tensor,nn,TinyJit from tinygrad import Device,Tensor,nn,TinyJit
from tinygrad.nn.state import safe_save, get_state_dict
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import time import time
import show import show
from model import gen from model import gen
from tqdm import tqdm
BATCH_SIZE = 16 BATCH_SIZE = 16
EPOCHS = 100 EPOCHS = 100
LEARNING_RATE = 1e-5 LEARNING_RATE = 3e-4
print(Device.DEFAULT) print(Device.DEFAULT)
mdl = gen() mdl = gen()
opt = nn.optim.AdamW(nn.state.get_parameters(mdl), lr=LEARNING_RATE) opt = nn.optim.AdamW(nn.state.get_parameters(mdl), lr=LEARNING_RATE)
volume = 0.1
def spec_loss(pred, target, eps=1e-6): def spec_loss(pred, target, eps=1e-6):
# spectral convergence # spectral convergence
sc = ((target - pred).square().sum()) ** 0.5 / ((target.square().sum()) ** 0.5 + eps) sc = ((target - pred).square().sum()) ** 0.5 / ((target.square().sum()) ** 0.5 + eps)
# log magnitude difference # log magnitude difference
log_mag = ((target.abs() + eps).log() - (pred.abs() + eps).log()).abs().mean() log_mag = ((target.abs() + eps).log() - (pred.abs() + eps).log()).abs().mean()
return sc + log_mag return 0.1*sc + 1.0*log_mag + 0.1*(pred - target).abs().mean()
@TinyJit @TinyJit
def step_gen(x): def step_gen(x,y):
Tensor.training = True Tensor.training = True
noise = Tensor.rand_like(x).tanh() z = mdl(x)
y = x+(noise*volume) loss = spec_loss(z,y)
y = y.clamp(0,1) #loss = (y - z).abs().mean()
loss = spec_loss(mdl(y),x)
opt.zero_grad() opt.zero_grad()
loss.backward() loss.backward()
opt.step() opt.step()
@ -36,8 +36,8 @@ def step_gen(x):
print("loading") print("loading")
x = np.load("data.npz")["arr_0"] x = np.load("data.npz")["arr_0"]
#x= x[0:64] y = np.load("data.npz")["arr_1"]
run_name = f"tinygrad_autoencoder_{int(time.time())}" run_name = f"vae_{int(time.time())}"
mlflow.set_tracking_uri("http://127.0.0.1:5000") mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.start_run() mlflow.start_run()
mlflow.log_params({"batch_size": BATCH_SIZE, "epochs": EPOCHS, "lr": LEARNING_RATE, "data size":len(x)}) mlflow.log_params({"batch_size": BATCH_SIZE, "epochs": EPOCHS, "lr": LEARNING_RATE, "data size":len(x)})
@ -45,27 +45,28 @@ mlflow.log_params({"batch_size": BATCH_SIZE, "epochs": EPOCHS, "lr": LEARNING_RA
show.logSpec(Tensor(x[0:1]).numpy()[0][0],"default") show.logSpec(Tensor(x[0:1]).numpy()[0][0],"default")
print("training") print("training")
pl = 0 eshape = (BATCH_SIZE, 1, 128, 216)
eshape = (BATCH_SIZE, 1, 128, 431)
for epoch in range(0,EPOCHS): for epoch in range(0,EPOCHS):
print(f"\n--- Starting Epoch {epoch} ---\n") print(f"\n--- Starting Epoch {epoch} ---\n")
loss=0 loss=0
for i in range(0,len(x),BATCH_SIZE): for i in tqdm(range(0,len(x),BATCH_SIZE)):
tx=Tensor(x[i:i+BATCH_SIZE]) tx=Tensor(x[i:i+BATCH_SIZE])
ty=Tensor(y[i:i+BATCH_SIZE])
if(tx.shape != eshape): if(tx.shape != eshape):
continue continue
loss += step_gen(tx) loss += step_gen(tx,ty)
loss /= (len(x)/BATCH_SIZE) loss /= (len(x)/BATCH_SIZE)
if epoch%5==0: if epoch%5==0:
noise = Tensor.rand_like(Tensor(x[0:1])).tanh() show.logSpec(mdl(Tensor(x[0:1])).numpy()[0][0],epoch)
y = Tensor(x[0:1]) + (noise*volume) if epoch%15==0:
show.logSpec(mdl(y).numpy()[0][0],epoch) state_dict = get_state_dict(mdl)
if(pl - loss < 0.03 and epoch > 25): safe_save(state_dict, f"model_{epoch}.safetensors")
show.logSpec(y.numpy()[0][0],f"volume_{volume}") show.logSpec(mdl(mdl(mdl(Tensor(y[0:1])))).numpy()[0][0],f"deep_{epoch}")
volume *= 2
pl = loss
mlflow.log_metric("volume", volume, step=epoch)
mlflow.log_metric("loss", loss, step=epoch) mlflow.log_metric("loss", loss, step=epoch)
print(f"loss of {loss}") print(f"loss of {loss}")
show.logSpec(mdl(Tensor(x[0:1])).numpy()[0][0],EPOCHS)
state_dict = get_state_dict(mdl)
safe_save(state_dict, "model.safetensors")