Compare commits
8 Commits
1a328d313f
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| bfdcc8311f | |||
| 43b64e6ca3 | |||
| 6e0b3882bc | |||
| b076a0d123 | |||
| 579b37cd70 | |||
| 64e66260ec | |||
| df4cdc8e25 | |||
| c84c100cb8 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -3,3 +3,4 @@
|
|||||||
/data/
|
/data/
|
||||||
/music.safetensors
|
/music.safetensors
|
||||||
/data.npz
|
/data.npz
|
||||||
|
*.safetensors
|
||||||
|
|||||||
16
data.py
16
data.py
@@ -6,7 +6,15 @@ import mlflow
|
|||||||
|
|
||||||
SAMPLE_RATE = 22050
|
SAMPLE_RATE = 22050
|
||||||
|
|
||||||
#@mlflow.trace
|
def spec_to_audio(spec):
|
||||||
|
"""
|
||||||
|
Convert a normalized mel-spectrogram back to audio.
|
||||||
|
"""
|
||||||
|
spec = (spec * 80) - 80
|
||||||
|
spec = librosa.db_to_amplitude(spec)*80
|
||||||
|
audio = librosa.feature.inverse.mel_to_audio(spec,sr=SAMPLE_RATE)
|
||||||
|
return audio
|
||||||
|
|
||||||
def process_file(file_path):
|
def process_file(file_path):
|
||||||
"""
|
"""
|
||||||
Load 10 second chunks single song.
|
Load 10 second chunks single song.
|
||||||
@@ -22,12 +30,9 @@ def process_file(file_path):
|
|||||||
end = start_pos + size
|
end = start_pos + size
|
||||||
if end <= sample_len:
|
if end <= sample_len:
|
||||||
chunk = y[start_pos:end]
|
chunk = y[start_pos:end]
|
||||||
#chunk = librosa.feature.melspectrogram(y=chunk,sr=SAMPLE_RATE)
|
|
||||||
#chunk = ((librosa.amplitude_to_db(chunk,ref=np.max)+40)/40)
|
|
||||||
file_chunks.append(chunk)
|
file_chunks.append(chunk)
|
||||||
return file_chunks
|
return file_chunks
|
||||||
|
|
||||||
#@mlflow.trace
|
|
||||||
def load():
|
def load():
|
||||||
"""
|
"""
|
||||||
Load 10 second chunks of songs.
|
Load 10 second chunks of songs.
|
||||||
@@ -41,9 +46,6 @@ def load():
|
|||||||
audio.extend(l)
|
audio.extend(l)
|
||||||
return audio
|
return audio
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
##DEP
|
|
||||||
def audio_split(audio):
|
def audio_split(audio):
|
||||||
"""
|
"""
|
||||||
Split 10 seconds of audio to 2 5 second clips
|
Split 10 seconds of audio to 2 5 second clips
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ import numpy as np
|
|||||||
|
|
||||||
x,y = data.dataset(data.load())
|
x,y = data.dataset(data.load())
|
||||||
size=len(x)
|
size=len(x)
|
||||||
|
print(size)
|
||||||
|
|
||||||
x_np = np.stack(x)
|
x_np = np.stack(x)
|
||||||
x_np = np.expand_dims(x_np, axis=1)
|
x_np = np.expand_dims(x_np, axis=1)
|
||||||
|
|||||||
112
model.py
112
model.py
@@ -1,34 +1,94 @@
|
|||||||
from tinygrad import Tensor, nn
|
from tinygrad import Tensor, nn
|
||||||
|
|
||||||
class Gen:
|
class gen:
|
||||||
def __init__(self, height=128, width=216, latent_dim=128):
|
def __init__(self, input_channels=1, height=128, width=216, latent_dim=1024):
|
||||||
self.w = width // 4
|
self.height = height
|
||||||
self.h = height // 4
|
self.width = width
|
||||||
self.flat = 128 * self.h * self.w
|
self.latent_dim = latent_dim
|
||||||
self.ld = latent_dim
|
|
||||||
self.d1 = nn.Linear(latent_dim, self.flat)
|
|
||||||
self.d2 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)
|
|
||||||
self.d3 = nn.ConvTranspose2d(64, 1, kernel_size=3, stride=2, padding=1, output_padding=1)
|
|
||||||
|
|
||||||
def __call__(self, noise: Tensor) -> Tensor:
|
self.w = width // 8
|
||||||
x = self.d1(noise).relu()
|
self.h = height // 8
|
||||||
x = x.reshape(noise.shape[0], 128, self.h, self.w)
|
self.flattened_size = 256 * self.h * self.w
|
||||||
x = self.d2(x).relu()
|
|
||||||
x = self.d3(x)
|
self.num_tokens = 16
|
||||||
return x.tanh()
|
self.dim_per_token = self.latent_dim // self.num_tokens
|
||||||
|
|
||||||
|
|
||||||
class Check:
|
self.e1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1)
|
||||||
def __init__(self, height=128, width=216):
|
|
||||||
self.w = width // 4
|
|
||||||
self.h = height // 4
|
|
||||||
self.flat = 128 * self.h * self.w
|
|
||||||
self.e1 = nn.Conv2d(1, 64, kernel_size=3, stride=2, padding=1)
|
|
||||||
self.e2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
|
self.e2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
|
||||||
self.out = nn.Linear(self.flat, 1)
|
self.e3 = nn.Conv2d(128,256, kernel_size=3,stride=2,padding=1)
|
||||||
|
|
||||||
|
|
||||||
|
self.el = nn.Linear(self.flattened_size, self.latent_dim)
|
||||||
|
|
||||||
|
self.q = nn.Linear(self.dim_per_token,self.dim_per_token)
|
||||||
|
self.k = nn.Linear(self.dim_per_token,self.dim_per_token)
|
||||||
|
self.v = nn.Linear(self.dim_per_token,self.dim_per_token)
|
||||||
|
self.norm1 = nn.LayerNorm(self.dim_per_token)
|
||||||
|
|
||||||
|
ffn_dim = self.dim_per_token * 4
|
||||||
|
self.ffn1 = nn.Linear(self.dim_per_token, ffn_dim)
|
||||||
|
self.ffn2 = nn.Linear(ffn_dim, self.dim_per_token)
|
||||||
|
self.norm2 = nn.LayerNorm(self.dim_per_token)
|
||||||
|
|
||||||
|
self.dl = nn.Linear(self.latent_dim, self.flattened_size)
|
||||||
|
|
||||||
|
self.d1 = nn.ConvTranspose2d(256,128,kernel_size=3,stride=2,padding=1,output_padding=1)
|
||||||
|
self.d2 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)
|
||||||
|
self.d3 = nn.ConvTranspose2d(64, input_channels, kernel_size=3, stride=2, padding=1, output_padding=1)
|
||||||
|
|
||||||
def __call__(self, x: Tensor) -> Tensor:
|
def __call__(self, x: Tensor) -> Tensor:
|
||||||
x = self.e1(x).relu()
|
y, shape = self.encode(x)
|
||||||
x = self.e2(x).relu()
|
z = self.atten(y)
|
||||||
x = x.reshape(x.shape[0], -1)
|
return self.decode(z, shape)
|
||||||
return self.out(x)#.sigmoid()
|
|
||||||
|
def encode(self, x: Tensor):
|
||||||
|
x = self.e1(x).leakyrelu()
|
||||||
|
x = self.e2(x).leakyrelu()
|
||||||
|
x = self.e3(x).leakyrelu()
|
||||||
|
b, c, h, w = x.shape
|
||||||
|
|
||||||
|
flattened_size = c * h * w
|
||||||
|
x = x.reshape(shape=(b, flattened_size))
|
||||||
|
z = self.el(x)
|
||||||
|
|
||||||
|
# reshape to multi-token: (batch, num_tokens, dim_per_token)
|
||||||
|
z = z.reshape(shape=(b, self.num_tokens, self.dim_per_token))
|
||||||
|
return z, (c, h, w)
|
||||||
|
|
||||||
|
def atten(self, x: Tensor):
|
||||||
|
q = self.q(x)
|
||||||
|
k = self.k(x)
|
||||||
|
v = self.v(x)
|
||||||
|
attn = q.scaled_dot_product_attention(k, v)
|
||||||
|
x = self.norm1(x+attn)
|
||||||
|
|
||||||
|
ffn = self.ffn1(x).relu()
|
||||||
|
ffn = self.ffn2(ffn)
|
||||||
|
x = self.norm2(x+ffn)
|
||||||
|
|
||||||
|
return x
|
||||||
|
|
||||||
|
def decode(self, z: Tensor, shape):
|
||||||
|
z = z.reshape(shape=(z.shape[0], -1))
|
||||||
|
x = self.dl(z).leakyrelu()
|
||||||
|
x = x.reshape(shape=(-1, 256, self.h, self.w))
|
||||||
|
x = self.d1(x).leakyrelu()
|
||||||
|
x = self.d2(x).leakyrelu()
|
||||||
|
x = self.d3(x).sigmoid()
|
||||||
|
|
||||||
|
# Crop or pad to match input size
|
||||||
|
out_h, out_w = x.shape[2], x.shape[3]
|
||||||
|
if out_h > self.height:
|
||||||
|
x = x[:, :, :self.height, :]
|
||||||
|
elif out_h < self.height:
|
||||||
|
pad_h = self.height - out_h
|
||||||
|
x = x.pad2d((0, 0, 0, pad_h))
|
||||||
|
|
||||||
|
if out_w > self.width:
|
||||||
|
x = x[:, :, :, :self.width]
|
||||||
|
elif out_w < self.width:
|
||||||
|
pad_w = self.width - out_w
|
||||||
|
x = x.pad2d((0, pad_w, 0, 0))
|
||||||
|
|
||||||
|
return x
|
||||||
|
|||||||
43
run.py
Normal file
43
run.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
import numpy as np
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
from tinygrad import Tensor, nn
|
||||||
|
from tinygrad.nn.state import safe_load, load_state_dict
|
||||||
|
import librosa
|
||||||
|
import sounddevice as sd
|
||||||
|
from model import gen
|
||||||
|
from data import spec_to_audio
|
||||||
|
|
||||||
|
SAMPLE_RATE = 22050
|
||||||
|
|
||||||
|
def load_model(filepath="model.safetensors"):
|
||||||
|
"""Loads the model structure and weights."""
|
||||||
|
model = gen()
|
||||||
|
state_dict = safe_load(filepath)
|
||||||
|
load_state_dict(model, state_dict)
|
||||||
|
return model
|
||||||
|
|
||||||
|
def load_data(filepath="data.npz"):
|
||||||
|
"""Loads the pre-processed spectrogram data."""
|
||||||
|
print(f"Loading data from {filepath}...")
|
||||||
|
data = np.load(filepath)
|
||||||
|
x = data["arr_0"]
|
||||||
|
return x
|
||||||
|
|
||||||
|
def play_spec(spec,i):
|
||||||
|
"""Converts a spectrogram numpy array to audio and plays it."""
|
||||||
|
audio = spec_to_audio(spec)
|
||||||
|
sd.wait()
|
||||||
|
print(f"chunk:{i}")
|
||||||
|
sd.play(audio, samplerate=SAMPLE_RATE)
|
||||||
|
|
||||||
|
def run_prediction_loop(model, data_x):
|
||||||
|
current_spect = data_x[0:1]
|
||||||
|
for i in range(10):
|
||||||
|
play_spec(current_spect[0][0],i)
|
||||||
|
current_spect = model(Tensor(current_spect)).numpy()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
model = load_model()
|
||||||
|
data_x = load_data()
|
||||||
|
run_prediction_loop(model, data_x)
|
||||||
8
shell.nix
Normal file
8
shell.nix
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
{pkgs ? import <nixpkgs> {}}:
|
||||||
|
with pkgs;
|
||||||
|
mkShell rec {
|
||||||
|
packages = [python3 jupyter-all python3Packages.librosa python3Packages.tinygrad python3Packages.numpy python3Packages.mlflow python3Packages.tqdm python3Packages.sounddevice];
|
||||||
|
nativeBuildInputs = [];
|
||||||
|
buildInputs = [];
|
||||||
|
LD_LIBRARY_PATH = lib.makeLibraryPath buildInputs;
|
||||||
|
}
|
||||||
156
train.py
156
train.py
@@ -1,106 +1,72 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
# coding: utf-8
|
|
||||||
import data
|
|
||||||
import model as model
|
|
||||||
import show
|
|
||||||
import mlflow
|
import mlflow
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from tinygrad import nn,TinyJit,Tensor
|
from tinygrad import Device,Tensor,nn,TinyJit
|
||||||
|
from tinygrad.nn.state import safe_save, get_state_dict
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import time
|
||||||
|
import show
|
||||||
|
from model import gen
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
BATCH_SIZE = 16
|
||||||
|
EPOCHS = 100
|
||||||
|
LEARNING_RATE = 3e-4
|
||||||
|
print(Device.DEFAULT)
|
||||||
|
mdl = gen()
|
||||||
|
opt = nn.optim.AdamW(nn.state.get_parameters(mdl), lr=LEARNING_RATE)
|
||||||
|
|
||||||
|
def spec_loss(pred, target, eps=1e-6):
|
||||||
|
# spectral convergence
|
||||||
|
sc = ((target - pred).square().sum()) ** 0.5 / ((target.square().sum()) ** 0.5 + eps)
|
||||||
|
# log magnitude difference
|
||||||
|
log_mag = ((target.abs() + eps).log() - (pred.abs() + eps).log()).abs().mean()
|
||||||
|
return 0.1*sc + 1.0*log_mag + 0.1*(pred - target).abs().mean()
|
||||||
|
|
||||||
|
|
||||||
|
@TinyJit
|
||||||
|
def step_gen(x,y):
|
||||||
|
Tensor.training = True
|
||||||
|
z = mdl(x)
|
||||||
|
loss = spec_loss(z,y)
|
||||||
|
#loss = (y - z).abs().mean()
|
||||||
|
opt.zero_grad()
|
||||||
|
loss.backward()
|
||||||
|
opt.step()
|
||||||
|
return loss.numpy()
|
||||||
|
|
||||||
|
print("loading")
|
||||||
|
x = np.load("data.npz")["arr_0"]
|
||||||
|
y = np.load("data.npz")["arr_1"]
|
||||||
|
run_name = f"vae_{int(time.time())}"
|
||||||
mlflow.set_tracking_uri("http://127.0.0.1:5000")
|
mlflow.set_tracking_uri("http://127.0.0.1:5000")
|
||||||
mlflow.start_run(experiment_id=804883409598823668)
|
mlflow.start_run()
|
||||||
#hyper
|
mlflow.log_params({"batch_size": BATCH_SIZE, "epochs": EPOCHS, "lr": LEARNING_RATE, "data size":len(x)})
|
||||||
BACH_SIZE=32
|
|
||||||
BATCH_SIZE=BACH_SIZE
|
|
||||||
glr=2e-4
|
|
||||||
dlr=1e-5
|
|
||||||
epochs=100
|
|
||||||
|
|
||||||
|
show.logSpec(Tensor(x[0:1]).numpy()[0][0],"default")
|
||||||
|
|
||||||
#dataset
|
print("training")
|
||||||
x = data.load()
|
eshape = (BATCH_SIZE, 1, 128, 216)
|
||||||
size=len(x)
|
for epoch in range(0,EPOCHS):
|
||||||
x_np = np.stack(x)
|
print(f"\n--- Starting Epoch {epoch} ---\n")
|
||||||
x_np = np.expand_dims(x_np, axis=1)
|
loss=0
|
||||||
permutation = np.random.permutation(size)
|
for i in tqdm(range(0,len(x),BATCH_SIZE)):
|
||||||
x_np = x_np[permutation]
|
tx=Tensor(x[i:i+BATCH_SIZE])
|
||||||
|
ty=Tensor(y[i:i+BATCH_SIZE])
|
||||||
train = x_np[30:]
|
|
||||||
test = x_np[0:30]
|
|
||||||
|
|
||||||
print("Train:"+str(len(train)))
|
|
||||||
print("Test:"+str(len(test)))
|
|
||||||
|
|
||||||
|
|
||||||
#model
|
|
||||||
gen = model.Gen()
|
|
||||||
dif = model.Check()
|
|
||||||
genOpt = nn.optim.AdamW(nn.state.get_parameters(gen), lr=glr)
|
|
||||||
difOpt = nn.optim.AdamW(nn.state.get_parameters(dif), lr=dlr)
|
|
||||||
|
|
||||||
|
|
||||||
#train
|
|
||||||
|
|
||||||
@TinyJit
|
|
||||||
def step_dis(x:Tensor):
|
|
||||||
Tensor.training = True
|
|
||||||
real = Tensor.ones((BATCH_SIZE,1))
|
|
||||||
fake = Tensor.zeros((BACH_SIZE,1))
|
|
||||||
noise = Tensor.randn(BACH_SIZE, gen.ld)
|
|
||||||
fake_data = gen(noise).detach()
|
|
||||||
fake_loss = dif(fake_data).binary_crossentropy_logits(fake)
|
|
||||||
real_loss = dif(x).binary_crossentropy_logits(real)
|
|
||||||
loss = (fake_loss + real_loss)/2
|
|
||||||
loss.backward()
|
|
||||||
difOpt.step()
|
|
||||||
return loss.numpy()
|
|
||||||
|
|
||||||
@TinyJit
|
|
||||||
def step_gen():
|
|
||||||
Tensor.training = True
|
|
||||||
real = Tensor.ones((BATCH_SIZE,1))
|
|
||||||
noise = Tensor.randn(BACH_SIZE, gen.ld)
|
|
||||||
fake_data = gen(noise).detach()
|
|
||||||
loss = dif(fake_data).binary_crossentropy_logits(real)
|
|
||||||
loss.backward()
|
|
||||||
genOpt.step()
|
|
||||||
return loss.numpy()
|
|
||||||
|
|
||||||
|
|
||||||
eshape = (BACH_SIZE, 1, 128, 216)
|
|
||||||
|
|
||||||
mlflow.log_param("generator_learning_rate", glr)
|
|
||||||
mlflow.log_param("discim_learning_rate", dlr)
|
|
||||||
mlflow.log_param("epochs", epochs)
|
|
||||||
mlflow.log_param("train size", len(train))
|
|
||||||
mlflow.log_param("test size", len(test))
|
|
||||||
for e in range(0,epochs):
|
|
||||||
print(f"\n--- Starting Epoch {e} ---\n")
|
|
||||||
dl=0
|
|
||||||
gl=0
|
|
||||||
|
|
||||||
for i in range(0,size,BACH_SIZE):
|
|
||||||
tx=Tensor(train[i:i+BACH_SIZE])
|
|
||||||
if(tx.shape != eshape):
|
if(tx.shape != eshape):
|
||||||
continue
|
continue
|
||||||
#steps
|
loss += step_gen(tx,ty)
|
||||||
dl+=step_dis(tx)
|
|
||||||
gl+=step_gen()
|
|
||||||
|
|
||||||
dl /= (size/BACH_SIZE)
|
loss /= (len(x)/BATCH_SIZE)
|
||||||
gl /= (size/BACH_SIZE)
|
if epoch%5==0:
|
||||||
if e%5==0:
|
show.logSpec(mdl(Tensor(x[0:1])).numpy()[0][0],epoch)
|
||||||
noise = Tensor.randn(BACH_SIZE, gen.ld)
|
if epoch%15==0:
|
||||||
show.logSpec(gen(noise).numpy()[0][0],e)
|
state_dict = get_state_dict(mdl)
|
||||||
#todo test on test data
|
safe_save(state_dict, f"model_{epoch}.safetensors")
|
||||||
mlflow.log_metric("gen_loss", gl, step=e)
|
show.logSpec(mdl(mdl(mdl(Tensor(y[0:1])))).numpy()[0][0],f"deep_{epoch}")
|
||||||
mlflow.log_metric("dis_loss", dl, step=e)
|
|
||||||
print(f"loss of gen:{gl} dis:{dl}")
|
|
||||||
|
|
||||||
|
mlflow.log_metric("loss", loss, step=epoch)
|
||||||
|
print(f"loss of {loss}")
|
||||||
|
|
||||||
#save
|
show.logSpec(mdl(Tensor(x[0:1])).numpy()[0][0],EPOCHS)
|
||||||
noise = Tensor.randn(BACH_SIZE, gen.ld)
|
state_dict = get_state_dict(mdl)
|
||||||
show.logSpec(gen(noise).numpy()[0][0],epochs)
|
safe_save(state_dict, "model.safetensors")
|
||||||
from tinygrad.nn.state import safe_save, get_state_dict
|
|
||||||
safe_save(get_state_dict(gen),"music.safetensors")
|
|
||||||
mlflow.log_artifact("music.safetensors")
|
|
||||||
|
|||||||
Reference in New Issue
Block a user