simple vae style model
This commit is contained in:
parent
df4cdc8e25
commit
64e66260ec
9
data.py
9
data.py
@ -22,15 +22,9 @@ def process_file(file_path):
|
|||||||
end = start_pos + size
|
end = start_pos + size
|
||||||
if end <= sample_len:
|
if end <= sample_len:
|
||||||
chunk = y[start_pos:end]
|
chunk = y[start_pos:end]
|
||||||
chunk = librosa.feature.melspectrogram(y=chunk, sr=SAMPLE_RATE)
|
|
||||||
chunk = ((librosa.amplitude_to_db(chunk,ref=np.max)+80)/80)
|
|
||||||
|
|
||||||
#chunk = librosa.feature.melspectrogram(y=chunk,sr=SAMPLE_RATE)
|
|
||||||
#chunk = ((librosa.amplitude_to_db(chunk,ref=np.max)+40)/40)
|
|
||||||
file_chunks.append(chunk)
|
file_chunks.append(chunk)
|
||||||
return file_chunks
|
return file_chunks
|
||||||
|
|
||||||
#@mlflow.trace
|
|
||||||
def load():
|
def load():
|
||||||
"""
|
"""
|
||||||
Load 10 second chunks of songs.
|
Load 10 second chunks of songs.
|
||||||
@ -44,9 +38,6 @@ def load():
|
|||||||
audio.extend(l)
|
audio.extend(l)
|
||||||
return audio
|
return audio
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
##DEP
|
|
||||||
def audio_split(audio):
|
def audio_split(audio):
|
||||||
"""
|
"""
|
||||||
Split 10 seconds of audio to 2 5 second clips
|
Split 10 seconds of audio to 2 5 second clips
|
||||||
|
|||||||
@ -1,14 +1,14 @@
|
|||||||
import data
|
import data
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
x = data.load()
|
x,y = data.dataset(data.load())
|
||||||
size=len(x)
|
size=len(x)
|
||||||
print(size)
|
print(size)
|
||||||
|
|
||||||
x_np = np.stack(x)
|
x_np = np.stack(x)
|
||||||
x_np = np.expand_dims(x_np, axis=1)
|
x_np = np.expand_dims(x_np, axis=1)
|
||||||
|
|
||||||
#y_np = np.stack(y)
|
y_np = np.stack(y)
|
||||||
#y_np = np.expand_dims(y_np, axis=1)
|
y_np = np.expand_dims(y_np, axis=1)
|
||||||
|
|
||||||
np.savez_compressed("data",x_np)
|
np.savez_compressed("data",x_np,y_np)
|
||||||
|
|||||||
28
model.py
28
model.py
@ -1,40 +1,39 @@
|
|||||||
from tinygrad import Tensor, nn
|
from tinygrad import Tensor, nn
|
||||||
|
|
||||||
class gen:
|
class gen:
|
||||||
def __init__(self, input_channels=1, height=128, width=431, latent_dim=64):
|
def __init__(self, input_channels=1, height=128, width=216, latent_dim=1024):
|
||||||
self.height = height
|
self.height = height
|
||||||
self.width = width
|
self.width = width
|
||||||
self.latent_dim = latent_dim
|
self.latent_dim = latent_dim
|
||||||
|
|
||||||
self.w = width // 4
|
self.w = width // 8
|
||||||
self.h = height // 4
|
self.h = height // 8
|
||||||
self.h = 32 # Output height after 2 strides
|
self.flattened_size = 256 * self.h * self.w
|
||||||
self.w = 108 # Output width after 2 strides
|
|
||||||
self.flattened_size = 128 * self.h * self.w
|
|
||||||
|
|
||||||
|
|
||||||
self.e1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1)
|
self.e1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1)
|
||||||
self.e2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
|
self.e2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
|
||||||
|
self.e3 = nn.Conv2d(128,256, kernel_size=3,stride=2,padding=1)
|
||||||
|
|
||||||
|
|
||||||
self.el = nn.Linear(self.flattened_size, self.latent_dim)
|
self.el = nn.Linear(self.flattened_size, self.latent_dim)
|
||||||
|
|
||||||
self.q = nn.Linear(self.latent_dim,self.latent_dim)
|
|
||||||
self.k = nn.Linear(self.latent_dim,self.latent_dim)
|
|
||||||
self.v = nn.Linear(self.latent_dim,self.latent_dim)
|
|
||||||
|
|
||||||
self.dl = nn.Linear(self.latent_dim, self.flattened_size)
|
self.dl = nn.Linear(self.latent_dim, self.flattened_size)
|
||||||
|
|
||||||
self.d1 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)
|
self.d1 = nn.ConvTranspose2d(256,128,kernel_size=3,stride=2,padding=1,output_padding=1)
|
||||||
self.d2 = nn.ConvTranspose2d(64, input_channels, kernel_size=3, stride=2, padding=1, output_padding=1)
|
self.d2 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)
|
||||||
|
self.d3 = nn.ConvTranspose2d(64, input_channels, kernel_size=3, stride=2, padding=1, output_padding=1)
|
||||||
|
|
||||||
def __call__(self, x: Tensor) -> Tensor:
|
def __call__(self, x: Tensor) -> Tensor:
|
||||||
y, shape = self.encode(x)
|
y, shape = self.encode(x)
|
||||||
z = self.atten(y)
|
z = y#self.atten(y)
|
||||||
return self.decode(z, shape)
|
return self.decode(z, shape)
|
||||||
|
|
||||||
def encode(self, x: Tensor):
|
def encode(self, x: Tensor):
|
||||||
x = self.e1(x).leakyrelu()
|
x = self.e1(x).leakyrelu()
|
||||||
x = self.e2(x).leakyrelu()
|
x = self.e2(x).leakyrelu()
|
||||||
|
x = self.e3(x).leakyrelu()
|
||||||
b, c, h, w = x.shape
|
b, c, h, w = x.shape
|
||||||
|
|
||||||
flattened_size = c * h * w
|
flattened_size = c * h * w
|
||||||
@ -52,9 +51,10 @@ class gen:
|
|||||||
|
|
||||||
def decode(self, z: Tensor, shape):
|
def decode(self, z: Tensor, shape):
|
||||||
x = self.dl(z).leakyrelu()
|
x = self.dl(z).leakyrelu()
|
||||||
x = x.reshape(shape=(-1, 128, self.h, self.w))
|
x = x.reshape(shape=(-1, 256, self.h, self.w))
|
||||||
x = self.d1(x).leakyrelu()
|
x = self.d1(x).leakyrelu()
|
||||||
x = self.d2(x).sigmoid()
|
x = self.d2(x).leakyrelu()
|
||||||
|
x = self.d3(x).sigmoid()
|
||||||
|
|
||||||
# Crop or pad to match input size
|
# Crop or pad to match input size
|
||||||
out_h, out_w = x.shape[2], x.shape[3]
|
out_h, out_w = x.shape[2], x.shape[3]
|
||||||
|
|||||||
34
train.py
34
train.py
@ -8,27 +8,25 @@ from model import gen
|
|||||||
|
|
||||||
BATCH_SIZE = 16
|
BATCH_SIZE = 16
|
||||||
EPOCHS = 100
|
EPOCHS = 100
|
||||||
LEARNING_RATE = 1e-5
|
LEARNING_RATE = 3e-4
|
||||||
print(Device.DEFAULT)
|
print(Device.DEFAULT)
|
||||||
mdl = gen()
|
mdl = gen()
|
||||||
opt = nn.optim.AdamW(nn.state.get_parameters(mdl), lr=LEARNING_RATE)
|
opt = nn.optim.AdamW(nn.state.get_parameters(mdl), lr=LEARNING_RATE)
|
||||||
volume = 0.1
|
|
||||||
|
|
||||||
def spec_loss(pred, target, eps=1e-6):
|
def spec_loss(pred, target, eps=1e-6):
|
||||||
# spectral convergence
|
# spectral convergence
|
||||||
sc = ((target - pred).square().sum()) ** 0.5 / ((target.square().sum()) ** 0.5 + eps)
|
sc = ((target - pred).square().sum()) ** 0.5 / ((target.square().sum()) ** 0.5 + eps)
|
||||||
# log magnitude difference
|
# log magnitude difference
|
||||||
log_mag = ((target.abs() + eps).log() - (pred.abs() + eps).log()).abs().mean()
|
log_mag = ((target.abs() + eps).log() - (pred.abs() + eps).log()).abs().mean()
|
||||||
return sc + log_mag
|
return 0.1*sc + 1.0*log_mag + 0.1*(pred - target).abs().mean()
|
||||||
|
|
||||||
|
|
||||||
@TinyJit
|
@TinyJit
|
||||||
def step_gen(x):
|
def step_gen(x,y):
|
||||||
Tensor.training = True
|
Tensor.training = True
|
||||||
noise = Tensor.rand_like(x).tanh()
|
z = mdl(x)
|
||||||
y = x+(noise*volume)
|
loss = spec_loss(z,y)
|
||||||
y = y.clamp(0,1)
|
#loss = (y - z).abs().mean()
|
||||||
loss = spec_loss(mdl(y),x)
|
|
||||||
opt.zero_grad()
|
opt.zero_grad()
|
||||||
loss.backward()
|
loss.backward()
|
||||||
opt.step()
|
opt.step()
|
||||||
@ -36,8 +34,8 @@ def step_gen(x):
|
|||||||
|
|
||||||
print("loading")
|
print("loading")
|
||||||
x = np.load("data.npz")["arr_0"]
|
x = np.load("data.npz")["arr_0"]
|
||||||
#x= x[0:64]
|
y = np.load("data.npz")["arr_1"]
|
||||||
run_name = f"tinygrad_autoencoder_{int(time.time())}"
|
run_name = f"vae_{int(time.time())}"
|
||||||
mlflow.set_tracking_uri("http://127.0.0.1:5000")
|
mlflow.set_tracking_uri("http://127.0.0.1:5000")
|
||||||
mlflow.start_run()
|
mlflow.start_run()
|
||||||
mlflow.log_params({"batch_size": BATCH_SIZE, "epochs": EPOCHS, "lr": LEARNING_RATE, "data size":len(x)})
|
mlflow.log_params({"batch_size": BATCH_SIZE, "epochs": EPOCHS, "lr": LEARNING_RATE, "data size":len(x)})
|
||||||
@ -52,20 +50,18 @@ for epoch in range(0,EPOCHS):
|
|||||||
loss=0
|
loss=0
|
||||||
for i in range(0,len(x),BATCH_SIZE):
|
for i in range(0,len(x),BATCH_SIZE):
|
||||||
tx=Tensor(x[i:i+BATCH_SIZE])
|
tx=Tensor(x[i:i+BATCH_SIZE])
|
||||||
|
ty=Tensor(y[i:i+BATCH_SIZE])
|
||||||
if(tx.shape != eshape):
|
if(tx.shape != eshape):
|
||||||
continue
|
continue
|
||||||
loss += step_gen(tx)
|
loss += step_gen(tx,ty)
|
||||||
|
|
||||||
loss /= (len(x)/BATCH_SIZE)
|
loss /= (len(x)/BATCH_SIZE)
|
||||||
if epoch%5==0:
|
if epoch%5==0:
|
||||||
noise = Tensor.rand_like(Tensor(x[0:1])).tanh()
|
show.logSpec(mdl(Tensor(x[0:1])).numpy()[0][0],epoch)
|
||||||
y = Tensor(x[0:1]) + (noise*volume)
|
if epoch%15==0:
|
||||||
show.logSpec(mdl(y).numpy()[0][0],epoch)
|
state_dict = get_state_dict(mdl)
|
||||||
if(pl - loss < 0.03 and epoch > 25):
|
safe_save(state_dict, f"model_{epoch}.safetensors")
|
||||||
show.logSpec(y.numpy()[0][0],f"volume_{volume}")
|
show.logSpec(mdl(mdl(mdl(Tensor(y[0:1])))).numpy()[0][0],f"deep_{epoch}")
|
||||||
volume *= 2
|
|
||||||
pl = loss
|
|
||||||
|
|
||||||
mlflow.log_metric("volume", volume, step=epoch)
|
|
||||||
mlflow.log_metric("loss", loss, step=epoch)
|
mlflow.log_metric("loss", loss, step=epoch)
|
||||||
print(f"loss of {loss}")
|
print(f"loss of {loss}")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user