Switched to GAN; added train.py.
This commit is contained in:
parent
5df1e5df7e
commit
ccc3fa3ed4
data.py (10 changed lines)
@@ -2,15 +2,17 @@ import librosa
 import numpy as np
 from pathlib import Path
 from multiprocessing import Pool, cpu_count
+import mlflow
 
 SAMPLE_RATE = 22050
 
+@mlflow.trace
 def process_file(file_path):
     """
     Load 10-second chunks from a single song.
     """
     y, sr = librosa.load(file_path, mono=True, sr=SAMPLE_RATE)
-    size = int(SAMPLE_RATE * 10)
+    size = int(SAMPLE_RATE * 5)
     sample_len = len(y)
 
     file_chunks = []
@@ -18,9 +20,12 @@ def process_file(file_path):
         end = start_pos + size
         if end <= sample_len:
             chunk = y[start_pos:end]
+            chunk = librosa.feature.melspectrogram(y=chunk, sr=SAMPLE_RATE)
+            chunk = (librosa.amplitude_to_db(chunk, ref=np.max) + 40) / 40
             file_chunks.append(chunk)
     return file_chunks
 
+@mlflow.trace
 def load():
     """
     Load 10-second chunks of songs.
@@ -33,6 +38,9 @@ def load():
         audio.extend(l)
     return audio
 
+
+
+##DEP
 def audio_split(audio):
     """
     Split 10 seconds of audio into two 5-second clips
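A note on the normalization above: (dB + 40) / 40 maps the top 40 dB of each peak-referenced spectrogram into [0, 1] (bins quieter than -40 dB fall below 0). A minimal round-trip sketch, not part of the commit, assuming the conventions in process_file:

# Sketch only: round trip of the normalization used in process_file.
import librosa
import numpy as np

def normalize(mel):
    # [-40 dB, 0 dB] -> [0, 1], with the peak as the 0 dB reference
    db = librosa.amplitude_to_db(mel, ref=np.max)
    return (db + 40) / 40

def denormalize(spec):
    # inverse mapping: back to dB, then back to amplitude
    db = spec * 40 - 40
    return librosa.db_to_amplitude(db)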
model.py (102 changed lines)
@@ -1,84 +1,34 @@
 from tinygrad import Tensor, nn
 import numpy as np
 
 class Gen:
-    def __init__(self, input_channels=1, height=128, width=216, latent_dim=32):
-        self.w = width // 8
-        self.h = height // 8
-        self.flattened_size = 256 * self.h * self.w
-
-        # Encoder
-        self.e1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1)
-        self.e2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
-        self.e3 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)
+    def __init__(self, height=128, width=216, latent_dim=128):
+        self.w = width // 4
+        self.h = height // 4
+        self.flat = 128 * self.h * self.w
+        self.ld = latent_dim
+        self.d1 = nn.Linear(latent_dim, self.flat)
+        self.d2 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)
+        self.d3 = nn.ConvTranspose2d(64, 1, kernel_size=3, stride=2, padding=1, output_padding=1)
+
+    def __call__(self, noise: Tensor) -> Tensor:
+        x = self.d1(noise).relu()
+        x = x.reshape(noise.shape[0], 128, self.h, self.w)
+        x = self.d2(x).relu()
+        x = self.d3(x)
+        return x.tanh()
+
+
+class Check:
+    def __init__(self, height=128, width=216):
+        self.w = width // 4
+        self.h = height // 4
+        self.flat = 128 * self.h * self.w
+        self.e1 = nn.Conv2d(1, 64, kernel_size=3, stride=2, padding=1)
+        self.e2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
+        self.e3 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)
+        self.out = nn.Linear(self.flat, 2)
-
-        # VAE Latent Space
-        self.fc_mu = nn.Linear(self.flattened_size, latent_dim)
-        self.fc_logvar = nn.Linear(self.flattened_size, latent_dim)
-
-        # Decoder
-        self.dl = nn.Linear(latent_dim, self.flattened_size)
-        self.d1 = nn.ConvTranspose2d(256, 128, kernel_size=3, stride=2, padding=1, output_padding=1)
-        self.d2 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)
-        self.d3 = nn.ConvTranspose2d(64, input_channels, kernel_size=3, stride=2, padding=1, output_padding=1)
-
-    def __call__(self, x: Tensor) -> Tensor:
-        mu, log_var = self.encode(x)
-        x = self.reparameterize(mu, log_var)
-        return self.decode(x)
-
-    def __Lcall__(self, inp: Tensor, otp: Tensor, epoch) -> (Tensor, Tensor):
-        mu, log_var = self.encode(inp)
-        z = self.reparameterize(mu, log_var)
-        recon = self.decode(z)
-
-        # Normalized MSE (per-pixel)
-        recon_loss = (recon - otp).abs().mean()
-
-        # Stabilized KL
-        kl_div = -0.5 * (1 + log_var - mu.pow(2) - log_var.exp()).mean()
-
-        # Weighted loss
-        total_loss = recon_loss + min(0.1, 0.01 * epoch) * kl_div
-        return recon, total_loss
-
-    def encode(self, x: Tensor) -> (Tensor, Tensor):
-        x = self.e1(x).relu()
-        x = self.e2(x).relu()
-        x = self.e3(x).relu()
-        x = x.reshape(shape=(-1, self.flattened_size))
-        return self.fc_mu(x), self.fc_logvar(x)
-
-    def reparameterize(self, mu: Tensor, log_var: Tensor) -> Tensor:
-        log_var = log_var.clip(-10, 10)
-        std = (log_var * 0.5).exp()
-        eps = Tensor.randn(mu.shape)
-        return mu + std * eps
-
-    def decode(self, x: Tensor) -> Tensor:
-        x = self.dl(x).relu()
-        x = x.reshape(shape=(-1, 256, self.h, self.w))
-        x = self.d1(x).relu()
-        x = self.d2(x).relu()
-        x = self.d3(x).sigmoid()
-        return x
-
-class Check():
-    def __init__(self, input_channels=1, height=128, width=216):
-        self.w = width // 8
-        self.h = height // 8
-        self.flattened_size = 256 * self.h * self.w
-
-        self.d1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1)
-        self.d2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
-        self.d3 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)
-        self.fc = nn.Linear(self.flattened_size, 1)
 
     def __call__(self, x: Tensor) -> Tensor:
-        x = self.d1(x).leakyrelu(0.2)
-        x = self.d2(x).leakyrelu(0.2)
-        x = self.d3(x).leakyrelu(0.2)
-        x = x.reshape(shape=(-1, self.flattened_size))
-        return self.fc(x)
+        x = self.e1(x).leakyrelu(0.2)
+        x = self.e2(x).leakyrelu(0.2)  # 128 channels at (h//4, w//4) matches self.flat; e3 stays unused
+        x = x.reshape(x.shape[0], -1)
+        return self.out(x).sigmoid()
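A quick shape check for the new classes, my sketch rather than part of the commit, assuming the default sizes above:

# Sketch only: verify Gen/Check tensor shapes with the defaults.
from tinygrad import Tensor
import model

gen = model.Gen()   # latent_dim=128 -> (1, 128, 216) spectrograms
dif = model.Check()

z = Tensor.randn(4, gen.ld)
fake = gen(z)
print(fake.shape)       # (4, 1, 128, 216), values in [-1, 1] from tanh
print(dif(fake).shape)  # (4, 2), sigmoid outputs per class

Note that Gen ends in tanh, so its samples live in [-1, 1], while the normalization in data.py lands the training spectrograms in roughly [0, 1]; one of the two ranges has to give.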
show.py (13 changed lines)
@@ -1,18 +1,21 @@
 import matplotlib.pyplot as plt
 import IPython.display as ipd
 import librosa
+import mlflow
 
 
 SAMPLE_RATE = 22050
 
-def showSpec(spec):
+def logSpec(spec, e):
     #spec = ((spec*40)-40)
     #spec = librosa.db_to_amplitude(spec)
     plt.figure(figsize=(10, 4))
     librosa.display.specshow(spec, sr=SAMPLE_RATE,
                              x_axis='time', y_axis='mel',
                              cmap='viridis')
     plt.colorbar(format='%+2.0f dB')
     plt.title('Mel spectrogram')
-    plt.show()
+    mlflow.log_figure(plt.gcf(), f"output_{e}.png")
+    #plt.close()
 
 def playSpec(spec):
     S = librosa.feature.inverse.mel_to_stft(spec, sr=SAMPLE_RATE)
@@ -21,6 +24,4 @@ def playSpec(spec):
     plt.figure(figsize=(12, 4))
     plt.plot(audio)
     plt.title('waveform')
-    plt.show()
-
     ipd.display(ipd.Audio(audio, rate=SAMPLE_RATE))
+    plt.close()
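playSpec inverts a mel power spectrogram back to audio, so a normalized model output needs the inverse of the (dB + 40) / 40 mapping first, mirroring the commented-out lines in logSpec. A usage sketch, not part of the commit; gen_output is a hypothetical placeholder for one (128, 216) array:

# Sketch only: denormalize a model output before inverting it to audio.
import librosa

spec = gen_output                     # assumed: one (128, 216) array in ~[0, 1]
spec = (spec * 40) - 40               # back to dB
spec = librosa.db_to_amplitude(spec)  # back to a mel spectrogram
playSpec(spec)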
train.py (new file, 105 lines)
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+# coding: utf-8
+import data
+import model
+import show
+import mlflow
+import numpy as np
+from tinygrad import nn, TinyJit, Tensor
+
+mlflow.set_tracking_uri("http://127.0.0.1:5000")
+mlflow.start_run(experiment_id=804883409598823668)
+
+# hyperparameters
+BACH_SIZE = 32
+glr = 1e-3
+dlr = 1e-3
+epochs = 100
+
+# dataset
+x = data.load()
+size = len(x)
+x_np = np.stack(x)
+x_np = np.expand_dims(x_np, axis=1)  # (N, 1, 128, 216)
+permutation = np.random.permutation(size)
+x_np = x_np[permutation]
+
+train = x_np[30:]
+test = x_np[0:30]
+
+print("Train: " + str(len(train)))
+print("Test: " + str(len(test)))
+
+# model
+gen = model.Gen()
+dif = model.Check()
+genOpt = nn.optim.AdamW(nn.state.get_parameters(gen), lr=glr)
+difOpt = nn.optim.AdamW(nn.state.get_parameters(dif), lr=dlr)
+
+# train
+@TinyJit
+def step_dis(x: Tensor):
+    Tensor.training = True
+    real = Tensor([1, 0])
+    fake = Tensor([0, 1])
+    noise = Tensor.randn(BACH_SIZE, gen.ld)
+    fake_data = gen(noise).detach()  # detached: only the discriminator updates here
+    difOpt.zero_grad()
+    fake_loss = dif(fake_data).log_softmax().nll_loss(fake)
+    real_loss = dif(x).log_softmax().nll_loss(real)
+    loss = (fake_loss + real_loss) / 2
+    loss.backward()
+    difOpt.step()
+    return loss.numpy()
+
+@TinyJit
+def step_gen():
+    Tensor.training = True
+    real = Tensor([1, 0])
+    noise = Tensor.randn(BACH_SIZE, gen.ld)
+    fake_data = gen(noise)  # not detached: this loss must backpropagate into the generator
+    genOpt.zero_grad()
+    loss = dif(fake_data).log_softmax().nll_loss(real)
+    loss.backward()
+    genOpt.step()
+    return loss.numpy()
+
+eshape = (BACH_SIZE, 1, 128, 216)
+
+mlflow.log_param("generator_learning_rate", glr)
+mlflow.log_param("discim_learning_rate", dlr)
+mlflow.log_param("epochs", epochs)
+mlflow.log_param("train size", len(train))
+mlflow.log_param("test size", len(test))
+for e in range(0, epochs):
+    print(f"\n--- Starting Epoch {e} ---\n")
+    dl = 0
+    gl = 0
+
+    for i in range(0, len(train), BACH_SIZE):
+        tx = Tensor(train[i:i + BACH_SIZE])
+        if tx.shape != eshape:  # skip the ragged final batch
+            continue
+        # steps
+        dl += step_dis(tx)
+        gl += step_gen()
+
+    dl /= (len(train) / BACH_SIZE)
+    gl /= (len(train) / BACH_SIZE)
+    if e % 4 == 0:
+        noise = Tensor.randn(BACH_SIZE, gen.ld)
+        show.logSpec(gen(noise).numpy()[0][0], e)
+    # TODO: evaluate on the held-out test split
+    mlflow.log_metric("gen_loss", gl, step=e)
+    mlflow.log_metric("dis_loss", dl, step=e)
+    print(f"loss of gen:{gl} dis:{dl}")
+
+# save
+noise = Tensor.randn(BACH_SIZE, gen.ld)
+show.logSpec(gen(noise).numpy()[0][0], epochs)
+from tinygrad.nn.state import safe_save, get_state_dict
+safe_save(get_state_dict(gen), "music.safetensors")
+mlflow.log_artifact("music.safetensors")
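To sample from the saved weights later, a minimal sketch, not part of the commit, using tinygrad's safetensors helpers:

# Sketch only: reload music.safetensors and draw one sample.
from tinygrad import Tensor
from tinygrad.nn.state import safe_load, load_state_dict
import model

gen = model.Gen()
load_state_dict(gen, safe_load("music.safetensors"))
sample = gen(Tensor.randn(1, gen.ld)).numpy()[0][0]  # one (128, 216) spectrogram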