cleanup/rewrite

2025-07-31 14:23:34 -04:00 · 2025-07-31 14:23:34 -04:00 · 689e4df4aa
commit 689e4df4aa
parent f1abc67462
5 changed files with 277 additions and 482 deletions
--- a/data.py
+++ b/data.py
@ -0,0 +1,65 @@
 import librosa
 import numpy as np
 from pathlib import Path
 from multiprocessing import Pool, cpu_count
 SAMPLE_RATE = 22050
 def process_file(file_path):
    """
    Load 10 second chunks single song.
    """
    y, sr = librosa.load(file_path, mono=True, sr=SAMPLE_RATE)
    size = int(SAMPLE_RATE * 10)
    sample_len = len(y)
    file_chunks = []
    for start_pos in range(0, sample_len, size):
        end = start_pos + size
        if end <= sample_len:
            chunk = y[start_pos:end]
            file_chunks.append(chunk)
    return file_chunks
 def load():
    """
    Load 10 second chunks of songs.
    """
    audio = []
    files = list(Path("./data/").glob("*.mp3"))
    with Pool(cpu_count()) as pool:
        chunk_list = pool.map(process_file, files)
    for l in chunk_list:
        audio.extend(l)
    return audio
 def audio_split(audio):
    """
    Split 10 seconds of audio to 2 5 second clips
    """
    size = int(SAMPLE_RATE*5)
    x = audio[:size]
    y = audio[size:size*2]
    x = librosa.feature.melspectrogram(y=x, sr=SAMPLE_RATE)
    y = librosa.feature.melspectrogram(y=y, sr=SAMPLE_RATE)
    ma,mi = x.max(), x.min()
    x = (x - mi) / (ma - mi)
    ma,mi = y.max(), y.min()
    y = (y - mi) / (ma - mi)
    return x,y
 def detaset(chunks):
    """
    convert 10 second chunks to dataset
    """
    x,y=[],[]
    with Pool(cpu_count()) as pool:
        audio_list = pool.map(audio_split,chunks)
    for (ax,ay) in audio_list:
        x.append(ax)
        y.append(ay)
    return x,y
--- a/model.py
+++ b/model.py
@ -1,11 +1,11 @@
 from tinygrad import Tensor, nn
 import numpy as np
-class Model:
+class Gen:
    def __init__(self, input_channels=1, height=128, width=216, latent_dim=32):
        self.w = width // 8
        self.h = height // 8
-        self.flattened_size = 128 * self.h * self.w
+        self.flattened_size = 256 * self.h * self.w
        # Encoder
        self.e1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1)
@ -61,5 +61,24 @@ class Model:
        x = x.reshape(shape=(-1, 256, self.h, self.w))
        x = self.d1(x).relu()
        x = self.d2(x).relu()
-        x = self.d3(x)
+        x = self.d3(x).sigmoid()
        return x
 class Check():
    def __init__(self, input_channels=1, height=128, width=216):
        self.w = width // 8
        self.h = height // 8
        self.flattened_size = 256 * self.h * self.w
        self.d1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=2, padding=1)
        self.d2 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
        self.d3 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)
        self.fc = nn.Linear(self.flattened_size, 1)
    def __call__(self, x: Tensor) -> Tensor:
        x = self.d1(x).leakyrelu(0.2)
        x = self.d2(x).leakyrelu(0.2)
        x = self.d3(x).leakyrelu(0.2)
        x = x.reshape(shape=(-1, self.flattened_size))
        return self.fc(x)
--- a/music.ipynb
+++ b/music.ipynb
--- a/show.py
+++ b/show.py
@ -0,0 +1,26 @@
 import matplotlib.pyplot as plt
 import IPython.display as ipd
 import librosa
 SAMPLE_RATE = 22050
 def showSpec(spec):
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(spec, sr=SAMPLE_RATE,
                             x_axis='time', y_axis='mel',
                             cmap='viridis')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel spectrogram')
    plt.show()
 def playSpec(spec):
    S = librosa.feature.inverse.mel_to_stft(spec, sr=SAMPLE_RATE)
    audio = librosa.griffinlim(S,n_iter=25,momentum=0.99)
    plt.figure(figsize=(12,4))
    plt.plot(audio)
    plt.title('waveform')
    plt.show()
    display(ipd.Audio(audio,rate=SAMPLE_RATE))
--- a/train.ipynb
+++ b/train.ipynb
@ -0,0 +1,163 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0",
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import data\n",
    "import show\n",
    "import model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1",
   "metadata": {},
   "outputs": [],
   "source": [
    "x,y = data.detaset(data.load())\n",
    "len(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "sample=x[420]\n",
    "show.showSpec(sample)\n",
    "show.playSpec(sample)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tinygrad import nn\n",
    "gen = model.Gen()\n",
    "optimizer = nn.optim.AdamW(nn.state.get_parameters(gen), lr=1e-4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4",
   "metadata": {},
   "outputs": [],
   "source": [
    "@TinyJit\n",
    "def jit_step(X: Tensor, Y: Tensor,epoch) -> Tensor:\n",
    "    Tensor.training = True\n",
    "    optimizer.zero_grad()\n",
    "    _, loss = gen.__Lcall__(X,Y,epoch)\n",
    "    loss.backward()\n",
    "    optimizer.step()\n",
    "    return loss.realize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "x_np, y_np = np.stack(x), np.stack(y)\n",
    "x_np = np.expand_dims(x_np, axis=1)\n",
    "y_np = np.expand_dims(y_np, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "size=len(x)\n",
    "BACH_SIZE=32\n",
    "eshape = (BACH_SIZE, 1, 128, 216)\n",
    "\n",
    "for e in range(0,12):\n",
    "    print(f\"\\n--- Starting Epoch {e} ---\\n\")\n",
    "    l=0\n",
    "    \n",
    "    permutation = np.random.permutation(size)\n",
    "    x_np = x_np[permutation]\n",
    "    y_np = y_np[permutation]\n",
    "    \n",
    "    for i in range(0,size,BACH_SIZE):\n",
    "        tx,ty=Tensor(x_np[i:i+BACH_SIZE]),Tensor(y_np[i:i+BACH_SIZE])\n",
    "        if(tx.shape != eshape or ty.shape != eshape):\n",
    "            continue\n",
    "        l+=jit_step(tx,ty,e).numpy()\n",
    "        \n",
    "    l /= (size/BACH_SIZE)\n",
    "    print(f\"loss of {l}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from tinygrad.nn.state import safe_save, get_state_dict\n",
    "safe_save(get_state_dict(gen),\"music.safetensors\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }