2. Experiment Tracking

This matters because model training is not a single run; it is a sequence of decisions. If you do not record configs, metrics, and artifacts in a structured way, you cannot tell whether a change actually helped or whether you just got lucky.

This notebook shows a minimal tracking pattern you can implement before reaching for a heavier experiment platform.

2.1. Setup

[ ]:
from pathlib import Path
import json
import time
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset, random_split

torch.manual_seed(13)  # fix the global seed so reruns are comparable
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
run_dir = Path('output/experiments')  # one JSON record per run will land here
run_dir.mkdir(parents=True, exist_ok=True)
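
When you come back to a run weeks later, the library versions matter as much as the hyperparameters. The cell below is a minimal sketch of an environment snapshot; snapshot_env and the keys it returns are illustrative, and nothing later in the notebook depends on them, but the dict can be merged into any run's payload.

[ ]:
import platform

# Illustrative helper: capture the context a run executed in.
def snapshot_env():
    return {
        'torch': torch.__version__,
        'python': platform.python_version(),
        'device': str(device),
        'seed': 13,  # matches the manual_seed call above
    }

snapshot_env()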

2.2. Data

[ ]:
# Synthetic binary classification: two linear effects plus an
# interaction term (x2 * x3) and Gaussian noise, thresholded at zero.
x = torch.randn(1000, 6)
score = x[:, 0] - 0.7 * x[:, 1] + 0.3 * x[:, 2] * x[:, 3] + 0.4 * torch.randn(1000)
y = (score > 0).float().unsqueeze(1)
dataset = TensorDataset(x, y)
# A seeded generator keeps the train/val split identical across runs,
# so validation metrics are comparable between experiments.
train_ds, val_ds = random_split(dataset, [800, 200], generator=torch.Generator().manual_seed(13))
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=128)
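
Two runs are only comparable if they trained on the same data. One lightweight safeguard, sketched below, is to fingerprint the tensors and store the hash next to the config; the data_fingerprint helper and the truncated SHA-256 scheme are illustrative, not something the helpers in 2.3 require.

[ ]:
import hashlib

# Illustrative helper: a stable hash of the raw tensor bytes, so a saved
# run can be matched to the exact data it trained on.
def data_fingerprint(*tensors):
    h = hashlib.sha256()
    for t in tensors:
        h.update(t.cpu().numpy().tobytes())
    return h.hexdigest()[:12]

data_fingerprint(x, y)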

2.3. Tracking Helpers

[ ]:
def evaluate(model, loader, loss_fn):
    """Return mean loss and accuracy over a loader."""
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            loss = loss_fn(logits, yb)
            # Weight by batch size so the mean is exact even when the
            # final batch is smaller than the rest.
            total_loss += loss.item() * len(xb)
            preds = (torch.sigmoid(logits) > 0.5).float()
            correct += (preds == yb).sum().item()
            total += yb.numel()
    return {'loss': total_loss / total, 'accuracy': correct / total}

def run_experiment(config):
    """Train one model from config and persist config, history, and summary."""
    model = nn.Sequential(nn.Linear(6, config['hidden']), nn.ReLU(), nn.Linear(config['hidden'], 1)).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
    loss_fn = nn.BCEWithLogitsLoss()
    history = []

    for epoch in range(config['epochs']):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            loss = loss_fn(model(xb), yb)
            loss.backward()
            optimizer.step()
        # Record validation metrics once per epoch, tagged with the epoch number.
        metrics = evaluate(model, val_loader, loss_fn)
        metrics['epoch'] = epoch + 1
        history.append(metrics)

    # A millisecond timestamp keeps run names unique across quick successive runs.
    run_name = f"run-{int(time.time() * 1000)}-{config['hidden']}h"
    path = run_dir / f'{run_name}.json'
    payload = {'config': config, 'history': history, 'best_val_loss': min(row['loss'] for row in history)}
    path.write_text(json.dumps(payload, indent=2))
    return path, payload
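
One caveat with path.write_text: if the process dies mid-write, it can leave a truncated JSON file that breaks later reads. A common mitigation, sketched below with an illustrative helper name, is to write to a temporary file and rename it into place; os.replace is atomic when source and destination sit on the same filesystem.

[ ]:
import os

# Illustrative helper: write-then-rename so readers never see a partial file.
def atomic_write_text(path, text):
    tmp = path.with_suffix('.tmp')
    tmp.write_text(text)
    os.replace(tmp, path)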

2.4. Compare Two Runs

[ ]:
runs = []
# Train two configs back to back; each run writes its own JSON record.
for config in [
    {'hidden': 8, 'lr': 1e-2, 'epochs': 10},
    {'hidden': 32, 'lr': 1e-2, 'epochs': 10},
]:
    path, payload = run_experiment(config)
    runs.append((path, payload))
    print(path.name, payload['config'], 'best_val_loss=', round(payload['best_val_loss'], 4))
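
The point about luck from the introduction applies here: with a single seed, the gap between two configs can be noise. A quick hedge, sketched below, is to repeat each config under a few seeds and look at the spread; re-seeding this way only varies model initialization, since the data split above is fixed by its own generator.

[ ]:
for config in [{'hidden': 8, 'lr': 1e-2, 'epochs': 10}, {'hidden': 32, 'lr': 1e-2, 'epochs': 10}]:
    losses = []
    for seed in (0, 1, 2):
        torch.manual_seed(seed)  # varies the model init between repeats
        _, payload = run_experiment(config)
        losses.append(payload['best_val_loss'])
    print(config, [round(v, 4) for v in losses])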

2.5. Read Back the Saved Metadata

[ ]:
# Re-read each payload from disk to confirm the JSON round trip.
saved = []
for path, _ in runs:
    saved.append(json.loads(path.read_text()))

# Rank the runs by best validation loss, lowest first.
sorted(
    ({'config': item['config'], 'best_val_loss': round(item['best_val_loss'], 4)} for item in saved),
    key=lambda row: row['best_val_loss']
)
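
The in-memory runs list disappears when the kernel restarts, but the JSON files do not. The same leaderboard can be rebuilt from disk alone; the glob pattern below assumes every matching file in run_dir was written by run_experiment.

[ ]:
from_disk = [json.loads(p.read_text()) for p in sorted(run_dir.glob('run-*.json'))]
sorted(
    ({'config': item['config'], 'best_val_loss': round(item['best_val_loss'], 4)} for item in from_disk),
    key=lambda row: row['best_val_loss']
)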