5. Parameter-Efficient Finetuning

This matters because full-model finetuning is often wasteful or impossible at scale. Focus on how to freeze the expensive backbone and confine learning to a small adapter or head.

[ ]:
import os

import torch
from torch import nn

# Fix the RNG seed so every run (and the printed losses) is reproducible.
torch.manual_seed(53)
# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Short-loop "check" mode is opt-in via an environment variable.
check_mode = os.environ.get('PYTORCH_INTRO_CHECK_MODE') == '1'

5.1. Frozen backbone with a trainable adapter

[ ]:
class AdapterClassifier(nn.Module):
    """Binary classifier built for parameter-efficient finetuning.

    A 12-d input passes through a backbone MLP (meant to be frozen),
    then a small residual bottleneck adapter, then a 2-logit head.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: 12 -> 32 -> 32 with ReLU activations.
        self.backbone = nn.Sequential(
            nn.Linear(12, 32), nn.ReLU(),
            nn.Linear(32, 32), nn.ReLU(),
        )
        # Bottleneck adapter (32 -> 8 -> 32); applied residually in forward().
        self.adapter = nn.Sequential(nn.Linear(32, 8), nn.ReLU(), nn.Linear(8, 32))
        # Linear head mapping adapted features to two class logits.
        self.head = nn.Linear(32, 2)

    def forward(self, x):
        """Return class logits of shape (batch, 2) for input of shape (batch, 12)."""
        h = self.backbone(x)
        h = h + self.adapter(h)  # residual connection around the adapter
        return self.head(h)

# Instantiate the model and freeze the entire backbone in place, so
# gradient updates can only reach the adapter and the head.
model = AdapterClassifier().to(device)
model.backbone.requires_grad_(False)

# Tally parameter counts by trainability to confirm the freeze took effect.
trainable = 0
frozen = 0
for p in model.parameters():
    if p.requires_grad:
        trainable += p.numel()
    else:
        frozen += p.numel()
print('trainable:', trainable, 'frozen:', frozen)
assert trainable < frozen

5.2. Train only the adapter and head

[ ]:
# Synthetic task: the label is 1 exactly when feature 0 minus feature 1
# is positive, so a linear decision boundary is learnable.
x = torch.randn(48, 12, device=device)
y = (x[:, 0] - x[:, 1] > 0).long()

# Hand the optimizer only the parameters left trainable (adapter + head);
# the frozen backbone is excluded entirely.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=0.03)
criterion = nn.CrossEntropyLoss()

steps = 4 if check_mode else 40
for step in range(steps):
    optimizer.zero_grad(set_to_none=True)  # clear stale grads cheaply
    logits = model(x)
    loss = criterion(logits, y)
    loss.backward()
    optimizer.step()
    # Log only the first and final losses to keep output terse.
    if step == 0 or step == steps - 1:
        print(step, round(loss.item(), 4))
assert logits.shape == (48, 2)