5. Parameter-Efficient Finetuning
Full-model finetuning is often wasteful, and sometimes outright infeasible, at scale. The focus here is on freezing the expensive backbone and confining learning to a small adapter and head, so only a fraction of the parameters are updated.
[ ]:
import os
import torch
from torch import nn
torch.manual_seed(53)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Shorter runs when the notebook executes under an automated check.
check_mode = os.getenv('PYTORCH_INTRO_CHECK_MODE') == '1'
5.1. Frozen backbone with a trainable adapter
[ ]:
class AdapterClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        # Stand-in for a pretrained feature extractor that stays frozen.
        self.backbone = nn.Sequential(nn.Linear(12, 32), nn.ReLU(), nn.Linear(32, 32), nn.ReLU())
        # Bottleneck adapter: project down to 8 dims, then back up to 32.
        self.adapter = nn.Sequential(nn.Linear(32, 8), nn.ReLU(), nn.Linear(8, 32))
        self.head = nn.Linear(32, 2)

    def forward(self, x):
        features = self.backbone(x)
        # Residual connection: the adapter learns a correction on top of the
        # frozen features rather than replacing them.
        adapted = features + self.adapter(features)
        return self.head(adapted)
model = AdapterClassifier().to(device)
# Freeze the backbone: these parameters are excluded from gradient updates.
for parameter in model.backbone.parameters():
    parameter.requires_grad = False
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
frozen = sum(p.numel() for p in model.parameters() if not p.requires_grad)
print('trainable:', trainable, 'frozen:', frozen)
assert trainable < frozen
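Before training anything, it is worth confirming the freeze took effect. The cell below is a minimal sanity check on a throwaway probe batch (not part of the training data): after one backward pass, the frozen backbone parameters should have no gradients while the adapter parameters do.
[ ]:
# Throwaway probe batch, used only to check gradient flow.
probe = torch.randn(4, 12, device=device)
model(probe).sum().backward()
assert all(p.grad is None for p in model.backbone.parameters())
assert all(p.grad is not None for p in model.adapter.parameters())
# Discard the probe gradients so the training below starts clean.
model.zero_grad(set_to_none=True)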
5.2. Train only the adapter and head
[ ]:
x = torch.randn(48, 12, device=device)
# Synthetic binary task: the class depends on the sign of x[:, 0] - x[:, 1].
y = (x[:, 0] - x[:, 1] > 0).long()
# Hand the optimizer only the parameters that still require gradients.
optimizer = torch.optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=0.03)
criterion = nn.CrossEntropyLoss()
steps = 4 if check_mode else 40
for step in range(steps):
    optimizer.zero_grad(set_to_none=True)
    logits = model(x)
    loss = criterion(logits, y)
    loss.backward()
    optimizer.step()
    if step in {0, steps - 1}:
        print(step, round(loss.item(), 4))
assert logits.shape == (48, 2)
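The payoff shows up at checkpoint time. The sketch below saves only the trainable adapter and head (the filename adapter_head.pt is made up for this example, and weights_only=True assumes PyTorch 1.13 or newer); loading into a fresh model uses strict=False because the checkpoint deliberately omits the frozen backbone.
[ ]:
# Save only the parameters that were trained; the frozen backbone can be
# restored from its original source, keeping the checkpoint small.
trainable_state = {name: p.detach().cpu() for name, p in model.named_parameters() if p.requires_grad}
torch.save(trainable_state, 'adapter_head.pt')
restored = AdapterClassifier().to(device)
# strict=False: the file intentionally lacks the backbone weights.
missing, unexpected = restored.load_state_dict(torch.load('adapter_head.pt', weights_only=True), strict=False)
assert not unexpected and all(key.startswith('backbone') for key in missing)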