PyTorch Cheatsheet

Tensors, autograd, models, training loops & common operations

AI / ML
Contents
📦

Tensors

import torch

# Create tensors
x = torch.tensor([1, 2, 3])
x = torch.zeros(3, 4)
x = torch.ones(2, 3)
x = torch.randn(3, 4)           # normal dist
x = torch.rand(3, 4)            # uniform [0,1)
x = torch.arange(0, 10)
x = torch.linspace(0, 1, 100)
x = torch.eye(4)
x = torch.empty(3, 3)

# From numpy
x = torch.from_numpy(np_array)
np_array = x.numpy()

# Properties
x.shape   x.dtype   x.device   x.requires_grad

Operations

# Element-wise
x + y    x - y    x * y    x / y    x ** 2
torch.sqrt(x)   torch.exp(x)   torch.log(x)
torch.abs(x)    torch.clamp(x, 0, 1)

# Matrix
x @ y             # matmul
torch.mm(x, y)    # matmul 2D
torch.bmm(x, y)   # batch matmul
x.T               # transpose (2-D tensors; use x.mT for batched matrices)

# Reduction
x.sum()   x.mean()   x.max()   x.min()
x.sum(dim=0)   x.argmax(dim=1)

# Reshape
x.view(2, -1)    x.reshape(2, -1)
x.unsqueeze(0)   x.squeeze()
x.flatten()      x.permute(2,0,1)
torch.cat([x,y], dim=0)   torch.stack([x,y])
🔄

Autograd

# Enable gradient tracking
x = torch.tensor([2.0], requires_grad=True)
y = x ** 2 + 3 * x + 1
y.backward()           # compute gradients
x.grad                  # dy/dx = 2x + 3 = 7

# Detach from graph
z = x.detach()
with torch.no_grad():
    # inference only
    pred = model(x)

# Zero gradients
optimizer.zero_grad()
🏗️

Building Models

import torch.nn as nn

class Net(nn.Module):
    """Two-layer fully connected classifier: 784 -> 128 -> 10.

    ReLU activation and dropout (p=0.2) sit between the two
    linear layers; the output is raw logits (no softmax).
    """

    def __init__(self):
        super().__init__()
        # Registering layers as attributes makes their parameters
        # visible to model.parameters() / state_dict().
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Map a (batch, 784) input to (batch, 10) logits."""
        hidden = self.relu(self.fc1(x))
        hidden = self.dropout(hidden)
        logits = self.fc2(hidden)
        return logits

# Sequential
model = nn.Sequential(
    nn.Linear(784, 128),
    nn.ReLU(),
    nn.Linear(128, 10)
)

# Common layers
nn.Linear   nn.Conv2d   nn.LSTM   nn.GRU
nn.BatchNorm1d   nn.LayerNorm   nn.Embedding
🏋️

Training Loop

# --- Training setup ---
model = Net()
criterion = nn.CrossEntropyLoss()       # expects raw logits + integer class labels
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    model.train()                       # enable dropout / batch-norm updates
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()           # clear gradients left from the previous step
        output = model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()                 # backprop: populate .grad on parameters
        optimizer.step()                # apply the update

    # Validation
    model.eval()                        # disable dropout, freeze batch-norm stats
    with torch.no_grad():               # no autograd graph needed for inference
        for val_x, val_y in val_loader:
            pred = model(val_x)

# Save / Load
torch.save(model.state_dict(), "model.pth")
# weights_only=True restricts unpickling to tensors/containers, so a
# tampered checkpoint cannot execute arbitrary code on load (this is
# the default behavior from PyTorch 2.6 onward).
model.load_state_dict(torch.load("model.pth", weights_only=True))
📊

Data Loading

from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    """Wrap in-memory arrays (X, y) as a map-style PyTorch dataset.

    Features are stored as float32; labels as int64, the dtype
    CrossEntropyLoss expects for class indices.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        # One sample per feature row.
        return len(self.X)

    def __getitem__(self, idx):
        # A single (features, label) pair.
        return self.X[idx], self.y[idx]

loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)
🖥️

GPU & Device

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
x = x.to(device)

# Apple Silicon
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Multi-GPU (nn.DataParallel is legacy; prefer DistributedDataParallel)
model = nn.DataParallel(model)

# Memory
torch.cuda.empty_cache()
torch.cuda.memory_summary()