AI & ML Development Cheatsheet

PyTorch, TensorFlow, Hugging Face, model training, CNNs, RNNs, Transformers, MLOps & deployment

AI / ML
Contents
#

ML Fundamentals

Types of Learning

TypeDataGoalExamples
SupervisedLabeled (X, y)Predict y from XClassification, Regression
UnsupervisedUnlabeled (X only)Find patternsClustering, PCA, Autoencoders
Self-supervisedUnlabeled (create labels)Learn representationsBERT (mask prediction), GPT (next token)
ReinforcementEnvironment + rewardsMaximize rewardGame AI, robotics, RLHF

Bias-Variance Tradeoff

High Bias, Low Variance Low Bias, High Variance (Underfitting) (Overfitting) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Simple model (linear) ◄──────────► Complex model (deep net) Poor on train & test Sweet spot Great on train, bad on test Fix underfitting: Fix overfitting: • More features • More data • More complex model • Regularization (L1/L2, dropout) • Less regularization • Early stopping • Data augmentation
#

PyTorch

Tensors & Basics

import torch
import torch.nn as nn

# Tensor creation
x = torch.tensor([1, 2, 3], dtype=torch.float32)
z = torch.zeros(3, 4)
r = torch.randn(3, 4)           # standard normal
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = x.to(device)

# Operations — operands defined so every example line actually runs
W = torch.randn(4, 3, device=device)   # (out_features, in_features) weight matrix
a, b = torch.zeros(2, 3), torch.ones(2, 3)
imgs = torch.randn(5, 28, 28)          # fake batch of 28x28 images for the reshape example
y = x @ W.T                      # matrix multiply → (4,)
y = imgs.view(-1, 784)           # reshape → (5, 784)
y = x.unsqueeze(0)               # add batch dim → (1, 3)
y = torch.cat([a, b], dim=0)     # concatenate → (4, 3)

# Autograd
x = torch.tensor([2.0], requires_grad=True)
y = x ** 2 + 3 * x
y.backward()
print(x.grad)  # dy/dx = 2x + 3 = 7.0

Model Definition

class Net(nn.Module):
    """Three-layer fully-connected classifier: 784 flattened inputs → 10 logits."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Flatten, run two hidden layers (ReLU + dropout), return raw logits."""
        flat = x.view(-1, 784)
        hidden1 = self.dropout(self.relu(self.fc1(flat)))
        hidden2 = self.dropout(self.relu(self.fc2(hidden1)))
        return self.fc3(hidden2)

Training Loop

model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for epoch in range(10):
    # --- training pass ---
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()

    # --- validation pass (no grad tracking, dropout disabled) ---
    model.eval()
    correct = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            preds = model(xb.to(device)).argmax(dim=1)
            correct += (preds == yb.to(device)).sum().item()

DataLoader

from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    """Map-style dataset wrapping array-like features/labels as float32/int64 tensors."""

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Wrap the dataset in a DataLoader: re-shuffles each epoch and loads batches
# with 4 worker subprocesses. NOTE(review): with num_workers > 0 on spawn-based
# platforms (Windows/macOS), the entry point should be guarded with
# `if __name__ == "__main__":` — confirm against the target environment.
train_loader = DataLoader(
    MyDataset(X_train, y_train),
    batch_size=32, shuffle=True, num_workers=4
)
#

TensorFlow / Keras

import tensorflow as tf
from tensorflow import keras
# Keep Keras imports consistent: mixing the standalone `keras` package with
# `tensorflow.keras` can bind two different Keras installs/versions.
from tensorflow.keras import layers

# Sequential model: flatten 28x28 → two ReLU hidden layers → 10-way softmax
model = keras.Sequential([
    # NOTE(review): Keras 3 prefers keras.Input(shape=(28, 28)) as the first
    # layer over input_shape= — confirm the Keras version in use.
    layers.Flatten(input_shape=(28, 28)),
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.2),
    layers.Dense(128, activation="relu"),
    layers.Dense(10, activation="softmax"),
])

# sparse_categorical_crossentropy expects integer labels (not one-hot)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

model.fit(X_train, y_train, epochs=10, batch_size=32,
          validation_split=0.2,
          callbacks=[keras.callbacks.EarlyStopping(patience=3)])

model.evaluate(X_test, y_test)
model.save("my_model.keras")
#

CNNs (Computer Vision)

# PyTorch CNN
class CNN(nn.Module):
    """Two-conv-layer CNN for (B, 1, 28, 28) inputs, producing 10 class logits."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Conv+pool twice (28→14→7 spatially), flatten, then two dense layers."""
        h = self.pool(torch.relu(self.conv1(x)))   # (B, 32, 14, 14)
        h = self.pool(torch.relu(self.conv2(h)))   # (B, 64, 7, 7)
        h = h.flatten(start_dim=1)                 # (B, 64 * 7 * 7)
        h = torch.relu(self.fc1(h))
        return self.fc2(h)

Popular Architectures

ModelYearKey InnovationUse Case
ResNet2015Skip connections (residual)Image classification baseline
EfficientNet2019Compound scalingEfficient image classification
YOLO2016+Single-shot detectionReal-time object detection
U-Net2015Encoder-decoder + skipImage segmentation
ViT2020Transformer for imagesState-of-the-art classification

Transfer Learning (PyTorch)

from torchvision import models

# Load pretrained ResNet, replace final layer
model = models.resnet50(weights="IMAGENET1K_V2")
model.fc = nn.Linear(model.fc.in_features, num_classes)

# Freeze early layers (optional)
for param in model.parameters():
    param.requires_grad = False
for param in model.fc.parameters():
    param.requires_grad = True
#

RNNs & Sequence Models

ModelStrengthsWeaknesses
Vanilla RNNSimple sequence processingVanishing gradient, no long-range deps
LSTMLong-range memory (forget/input/output gates)Slower, more params
GRUSimpler than LSTM, similar performanceLess control than LSTM
TransformerParallelizable, self-attention, dominant nowO(n²) memory for attention
# PyTorch LSTM
class LSTMModel(nn.Module):
    """Bidirectional LSTM text classifier: (B, seq_len) token ids → (B, num_classes) logits.

    vocab_size/embed_dim size the embedding table, hidden_dim is the per-direction
    LSTM width, num_classes the output head size.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        x = self.embedding(x)          # (B, seq_len, embed_dim)
        out, (h, c) = self.lstm(x)     # h: (num_layers * 2, B, hidden_dim)
        # Fix: out[:, -1, :] at the last timestep holds the full-sequence summary
        # for the forward direction but the backward direction's view of only the
        # final token. Concatenate each direction's final hidden state instead.
        feats = torch.cat([h[-2], h[-1]], dim=1)   # (B, hidden_dim * 2)
        return self.fc(feats)
#

Transformers

Self-Attention Mechanism

Input: "The cat sat on the mat" ↓ Token Embeddings + Positional Encoding ↓ ┌──────────────────────────────┐ │ Multi-Head Self-Attention │ │ Q = XW_q, K = XW_k, V = XW_v │ Attention(Q,K,V) = softmax(QK^T / √d_k) × V └──────────────────────────────┘ ↓ Feed-Forward Network ↓ Layer Norm + Residual Connection ↓ Repeat N layers → Output

Architecture Families

TypeArchitectureTrainingModels
Encoder-onlyBidirectional attentionMasked language modelBERT, RoBERTa, DeBERTa
Decoder-onlyCausal (left-to-right)Next token predictionGPT, LLaMA, Claude, Gemini
Encoder-DecoderCross-attentionSeq-to-seqT5, BART, mBART
#

Hugging Face Transformers

from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Quick inference with pipeline
classifier = pipeline("sentiment-analysis")
result = classifier("This movie was amazing!")
# [{'label': 'POSITIVE', 'score': 0.9998}]

# Text generation — max_new_tokens counts only generated tokens; max_length
# includes the prompt and is discouraged for controlling generation length
gen = pipeline("text-generation", model="gpt2")
gen("The future of AI is", max_new_tokens=40)

# Custom model loading
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# padding/truncation give a fixed-size batch; return_tensors="pt" → PyTorch tensors
inputs = tokenizer("Hello world", return_tensors="pt", padding=True, truncation=True)
outputs = model(**inputs)

Fine-Tuning with Trainer

from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,           # typical fine-tuning LR for BERT-sized models
    eval_strategy="epoch",        # renamed from evaluation_strategy (transformers >= 4.41)
    save_strategy="epoch",        # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # NOTE(review): newer transformers versions prefer processing_class=tokenizer;
    # tokenizer= is kept here for compatibility — confirm the pinned version.
    tokenizer=tokenizer,
)
trainer.train()
#

Training Best Practices

TechniquePurposeCode / Details
Learning Rate ScheduleDecay LR over timeCosine annealing, OneCycleLR, warmup
Gradient ClippingPrevent exploding gradientstorch.nn.utils.clip_grad_norm_(params, max_norm=1.0)
Weight DecayL2 regularizationAdamW(params, lr=1e-3, weight_decay=0.01)
DropoutPrevent overfittingRandomly zero neurons during training
Batch NormalizationStabilize trainingnn.BatchNorm2d(num_features)
Data AugmentationMore training varietyFlip, rotate, crop, color jitter
Early StoppingStop when val loss plateausMonitor val loss, patience=5
Mixed PrecisionFaster training, less memorytorch.cuda.amp.autocast()

Loss Functions

LossTaskPyTorch
Cross EntropyMulti-class classificationnn.CrossEntropyLoss()
Binary Cross EntropyBinary / multi-labelnn.BCEWithLogitsLoss()
MSERegressionnn.MSELoss()
L1 / MAERegression (robust)nn.L1Loss()
HuberRegression (outlier-robust)nn.SmoothL1Loss()
ContrastiveSimilarity learningCustom / torch.nn.CosineEmbeddingLoss
#

NLP Essentials

ConceptDescription
TokenizationSplit text into tokens (words, subwords, chars). BPE, WordPiece, SentencePiece.
EmbeddingsDense vector representations. Word2Vec, GloVe, BERT embeddings.
AttentionLearn which tokens are relevant to each other. Self-attention = Transformer core.
Fine-tuningTrain pretrained model on your dataset. Usually last layers + low LR.
LoRALow-Rank Adaptation — train small rank matrices instead of full model. Memory efficient.
RAGRetrieval-Augmented Generation — retrieve docs, feed to LLM for grounded answers.
RLHFReinforcement Learning from Human Feedback — align model to human preferences.
#

MLOps & Deployment

MLOps Stack

Data Versioning Experiment Tracking Model Registry Serving ┌───────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ DVC │ │ MLflow │ │ MLflow │ │ TorchServe │ │ LakeFS │ │ W&B │ │ W&B │ │ BentoML │ │ Delta Lake│ │ Neptune │ │ Hugging Face│ │ vLLM │ └───────────┘ └──────────────┘ └──────────────┘ │ FastAPI │ └──────────────┘

MLflow Tracking

import mlflow

mlflow.set_experiment("my_experiment")

with mlflow.start_run():
    # Hyperparameters (batch form of log_param)
    mlflow.log_params({"lr": 1e-3, "epochs": 10})

    # ... training ...

    # Final evaluation results (batch form of log_metric)
    mlflow.log_metrics({"accuracy": 0.95, "f1": 0.93})
    mlflow.pytorch.log_model(model, "model")

Model Serving (FastAPI)

from fastapi import FastAPI
import torch

app = FastAPI()
# weights_only=False keeps pre-2.6 behavior: PyTorch 2.6 changed the default to
# True, which rejects fully-pickled models. Unpickling runs arbitrary code, so
# only load trusted checkpoints — saving/loading a state_dict into a known
# model class is the safer pattern.
model = torch.load("model.pt", weights_only=False)
model.eval()

@app.post("/predict")
def predict(data: dict):
    # Plain `def` (not async): FastAPI runs sync handlers in a threadpool,
    # so the blocking torch call doesn't stall the event loop.
    tensor = torch.tensor(data["features"])
    with torch.no_grad():
        pred = model(tensor)
    return {"prediction": pred.tolist()}
#

GPU & Performance

# Check GPU
torch.cuda.is_available()
torch.cuda.device_count()
torch.cuda.get_device_name(0)

# Mixed precision training (2x speed, less memory)
# (torch >= 2.4 spells these torch.amp.GradScaler("cuda") / torch.amp.autocast("cuda"))
scaler = torch.cuda.amp.GradScaler()
for batch in loader:
    optimizer.zero_grad()   # bug fix: without this, gradients accumulate across steps
    with torch.cuda.amp.autocast():
        output = model(batch)
        loss = criterion(output, target)
    scaler.scale(loss).backward()   # scale loss to avoid fp16 gradient underflow
    scaler.step(optimizer)          # unscales grads, skips step on inf/nan
    scaler.update()

# Multi-GPU (DataParallel — simple; DistributedDataParallel is preferred for real workloads)
model = nn.DataParallel(model)

# Memory tips
torch.cuda.empty_cache()
# Use gradient accumulation for large effective batch size
# Use gradient checkpointing to trade compute for memory
#

Math Cheat Sheet

ConceptFormula / DescriptionUsed In
Sigmoidσ(x) = 1 / (1 + e⁻ˣ) → maps to (0,1)Binary classification output
Softmaxsoftmax(xᵢ) = eˣⁱ / Σeˣʲ → probability distributionMulti-class output
ReLUf(x) = max(0, x)Default hidden activation
Cross EntropyL = -Σ yᵢ·log(ŷᵢ)Classification loss
MSEL = (1/n)·Σ(yᵢ - ŷᵢ)²Regression loss
Gradient Descentθ = θ - α·∇L(θ)All optimization
Cosine Similaritycos(A,B) = A·B / (‖A‖·‖B‖)Embeddings, RAG
L1 Regularizationλ·Σ|wᵢ| (sparse weights)Feature selection
L2 Regularizationλ·Σwᵢ² (small weights)Prevent overfitting
Back to top