AI & ML Development Cheatsheet

PyTorch, TensorFlow, Hugging Face, model training, CNNs, RNNs, Transformers, MLOps & deployment

AI / ML
Contents
#

ML Fundamentals

Types of Learning

TypeDataGoalExamples
SupervisedLabeled (X, y)Predict y from XClassification, Regression
UnsupervisedUnlabeled (X only)Find patternsClustering, PCA, Autoencoders
Self-supervisedUnlabeled (create labels)Learn representationsBERT (mask prediction), GPT (next token)
ReinforcementEnvironment + rewardsMaximize rewardGame AI, robotics, RLHF

Bias-Variance Tradeoff

High Bias, Low Variance Low Bias, High Variance (Underfitting) (Overfitting) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Simple model (linear) ◄──────────► Complex model (deep net) Poor on train & test Sweet spot Great on train, bad on test Fix underfitting: Fix overfitting: • More features • More data • More complex model • Regularization (L1/L2, dropout) • Less regularization • Early stopping • Data augmentation
#

PyTorch

Tensors & Basics

import torch
import torch.nn as nn

# Tensor creation
x = torch.tensor([1, 2, 3], dtype=torch.float32)
z = torch.zeros(3, 4)
r = torch.randn(3, 4)           # standard normal
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = x.to(device)

# Operations — operands defined so every example line actually runs
W = torch.randn(4, 3, device=device)   # (out_features, in_features) weight matrix
a, b = torch.zeros(2, 3), torch.ones(2, 3)
imgs = torch.randn(5, 28, 28)          # fake batch of 28x28 images for the reshape example
y = x @ W.T                      # matrix multiply → (4,)
y = imgs.view(-1, 784)           # reshape → (5, 784)
y = x.unsqueeze(0)               # add batch dim → (1, 3)
y = torch.cat([a, b], dim=0)     # concatenate → (4, 3)

# Autograd
x = torch.tensor([2.0], requires_grad=True)
y = x ** 2 + 3 * x
y.backward()
print(x.grad)  # dy/dx = 2x + 3 = 7.0

Model Definition

class Net(nn.Module):
    """Three-layer fully-connected classifier: 784 flattened inputs → 10 logits."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Flatten, run two hidden layers (ReLU + dropout), return raw logits."""
        flat = x.view(-1, 784)
        hidden1 = self.dropout(self.relu(self.fc1(flat)))
        hidden2 = self.dropout(self.relu(self.fc2(hidden1)))
        return self.fc3(hidden2)

Training Loop

model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for epoch in range(10):
    # --- training pass ---
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()

    # --- validation pass (no grad tracking, dropout disabled) ---
    model.eval()
    correct = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            preds = model(xb.to(device)).argmax(dim=1)
            correct += (preds == yb.to(device)).sum().item()

DataLoader

from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    """Map-style dataset wrapping array-like features/labels as float32/int64 tensors."""

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Wrap the dataset in a DataLoader: re-shuffles each epoch and loads batches
# with 4 worker subprocesses. NOTE(review): with num_workers > 0 on spawn-based
# platforms (Windows/macOS), the entry point should be guarded with
# `if __name__ == "__main__":` — confirm against the target environment.
train_loader = DataLoader(
    MyDataset(X_train, y_train),
    batch_size=32, shuffle=True, num_workers=4
)
#

TensorFlow / Keras

import tensorflow as tf
from tensorflow import keras
# Keep Keras imports consistent: mixing the standalone `keras` package with
# `tensorflow.keras` can bind two different Keras installs/versions.
from tensorflow.keras import layers

# Sequential model: flatten 28x28 → two ReLU hidden layers → 10-way softmax
model = keras.Sequential([
    # NOTE(review): Keras 3 prefers keras.Input(shape=(28, 28)) as the first
    # layer over input_shape= — confirm the Keras version in use.
    layers.Flatten(input_shape=(28, 28)),
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.2),
    layers.Dense(128, activation="relu"),
    layers.Dense(10, activation="softmax"),
])

# sparse_categorical_crossentropy expects integer labels (not one-hot)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

model.fit(X_train, y_train, epochs=10, batch_size=32,
          validation_split=0.2,
          callbacks=[keras.callbacks.EarlyStopping(patience=3)])

model.evaluate(X_test, y_test)
model.save("my_model.keras")
#

CNNs (Computer Vision)

# PyTorch CNN
class CNN(nn.Module):
    """Two-conv-layer CNN for (B, 1, 28, 28) inputs, producing 10 class logits."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Conv+pool twice (28→14→7 spatially), flatten, then two dense layers."""
        h = self.pool(torch.relu(self.conv1(x)))   # (B, 32, 14, 14)
        h = self.pool(torch.relu(self.conv2(h)))   # (B, 64, 7, 7)
        h = h.flatten(start_dim=1)                 # (B, 64 * 7 * 7)
        h = torch.relu(self.fc1(h))
        return self.fc2(h)

Popular Architectures

ModelYearKey InnovationUse Case
ResNet2015Skip connections (residual)Image classification baseline
EfficientNet2019Compound scalingEfficient image classification
YOLO2016+Single-shot detectionReal-time object detection
U-Net2015Encoder-decoder + skipImage segmentation
ViT2020Transformer for imagesState-of-the-art classification

Transfer Learning (PyTorch)

from torchvision import models

# Load pretrained ResNet, replace final layer
model = models.resnet50(weights="IMAGENET1K_V2")
model.fc = nn.Linear(model.fc.in_features, num_classes)

# Freeze early layers (optional)
for param in model.parameters():
    param.requires_grad = False
for param in model.fc.parameters():
    param.requires_grad = True
#

RNNs & Sequence Models

ModelStrengthsWeaknesses
Vanilla RNNSimple sequence processingVanishing gradient, no long-range deps
LSTMLong-range memory (forget/input/output gates)Slower, more params
GRUSimpler than LSTM, similar performanceLess control than LSTM
TransformerParallelizable, self-attention, dominant nowO(n²) memory for attention
# PyTorch LSTM
class LSTMModel(nn.Module):
    """Bidirectional LSTM text classifier: (B, seq_len) token ids → (B, num_classes) logits.

    vocab_size/embed_dim size the embedding table, hidden_dim is the per-direction
    LSTM width, num_classes the output head size.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        x = self.embedding(x)          # (B, seq_len, embed_dim)
        out, (h, c) = self.lstm(x)     # h: (num_layers * 2, B, hidden_dim)
        # Fix: out[:, -1, :] at the last timestep holds the full-sequence summary
        # for the forward direction but the backward direction's view of only the
        # final token. Concatenate each direction's final hidden state instead.
        feats = torch.cat([h[-2], h[-1]], dim=1)   # (B, hidden_dim * 2)
        return self.fc(feats)
#

Transformers

Self-Attention Mechanism

Input: "The cat sat on the mat" ↓ Token Embeddings + Positional Encoding ↓ ┌──────────────────────────────┐ │ Multi-Head Self-Attention │ │ Q = XW_q, K = XW_k, V = XW_v │ Attention(Q,K,V) = softmax(QK^T / √d_k) × V └──────────────────────────────┘ ↓ Feed-Forward Network ↓ Layer Norm + Residual Connection ↓ Repeat N layers → Output

Architecture Families

TypeArchitectureTrainingModels
Encoder-onlyBidirectional attentionMasked language modelBERT, RoBERTa, DeBERTa
Decoder-onlyCausal (left-to-right)Next token predictionGPT, LLaMA, Claude, Gemini
Encoder-DecoderCross-attentionSeq-to-seqT5, BART, mBART
#

Hugging Face Transformers

from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Quick inference with pipeline
classifier = pipeline("sentiment-analysis")
result = classifier("This movie was amazing!")
# [{'label': 'POSITIVE', 'score': 0.9998}]

# Text generation — max_new_tokens counts only generated tokens; max_length
# includes the prompt and is discouraged for controlling generation length
gen = pipeline("text-generation", model="gpt2")
gen("The future of AI is", max_new_tokens=40)

# Custom model loading
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# padding/truncation give a fixed-size batch; return_tensors="pt" → PyTorch tensors
inputs = tokenizer("Hello world", return_tensors="pt", padding=True, truncation=True)
outputs = model(**inputs)

Fine-Tuning with Trainer

from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,           # typical fine-tuning LR for BERT-sized models
    eval_strategy="epoch",        # renamed from evaluation_strategy (transformers >= 4.41)
    save_strategy="epoch",        # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # NOTE(review): newer transformers versions prefer processing_class=tokenizer;
    # tokenizer= is kept here for compatibility — confirm the pinned version.
    tokenizer=tokenizer,
)
trainer.train()
#

Training Best Practices

TechniquePurposeCode / Details
Learning Rate ScheduleDecay LR over timeCosine annealing, OneCycleLR, warmup
Gradient ClippingPrevent exploding gradientstorch.nn.utils.clip_grad_norm_(params, max_norm=1.0)
Weight DecayL2 regularizationAdamW(params, lr=1e-3, weight_decay=0.01)
DropoutPrevent overfittingRandomly zero neurons during training
Batch NormalizationStabilize trainingnn.BatchNorm2d(num_features)
Data AugmentationMore training varietyFlip, rotate, crop, color jitter
Early StoppingStop when val loss plateausMonitor val loss, patience=5
Mixed PrecisionFaster training, less memorytorch.cuda.amp.autocast()

Loss Functions

LossTaskPyTorch
Cross EntropyMulti-class classificationnn.CrossEntropyLoss()
Binary Cross EntropyBinary / multi-labelnn.BCEWithLogitsLoss()
MSERegressionnn.MSELoss()
L1 / MAERegression (robust)nn.L1Loss()
HuberRegression (outlier-robust)nn.SmoothL1Loss()
ContrastiveSimilarity learningCustom / torch.nn.CosineEmbeddingLoss
#

NLP Essentials

ConceptDescription
TokenizationSplit text into tokens (words, subwords, chars). BPE, WordPiece, SentencePiece.
EmbeddingsDense vector representations. Word2Vec, GloVe, BERT embeddings.
AttentionLearn which tokens are relevant to each other. Self-attention = Transformer core.
Fine-tuningTrain pretrained model on your dataset. Usually last layers + low LR.
LoRALow-Rank Adaptation — train small rank matrices instead of full model. Memory efficient.
RAGRetrieval-Augmented Generation — retrieve docs, feed to LLM for grounded answers.
RLHFReinforcement Learning from Human Feedback — align model to human preferences.
#

MLOps & Deployment

MLOps Stack

Data Versioning Experiment Tracking Model Registry Serving ┌───────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ DVC │ │ MLflow │ │ MLflow │ │ TorchServe │ │ LakeFS │ │ W&B │ │ W&B │ │ BentoML │ │ Delta Lake│ │ Neptune │ │ Hugging Face│ │ vLLM │ └───────────┘ └──────────────┘ └──────────────┘ │ FastAPI │ └──────────────┘

MLflow Tracking

import mlflow

mlflow.set_experiment("my_experiment")

with mlflow.start_run():
    # Hyperparameters (batch form of log_param)
    mlflow.log_params({"lr": 1e-3, "epochs": 10})

    # ... training ...

    # Final evaluation results (batch form of log_metric)
    mlflow.log_metrics({"accuracy": 0.95, "f1": 0.93})
    mlflow.pytorch.log_model(model, "model")

Model Serving (FastAPI)

from fastapi import FastAPI
import torch

app = FastAPI()
# weights_only=False keeps pre-2.6 behavior: PyTorch 2.6 changed the default to
# True, which rejects fully-pickled models. Unpickling runs arbitrary code, so
# only load trusted checkpoints — saving/loading a state_dict into a known
# model class is the safer pattern.
model = torch.load("model.pt", weights_only=False)
model.eval()

@app.post("/predict")
def predict(data: dict):
    # Plain `def` (not async): FastAPI runs sync handlers in a threadpool,
    # so the blocking torch call doesn't stall the event loop.
    tensor = torch.tensor(data["features"])
    with torch.no_grad():
        pred = model(tensor)
    return {"prediction": pred.tolist()}
#

GPU & Performance

# Check GPU
torch.cuda.is_available()
torch.cuda.device_count()
torch.cuda.get_device_name(0)

# Mixed precision training (2x speed, less memory)
# (torch >= 2.4 spells these torch.amp.GradScaler("cuda") / torch.amp.autocast("cuda"))
scaler = torch.cuda.amp.GradScaler()
for batch in loader:
    optimizer.zero_grad()   # bug fix: without this, gradients accumulate across steps
    with torch.cuda.amp.autocast():
        output = model(batch)
        loss = criterion(output, target)
    scaler.scale(loss).backward()   # scale loss to avoid fp16 gradient underflow
    scaler.step(optimizer)          # unscales grads, skips step on inf/nan
    scaler.update()

# Multi-GPU (DataParallel — simple; DistributedDataParallel is preferred for real workloads)
model = nn.DataParallel(model)

# Memory tips
torch.cuda.empty_cache()
# Use gradient accumulation for large effective batch size
# Use gradient checkpointing to trade compute for memory
#

Math Cheat Sheet

ConceptFormula / DescriptionUsed In
Sigmoidσ(x) = 1 / (1 + e⁻ˣ) → maps to (0,1)Binary classification output
Softmaxsoftmax(xᵢ) = eˣⁱ / Σeˣʲ → probability distributionMulti-class output
ReLUf(x) = max(0, x)Default hidden activation
Cross EntropyL = -Σ yᵢ·log(ŷᵢ)Classification loss
MSEL = (1/n)·Σ(yᵢ - ŷᵢ)²Regression loss
Gradient Descentθ = θ - α·∇L(θ)All optimization
Cosine Similaritycos(A,B) = A·B / (‖A‖·‖B‖)Embeddings, RAG
L1 Regularizationλ·Σ|wᵢ| (sparse weights)Feature selection
L2 Regularizationλ·Σwᵢ² (small weights)Prevent overfitting
Back to top