Data Science — NumPy, Pandas, Matplotlib, Seaborn, scikit-learn, statistics, EDA & feature engineering

import numpy as np
# Array creation
a = np.array([1, 2, 3])
z = np.zeros((3, 4)) # 3x4 of 0s
o = np.ones((2, 3)) # 2x3 of 1s
r = np.arange(0, 10, 2) # [0, 2, 4, 6, 8] — stop value is exclusive
l = np.linspace(0, 1, 5) # 5 evenly spaced: [0, .25, .5, .75, 1] — stop is inclusive
e = np.eye(3) # 3x3 identity
rnd = np.random.randn(3, 4) # 3x4 standard normal (legacy API; new code prefers np.random.default_rng())
# Shape operations — these return new arrays/views; `a` itself is unchanged
a.shape # (3,)
a.reshape(1, 3) # 1 row, 3 cols
a.T # transpose
np.expand_dims(a, axis=0) # add dimension
np.concatenate([a, b], axis=0) # stack arrays (`b` assumed defined; shapes must match except on axis)
# Math
np.dot(A, B) # matrix multiply (or A @ B); A, B assumed compatible 2-D arrays
np.sum(a, axis=0) # sum along axis
np.mean(a), np.std(a), np.median(a) # NOTE: np.std defaults to ddof=0 (population std)
np.argmax(a) # index of max value (first occurrence, on the flattened array)
np.where(a > 5, a, 0) # conditional: keep if >5 else 0
# Broadcasting: (3,4) + (1,4) → (3,4) — smaller dims stretched
# Slicing: a[1:3], a[:, 0], a[a > 5] (boolean indexing)

import pandas as pd
# Quick EDA — first look at a freshly loaded DataFrame
df.shape # (rows, cols)
df.info() # types, nulls, memory
df.describe() # stats for numeric cols
df.describe(include='object') # stats for categorical
df.nunique() # unique count per col
df.isnull().sum() # count nulls per column
df.duplicated().sum() # count fully duplicated rows
# numeric_only=True: since pandas 2.0, corr() raises on non-numeric columns without it
df.corr(numeric_only=True) # correlation matrix
# Handle missing data — all of these return a new object; original df is unchanged
df.dropna(subset=["col"]) # drop rows with nulls in col
df["col"].fillna(df["col"].median()) # median is robust to outliers
df["col"].interpolate() # linear interpolation
# Categorical encoding
pd.get_dummies(df, columns=["city"]) # one-hot
df["size"].map({"S": 0, "M": 1, "L": 2}) # ordinal; values missing from the dict become NaN
# Binning — bin edges are (0,18], (18,35], (35,60], (60,100]
df["age_group"] = pd.cut(df["age"], bins=[0,18,35,60,100], labels=["child","young","mid","senior"])
# Apply / lambda
df["col"].apply(lambda x: x ** 2)
# element-wise over every cell; DataFrame.applymap was deprecated in pandas 2.1
# in favor of DataFrame.map (identical behavior)
df.map(lambda x: round(x, 2)) # element-wise
# Value counts + cross tab
df["city"].value_counts(normalize=True) # proportions instead of raw counts
pd.crosstab(df["gender"], df["bought"])

import matplotlib.pyplot as plt
# Basic plot types
fig, ax = plt.subplots(figsize=(10, 6)) # figsize is (width, height) in inches
ax.plot(x, y, label="Line") # line
ax.scatter(x, y, alpha=0.5) # scatter; alpha = transparency
ax.bar(categories, values) # bar
ax.hist(data, bins=30, edgecolor="black") # histogram
ax.boxplot([data1, data2]) # box plot — one box per sequence
# Formatting
ax.set_title("Title", fontsize=14)
ax.set_xlabel("X Axis")
ax.set_ylabel("Y Axis")
ax.legend() # shows the label= strings set above
ax.grid(True, alpha=0.3)
plt.tight_layout() # prevent label/title overlap
# Subplots grid — axes is a 2x2 array of Axes objects
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes[0][0].plot(x, y)
axes[0][1].hist(data)
plt.savefig("plot.png", dpi=300, bbox_inches="tight") # save BEFORE plt.show(); show may clear the figure
plt.show()

import seaborn as sns
sns.set_theme(style="darkgrid") # set global default styling for all subsequent plots
# Distribution
sns.histplot(df["age"], kde=True) # histogram + kernel-density overlay
sns.kdeplot(df["salary"], fill=True)
# Categorical
sns.boxplot(x="city", y="salary", data=df)
sns.violinplot(x="dept", y="score", data=df)
sns.countplot(x="category", data=df) # bar of category frequencies
sns.barplot(x="day", y="sales", data=df, estimator=np.mean) # aggregates y per x (here: mean)
# Relationships
sns.scatterplot(x="height", y="weight", hue="gender", data=df) # hue = color by category
sns.regplot(x="x", y="y", data=df) # scatter + regression line
sns.pairplot(df, hue="species") # pairwise relationships
# Heatmaps — NOTE: on pandas >= 2.0, df.corr() needs numeric_only=True if df has non-numeric cols
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", fmt=".2f")
# FacetGrid — multi-panel plots: one panel per (city, gender) combination
g = sns.FacetGrid(df, col="city", row="gender")
g.map(sns.histplot, "age")

| Measure | Formula / Description | Python |
|---|---|---|
| Mean | Sum / N (sensitive to outliers) | np.mean(x) |
| Median | Middle value (robust to outliers) | np.median(x) |
| Mode | Most frequent value | stats.mode(x) |
| Std Dev | Spread around mean | np.std(x, ddof=1) |
| Variance | Std Dev squared | np.var(x, ddof=1) |
| IQR | Q3 - Q1 (interquartile range) | np.percentile(x, 75) - np.percentile(x, 25) |
| Skewness | Asymmetry of distribution | stats.skew(x) |
from scipy import stats
# t-test (compare means of 2 groups)
t_stat, p_val = stats.ttest_ind(group_a, group_b) # assumes equal variances; pass equal_var=False for Welch's t-test
# Chi-squared (categorical independence) — contingency_table e.g. from pd.crosstab
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
# Pearson correlation (linear association, in [-1, 1])
r, p_val = stats.pearsonr(x, y)
# Normal distribution test
stat, p = stats.shapiro(data) # Shapiro-Wilk
# If p_value < 0.05 → reject the null hypothesis (at the conventional 5% significance level)

# 1. Load & inspect
df = pd.read_csv("data.csv")
df.shape, df.dtypes, df.head() # quick structural overview
# 2. Check missing & duplicates
df.isnull().sum().sort_values(ascending=False) # most-null columns first
df.duplicated().sum()
# 3. Univariate analysis — histogram + box plot side by side for every numeric column.
# (Loop body re-indented: it had lost its indentation, which is a SyntaxError in Python.)
for col in df.select_dtypes("number").columns:
    fig, ax = plt.subplots(1, 2, figsize=(10, 3))
    sns.histplot(df[col], ax=ax[0], kde=True)  # distribution shape
    sns.boxplot(x=df[col], ax=ax[1])           # outliers at a glance
    plt.suptitle(col)
    plt.show()
# 4. Bivariate analysis
# numeric_only=True: since pandas 2.0, corr() raises on non-numeric columns without it
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="RdBu_r")
sns.pairplot(df, hue="target") # color every pairwise plot by the target class
# 5. Outlier detection — 1.5*IQR rule (Tukey's fences)
Q1, Q3 = df["col"].quantile([.25, .75])
IQR = Q3 - Q1
outliers = df[(df["col"] < Q1 - 1.5*IQR) | (df["col"] > Q3 + 1.5*IQR)]

# Scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = StandardScaler() # z-score: (x - mean) / std
X_scaled = scaler.fit_transform(X) # in real pipelines: fit on TRAIN only to avoid leakage
# Encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()
df["city_enc"] = le.fit_transform(df["city"]) # NOTE: LabelEncoder is meant for targets; on features it implies an arbitrary order — prefer one-hot
# Log transform (right-skewed data)
df["log_salary"] = np.log1p(df["salary"]) # log(1 + x): defined at x = 0
# Polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X) # adds squares and pairwise interaction terms
# Date features — assumes df["date"] is datetime dtype (pd.to_datetime first)
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["weekday"] = df["date"].dt.dayofweek # Monday=0 .. Sunday=6
df["is_weekend"] = df["weekday"].isin([5, 6]).astype(int)
# Text features
df["word_count"] = df["text"].str.split().str.len()
df["has_url"] = df["text"].str.contains(r"https?://").astype(int)

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Split data — stratify=y keeps class proportions equal in train and test
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# Pipeline — scaler is fit only on the data passed to .fit(), preventing test-set leakage
pipe = Pipeline([
("scaler", StandardScaler()),
("model", RandomForestClassifier(n_estimators=100, random_state=42)),
])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))

| Task | Model | Import |
|---|---|---|
| Classification | Logistic Regression | sklearn.linear_model.LogisticRegression |
| Classification | Random Forest | sklearn.ensemble.RandomForestClassifier |
| Classification | XGBoost | xgboost.XGBClassifier |
| Regression | Linear Regression | sklearn.linear_model.LinearRegression |
| Regression | Random Forest | sklearn.ensemble.RandomForestRegressor |
| Clustering | K-Means | sklearn.cluster.KMeans |
| Clustering | DBSCAN | sklearn.cluster.DBSCAN |
| Dim Reduction | PCA | sklearn.decomposition.PCA |
from sklearn.model_selection import cross_val_score, GridSearchCV
# Cross validation — 5 folds; each fold takes a turn as the validation set
scores = cross_val_score(model, X, y, cv=5, scoring="accuracy")
print(f"Mean: {scores.mean():.3f} ± {scores.std():.3f}")
# Grid search — exhaustively tries every parameter combination with CV
params = {"n_estimators": [50, 100, 200], "max_depth": [5, 10, None]}
grid = GridSearchCV(RandomForestClassifier(), params, cv=5, scoring="f1")
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)

| Metric | Formula | When to Use |
|---|---|---|
| Accuracy | (TP+TN) / Total | Balanced classes |
| Precision | TP / (TP+FP) | Cost of false positives high (spam) |
| Recall | TP / (TP+FN) | Cost of false negatives high (cancer) |
| F1 Score | 2 × (P×R)/(P+R) | Imbalanced data, balance P & R |
| AUC-ROC | Area under ROC curve | Probability ranking quality |
| Metric | Formula | Note |
|---|---|---|
| MAE | mean(|y - ŷ|) | Average absolute error |
| MSE | mean((y - ŷ)²) | Penalizes large errors more |
| RMSE | √MSE | Same unit as target |
| R² Score | 1 - SS_res/SS_tot | Fraction of variance explained (≤ 1; can be negative for models worse than predicting the mean) |
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
numeric_features = ["age", "salary", "experience"]
categorical_features = ["city", "department"]
# ColumnTransformer — apply a different preprocessing pipeline to each column group
preprocessor = ColumnTransformer([
("num", Pipeline([
("imputer", SimpleImputer(strategy="median")), # median is robust to outliers
("scaler", StandardScaler()),
]), numeric_features),
("cat", Pipeline([
("imputer", SimpleImputer(strategy="most_frequent")),
("encoder", OneHotEncoder(handle_unknown="ignore")), # unseen categories at predict time → all-zero row
]), categorical_features),
])
full_pipe = Pipeline([
("preprocess", preprocessor),
("model", RandomForestClassifier()), # imported earlier in this file
])
full_pipe.fit(X_train, y_train)

| Data Type | Best Chart | Seaborn Function |
|---|---|---|
| Distribution (1 var) | Histogram / KDE | histplot, kdeplot |
| Distribution + outliers | Box / Violin | boxplot, violinplot |
| Two numeric vars | Scatter | scatterplot, regplot |
| Categorical counts | Bar / Count | countplot, barplot |
| Category vs. numeric | Box / Bar | boxplot, barplot |
| Correlation matrix | Heatmap | heatmap |
| Many variables | Pair plot | pairplot |
| Time series | Line plot | lineplot |