NumPy — The Foundation of
Python Data Science
NumPy (Numerical Python) provides the high-performance ndarray object and the tools to work with it efficiently. It underpins almost every data science library — Pandas, Scikit-learn, TensorFlow — making it essential knowledge.
Creating NumPy Arrays
The ndarray (N-dimensional array) is NumPy's core data structure. Unlike Python lists, all elements share the same data type, enabling vectorized operations that are orders of magnitude faster.
import numpy as np

# ── 1D Array from a list ──────────────────────────────────
# Every element shares one dtype; shape is a tuple with one entry per axis.
arr1d = np.array([10, 20, 30, 40, 50])
print("1D Array:", arr1d)
print("Shape:", arr1d.shape)   # one axis of length 5 -> (5,)
print("DType:", arr1d.dtype)   # platform default integer (int64 on most systems)

# ── 2D Array (Matrix) ────────────────────────────────────
arr2d = np.arange(1, 10).reshape(3, 3)   # 1..9 laid out as 3 rows x 3 cols
print("\n2D Array:\n", arr2d)
print("Shape:", arr2d.shape)   # (rows, cols) -> (3, 3)

# ── 3D Array ─────────────────────────────────────────────
arr3d = np.arange(1, 9).reshape(2, 2, 2)   # two stacked 2x2 blocks
print("\n3D Shape:", arr3d.shape)          # (2, 2, 2)

# ── Factory Functions ─────────────────────────────────────
zeros = np.zeros((3, 4))              # 3x4 filled with 0.0
ones = np.ones((2, 3), dtype=float)   # 2x3 filled with 1.0
eye = np.eye(4)                       # 4x4 identity matrix
full = np.full((3, 3), 7)             # 3x3 filled with the constant 7

# ── Range & Spacing ──────────────────────────────────────
arange = np.arange(0, 20, 2)    # step-based: [0, 2, 4, ..., 18]
linsp = np.linspace(0, 1, 6)    # count-based: 6 evenly spaced points in [0, 1]
logsp = np.logspace(0, 3, 4)    # powers of ten: 10^0 .. 10^3

# ── Random Arrays ────────────────────────────────────────
rng = np.random.default_rng(seed=42)          # seeded Generator -> reproducible
rand_uniform = rng.random((3, 3))             # uniform on [0, 1)
rand_normal = rng.standard_normal((3, 3))     # standard normal N(0, 1)
rand_int = rng.integers(1, 100, size=(3, 3))  # integers in [1, 100)

print("\nRange array:", arange)
print("Linspace: ", linsp)
Array Indexing, Slicing, and Reshaping
NumPy's indexing system is far more powerful than Python lists. You can select individual elements, slices, entire rows/columns, or arbitrary subsets using boolean masks and fancy indexing.
import numpy as np

arr = np.arange(10, 81, 10)   # [10 20 30 40 50 60 70 80]

# Basic indexing — zero-based; negative indices count back from the end
print(arr[0])    # first element -> 10
print(arr[-1])   # last element -> 80
print(arr[-3])   # third from the end -> 60

# Slicing follows arr[start:stop:step]; stop is exclusive
print(arr[1:5])    # indices 1..4 -> [20 30 40 50]
print(arr[:4])     # leading four -> [10 20 30 40]
print(arr[4:])     # trailing four -> [50 60 70 80]
print(arr[::2])    # every other element -> [10 30 50 70]
print(arr[::-1])   # reversed -> [80 70 ... 10]

# A slice is a VIEW into the same buffer; .copy() detaches it
view = arr[2:5]          # shares memory with arr
copy = arr[2:5].copy()   # independent storage
view[0] = 999            # writes through to arr!
print(arr[2])            # 999 — the original changed

matrix = np.arange(1, 13).reshape(3, 4)   # [[1..4],[5..8],[9..12]]

# Single element addressed as [row, col]
print(matrix[1, 2])      # row 1, col 2 -> 7

# Whole row / whole column
print(matrix[0, :])      # row 0 -> [1 2 3 4]
print(matrix[:, 2])      # column 2 -> [3 7 11]

# Rectangular sub-block
print(matrix[0:2, 1:3])  # [[2 3],[6 7]]

# Boolean mask keeps the elements where the condition holds
print(matrix[matrix > 6])   # [7 8 9 10 11 12]

# Fancy indexing: pick arbitrary rows by index list
print(matrix[[0, 2], :])    # rows 0 and 2

# ── Reshape ──────────────────────────────────────────────
arr = np.arange(24)
reshaped = arr.reshape(4, 6)   # 4 rows x 6 cols
cubic = arr.reshape(2, 3, 4)   # 2 blocks x 3 rows x 4 cols
flat = reshaped.flatten()      # 1D, always a fresh copy
ravel = reshaped.ravel()       # 1D, a view when memory layout allows
print(reshaped.shape)   # (4, 6)
print(cubic.shape)      # (2, 3, 4)

# A single -1 lets NumPy infer that dimension from the total size
auto = arr.reshape(-1, 8)   # 24 elements / 8 cols -> (3, 8)
print(auto.shape)
Mathematical Operations with NumPy Arrays
NumPy implements universal functions (ufuncs) — vectorized C-level operations applied element-wise to entire arrays without Python loops. This is the core of NumPy's performance advantage.
np.sqrt, np.log, np.sin are implemented in C and operate on entire arrays at once. Calling np.sqrt(arr) on a million-element array is dramatically faster than a Python loop.
import numpy as np

a = np.array([1, 2, 3, 4, 5])
b = a * 10   # [10 20 30 40 50]

# ── Arithmetic (element-wise) ─────────────────────────────
# Operators pair up elements positionally — no explicit loop needed.
print(a + b)    # [11 22 33 44 55]
print(b - a)    # [ 9 18 27 36 45]
print(a * b)    # [ 10 40 90 160 250]
print(b / a)    # true division -> floats [10. 10. 10. 10. 10.]
print(a ** 2)   # [ 1 4 9 16 25]
print(b % 3)    # [1 2 0 1 2]

# ── Universal Functions ───────────────────────────────────
# ufuncs apply element-wise in compiled code.
print(np.sqrt(a))              # [1. 1.41 1.73 2. 2.24]
print(np.log(a))               # natural log
print(np.log2(a))              # base-2 log
print(np.log10(b))             # base-10 log
print(np.exp([0, 1, 2]))       # e^x -> [1. 2.72 7.39]
print(np.abs([-3, 1, -5, 4]))  # [3 1 5 4]

# ── Trigonometry ─────────────────────────────────────────
angles = np.linspace(0, np.pi, 5)   # 0 .. pi in 5 steps
print(np.sin(angles).round(2))      # [0. 0.71 1. 0.71 0.]
print(np.cos(angles).round(2))      # [1. 0.71 0. -0.71 -1.]

# ── Aggregations ─────────────────────────────────────────
arr = np.arange(1, 10).reshape(3, 3)   # [[1 2 3],[4 5 6],[7 8 9]]
print(arr.sum())                   # grand total -> 45
print(arr.sum(axis=0))             # down the columns -> [12 15 18]
print(arr.sum(axis=1))             # across the rows -> [ 6 15 24]
print(arr.min(), arr.max())        # 1 9
print(arr.argmin(), arr.argmax())  # positions in the flattened array -> 0 8
print(arr.cumsum())                # running total [1 3 6 ... 45]

# ── Matrix Multiplication ─────────────────────────────────
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
print(np.dot(A, B))   # true matrix product
print(A @ B)          # same thing: [[19 22] [43 50]]
print(A * B)          # element-wise, NOT a matrix product: [[5 12] [21 32]]
Broadcasting and Vectorized Operations
Broadcasting is NumPy's mechanism for operating on arrays of different shapes without copying data. Understanding broadcasting rules eliminates the need for explicit loops and is key to writing idiomatic, efficient NumPy code.
1. Prepend 1s to the smaller array's shape until lengths match.
2. Arrays with size 1 along a dimension are stretched to match the other.
3. After alignment, each pair of dimensions must be equal, or one of them must be 1 — otherwise NumPy raises a ValueError.
import numpy as np

# ── Scalar broadcasting ───────────────────────────────────
# A scalar is virtually "stretched" across every element — no copy is made.
arr = np.array([1, 2, 3, 4])
print(arr + 10)    # [11 12 13 14] — 10 broadcast to all
print(arr * 2.5)   # [2.5 5. 7.5 10.]

# ── 2D + 1D broadcasting ─────────────────────────────────
matrix = np.array([[1, 2, 3],
                   [4, 5, 6],
                   [7, 8, 9]])
row = np.array([10, 20, 30])   # shape (3,) aligns with matrix's last axis
# row is added to every row of matrix
result = matrix + row
# [[11 22 33]
#  [14 25 36]
#  [17 28 39]]
print(result)

# ── Column broadcasting ───────────────────────────────────
col = np.array([[100],
                [200],
                [300]])        # shape (3, 1) -> stretched along columns
result2 = matrix + col
# [[101 102 103]
#  [204 205 206]
#  [307 308 309]]
print(result2)

# ── Outer product via broadcasting ───────────────────────
a = np.array([1, 2, 3, 4]).reshape(4, 1)   # (4, 1)
b = np.array([1, 2, 3, 4])                 # (4,), treated as (1, 4)
outer = a * b                              # (4, 1) * (1, 4) -> (4, 4)
print(outer)
# [[ 1 2 3 4]
#  [ 2 4 6 8]
#  [ 3 6 9 12]
#  [ 4 8 12 16]]

# ── Normalizing rows (practical example) ─────────────────
# Use the modern Generator API for consistency with the rest of this
# chapter (the text explicitly recommends default_rng over the legacy
# np.random.rand used here before).
rng = np.random.default_rng(seed=0)
data = rng.random((5, 4)) * 100
row_means = data.mean(axis=1, keepdims=True)   # keepdims -> shape (5, 1)
normalized = data - row_means                  # (5,4) - (5,1) broadcasts
print("Row means after normalize:", normalized.mean(axis=1).round(4))
Working with Random Numbers and Simulations
NumPy's random module provides fast generation of random numbers from dozens of probability distributions. The newer np.random.default_rng() API is recommended over legacy np.random.rand() for reproducibility and statistical correctness.
Generator object with a seed for reproducible results: rng = np.random.default_rng(seed=42). This is the modern NumPy approach (since v1.17).
import numpy as np

rng = np.random.default_rng(seed=42)   # seeded Generator -> repeatable runs

# ── Distributions ─────────────────────────────────────────
uniform = rng.uniform(0, 10, size=1000)           # flat over [0, 10)
normal = rng.normal(loc=0, scale=1, size=1000)    # mean 0, std 1
binomial = rng.binomial(n=10, p=0.3, size=1000)   # 10 trials, success prob 0.3
poisson = rng.poisson(lam=3, size=1000)           # event rate 3
exp_dist = rng.exponential(scale=2, size=1000)    # mean 2, i.e. rate 0.5

# Summary stats
for name, data in [("Uniform", uniform), ("Normal", normal),
                   ("Binomial", binomial), ("Poisson", poisson)]:
    print(f"{name:10s}: mean={data.mean():.2f}, std={data.std():.2f}")

# ── Shuffling & Choice ────────────────────────────────────
arr = np.arange(10)
rng.shuffle(arr)                                  # in-place permutation
print("Shuffled:", arr)
sample = rng.choice(arr, size=5, replace=False)   # 5 distinct picks
print("Sample:", sample)

# ── Monte Carlo: Estimate π ───────────────────────────────
# Throw n darts at the square [-1,1]^2; the fraction that lands inside
# the unit circle converges to pi/4.
n = 1_000_000
x = rng.uniform(-1, 1, n)
y = rng.uniform(-1, 1, n)
inside = np.sum(x**2 + y**2 <= 1.0)
pi_est = 4 * inside / n
print(f"\nMonte Carlo π estimate: {pi_est:.5f}")
print(f"Actual π: {np.pi:.5f}")
print(f"Error: {abs(pi_est - np.pi):.5f}")
Advanced Array Manipulation and Linear Algebra
The numpy.linalg module provides industry-grade linear algebra routines. These operations are the mathematical foundation of machine learning — PCA, regression, neural networks, and more all rely on these functions.
import numpy as np

# ── Stack & Concatenate ───────────────────────────────────
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
hstacked = np.hstack([a, b])         # end-to-end: [1 2 3 4 5 6]
vstacked = np.vstack([a, b])         # new row axis: [[1 2 3],[4 5 6]]
stacked = np.stack([a, b], axis=0)   # equivalent to vstack for 1D inputs
concat = np.concatenate([a, b])      # 1D join: [1 2 3 4 5 6]

# ── Split ─────────────────────────────────────────────────
arr = np.arange(12).reshape(4, 3)
parts = np.vsplit(arr, 2)             # two equal row-wise pieces
print([p.shape for p in parts])       # [(2, 3), (2, 3)]

# ── Transpose & Axes ─────────────────────────────────────
M = np.arange(12).reshape(3, 4)
print(M.T.shape)                # (4, 3)
print(np.transpose(M).shape)    # (4, 3) — same as .T
# For 3D arrays, transpose takes an explicit axis permutation.
# Use the modern Generator API rather than legacy np.random.rand,
# consistent with the rest of this chapter.
rng = np.random.default_rng()
arr3 = rng.random((2, 3, 4))
perm = np.transpose(arr3, (1, 0, 2))  # axes (0,1,2) -> (1,0,2): shape (3, 2, 4)
print(perm.shape)

# ── Where & Clip ─────────────────────────────────────────
data = np.array([-3, -1, 0, 2, 5, 8, -7])
clipped = np.clip(data, -2, 4)          # limit all values into [-2, 4]
replaced = np.where(data < 0, 0, data)  # negatives -> 0, rest unchanged
print(clipped)
print(replaced)
import numpy as np

# 2x2 example matrix used for every linear-algebra demo below.
A = np.array([[4.0, 1.0],
              [2.0, 3.0]])

# ── Basic Properties ─────────────────────────────────────
print("Determinant:", np.linalg.det(A)) # 4*3 - 1*2 = 10.0
print("Trace: ", np.trace(A)) # 4 + 3 = 7.0
print("Rank: ", np.linalg.matrix_rank(A)) # full rank -> 2
print("Norm (Fro): ", np.linalg.norm(A)) # sqrt(16+1+4+9) ≈ 5.477

# ── Inverse ───────────────────────────────────────────────
A_inv = np.linalg.inv(A)
print("Inverse:\n", A_inv)
# Verify: A @ A_inv ≈ I (use allclose — exact equality fails for floats)
print("A @ A_inv ≈ I:", np.allclose(A @ A_inv, np.eye(2)))

# ── Solve Linear System Ax = b ────────────────────────────
# 4x + y = 11
# 2x + 3y = 13
# solve() is preferred over inv(A) @ b — faster and numerically safer.
b = np.array([11.0, 13.0])
x = np.linalg.solve(A, b)
print("Solution x:", x) # [2. 3.]
print("Verify Ax=b:", A @ x) # [11. 13.]

# ── Eigenvalues & Eigenvectors ────────────────────────────
# A v = λ v; for this A the eigenvalues are 5 and 2
# (trace 7 and determinant 10 fix them: λ² − 7λ + 10 = 0).
eigenvalues, eigenvectors = np.linalg.eig(A)
print("\nEigenvalues:", eigenvalues) # [5. 2.]
print("Eigenvectors:\n", eigenvectors) # columns are the eigenvectors

# ── Singular Value Decomposition (SVD) ────────────────────
# A = U @ diag(S) @ Vt, with S sorted descending.
U, S, Vt = np.linalg.svd(A)
print("\nSingular values:", S) # ≈ [5.12 1.95] — sqrt of eigvals of A.T @ A
# Reconstruct A from the factors; the error is at round-off level (~1e-15).
A_reconstructed = U @ np.diag(S) @ Vt
print("Reconstruction error:", np.linalg.norm(A - A_reconstructed))
NumPy for Statistical Computations
NumPy provides a complete set of vectorized statistical functions. These run in C, handling millions of values efficiently without writing a single loop.
import numpy as np

# Sample dataset: 20 exam scores
scores = np.array([72, 85, 91, 67, 78, 95, 83, 61, 88, 74,
                   90, 55, 79, 88, 71, 94, 82, 68, 76, 84])

# ── Central Tendency ─────────────────────────────────────
print(f"Mean: {np.mean(scores):.2f}")
print(f"Median: {np.median(scores):.2f}")
# NumPy has no mode() — take the most frequent unique value instead
# (suits integer data; a tie resolves to the smallest value).
unique, counts = np.unique(scores, return_counts=True)
mode = unique[counts.argmax()]
print(f"Mode: {mode}")

# ── Spread ────────────────────────────────────────────────
print(f"\nStd Dev: {np.std(scores):.2f}")
print(f"Variance: {np.var(scores):.2f}")
print(f"Range: {np.ptp(scores)}") # peak-to-peak: max - min

# ── Percentiles & Quartiles ───────────────────────────────
q1, q2, q3 = np.percentile(scores, [25, 50, 75])
print(f"\nQ1: {q1} Q2 (Median): {q2} Q3: {q3}")
print(f"IQR: {q3 - q1}")

# ── Correlation (for 2 arrays) ────────────────────────────
study_hours = np.array([2, 4, 3, 5, 1, 6, 3, 2, 5, 4,
                        6, 1, 4, 5, 3, 7, 4, 2, 3, 5])
corr = np.corrcoef(study_hours, scores)   # 2x2 symmetric correlation matrix
print(f"\nCorrelation coefficient: {corr[0,1]:.4f}")

# ── Multi-dimensional stats ───────────────────────────────
matrix = scores.reshape(4, 5)   # view the same scores as 4 groups of 5
print("\nRow means:", np.mean(matrix, axis=1).round(1))
print("Col std: ", np.std(matrix, axis=0).round(2))
print("Z-scores: ", ((scores - np.mean(scores)) / np.std(scores)).round(2))
Handling Missing Data with NumPy
NumPy uses np.nan (Not a Number) to represent missing floating-point values. NumPy provides a full suite of NaN-aware functions and the masked_array API for more advanced missing data handling.
np.nan is a float, so integer arrays cannot store NaN directly. Use dtype=float when missing data is possible. For more complete missing data handling, Pandas (Chapter 2) extends these concepts.
import numpy as np

# Float array with missing entries; np.nan only exists for float dtypes.
data = np.array([1.0, 2.0, np.nan, 4.0, np.nan, 6.0, 7.0, np.nan])

# ── Detection ─────────────────────────────────────────────
# NaN != NaN, so always test with np.isnan() — never with `== np.nan`.
nan_mask = np.isnan(data)
print("NaN mask: ", nan_mask)
print("NaN count: ", np.sum(nan_mask))    # 3 missing
print("Valid vals:", np.sum(~nan_mask))   # 5 present

# ── NaN-aware aggregations ────────────────────────────────
# Plain aggregations propagate NaN; the nan* variants skip missing values.
print("\nWith NaN (breaks):", np.mean(data)) # nan
print("nanmean (fixed): ", np.nanmean(data)) # 4.0
print("nansum: ", np.nansum(data)) # 20.0
print("nanstd: ", np.nanstd(data)) # sqrt(5.2) ≈ 2.28
print("nanmin/nanmax: ", np.nanmin(data), np.nanmax(data))

# ── Filling Strategies ────────────────────────────────────
# Strategy 1: Replace with mean
mean_val = np.nanmean(data)
filled_mean = np.where(np.isnan(data), mean_val, data)
print("\nFilled with mean:", filled_mean)
# Strategy 2: Replace with 0
filled_zero = np.nan_to_num(data, nan=0.0)
print("Filled with 0: ", filled_zero)
# Strategy 3: Replace with custom value
filled_custom = np.where(np.isnan(data), -1, data)

# ── Masked Arrays ─────────────────────────────────────────
# Masked entries are excluded from every statistic automatically.
masked = np.ma.array(data, mask=np.isnan(data))
print("\nMasked array mean:", masked.mean()) # same as nanmean
print("Masked array std: ", masked.std())

# 2D example: impute each column's NaN with that column's mean
matrix = np.array([[1., 2., np.nan],
                   [4., np.nan, 6.],
                   [7., 8., 9.]])
col_means = np.nanmean(matrix, axis=0)   # shape (3,), one mean per column
# Vectorized fill: broadcasting col_means across rows replaces the
# per-column Python loop — same result, no explicit iteration.
matrix = np.where(np.isnan(matrix), col_means, matrix)
print("\nImputed matrix:\n", matrix)
Performance Optimization with NumPy
Understanding why NumPy is fast — and how to keep it fast — makes the difference between code that runs in seconds vs hours on large datasets.
import numpy as np
import time
import sys

SIZE = 1_000_000

# ── Time Comparison ───────────────────────────────────────
py_list = list(range(SIZE))
np_arr = np.arange(SIZE, dtype=np.float64)

# Python list: sum of squares via an interpreted loop over boxed ints
start = time.perf_counter()
result_list = sum(x * x for x in py_list)
time_list = time.perf_counter() - start

# NumPy: one vectorized expression evaluated in compiled code
start = time.perf_counter()
result_np = (np_arr ** 2).sum()
time_np = time.perf_counter() - start

print(f"Python list: {time_list:.4f}s")
print(f"NumPy array: {time_np:.4f}s")
print(f"Speedup: {time_list / time_np:.0f}× faster")

# ── Memory Comparison ─────────────────────────────────────
# List cost ≈ pointer array + one boxed int object per element (estimate);
# the ndarray is a single contiguous unboxed float64 buffer.
list_mem = sys.getsizeof(py_list) + SIZE * sys.getsizeof(1)
arr_mem = np_arr.nbytes
print(f"\nList memory: {list_mem:,} bytes ({list_mem/1e6:.1f} MB)")
print(f"Array memory: {arr_mem:,} bytes ({arr_mem/1e6:.1f} MB)")
print(f"Memory ratio: {list_mem / arr_mem:.1f}× less")

# ── dtype optimization ────────────────────────────────────
arr_f64 = np.random.rand(SIZE)         # 8 bytes per element
arr_f32 = arr_f64.astype(np.float32)   # 4 bytes per element
arr_f16 = arr_f64.astype(np.float16)   # 2 bytes per element
print(f"\nfloat64: {arr_f64.nbytes:,} bytes")
print(f"float32: {arr_f32.nbytes:,} bytes (50% savings)")
print(f"float16: {arr_f16.nbytes:,} bytes (75% savings)")

# ── Views vs Copies ───────────────────────────────────────
# Slicing creates a constant-time view; .copy() allocates and copies data.
arr = np.arange(1_000_000)
start = time.perf_counter()
v = arr[::2]
t_view = time.perf_counter() - start
start = time.perf_counter()
c = arr[::2].copy()
t_copy = time.perf_counter() - start
print(f"\nView: {t_view*1e6:.1f} µs")
print(f"Copy: {t_copy*1e6:.1f} µs (more expensive)")

# ── Avoid Python loops — use vectorization ────────────────
# SLOW: element-at-a-time Python loop
data = np.random.rand(100_000)
start = time.perf_counter()
result_slow = [0] * len(data)
for i, x in enumerate(data):
    result_slow[i] = x * 2 + 1
time_slow = time.perf_counter() - start
# FAST: the same arithmetic as one array expression
start = time.perf_counter()
result_fast = data * 2 + 1
time_fast = time.perf_counter() - start
print(f"\nLoop: {time_slow:.5f}s")
print(f"Vectorized: {time_fast:.5f}s ({time_slow/time_fast:.0f}× faster)")
1. Always prefer vectorized operations over Python loops.
2. Use the smallest dtype that fits your data (float32 instead of float64 when possible).
3. Prefer views over copies (avoid unnecessary .copy()).
4. Use np.dot() or @ for matrix multiplication — never nested loops.
5. Pre-allocate output arrays with np.empty() or np.zeros() for in-place ops.