Scientific Computing &
Statistical Analysis with SciPy
SciPy builds on NumPy to provide algorithms for integration, optimization, linear algebra, statistical testing, and signal processing. It is the backbone of scientific Python — bridging pure math and real data science workflows.
Integration, Optimization, and Equation Solving
SciPy's integrate, optimize, and interpolate submodules solve the most common mathematical problems in science and engineering — computing areas under curves, finding minima, and fitting smooth functions through data points.
Tip: prefer targeted imports such as `from scipy.integrate import quad` rather than `import scipy`. This keeps imports fast and explicit.
from scipy import integrate
import numpy as np
# βββ 1. quad: single integral ββββββββββββββββββββ
# Integrate f(x) = x^2 from 0 to 3
result, error = integrate.quad(lambda x: x**2, 0, 3)
print(f"β«βΒ³ xΒ² dx = {result:.4f} (exact: 9.0)") # β 9.0000
print(f"Estimated error: {error:.2e}") # β ~1e-13
# βββ 2. dblquad: double integral βββββββββββββββββ
# β«β« (x + y) dx dy over [0,1] Γ [0,1]
result2, _ = integrate.dblquad(
lambda y, x: x + y,
0, 1, # x limits
lambda x: 0, # y lower limit (can depend on x)
lambda x: 1 # y upper limit
)
print(f"β¬ (x+y) dA = {result2:.4f}") # β 1.0000
# βββ 3. trapezoid: integrate from data points ββββ
x = np.linspace(0, np.pi, 100)
y = np.sin(x)
area = integrate.trapezoid(y, x)
print(f"β«β^Ο sin(x) dx β {area:.4f}") # β 2.0000 (exact: 2)
# βββ 4. cumulative_trapezoid: running integral βββ
x = np.array([0, 1, 2, 3, 4])
y = np.array([0, 1, 4, 9, 16]) # y = xΒ²
cumulative = integrate.cumulative_trapezoid(y, x, initial=0)
print(f"Cumulative integral: {cumulative}")
# β [ 0. 0.5 3. 9.5 21.5] (approx of xΒ³/3)
from scipy import optimize
import numpy as np

np.random.seed(42)  # make the curve_fit demo reproducible

# ─── 1. minimize_scalar: single-variable minimum ─
result = optimize.minimize_scalar(lambda x: (x - 3)**2 + 2)
print(f"Minimum at x = {result.x:.4f}, f(x) = {result.fun:.4f}")
# → x = 3.0000, f(x) = 2.0000

# ─── 2. minimize: multi-variable minimum ─────────
def rosenbrock(xy):
    """Classic banana-shaped test function; global minimum at (1, 1)."""
    x, y = xy
    return (1 - x)**2 + 100*(y - x**2)**2

result = optimize.minimize(rosenbrock, x0=[0, 0], method="BFGS")
print(f"Rosenbrock minimum: {result.x}")  # → [1. 1.]
print(f"Iterations: {result.nit}")

# ─── 3. curve_fit: fit parameters to data ────────
def model(x, a, b, c):
    """Exponential decay with offset: a * exp(-b*x) + c."""
    return a * np.exp(-b * x) + c

xdata = np.linspace(0, 4, 50)
ydata = model(xdata, 2.5, 1.3, 0.5) + 0.2*np.random.randn(50)
popt, pcov = optimize.curve_fit(model, xdata, ydata, p0=[2, 1, 0])
print(f"Fitted params: a={popt[0]:.2f}, b={popt[1]:.2f}, c={popt[2]:.2f}")
# → a≈2.50, b≈1.30, c≈0.50

# ─── 4. brentq: find root of f(x) = 0 ────────────
# Bracketing root finder: f must change sign on [0, 1].
root = optimize.brentq(lambda x: np.cos(x) - x, 0, 1)
print(f"cos(x) = x at x = {root:.6f}")  # → 0.739085
Linear Algebra with SciPy
scipy.linalg extends NumPy's linear algebra with more algorithms and better performance. It handles solving systems of equations, matrix decompositions, and the core computations behind machine learning (PCA, SVD, least-squares).
Prefer scipy.linalg over numpy.linalg — it always uses optimized LAPACK/BLAS routines and provides more functions. numpy.linalg may fall back to slower implementations on some platforms.
from scipy import linalg
import numpy as np

# ─── 1. Solve linear system Ax = b ───────────────
# Prefer solve() over inv(A) @ b — faster and numerically more stable.
A = np.array([[3, 1], [1, 2]], dtype=float)
b = np.array([9, 8], dtype=float)
x = linalg.solve(A, b)
print(f"Solution: x = {x}")    # → [2. 3.]
print(f"Verify Ax = {A @ x}")  # → [9. 8.] ✓

# ─── 2. Determinant and inverse ──────────────────
det = linalg.det(A)
inv = linalg.inv(A)
print(f"det(A) = {det:.2f}")  # → 5.00
print(f"A⁻¹ =\n{inv}")

# ─── 3. Eigenvalues and eigenvectors ─────────────
# eig() returns complex eigenvalues; .real is safe here since A is symmetric.
vals, vecs = linalg.eig(A)
print(f"Eigenvalues: {vals.real}")  # → [3.62 1.38]  ((5 ± √5)/2)
print(f"Eigenvectors:\n{vecs}")

# ─── 4. SVD: Singular Value Decomposition ────────
M = np.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9]], dtype=float)
U, s, Vt = linalg.svd(M)
print(f"Singular values: {s.round(2)}")  # → [16.85  1.07  0.  ]
print(f"Rank ≈ {np.sum(s > 1e-10)}")     # → 2 (rank-deficient!)

# ─── 5. LU Decomposition ─────────────────────────
# Factorization A = P @ L @ U (P is a permutation matrix).
P, L, U = linalg.lu(A)
print(f"L:\n{L.round(3)}")
print(f"U:\n{U.round(3)}")
from scipy import linalg
import numpy as np
# βββ Least-squares: overdetermined system ββββββββ
# Fit y = ax + b to noisy data
np.random.seed(42)
x = np.linspace(0, 10, 20)
y_true = 2.5 * x + 1.0
y = y_true + np.random.randn(20) * 2
# Build design matrix [x, 1] for linear regression
A = np.column_stack([x, np.ones(len(x))])
result = linalg.lstsq(A, y) # (solution, residuals, rank, sv)
coeffs = result[0]
print(f"Fitted: a={coeffs[0]:.3f}, b={coeffs[1]:.3f}")
# β aβ2.50, bβ1.00 (recovers true params)
# βββ QR Decomposition ββββββββββββββββββββββββββββ
Q, R = linalg.qr(A)
print(f"Q shape: {Q.shape}") # β (20, 20)
print(f"R shape: {R.shape}") # β (20, 2)
print(f"Q orthogonal: {np.allclose(Q @ Q.T, np.eye(20))}") # True
# βββ Cholesky (symmetric positive definite) ββββββ
S = np.array([[4, 2], [2, 3]], dtype=float)
L = linalg.cholesky(S, lower=True)
print(f"Cholesky L:\n{L}")
print(f"Verify L @ L.T β S: {np.allclose(L @ L.T, S)}") # True
Hypothesis Testing and Statistical Tests
scipy.stats is a comprehensive statistics library covering probability distributions, descriptive statistics, and a full suite of hypothesis tests. Understanding p-values and test choice is essential for any data analysis role.
from scipy import stats
import numpy as np

np.random.seed(42)

# ─── 1. One-sample t-test ────────────────────────
# Is this sample mean significantly different from 5.0?
sample = np.random.normal(loc=5.3, scale=1.2, size=30)
t_stat, p_value = stats.ttest_1samp(sample, popmean=5.0)
print(f"One-sample t-test: t={t_stat:.3f}, p={p_value:.4f}")
print(f"→ {'Reject H₀' if p_value < 0.05 else 'Fail to reject H₀'}")

# ─── 2. Two-sample t-test (independent) ──────────
group_a = np.random.normal(5.0, 1.0, 40)
group_b = np.random.normal(5.8, 1.0, 40)
t2, p2 = stats.ttest_ind(group_a, group_b, equal_var=True)
print(f"\nTwo-sample t-test: t={t2:.3f}, p={p2:.4f}")

# ─── 3. Paired t-test ────────────────────────────
# Same subjects measured before and after a treatment.
before = np.array([82, 91, 75, 88, 79, 95, 83, 77, 84, 90])
after = np.array([85, 94, 78, 90, 82, 96, 87, 80, 88, 93])
t3, p3 = stats.ttest_rel(before, after)
print(f"\nPaired t-test: t={t3:.3f}, p={p3:.4f}")

# ─── 4. One-way ANOVA ────────────────────────────
g1 = np.random.normal(5.0, 1.0, 20)
g2 = np.random.normal(6.0, 1.0, 20)
g3 = np.random.normal(5.5, 1.0, 20)
f_stat, p_anova = stats.f_oneway(g1, g2, g3)
print(f"\nANOVA: F={f_stat:.3f}, p={p_anova:.4f}")

# ─── 5. Chi-square test of independence ──────────
observed = np.array([[30, 10], [20, 40]])  # contingency table
chi2, p_chi, dof, expected = stats.chi2_contingency(observed)
print(f"\nChi-square: χ²={chi2:.3f}, df={dof}, p={p_chi:.4f}")

# ─── 6. Normality test (Shapiro-Wilk) ────────────
normal_data = np.random.normal(0, 1, 50)
skewed_data = np.random.exponential(1, 50)
_, p_norm = stats.shapiro(normal_data)
_, p_skew = stats.shapiro(skewed_data)
print(f"\nShapiro normal data p={p_norm:.3f} → {'Normal' if p_norm>0.05 else 'Not normal'}")
print(f"Shapiro skewed data p={p_skew:.4f} → {'Normal' if p_skew>0.05 else 'Not normal'}")
from scipy import stats
import numpy as np

np.random.seed(42)  # reproducible samples for the descriptive-stats demo

# ─── Continuous distributions ────────────────────
# PDF, CDF, PPF (percent-point = inverse CDF), RVS (random samples)
norm = stats.norm(loc=0, scale=1)  # standard normal N(0,1)
print(f"PDF at 0: {norm.pdf(0):.4f}")       # → 0.3989
print(f"CDF at 1.96:{norm.cdf(1.96):.4f}")  # → 0.9750
print(f"PPF 0.975: {norm.ppf(0.975):.4f}")  # → 1.9600 (z* for 95% CI)

# t-distribution (for small samples)
t_dist = stats.t(df=10)
print(f"\nt(10) critical value: {t_dist.ppf(0.975):.4f}")  # → 2.2281

# ─── Descriptive stats ───────────────────────────
data = np.random.normal(50, 10, 200)
desc = stats.describe(data)
print(f"\nn={desc.nobs}, mean={desc.mean:.2f}")
print(f"variance={desc.variance:.2f}, skew={desc.skewness:.3f}")
print(f"kurtosis={desc.kurtosis:.3f}")

# ─── Confidence interval ─────────────────────────
# 95% CI for the mean: t critical value × standard error around the mean.
ci = stats.t.interval(0.95, df=len(data)-1,
                      loc=np.mean(data),
                      scale=stats.sem(data))
print(f"\n95% CI: ({ci[0]:.2f}, {ci[1]:.2f})")

# ─── Pearson and Spearman correlation ────────────
x = np.random.randn(50)
y = 2*x + np.random.randn(50)
r_p, p_p = stats.pearsonr(x, y)
r_s, p_s = stats.spearmanr(x, y)
print(f"\nPearson r={r_p:.3f} (p={p_p:.4f})")
print(f"Spearman ρ={r_s:.3f} (p={p_s:.4f})")
Signal Processing with SciPy
scipy.signal and scipy.fft enable frequency-domain analysis and filtering. These tools are essential for time series data, audio processing, sensor data, and any domain where you need to remove noise or identify periodic patterns.
Use digital filters (e.g. butter + sosfilt) to remove specific frequency bands from a signal.
from scipy.fft import fft, fftfreq
from scipy import signal
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)  # reproducible noise so the detected peaks are stable

# ─── Create composite signal ─────────────────────
fs = 1000  # sampling frequency (Hz)
t = np.linspace(0, 1, fs, endpoint=False)
# Signal = 50 Hz sine + 120 Hz sine + noise
clean = (np.sin(2*np.pi*50*t) +
         0.5*np.sin(2*np.pi*120*t))
noisy = clean + 0.3*np.random.randn(len(t))

# ─── FFT: frequency analysis ─────────────────────
N = len(t)
yf = fft(noisy)
xf = fftfreq(N, 1/fs)[:N//2]         # positive frequencies only
magnitude = 2/N * np.abs(yf[:N//2])  # one-sided amplitude spectrum

# Find dominant frequencies
peaks, _ = signal.find_peaks(magnitude, height=0.1)
print("Dominant frequencies (Hz):", xf[peaks].round(1))
# → [ 50. 120.] ✓ (correctly identified both components)

# ─── Butterworth low-pass filter ─────────────────
cutoff = 80  # Hz — keep below 80 Hz, remove the 120 Hz component
order = 5
# Second-order sections (sos) are numerically safer than (b, a) coefficients.
sos = signal.butter(order, cutoff, fs=fs, btype='low', output='sos')
filtered = signal.sosfilt(sos, noisy)

# ─── Plot original vs filtered ───────────────────
fig, axes = plt.subplots(3, 1, figsize=(10, 8), sharex=False)
axes[0].plot(t[:200], noisy[:200], alpha=.7, color='gray')
axes[0].set_title("Noisy Signal (first 200ms)")
axes[1].plot(xf, magnitude, color='steelblue')
axes[1].set_title("FFT Magnitude Spectrum")
axes[1].set_xlabel("Frequency (Hz)")
axes[2].plot(t[:200], filtered[:200], color='teal')
axes[2].set_title("After Low-pass Filter (cutoff=80Hz)")
axes[2].set_xlabel("Time (s)")
plt.tight_layout()
plt.show()
from scipy import signal
import numpy as np

np.random.seed(1)  # reproducible demo noise for peak detection

# ─── Convolution: smoothing with a kernel ────────
data = np.array([1, 2, 3, 4, 3, 2, 1, 5, 1, 2, 3])
kernel = np.ones(3) / 3  # moving-average kernel
smoothed = signal.convolve(data, kernel, mode='same')
print("Original:", data)
print("Smoothed:", smoothed.round(2))

# ─── Cross-correlation: find time lag ────────────
# Detect delay between two versions of a signal
x = np.array([0, 0, 1, 2, 1, 0, 0, 0, 0, 0])
y = np.array([0, 0, 0, 0, 1, 2, 1, 0, 0, 0])  # x delayed by 2
correlation = signal.correlate(y, x, mode='full')
lags = signal.correlation_lags(len(y), len(x), mode='full')
lag = lags[np.argmax(correlation)]
print(f"\nDetected lag: {lag} samples")  # → 2

# ─── Peak detection ──────────────────────────────
# Synthetic heartbeat-like trace: 10 sine cycles plus mild noise.
# (Replaces the original dead `np.loadtxt("ecg.csv") if False else` branch.)
ecg = np.sin(np.linspace(0, 20*np.pi, 500)) + 0.1*np.random.randn(500)
# distance=20 suppresses noise wiggles near each true peak (50 samples/cycle).
peaks, properties = signal.find_peaks(ecg, height=0.5, distance=20)
print(f"\nFound {len(peaks)} peaks at indices: {peaks[:5]}...")
Key submodules:
- scipy.integrate — numerical integration (quad, trapezoid, odeint)
- scipy.optimize — minimize, curve_fit, root finding, least squares
- scipy.linalg — solve, SVD, eigenvalues, LU, QR, Cholesky
- scipy.stats — distributions, t-test, ANOVA, chi-square, normality
- scipy.signal — filtering, convolution, peak detection, spectrogram
- scipy.fft — FFT, inverse FFT, frequency analysis
- scipy.interpolate — splines, interp1d, griddata