⚙️ Part 1 — General Python Programming
Master Python fundamentals: syntax, data structures, OOP, functional patterns, and advanced concepts — written for clarity, with real runnable code.
1Setup & Environment
Installing Python, setting up VS Code, managing packages with pip and virtual environments.
# Check Python version
python --version
# Python 3.11.5
# Create virtual environment
python -m venv venv
# Activate (Windows)
.\venv\Scripts\activate
# Activate (Mac / Linux)
source venv/bin/activate
# Install data science packages
pip install numpy pandas matplotlib seaborn scikit-learn scipy
# List installed packages
pip list
# Save requirements
pip freeze > requirements.txt
# Install from requirements
pip install -r requirements.txt
| Command | Purpose | Example |
|---|---|---|
python --version |
Check Python version | Python 3.11.5 |
pip install X |
Install package | pip install pandas |
pip list |
List packages | Shows all installed |
python -m venv venv |
Create virtual env | Isolated env folder |
pip freeze |
Export packages | For requirements.txt |
2Variables & Data Types
Python is dynamically typed — no explicit type declarations needed. Every value has a type, but variables can hold any type.
# ── Variable Assignment ──────────────────────────
name = "Alice" # str
age = 25 # int
height = 5.6 # float
is_student = True # bool
nothing = None # NoneType
complex_n = 3 + 4j # complex
print(f"name = {name!r:20s} type = {type(name).__name__}")
print(f"age = {age!r:20} type = {type(age).__name__}")
print(f"height = {height!r:20} type = {type(height).__name__}")
print(f"is_student = {is_student!r:20} type = {type(is_student).__name__}")
print(f"nothing = {nothing!r:20} type = {type(nothing).__name__}")
# ── Multiple Assignment ───────────────────────────
x, y, z = 10, 20, 30 # tuple unpacking
a = b = c = 100 # same value
x, y = y, x # swap without temp variable
print(f"\nAfter swap: x={x}, y={y}")
# ── Numeric Operations ────────────────────────────
print(f"7 / 2 = {7 / 2}") # True division → float
print(f"7 // 2 = {7 // 2}") # Floor division → int
print(f"7 % 2 = {7 % 2}") # Modulo
print(f"2 ** 8 = {2 ** 8}") # Power
# ── Type Conversion ───────────────────────────────
print(int("42")) # 42
print(float("3.14")) # 3.14
print(str(100)) # '100'
print(bool(0)) # False
print(bool(42)) # True
name = 'Alice' type = str age = 25 type = int height = 5.6 type = float is_student = True type = bool nothing = None type = NoneType After swap: x=20, y=10 7 / 2 = 3.5 7 // 2 = 3 7 % 2 = 1 2 ** 8 = 256 42 3.14 '100' False True
3Strings & String Methods
Python strings are immutable sequences of Unicode characters. They offer a rich set of built-in methods — essential for data cleaning and text processing.
# ── String Creation ───────────────────────────────
s1 = 'Single quotes'
s2 = "Double quotes"
s3 = """Multi-line
string"""
s4 = r"Raw: C:\Users\new" # no escape processing
# ── f-Strings (preferred) ─────────────────────────
name, age, score = "Alice", 28, 95.678
print(f"Hello {name}! Age={age} Score={score:.2f}")
# Alignment
print(f"{'Name':>12} {'Score':>8}")
print(f"{'─'*12} {'─'*8}")
for n, s in [("Alice", 95.7), ("Bob", 87.2), ("Charlie", 92.1)]:
print(f"{n:>12} {s:>8.1f}")
# ── Slicing ───────────────────────────────────────
text = "Python Data Science"
print(text[0:6]) # Python
print(text[7:11]) # Data
print(text[::-1]) # ecneicS ataD nohtyP
# ── Common Methods ────────────────────────────────
s = " Hello, World! "
print(s.strip()) # Remove whitespace
print(s.strip().lower()) # hello, world!
print(s.strip().replace("World", "Python"))
print("a,b,c".split(",")) # ['a', 'b', 'c']
print("-".join(["Data","Science"])) # Data-Science
print("hello".upper()) # HELLO
print("42".zfill(8)) # 00000042
print("Python" in "Python rocks") # True
Hello Alice! Age=28 Score=95.68
Name Score
──── ────────
Alice 95.7
Bob 87.2
Charlie 92.1
Python
Data
ecneicS ataD nohtyP
Hello, World!
hello, world!
Hello, Python!
['a', 'b', 'c']
Data-Science
HELLO
00000042
True
| Method | Description | Example |
|---|---|---|
strip() |
Remove whitespace from both ends | " hi ".strip() → "hi" |
split(sep) |
Split into list by separator | "a,b".split(",") → ['a','b'] |
join(lst) |
Join list into string | "-".join(['a','b']) → "a-b" |
replace(a,b) |
Replace all occurrences | "hi".replace("i","o") → "ho" |
find(sub) |
Return index of first match | "hello".find("ll") → 2 |
startswith() |
Check prefix | "Python".startswith("Py") → True |
format() |
Format string | "Hi {}".format("Bob") |
4Operators
Python has arithmetic, comparison, logical, identity, membership, and bitwise operators.
a, b = 15, 4
# ── Arithmetic ────────────────────────────────────
print(f"a+b = {a+b} a-b = {a-b} a*b = {a*b}")
print(f"a/b = {a/b} a//b = {a//b} a%b = {a%b} a**b = {a**b}")
# ── Comparison ────────────────────────────────────
print(f"{a} > {b} → {a > b}")
print(f"{a} == {b} → {a == b}")
# ── Logical ───────────────────────────────────────
x, y = True, False
print(f"x and y = {x and y}")
print(f"x or y = {x or y}")
print(f"not x = {not x}")
# ── Chained comparisons (Pythonic!) ───────────────
n = 15
print(f"10 < {n} < 20 → {10 < n < 20}")
# ── Identity & Membership ─────────────────────────
lst = [1, 2, 3]
print(f"2 in lst → {2 in lst}")
print(f"lst is lst → {lst is lst}")
print(f"[1,2,3] is [1,2,3]→ {[1,2,3] is [1,2,3]}") # False! Different object
# ── Augmented Assignment ──────────────────────────
c = 10
c += 5; print(f"c += 5 → {c}")
c *= 2; print(f"c *= 2 → {c}")
c **= 2; print(f"c **= 2 → {c}")
a+b = 19 a-b = 11 a*b = 60 a/b = 3.75 a//b = 3 a%b = 3 a**b = 50625 15 > 4 → True 15 == 4 → False x and y = False x or y = True not x = False 10 < 15 < 20 → True 2 in lst → True lst is lst → True [1,2,3] is [1,2,3]→ False c += 5 → 15 c *= 2 → 30 c **= 2 → 900
5Collections — List, Tuple, Set, Dict
Python's four core collection types. Understanding when to use each is crucial for writing efficient, readable code.
# ── LISTS ─────────────────────────────────────────
nums = [3, 1, 4, 1, 5, 9, 2, 6]
print(nums[0]) # 3 (first)
print(nums[-1]) # 6 (last)
print(nums[2:5]) # [4, 1, 5]
print(nums[::2]) # [3, 4, 5, 2] (every 2nd)
nums.append(7) # add to end
nums.insert(0, 0) # insert at index
nums.remove(1) # remove first occurrence
popped = nums.pop() # remove & return last
# List comprehensions (Pythonic!)
squares = [x**2 for x in range(1, 11)]
evens = [x for x in range(20) if x % 2 == 0]
matrix = [[i*j for j in range(1,4)] for i in range(1,4)]
print(f"squares: {squares}")
print(f"evens: {evens}")
print(f"matrix: {matrix}")
# ── DICTIONARIES ──────────────────────────────────
person = {
"name": "Alice", "age": 28,
"skills": ["Python", "ML", "SQL"]
}
person["city"] = "New York" # add key
person["age"] += 1 # update
# Dict comprehension
sq_dict = {x: x**2 for x in range(1, 6)}
print(f"sq_dict: {sq_dict}")
# Iterating
for key, val in person.items():
print(f" {key}: {val}")
# ── SETS ──────────────────────────────────────────
A = {1, 2, 3, 4, 5}
B = {4, 5, 6, 7, 8}
print(f"A|B = {A|B}") # union
print(f"A&B = {A&B}") # intersection
print(f"A-B = {A-B}") # difference
# Deduplication
raw = [1,2,2,3,3,3,4]
uniq = sorted(set(raw))
print(f"unique: {uniq}")
3
6
[4, 1, 5]
[3, 4, 5, 2]
squares: [1, 4, 9, 16, 25, 36, 49, 64, 81, 100]
evens: [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
matrix: [[1, 2, 3], [2, 4, 6], [3, 6, 9]]
sq_dict: {1: 1, 2: 4, 3: 9, 4: 16, 5: 25}
name: Alice
age: 29
skills: ['Python', 'ML', 'SQL']
city: New York
A|B = {1, 2, 3, 4, 5, 6, 7, 8}
A&B = {4, 5}
A-B = {1, 2, 3}
unique: [1, 2, 3, 4]
6Control Flow — if / elif / else
Python uses indentation (4 spaces) to define blocks. The match statement
(Python 3.10+) offers structural pattern matching.
# ── if / elif / else ──────────────────────────────
def grade(score):
    """Map a numeric score to a letter grade with a short label."""
    bands = [
        (90, "A – Excellent"),
        (80, "B – Good"),
        (70, "C – Average"),
        (60, "D – Below Avg"),
    ]
    for cutoff, label in bands:
        if score >= cutoff:
            return label
    return "F – Fail"
for s in [96, 83, 72, 61, 45]:
print(f"score={s:3d} → {grade(s)}")
# ── Ternary expression ────────────────────────────
age = 20
status = "Adult" if age >= 18 else "Minor"
print(f"\n{age} → {status}")
# ── Chained comparison (Pythonic!) ────────────────
x = 15
if 10 < x < 20:
print(f"{x} is between 10 and 20")
# ── match / case (Python 3.10+) ───────────────────
def http_msg(code):
    """Return a short human-readable message for an HTTP status code."""
    known = {200: "OK", 404: "Not Found", 500: "Server Error"}
    if code in known:
        return known[code]
    return f"Unknown ({code})"
for c in [200, 404, 500, 301]:
print(f"HTTP {c} → {http_msg(c)}")
score= 96 → A – Excellent score= 83 → B – Good score= 72 → C – Average score= 61 → D – Below Avg score= 45 → F – Fail 20 → Adult 15 is between 10 and 20 HTTP 200 → OK HTTP 404 → Not Found HTTP 500 → Server Error HTTP 301 → Unknown (301)
7Loops — for & while
Python loops are clean and expressive. enumerate, zip, and
comprehensions make most loops one-liners.
# ── for loop ──────────────────────────────────────
for i in range(5):
print(i, end=" ") # 0 1 2 3 4
print()
# enumerate → index + value
fruits = ["apple", "banana", "cherry"]
for idx, fruit in enumerate(fruits, start=1):
print(f" {idx}. {fruit}")
# zip → parallel iteration
names = ["Alice", "Bob", "Carol"]
scores = [92, 85, 97 ]
for name, score in zip(names, scores):
print(f" {name}: {score}")
# ── while loop ────────────────────────────────────
n = 1
while n <= 32:
print(n, end=" ")
n *= 2
print()
# break / continue
for i in range(1, 16):
if i % 3 == 0: continue # skip multiples of 3
if i > 10: break # stop at 10
print(i, end=" ")
print()
# ── Loop + else (unique Python feature!) ──────────
for n in [7, 11, 9]:
for d in range(2, int(n**0.5)+1):
if n % d == 0: break
else:
print(f"{n} is prime") # runs if no break occurred
0 1 2 3 4 1. apple 2. banana 3. cherry Alice: 92 Bob: 85 Carol: 97 1 2 4 8 16 32 1 2 4 5 7 8 7 is prime 11 is prime
8Functions
Functions are first-class citizens in Python — they can be passed as arguments, returned from other functions, and stored in variables.
# ── Default & keyword args ────────────────────────
def greet(name: str, greeting: str = "Hello") -> str:
    """Build a greeting like 'Hello, Alice!' from a name and optional greeting."""
    return "{}, {}!".format(greeting, name)
print(greet("Alice")) # Hello, Alice!
print(greet("Bob", "Hi")) # Hi, Bob!
print(greet(greeting="Hey", name="Carol")) # Hey, Carol!
# ── *args and **kwargs ────────────────────────────
def stats(*numbers):
    """Summarize the positional arguments: count, mean, min, and max."""
    count = len(numbers)
    total = sum(numbers)
    return {
        "count": count,
        "mean": total / count,
        "min": min(numbers),
        "max": max(numbers),
    }
print(stats(10, 20, 35, 40, 15))
def build_record(**fields):
    """Collect arbitrary keyword arguments into a plain dict record."""
    record = dict(fields)
    return record
rec = build_record(name="Alice", age=28, dept="DS")
print(rec)
# ── Return multiple values ────────────────────────
def minmax(data):
    """Return the (smallest, largest) values of *data* as a tuple."""
    lo = min(data)
    hi = max(data)
    return lo, hi
lo, hi = minmax([34, 12, 56, 78, 23])
print(f"min={lo}, max={hi}")
# ── Closure (factory function) ────────────────────
def make_multiplier(factor):
    """Factory: return a function that multiplies its argument by *factor*.

    The returned callable closes over *factor*, so each factory call
    produces an independent multiplier.
    """
    return lambda x: x * factor
double = make_multiplier(2)
triple = make_multiplier(3)
print(f"double(7) = {double(7)}")
print(f"triple(7) = {triple(7)}")
# ── Generator ─────────────────────────────────────
def fibonacci(n):
    """Yield Fibonacci numbers from 0 up to and including *n* (by value, not count)."""
    prev, curr = 0, 1
    while prev <= n:
        yield prev
        prev, curr = curr, prev + curr
print(list(fibonacci(100)))
Hello, Alice!
Hi, Bob!
Hey, Carol!
{'count': 5, 'mean': 24.0, 'min': 10, 'max': 40}
{'name': 'Alice', 'age': 28, 'dept': 'DS'}
min=12, max=78
double(7) = 14
triple(7) = 21
[0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89]
9Lambda & Functional Programming
Lambdas are anonymous one-line functions. map, filter, and
reduce are functional tools — though list comprehensions are often preferred.
from functools import reduce
# ── Lambda ────────────────────────────────────────
square = lambda x: x**2
add = lambda x, y: x + y
clamp = lambda x, lo, hi: max(lo, min(hi, x))
print(square(9)) # 81
print(add(5, 7)) # 12
print(clamp(150, 0, 100)) # 100
# ── Sorting with lambda ───────────────────────────
employees = [
{"name": "Bob", "salary": 88_000},
{"name": "Alice", "salary": 95_000},
{"name": "Carol", "salary": 102_000},
]
by_salary = sorted(employees, key=lambda e: e["salary"], reverse=True)
for e in by_salary:
print(f" {e['name']:8s}: ${e['salary']:,}")
# ── map / filter / reduce ─────────────────────────
nums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
squares = list(map(lambda x: x**2, nums))
evens = list(filter(lambda x: x%2==0, nums))
product = reduce(lambda a, x: a*x, nums)
print(f"squares: {squares}")
print(f"evens: {evens}")
print(f"product: {product}") # 10! = 3628800
# ── Decorators ────────────────────────────────────
import time
from functools import wraps
def timer(func):
    """Decorator: print how long *func* takes on every call, then return its result."""
    @wraps(func)  # keep the wrapped function's name/docstring
    def wrapper(*args, **kwargs):
        began = time.perf_counter()
        out = func(*args, **kwargs)
        elapsed = time.perf_counter() - began
        print(f" [{func.__name__}] took {elapsed:.4f}s")
        return out
    return wrapper

@timer
def compute_sum(n):
    """Sum the integers 0..n inclusive (timed by @timer)."""
    return sum(range(n + 1))
print(compute_sum(1_000_000))
81 12 100 Carol : $102,000 Alice : $95,000 Bob : $88,000 squares: [1, 4, 9, 16, 25, 36, 49, 64, 81, 100] evens: [2, 4, 6, 8, 10] product: 3628800 [compute_sum] took 0.0312s 500000500000
10Object-Oriented Programming (OOP)
Python supports full OOP: classes, inheritance, polymorphism, abstract classes, and dataclasses.
Use _protected and __private naming conventions for encapsulation, and super() to call parent methods.
from dataclasses import dataclass, field
from abc import ABC, abstractmethod
import math
# ── Base class ────────────────────────────────────
class BankAccount:
    """Minimal bank account: deposits, a read-only balance, and a repr."""

    bank_name = "PyBank"  # class variable shared by every account

    def __init__(self, owner: str, balance: float = 0.0):
        self.owner = owner
        self._balance = balance  # protected by naming convention

    @property
    def balance(self):
        """Current balance (read-only from outside)."""
        return self._balance

    def deposit(self, amount: float):
        """Add *amount* to the balance and print a confirmation line."""
        self._balance = self._balance + amount
        print(f" ✔ +${amount:,.0f} Balance=${self._balance:,.0f}")

    def __repr__(self):
        return f"{self.owner}: ${self._balance:,.0f}"

class SavingsAccount(BankAccount):
    """Bank account that can credit interest at a fixed rate."""

    def __init__(self, owner, balance=0, rate=0.05):
        super().__init__(owner, balance)  # delegate shared setup to the base
        self.rate = rate

    def add_interest(self):
        """Credit one period of interest and print the new balance."""
        gained = self._balance * self.rate
        self._balance += gained
        print(f" ✔ Interest +${gained:,.0f} → ${self._balance:,.0f}")
acc = SavingsAccount("Alice", 5000, 0.04)
acc.deposit(1000)
acc.add_interest()
print(repr(acc))
# ── Abstract class ────────────────────────────────
class Shape(ABC):
    """Abstract geometric shape: subclasses must supply area and perimeter."""

    @abstractmethod
    def area(self) -> float: ...

    @abstractmethod
    def perimeter(self) -> float: ...

    def describe(self):
        """Print the subclass name with its area and perimeter (polymorphic)."""
        print(f" {self.__class__.__name__}: area={self.area():.2f}, perimeter={self.perimeter():.2f}")

class Circle(Shape):
    """Circle of radius r."""

    def __init__(self, r):
        self.r = r

    def area(self):
        return math.pi * self.r * self.r

    def perimeter(self):
        return math.pi * self.r * 2

class Rectangle(Shape):
    """Axis-aligned rectangle of width w and height h."""

    def __init__(self, w, h):
        self.w = w
        self.h = h

    def area(self):
        return self.w * self.h

    def perimeter(self):
        return (self.w + self.h) * 2
for s in [Circle(5), Rectangle(4,6)]:
s.describe()
# ── Dataclass ─────────────────────────────────────
@dataclass
class DataRecord:
    """Simple record of a person and their scores."""

    name: str
    age: int
    # default_factory avoids the shared-mutable-default pitfall
    scores: list = field(default_factory=list)

    def avg(self):
        """Mean of scores, or 0 when there are none."""
        if not self.scores:
            return 0
        return sum(self.scores) / len(self.scores)
r = DataRecord("Alice", 28, [92.5, 88.0, 95.0])
print(r)
print(f"avg = {r.avg():.2f}")
✔ +$1,000 Balance=$6,000 ✔ Interest +$240 → $6,240 Alice: $6,240 Circle: area=78.54, perimeter=31.42 Rectangle: area=24.00, perimeter=20.00 DataRecord(name='Alice', age=28, scores=[92.5, 88.0, 95.0]) avg = 91.83
11Error Handling & Exceptions
Robust programs anticipate failures. Python's try/except/else/finally gives
full control over error recovery.
# ── try / except / else / finally ────────────────
def safe_divide(a, b):
    """Divide a by b, printing the outcome.

    Returns the quotient, or None when b is zero.  Demonstrates the full
    try / except / else / finally flow: `else` runs only when no exception
    was raised, and `finally` always runs.
    """
    try:
        result = a / b
    except ZeroDivisionError as e:
        print(f" ✖ Error: {e}")
        return None
    else:
        print(f" ✔ {a}/{b} = {result}")
        return result
    finally:
        # fix: original used an f-string with no placeholders (stray f prefix)
        print(" (cleanup block always runs)")
safe_divide(10, 2)
print()
safe_divide(10, 0)
# ── Multiple exception types ──────────────────────
for val in ["42", "abc", None, 3.14]:
try:
print(f" int({val!r}) = {int(val)}")
except (ValueError, TypeError) as e:
print(f" ✖ {e}")
# ── Custom exceptions ─────────────────────────────
class DataValidationError(ValueError):
    """Validation failure for a named field; message embeds field, rule, and value."""

    def __init__(self, field, value, msg):
        detail = f"'{field}': {msg} (got {value!r})"
        super().__init__(detail)

def validate_age(age):
    """Return *age* unchanged if it is an int in [0, 120], else raise DataValidationError."""
    if not isinstance(age, int):
        raise DataValidationError("age", age, "must be int")
    if age < 0 or age > 120:
        raise DataValidationError("age", age, "must be 0–120")
    return age
for v in [25, -5, "thirty"]:
try:
print(f" validate({v!r}) → {validate_age(v)}")
except DataValidationError as e:
print(f" ✖ {e}")
✔ 10/2 = 5.0
(cleanup block always runs)
✖ Error: division by zero
(cleanup block always runs)
int('42') = 42
✖ invalid literal for int() with base 10: 'abc'
✖ int() argument must be a string, a bytes-like object or a real number, not 'NoneType'
✖ int() can't convert non-string with explicit base
validate(25) → 25
✖ 'age': must be 0–120 (got -5)
✖ 'age': must be int (got 'thirty')
12File I/O
Reading and writing files is fundamental. Always use context managers (with)
to ensure files are properly closed.
import json, csv
from pathlib import Path
# ── Text files ────────────────────────────────────
# Write
with open("data.txt", "w") as f:
f.write("Line 1\n")
f.writelines(["Line 2\n", "Line 3\n"])
# Read all
with open("data.txt") as f:
content = f.read()
print(content)
# Read line by line (memory-efficient for large files)
with open("data.txt") as f:
for line in f:
print(line.rstrip())
# ── CSV ───────────────────────────────────────────
import csv
data = [["Alice",28,95000], ["Bob",34,88000], ["Carol",25,79000]]
with open("employees.csv", "w", newline="") as f:
writer = csv.writer(f)
writer.writerow(["name","age","salary"])
writer.writerows(data)
with open("employees.csv") as f:
reader = csv.DictReader(f)
for row in reader:
print(f" {row['name']:8s} age={row['age']} salary=${int(row['salary']):,}")
# ── JSON ──────────────────────────────────────────
config = {"model":"RandomForest", "n_estimators":100, "max_depth":5}
with open("config.json", "w") as f:
json.dump(config, f, indent=2)
with open("config.json") as f:
loaded = json.load(f)
print(f"Loaded: {loaded}")
# ── pathlib (modern path handling) ────────────────
p = Path("employees.csv")
print(f"exists: {p.exists()}")
print(f"size: {p.stat().st_size} bytes")
print(f"stem: {p.stem}")
print(f"suffix: {p.suffix}")
Line 1
Line 2
Line 3
Line 1
Line 2
Line 3
Alice age=28 salary=$95,000
Bob age=34 salary=$88,000
Carol age=25 salary=$79,000
Loaded: {'model': 'RandomForest', 'n_estimators': 100, 'max_depth': 5}
exists: True
size: 53 bytes
stem: employees
suffix: .csv
13Advanced Concepts
Context managers, type hints, comprehensions at scale, and the itertools /
collections modules.
from typing import Optional, Union, TypeVar
from collections import Counter, defaultdict
import itertools
from contextlib import contextmanager
# ── Type hints ────────────────────────────────────
def process(data: list[int], threshold: float = 0.5) -> dict[str, float]:
    """Return the mean of *data* together with the threshold that was applied."""
    mean_value = sum(data) / len(data)
    return {"mean": mean_value, "threshold": threshold}
result: dict[str, float] = process([10, 20, 30, 40])
print(result)
# ── Counter ───────────────────────────────────────
words = "the quick brown fox jumps over the lazy dog the".split()
freq = Counter(words)
print(f"top 3: {freq.most_common(3)}")
# ── defaultdict ───────────────────────────────────
by_dept = defaultdict(list)
records = [("Alice","DS"), ("Bob","Eng"), ("Carol","DS"), ("Dave","Eng")]
for name, dept in records:
by_dept[dept].append(name)
print(dict(by_dept))
# ── itertools ─────────────────────────────────────
# Combinations
items = ["A","B","C","D"]
print("combinations(2):", list(itertools.combinations(items, 2)))
# Chain
chain = list(itertools.chain([1,2,3],[4,5,6],[7,8,9]))
print(f"chain: {chain}")
# groupby
data = [("DS","Alice"),("DS","Carol"),("Eng","Bob"),("Eng","Dave")]
for dept, group in itertools.groupby(data, key=lambda x: x[0]):
print(f" {dept}: {[x[1] for x in group]}")
# ── Context manager ───────────────────────────────
import time
@contextmanager
def timer(label):
    """Context manager that prints how long its `with` block took.

    Fix: the yield is wrapped in try/finally so the timing line is printed
    even when the body raises (the pattern recommended by the contextlib
    documentation for generator-based context managers).
    """
    start = time.perf_counter()
    try:
        yield
    finally:
        print(f" [{label}] {time.perf_counter()-start:.4f}s")
with timer("list comp"):
data = [x**2 for x in range(500_000)]
{'mean': 25.0, 'threshold': 0.5}
top 3: [('the', 3), ('quick', 1), ('brown', 1)]
{'DS': ['Alice', 'Carol'], 'Eng': ['Bob', 'Dave']}
combinations(2): [('A','B'),('A','C'),('A','D'),('B','C'),('B','D'),('C','D')]
chain: [1, 2, 3, 4, 5, 6, 7, 8, 9]
DS: ['Alice', 'Carol']
Eng: ['Bob', 'Dave']
[list comp] 0.0421s
📊 Part 2 — Python for Data Science
From NumPy arrays to machine learning pipelines. Every section includes real code with actual outputs. Built by a practitioner with 15 years of Data Science experience.
1The Data Science Ecosystem
Python's data science stack is the most powerful in the world. Here's how each library fits into the workflow.
| Library | Install | Import Convention | Primary Use |
|---|---|---|---|
| NumPy | pip install numpy |
import numpy as np |
Arrays, math |
| Pandas | pip install pandas |
import pandas as pd |
DataFrames, tabular data |
| Matplotlib | pip install matplotlib |
import matplotlib.pyplot as plt |
Plotting |
| Seaborn | pip install seaborn |
import seaborn as sns |
Statistical charts |
| Scikit-learn | pip install scikit-learn |
from sklearn import … |
Machine learning |
| SciPy | pip install scipy |
from scipy import stats |
Statistics, optimization |
2NumPy — Arrays & Numerical Computing
NumPy arrays are 10–100× faster than Python lists for numerical operations. They support broadcasting, vectorized operations, and linear algebra.
import numpy as np
np.random.seed(42)
# ── Creating Arrays ───────────────────────────────
a = np.array([1, 2, 3, 4, 5])
M = np.array([[1,2,3],[4,5,6],[7,8,9]])
print(f"1-D: {a} shape={a.shape} dtype={a.dtype}")
print(f"2-D:\n{M} shape={M.shape}")
# Factory functions
print(np.zeros((2,3)))
print(np.ones((2,3)))
print(np.eye(3))
print(np.arange(0, 20, 3)) # [0, 3, 6, 9, 12, 15, 18]
print(np.linspace(0, 1, 6)) # [0. 0.2 0.4 0.6 0.8 1. ]
# ── Indexing & Boolean mask ───────────────────────
arr = np.array([10, 25, 3, 47, 8, 62, 15])
print(arr[arr > 20]) # [25 47 62]
print(np.where(arr > 20, arr, 0)) # [0 25 0 47 0 62 0]
# ── Vectorized ops (no loops!) ────────────────────
a = np.array([1., 2., 3., 4., 5.])
print(a**2) # [ 1. 4. 9. 16. 25.]
print(np.sqrt(a)) # [1. 1.414 1.732 2. 2.236]
print(np.log(a)) # [0. 0.693 1.099 1.386 1.609]
# ── Broadcasting ──────────────────────────────────
row = np.array([[1, 2, 3]]) # shape (1,3)
col = np.array([[10],[20],[30]]) # shape (3,1)
print(row + col) # broadcasts → (3,3)
# ── Statistics ────────────────────────────────────
data = np.random.normal(50, 10, 1000)
print(f"mean={data.mean():.2f} std={data.std():.2f}")
print(f"min={data.min():.2f} max={data.max():.2f}")
print(f"p25={np.percentile(data,25):.2f} p75={np.percentile(data,75):.2f}")
# ── Linear Algebra ────────────────────────────────
A = np.array([[2,1],[5,3]])
b = np.array([1, 0])
x = np.linalg.solve(A, b)
print(f"Ax=b solution: {x}")
print(f"Eigenvalues: {np.linalg.eigvals(A).round(3)}")
1-D: [1 2 3 4 5] shape=(5,) dtype=int64 2-D: [[1 2 3] [4 5 6] [7 8 9]] shape=(3, 3) [[0. 0. 0.] [0. 0. 0.]] [[1. 1. 1.] [1. 1. 1.]] [[1. 0. 0.] [0. 1. 0.] [0. 0. 1.]] [ 0 3 6 9 12 15 18] [0. 0.2 0.4 0.6 0.8 1. ] [25 47 62] [ 0 25 0 47 0 62 0] [ 1. 4. 9. 16. 25.] [1. 1.414 1.732 2. 2.236] [0. 0.693 1.099 1.386 1.609] [[11 12 13] [21 22 23] [31 32 33]] mean=49.92 std=9.87 min=16.74 max=84.12 p25=43.35 p75=56.61 Ax=b solution: [ 3. -5.] Eigenvalues: [0.697 4.303]
3Pandas — DataFrames & Series
Pandas is the workhorse of data science. Master it and you can handle 90% of real-world data manipulation tasks.
import pandas as pd
import numpy as np
np.random.seed(42)
# ── Create DataFrame ──────────────────────────────
df = pd.DataFrame({
"name": ["Alice","Bob","Carol","Dave","Eve","Frank"],
"dept": ["DS","Eng","DS","Mgmt","DS","Eng"],
"salary": [95000,88000,79000,120000,82000,91000],
"experience": [3,8,1,15,4,7],
"rating": [4.2,3.8,4.5,4.7,3.9,4.1],
})
print(df.to_string(index=False))
print(f"\nShape: {df.shape} | dtypes:\n{df.dtypes}")
# ── Selection ─────────────────────────────────────
print("\nDS employees:")
print(df[df["dept"]=="DS"][["name","salary","rating"]].to_string(index=False))
# .query() (cleaner syntax)
print("\nquery: experience>3 and rating>=4.0:")
print(df.query("experience > 3 and rating >= 4.0")[["name","dept","salary"]].to_string(index=False))
# ── Add / transform columns ───────────────────────
df["salary_k"] = df["salary"] / 1000
df["level"] = pd.cut(df["experience"], bins=[0,2,7,100],
labels=["Junior","Mid","Senior"])
# ── GroupBy ───────────────────────────────────────
agg = df.groupby("dept").agg(
count=("name","count"),
avg_salary=("salary","mean"),
avg_rating=("rating","mean"),
).round(1)
print(f"\nGroupBy dept:\n{agg.to_string()}")
# ── Missing data ──────────────────────────────────
df2 = df.copy().astype(object)
df2.loc[[1,4], "salary"] = np.nan
print(f"\nMissing: {df2.isnull().sum().to_dict()}")
df2["salary"] = df2["salary"].fillna(df2["salary"].median())
# ── Merge ─────────────────────────────────────────
dept_info = pd.DataFrame({
"dept":["DS","Eng","Mgmt"], "budget_m":[2.5,4.0,1.8]
})
merged = pd.merge(df, dept_info, on="dept")
print(f"\nMerged:\n{merged[['name','dept','salary','budget_m']].to_string(index=False)}")
name dept salary experience rating
Alice DS 95000 3 4.2
Bob Eng 88000 8 3.8
Carol DS 79000 1 4.5
Dave Mgmt 120000 15 4.7
Eve DS 82000 4 3.9
Frank Eng 91000 7 4.1
Shape: (6, 5) | dtypes:
name object
dept object
salary int64
experience int64
rating float64
DS employees:
name salary rating
Alice 95000 4.2
Carol 79000 4.5
Eve 82000 3.9
query: experience>3 and rating>=4.0:
name dept salary
Alice DS 95000
Dave Mgmt 120000
Frank Eng 91000
GroupBy dept:
count avg_salary avg_rating
dept
DS 3 85333.3 4.2
Eng 2 89500.0 4.0
Mgmt 1 120000.0 4.7
Missing: {'name': 0, 'salary': 2, ...}
Merged:
name dept salary budget_m
Alice DS 95000 2.5
Carol DS 79000 2.5
Eve DS 82000 2.5
Bob Eng 88000 4.0
Frank Eng 91000 4.0
Dave Mgmt 120000 1.8
4Data Cleaning & Preprocessing
Real-world data is messy. 80% of a data scientist's time is spent here. Learn the patterns once and apply them everywhere.
import pandas as pd
import numpy as np
np.random.seed(42)
# ── Simulate messy dataset ────────────────────────
n = 12
df = pd.DataFrame({
"name": ["Alice","Bob","Carol","Dave","Eve","Frank",
"Grace","Henry","Iris",None,"Jack","Alice"], # None & duplicate
"age": [28,34,25,40,29,33,27,45,31,28,38,28],
"salary": [95000,88000,np.nan,120000,82000,91000,
85000,np.nan,87000,95000,250000,95000], # missing + outlier
"dept": ["DS","ENG","ds","Mgmt","DS","eng","DS","MGMT","Eng","DS","DS","DS"],
})
print("Raw data:")
print(df.to_string())
# ── 1. Missing values ─────────────────────────────
print(f"\nMissing:\n{df.isnull().sum().to_string()}")
print(f"\nMissing %:\n{(df.isnull().mean()*100).round(1).to_string()}")
df["name"] = df["name"].fillna("Unknown")
df["salary"] = df["salary"].fillna(df["salary"].median())
# ── 2. Duplicates ─────────────────────────────────
print(f"\nDuplicates: {df.duplicated().sum()}")
df = df.drop_duplicates()
print(f"After dedup: {len(df)} rows")
# ── 3. String normalisation ───────────────────────
df["dept"] = df["dept"].str.upper().str.strip()
print(f"\nDept unique: {df['dept'].unique()}")
# ── 4. Outlier detection (IQR) ────────────────────
Q1 = df["salary"].quantile(0.25)
Q3 = df["salary"].quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR
outliers = df[(df["salary"] < lower) | (df["salary"] > upper)]
print(f"\nOutliers detected:\n{outliers[['name','salary']].to_string(index=False)}")
df_clean = df[(df["salary"] >= lower) & (df["salary"] <= upper)]
print(f"After outlier removal: {len(df_clean)} rows")
# ── 5. Feature engineering ────────────────────────
df_clean = df_clean.copy()
df_clean["age_group"] = pd.cut(df_clean["age"],
bins=[0,30,40,100], labels=["Young","Mid","Senior"])
df_clean["high_earner"] = (df_clean["salary"] > 90000).astype(int)
print(f"\nCleaned dataset:\n{df_clean[['name','dept','salary','age_group','high_earner']].to_string(index=False)}")
Raw data:
name age salary dept
0 Alice 28 95000.0 DS
1 Bob 34 88000.0 ENG
2 Carol 25 NaN ds
3 Dave 40 120000.0 Mgmt
4 Eve 29 82000.0 DS
5 Frank 33 91000.0 eng
6 Grace 27 85000.0 DS
7 Henry 45 NaN MGMT
8 Iris 31 87000.0 Eng
9 None 28 95000.0 DS
10 Jack 38 250000.0 DS
11 Alice 28 95000.0 DS
Missing:
name 1
salary 2
Missing %:
name 8.3
salary 16.7
Duplicates: 1
After dedup: 11 rows
Dept unique: ['DS' 'ENG' 'MGMT' 'ENG' 'DS' 'ENG' 'DS' 'MGMT' 'ENG' 'DS' 'DS']
Outliers detected:
name salary
Jack 250000.0
After outlier removal: 10 rows
Cleaned dataset:
name dept salary age_group high_earner
Alice DS 95000.0 Young 1
Bob ENG 88000.0 Mid 0
Carol DS 88500.0 Young 0
Dave MGMT 120000.0 Mid 1
Eve DS 82000.0 Young 0
Frank ENG 91000.0 Mid 1
Grace DS 85000.0 Young 0
Henry MGMT 88500.0 Senior 0
Iris ENG 87000.0 Mid 0
Unknown DS 95000.0 Young 1
5Exploratory Data Analysis (EDA)
EDA is the most important step in any data science project. You cannot model what you don't understand. Always explore before you model.
import pandas as pd
import numpy as np
np.random.seed(42)
# ── Sample dataset (100 employees) ───────────────
n = 100
df = pd.DataFrame({
"age": np.random.randint(22, 60, n),
"dept": np.random.choice(["DS","Eng","Mgmt","Sales"], n, p=[.35,.35,.15,.15]),
"salary": np.random.normal(88000, 18000, n).round(0),
"experience": np.random.randint(0, 25, n),
"rating": np.round(np.random.uniform(3.0, 5.0, n), 1),
"remote": np.random.choice([True, False], n, p=[.6,.4]),
})
# ── Step 1: Shape & info ──────────────────────────
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"Missing:\n{df.isnull().sum().to_string()}")
# ── Step 2: Describe ──────────────────────────────
print(f"\n.describe():\n{df.describe().round(2).to_string()}")
# ── Step 3: Distribution ──────────────────────────
print(f"\nDept distribution:\n{df['dept'].value_counts().to_string()}")
print(f"\nRemote %: {df['remote'].mean()*100:.1f}%")
# ── Step 4: Correlation ───────────────────────────
corr = df[["age","salary","experience","rating"]].corr().round(3)
print(f"\nCorrelation matrix:\n{corr.to_string()}")
# ── Step 5: Group statistics ──────────────────────
grp = df.groupby("dept")[["salary","rating","experience"]].agg(["mean","std"]).round(1)
print(f"\nGroup stats:\n{grp.to_string()}")
# ── Step 6: Outlier summary (IQR) ─────────────────
def count_outliers(col):
    """Count values in Series *col* lying outside the 1.5×IQR Tukey fences."""
    q1, q3 = col.quantile([.25, .75])
    spread = q3 - q1
    low = q1 - 1.5 * spread
    high = q3 + 1.5 * spread
    mask = (col < low) | (col > high)
    return mask.sum()
for c in ["salary","experience","rating"]:
n_out = count_outliers(df[c])
print(f" {c:12s}: {n_out} outliers ({n_out/len(df)*100:.1f}%)")
Shape: (100, 6)
Columns: ['age', 'dept', 'salary', 'experience', 'rating', 'remote']
Missing:
age 0
dept 0
salary 0
experience 0
rating 0
remote 0
.describe():
age salary experience rating
count 100.000000 100.000000 100.000000 100.0
mean 40.590000 87921.600000 12.120000 4.0
std 11.004027 17820.131000 7.165091 0.6
min 22.000000 47628.000000 0.000000 3.0
25% 31.000000 74765.750000 6.000000 3.5
50% 40.500000 88397.000000 12.000000 4.0
75% 50.000000 99753.750000 18.000000 4.5
max 59.000000 135612.000000 24.000000 5.0
Dept distribution:
DS 38
Eng 35
Sales 15
Mgmt 12
Remote %: 62.0%
Correlation matrix:
age salary experience rating
age 1.000 0.121 0.564 -0.076
salary 0.121 1.000 0.087 -0.024
experience 0.564 0.087 1.000 -0.047
rating -0.076 -0.024 -0.047 1.000
salary : 2 outliers (2.0%)
experience : 0 outliers (0.0%)
rating : 0 outliers (0.0%)
6Data Visualization — Matplotlib & Seaborn
Visualization is how you communicate data insights. Master both libraries — Matplotlib for control, Seaborn for statistical beauty.
# ── 6-panel EDA dashboard: histogram, box, scatter, bar, heatmap, violin ──
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
np.random.seed(42)                      # reproducible synthetic data
sns.set_theme(style="whitegrid", palette="husl")
# One colour per department, shared by every panel so colours stay
# consistent (the original repeated this dict three times and coloured
# the bar chart positionally).
PALETTE = {"DS": "#2563eb", "Eng": "#10b981", "Mgmt": "#f59e0b"}
df = pd.DataFrame({
    "dept": np.random.choice(["DS","Eng","Mgmt"], 80, p=[.4,.4,.2]),
    "salary": np.random.normal(88000, 15000, 80).round(0),
    "exp": np.random.randint(0, 20, 80),
    "rating": np.round(np.random.uniform(3.0, 5.0, 80), 1),
})
fig, axes = plt.subplots(2, 3, figsize=(15, 9))
fig.suptitle("Employee Data — EDA Dashboard", fontsize=16, fontweight="bold")
# 1. Histogram of salaries with the mean marked
axes[0,0].hist(df["salary"]/1000, bins=15, color="#2563eb", edgecolor="white", alpha=.85)
axes[0,0].axvline(df["salary"].mean()/1000, color="red", ls="--", lw=2, label=f"Mean={df['salary'].mean()/1000:.0f}K")
axes[0,0].set(title="Salary Distribution", xlabel="Salary ($K)", ylabel="Count")
axes[0,0].legend()
# 2. Box plot by dept. hue= + legend=False avoids the seaborn>=0.13
#    deprecation of palette-without-hue; the drawn figure is unchanged.
sns.boxplot(data=df, x="dept", y="salary", hue="dept", legend=False,
            palette=PALETTE, ax=axes[0,1])
axes[0,1].set(title="Salary by Department", xlabel="Dept", ylabel="Salary ($)")
# 3. Scatter: exp vs salary per dept, with an OLS trend line
for dept, grp in df.groupby("dept"):
    axes[0,2].scatter(grp["exp"], grp["salary"]/1000,
                      c=PALETTE[dept], alpha=.7, label=dept, edgecolors="white")
m, b = np.polyfit(df["exp"], df["salary"]/1000, 1)   # slope, intercept
xs = np.linspace(0, 20, 50)
axes[0,2].plot(xs, m*xs+b, "r--", lw=2)
axes[0,2].set(title="Experience vs Salary", xlabel="Experience", ylabel="Salary ($K)")
axes[0,2].legend()
# 4. Horizontal bar chart of mean salary, coloured by dept (not by rank)
avg = df.groupby("dept")["salary"].mean().sort_values()/1000
bars = axes[1,0].barh(avg.index, avg.values,
                      color=[PALETTE[d] for d in avg.index], edgecolor="white")
axes[1,0].bar_label(bars, fmt="$%.0fK", padding=4)
axes[1,0].set(title="Avg Salary by Dept", xlabel="Salary ($K)")
# 5. Correlation heatmap of the numeric columns
corr = df[["salary","exp","rating"]].corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap="Blues",
            linewidths=.5, ax=axes[1,1], annot_kws={"size":11,"weight":"bold"})
axes[1,1].set_title("Correlation Matrix")
# 6. Violin plot of ratings per dept (same hue fix as panel 2)
sns.violinplot(data=df, x="dept", y="rating", hue="dept", legend=False,
               palette=PALETTE, inner="quartile", ax=axes[1,2])
axes[1,2].set(title="Rating Distribution", xlabel="Dept", ylabel="Rating")
plt.tight_layout()
plt.savefig("eda_dashboard.png", dpi=120, bbox_inches="tight")
print("✔ eda_dashboard.png saved")
✔ eda_dashboard.png saved Chart saved: 6-panel EDA dashboard containing: ┌─────────────────────┬────────────────────┬──────────────────────┐ │ Salary Histogram │ Box Plot by Dept │ Scatter (exp/sal) │ │ Mean line at $88K │ DS > Eng > Mgmt │ r=+0.12 trend line │ ├─────────────────────┼────────────────────┼──────────────────────┤ │ Horizontal Bar │ Correlation HMap │ Violin by Rating │ │ DS $91K / Eng $88K │ salary-exp: 0.12 │ DS ratings widest │ └─────────────────────┴────────────────────┴──────────────────────┘
7. Statistics for Data Science
Statistical thinking is the foundation of valid data science. Hypothesis testing, distributions, and confidence intervals are non-negotiable skills.
# ── Core statistics toolkit: description, CI, hypothesis tests, correlation ──
import numpy as np
from scipy import stats
np.random.seed(42)
# ── Descriptive Statistics ────────────────────────
data = np.random.normal(loc=70, scale=15, size=200)
print("=== Descriptive Statistics ===")
print(f" n={len(data)} mean={data.mean():.2f} std={data.std():.2f}")
print(f" median={np.median(data):.2f} IQR={np.percentile(data,75)-np.percentile(data,25):.2f}")
print(f" skewness={stats.skew(data):.4f}")      # ~0 for symmetric data
print(f" kurtosis={stats.kurtosis(data):.4f}")  # excess kurtosis; ~0 for normal
# ── Confidence Interval ───────────────────────────
n = len(data)
mean = data.mean()
# FIX: the standard error must use the SAMPLE std (ddof=1); the original
# used data.std() (ddof=0), which slightly understates the uncertainty.
se = data.std(ddof=1) / np.sqrt(n)  # standard error
ci = stats.t.interval(0.95, df=n-1, loc=mean, scale=se)
print(f"\n95% CI for mean: [{ci[0]:.2f}, {ci[1]:.2f}]")
# ── Hypothesis Testing ────────────────────────────
# One-sample t-test: is mean significantly different from 72?
t_stat, p_val = stats.ttest_1samp(data, popmean=72)
print(f"\nOne-sample t-test (μ₀=72):")
print(f" t={t_stat:.4f} p={p_val:.4f} {'REJECT H₀' if p_val < 0.05 else 'FAIL TO REJECT H₀'}")
# Two-sample t-test: group A vs group B
# NOTE(review): ttest_ind defaults to equal_var=True (Student's t); with
# unequal spreads Welch's test (equal_var=False) is often preferred.
groupA = np.random.normal(75, 12, 100)
groupB = np.random.normal(80, 14, 100)
t2, p2 = stats.ttest_ind(groupA, groupB)
print(f"\nTwo-sample t-test (A vs B):")
print(f" A mean={groupA.mean():.2f} B mean={groupB.mean():.2f}")
print(f" t={t2:.4f} p={p2:.4f} {'Significant difference' if p2<0.05 else 'No significant diff'}")
# Chi-square test of independence on a 2x2 contingency table (dof = 1)
observed = np.array([[30, 10], [20, 40]])
chi2, p_chi, dof, expected = stats.chi2_contingency(observed)
print(f"\nChi-square test:")
print(f" χ²={chi2:.4f} p={p_chi:.6f} dof={dof}")
print(f" Expected:\n {expected}")
print(f" {'SIGNIFICANT' if p_chi<0.05 else 'NOT SIGNIFICANT'} at α=0.05")
# Pearson correlation on synthetic linearly-related data
x = np.random.normal(0,1,100)
y = 0.7*x + np.random.normal(0,0.7,100)
r, p_r = stats.pearsonr(x, y)
print(f"\nPearson r={r:.4f} p={p_r:.6f} (strong positive correlation)")
# Normality test (Shapiro-Wilk); only the first 50 points are tested
stat, p_norm = stats.shapiro(data[:50])
print(f"\nShapiro-Wilk normality test: stat={stat:.4f} p={p_norm:.4f}")
print(f" {'Normal distribution' if p_norm>0.05 else 'NOT normal'} at α=0.05")
=== Descriptive Statistics === n=200 mean=70.13 std=14.84 median=70.21 IQR=19.97 skewness=0.0152 kurtosis=-0.1827 95% CI for mean: [67.98, 72.28] One-sample t-test (μ₀=72): t=-1.7869 p=0.0752 FAIL TO REJECT H₀ Two-sample t-test (A vs B): A mean=75.46 B mean=81.62 t=-3.1824 p=0.0017 Significant difference Chi-square test: χ²=20.4545 p=0.000006 dof=1 Expected: [[24. 16.] [26. 44.]] SIGNIFICANT at α=0.05 Pearson r=0.8214 p=0.000000 (strong positive correlation) Shapiro-Wilk normality test: stat=0.9862 p=0.8147 Normal distribution at α=0.05
8. Machine Learning — Introduction & Workflow
Scikit-learn provides a unified API for all ML tasks. Learn this workflow once and apply it to any algorithm.
# ── Canonical scikit-learn workflow: load → split → scale → fit → evaluate ──
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
np.random.seed(42)
# ── 1. Load data ──────────────────────────────────
bunch = load_iris(as_frame=True)
X, y = bunch.data, bunch.target
print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features")
print(f"Classes: {list(bunch.target_names)}")
print(f"Feature names: {list(X.columns)}")
print(f"\nFirst 3 rows:\n{X.head(3).to_string()}")
# ── 2. Train/test split (stratified to keep class balance) ──
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split
print(f"\nTrain: {len(X_train)} Test: {len(X_test)}")
# ── 3. Scale features: statistics come from train only, then reused ──
std_scaler = StandardScaler()
X_train_sc = std_scaler.fit_transform(X_train)
X_test_sc = std_scaler.transform(X_test)
print(f"\nBefore scaling: mean={X_train['sepal length (cm)'].mean():.2f}")
print(f"After scaling: mean={X_train_sc[:,0].mean():.4f}")
# ── 4. Train model ────────────────────────────────
clf = LogisticRegression(max_iter=200, random_state=42)
clf.fit(X_train_sc, y_train)
# ── 5. Evaluate: held-out accuracy plus 5-fold cross-validation ──
y_pred = clf.predict(X_test_sc)
acc = accuracy_score(y_test, y_pred)
cv_scores = cross_val_score(clf, X_train_sc, y_train, cv=5)
print(f"\nTest accuracy: {acc:.4f}")
print(f"CV scores: {cv_scores.round(4)}")
print(f"CV mean ± std: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=bunch.target_names))
Dataset: 150 samples, 4 features
Classes: ['setosa', 'versicolor', 'virginica']
Feature names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
First 3 rows:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
Train: 120 Test: 30
Before scaling: mean=5.87
After scaling: mean=0.0000
Test accuracy: 1.0000
CV scores: [0.9583 0.9583 0.9583 0.9583 0.9167]
CV mean ± std: 0.9500 ± 0.0167
Classification Report:
precision recall f1-score support
setosa 1.00 1.00 1.00 10
versicolor 1.00 1.00 1.00 10
virginica 1.00 1.00 1.00 10
accuracy 1.00 30
macro avg 1.00 1.00 1.00 30
weighted avg 1.00 1.00 1.00 30
9. Regression — Predicting Continuous Values
Regression predicts a numeric output. We compare Linear, Ridge, Lasso, and Random Forest on the California Housing dataset.
# ── Regression model shoot-out on California Housing ─────────────────
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings; warnings.filterwarnings("ignore")
np.random.seed(42)
# ── Load dataset ──────────────────────────────────
housing = fetch_california_housing(as_frame=True)
X, y = housing.data, housing.target
print(f"California Housing: {X.shape[0]:,} houses, {X.shape[1]} features")
print(f"Features: {list(X.columns)}")
print(f"Target: MedHouseVal range=[{y.min():.2f}, {y.max():.2f}]")
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
# ── Compare models ────────────────────────────────
# Linear models need scaling, done inside a Pipeline (no leakage);
# tree ensembles are scale-invariant.
models = {
    "Linear Regression": Pipeline([("s", StandardScaler()), ("m", LinearRegression())]),
    "Ridge (α=1.0)": Pipeline([("s", StandardScaler()), ("m", Ridge(alpha=1.0))]),
    "Lasso (α=0.01)": Pipeline([("s", StandardScaler()), ("m", Lasso(alpha=0.01))]),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
}
# FIX: fit everything first, then decide the winner from the scores —
# the original hard-coded "Random Forest" as BEST.
results = {}
for name, model in models.items():
    model.fit(X_tr, y_tr)
    pred = model.predict(X_te)
    results[name] = (
        mean_absolute_error(y_te, pred),
        np.sqrt(mean_squared_error(y_te, pred)),
        r2_score(y_te, pred),
    )
best_name = max(results, key=lambda k: results[k][2])   # highest R² wins
print(f"\n{'Model':<22} {'MAE':>7} {'RMSE':>7} {'R²':>7}")
print("-" * 50)
for name, (mae, rmse, r2) in results.items():
    flag = " ← BEST" if name == best_name else ""
    print(f"{name:<22} {mae:>7.4f} {rmse:>7.4f} {r2:>7.4f}{flag}")
# ── Feature importance (only the forest exposes importances) ──
rf = models["Random Forest"]
imp = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print(f"\nFeature Importances (Random Forest):")
for feat, val in imp.items():
    bar = "█" * int(val * 60)
    print(f" {feat:20s} {val:.4f} {bar}")
# ── Residual analysis on the best model (not a hard-coded one) ──
best_pred = models[best_name].predict(X_te)
residuals = y_te.values - best_pred
print(f"\nResidual Analysis:")
print(f" Mean residual: {residuals.mean():.6f} (should be ~0)")
print(f" Std residual: {residuals.std():.4f}")
print(f" |residual|>1: {(np.abs(residuals)>1).sum()} samples")
California Housing: 20,640 houses, 8 features Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'] Target: MedHouseVal range=[0.15, 5.00] Model MAE RMSE R² -------------------------------------------------- Linear Regression 0.5332 0.7257 0.5758 Ridge (α=1.0) 0.5333 0.7257 0.5757 Lasso (α=0.01) 0.5335 0.7260 0.5755 Random Forest 0.3288 0.4736 0.8053 ← BEST Feature Importances (Random Forest): MedInc 0.5281 ████████████████████████████████ Latitude 0.1002 ██████ Longitude 0.0937 █████ HouseAge 0.0534 ███ AveOccup 0.0492 ███ Population 0.0364 ██ AveRooms 0.0262 █ AveBedrms 0.0128 Residual Analysis: Mean residual: 0.000042 (should be ~0) Std residual: 0.4735 |residual|>1: 424 samples
10. Classification — Predicting Categories
Classification assigns data points to discrete classes. We benchmark 6 algorithms and show confusion matrix, ROC, and feature importance.
# ── Benchmark six classifiers on iris, then inspect the winner ───────
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings; warnings.filterwarnings("ignore")
np.random.seed(42)
iris = load_iris(as_frame=True)
X, y = iris.data, iris.target
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
sc = StandardScaler()
X_tr_sc, X_te_sc = sc.fit_transform(X_tr), sc.transform(X_te)
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Decision Tree": DecisionTreeClassifier(max_depth=4),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100),
    "SVM (RBF)": SVC(kernel="rbf", probability=True),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
}
print(f"{'Algorithm':<22} {'Test Acc':>9} {'CV Mean':>9} {'CV Std':>8}")
print("─" * 55)
# FIX: initialise all three trackers up front — the original never
# initialised best_name, a NameError if no model beat best_acc.
best_model, best_acc, best_name = None, 0.0, ""
for name, clf in models.items():
    clf.fit(X_tr_sc, y_tr)
    acc = accuracy_score(y_te, clf.predict(X_te_sc))
    cv = cross_val_score(clf, X_tr_sc, y_tr, cv=5)   # CV on the train split only
    star = " ★" if acc == 1.0 else ""
    print(f"{name:<22} {acc:>9.4f} {cv.mean():>9.4f} {cv.std():>8.4f}{star}")
    if acc > best_acc:                               # ties keep the first model
        best_acc, best_model, best_name = acc, clf, name
# ── Confusion matrix for best model ──────────────
print(f"\n--- Confusion Matrix: {best_name} ---")
y_pred = best_model.predict(X_te_sc)
cm = confusion_matrix(y_te, y_pred)
labels = iris.target_names
print(f" {' '.join(f'{l:>12}' for l in labels)}")
for i, row in enumerate(cm):
    print(f" {labels[i]:>12} {' '.join(f'{v:>12}' for v in row)}")
print(f"\nClassification Report:")
print(classification_report(y_te, y_pred, target_names=labels))
# ── Feature importance (Random Forest) ───────────
rf = models["Random Forest"]
imp = pd.Series(rf.feature_importances_, index=iris.feature_names)
imp_sorted = imp.sort_values(ascending=False)
print("Feature Importances:")
for feat, val in imp_sorted.items():
    bar = "█" * int(val * 40)
    print(f" {feat:26s} {val:.4f} {bar}")
Algorithm Test Acc CV Mean CV Std
───────────────────────────────────────────────────────
Logistic Regression 1.0000 0.9500 0.0167 ★
Decision Tree 1.0000 0.9583 0.0236 ★
Random Forest 1.0000 0.9583 0.0236 ★
Gradient Boosting 1.0000 0.9583 0.0236 ★
SVM (RBF) 1.0000 0.9750 0.0204 ★
KNN (k=5) 1.0000 0.9583 0.0167 ★
--- Confusion Matrix: Logistic Regression ---
setosa versicolor virginica
setosa 10 0 0
versicolor 0 10 0
virginica 0 0 10
Classification Report:
precision recall f1-score support
setosa 1.00 1.00 1.00 10
versicolor 1.00 1.00 1.00 10
virginica 1.00 1.00 1.00 10
accuracy 1.00 30
Feature Importances:
petal length (cm) 0.4412 █████████████████
petal width (cm) 0.4198 ████████████████
sepal length (cm) 0.0953 ███
sepal width (cm) 0.0437 █
11. Clustering — Unsupervised Learning
Clustering discovers hidden structure in unlabelled data. K-Means is the most widely used algorithm — fast, interpretable, and scalable.
# ── Customer segmentation with K-Means: choose K, then profile clusters ──
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import warnings; warnings.filterwarnings("ignore")
np.random.seed(42)
# ── Customer segmentation dataset: three synthetic income/age groups ──
n = 300
g = n // 3                      # customers per synthetic segment (n was unused before)
df = pd.DataFrame({
    "annual_income_k": np.concatenate([
        np.random.normal(30, 8, g),
        np.random.normal(65, 10, g),
        np.random.normal(100, 12, g),
    ]),
    "spending_score": np.concatenate([
        np.random.normal(60, 15, g),
        np.random.normal(45, 12, g),
        np.random.normal(75, 10, g),
    ]),
    "age": np.concatenate([
        np.random.normal(25, 5, g),
        np.random.normal(40, 8, g),
        np.random.normal(55, 7, g),
    ]).clip(18, 80),
})
print(f"Dataset: {df.shape[0]} customers")
print(df.describe().round(1).to_string())
# ── Scaling (K-Means is distance-based, so features must share a scale) ──
sc = StandardScaler()
X_sc = sc.fit_transform(df)
# ── Elbow method + silhouette ─────────────────────
print(f"\n{'K':>3} {'Inertia':>10} {'Silhouette':>11}")
print("─" * 28)
for k in range(2, 9):
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X_sc)
    sil = silhouette_score(X_sc, labels)
    bar = "★" if k == 3 else ""
    print(f"{k:>3} {km.inertia_:>10.1f} {sil:>11.4f} {bar}")
# ── Fit K=3 ───────────────────────────────────────
km3 = KMeans(n_clusters=3, random_state=42, n_init=10)
df["cluster"] = km3.fit_predict(X_sc)
# FIX: K-Means label ids are arbitrary, so rank clusters by mean income
# before naming them — the original's fixed {0,1,2} map only held by luck.
income_order = df.groupby("cluster")["annual_income_k"].mean().sort_values().index
df["cluster_name"] = df["cluster"].map(dict(zip(income_order, ["Budget", "Middle", "Premium"])))
# ── Cluster profiles ──────────────────────────────
print(f"\n--- Cluster Profiles ---")
profile = df.groupby("cluster_name")[["annual_income_k","spending_score","age"]].mean().round(1)
print(profile.to_string())
print(f"\nCluster sizes:")
print(df["cluster_name"].value_counts().to_string())
# ── Cluster centers (unscaled, i.e. back in original feature units) ──
centers_unscaled = sc.inverse_transform(km3.cluster_centers_)
center_df = pd.DataFrame(centers_unscaled, columns=df.columns[:3])
print(f"\nCluster centers (unscaled):")
print(center_df.round(1).to_string())
Dataset: 300 customers
annual_income_k spending_score age
count 300.0 300.0 300.0
mean 65.1 60.0 40.1
std 31.9 17.1 14.4
min 10.8 11.2 18.0
max 137.3 98.0 77.0
K Inertia Silhouette
────────────────────────────
2 384.2 0.3891
3 218.5 0.4872 ★
4 195.3 0.4215
5 170.1 0.3944
6 152.8 0.3721
7 137.9 0.3503
8 124.6 0.3282
--- Cluster Profiles ---
annual_income_k spending_score age
cluster_name
Budget 29.7 60.1 25.1
Middle 64.9 45.1 40.3
Premium 100.1 74.8 55.2
Cluster sizes:
Middle 100
Budget 100
Premium 100
Cluster centers (unscaled):
annual_income_k spending_score age
0 29.7 60.1 25.1
1 64.9 45.1 40.3
2 100.1 74.8 55.2
12. ML Pipelines & Hyperparameter Tuning
Pipelines prevent data leakage and make production deployment clean. GridSearchCV and RandomizedSearchCV automate hyperparameter optimization.
# ── Leakage-safe preprocessing + model inside one Pipeline, then tune it ──
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings; warnings.filterwarnings("ignore")
np.random.seed(42)
# ── Mixed-type dataset ────────────────────────────
n = 600
df = pd.DataFrame({
    "age": np.random.randint(22, 60, n),
    "salary": np.random.normal(75000, 20000, n).round(0),
    "experience":np.random.randint(0, 30, n),
    "education": np.random.choice(["HighSchool","Bachelor","Master","PhD"], n, p=[.1,.4,.35,.15]),
    "dept": np.random.choice(["Tech","Sales","HR","Finance"], n),
})
# Deterministic rule-based label, so the models have real signal to find.
df["promoted"] = ((df["salary"] > 80000) &
                  (df["experience"] > 5) &
                  (df["education"].isin(["Master","PhD"]))).astype(int)
print(f"Dataset: {df.shape} | Promoted: {df['promoted'].mean()*100:.1f}%")
X = df.drop("promoted", axis=1)
y = df["promoted"]
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
# ── ColumnTransformer: scale numeric, encode categorical ──
num_feats = ["age","salary","experience"]
cat_feats = ["education","dept"]
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_feats),
    # handle_unknown="ignore": unseen categories at predict time encode to all-zeros
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_feats),
])
# ── Pipeline: preprocessing is re-fit inside every CV fold → no leakage ──
pipe = Pipeline([
    ("prep", preprocessor),
    ("model", RandomForestClassifier(random_state=42)),
])
pipe.fit(X_tr, y_tr)
baseline_acc = accuracy_score(y_te, pipe.predict(X_te))
print(f"Baseline RF accuracy: {baseline_acc:.4f}")
# ── GridSearchCV: exhaustive over 2*3*2 = 12 combinations ──
param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [None, 5, 10],
    "model__min_samples_split": [2, 5],
}
gs = GridSearchCV(pipe, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
gs.fit(X_tr, y_tr)
print(f"\nGrid Search Results:")
print(f" Best params: {gs.best_params_}")
print(f" Best CV score: {gs.best_score_:.4f}")
print(f" Test accuracy: {accuracy_score(y_te, gs.best_estimator_.predict(X_te)):.4f}")
# ── RandomizedSearchCV (faster for large param spaces) ──
from scipy.stats import randint
param_dist = {
    "model__n_estimators": randint(50, 300),
    "model__max_depth": [None, 5, 10, 15, 20],
    "model__min_samples_split": randint(2, 10),
}
rs = RandomizedSearchCV(pipe, param_dist, n_iter=20, cv=5, scoring="accuracy",
                        random_state=42, n_jobs=-1)
rs.fit(X_tr, y_tr)
print(f"\nRandomized Search:")
print(f" Best params: {rs.best_params_}")
print(f" Best CV score: {rs.best_score_:.4f}")   # added for parity with the grid report
print(f" Test accuracy: {accuracy_score(y_te, rs.best_estimator_.predict(X_te)):.4f}")
print(f"\nFinal Classification Report:")
y_pred = gs.best_estimator_.predict(X_te)
print(classification_report(y_te, y_pred, target_names=["Not Promoted","Promoted"]))
Dataset: (600, 6) | Promoted: 21.5%
Baseline RF accuracy: 0.9583
Grid Search Results:
Best params: {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 200}
Best CV score: 0.9583
Test accuracy: 0.9583
Randomized Search:
Best params: {'model__max_depth': None, 'model__min_samples_split': 3, 'model__n_estimators': 247}
Test accuracy: 0.9667
Final Classification Report:
precision recall f1-score support
Not Promoted 0.97 0.98 0.98 95
Promoted 0.94 0.89 0.91 25
accuracy 0.96 120
macro avg 0.96 0.94 0.95 120
weighted avg 0.96 0.96 0.96 120