Pandas — The Swiss Army Knife
of Data Manipulation
Pandas provides two core data structures — Series (1D) and DataFrame (2D) — that make loading, cleaning, transforming, and analysing tabular data intuitive and fast, built on top of NumPy.
Loading and Saving Data (CSV, Excel, SQL, JSON)
Pandas can read and write virtually every tabular data format. pd.read_csv() alone has over 50 parameters to handle delimiter variations, encoding issues, large files, and more.
pd.read_FORMAT() and df.to_FORMAT(). Common formats: CSV, Excel (.xlsx), JSON, SQL, Parquet, HDF5, Feather, and Clipboard.
import pandas as pd
import numpy as np

# ── CSV ───────────────────────────────────────────────────
# Simplest form: let pandas infer everything from the file.
df = pd.read_csv('data.csv')

# The same call spelled out with the most commonly needed parameters.
df = pd.read_csv(
    'data.csv',
    sep=',',                              # field delimiter ('\t' for TSV)
    header=0,                             # which row holds column names (None = no header)
    index_col='ID',                       # promote this column to the row index
    usecols=['Name', 'Age', 'Salary'],    # load only these columns
    dtype={'Age': int, 'Salary': float},  # force dtypes instead of inferring
    na_values=['N/A', '-', '?'],          # extra tokens to read as NaN
    parse_dates=['JoinDate'],             # convert to datetime64 on load
    nrows=1000,                           # stop after the first 1000 rows (large files)
    skiprows=[1, 2],                      # drop these physical rows
    encoding='utf-8',                     # decode bytes with this codec
)

# Chunked reading keeps memory flat for files that don't fit in RAM (> 1GB):
# filter each 10k-row piece, then glue the survivors back together.
frames = [
    piece[piece['Salary'] > 50000]
    for piece in pd.read_csv('large_file.csv', chunksize=10_000)
]
df_filtered = pd.concat(frames, ignore_index=True)
# ── Excel ─────────────────────────────────────────────────
df_excel = pd.read_excel('data.xlsx', sheet_name='Sheet1')

# sheet_name=None loads every sheet into a {name: DataFrame} dict.
all_sheets = pd.read_excel('data.xlsx', sheet_name=None)
print(all_sheets.keys())  # dict_keys(['Sheet1','Sheet2',...])

# ── JSON ─────────────────────────────────────────────────
df_json = pd.read_json('data.json')
df_json2 = pd.read_json('nested.json', orient='records')

# ── SQL (with SQLite as example) ─────────────────────────
import sqlite3

conn = sqlite3.connect('database.db')
df_sql = pd.read_sql('SELECT * FROM employees WHERE age > 30', conn)
conn.close()  # release the connection once the frame is materialised
# ── From dict / list (very common in code) ────────────────
# Each key becomes a column; all lists must share the same length.
data = {
    'Name':   ['Alice', 'Bob', 'Carol', 'Dave'],
    'Age':    [28, 34, 29, 41],
    'Salary': [72000, 85000, 67000, 92000],
    'Dept':   ['Eng', 'HR', 'Eng', 'Finance'],
}
df = pd.DataFrame(data)

# First look at what we built.
print(df)
print(df.dtypes)
print(df.info())       # dtypes + non-null counts (info() itself returns None)
print(df.describe())   # summary statistics for the numeric columns
import pandas as pd

# (assuming df exists from above)
# ── Save to CSV ───────────────────────────────────────────
df.to_csv('output.csv', index=False)  # index=False keeps the row index out of the file
df.to_csv('output.csv', sep='\t')     # same data, tab-separated

# ── Save to Excel ─────────────────────────────────────────
df.to_excel('output.xlsx', sheet_name='Data', index=False)

# Several sheets in one workbook: share a single ExcelWriter.
with pd.ExcelWriter('multi.xlsx', engine='openpyxl') as writer:
    df.to_excel(writer, sheet_name='Employees', index=False)
    df.describe().to_excel(writer, sheet_name='Stats')

# ── Save to JSON ──────────────────────────────────────────
df.to_json('output.json', orient='records', indent=2)

# ── Parquet (fast, compressed, best for large data) ───────
df.to_parquet('output.parquet', index=False)  # requires pyarrow
df_back = pd.read_parquet('output.parquet')

# ── Quick inspection after loading ────────────────────────
print(df.shape)             # (rows, columns)
print(df.head(3))           # first 3 rows
print(df.tail(2))           # last 2 rows
print(df.columns.tolist())  # column names as a plain list
print(df.dtypes)            # dtype per column
print(df.isnull().sum())    # NaN count per column
print(df.nunique())         # distinct values per column
Indexing, Selecting, and Filtering Data in DataFrames
Pandas offers multiple ways to select data. .loc[] (label-based) and .iloc[] (position-based) are the two primary indexers. Understanding when to use each prevents subtle bugs.
• df.loc[rows, cols] — use labels (index values, column names). Slices are inclusive on both ends.
• df.iloc[rows, cols] — use integer positions. Slices are exclusive on the right end (like Python).
import pandas as pd

df = pd.DataFrame(
    {
        'Name':   ['Alice', 'Bob', 'Carol', 'Dave', 'Eve'],
        'Age':    [28, 34, 29, 41, 25],
        'Salary': [72000, 85000, 67000, 92000, 58000],
        'Dept':   ['Eng', 'HR', 'Eng', 'Finance', 'Eng'],
    },
    index=['E001', 'E002', 'E003', 'E004', 'E005'],
)

# ── Column Selection ──────────────────────────────────────
print(df['Name'])              # one column → Series
print(df[['Name', 'Salary']])  # list of columns → DataFrame

# ── .loc — label based ───────────────────────────────────
print(df.loc['E001'])                       # one row, by index label
print(df.loc['E001':'E003'])                # label slices include BOTH endpoints
print(df.loc['E001':'E003', 'Name':'Age'])  # rows and columns by label
print(df.loc['E002', 'Salary'])             # row + column → scalar

# ── .iloc — position based ───────────────────────────────
print(df.iloc[0])          # first row
print(df.iloc[-1])         # last row
print(df.iloc[0:3])        # rows 0,1,2 — right end excluded, like Python slices
print(df.iloc[0:3, 0:2])   # rows 0-2 and columns 0-1
print(df.iloc[[0, 2, 4]])  # arbitrary rows by position
print(df.iloc[1, 2])       # scalar at row 1, column 2

# ── .at / .iat — fast single-value access ─────────────────
print(df.at['E003', 'Salary'])  # label-addressed scalar
print(df.iat[2, 2])             # position-addressed scalar

# ── Boolean Indexing ──────────────────────────────────────
mask = df['Age'] > 30
print(df[mask])  # keep only rows where the mask is True
# Combine conditions with & / | — the parentheses are mandatory
print(df[(df['Dept'] == 'Eng') & (df['Salary'] > 60000)])
print(df[(df['Age'] < 30) | (df['Salary'] > 90000)])
# Membership test
print(df[df['Dept'].isin(['Eng', 'Finance'])])
# Inclusive range test
print(df[df['Age'].between(25, 30)])

# ── .query() — SQL-like, readable syntax ─────────────────
print(df.query('Age > 30 and Dept == "HR"'))
print(df.query('Salary > 70000'))
# @name interpolates a Python variable into the expression
threshold = 30000
print(df.query('Salary > @threshold'))
| Index | Name | Age | Salary | Dept |
|---|---|---|---|---|
Data Cleaning: Missing Data, Duplicates, and Outliers
Real-world data is messy. Studies show data scientists spend 60–80% of their time cleaning data. Pandas provides a comprehensive toolkit to detect and handle missing values, duplicates, and outliers efficiently.
import pandas as pd
import numpy as np

# Toy frame with NaN scattered through every column.
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', np.nan, 'Dave', 'Eve', 'Frank'],
    'Age': [28, np.nan, 29, 41, 25, np.nan],
    'Salary': [72000, 85000, 67000, np.nan, 58000, 92000],
    'Score': [85.5, 90.2, np.nan, 78.1, np.nan, 88.4]
})

# ── Detection ─────────────────────────────────────────────
print(df.isnull())                        # boolean DataFrame, True where NaN
print(df.isnull().sum())                  # NaN count per column
print(df.isnull().sum() / len(df) * 100)  # % missing per column
print(df.notnull().sum())                 # non-null counts
print(df.info())                          # concise overview

# ── Dropping ─────────────────────────────────────────────
# NOTE: all of these return new frames; df itself is unchanged.
df.dropna()                        # drop rows with ANY NaN
df.dropna(axis=1)                  # drop columns with ANY NaN
df.dropna(how='all')               # only drop rows where ALL values are NaN
df.dropna(subset=['Name', 'Age'])  # only check these columns
df.dropna(thresh=3)                # keep rows with >= 3 non-NaN values

# ── Filling ───────────────────────────────────────────────
df['Age'].fillna(df['Age'].mean())          # fill with mean
df['Salary'].fillna(df['Salary'].median())  # fill with median
df['Name'].fillna('Unknown')                # fill with a literal
df.fillna(0)                                # fill ALL NaN with 0
# fillna(method='ffill'/'bfill') is deprecated (removed in pandas 3.0);
# the dedicated methods are the supported spelling:
df.ffill()  # forward fill (carry last known value)
df.bfill()  # backward fill

# Interpolation (great for time series)
df['Score'].interpolate(method='linear')
df['Score'].interpolate(method='polynomial', order=2)  # needs scipy

print("\nAfter fillna(mean):")
df_clean = df.copy()
# Assign the result back instead of fillna(..., inplace=True) on a column
# selection: that chained form acts on a temporary and is unreliable under
# pandas copy-on-write.
df_clean['Age'] = df_clean['Age'].fillna(df['Age'].mean())
df_clean['Salary'] = df_clean['Salary'].fillna(df['Salary'].median())
print(df_clean)
import pandas as pd
import numpy as np

# ── Duplicates ────────────────────────────────────────────
df = pd.DataFrame({
    'Name': ['Alice','Bob','Alice','Carol','Bob'],
    'Dept': ['Eng','HR','Eng','Finance','HR'],
    'Salary': [72000,85000,72000,67000,85000]
})
print(df.duplicated())        # True on every repeat of an earlier row
print(df.duplicated().sum())  # count: 2
df_nodup = df.drop_duplicates()                  # drop fully-duplicated rows
df_nodup2 = df.drop_duplicates(subset=['Name'])  # dedupe by specific cols
df.drop_duplicates(keep='last')                  # keep last occurrence instead of first

# ── Outlier Detection: IQR Method ────────────────────────
salaries = pd.Series([45000,52000,61000,70000,75000,
                      78000,82000,90000,250000,300000])
Q1 = salaries.quantile(0.25)
Q3 = salaries.quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR  # Tukey fences: 1.5 × IQR beyond the quartiles
upper = Q3 + 1.5 * IQR
outliers = salaries[(salaries < lower) | (salaries > upper)]
print(f"\nQ1={Q1}, Q3={Q3}, IQR={IQR}")
print(f"Bounds: [{lower:.0f}, {upper:.0f}]")
print("Outliers:", outliers.values)  # [250000 300000]

# ── Outlier Treatment ────────────────────────────────────
# Option 1: Remove outliers
clean = salaries[(salaries >= lower) & (salaries <= upper)]
# Option 2: Cap/clip to bounds (Winsorizing)
capped = salaries.clip(lower=lower, upper=upper)
print("Capped:", capped.values)
# Option 3: Replace with NaN and impute
salaries_nan = salaries.where(
    (salaries >= lower) & (salaries <= upper), np.nan
)
salaries_filled = salaries_nan.fillna(salaries_nan.median())

# ── Z-Score Method ─────────────────────────────────────
# Computed manually with pandas. (scipy.stats.zscore gives slightly
# different numbers: it defaults to the population std, ddof=0, while
# Series.std() uses the sample std, ddof=1.)
z_scores = (salaries - salaries.mean()) / salaries.std()
z_outliers = salaries[z_scores.abs() > 2]  # |z| > 2 = outlier
print("Z-score outliers:", z_outliers.values)
Data Transformation with Pandas (Merging, Joining, Concatenating)
Combining DataFrames is one of the most common operations in data engineering pipelines. Pandas mirrors SQL JOIN semantics with pd.merge(), plus Pandas-specific .join() and pd.concat().
• pd.merge() — SQL-style joins on key column(s). Most flexible.
• df.join() — shortcut to merge on the index.
• pd.concat() — stack DataFrames vertically (rows) or horizontally (cols).
import pandas as pd

employees = pd.DataFrame({
    'EmpID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Carol', 'Dave', 'Eve'],
    'DeptID': [10, 20, 10, 30, 40],
})
departments = pd.DataFrame({
    'DeptID': [10, 20, 30],
    'DeptName': ['Engineering', 'HR', 'Finance'],
})

# ── INNER JOIN — keep only rows whose key exists on BOTH sides ──
inner = pd.merge(employees, departments, on='DeptID', how='inner')
print("INNER (matched only):\n", inner)
# Eve (DeptID=40) disappears — there is no department 40

# ── LEFT JOIN — every left row, right columns where matched ──
left = pd.merge(employees, departments, on='DeptID', how='left')
print("\nLEFT (all employees):\n", left)
# Eve survives, with NaN in DeptName

# ── RIGHT JOIN ────────────────────────────────────────────
right = pd.merge(employees, departments, on='DeptID', how='right')
print("\nRIGHT (all departments):\n", right)

# ── OUTER JOIN — union of keys from both sides ──────────
outer = pd.merge(employees, departments, on='DeptID', how='outer')
print("\nOUTER (all rows):\n", outer)

# ── Merge on multiple keys ────────────────────────────────
sales = pd.DataFrame({
    'Region': ['North', 'North', 'South', 'South'],
    'Product': ['A', 'B', 'A', 'B'],
    'Sales': [100, 200, 150, 300],
})
targets = pd.DataFrame({
    'Region': ['North', 'South'],
    'Product': ['A', 'A'],
    'Target': [120, 140],
})
# A row matches only when BOTH Region and Product agree
merged = pd.merge(sales, targets, on=['Region', 'Product'], how='left')

# ── Merge with different key names ────────────────────────
df_left = pd.DataFrame({'emp_id': [1, 2, 3], 'name': ['A', 'B', 'C']})
df_right = pd.DataFrame({'employee_id': [1, 2, 4], 'dept': ['Eng', 'HR', 'Fin']})
merged2 = pd.merge(df_left, df_right,
                   left_on='emp_id', right_on='employee_id')

# ── Merge indicator (find which source each row came from) ─
# indicator=True adds a '_merge' column: 'both' / 'left_only' / 'right_only'
merged3 = pd.merge(employees, departments, on='DeptID',
                   how='outer', indicator=True)
print("\nMerge indicator:\n", merged3[['Name', 'DeptName', '_merge']])
import pandas as pd

# ── pd.concat() — stack rows (axis=0) ────────────────────
q1 = pd.DataFrame({'Product': ['A', 'B'], 'Sales': [100, 200]})
q2 = pd.DataFrame({'Product': ['A', 'C'], 'Sales': [150, 250]})
q3 = pd.DataFrame({'Product': ['B', 'C'], 'Sales': [180, 220]})
# ignore_index=True renumbers rows 0..n-1 instead of repeating 0,1 per piece
yearly = pd.concat([q1, q2, q3], ignore_index=True)
print("Stacked rows:\n", yearly)
# keys=[...] labels each piece, producing a two-level MultiIndex
yearly_mi = pd.concat([q1, q2, q3], keys=['Q1', 'Q2', 'Q3'])

# ── pd.concat() — side by side (axis=1) ──────────────────
names = pd.DataFrame({'Name': ['Alice', 'Bob', 'Carol']})
ages = pd.DataFrame({'Age': [28, 34, 29]})
salaries = pd.DataFrame({'Salary': [72000, 85000, 67000]})
# Rows are aligned on the index; here all three share 0,1,2
combined = pd.concat([names, ages, salaries], axis=1)
print("\nSide-by-side:\n", combined)

# ── .join() — merge on index ─────────────────────────────
df_a = pd.DataFrame({'Sales': [100, 200, 300]},
                    index=['Jan', 'Feb', 'Mar'])
df_b = pd.DataFrame({'Cost': [80, 160, 220]},
                    index=['Jan', 'Feb', 'Apr'])
# how='outer' keeps Mar (no Cost) and Apr (no Sales), padded with NaN
joined = df_a.join(df_b, how='outer')
print("\nJoined on index:\n", joined)
Pivot Tables and Cross-Tabulations
Pivot tables are one of the most powerful data summarisation tools. They let you reshape and aggregate data in a spreadsheet-like way — pd.pivot_table() is the Pandas equivalent of Excel's PivotTable.
import pandas as pd
import numpy as np

# Sample sales dataset: 4 regions × 2 products, one row per quarter
data = {
    'Region': ['North', 'North', 'South', 'South', 'East', 'East', 'West', 'West'],
    'Product': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'],
    'Q': ['Q1', 'Q2', 'Q1', 'Q2', 'Q1', 'Q2', 'Q1', 'Q2'],
    'Sales': [150, 200, 180, 220, 130, 170, 210, 190],
    'Units': [30, 40, 36, 44, 26, 34, 42, 38],
}
df = pd.DataFrame(data)

# ── Basic Pivot Table ─────────────────────────────────────
pivot = pd.pivot_table(
    df,
    values='Sales',     # what to aggregate
    index='Region',     # rows
    columns='Product',  # columns
    aggfunc='sum',      # aggregation function
)
print("Sales by Region & Product:\n", pivot)

# ── Multiple aggfuncs ─────────────────────────────────────
# A list of functions nests them as an extra column level
pivot2 = pd.pivot_table(
    df,
    values='Sales',
    index='Region',
    columns='Product',
    aggfunc=['sum', 'mean', 'count'],
    fill_value=0,  # show 0 instead of NaN in empty cells
)
print("\nMultiple aggregations:\n", pivot2)

# ── Multiple values ───────────────────────────────────────
pivot3 = pd.pivot_table(
    df,
    values=['Sales', 'Units'],
    index='Region',
    columns='Q',
    aggfunc='sum',
    margins=True,          # append a Grand Total row/col
    margins_name='Total',
)
print("\nWith Grand Totals:\n", pivot3)

# ── Access pivot results ─────────────────────────────────
print("\nNorth, Product A:", pivot.loc['North', 'A'])
print("Best region:", pivot.sum(axis=1).idxmax())

# ── pd.crosstab() — frequency tables ─────────────────────
ct = pd.crosstab(df['Region'], df['Product'])
print("\nCross-tabulation (counts):\n", ct)
# normalize='index' turns each row into proportions summing to 1
ct_norm = pd.crosstab(df['Region'], df['Product'], normalize='index')
print("\nNormalized (row %):\n", ct_norm.round(2))
# With values + aggfunc it behaves like pivot_table
ct_vals = pd.crosstab(df['Region'], df['Product'],
                      values=df['Sales'], aggfunc='sum')
print("\nCrosstab with Sales:\n", ct_vals)
Working with Time Series Data in Pandas
Pandas has first-class support for time series data with DatetimeIndex, powerful resampling, rolling windows, and date offset arithmetic. This makes financial, IoT, and log data analysis straightforward.
import pandas as pd
import numpy as np
# ── Parsing Dates ─────────────────────────────────────────
df = pd.DataFrame({
'date': ['2024-01-15','2024-02-20','2024-03-10'],
'sales': [1200, 1500, 1350]
})
df['date'] = pd.to_datetime(df['date'])
print(df.dtypes) # date → datetime64[ns]
# DateTimeIndex
date_rng = pd.date_range(start='2024-01-01', end='2024-12-31', freq='D')
ts = pd.Series(np.random.randn(len(date_rng)), index=date_rng)
# ── Date Properties ───────────────────────────────────────
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['weekday'] = df['date'].dt.day_name()
df['quarter'] = df['date'].dt.quarter
# ── Slicing by Date ───────────────────────────────────────
ts['2024-03'] # all of March 2024
ts['2024-01-01':'2024-03-31'] # Q1 2024
ts[ts.index.month == 6] # all June dates
# ── Resampling ────────────────────────────────────────────
# Downsample: daily → monthly
monthly = ts.resample('ME').mean() # Monthly End
quarterly = ts.resample('QE').sum() # Quarterly End
weekly = ts.resample('W').std() # Weekly std
# Upsample: monthly → daily (with forward fill)
monthly_to_daily = monthly.resample('D').ffill()
# ── Rolling Windows ───────────────────────────────────────
rolling_mean = ts.rolling(window=7).mean() # 7-day moving avg
rolling_std = ts.rolling(window=30).std() # 30-day std
rolling_max = ts.rolling(window=14).max()
# Expanding window (growing from start)
expanding_mean = ts.expanding().mean()
# ── Lag & Shift ───────────────────────────────────────────
ts_shifted = ts.shift(1) # lag by 1 period
ts_diff = ts.diff(1) # day-over-day change
ts_pct = ts.pct_change() # % change
# ── Date Arithmetic ───────────────────────────────────────
from pandas.tseries.offsets import BDay, MonthEnd
today = pd.Timestamp('2024-06-15')
next_biz_day = today + BDay(1) # next business day
month_end = today + MonthEnd(0) # end of current month
print(next_biz_day) # 2024-06-17 (skips weekend)
print(month_end) # 2024-06-30
DataFrame Operations: Apply, Map, Lambda Functions
.apply(), .map(), and .transform() let you apply custom Python functions to DataFrames — unlocking transformations that aren't covered by built-in methods.
• Series.map() — element-wise on a single Series. Fastest for simple mappings.
• DataFrame.apply(func, axis=0/1) — applies func to each column (axis=0) or each row (axis=1).
• DataFrame.map() — element-wise on all cells (replaces the deprecated applymap()).
• GroupBy.transform() — applies func per group but returns a same-shaped DataFrame.
import pandas as pd
import numpy as np
df = pd.DataFrame({
'Name': ['Alice','Bob','Carol','Dave','Eve'],
'Age': [28, 34, 29, 41, 25],
'Salary': [72000, 85000, 67000, 92000, 58000],
'Dept': ['Eng','HR','Eng','Finance','Eng']
})
# ── Series.map() — element-wise ───────────────────────────
# Map with dict (great for encoding)
dept_code = {'Eng': 1, 'HR': 2, 'Finance': 3}
df['DeptCode'] = df['Dept'].map(dept_code)
# Map with function
df['Name_Upper'] = df['Name'].map(str.upper)
df['Name_Len'] = df['Name'].map(len)
# Map with lambda
df['Salary_K'] = df['Salary'].map(lambda x: f"${x/1000:.0f}K")
print(df[['Name','Salary','Salary_K','DeptCode']].head())
# ── DataFrame.apply() — column-wise (axis=0) ─────────────
# Apply NumPy function to each column
print("\nColumn stats:")
print(df[['Age','Salary']].apply(np.mean)) # mean of each col
print(df[['Age','Salary']].apply(np.std)) # std of each col
# Custom function on column
def normalize(series):
return (series - series.min()) / (series.max() - series.min())
df_norm = df[['Age','Salary']].apply(normalize)
print("\nNormalized:\n", df_norm.round(3))
# ── DataFrame.apply() — row-wise (axis=1) ────────────────
# Access multiple columns per row
def grade(row):
if row['Salary'] > 80000 and row['Age'] < 35:
return 'High Performer'
elif row['Salary'] > 70000:
return 'Good'
else:
return 'Average'
df['Grade'] = df.apply(grade, axis=1)
print("\nWith grade:\n", df[['Name','Salary','Age','Grade']])
# ── Lambda with apply ─────────────────────────────────────
df['SalaryBand'] = df['Salary'].apply(
lambda x: 'High' if x > 80000 else ('Mid' if x > 65000 else 'Low')
)
# ── groupby + transform ───────────────────────────────────
# Add dept avg salary to each row (same shape as df)
df['DeptAvgSalary'] = df.groupby('Dept')['Salary'].transform('mean')
df['SalaryVsAvg'] = df['Salary'] - df['DeptAvgSalary']
print("\nVs. Dept Average:\n",
df[['Name','Dept','Salary','DeptAvgSalary','SalaryVsAvg']])
# ── applymap (element-wise on whole DataFrame) ────────────
numeric_df = df[['Age','Salary']].copy()
# Round to nearest 10 (element-wise)
rounded = numeric_df.applymap(lambda x: round(x, -1))
print("\nRounded to 10:\n", rounded)