Python Pandas: Data Manipulation and Analysis Guide

Pandas is the foundation of Python data analysis — it turns raw CSV files, database results, and API responses into structured DataFrames you can slice, aggregate, transform, and visualize with one-liners. This guide covers the essential Pandas operations every data engineer and analyst needs: loading data, selection and filtering, groupby aggregations, joins, pivot tables, time series, and performance tips for large datasets.

DataFrames and Series

A DataFrame is a 2D table with labeled rows (index) and columns. A Series is a single column. Both are built on NumPy arrays, making vectorized operations over millions of rows fast. Create DataFrames from dicts, lists, CSV files, SQL queries, or JSON.

import pandas as pd
import numpy as np

# Create from dict
df = pd.DataFrame({
    "name":   ["Alice", "Bob", "Carol", "Dave"],
    "dept":   ["Eng",   "Eng", "Sales", "Sales"],
    "salary": [95000,   88000, 72000,   67000],
    "years":  [5,       3,     7,       2],
})

# Read CSV, Excel, JSON, Parquet
df = pd.read_csv("employees.csv", parse_dates=["hire_date"])
df = pd.read_excel("report.xlsx", sheet_name="Q1")
df = pd.read_parquet("data.parquet")  # fastest for large files

# Quick inspection
print(df.shape)       # (4, 4)
print(df.dtypes)      # column types
print(df.describe())  # count, mean, std, min, quartiles, max
print(df.info())      # dtypes + null counts + memory usage
print(df.head(3))     # first 3 rows
print(df.tail(3))     # last 3 rows

# Column access
salaries = df["salary"]          # Series
subset = df[["name", "salary"]]  # DataFrame with 2 columns

Selection and Filtering

# Boolean indexing — most common
senior = df[df["years"] >= 5]
high_earners = df[df["salary"] > 80000]
eng_seniors = df[(df["dept"] == "Eng") & (df["years"] >= 3)]

# isin — filter on a list of values
sales_or_eng = df[df["dept"].isin(["Sales", "Eng"])]

# .loc — label-based (row label, column name)
df.loc[0, "salary"]           # single value
df.loc[0:2, ["name", "dept"]] # rows 0-2, two columns
df.loc[df["salary"] > 80000, "name"]  # filtered rows, one column

# .iloc — integer position-based
df.iloc[0, 1]        # first row, second column
df.iloc[:3, :2]      # first 3 rows, first 2 columns
df.iloc[-1]          # last row

# query() — SQL-like string syntax
result = df.query("dept == 'Eng' and salary > 85000")
result = df.query("years in @threshold_years", threshold_years=[3, 5, 7])

# String operations
df[df["name"].str.startswith("A")]
df[df["name"].str.contains("o", case=False)]

GroupBy and Aggregation

GroupBy splits the DataFrame into groups, applies a function to each group, and combines the results. It is the Pandas equivalent of SQL's GROUP BY.

# Single aggregation
dept_avg = df.groupby("dept")["salary"].mean()

# Multiple aggregations at once
summary = df.groupby("dept").agg(
    headcount=("name", "count"),
    avg_salary=("salary", "mean"),
    max_salary=("salary", "max"),
    total_years=("years", "sum"),
)

# Named aggregation with multiple columns
df.groupby("dept").agg(
    avg_salary=("salary", "mean"),
    avg_years=("years", "mean"),
    senior_count=("years", lambda x: (x >= 5).sum()),
)

# Transform — apply function but keep original index (broadcast back)
df["dept_avg_salary"] = df.groupby("dept")["salary"].transform("mean")
df["salary_vs_dept_avg"] = df["salary"] - df["dept_avg_salary"]

# Apply — custom function per group
def top_earner(group):
    return group.nlargest(1, "salary")

top_per_dept = df.groupby("dept").apply(top_earner, include_groups=False)

# Rolling window aggregation
df_ts = df.set_index("hire_date").sort_index()
df_ts["rolling_avg_salary"] = df_ts["salary"].rolling("30D").mean()

Merge and Join

departments = pd.DataFrame({
    "dept": ["Eng", "Sales", "Marketing"],
    "budget": [500000, 300000, 200000],
    "manager": ["Alice", "Dave", "Eve"],
})

# Inner join (default) — only matching rows
merged = pd.merge(df, departments, on="dept")

# Left join — all rows from df, NaN where no match
merged = pd.merge(df, departments, on="dept", how="left")

# Merge on different column names
df2 = pd.merge(df, departments, left_on="dept", right_on="dept", how="left")

# Concatenate vertically (stack DataFrames)
q1 = pd.read_csv("q1_sales.csv")
q2 = pd.read_csv("q2_sales.csv")
all_sales = pd.concat([q1, q2], ignore_index=True)

# Concatenate horizontally (add columns)
combined = pd.concat([df[["name"]], departments], axis=1)

# join() — merge on index
df.set_index("dept").join(departments.set_index("dept"), rsuffix="_dept")

Pivot Tables

sales = pd.DataFrame({
    "month": ["Jan","Jan","Feb","Feb","Mar","Mar"],
    "product": ["A","B","A","B","A","B"],
    "revenue": [10000, 8000, 12000, 9500, 11000, 10500],
    "units": [100, 80, 120, 95, 110, 105],
})

# Pivot table — rows, columns, values, aggregation
pivot = pd.pivot_table(
    sales,
    values="revenue",
    index="month",
    columns="product",
    aggfunc="sum",
    fill_value=0,
    margins=True,       # add row/column totals
    margins_name="Total",
)

# Multi-value pivot
multi_pivot = pd.pivot_table(
    sales,
    values=["revenue", "units"],
    index="month",
    columns="product",
    aggfunc={"revenue": "sum", "units": "sum"},
)

# crosstab — frequency table
ct = pd.crosstab(df["dept"], df["years"].apply(lambda y: "senior" if y >= 5 else "junior"))

Time Series

import pandas as pd

# Create date range
dates = pd.date_range("2026-01-01", periods=365, freq="D")
df_ts = pd.DataFrame({
    "date": dates,
    "sales": np.random.randint(100, 500, 365),
    "visits": np.random.randint(1000, 5000, 365),
})
df_ts = df_ts.set_index("date")

# Resample — aggregate to weekly/monthly
weekly = df_ts.resample("W").agg({"sales": "sum", "visits": "mean"})
monthly = df_ts.resample("ME").sum()

# Rolling statistics
df_ts["sales_7d_avg"] = df_ts["sales"].rolling(7).mean()
df_ts["sales_30d_max"] = df_ts["sales"].rolling(30).max()

# Shift — lag/lead features for forecasting
df_ts["sales_lag_7"] = df_ts["sales"].shift(7)
df_ts["sales_next_7"] = df_ts["sales"].shift(-7)

# Date component extraction
df_ts["year"] = df_ts.index.year
df_ts["month"] = df_ts.index.month
df_ts["day_of_week"] = df_ts.index.day_name()
df_ts["is_weekend"] = df_ts.index.dayofweek >= 5

# Time slicing
jan = df_ts["2026-01"]
q1 = df_ts["2026-01":"2026-03"]

Data Cleaning

# Check and handle missing values
print(df.isnull().sum())          # count nulls per column
df_clean = df.dropna()            # drop rows with any null
df_clean = df.dropna(subset=["salary"])  # drop only where salary is null
df["salary"].fillna(df["salary"].median(), inplace=True)  # fill with median
df.ffill(inplace=True)            # forward-fill (useful for time series)

# Duplicate handling
print(df.duplicated().sum())
df = df.drop_duplicates(subset=["email"])

# Type casting
df["salary"] = pd.to_numeric(df["salary"], errors="coerce")
df["hire_date"] = pd.to_datetime(df["hire_date"], format="%Y-%m-%d")
df["dept"] = df["dept"].astype("category")  # save memory on low-cardinality

# String cleaning
df["name"] = df["name"].str.strip().str.title()
df["email"] = df["email"].str.lower()
df["phone"] = df["phone"].str.replace(r"[^0-9]", "", regex=True)

# Outlier handling using IQR
q1, q3 = df["salary"].quantile([0.25, 0.75])
iqr = q3 - q1
mask = df["salary"].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
df_no_outliers = df[mask]

Performance Tips

# Use categorical dtype for low-cardinality strings
df["dept"] = df["dept"].astype("category")  # 10x memory reduction

# Vectorize — never loop over rows
# BAD:
for i, row in df.iterrows():
    df.loc[i, "bonus"] = row["salary"] * 0.1
# GOOD:
df["bonus"] = df["salary"] * 0.1

# Use eval() for complex expressions (less memory than chained ops)
df.eval("total_comp = salary + bonus", inplace=True)

# Read only needed columns
df = pd.read_csv("big.csv", usecols=["name", "salary", "dept"])

# Read in chunks for huge files
chunks = []
for chunk in pd.read_csv("huge.csv", chunksize=100_000):
    chunks.append(chunk[chunk["salary"] > 50000])
df = pd.concat(chunks)

# Parquet for storage — 10-50x faster than CSV
df.to_parquet("data.parquet", index=False, compression="snappy")

# Use PyArrow backend (Pandas 2.0+)
df = pd.read_csv("data.csv", dtype_backend="pyarrow")  # faster dtypes

Frequently Asked Questions

Pandas vs Polars — which should I use?
Polars is faster (built in Rust, true multi-threading) and uses less memory for large datasets. Pandas has a larger ecosystem and is the industry standard for notebook-style exploration. Use Pandas for most analysis work; Polars for data pipelines processing millions of rows where speed matters.
How do I speed up groupby on large DataFrames?
Convert string columns to category dtype before groupby — it can be 5-10x faster. Also consider Dask for parallelized groupby on datasets that don't fit in memory, or DuckDB for SQL-based aggregation on Parquet files.
What is the difference between loc and iloc?
loc uses labels (column names, index values). iloc uses integer positions (0-based row and column numbers). Use loc for named access, iloc for position-based slicing.