Python + Excel — Complete Guide to openpyxl and pandas
Chapter 6: Python + Excel — openpyxl and pandas in Practice
Excel is the most widely used data format in business, and one of the highest-value targets for Python automation. 100 branch store files scattered across 100 spreadsheets? Manual consolidation takes a full day; a Python script handles it in 3 minutes. Formatting a monthly report across dozens of cells? Write the code once and reuse it forever. This chapter systematically covers Python's four Excel tools — openpyxl, pandas, ExcelWriter, and xlwings — and shows how to combine them effectively, finishing with a complete sales data automation pipeline.
Library Selection Guide
| Library | Read | Write | Styles | Formulas | Large Data | Best For |
|---|---|---|---|---|---|---|
| openpyxl | Yes | Yes | Full | Write (no calc) | Medium | Formatted reports, styles, merged cells, images |
| pandas | Yes | Yes | Basic | No | Excellent | Data analysis, multi-file merge, cleaning, pivot tables |
| xlwings | Yes | Yes | Yes | Yes (live calc) | Medium | Interact with open Excel, call VBA macros, Windows automation |
| xlrd | .xls only | No | No | No | Medium | Read legacy .xls files only |
Decision tree:
- Need data analysis, merging, statistics? - Need formatted output with colors and borders? - Need to interact with a running Excel instance? - Need to read legacy .xls files?
Install all at once:
pip install openpyxl pandas xlwingspandas requires openpyxl to read/write .xlsx files — always install both together.
openpyxl Deep Dive
Reading and Writing Worksheets
openpyxl fundamentals
import openpyxl
# ---- Create new workbook ----
wb = openpyxl.Workbook()
ws = wb.active
ws.title = "Sales Data"
# Write cells (two equivalent approaches)
ws["A1"] = "Product"
ws.cell(row=1, column=2, value="Revenue") # row/column are 1-indexed
# Append a full row
ws.append(["Laptop", 15000, "2024-03-01"])
ws.append(["Mouse", 299, "2024-03-01"])
wb.save("sales.xlsx")
# ---- Read existing workbook ----
wb = openpyxl.load_workbook("sales.xlsx")
ws = wb.active
ws = wb["Sales Data"] # by name
print(wb.sheetnames)
# Read a cell
print(ws["A1"].value) # "Product"
print(ws.cell(row=2, column=1).value) # "Laptop"
# Iterate rows (values_only returns values, not Cell objects)
for row in ws.iter_rows(min_row=2, values_only=True):
product, revenue, date = row
print(f"{product}: {revenue}")
# Workbook dimensions
print(f"Max row: {ws.max_row} Max col: {ws.max_column}")
Styling: Font, Color, Border, Alignment
openpyxl style system
from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
# Font
ws["A1"].font = Font(name="Calibri", size=14, bold=True, color="FFFFFF")
# Background fill
ws["A1"].fill = PatternFill(fill_type="solid", fgColor="4472C4")
# Border
thin = Side(style="thin")
ws["A1"].border = Border(left=thin, right=thin, top=thin, bottom=thin)
# Alignment
ws["A1"].alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
# Number formats
ws["B2"].number_format = '#,##0.00' # 1,234.56
ws["C2"].number_format = '0.00%' # 12.34%
ws["D2"].number_format = 'yyyy-mm-dd'
# Utility: style an entire header row
def style_header_row(ws, row: int, bg: str = "4472C4", fg: str = "FFFFFF"):
thin = Side(style="thin")
for cell in ws[row]:
if cell.value is None:
continue
cell.font = Font(bold=True, color=fg, size=11)
cell.fill = PatternFill(fill_type="solid", fgColor=bg)
cell.alignment = Alignment(horizontal="center", vertical="center")
cell.border = Border(left=thin, right=thin, top=thin, bottom=thin)
# Auto-fit column widths
def auto_width(ws, extra: int = 4):
for col in ws.columns:
max_len = max((len(str(c.value or "")) for c in col), default=8)
ws.column_dimensions[col[0].column_letter].width = max_len + extra
Merged Cells and Images
merge cells / insert image
from openpyxl.drawing.image import Image as XLImage
# Merge cells for a title spanning A1:E1
ws.merge_cells("A1:E1")
ws["A1"] = "Q1 2024 Sales Report"
ws["A1"].alignment = Alignment(horizontal="center", vertical="center")
# Row height
ws.row_dimensions[1].height = 36
# Insert image anchored to F1
img = XLImage("logo.png")
img.width, img.height = 120, 50
ws.add_image(img, "F1")
# Write formulas (openpyxl stores the string; Excel calculates on open)
ws["B10"] = "=SUM(B2:B9)"
ws["C10"] = "=AVERAGE(C2:C9)"
Important: openpyxl does not evaluate formulas. Writing
="=SUM(B2:B9)"stores the formula string. Excel calculates it when the user opens the file. To read formula results in Python, usedata_only=Truewhen loading — but this requires the file to have been previously saved by Excel with cached values.
pandas Data Processing
read_excel / to_excel Key Parameters
pandas Excel I/O
import pandas as pd
# Reading
df = pd.read_excel(
"data.xlsx",
sheet_name="Sheet1", # or 0 (index); None reads all sheets as dict
header=0, # row index for column names
skiprows=2, # skip first 2 rows
usecols="A:D", # column range; also accepts [0,1,3] or ["Name","Amount"]
nrows=100, # read only 100 rows (performance)
dtype={"order_id": str}, # force column types
na_values=["N/A", "-"], # treat these as NaN
parse_dates=["date"], # parse these columns as datetime
)
# Read all sheets
all_sheets = pd.read_excel("data.xlsx", sheet_name=None) # returns {name: DataFrame}
# Writing
df.to_excel("output.xlsx", sheet_name="Results", index=False)
Multi-File Merge
Merge all Excel files in a directory
import pandas as pd
from pathlib import Path
def merge_excel_files(folder: Path, glob: str = "*.xlsx") -> pd.DataFrame:
dfs = []
for f in sorted(folder.glob(glob)):
try:
df = pd.read_excel(f)
df["_source"] = f.name
dfs.append(df)
print(f" loaded: {f.name} ({len(df)} rows)")
except Exception as e:
print(f" [skip] {f.name}: {e}")
merged = pd.concat(dfs, ignore_index=True)
print(f"\nMerged: {len(merged)} rows total")
return merged
Data Cleaning
Common pandas cleaning operations
import pandas as pd
df = pd.read_excel("raw.xlsx")
# Inspect quality
print(df.isnull().sum())
print(df.duplicated().sum())
# Handle missing values
df.dropna(subset=["order_id", "amount"])
df["qty"].fillna(0, inplace=True)
df["notes"].fillna("none", inplace=True)
# Remove duplicates
df.drop_duplicates(subset=["order_id"], keep="last", inplace=True)
# Type conversion
df["date"] = pd.to_datetime(df["date"], format="%Y/%m/%d", errors="coerce")
df["amount"] = pd.to_numeric(df["amount"], errors="coerce")
# String cleaning
df["product"] = df["product"].str.strip().str.upper()
# Filter invalid rows
df = df[df["amount"] > 0]
# Bin into categories
df["tier"] = pd.cut(df["amount"],
bins=[0, 1000, 5000, float("inf")],
labels=["Standard", "Premium", "Enterprise"])
Pivot Tables
pandas pivot_table
import pandas as pd
df = pd.read_excel("sales.xlsx")
pivot = df.pivot_table(
values="amount",
index="region",
columns="category",
aggfunc="sum",
fill_value=0,
margins=True,
margins_name="Total",
)
# Multiple metrics
pivot_multi = df.pivot_table(
values=["amount", "qty"],
index=["region", "rep"],
aggfunc={"amount": "sum", "qty": "sum"},
)
pivot_multi.reset_index().to_excel("pivot.xlsx", index=False)
pandas + openpyxl Together
ExcelWriter multi-sheet output with style post-processing
import pandas as pd
import openpyxl
from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
def apply_table_style(ws, header_color: str = "4472C4",
alt_color: str = "DCE6F1") -> None:
thin = Side(style="thin")
border = Border(left=thin, right=thin, top=thin, bottom=thin)
for row_idx, row in enumerate(ws.iter_rows(), start=1):
for cell in row:
cell.border = border
if row_idx == 1:
cell.font = Font(bold=True, color="FFFFFF")
cell.fill = PatternFill(fill_type="solid", fgColor=header_color)
cell.alignment = Alignment(horizontal="center", vertical="center")
elif row_idx % 2 == 0:
cell.fill = PatternFill(fill_type="solid", fgColor=alt_color)
for col in ws.columns:
w = max((len(str(c.value or "")) for c in col), default=6)
ws.column_dimensions[col[0].column_letter].width = min(w + 4, 35)
def generate_report(df_detail, df_pivot, output_path: str) -> None:
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
df_detail.to_excel(writer, sheet_name="Detail", index=False)
df_pivot.to_excel(writer, sheet_name="Pivot")
wb = writer.book
for sheet in ["Detail", "Pivot"]:
apply_table_style(wb[sheet])
print(f"Report saved: {output_path}")
xlwings: Interact with Live Excel
Platform requirement: xlwings requires Microsoft Excel installed on the machine. Primarily Windows (limited macOS support). Not usable on Linux servers. Best for Windows desktop automation where you need to interact with Excel while it's running.
xlwings basics
import xlwings as xw # pip install xlwings
# Connect to Excel
app = xw.App(visible=True)
wb = app.books.open("sales.xlsx")
ws = wb.sheets["Sheet1"]
# Read/write cells
ws["A1"].value = "Header"
value = ws["B2"].value # Returns computed result (not formula string!)
# Bulk read as DataFrame
import pandas as pd
df = ws["A1:D100"].options(pd.DataFrame, header=1, index=False).value
# Write 2D data
ws["A1"].value = [["Product", "Qty", "Amount"],
["Laptop", 5, 7500],
["Mouse", 20, 598]]
# Call a VBA macro
wb.macro("Module1.RefreshReport")()
# Force recalculation
wb.app.calculate()
# Save and close
wb.save()
wb.close()
app.quit()
Project: Sales Data Automation Pipeline
A complete, production-ready pipeline: **merge multiple store Excel files
sales_pipeline.py — ~110 lines, fully runnable
"""
sales_pipeline.py
Sales data automation pipeline:
1. Read all store Excel files from ./stores/
2. Merge and clean data
3. Build monthly pivot summary
4. Write styled multi-sheet report
Dependencies: pip install pandas openpyxl
Usage: python sales_pipeline.py
"""
import sys
import datetime
import pandas as pd
import openpyxl
from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
from pathlib import Path
STORES_DIR = Path("./stores")
OUTPUT_FILE = Path("sales_report.xlsx")
REQUIRED_COLS = {"order_id", "date", "product", "qty", "unit_price", "amount"}
def load_store(filepath: Path) -> pd.DataFrame | None:
try:
df = pd.read_excel(filepath, dtype={"order_id": str})
except Exception as e:
print(f" [ERROR] {filepath.name}: {e}")
return None
df.columns = df.columns.str.strip().str.lower()
missing = REQUIRED_COLS - set(df.columns)
if missing:
print(f" [SKIP] {filepath.name} missing columns: {missing}")
return None
df["_store"] = filepath.stem
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df["amount"] = pd.to_numeric(df["amount"], errors="coerce")
df["qty"] = pd.to_numeric(df["qty"], errors="coerce").fillna(0).astype(int)
before = len(df)
df.dropna(subset=["order_id", "date", "amount"], inplace=True)
df = df[df["amount"] > 0]
df.drop_duplicates(subset=["order_id"], keep="last", inplace=True)
print(f" [OK] {filepath.name}: {before} rows -> {len(df)} after cleaning")
return df
def load_all(stores_dir: Path) -> pd.DataFrame:
files = sorted(stores_dir.glob("*.xlsx"))
if not files:
sys.exit(f"No .xlsx files found in {stores_dir}")
print(f"Found {len(files)} store file(s):\n")
dfs = [df for f in files if (df := load_store(f)) is not None]
if not dfs:
sys.exit("All files failed to load.")
return pd.concat(dfs, ignore_index=True)
def build_pivot(df: pd.DataFrame) -> pd.DataFrame:
df["year_month"] = df["date"].dt.to_period("M").astype(str)
return df.pivot_table(
values="amount", index="_store", columns="year_month",
aggfunc="sum", fill_value=0, margins=True, margins_name="Total"
)
def top_products(df: pd.DataFrame, n: int = 10) -> pd.DataFrame:
return (
df.groupby("product")["amount"].sum()
.sort_values(ascending=False).head(n)
.reset_index().rename(columns={"amount": "total_amount"})
)
def style_ws(ws) -> None:
thin = Side(style="thin")
bd = Border(left=thin, right=thin, top=thin, bottom=thin)
hdr_fill = PatternFill(fill_type="solid", fgColor="4472C4")
alt_fill = PatternFill(fill_type="solid", fgColor="DCE6F1")
for row_idx, row in enumerate(ws.iter_rows(), start=1):
for cell in row:
cell.border = bd
if row_idx == 1:
cell.font = Font(bold=True, color="FFFFFF", size=11)
cell.fill = hdr_fill
cell.alignment = Alignment(horizontal="center", vertical="center")
elif row_idx % 2 == 0:
cell.fill = alt_fill
for col in ws.columns:
w = max((len(str(c.value or "")) for c in col), default=6)
ws.column_dimensions[col[0].column_letter].width = min(w + 4, 35)
def write_report(df, pivot, top, output_path: Path) -> None:
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
df.drop(columns=["_store"], errors="ignore").to_excel(
writer, sheet_name="All Records", index=False)
pivot.to_excel(writer, sheet_name="Monthly Pivot")
top.to_excel(writer, sheet_name="Top Products", index=False)
wb = writer.book
for sh in ["All Records", "Monthly Pivot", "Top Products"]:
style_ws(wb[sh])
cover = wb.create_sheet("Cover", 0)
cover.sheet_view.showGridLines = False
cover.merge_cells("B2:F2")
cover["B2"] = "Sales Automation Report"
cover["B2"].font = Font(size=20, bold=True, color="2F5496")
cover["B2"].alignment = Alignment(horizontal="center")
cover.merge_cells("B3:F3")
cover["B3"] = f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M')}"
cover["B3"].font = Font(size=11, color="666666")
cover["B3"].alignment = Alignment(horizontal="center")
cover.row_dimensions[2].height = 40
print(f"\nReport saved: {output_path.resolve()}")
def main() -> None:
print("=" * 50)
print("Sales Data Automation Pipeline")
print("=" * 50 + "\n")
df = load_all(STORES_DIR)
pivot = build_pivot(df)
top = top_products(df)
print(f"\nTotal revenue: {df['amount'].sum():,.2f}")
print(f"Total orders: {len(df)}")
print(f"Stores: {df['_store'].nunique()}")
print(f"Date range: {df['date'].min().date()} to {df['date'].max().date()}")
write_report(df, pivot, top, OUTPUT_FILE)
if __name__ == "__main__":
main()
Chapter summary: openpyxl owns formatting and presentation; pandas owns computation and analysis. ExcelWriter bridges them seamlessly. xlwings enters only when you need to interact with a live Excel process or call VBA macros. Master this three-library combination and you can automate virtually any Excel workflow you'll encounter in a professional setting.
Previous
Next
Chapter 7: Python + Word