Chapter 6

Python + Excel — Complete Guide to openpyxl and pandas

Chapter 6: Python + Excel — openpyxl and pandas in Practice

Excel is the most widely used data format in business, and one of the highest-value targets for Python automation. 100 branch store files scattered across 100 spreadsheets? Manual consolidation takes a full day; a Python script handles it in 3 minutes. Formatting a monthly report across dozens of cells? Write the code once and reuse it forever. This chapter systematically covers Python's four Excel tools — openpyxl, pandas, ExcelWriter, and xlwings — and shows how to combine them effectively, finishing with a complete sales data automation pipeline.

Library Selection Guide

Library	Read	Write	Styles	Formulas	Large Data	Best For
openpyxl	Yes	Yes	Full	Write (no calc)	Medium	Formatted reports, styles, merged cells, images
pandas	Yes	Yes	Basic	No	Excellent	Data analysis, multi-file merge, cleaning, pivot tables
xlwings	Yes	Yes	Yes	Yes (live calc)	Medium	Interact with open Excel, call VBA macros, Windows automation
xlrd	.xls only	No	No	No	Medium	Read legacy .xls files only

Decision tree:

Need data analysis, merging, statistics? - Need formatted output with colors and borders? - Need to interact with a running Excel instance? - Need to read legacy .xls files?

Install all at once: pip install openpyxl pandas xlwings pandas requires openpyxl to read/write .xlsx files — always install both together.

openpyxl Deep Dive

Reading and Writing Worksheets

openpyxl fundamentals

import openpyxl

# ---- Create new workbook ----
wb = openpyxl.Workbook()
ws = wb.active
ws.title = "Sales Data"

# Write cells (two equivalent approaches)
ws["A1"] = "Product"
ws.cell(row=1, column=2, value="Revenue")   # row/column are 1-indexed

# Append a full row
ws.append(["Laptop", 15000, "2024-03-01"])
ws.append(["Mouse",    299, "2024-03-01"])
wb.save("sales.xlsx")

# ---- Read existing workbook ----
wb = openpyxl.load_workbook("sales.xlsx")
ws = wb.active
ws = wb["Sales Data"]           # by name
print(wb.sheetnames)

# Read a cell
print(ws["A1"].value)           # "Product"
print(ws.cell(row=2, column=1).value)  # "Laptop"

# Iterate rows (values_only returns values, not Cell objects)
for row in ws.iter_rows(min_row=2, values_only=True):
    product, revenue, date = row
    print(f"{product}: {revenue}")

# Workbook dimensions
print(f"Max row: {ws.max_row}  Max col: {ws.max_column}")

Styling: Font, Color, Border, Alignment

openpyxl style system

from openpyxl.styles import Font, PatternFill, Border, Side, Alignment

# Font
ws["A1"].font = Font(name="Calibri", size=14, bold=True, color="FFFFFF")

# Background fill
ws["A1"].fill = PatternFill(fill_type="solid", fgColor="4472C4")

# Border
thin = Side(style="thin")
ws["A1"].border = Border(left=thin, right=thin, top=thin, bottom=thin)

# Alignment
ws["A1"].alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)

# Number formats
ws["B2"].number_format = '#,##0.00'    # 1,234.56
ws["C2"].number_format = '0.00%'       # 12.34%
ws["D2"].number_format = 'yyyy-mm-dd'

# Utility: style an entire header row
def style_header_row(ws, row: int, bg: str = "4472C4", fg: str = "FFFFFF"):
    thin = Side(style="thin")
    for cell in ws[row]:
        if cell.value is None:
            continue
        cell.font = Font(bold=True, color=fg, size=11)
        cell.fill = PatternFill(fill_type="solid", fgColor=bg)
        cell.alignment = Alignment(horizontal="center", vertical="center")
        cell.border = Border(left=thin, right=thin, top=thin, bottom=thin)

# Auto-fit column widths
def auto_width(ws, extra: int = 4):
    for col in ws.columns:
        max_len = max((len(str(c.value or "")) for c in col), default=8)
        ws.column_dimensions[col[0].column_letter].width = max_len + extra

Merged Cells and Images

merge cells / insert image

from openpyxl.drawing.image import Image as XLImage

# Merge cells for a title spanning A1:E1
ws.merge_cells("A1:E1")
ws["A1"] = "Q1 2024 Sales Report"
ws["A1"].alignment = Alignment(horizontal="center", vertical="center")

# Row height
ws.row_dimensions[1].height = 36

# Insert image anchored to F1
img = XLImage("logo.png")
img.width, img.height = 120, 50
ws.add_image(img, "F1")

# Write formulas (openpyxl stores the string; Excel calculates on open)
ws["B10"] = "=SUM(B2:B9)"
ws["C10"] = "=AVERAGE(C2:C9)"

Important: openpyxl does not evaluate formulas. Writing ="=SUM(B2:B9)" stores the formula string. Excel calculates it when the user opens the file. To read formula results in Python, use data_only=True when loading — but this requires the file to have been previously saved by Excel with cached values.

pandas Data Processing

read_excel / to_excel Key Parameters

pandas Excel I/O

import pandas as pd

# Reading
df = pd.read_excel(
    "data.xlsx",
    sheet_name="Sheet1",      # or 0 (index); None reads all sheets as dict
    header=0,                 # row index for column names
    skiprows=2,               # skip first 2 rows
    usecols="A:D",            # column range; also accepts [0,1,3] or ["Name","Amount"]
    nrows=100,                # read only 100 rows (performance)
    dtype={"order_id": str},  # force column types
    na_values=["N/A", "-"],   # treat these as NaN
    parse_dates=["date"],     # parse these columns as datetime
)

# Read all sheets
all_sheets = pd.read_excel("data.xlsx", sheet_name=None)  # returns {name: DataFrame}

# Writing
df.to_excel("output.xlsx", sheet_name="Results", index=False)

Multi-File Merge

Merge all Excel files in a directory

import pandas as pd
from pathlib import Path

def merge_excel_files(folder: Path, glob: str = "*.xlsx") -> pd.DataFrame:
    dfs = []
    for f in sorted(folder.glob(glob)):
        try:
            df = pd.read_excel(f)
            df["_source"] = f.name
            dfs.append(df)
            print(f"  loaded: {f.name}  ({len(df)} rows)")
        except Exception as e:
            print(f"  [skip] {f.name}: {e}")

    merged = pd.concat(dfs, ignore_index=True)
    print(f"\nMerged: {len(merged)} rows total")
    return merged

Data Cleaning

Common pandas cleaning operations

import pandas as pd

df = pd.read_excel("raw.xlsx")

# Inspect quality
print(df.isnull().sum())
print(df.duplicated().sum())

# Handle missing values
df.dropna(subset=["order_id", "amount"])
df["qty"].fillna(0, inplace=True)
df["notes"].fillna("none", inplace=True)

# Remove duplicates
df.drop_duplicates(subset=["order_id"], keep="last", inplace=True)

# Type conversion
df["date"] = pd.to_datetime(df["date"], format="%Y/%m/%d", errors="coerce")
df["amount"] = pd.to_numeric(df["amount"], errors="coerce")

# String cleaning
df["product"] = df["product"].str.strip().str.upper()

# Filter invalid rows
df = df[df["amount"] > 0]

# Bin into categories
df["tier"] = pd.cut(df["amount"],
                    bins=[0, 1000, 5000, float("inf")],
                    labels=["Standard", "Premium", "Enterprise"])

Pivot Tables

pandas pivot_table

import pandas as pd

df = pd.read_excel("sales.xlsx")

pivot = df.pivot_table(
    values="amount",
    index="region",
    columns="category",
    aggfunc="sum",
    fill_value=0,
    margins=True,
    margins_name="Total",
)

# Multiple metrics
pivot_multi = df.pivot_table(
    values=["amount", "qty"],
    index=["region", "rep"],
    aggfunc={"amount": "sum", "qty": "sum"},
)

pivot_multi.reset_index().to_excel("pivot.xlsx", index=False)

pandas + openpyxl Together

ExcelWriter multi-sheet output with style post-processing

import pandas as pd
import openpyxl
from openpyxl.styles import Font, PatternFill, Border, Side, Alignment

def apply_table_style(ws, header_color: str = "4472C4",
                       alt_color: str = "DCE6F1") -> None:
    thin = Side(style="thin")
    border = Border(left=thin, right=thin, top=thin, bottom=thin)
    for row_idx, row in enumerate(ws.iter_rows(), start=1):
        for cell in row:
            cell.border = border
            if row_idx == 1:
                cell.font = Font(bold=True, color="FFFFFF")
                cell.fill = PatternFill(fill_type="solid", fgColor=header_color)
                cell.alignment = Alignment(horizontal="center", vertical="center")
            elif row_idx % 2 == 0:
                cell.fill = PatternFill(fill_type="solid", fgColor=alt_color)
    for col in ws.columns:
        w = max((len(str(c.value or "")) for c in col), default=6)
        ws.column_dimensions[col[0].column_letter].width = min(w + 4, 35)

def generate_report(df_detail, df_pivot, output_path: str) -> None:
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        df_detail.to_excel(writer, sheet_name="Detail", index=False)
        df_pivot.to_excel(writer, sheet_name="Pivot")
        wb = writer.book
        for sheet in ["Detail", "Pivot"]:
            apply_table_style(wb[sheet])
    print(f"Report saved: {output_path}")

xlwings: Interact with Live Excel

Platform requirement: xlwings requires Microsoft Excel installed on the machine. Primarily Windows (limited macOS support). Not usable on Linux servers. Best for Windows desktop automation where you need to interact with Excel while it's running.

xlwings basics

import xlwings as xw   # pip install xlwings

# Connect to Excel
app = xw.App(visible=True)
wb = app.books.open("sales.xlsx")
ws = wb.sheets["Sheet1"]

# Read/write cells
ws["A1"].value = "Header"
value = ws["B2"].value    # Returns computed result (not formula string!)

# Bulk read as DataFrame
import pandas as pd
df = ws["A1:D100"].options(pd.DataFrame, header=1, index=False).value

# Write 2D data
ws["A1"].value = [["Product", "Qty", "Amount"],
                  ["Laptop", 5, 7500],
                  ["Mouse", 20, 598]]

# Call a VBA macro
wb.macro("Module1.RefreshReport")()

# Force recalculation
wb.app.calculate()

# Save and close
wb.save()
wb.close()
app.quit()

Project: Sales Data Automation Pipeline

A complete, production-ready pipeline: **merge multiple store Excel files

sales_pipeline.py — ~110 lines, fully runnable

"""
sales_pipeline.py
Sales data automation pipeline:
  1. Read all store Excel files from ./stores/
  2. Merge and clean data
  3. Build monthly pivot summary
  4. Write styled multi-sheet report

Dependencies: pip install pandas openpyxl
Usage: python sales_pipeline.py
"""
import sys
import datetime
import pandas as pd
import openpyxl
from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
from pathlib import Path

STORES_DIR = Path("./stores")
OUTPUT_FILE = Path("sales_report.xlsx")
REQUIRED_COLS = {"order_id", "date", "product", "qty", "unit_price", "amount"}

def load_store(filepath: Path) -> pd.DataFrame | None:
    try:
        df = pd.read_excel(filepath, dtype={"order_id": str})
    except Exception as e:
        print(f"  [ERROR] {filepath.name}: {e}")
        return None

    df.columns = df.columns.str.strip().str.lower()
    missing = REQUIRED_COLS - set(df.columns)
    if missing:
        print(f"  [SKIP]  {filepath.name} missing columns: {missing}")
        return None

    df["_store"] = filepath.stem
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["amount"] = pd.to_numeric(df["amount"], errors="coerce")
    df["qty"] = pd.to_numeric(df["qty"], errors="coerce").fillna(0).astype(int)
    before = len(df)
    df.dropna(subset=["order_id", "date", "amount"], inplace=True)
    df = df[df["amount"] > 0]
    df.drop_duplicates(subset=["order_id"], keep="last", inplace=True)
    print(f"  [OK]    {filepath.name}: {before} rows -> {len(df)} after cleaning")
    return df

def load_all(stores_dir: Path) -> pd.DataFrame:
    files = sorted(stores_dir.glob("*.xlsx"))
    if not files:
        sys.exit(f"No .xlsx files found in {stores_dir}")
    print(f"Found {len(files)} store file(s):\n")
    dfs = [df for f in files if (df := load_store(f)) is not None]
    if not dfs:
        sys.exit("All files failed to load.")
    return pd.concat(dfs, ignore_index=True)

def build_pivot(df: pd.DataFrame) -> pd.DataFrame:
    df["year_month"] = df["date"].dt.to_period("M").astype(str)
    return df.pivot_table(
        values="amount", index="_store", columns="year_month",
        aggfunc="sum", fill_value=0, margins=True, margins_name="Total"
    )

def top_products(df: pd.DataFrame, n: int = 10) -> pd.DataFrame:
    return (
        df.groupby("product")["amount"].sum()
        .sort_values(ascending=False).head(n)
        .reset_index().rename(columns={"amount": "total_amount"})
    )

def style_ws(ws) -> None:
    thin = Side(style="thin")
    bd = Border(left=thin, right=thin, top=thin, bottom=thin)
    hdr_fill = PatternFill(fill_type="solid", fgColor="4472C4")
    alt_fill = PatternFill(fill_type="solid", fgColor="DCE6F1")
    for row_idx, row in enumerate(ws.iter_rows(), start=1):
        for cell in row:
            cell.border = bd
            if row_idx == 1:
                cell.font = Font(bold=True, color="FFFFFF", size=11)
                cell.fill = hdr_fill
                cell.alignment = Alignment(horizontal="center", vertical="center")
            elif row_idx % 2 == 0:
                cell.fill = alt_fill
    for col in ws.columns:
        w = max((len(str(c.value or "")) for c in col), default=6)
        ws.column_dimensions[col[0].column_letter].width = min(w + 4, 35)

def write_report(df, pivot, top, output_path: Path) -> None:
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        df.drop(columns=["_store"], errors="ignore").to_excel(
            writer, sheet_name="All Records", index=False)
        pivot.to_excel(writer, sheet_name="Monthly Pivot")
        top.to_excel(writer, sheet_name="Top Products", index=False)

        wb = writer.book
        for sh in ["All Records", "Monthly Pivot", "Top Products"]:
            style_ws(wb[sh])

        cover = wb.create_sheet("Cover", 0)
        cover.sheet_view.showGridLines = False
        cover.merge_cells("B2:F2")
        cover["B2"] = "Sales Automation Report"
        cover["B2"].font = Font(size=20, bold=True, color="2F5496")
        cover["B2"].alignment = Alignment(horizontal="center")
        cover.merge_cells("B3:F3")
        cover["B3"] = f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M')}"
        cover["B3"].font = Font(size=11, color="666666")
        cover["B3"].alignment = Alignment(horizontal="center")
        cover.row_dimensions[2].height = 40

    print(f"\nReport saved: {output_path.resolve()}")

def main() -> None:
    print("=" * 50)
    print("Sales Data Automation Pipeline")
    print("=" * 50 + "\n")

    df = load_all(STORES_DIR)
    pivot = build_pivot(df)
    top = top_products(df)

    print(f"\nTotal revenue:  {df['amount'].sum():,.2f}")
    print(f"Total orders:   {len(df)}")
    print(f"Stores:         {df['_store'].nunique()}")
    print(f"Date range:     {df['date'].min().date()} to {df['date'].max().date()}")

    write_report(df, pivot, top, OUTPUT_FILE)

if __name__ == "__main__":
    main()

Chapter summary: openpyxl owns formatting and presentation; pandas owns computation and analysis. ExcelWriter bridges them seamlessly. xlwings enters only when you need to interact with a live Excel process or call VBA macros. Master this three-library combination and you can automate virtually any Excel workflow you'll encounter in a professional setting.

Previous

Next
Chapter 7: Python + Word

Rate this chapter

4.7 / 5 (50 ratings)