Chapter 16

Python + Excel — AI-Powered Data Analysis Upgrade

Chapter 16: Python + Excel Automation

Python is one of the world's most popular programming languages and one of the most powerful tools for Excel automation. When VBA reaches its limits — cross-workbook batch processing, hundreds of files, database integration, API calls — Python steps in. The approach stays the same: describe what you want, AI writes the code, you run it.

Why Use Python for Excel

Scenario Best Tool
Repetitive operations within one Excel file VBA
Data cleaning and transformation (no code) Power Query
Processing hundreds of files Python
Complex data analysis / statistics Python (pandas)
API or database integration Python
AI model integration Python
Scheduled unattended runs Python + Task Scheduler

Environment Setup in 5 Minutes

Install required libraries (run in terminal / command prompt)

pip install openpyxl # Read/write .xlsx files pip install pandas # Data processing powerhouse

Verify installation

python -c "import openpyxl, pandas; print('OK')"

Recommended editor for beginners: Jupyter Notebook (included with Anaconda) — run code cell by cell and see results immediately.

openpyxl — Working with Excel Files

import openpyxl

wb = openpyxl.load_workbook("data.xlsx") # open workbook ws = wb.active # active sheet ws = wb["Sheet1"] # by name

Read cells

value = ws["A1"].value value = ws.cell(row=1, column=1).value

Write cells

ws["B2"].value = "Hello" ws.cell(row=2, column=2).value = "Hello"

wb.save("output.xlsx")

Case 1: Merge 100 Excel Files into One

import openpyxl import os from pathlib import Path

def merge_excel_files(folder_path, output_path): folder = Path(folder_path) xlsx_files = list(folder.glob("*.xlsx")) output_wb = openpyxl.Workbook() output_ws = output_wb.active header_written = False total_rows = 0

for file_path in xlsx_files:
    if file_path.name == Path(output_path).name:
        continue
    wb = openpyxl.load_workbook(file_path, data_only=True)
    ws = wb.active
    file_rows = list(ws.iter_rows(values_only=True))
    if not file_rows:
        continue
    if not header_written:
        output_ws.append(list(file_rows[0]) + ["Source File"])
        header_written = True
    for row in file_rows[1:]:
        output_ws.append(list(row) + [file_path.name])
        total_rows += 1
    print(f"Processed: {file_path.name} ({len(file_rows)-1} rows)")
    wb.close()

output_wb.save(output_path)
print(f"Done! {len(xlsx_files)} files, {total_rows} total rows → {output_path}")

merge_excel_files("C:/Reports/", "C:/Reports/merged.xlsx")

Case 2: Generate a Formatted Report

import openpyxl from openpyxl.styles import Font, PatternFill, Border, Side, Alignment from openpyxl.utils import get_column_letter

def create_formatted_report(data, output_path): wb = openpyxl.Workbook() ws = wb.active for row in data: ws.append(row)

header_fill = PatternFill(start_color="1F4E79", end_color="1F4E79", fill_type="solid")
header_font = Font(bold=True, color="FFFFFF", size=11)
thin = Border(left=Side(style="thin"), right=Side(style="thin"),
              top=Side(style="thin"), bottom=Side(style="thin"))

for cell in ws[1]:
    cell.fill = header_fill
    cell.font = header_font
    cell.border = thin

ws.row_dimensions[1].height = 28

for row_idx in range(2, ws.max_row + 1):
    for cell in ws[row_idx]:
        cell.border = thin

for col_idx in range(1, ws.max_column + 1):
    col_letter = get_column_letter(col_idx)
    max_len = max((len(str(c.value)) for c in ws[col_letter] if c.value), default=10)
    ws.column_dimensions[col_letter].width = min(max_len + 4, 40)

wb.save(output_path)

Case 3: Batch Insert Images into Cells

import openpyxl from openpyxl.drawing.image import Image import os

def insert_images(excel_path, image_folder, output_path): wb = openpyxl.load_workbook(excel_path) ws = wb.active ws.column_dimensions["B"].width = 20

for row_idx in range(2, ws.max_row + 1):
    product_id = ws.cell(row=row_idx, column=1).value
    if not product_id:
        continue
    img_path = os.path.join(image_folder, f"{product_id}.png")
    if not os.path.exists(img_path):
        ws.cell(row=row_idx, column=2).value = "Image not found"
        continue
    img = Image(img_path)
    img.width, img.height = 120, 80
    ws.add_image(img, f"B{row_idx}")
    ws.row_dimensions[row_idx].height = 65

wb.save(output_path)

Case 4: Generate Individual Payslip Files

import openpyxl from openpyxl.styles import Font, PatternFill, Border, Side from openpyxl.utils import get_column_letter import os

def generate_payslips(source_path, output_folder): wb = openpyxl.load_workbook(source_path, data_only=True) ws = wb.active os.makedirs(output_folder, exist_ok=True) header = [c.value for c in ws[1]]

for row in ws.iter_rows(min_row=2, values_only=True):
    if not row[0]:
        continue
    out_wb = openpyxl.Workbook()
    out_ws = out_wb.active
    out_ws.append(header)
    out_ws.append(list(row))
    safe_name = str(row[0]).replace("/", "_")
    out_wb.save(os.path.join(output_folder, f"{safe_name}_payslip.xlsx"))

print(f"Payslips saved to: {output_folder}")

pandas — Data Processing Powerhouse

import pandas as pd

df = pd.read_excel("sales.xlsx") # DataFrame = in-memory table

Quick exploration

print(df.shape) # (rows, columns) print(df.columns) # column names print(df.head()) # first 5 rows print(df.describe()) # statistics summary

Filter rows

df[df["Dept"] == "Sales"] df[(df["Dept"] == "Sales") & (df["Salary"] > 10000)]

Save

df.to_excel("output.xlsx", index=False)

Case 1: Merge 12 CSV Files

import pandas as pd, glob, os

def merge_csv_files(folder_path, output_path): csv_files = glob.glob(os.path.join(folder_path, "*.csv")) all_dfs = [] for f in csv_files: df = pd.read_csv(f, encoding="utf-8-sig") df["Source"] = os.path.basename(f) all_dfs.append(df) print(f"Read: {os.path.basename(f)} ({len(df)} rows)") merged = pd.concat(all_dfs, ignore_index=True) merged.to_excel(output_path, index=False) print(f"Merged {len(all_dfs)} files → {len(merged)} rows → {output_path}")

merge_csv_files("C:/MonthlyData/", "C:/MonthlyData/merged.xlsx")

Case 2: Data Cleaning

import pandas as pd

df = pd.read_excel("raw_data.xlsx")

df = df.drop_duplicates() df["Dept"].fillna("Unassigned", inplace=True) df["Salary"] = pd.to_numeric(df["Salary"], errors="coerce") df["Salary"].fillna(df["Salary"].median(), inplace=True) df["Name"] = df["Name"].str.strip() df["HireDate"] = pd.to_datetime(df["HireDate"], errors="coerce") df = df.dropna(subset=["Name", "Salary"])

df.to_excel("cleaned_data.xlsx", index=False)

Case 3: Group Statistics (like a Pivot Table)

import pandas as pd df = pd.read_excel("sales_data.xlsx")

summary = df.groupby("Department")["Sales"].agg( Total="sum", Average="mean", Count="count", Max="max" ).round(2).reset_index()

with pd.ExcelWriter("report.xlsx", engine="openpyxl") as writer: summary.to_excel(writer, sheet_name="Dept Summary", index=False) df.to_excel(writer, sheet_name="Raw Data", index=False)

Case 4: Multi-Condition Filter and Export

import pandas as pd df = pd.read_excel("employees.xlsx") df["HireDate"] = pd.to_datetime(df["HireDate"])

result = df[ (df["Dept"] == "Sales") & (df["Salary"] > 8000) & (df["HireDate"].dt.year == 2023) ] result.to_excel("filtered.xlsx", index=False) print(f"{len(result)} records exported")

Full Automation Workflows

Case 1: Daily Pipeline — Download → Process → Report → Email

import pandas as pd import smtplib from email.mime.multipart import MIMEMultipart from email.mime.base import MIMEBase from email.mime.text import MIMEText from email import encoders import os from datetime import date

def daily_report_pipeline(): today = date.today().strftime("%Y-%m-%d") source_path = f"C:/DailyData/raw_{today}.xlsx"

if not os.path.exists(source_path):
    print(f"Source file not found: {source_path}")
    return

df = pd.read_excel(source_path)
df = df.drop_duplicates()
df["Sales"] = pd.to_numeric(df["Sales"], errors="coerce").fillna(0)

summary = df.groupby("Salesperson")["Sales"].agg(
    TotalSales="sum", OrderCount="count"
).sort_values("TotalSales", ascending=False).reset_index()
summary.insert(0, "Rank", range(1, len(summary) + 1))

output_path = f"C:/Reports/daily_{today}.xlsx"
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    summary.to_excel(writer, sheet_name="Leaderboard", index=False)
    df.to_excel(writer, sheet_name="Raw Data", index=False)

print(f"Report saved: {output_path}")
# Add email sending here (see Case 1 above for send_report_email function)

daily_report_pipeline()

Case 2: Batch Process All Files in a Folder

import pandas as pd from pathlib import Path

def batch_process(input_folder, output_folder): Path(output_folder).mkdir(parents=True, exist_ok=True) files = list(Path(input_folder).glob("*.xlsx")) ok, fail = 0, 0

for fp in files:
    try:
        df = pd.read_excel(fp)
        # --- your processing logic here ---
        df = df.drop_duplicates()
        df["Total"] = pd.to_numeric(df["Qty"], errors="coerce") * pd.to_numeric(df["Price"], errors="coerce")
        df = df[df["Total"].notna() & (df["Total"] > 0)]
        # --- end logic ---
        df.to_excel(Path(output_folder) / fp.name, index=False)
        ok += 1
        print(f"  OK: {fp.name} ({len(df)} rows)")
    except Exception as e:
        fail += 1
        print(f"  FAIL: {fp.name} — {e}")

print(f"\nDone: {ok} succeeded, {fail} failed")

batch_process("C:/Input/", "C:/Output/")

Case 3: Compare Two Excel Files for Differences

import pandas as pd

def compare_excel(file1, file2, key_col, output): df1 = pd.read_excel(file1).set_index(key_col) df2 = pd.read_excel(file2).set_index(key_col)

new_rows = df2.loc[df2.index.difference(df1.index)].copy()
new_rows["Change"] = "Added"
deleted_rows = df1.loc[df1.index.difference(df2.index)].copy()
deleted_rows["Change"] = "Deleted"

common = df1.index.intersection(df2.index)
changes = []
for idx in common:
    for col in df1.columns:
        if col in df2.columns and df1.at[idx, col] != df2.at[idx, col]:
            changes.append({key_col: idx, "Field": col,
                            "Old": df1.at[idx, col], "New": df2.at[idx, col]})

with pd.ExcelWriter(output, engine="openpyxl") as writer:
    if not new_rows.empty:
        new_rows.reset_index().to_excel(writer, sheet_name="Added", index=False)
    if not deleted_rows.empty:
        deleted_rows.reset_index().to_excel(writer, sheet_name="Deleted", index=False)
    if changes:
        pd.DataFrame(changes).to_excel(writer, sheet_name="Modified", index=False)

print(f"Added: {len(new_rows)}, Deleted: {len(deleted_rows)}, Modified: {len(changes)} fields")
print(f"Report: {output}")

compare_excel("Q1.xlsx", "Q2.xlsx", "EmployeeID", "diff.xlsx")

Common Errors and Debugging

Error Typical Cause Quick Fix
FileNotFoundError Wrong path or file doesn't exist Check path; on Windows use r"C:\path" or "C:/path"
KeyError: 'ColumnName' Column doesn't exist in DataFrame print(df.columns) to check exact names (spaces/capitalization)
UnicodeDecodeError Wrong encoding for CSV Add encoding="utf-8-sig" or encoding="gbk"
PermissionError File open in Excel Close the file in Excel first
ModuleNotFoundError Library not installed pip install library-name

Python + Excel Cheat Sheet

Task Code
Read Excel (pandas) df = pd.read_excel("file.xlsx")
Read specific sheet df = pd.read_excel("f.xlsx", sheet_name="S1")
Save to Excel df.to_excel("out.xlsx", index=False)
Write multiple sheets with pd.ExcelWriter("out.xlsx") as w: df.to_excel(w, sheet_name="S1")
Read CSV (with BOM) df = pd.read_csv("f.csv", encoding="utf-8-sig")
Filter rows df[df["Col"] == "value"]
Drop duplicates df.drop_duplicates()
Fill nulls df["Col"].fillna("default")
Group and aggregate df.groupby("Col")["Val"].agg(["sum","mean","count"])
Merge DataFrames pd.concat([df1,df2], ignore_index=True)
Left join pd.merge(df1, df2, on="ID", how="left")
Type conversion pd.to_numeric(df["Col"], errors="coerce")
Parse dates pd.to_datetime(df["Date"])
Find all Excel files glob.glob(os.path.join(folder, "*.xlsx"))
Open openpyxl workbook wb = openpyxl.load_workbook("f.xlsx")
Read/write cell ws["A1"].value = "x" / val = ws["A1"].value
Iterate rows for row in ws.iter_rows(min_row=2, values_only=True):

Previous ← Chapter 15: VBA Basics Next Chapter 17: AI-Driven Excel Workflows →

Rate this chapter
4.5  / 5  (14 ratings)

💬 Comments