Python + Excel — AI-Powered Data Analysis Upgrade
Chapter 16: Python + Excel Automation
Python is one of the world's most popular programming languages and one of the most powerful tools for Excel automation. When VBA reaches its limits — cross-workbook batch processing, hundreds of files, database integration, API calls — Python steps in. The approach stays the same: describe what you want, AI writes the code, you run it.
Why Use Python for Excel
| Scenario | Best Tool |
|---|---|
| Repetitive operations within one Excel file | VBA |
| Data cleaning and transformation (no code) | Power Query |
| Processing hundreds of files | Python |
| Complex data analysis / statistics | Python (pandas) |
| API or database integration | Python |
| AI model integration | Python |
| Scheduled unattended runs | Python + Task Scheduler |
Environment Setup in 5 Minutes
Install required libraries (run in terminal / command prompt)
pip install openpyxl # Read/write .xlsx files pip install pandas # Data processing powerhouse
Verify installation
python -c "import openpyxl, pandas; print('OK')"
Recommended editor for beginners: Jupyter Notebook (included with Anaconda) — run code cell by cell and see results immediately.
openpyxl — Working with Excel Files
import openpyxl
wb = openpyxl.load_workbook("data.xlsx") # open workbook ws = wb.active # active sheet ws = wb["Sheet1"] # by name
Read cells
value = ws["A1"].value value = ws.cell(row=1, column=1).value
Write cells
ws["B2"].value = "Hello" ws.cell(row=2, column=2).value = "Hello"
wb.save("output.xlsx")
Case 1: Merge 100 Excel Files into One
import openpyxl import os from pathlib import Path
def merge_excel_files(folder_path, output_path): folder = Path(folder_path) xlsx_files = list(folder.glob("*.xlsx")) output_wb = openpyxl.Workbook() output_ws = output_wb.active header_written = False total_rows = 0
for file_path in xlsx_files:
if file_path.name == Path(output_path).name:
continue
wb = openpyxl.load_workbook(file_path, data_only=True)
ws = wb.active
file_rows = list(ws.iter_rows(values_only=True))
if not file_rows:
continue
if not header_written:
output_ws.append(list(file_rows[0]) + ["Source File"])
header_written = True
for row in file_rows[1:]:
output_ws.append(list(row) + [file_path.name])
total_rows += 1
print(f"Processed: {file_path.name} ({len(file_rows)-1} rows)")
wb.close()
output_wb.save(output_path)
print(f"Done! {len(xlsx_files)} files, {total_rows} total rows → {output_path}")
merge_excel_files("C:/Reports/", "C:/Reports/merged.xlsx")
Case 2: Generate a Formatted Report
import openpyxl from openpyxl.styles import Font, PatternFill, Border, Side, Alignment from openpyxl.utils import get_column_letter
def create_formatted_report(data, output_path): wb = openpyxl.Workbook() ws = wb.active for row in data: ws.append(row)
header_fill = PatternFill(start_color="1F4E79", end_color="1F4E79", fill_type="solid")
header_font = Font(bold=True, color="FFFFFF", size=11)
thin = Border(left=Side(style="thin"), right=Side(style="thin"),
top=Side(style="thin"), bottom=Side(style="thin"))
for cell in ws[1]:
cell.fill = header_fill
cell.font = header_font
cell.border = thin
ws.row_dimensions[1].height = 28
for row_idx in range(2, ws.max_row + 1):
for cell in ws[row_idx]:
cell.border = thin
for col_idx in range(1, ws.max_column + 1):
col_letter = get_column_letter(col_idx)
max_len = max((len(str(c.value)) for c in ws[col_letter] if c.value), default=10)
ws.column_dimensions[col_letter].width = min(max_len + 4, 40)
wb.save(output_path)
Case 3: Batch Insert Images into Cells
import openpyxl from openpyxl.drawing.image import Image import os
def insert_images(excel_path, image_folder, output_path): wb = openpyxl.load_workbook(excel_path) ws = wb.active ws.column_dimensions["B"].width = 20
for row_idx in range(2, ws.max_row + 1):
product_id = ws.cell(row=row_idx, column=1).value
if not product_id:
continue
img_path = os.path.join(image_folder, f"{product_id}.png")
if not os.path.exists(img_path):
ws.cell(row=row_idx, column=2).value = "Image not found"
continue
img = Image(img_path)
img.width, img.height = 120, 80
ws.add_image(img, f"B{row_idx}")
ws.row_dimensions[row_idx].height = 65
wb.save(output_path)
Case 4: Generate Individual Payslip Files
import openpyxl from openpyxl.styles import Font, PatternFill, Border, Side from openpyxl.utils import get_column_letter import os
def generate_payslips(source_path, output_folder): wb = openpyxl.load_workbook(source_path, data_only=True) ws = wb.active os.makedirs(output_folder, exist_ok=True) header = [c.value for c in ws[1]]
for row in ws.iter_rows(min_row=2, values_only=True):
if not row[0]:
continue
out_wb = openpyxl.Workbook()
out_ws = out_wb.active
out_ws.append(header)
out_ws.append(list(row))
safe_name = str(row[0]).replace("/", "_")
out_wb.save(os.path.join(output_folder, f"{safe_name}_payslip.xlsx"))
print(f"Payslips saved to: {output_folder}")
pandas — Data Processing Powerhouse
import pandas as pd
df = pd.read_excel("sales.xlsx") # DataFrame = in-memory table
Quick exploration
print(df.shape) # (rows, columns) print(df.columns) # column names print(df.head()) # first 5 rows print(df.describe()) # statistics summary
Filter rows
df[df["Dept"] == "Sales"] df[(df["Dept"] == "Sales") & (df["Salary"] > 10000)]
Save
df.to_excel("output.xlsx", index=False)
Case 1: Merge 12 CSV Files
import pandas as pd, glob, os
def merge_csv_files(folder_path, output_path): csv_files = glob.glob(os.path.join(folder_path, "*.csv")) all_dfs = [] for f in csv_files: df = pd.read_csv(f, encoding="utf-8-sig") df["Source"] = os.path.basename(f) all_dfs.append(df) print(f"Read: {os.path.basename(f)} ({len(df)} rows)") merged = pd.concat(all_dfs, ignore_index=True) merged.to_excel(output_path, index=False) print(f"Merged {len(all_dfs)} files → {len(merged)} rows → {output_path}")
merge_csv_files("C:/MonthlyData/", "C:/MonthlyData/merged.xlsx")
Case 2: Data Cleaning
import pandas as pd
df = pd.read_excel("raw_data.xlsx")
df = df.drop_duplicates() df["Dept"].fillna("Unassigned", inplace=True) df["Salary"] = pd.to_numeric(df["Salary"], errors="coerce") df["Salary"].fillna(df["Salary"].median(), inplace=True) df["Name"] = df["Name"].str.strip() df["HireDate"] = pd.to_datetime(df["HireDate"], errors="coerce") df = df.dropna(subset=["Name", "Salary"])
df.to_excel("cleaned_data.xlsx", index=False)
Case 3: Group Statistics (like a Pivot Table)
import pandas as pd df = pd.read_excel("sales_data.xlsx")
summary = df.groupby("Department")["Sales"].agg( Total="sum", Average="mean", Count="count", Max="max" ).round(2).reset_index()
with pd.ExcelWriter("report.xlsx", engine="openpyxl") as writer: summary.to_excel(writer, sheet_name="Dept Summary", index=False) df.to_excel(writer, sheet_name="Raw Data", index=False)
Case 4: Multi-Condition Filter and Export
import pandas as pd df = pd.read_excel("employees.xlsx") df["HireDate"] = pd.to_datetime(df["HireDate"])
result = df[ (df["Dept"] == "Sales") & (df["Salary"] > 8000) & (df["HireDate"].dt.year == 2023) ] result.to_excel("filtered.xlsx", index=False) print(f"{len(result)} records exported")
Full Automation Workflows
Case 1: Daily Pipeline — Download → Process → Report → Email
import pandas as pd import smtplib from email.mime.multipart import MIMEMultipart from email.mime.base import MIMEBase from email.mime.text import MIMEText from email import encoders import os from datetime import date
def daily_report_pipeline(): today = date.today().strftime("%Y-%m-%d") source_path = f"C:/DailyData/raw_{today}.xlsx"
if not os.path.exists(source_path):
print(f"Source file not found: {source_path}")
return
df = pd.read_excel(source_path)
df = df.drop_duplicates()
df["Sales"] = pd.to_numeric(df["Sales"], errors="coerce").fillna(0)
summary = df.groupby("Salesperson")["Sales"].agg(
TotalSales="sum", OrderCount="count"
).sort_values("TotalSales", ascending=False).reset_index()
summary.insert(0, "Rank", range(1, len(summary) + 1))
output_path = f"C:/Reports/daily_{today}.xlsx"
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
summary.to_excel(writer, sheet_name="Leaderboard", index=False)
df.to_excel(writer, sheet_name="Raw Data", index=False)
print(f"Report saved: {output_path}")
# Add email sending here (see Case 1 above for send_report_email function)
daily_report_pipeline()
Case 2: Batch Process All Files in a Folder
import pandas as pd from pathlib import Path
def batch_process(input_folder, output_folder): Path(output_folder).mkdir(parents=True, exist_ok=True) files = list(Path(input_folder).glob("*.xlsx")) ok, fail = 0, 0
for fp in files:
try:
df = pd.read_excel(fp)
# --- your processing logic here ---
df = df.drop_duplicates()
df["Total"] = pd.to_numeric(df["Qty"], errors="coerce") * pd.to_numeric(df["Price"], errors="coerce")
df = df[df["Total"].notna() & (df["Total"] > 0)]
# --- end logic ---
df.to_excel(Path(output_folder) / fp.name, index=False)
ok += 1
print(f" OK: {fp.name} ({len(df)} rows)")
except Exception as e:
fail += 1
print(f" FAIL: {fp.name} — {e}")
print(f"\nDone: {ok} succeeded, {fail} failed")
batch_process("C:/Input/", "C:/Output/")
Case 3: Compare Two Excel Files for Differences
import pandas as pd
def compare_excel(file1, file2, key_col, output): df1 = pd.read_excel(file1).set_index(key_col) df2 = pd.read_excel(file2).set_index(key_col)
new_rows = df2.loc[df2.index.difference(df1.index)].copy()
new_rows["Change"] = "Added"
deleted_rows = df1.loc[df1.index.difference(df2.index)].copy()
deleted_rows["Change"] = "Deleted"
common = df1.index.intersection(df2.index)
changes = []
for idx in common:
for col in df1.columns:
if col in df2.columns and df1.at[idx, col] != df2.at[idx, col]:
changes.append({key_col: idx, "Field": col,
"Old": df1.at[idx, col], "New": df2.at[idx, col]})
with pd.ExcelWriter(output, engine="openpyxl") as writer:
if not new_rows.empty:
new_rows.reset_index().to_excel(writer, sheet_name="Added", index=False)
if not deleted_rows.empty:
deleted_rows.reset_index().to_excel(writer, sheet_name="Deleted", index=False)
if changes:
pd.DataFrame(changes).to_excel(writer, sheet_name="Modified", index=False)
print(f"Added: {len(new_rows)}, Deleted: {len(deleted_rows)}, Modified: {len(changes)} fields")
print(f"Report: {output}")
compare_excel("Q1.xlsx", "Q2.xlsx", "EmployeeID", "diff.xlsx")
Common Errors and Debugging
| Error | Typical Cause | Quick Fix |
|---|---|---|
| FileNotFoundError | Wrong path or file doesn't exist | Check path; on Windows use r"C:\path" or "C:/path" |
| KeyError: 'ColumnName' | Column doesn't exist in DataFrame | print(df.columns) to check exact names (spaces/capitalization) |
| UnicodeDecodeError | Wrong encoding for CSV | Add encoding="utf-8-sig" or encoding="gbk" |
| PermissionError | File open in Excel | Close the file in Excel first |
| ModuleNotFoundError | Library not installed | pip install library-name |
Python + Excel Cheat Sheet
| Task | Code |
|---|---|
| Read Excel (pandas) | df = pd.read_excel("file.xlsx") |
| Read specific sheet | df = pd.read_excel("f.xlsx", sheet_name="S1") |
| Save to Excel | df.to_excel("out.xlsx", index=False) |
| Write multiple sheets | with pd.ExcelWriter("out.xlsx") as w: df.to_excel(w, sheet_name="S1") |
| Read CSV (with BOM) | df = pd.read_csv("f.csv", encoding="utf-8-sig") |
| Filter rows | df[df["Col"] == "value"] |
| Drop duplicates | df.drop_duplicates() |
| Fill nulls | df["Col"].fillna("default") |
| Group and aggregate | df.groupby("Col")["Val"].agg(["sum","mean","count"]) |
| Merge DataFrames | pd.concat([df1,df2], ignore_index=True) |
| Left join | pd.merge(df1, df2, on="ID", how="left") |
| Type conversion | pd.to_numeric(df["Col"], errors="coerce") |
| Parse dates | pd.to_datetime(df["Date"]) |
| Find all Excel files | glob.glob(os.path.join(folder, "*.xlsx")) |
| Open openpyxl workbook | wb = openpyxl.load_workbook("f.xlsx") |
| Read/write cell | ws["A1"].value = "x" / val = ws["A1"].value |
| Iterate rows | for row in ws.iter_rows(min_row=2, values_only=True): |
Previous ← Chapter 15: VBA Basics Next Chapter 17: AI-Driven Excel Workflows →