Python + PDF — Extract, Merge, and Generate PDFs
Chapter 8: PDF Automation — Split, Merge, Extract & Generate
PDF is the standard format of the business world: contracts, invoices, reports, specifications — nearly everything is delivered as PDF. But PDF was never designed for machine processing. Extracting data, splitting and merging in bulk, generating formatted reports — these needs have driven the development of a mature Python toolchain. This chapter covers the four core PDF scenarios: extraction, merge/split, table extraction, and generation, culminating in a complete financial report processing pipeline.
PDF Library Ecosystem: Which Tool for What
| Library | Best For | Install |
|---|---|---|
| pdfplumber | Text and table extraction from digital PDFs | pip install pdfplumber |
| pypdf | Merge, split, rotate, encrypt/decrypt | pip install pypdf |
| reportlab | Generate PDF from scratch with precise layout control | pip install reportlab |
| weasyprint | HTML template to PDF (more flexible layouts) | pip install weasyprint |
| tabula-py | Extract PDF tables into pandas DataFrames | pip install tabula-py |
| PyMuPDF (fitz) | High-performance extraction, image conversion, annotations | pip install pymupdf |
Decision guide: Use pdfplumber to read, reportlab or weasyprint to generate, pypdf to merge/split, tabula-py to extract tables. Scanned PDFs (image-based) require OCR tools — see Chapter 12 for AI-powered OCR approaches.
PDF Text Extraction
Basic Extraction with pdfplumber
extract_text.py
import pdfplumber
from pathlib import Path
def extract_pdf_text(pdf_path: str) -> str:
all_text = []
with pdfplumber.open(pdf_path) as pdf:
print(f'Total pages: {len(pdf.pages)}')
for page_num, page in enumerate(pdf.pages, start=1):
text = page.extract_text()
if text:
all_text.append(f'--- Page {page_num} ---\n{text}')
else:
all_text.append(f'--- Page {page_num}: No extractable text (scanned image?) ---')
return '\n\n'.join(all_text)
text = extract_pdf_text('annual_report.pdf')
print(text[:500]) # Preview first 500 chars
Path('extracted_text.txt').write_text(text, encoding='utf-8')
Batch Extraction: 100 PDFs to Excel
batch_extract_to_excel.py
import pdfplumber
import pandas as pd
import re
from pathlib import Path
def extract_key_info(pdf_path: Path) -> dict:
"""Extract key fields from an invoice PDF using regex."""
info = {
'filename': pdf_path.name,
'invoice_no': None,
'amount': None,
'date': None,
}
with pdfplumber.open(pdf_path) as pdf:
text = pdf.pages[0].extract_text() or ''
m = re.search(r'Invoice\s*#?\s*:?\s*([A-Z0-9\-]{5,20})', text, re.I)
if m:
info['invoice_no'] = m.group(1)
m = re.search(r'Total\s*:?\s*\$?([\d,]+\.?\d*)', text, re.I)
if m:
info['amount'] = float(m.group(1).replace(',', ''))
m = re.search(r'(\d{4})[/\-](\d{2})[/\-](\d{2})', text)
if m:
info['date'] = f"{m.group(1)}-{m.group(2)}-{m.group(3)}"
return info
pdf_dir = Path('invoices')
results, errors = [], []
for pdf_file in sorted(pdf_dir.glob('*.pdf')):
try:
results.append(extract_key_info(pdf_file))
print(f' [OK] {pdf_file.name}')
except Exception as e:
errors.append({'filename': pdf_file.name, 'error': str(e)})
print(f' [Error] {pdf_file.name}: {e}')
pd.DataFrame(results).to_excel('invoice_summary.xlsx', index=False)
print(f'\nDone: {len(results)} succeeded, {len(errors)} failed')
Scanned PDFs: If
page.extract_text()returns empty or garbled text, the PDF contains scanned images rather than selectable text. You need OCR — eitherpytesseract(requires Tesseract engine installed locally) or a cloud OCR API. Chapter 12 covers AI API integration for higher-accuracy OCR.
PDF Merge and Split
Merge Multiple PDFs
merge_pdfs.py
from pypdf import PdfWriter
from pathlib import Path
def merge_pdfs(input_files: list, output_path: str):
writer = PdfWriter()
for pdf_path in input_files:
writer.append(str(pdf_path))
print(f' Added: {Path(pdf_path).name}')
with open(output_path, 'wb') as f:
writer.write(f)
print(f'Merged {len(input_files)} files -> {output_path}')
pdf_files = sorted(Path('monthly_reports/2024-12').glob('*.pdf'))
merge_pdfs(pdf_files, 'monthly_reports/2024-12_combined.pdf')
Split by Page Range
split_pdf.py
from pypdf import PdfReader, PdfWriter
from pathlib import Path
def split_pdf_by_range(input_path: str, ranges: list, output_dir: str):
"""
ranges: [(start_page, end_page, output_filename), ...]
Page numbers are 1-indexed (human-readable).
"""
reader = PdfReader(input_path)
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
for start_page, end_page, filename in ranges:
writer = PdfWriter()
for page_num in range(start_page - 1, end_page): # 0-indexed internally
writer.add_page(reader.pages[page_num])
out_path = output_dir / filename
with open(out_path, 'wb') as f:
writer.write(f)
print(f' Pages {start_page}-{end_page} -> {filename}')
split_pdf_by_range(
'annual_report_2024.pdf',
ranges=[
(1, 25, 'Q1_report.pdf'),
(26, 50, 'Q2_report.pdf'),
(51, 75, 'Q3_report.pdf'),
(76, 100, 'Q4_report.pdf'),
],
output_dir='split_reports',
)
Extract Cover Pages from Multiple Reports
extract_covers.py
from pypdf import PdfReader, PdfWriter
from pathlib import Path
def extract_cover_pages(pdf_dir: str, output_path: str):
"""Extract page 1 from every PDF in a directory, merge into one file."""
writer = PdfWriter()
for pdf_file in sorted(Path(pdf_dir).glob('*.pdf')):
reader = PdfReader(str(pdf_file))
if reader.pages:
writer.add_page(reader.pages[0])
print(f' Cover extracted: {pdf_file.name}')
with open(output_path, 'wb') as f:
writer.write(f)
print(f'Cover summary saved: {output_path}')
extract_cover_pages('monthly_reports/', 'all_covers_2024.pdf')
PDF Table Extraction
pdfplumber for Well-Formatted Tables
extract_table_pdfplumber.py
import pdfplumber
import pandas as pd
def extract_tables_from_pdf(pdf_path: str) -> list:
all_tables = []
with pdfplumber.open(pdf_path) as pdf:
for page_num, page in enumerate(pdf.pages, start=1):
for idx, table in enumerate(page.extract_tables()):
if not table:
continue
df = pd.DataFrame(table[1:], columns=table[0])
df['_page'] = page_num
all_tables.append(df)
print(f' Page {page_num}, table {idx+1}: {len(df)} rows')
return all_tables
tables = extract_tables_from_pdf('financial_report.pdf')
if tables:
with pd.ExcelWriter('extracted_tables.xlsx') as writer:
for i, df in enumerate(tables):
df.to_excel(writer, sheet_name=f'Table_{i+1}', index=False)
print(f'Extracted {len(tables)} tables to extracted_tables.xlsx')
tabula-py for More Complex Tables
extract_table_tabula.py
# pip install tabula-py
# Requires Java Runtime (JRE): https://www.java.com/download/
import tabula
import pandas as pd
# Extract all tables from all pages
tables = tabula.read_pdf(
'financial_report.pdf',
pages='all',
multiple_tables=True,
lattice=True, # Use for bordered tables
# stream=True, # Use for whitespace-separated tables without borders
)
print(f'Found {len(tables)} tables')
combined = pd.concat(tables, ignore_index=True)
combined.to_excel('all_tables.xlsx', index=False)
# Handle tables that span multiple pages
cross_page = tabula.read_pdf(
'report.pdf',
pages='3-5', # Pages 3 through 5 contain one continuous table
multiple_tables=False,
lattice=True,
)[0]
print(cross_page.head())
Cleaning Extracted Data
clean_extracted_data.py
import pandas as pd
def clean_pdf_table(df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
df.dropna(how='all', inplace=True)
df.dropna(axis=1, how='all', inplace=True)
df.columns = [str(c).strip() for c in df.columns]
df = df.applymap(lambda x: str(x).strip() if pd.notna(x) else x)
# Convert currency strings like "1,234.56" to float
for col in df.columns:
sample = df[col].dropna().head(5)
if sample.astype(str).str.match(r'^[\d,]+\.?\d*$').any():
df[col] = pd.to_numeric(
df[col].astype(str).str.replace(',', ''), errors='coerce'
)
return df
import tabula
tables = tabula.read_pdf('report.pdf', pages='all', multiple_tables=True)
cleaned = [clean_pdf_table(t) for t in tables]
with pd.ExcelWriter('clean_tables.xlsx') as writer:
for i, df in enumerate(cleaned):
df.to_excel(writer, sheet_name=f'Sheet{i+1}', index=False)
Generating PDF Reports
Option 1: reportlab — Fine-Grained Control
generate_pdf_reportlab.py
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import cm
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
def generate_sales_report(data: list, output_path: str):
doc = SimpleDocTemplate(output_path, pagesize=A4,
rightMargin=2*cm, leftMargin=2*cm,
topMargin=2*cm, bottomMargin=2*cm)
styles = getSampleStyleSheet()
elements = [
Paragraph('Monthly Sales Report', styles['Title']),
Spacer(1, 0.5*cm),
Paragraph(f'{len(data)} records this month.', styles['Normal']),
Spacer(1, 0.5*cm),
]
table_data = [['Product', 'Units', 'Price', 'Revenue']]
total = 0
for row in data:
rev = row['qty'] * row['price']
total += rev
table_data.append([row['product'], str(row['qty']),
f"${row['price']:,.2f}", f"${rev:,.2f}"])
table_data.append(['Total', '', '', f"${total:,.2f}"])
t = Table(table_data, colWidths=[7*cm, 3*cm, 3.5*cm, 3.5*cm])
t.setStyle(TableStyle([
('BACKGROUND', (0,0), (-1,0), colors.HexColor('#1f2d3d')),
('TEXTCOLOR', (0,0), (-1,0), colors.white),
('GRID', (0,0), (-1,-1), 0.5, colors.grey),
('FONTNAME', (0,-1), (-1,-1), 'Helvetica-Bold'),
('ROWBACKGROUNDS', (0,1), (-1,-2), [colors.white, colors.HexColor('#f5f7fa')]),
]))
elements.append(t)
doc.build(elements)
print(f'Report generated: {output_path}')
generate_sales_report([
{'product': 'Laptop', 'qty': 50, 'price': 999},
{'product': 'Wireless Mouse','qty': 200, 'price': 29},
], 'sales_report.pdf')
Option 2: weasyprint — HTML Template to PDF (Recommended)
generate_pdf_weasyprint.py
# pip install weasyprint jinja2
from jinja2 import Template
from weasyprint import HTML
from datetime import date
HTML_TEMPLATE = """
<!DOCTYPE html><html><head><meta charset="utf-8">
<style>
body{font-family:Arial,sans-serif;margin:2cm}
h1{color:#1f2d3d;border-bottom:2px solid #6c63ff;padding-bottom:8px}
table{width:100%;border-collapse:collapse;margin-top:20px;font-size:12px}
th{background:#1f2d3d;color:#fff;padding:10px 14px;text-align:right}
th:first-child{text-align:left}
td{padding:8px 14px;border-bottom:1px solid #e2e8f0;text-align:right}
td:first-child{text-align:left}
.total td{font-weight:bold;background:#eef2ff}
</style></head><body>
<h1>{{"{{"}}title{{"}}"}}</h1>
<p>Generated: {{"{{"}}report_date{{"}}"}} | {{"{{"}}rows|length{{"}}"}} records</p>
<table>
<tr><th>Product</th><th>Qty</th><th>Price</th><th>Revenue</th></tr>
{% for r in rows %}
<tr>
<td>{{"{{"}}r.product{{"}}"}}</td><td>{{"{{"}}r.qty{{"}}"}}</td>
<td>${{"{{"}} "%.2f"|format(r.price) {{"}}"}}</td>
<td>${{"{{"}} "%.2f"|format(r.qty * r.price) {{"}}"}}</td>
</tr>
{% endfor %}
<tr class="total">
<td colspan="3">Total</td>
<td>${{"{{"}} "%.2f"|format(total) {{"}}"}}</td>
</tr>
</table>
</body></html>
"""
def generate_pdf(data: list, output_path: str):
total = sum(r['qty'] * r['price'] for r in data)
html = Template(HTML_TEMPLATE).render(
title='Monthly Sales Report',
report_date=date.today().strftime('%B %d, %Y'),
rows=data, total=total,
)
HTML(string=html).write_pdf(output_path)
print(f'PDF generated: {output_path}')
generate_pdf([
{'product': 'Laptop', 'qty': 50, 'price': 999},
{'product': 'Wireless Mouse','qty': 200, 'price': 29},
], 'report.pdf')
reportlab vs weasyprint: reportlab has a steeper learning curve but gives precise layout control — ideal for complex, precision-critical documents. weasyprint uses familiar HTML/CSS, making it faster to develop and easier to maintain. Both support non-Latin characters with appropriate fonts specified.
PDF Encryption and Permissions
Adding Password Protection
encrypt_pdf.py
from pypdf import PdfReader, PdfWriter
def encrypt_pdf(input_path: str, output_path: str,
user_password: str, owner_password: str = None):
reader = PdfReader(input_path)
writer = PdfWriter()
for page in reader.pages:
writer.add_page(page)
writer.encrypt(
user_password=user_password,
owner_password=owner_password or user_password,
use_128bit=True,
)
with open(output_path, 'wb') as f:
writer.write(f)
print(f'Encrypted: {output_path}')
# Single file
encrypt_pdf('contract.pdf', 'contract_encrypted.pdf', 'Client@2024')
# Batch with per-client passwords
import pandas as pd
df = pd.read_excel('clients.xlsx')
for _, row in df.iterrows():
encrypt_pdf(
f"contracts/{row['Name']}_contract.pdf",
f"contracts_enc/{row['Name']}_contract_enc.pdf",
str(row['Password']),
)
Batch Decryption
decrypt_pdf.py
from pypdf import PdfReader, PdfWriter
from pathlib import Path
def decrypt_pdf(input_path: str, output_path: str, password: str) -> bool:
try:
reader = PdfReader(input_path)
if reader.is_encrypted:
if reader.decrypt(password) == 0:
print(f'Wrong password: {input_path}')
return False
writer = PdfWriter()
for page in reader.pages:
writer.add_page(page)
with open(output_path, 'wb') as f:
writer.write(f)
return True
except Exception as e:
print(f'Failed {input_path}: {e}')
return False
password = 'CompanyInternal2024'
for pdf_file in Path('encrypted').glob('*.pdf'):
out = Path('decrypted') / pdf_file.name
out.parent.mkdir(exist_ok=True)
ok = decrypt_pdf(str(pdf_file), str(out), password)
print(f' {"[OK]" if ok else "[Failed]"} {pdf_file.name}')
Project: Financial Report Processing Pipeline
Monthly scenario: receive 10–20 department PDF reports, extract key financial metrics from each, consolidate into an Excel summary, and generate a one-page PDF digest for management.
financial_pipeline.py — Complete pipeline
"""
Financial Report Processing Pipeline
Requirements: pip install pdfplumber pypdf pandas openpyxl weasyprint jinja2
"""
import pdfplumber
import pandas as pd
from pathlib import Path
from datetime import date
import re, logging
from jinja2 import Template
from weasyprint import HTML
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)
PDF_DIR = Path('reports/monthly')
OUTPUT_DIR = Path('output')
PERIOD = 'December 2024'
def extract_financial_data(pdf_path: Path) -> dict:
dept = pdf_path.stem.replace('_report', '')
result = {'Department': dept, 'Revenue': None, 'Cost': None, 'Profit': None, 'Notes': ''}
with pdfplumber.open(pdf_path) as pdf:
text = '\n'.join(p.extract_text() or '' for p in pdf.pages)
patterns = {
'Revenue': r'(?:Total Revenue|Revenue)\s*:?\s*\$?([\d,]+\.?\d*)',
'Cost': r'(?:Total Cost|Expenses)\s*:?\s*\$?([\d,]+\.?\d*)',
'Profit': r'(?:Net Profit|Net Income)\s*:?\s*\$?([\d,]+\.?\d*)',
}
for key, pat in patterns.items():
m = re.search(pat, text, re.I)
result[key] = float(m.group(1).replace(',', '')) if m else None
if not m:
result['Notes'] += f'{key} not found; '
return result
def batch_extract(pdf_dir: Path) -> pd.DataFrame:
records = []
for pdf_file in sorted(pdf_dir.glob('*.pdf')):
try:
records.append(extract_financial_data(pdf_file))
logger.info(f' [OK] {pdf_file.name}')
except Exception as e:
records.append({'Department': pdf_file.stem, 'Notes': f'Error: {e}'})
logger.error(f' [Error] {pdf_file.name}: {e}')
return pd.DataFrame(records)
SUMMARY_HTML = """
<!DOCTYPE html><html><head><meta charset="utf-8">
<style>
body{font-family:Arial,sans-serif;margin:2cm;color:#1a1a2e}
h1{color:#1f2d3d;font-size:20px;border-bottom:3px solid #6c63ff;padding-bottom:6px}
.meta{color:#666;font-size:12px;margin-bottom:20px}
table{width:100%;border-collapse:collapse;font-size:12px}
th{background:#1f2d3d;color:#fff;padding:8px 12px;text-align:right}
th:first-child{text-align:left}
td{padding:7px 12px;border-bottom:1px solid #e2e8f0;text-align:right}
td:first-child{text-align:left}
tr:nth-child(even) td{background:#f8fafc}
.total-row td{font-weight:bold;background:#eef2ff}
.pos{color:#16a34a}.neg{color:#dc2626}
</style></head><body>
<h1>{{"{{"}}period{{"}}"}} Financial Summary</h1>
<div class="meta">Generated: {{"{{"}}gen_date{{"}}"}} | {{"{{"}}rows|length{{"}}"}} departments</div>
<table>
<tr><th>Department</th><th>Revenue ($)</th><th>Cost ($)</th><th>Profit ($)</th></tr>
{% for r in rows %}
<tr>
<td>{{"{{"}}r['Department']{{"}}"}}</td>
<td>{{"{{"}} "{:,.0f}".format(r['Revenue'] or 0) {{"}}"}}</td>
<td>{{"{{"}} "{:,.0f}".format(r['Cost'] or 0) {{"}}"}}</td>
<td class="{{"{{"}}'pos' if (r['Profit'] or 0) >= 0 else 'neg'{{"}}"}}">
{{"{{"}} "{:,.0f}".format(r['Profit'] or 0) {{"}}"}}
</td>
</tr>
{% endfor %}
<tr class="total-row">
<td>Total</td>
<td>{{"{{"}} "{:,.0f}".format(total_rev) {{"}}"}}</td>
<td>{{"{{"}} "{:,.0f}".format(total_cost) {{"}}"}}</td>
<td>{{"{{"}} "{:,.0f}".format(total_profit) {{"}}"}}</td>
</tr>
</table></body></html>
"""
def main():
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
logger.info(f'Processing {PERIOD} reports...')
df = batch_extract(PDF_DIR)
# Save Excel
excel_path = OUTPUT_DIR / f'{PERIOD}_summary.xlsx'
with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
df.to_excel(writer, sheet_name='Data', index=False)
logger.info(f'Excel saved: {excel_path}')
# Generate PDF digest
pdf_path = OUTPUT_DIR / f'{PERIOD}_digest.pdf'
html = Template(SUMMARY_HTML).render(
period=PERIOD,
gen_date=date.today().strftime('%B %d, %Y'),
rows=df.to_dict('records'),
total_rev=df['Revenue'].sum(),
total_cost=df['Cost'].sum(),
total_profit=df['Profit'].sum(),
)
HTML(string=html).write_pdf(str(pdf_path))
logger.info(f'PDF digest saved: {pdf_path}')
logger.info(f'Output directory: {OUTPUT_DIR.resolve()}')
if __name__ == '__main__':
main()
Pipeline design principles: Extraction, transformation, and output are fully decoupled — a failure in one department report does not interrupt the others Regex patterns must be adapted to the actual format of your source PDFs Dual output (Excel + PDF): Excel enables downstream analysis; PDF is archive-ready and easily forwarded Color-coded profit/loss gives management instant visual signal without reading numbers
Previous
Next
Chapter 9: Web Scraping