Chapter 8

Python + PDF — Extract, Merge, and Generate PDFs

Chapter 8: PDF Automation — Split, Merge, Extract & Generate

PDF is the standard format of the business world: contracts, invoices, reports, specifications — nearly everything is delivered as PDF. But PDF was never designed for machine processing. Extracting data, splitting and merging in bulk, generating formatted reports — these needs have driven the development of a mature Python toolchain. This chapter covers the four core PDF scenarios: extraction, merge/split, table extraction, and generation, culminating in a complete financial report processing pipeline.

PDF Library Ecosystem: Which Tool for What

Library	Best For	Install
pdfplumber	Text and table extraction from digital PDFs	`pip install pdfplumber`
pypdf	Merge, split, rotate, encrypt/decrypt	`pip install pypdf`
reportlab	Generate PDF from scratch with precise layout control	`pip install reportlab`
weasyprint	HTML template to PDF (more flexible layouts)	`pip install weasyprint`
tabula-py	Extract PDF tables into pandas DataFrames	`pip install tabula-py`
PyMuPDF (fitz)	High-performance extraction, image conversion, annotations	`pip install pymupdf`

Decision guide: Use pdfplumber to read, reportlab or weasyprint to generate, pypdf to merge/split, tabula-py to extract tables. Scanned PDFs (image-based) require OCR tools — see Chapter 12 for AI-powered OCR approaches.

PDF Text Extraction

Basic Extraction with pdfplumber

extract_text.py

import pdfplumber
from pathlib import Path

def extract_pdf_text(pdf_path: str) -> str:
    all_text = []
    with pdfplumber.open(pdf_path) as pdf:
        print(f'Total pages: {len(pdf.pages)}')
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if text:
                all_text.append(f'--- Page {page_num} ---\n{text}')
            else:
                all_text.append(f'--- Page {page_num}: No extractable text (scanned image?) ---')
    return '\n\n'.join(all_text)

text = extract_pdf_text('annual_report.pdf')
print(text[:500])  # Preview first 500 chars

Path('extracted_text.txt').write_text(text, encoding='utf-8')

Batch Extraction: 100 PDFs to Excel

batch_extract_to_excel.py

import pdfplumber
import pandas as pd
import re
from pathlib import Path

def extract_key_info(pdf_path: Path) -> dict:
    """Extract key fields from an invoice PDF using regex."""
    info = {
        'filename':    pdf_path.name,
        'invoice_no':  None,
        'amount':      None,
        'date':        None,
    }
    with pdfplumber.open(pdf_path) as pdf:
        text = pdf.pages[0].extract_text() or ''

    m = re.search(r'Invoice\s*#?\s*:?\s*([A-Z0-9\-]{5,20})', text, re.I)
    if m:
        info['invoice_no'] = m.group(1)

    m = re.search(r'Total\s*:?\s*\$?([\d,]+\.?\d*)', text, re.I)
    if m:
        info['amount'] = float(m.group(1).replace(',', ''))

    m = re.search(r'(\d{4})[/\-](\d{2})[/\-](\d{2})', text)
    if m:
        info['date'] = f"{m.group(1)}-{m.group(2)}-{m.group(3)}"

    return info

pdf_dir = Path('invoices')
results, errors = [], []

for pdf_file in sorted(pdf_dir.glob('*.pdf')):
    try:
        results.append(extract_key_info(pdf_file))
        print(f'  [OK] {pdf_file.name}')
    except Exception as e:
        errors.append({'filename': pdf_file.name, 'error': str(e)})
        print(f'  [Error] {pdf_file.name}: {e}')

pd.DataFrame(results).to_excel('invoice_summary.xlsx', index=False)
print(f'\nDone: {len(results)} succeeded, {len(errors)} failed')

Scanned PDFs: If page.extract_text() returns empty or garbled text, the PDF contains scanned images rather than selectable text. You need OCR — either pytesseract (requires Tesseract engine installed locally) or a cloud OCR API. Chapter 12 covers AI API integration for higher-accuracy OCR.

PDF Merge and Split

Merge Multiple PDFs

merge_pdfs.py

from pypdf import PdfWriter
from pathlib import Path

def merge_pdfs(input_files: list, output_path: str):
    writer = PdfWriter()
    for pdf_path in input_files:
        writer.append(str(pdf_path))
        print(f'  Added: {Path(pdf_path).name}')
    with open(output_path, 'wb') as f:
        writer.write(f)
    print(f'Merged {len(input_files)} files -> {output_path}')

pdf_files = sorted(Path('monthly_reports/2024-12').glob('*.pdf'))
merge_pdfs(pdf_files, 'monthly_reports/2024-12_combined.pdf')

Split by Page Range

split_pdf.py

from pypdf import PdfReader, PdfWriter
from pathlib import Path

def split_pdf_by_range(input_path: str, ranges: list, output_dir: str):
    """
    ranges: [(start_page, end_page, output_filename), ...]
    Page numbers are 1-indexed (human-readable).
    """
    reader = PdfReader(input_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for start_page, end_page, filename in ranges:
        writer = PdfWriter()
        for page_num in range(start_page - 1, end_page):  # 0-indexed internally
            writer.add_page(reader.pages[page_num])
        out_path = output_dir / filename
        with open(out_path, 'wb') as f:
            writer.write(f)
        print(f'  Pages {start_page}-{end_page} -> {filename}')

split_pdf_by_range(
    'annual_report_2024.pdf',
    ranges=[
        (1,  25,  'Q1_report.pdf'),
        (26, 50,  'Q2_report.pdf'),
        (51, 75,  'Q3_report.pdf'),
        (76, 100, 'Q4_report.pdf'),
    ],
    output_dir='split_reports',
)

Extract Cover Pages from Multiple Reports

extract_covers.py

from pypdf import PdfReader, PdfWriter
from pathlib import Path

def extract_cover_pages(pdf_dir: str, output_path: str):
    """Extract page 1 from every PDF in a directory, merge into one file."""
    writer = PdfWriter()
    for pdf_file in sorted(Path(pdf_dir).glob('*.pdf')):
        reader = PdfReader(str(pdf_file))
        if reader.pages:
            writer.add_page(reader.pages[0])
            print(f'  Cover extracted: {pdf_file.name}')
    with open(output_path, 'wb') as f:
        writer.write(f)
    print(f'Cover summary saved: {output_path}')

extract_cover_pages('monthly_reports/', 'all_covers_2024.pdf')

PDF Table Extraction

pdfplumber for Well-Formatted Tables

extract_table_pdfplumber.py

import pdfplumber
import pandas as pd

def extract_tables_from_pdf(pdf_path: str) -> list:
    all_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            for idx, table in enumerate(page.extract_tables()):
                if not table:
                    continue
                df = pd.DataFrame(table[1:], columns=table[0])
                df['_page'] = page_num
                all_tables.append(df)
                print(f'  Page {page_num}, table {idx+1}: {len(df)} rows')
    return all_tables

tables = extract_tables_from_pdf('financial_report.pdf')
if tables:
    with pd.ExcelWriter('extracted_tables.xlsx') as writer:
        for i, df in enumerate(tables):
            df.to_excel(writer, sheet_name=f'Table_{i+1}', index=False)
    print(f'Extracted {len(tables)} tables to extracted_tables.xlsx')

tabula-py for More Complex Tables

extract_table_tabula.py

# pip install tabula-py
# Requires Java Runtime (JRE): https://www.java.com/download/
import tabula
import pandas as pd

# Extract all tables from all pages
tables = tabula.read_pdf(
    'financial_report.pdf',
    pages='all',
    multiple_tables=True,
    lattice=True,   # Use for bordered tables
    # stream=True,  # Use for whitespace-separated tables without borders
)
print(f'Found {len(tables)} tables')

combined = pd.concat(tables, ignore_index=True)
combined.to_excel('all_tables.xlsx', index=False)

# Handle tables that span multiple pages
cross_page = tabula.read_pdf(
    'report.pdf',
    pages='3-5',           # Pages 3 through 5 contain one continuous table
    multiple_tables=False,
    lattice=True,
)[0]
print(cross_page.head())

Cleaning Extracted Data

clean_extracted_data.py

import pandas as pd

def clean_pdf_table(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.dropna(how='all', inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    df.columns = [str(c).strip() for c in df.columns]
    df = df.applymap(lambda x: str(x).strip() if pd.notna(x) else x)
    # Convert currency strings like "1,234.56" to float
    for col in df.columns:
        sample = df[col].dropna().head(5)
        if sample.astype(str).str.match(r'^[\d,]+\.?\d*$').any():
            df[col] = pd.to_numeric(
                df[col].astype(str).str.replace(',', ''), errors='coerce'
            )
    return df

import tabula
tables = tabula.read_pdf('report.pdf', pages='all', multiple_tables=True)
cleaned = [clean_pdf_table(t) for t in tables]
with pd.ExcelWriter('clean_tables.xlsx') as writer:
    for i, df in enumerate(cleaned):
        df.to_excel(writer, sheet_name=f'Sheet{i+1}', index=False)

Generating PDF Reports

Option 1: reportlab — Fine-Grained Control

generate_pdf_reportlab.py

from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import cm
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle

def generate_sales_report(data: list, output_path: str):
    doc = SimpleDocTemplate(output_path, pagesize=A4,
                            rightMargin=2*cm, leftMargin=2*cm,
                            topMargin=2*cm, bottomMargin=2*cm)
    styles = getSampleStyleSheet()
    elements = [
        Paragraph('Monthly Sales Report', styles['Title']),
        Spacer(1, 0.5*cm),
        Paragraph(f'{len(data)} records this month.', styles['Normal']),
        Spacer(1, 0.5*cm),
    ]

    table_data = [['Product', 'Units', 'Price', 'Revenue']]
    total = 0
    for row in data:
        rev = row['qty'] * row['price']
        total += rev
        table_data.append([row['product'], str(row['qty']),
                           f"${row['price']:,.2f}", f"${rev:,.2f}"])
    table_data.append(['Total', '', '', f"${total:,.2f}"])

    t = Table(table_data, colWidths=[7*cm, 3*cm, 3.5*cm, 3.5*cm])
    t.setStyle(TableStyle([
        ('BACKGROUND',  (0,0), (-1,0),  colors.HexColor('#1f2d3d')),
        ('TEXTCOLOR',   (0,0), (-1,0),  colors.white),
        ('GRID',        (0,0), (-1,-1), 0.5, colors.grey),
        ('FONTNAME',    (0,-1), (-1,-1), 'Helvetica-Bold'),
        ('ROWBACKGROUNDS', (0,1), (-1,-2), [colors.white, colors.HexColor('#f5f7fa')]),
    ]))
    elements.append(t)
    doc.build(elements)
    print(f'Report generated: {output_path}')

generate_sales_report([
    {'product': 'Laptop',        'qty': 50,  'price': 999},
    {'product': 'Wireless Mouse','qty': 200, 'price': 29},
], 'sales_report.pdf')

Option 2: weasyprint — HTML Template to PDF (Recommended)

generate_pdf_weasyprint.py

# pip install weasyprint jinja2
from jinja2 import Template
from weasyprint import HTML
from datetime import date

HTML_TEMPLATE = """
<!DOCTYPE html><html><head><meta charset="utf-8">
<style>
  body{font-family:Arial,sans-serif;margin:2cm}
  h1{color:#1f2d3d;border-bottom:2px solid #6c63ff;padding-bottom:8px}
  table{width:100%;border-collapse:collapse;margin-top:20px;font-size:12px}
  th{background:#1f2d3d;color:#fff;padding:10px 14px;text-align:right}
  th:first-child{text-align:left}
  td{padding:8px 14px;border-bottom:1px solid #e2e8f0;text-align:right}
  td:first-child{text-align:left}
  .total td{font-weight:bold;background:#eef2ff}
</style></head><body>
  <h1>{{"{{"}}title{{"}}"}}</h1>
  <p>Generated: {{"{{"}}report_date{{"}}"}} | {{"{{"}}rows|length{{"}}"}} records</p>
  <table>
    <tr><th>Product</th><th>Qty</th><th>Price</th><th>Revenue</th></tr>
    {% for r in rows %}
    <tr>
      <td>{{"{{"}}r.product{{"}}"}}</td><td>{{"{{"}}r.qty{{"}}"}}</td>
      <td>${{"{{"}} "%.2f"|format(r.price) {{"}}"}}</td>
      <td>${{"{{"}} "%.2f"|format(r.qty * r.price) {{"}}"}}</td>
    </tr>
    {% endfor %}
    <tr class="total">
      <td colspan="3">Total</td>
      <td>${{"{{"}} "%.2f"|format(total) {{"}}"}}</td>
    </tr>
  </table>
</body></html>
"""

def generate_pdf(data: list, output_path: str):
    total = sum(r['qty'] * r['price'] for r in data)
    html = Template(HTML_TEMPLATE).render(
        title='Monthly Sales Report',
        report_date=date.today().strftime('%B %d, %Y'),
        rows=data, total=total,
    )
    HTML(string=html).write_pdf(output_path)
    print(f'PDF generated: {output_path}')

generate_pdf([
    {'product': 'Laptop',        'qty': 50,  'price': 999},
    {'product': 'Wireless Mouse','qty': 200, 'price': 29},
], 'report.pdf')

reportlab vs weasyprint: reportlab has a steeper learning curve but gives precise layout control — ideal for complex, precision-critical documents. weasyprint uses familiar HTML/CSS, making it faster to develop and easier to maintain. Both support non-Latin characters with appropriate fonts specified.

PDF Encryption and Permissions

Adding Password Protection

encrypt_pdf.py

from pypdf import PdfReader, PdfWriter

def encrypt_pdf(input_path: str, output_path: str,
                user_password: str, owner_password: str = None):
    reader = PdfReader(input_path)
    writer = PdfWriter()
    for page in reader.pages:
        writer.add_page(page)
    writer.encrypt(
        user_password=user_password,
        owner_password=owner_password or user_password,
        use_128bit=True,
    )
    with open(output_path, 'wb') as f:
        writer.write(f)
    print(f'Encrypted: {output_path}')

# Single file
encrypt_pdf('contract.pdf', 'contract_encrypted.pdf', 'Client@2024')

# Batch with per-client passwords
import pandas as pd
df = pd.read_excel('clients.xlsx')
for _, row in df.iterrows():
    encrypt_pdf(
        f"contracts/{row['Name']}_contract.pdf",
        f"contracts_enc/{row['Name']}_contract_enc.pdf",
        str(row['Password']),
    )

Batch Decryption

decrypt_pdf.py

from pypdf import PdfReader, PdfWriter
from pathlib import Path

def decrypt_pdf(input_path: str, output_path: str, password: str) -> bool:
    try:
        reader = PdfReader(input_path)
        if reader.is_encrypted:
            if reader.decrypt(password) == 0:
                print(f'Wrong password: {input_path}')
                return False
        writer = PdfWriter()
        for page in reader.pages:
            writer.add_page(page)
        with open(output_path, 'wb') as f:
            writer.write(f)
        return True
    except Exception as e:
        print(f'Failed {input_path}: {e}')
        return False

password = 'CompanyInternal2024'
for pdf_file in Path('encrypted').glob('*.pdf'):
    out = Path('decrypted') / pdf_file.name
    out.parent.mkdir(exist_ok=True)
    ok = decrypt_pdf(str(pdf_file), str(out), password)
    print(f'  {"[OK]" if ok else "[Failed]"} {pdf_file.name}')

Project: Financial Report Processing Pipeline

Monthly scenario: receive 10–20 department PDF reports, extract key financial metrics from each, consolidate into an Excel summary, and generate a one-page PDF digest for management.

financial_pipeline.py — Complete pipeline

"""
Financial Report Processing Pipeline
Requirements: pip install pdfplumber pypdf pandas openpyxl weasyprint jinja2
"""
import pdfplumber
import pandas as pd
from pathlib import Path
from datetime import date
import re, logging
from jinja2 import Template
from weasyprint import HTML

logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

PDF_DIR    = Path('reports/monthly')
OUTPUT_DIR = Path('output')
PERIOD     = 'December 2024'

def extract_financial_data(pdf_path: Path) -> dict:
    dept = pdf_path.stem.replace('_report', '')
    result = {'Department': dept, 'Revenue': None, 'Cost': None, 'Profit': None, 'Notes': ''}
    with pdfplumber.open(pdf_path) as pdf:
        text = '\n'.join(p.extract_text() or '' for p in pdf.pages)
    patterns = {
        'Revenue': r'(?:Total Revenue|Revenue)\s*:?\s*\$?([\d,]+\.?\d*)',
        'Cost':    r'(?:Total Cost|Expenses)\s*:?\s*\$?([\d,]+\.?\d*)',
        'Profit':  r'(?:Net Profit|Net Income)\s*:?\s*\$?([\d,]+\.?\d*)',
    }
    for key, pat in patterns.items():
        m = re.search(pat, text, re.I)
        result[key] = float(m.group(1).replace(',', '')) if m else None
        if not m:
            result['Notes'] += f'{key} not found; '
    return result

def batch_extract(pdf_dir: Path) -> pd.DataFrame:
    records = []
    for pdf_file in sorted(pdf_dir.glob('*.pdf')):
        try:
            records.append(extract_financial_data(pdf_file))
            logger.info(f'  [OK] {pdf_file.name}')
        except Exception as e:
            records.append({'Department': pdf_file.stem, 'Notes': f'Error: {e}'})
            logger.error(f'  [Error] {pdf_file.name}: {e}')
    return pd.DataFrame(records)

SUMMARY_HTML = """
<!DOCTYPE html><html><head><meta charset="utf-8">
<style>
body{font-family:Arial,sans-serif;margin:2cm;color:#1a1a2e}
h1{color:#1f2d3d;font-size:20px;border-bottom:3px solid #6c63ff;padding-bottom:6px}
.meta{color:#666;font-size:12px;margin-bottom:20px}
table{width:100%;border-collapse:collapse;font-size:12px}
th{background:#1f2d3d;color:#fff;padding:8px 12px;text-align:right}
th:first-child{text-align:left}
td{padding:7px 12px;border-bottom:1px solid #e2e8f0;text-align:right}
td:first-child{text-align:left}
tr:nth-child(even) td{background:#f8fafc}
.total-row td{font-weight:bold;background:#eef2ff}
.pos{color:#16a34a}.neg{color:#dc2626}
</style></head><body>
<h1>{{"{{"}}period{{"}}"}} Financial Summary</h1>
<div class="meta">Generated: {{"{{"}}gen_date{{"}}"}} | {{"{{"}}rows|length{{"}}"}} departments</div>
<table>
<tr><th>Department</th><th>Revenue ($)</th><th>Cost ($)</th><th>Profit ($)</th></tr>
{% for r in rows %}
<tr>
  <td>{{"{{"}}r['Department']{{"}}"}}</td>
  <td>{{"{{"}} "{:,.0f}".format(r['Revenue'] or 0) {{"}}"}}</td>
  <td>{{"{{"}} "{:,.0f}".format(r['Cost'] or 0) {{"}}"}}</td>
  <td class="{{"{{"}}'pos' if (r['Profit'] or 0) >= 0 else 'neg'{{"}}"}}">
    {{"{{"}} "{:,.0f}".format(r['Profit'] or 0) {{"}}"}}
  </td>
</tr>
{% endfor %}
<tr class="total-row">
  <td>Total</td>
  <td>{{"{{"}} "{:,.0f}".format(total_rev) {{"}}"}}</td>
  <td>{{"{{"}} "{:,.0f}".format(total_cost) {{"}}"}}</td>
  <td>{{"{{"}} "{:,.0f}".format(total_profit) {{"}}"}}</td>
</tr>
</table></body></html>
"""

def main():
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    logger.info(f'Processing {PERIOD} reports...')

    df = batch_extract(PDF_DIR)

    # Save Excel
    excel_path = OUTPUT_DIR / f'{PERIOD}_summary.xlsx'
    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='Data', index=False)
    logger.info(f'Excel saved: {excel_path}')

    # Generate PDF digest
    pdf_path = OUTPUT_DIR / f'{PERIOD}_digest.pdf'
    html = Template(SUMMARY_HTML).render(
        period=PERIOD,
        gen_date=date.today().strftime('%B %d, %Y'),
        rows=df.to_dict('records'),
        total_rev=df['Revenue'].sum(),
        total_cost=df['Cost'].sum(),
        total_profit=df['Profit'].sum(),
    )
    HTML(string=html).write_pdf(str(pdf_path))
    logger.info(f'PDF digest saved: {pdf_path}')
    logger.info(f'Output directory: {OUTPUT_DIR.resolve()}')

if __name__ == '__main__':
    main()

Pipeline design principles: Extraction, transformation, and output are fully decoupled — a failure in one department report does not interrupt the others Regex patterns must be adapted to the actual format of your source PDFs Dual output (Excel + PDF): Excel enables downstream analysis; PDF is archive-ready and easily forwarded Color-coded profit/loss gives management instant visual signal without reading numbers

Previous

Next
Chapter 9: Web Scraping

Rate this chapter

4.8 / 5 (39 ratings)