GUI Automation — pyautogui and pywinauto for Desktop Control
Chapter 14: GUI Automation — Controlling Desktop Apps with pyautogui and pywinauto
Not every system has an API. When you're dealing with a legacy ERP with no interface, a government system that only works through a thick client, or any desktop software that resists automation, GUI automation is your last resort. This chapter covers pyautogui's full mouse and keyboard control toolkit, pywinauto's Windows control tree access, image recognition, and OCR — all tied together with a complete form-filling automation project.
When to Use GUI Automation
GUI automation is a last resort, not a first choice. Before writing any code, evaluate your options:
| Approach | Best For | Reliability | Speed |
|---|---|---|---|
| API / SDK | Systems with official interfaces | Excellent | Very fast |
| Playwright / Selenium | Web applications, browsers | High | Fast |
| pywinauto | Windows native apps with control trees | Medium-high | Medium |
| pyautogui + image recognition | Any visible UI, truly last resort | Medium-low | Slow |
Valid reasons to use GUI automation:
- Legacy ERP or MES systems with no API and only a desktop client
- Government or institutional systems that mandate specific software
- Third-party software with no scripting or extension support
- Features that exist only in the GUI with no keyboard shortcut or CLI equivalent
Key risks:
- Fragility: Resolution changes, theme updates, or font size adjustments can break coordinate-based automation
- Speed: Every step has to wait for the UI to respond, and steps cannot run concurrently
- Debugging difficulty: Failures are hard to diagnose remotely
- Platform limits: pywinauto is Windows-only; pyautogui is cross-platform but less precise
Evaluation order: If a web version exists, use Playwright first. If it's a Windows native app with standard controls, try pywinauto. Only fall back to image-recognition-based pyautogui when nothing else works.
pyautogui Basics
pip install pyautogui pillow pyperclip opencv-python
Mouse Control
import pyautogui
import time
# Query the screen resolution and where the pointer currently is
width, height = pyautogui.size()
x, y = pyautogui.position()

# Pointer movement: an absolute target first, then an offset from the current spot
pyautogui.moveTo(x=500, y=300, duration=0.5)
pyautogui.moveRel(xOffset=100, yOffset=0, duration=0.3)

# Single, double and right click, all at the same screen point
target = (500, 300)
pyautogui.click(*target)
pyautogui.doubleClick(*target)
pyautogui.rightClick(*target)

# Dragging: to an absolute position, then by a relative offset
pyautogui.dragTo(x=700, y=400, duration=1.0)
pyautogui.dragRel(xOffset=200, yOffset=0, duration=0.5)

# Wheel scrolling: positive values scroll up, negative values scroll down
pyautogui.scroll(3)
pyautogui.scroll(-3)
Keyboard Input
import pyautogui
import pyperclip
# typewrite handles ASCII only (no Unicode / non-ASCII support)
pyautogui.typewrite('Hello World', interval=0.05)

# Anything outside ASCII goes through the clipboard and is pasted instead
pyperclip.copy('Special characters or CJK text here')
pyautogui.hotkey('ctrl', 'v')

# Single key presses, issued one after another
for key in ('enter', 'tab', 'escape', 'f5'):
    pyautogui.press(key)

# Key combinations, in order: select all, copy, undo, then a three-key combo
for combo in (
    ('ctrl', 'a'),
    ('ctrl', 'c'),
    ('ctrl', 'z'),
    ('ctrl', 'shift', 's'),
):
    pyautogui.hotkey(*combo)

# Explicit hold / release around another key press, for complex sequences
pyautogui.keyDown('shift')
pyautogui.press('left')
pyautogui.keyUp('shift')
Screenshots and Image Recognition
import pyautogui
# Full screenshot
screenshot = pyautogui.screenshot()
screenshot.save('screen.png')

# Region screenshot (left, top, width, height)
region = pyautogui.screenshot(region=(0, 0, 800, 600))

# Find an image on screen (the confidence parameter requires opencv-python).
# Depending on the installed pyautogui version, a miss is reported either by
# returning None or by raising ImageNotFoundException, so handle both.
try:
    location = pyautogui.locateOnScreen('button.png', confidence=0.9)
except pyautogui.ImageNotFoundException:
    location = None
if location is not None:
    center = pyautogui.center(location)
    pyautogui.click(center)
else:
    print("Image not found on screen")

# Wait for an image to appear (keep searching for up to 10 seconds)
try:
    location = pyautogui.locateOnScreen('loading_done.png',
                                        confidence=0.85,
                                        minSearchTime=10)
except pyautogui.ImageNotFoundException:
    location = None
if location is not None:
    pyautogui.click(pyautogui.center(location))
else:
    print("Timeout: target image never appeared")
Safety: FAILSAFE
import pyautogui
import time
# Leave the fail-safe on (it is the default): moving the mouse to the
# top-left corner (0,0) aborts the script
pyautogui.FAILSAFE = True
# 0.5s delay between every pyautogui action
pyautogui.PAUSE = 0.5

# Countdown so you can switch to the target window before automation begins
STARTUP_DELAY = 3
print(f"Starting in {STARTUP_DELAY} seconds — switch to the target application now...")
time.sleep(STARTUP_DELAY)
pywinauto (Windows Only)
pywinauto accesses the Windows UI Automation or Win32 control tree directly, making it far more reliable than coordinate-based automation.
pip install pywinauto
Connecting to an Application
from pywinauto.application import Application
import time
# Every Application handle below uses the same backend
BACKEND = 'uia'

# 1) Start a fresh process and pause briefly before continuing
app = Application(backend=BACKEND).start('notepad.exe')
time.sleep(1)

# 2) Attach to an already-running application by process name
app = Application(backend=BACKEND).connect(path='notepad.exe')

# 3) Attach via a regular expression matched against the window title
app = Application(backend=BACKEND).connect(title_re='.*Notepad.*')

# Take the top-level window and show its text
window = app.top_window()
caption = window.window_text()
print(caption)

# Dump the control tree — essential for finding control identifiers
window.print_control_identifiers()
Working with Controls
from pywinauto.application import Application
# Launch Notepad and take its top-level window
app = Application(backend='uia').start('notepad.exe')
window = app.top_window()

# Edit control: click into it, then type (with_spaces=True keeps the spaces)
text_area = window.child_window(control_type='Edit')
text_area.click_input()
text_area.type_keys('Hello pywinauto{ENTER}', with_spaces=True)

# Button looked up by its title and clicked
window.child_window(title='Save', control_type='Button').click()

# Menu navigation using an arrow-separated path
window.menu_select('File->Save As')

# First ComboBox in the window: choose an entry by its text
window.child_window(control_type='ComboBox', found_index=0).select('UTF-8')
Case Study: Automating a Legacy ERP Export
"""
Automate data export from a legacy ERP desktop client with no API.
"""
from pywinauto.application import Application
import time
def export_erp_report(start_date: str, end_date: str, save_path: str):
    """Export the Sales Daily report for a date range from a running ERP client.

    Attaches to an already-open window whose title matches ``.*ERP.*``, opens
    Reports -> Sales Daily -> Export by Date, fills in both dates, and saves
    the result through the "Save As" dialog.

    Args:
        start_date: Text written into the ``startDate`` edit box.
        end_date: Text written into the ``endDate`` edit box.
        save_path: Text written into the file-name box of the "Save As" dialog.
    """
    app = Application(backend='uia').connect(title_re='.*ERP.*')
    main_win = app.top_window()

    # Open the export dialog through the menu and give it time to appear
    main_win.menu_select('Reports->Sales Daily->Export by Date')
    time.sleep(1.5)

    export_dlg = app.window(title_re='.*Export.*')
    export_dlg.wait('ready', timeout=10)

    # Fill in the date range and trigger the export
    export_dlg.child_window(auto_id='startDate', control_type='Edit').set_edit_text(start_date)
    export_dlg.child_window(auto_id='endDate', control_type='Edit').set_edit_text(end_date)
    export_dlg.child_window(title='Export', control_type='Button').click()

    # The save dialog can take a while to show up, hence the longer timeout
    save_dlg = app.window(title='Save As')
    save_dlg.wait('exists', timeout=15)

    # NOTE(review): auto_id '1148' is presumably the file-name box of the
    # Save As dialog — confirm with print_control_identifiers().
    save_dlg.child_window(auto_id='1148', control_type='Edit').set_edit_text(save_path)
    save_dlg.child_window(title='Save', control_type='Button').click()

    # Fixed pause after clicking Save; completion itself is not verified
    time.sleep(2)
    print(f"Export complete: {save_path}")


export_erp_report('2024-01-01', '2024-01-31', r'C:\Reports\sales_2024_01.xlsx')
Image Recognition Automation
import cv2
import numpy as np
import pyautogui
def find_element_on_screen(template_path: str, threshold: float = 0.8):
    """
    Locate a template image on the current screen.

    Takes a screenshot, converts it to grayscale and runs normalized
    template matching (cv2.TM_CCOEFF_NORMED) against the template image.

    Args:
        template_path: Path of the template image file.
        threshold: Minimum match score required to accept the best match.

    Returns:
        (center_x, center_y, confidence) of the best match, or None when the
        best score is below threshold or the template is larger than the
        screenshot.

    Raises:
        FileNotFoundError: If the template image cannot be read.
    """
    template = cv2.imread(template_path, cv2.IMREAD_GRAYSCALE)
    if template is None:
        # cv2.imread reports a missing or unreadable file by returning None
        raise FileNotFoundError(f'Cannot read template image: {template_path}')
    h, w = template.shape

    screenshot = pyautogui.screenshot()
    screen_np = np.array(screenshot)
    screen_gray = cv2.cvtColor(screen_np, cv2.COLOR_RGB2GRAY)

    # matchTemplate rejects a template that does not fit inside the image
    if h > screen_gray.shape[0] or w > screen_gray.shape[1]:
        return None

    result = cv2.matchTemplate(screen_gray, template, cv2.TM_CCOEFF_NORMED)
    _, max_val, _, max_loc = cv2.minMaxLoc(result)
    if max_val >= threshold:
        # max_loc is the top-left corner of the best match; shift to its center
        # NOTE(review): on scaled / HiDPI displays screenshot pixels may not
        # equal pyautogui click coordinates — confirm on the target machine.
        cx = max_loc[0] + w // 2
        cy = max_loc[1] + h // 2
        return cx, cy, max_val
    return None


result = find_element_on_screen('submit_button.png', threshold=0.85)
if result:
    x, y, confidence = result
    pyautogui.click(x, y)
Screenshot Monitoring and OCR
"""
Periodic screenshot monitoring with OCR text extraction.
pip install pytesseract pillow schedule
Also install Tesseract OCR engine from:
https://github.com/tesseract-ocr/tesseract
"""
import pytesseract
import pyautogui
import schedule
import time
from PIL import Image, ImageFilter
from pathlib import Path
from datetime import datetime
# Folder for the periodic screenshots; created when the module loads
SCREENSHOT_DIR = Path('./screenshots')
SCREENSHOT_DIR.mkdir(exist_ok=True)


def extract_text_from_region(region: tuple, lang: str = 'eng', threshold: int = 140) -> str:
    """Extract text from a screen region using OCR.

    Args:
        region: Screen region passed to pyautogui.screenshot, as
            (left, top, width, height).
        lang: Language code passed to Tesseract.
        threshold: Gray level used to binarize the image before OCR; pixels
            below it become black, all others white.

    Returns:
        The recognized text with leading and trailing whitespace removed.
    """
    img = pyautogui.screenshot(region=region)
    # Preprocess for OCR: grayscale, sharpen, then hard black/white threshold
    img = img.convert('L')
    img = img.filter(ImageFilter.SHARPEN)
    img = img.point(lambda px: 0 if px < threshold else 255)
    return pytesseract.image_to_string(img, lang=lang, config='--psm 6').strip()
def _locate_or_none(image: str, confidence: float):
    """Return the on-screen box of *image*, or None when it is not visible.

    Depending on the installed pyautogui version a miss is reported either by
    returning None or by raising ImageNotFoundException; both map to None.
    """
    try:
        return pyautogui.locateOnScreen(image, confidence=confidence)
    except pyautogui.ImageNotFoundException:
        return None


def capture_and_check():
    """Save a timestamped screenshot and look for signs of an error state.

    Two checks run on every call: image matching for an error dialog (which is
    closed via its close button when that is found too), and OCR of a fixed
    screen region for the words 'Error' or 'Failed'.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    fname = SCREENSHOT_DIR / f'screen_{timestamp}.png'
    pyautogui.screenshot().save(fname)

    # Check for error state via image matching. "Not found" is the normal
    # case here and must not raise, or the scheduler loop would stop.
    error_dlg = _locate_or_none('error_dialog.png', confidence=0.85)
    if error_dlg is not None:
        print(f'[{timestamp}] Error dialog detected — sending alert')
        close_btn = _locate_or_none('close_button.png', confidence=0.9)
        if close_btn is not None:
            pyautogui.click(pyautogui.center(close_btn))

    # Check status bar text via OCR
    status = extract_text_from_region(region=(0, 0, 400, 30))
    if 'Error' in status or 'Failed' in status:
        print(f'[{timestamp}] OCR detected error text: {status}')
# Run the check every five minutes and poll the scheduler once per second
schedule.every(5).minutes.do(capture_and_check)
print('Screen monitoring active. Press Ctrl+C to stop.')
try:
    while True:
        schedule.run_pending()
        time.sleep(1)
except KeyboardInterrupt:
    # Ctrl+C is the advertised way to stop, so exit without a traceback
    print('Screen monitoring stopped.')
Project: Automated Form Submission
"""
Automated form filler: read rows from Excel, fill a desktop form for each row.
Dependencies: openpyxl, pyautogui, pywinauto, pyperclip
"""
import time
import pyautogui
import pyperclip
import openpyxl
from dataclasses import dataclass
from pywinauto.application import Application
# Global pyautogui settings for the whole run
pyautogui.PAUSE = 0.3       # delay applied between pyautogui actions
pyautogui.FAILSAFE = True   # keep the mouse-to-corner abort enabled
@dataclass
class FormRecord:
    """One spreadsheet row to be entered into the desktop form."""

    company: str      # typed into the 'company' field
    tax_id: str       # typed into the 'tax_id' field
    amount: float     # formatted with two decimals before being typed
    category: str     # typed into the 'category' field, confirmed with Enter
    note: str = ''    # optional; the note field is skipped when empty
def load_records(path: str) -> list[FormRecord]:
    """Read form records from the active sheet of an Excel workbook.

    Row 1 is treated as a header and skipped. Columns are read by position:
    company, tax_id, amount, category, note. Rows whose first cell is empty
    are ignored. The note column is optional and may be missing entirely.

    Args:
        path: Path of the .xlsx file.

    Returns:
        One FormRecord per non-empty data row, in sheet order.
    """
    # data_only=True returns the cached result of formula cells rather than
    # the formula text, which would otherwise be typed into the form verbatim
    wb = openpyxl.load_workbook(path, data_only=True)
    ws = wb.active

    records = []
    for row in ws.iter_rows(min_row=2, values_only=True):
        if not row or not row[0]:
            continue
        # Row tuples are only as wide as the sheet's used columns, so a sheet
        # without a note column has no index 4
        note = row[4] if len(row) > 4 else None
        records.append(
            FormRecord(str(row[0]), str(row[1]), float(row[2]), str(row[3]), str(note or ''))
        )
    return records
def type_text(text: str):
    """Paste text via clipboard to support all Unicode.

    Copies *text* to the clipboard, sends Ctrl+V to the focused control and
    then pauses briefly.
    """
    pyperclip.copy(text)
    pyautogui.hotkey('ctrl', 'v')
    # Short pause after the paste — presumably so the target application can
    # process it before the next action
    time.sleep(0.15)
def fill_field(pos: tuple, value: str):
    """Click a form field and paste a value over its current contents.

    Args:
        pos: (x, y) screen coordinates of the field.
        value: Value to enter; converted with str() before pasting.
    """
    pyautogui.click(*pos)
    time.sleep(0.15)
    # Select everything in the field first so the paste replaces it
    pyautogui.hotkey('ctrl', 'a')
    type_text(str(value))
# Screen coordinates (x, y) of each form control.
# Adjust coordinates to match actual application layout.
POSITIONS = dict(
    new=(120, 85),
    company=(280, 220),
    tax_id=(280, 265),
    amount=(280, 310),
    category=(280, 355),
    note=(280, 400),
    submit=(400, 480),
    confirm=(380, 340),
)
def fill_record(record: FormRecord) -> bool:
    """Enter one record into the form and submit it.

    Clicks "new", fills company, tax id, amount and category, fills the note
    only when one is present, clicks submit and, if a confirmation dialog is
    found on screen, clicks its confirm button.

    Args:
        record: The record to enter.

    Returns:
        True when all steps ran without an exception, False otherwise (the
        error is printed).

    Raises:
        pyautogui.FailSafeException: Re-raised so that the operator's
            fail-safe abort stops the whole run instead of being counted as
            one failed record.
    """
    try:
        pyautogui.click(*POSITIONS['new'])
        time.sleep(0.8)

        fill_field(POSITIONS['company'], record.company)
        fill_field(POSITIONS['tax_id'], record.tax_id)
        fill_field(POSITIONS['amount'], f'{record.amount:.2f}')

        # Category is entered by clicking, pasting the text and pressing Enter
        pyautogui.click(*POSITIONS['category'])
        time.sleep(0.3)
        type_text(record.category)
        pyautogui.press('enter')

        if record.note:
            fill_field(POSITIONS['note'], record.note)

        pyautogui.click(*POSITIONS['submit'])
        time.sleep(1.0)

        # The confirmation dialog is optional. Depending on the pyautogui
        # version a miss returns None or raises ImageNotFoundException; neither
        # means the submission failed.
        try:
            confirm = pyautogui.locateOnScreen('confirm_dialog.png', confidence=0.9)
        except pyautogui.ImageNotFoundException:
            confirm = None
        if confirm is not None:
            pyautogui.click(*POSITIONS['confirm'])
            time.sleep(0.5)
        return True
    except pyautogui.FailSafeException:
        # Never swallow the emergency stop
        raise
    except Exception as e:
        print(f' ERROR: {e}')
        return False
def run(excel_path: str):
    """Fill the form once per record in the workbook and print a summary.

    Args:
        excel_path: Path of the Excel file passed to load_records.
    """
    records = load_records(excel_path)
    print(f'Loaded {len(records)} records. Starting in 3 seconds...')
    # Time for the operator to bring the target application to the front
    time.sleep(3)

    ok, failed = 0, []
    for i, rec in enumerate(records, 1):
        # Flush so the record being processed is visible while it is filled,
        # not only once the OK / FAILED suffix completes the line
        print(f'[{i}/{len(records)}] {rec.company}', end=' ', flush=True)
        if fill_record(rec):
            print('OK')
            ok += 1
        else:
            print('FAILED')
            failed.append(rec)

    print(f'\nDone: {ok} succeeded, {len(failed)} failed.')
    for r in failed:
        print(f' - {r.company} ({r.tax_id})')


if __name__ == '__main__':
    run('./data/records.xlsx')
Previous
Next
Chapter 15: Schedulers & CLI Tools