Chapter 25

Expiry and Eviction: 8 Policies Deep Dive

Chapter 25 — Key Expiration and Memory Eviction: All 8 Strategies In Depth

Redis memory management involves two independent but complementary mechanisms: key expiration (managing the lifecycle of keys with TTLs) and memory eviction (eviction policies that activate when maxmemory is reached). This chapter provides a deep, source-code-level analysis of both mechanisms, covering implementation details, algorithm design, and engineering trade-offs.


25.1 The Three Data Structures Behind Expiration

The expires Dictionary

Every redisDb contains two parallel dictionaries:

// server.h
typedef struct redisDb {
    dict *dict;       // Main data dict: key → redisObject
    dict *expires;    // Expiry dict:    key → expiry timestamp (milliseconds, long long)
    dict *blocking_keys;
    dict *watched_keys;
    int id;
    long long avg_ttl;            // Average TTL across expiring keys (for stats)
    unsigned long expires_cursor; // Scan cursor for activeExpireCycle
    list *ready_keys;
} redisDb;

Key design choices:

How Timestamps Are Stored

// expire.c
void expireGenericCommand(client *c, long long basetime, int unit) {
    // Convert relative time to absolute timestamp in milliseconds
    long long when = unit == UNIT_SECONDS
                     ? basetime + (long long)c->argv[2]->ptr * 1000
                     : basetime + (long long)c->argv[2]->ptr;

    setExpire(c, c->db, c->argv[1], when);
}

void setExpire(client *c, redisDb *db, robj *key, long long when) {
    dictEntry *kde, *de;

    // Verify the key exists in the main dict first
    kde = dictFind(db->dict, key->ptr);
    serverAssertWithInfo(NULL, key, kde != NULL);

    // Add or update the entry in the expires dict
    // The key pointer is shared with the main dict — no copy
    de = dictAddOrFind(db->expires, dictGetKey(kde));
    dictSetSignedIntegerVal(de, when);  // Value = raw timestamp, not robj

    // Notify keyspace listeners if configured
    int writable_slave = server.masterhost && server.repl_slave_lazy_flush;
    if (c && !writable_slave && server.notify_keyspace_events) {
        notifyKeyspaceEvent(NOTIFY_GENERIC, "expire", key, db->id);
    }
}

25.2 Lazy Expiration: expireIfNeeded

Every time a key is read or written, Redis calls expireIfNeeded first:

// db.c
int expireIfNeeded(redisDb *db, robj *key, int flags) {
    if (!keyIsExpired(db, key)) return 0;

    // Replicas do NOT actively delete expired keys (they wait for DEL from master)
    if (server.masterhost != NULL) {
        if (server.current_client &&
            server.current_client->flags & CLIENT_MASTER) return 0;
        // On a replica: report as expired (return empty) but don't actually delete
        if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return 1;
    }

    // Update stats
    server.stat_expiredkeys++;

    // Propagate DEL/UNLINK to replicas and AOF
    propagateExpire(db, key, server.lazyfree_lazy_expire);

    notifyKeyspaceEvent(NOTIFY_GENERIC, "expired", key, db->id);

    int deleted;
    if (server.lazyfree_lazy_expire) {
        // Async deletion: sever reference, bio thread frees memory
        deleted = dbAsyncDelete(db, key);
    } else {
        // Synchronous deletion
        deleted = dbSyncDelete(db, key);
    }

    if (deleted) notifyKeyspaceEvent(NOTIFY_GENERIC, "del", key, db->id);
    return deleted;
}

// Check whether a key is expired right now
int keyIsExpired(redisDb *db, robj *key) {
    mstime_t when = getExpire(db, key);
    mstime_t now;

    if (when < 0) return 0;  // Key has no expiry set

    // During RDB/AOF load: never expire (avoid deleting historical data)
    if (server.loading) return 0;

    // During Lua script execution: use the script's start time
    // (guarantees temporal consistency within a script)
    if (server.script_caller)
        now = server.lua_time_start;
    else if (server.fixed_time_expire > 0)
        now = server.mstime;  // In MULTI/EXEC, use a fixed time for consistency
    else
        now = mstime();

    return now > when;
}

The weakness of lazy expiration:


25.3 Active Expiration: activeExpireCycle

activeExpireCycle is the core function that proactively scans and deletes expired keys, driven by serverCron:

// expire.c
#define ACTIVE_EXPIRE_CYCLE_SLOW   0  // Slow mode (in serverCron, up to 25% CPU)
#define ACTIVE_EXPIRE_CYCLE_FAST   1  // Fast mode (in beforeSleep, 1ms time limit)

void activeExpireCycle(int type) {
    // Tunable parameters
    unsigned long
        effort = server.active_expire_effort - 1,  /* 0–9, default 0 */
        config_keys_per_loop = ACTIVE_EXPIRE_CYCLE_KEYS_PER_LOOP +    /* 20 base */
                               ACTIVE_EXPIRE_CYCLE_KEYS_PER_LOOP/4 * effort,
        config_cycle_fast_duration = ACTIVE_EXPIRE_CYCLE_FAST_DURATION +  /* 1000µs */
                                     ACTIVE_EXPIRE_CYCLE_FAST_DURATION/4 * effort,
        config_cycle_acceptable_stale = ACTIVE_EXPIRE_CYCLE_ACCEPTABLE_STALE; /* 10% */

    long long start = ustime(), timelimit, elapsed;

    // Compute the time budget for this invocation
    if (type == ACTIVE_EXPIRE_CYCLE_FAST) {
        timelimit = config_cycle_fast_duration;  // Hard limit: ~1ms
    } else {
        // Slow mode: consume at most ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC% of CPU
        // Default 25%, so at hz=10: max 250ms per second spent on expiration
        timelimit = 1000000 * ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC / server.hz / 100;
    }

    // Iterate over all databases
    for (j = 0; j < dbs_per_call && timelimit_exit == 0; j++) {
        redisDb *db = server.db + (current_db % server.dbnum);
        current_db++;

        do {
            if (dictSize(db->expires) == 0) break;

            unsigned long num   = dictSize(db->expires);
            unsigned long slots = dictSlots(db->expires);

            // Skip sparsely populated expires dicts
            if (num && slots > DICT_HT_INITIAL_SIZE &&
                (num * 100 / slots < 1)) break;

            expired = 0;
            sampled = 0;
            long long ttl_sum = 0;
            int ttl_samples = 0;

            // Sample up to config_keys_per_loop (default 20) random keys
            if (num > config_keys_per_loop)
                num = config_keys_per_loop;

            while (sampled < num) {
                dictEntry *de;
                long long ttl;

                if ((de = dictGetRandomKey(db->expires)) == NULL) break;
                ttl = dictGetSignedIntegerVal(de) - now;

                if (activeExpireCycleTryExpire(db, de, now)) expired++;

                if (ttl > 0) {
                    ttl_sum += ttl;
                    ttl_samples++;
                }
                sampled++;
            }

            // Update average TTL statistic
            if (ttl_samples) db->avg_ttl = ttl_sum / ttl_samples;

            // If the expired ratio exceeds acceptable_stale (10%), keep scanning
        } while (sampled == 0 ||
                 (expired * 100 / sampled) > config_cycle_acceptable_stale);

        // Check time budget
        elapsed = ustime() - start;
        if (elapsed > timelimit) {
            timelimit_exit = 1;
            server.stat_expire_cycle_time_used += elapsed;
            break;
        }
    }
}

activeExpireCycleTryExpire: The Actual Deletion

static int activeExpireCycleTryExpire(redisDb *db, dictEntry *de, long long now) {
    long long t = dictGetSignedIntegerVal(de);
    if (now > t) {
        sds key = dictGetKey(de);
        robj *keyobj = createStringObject(key, sdslen(key));

        propagateExpire(db, keyobj, server.lazyfree_lazy_expire);
        if (server.lazyfree_lazy_expire) {
            dbAsyncDelete(db, keyobj);
        } else {
            dbSyncDelete(db, keyobj);
        }
        notifyKeyspaceEvent(NOTIFY_GENERIC, "expired", keyobj, db->id);
        decrRefCount(keyobj);
        server.stat_expiredkeys++;
        return 1;
    }
    return 0;
}

Impact of hz on Expiration Precision

hz 10          # Default: serverCron runs 10×/sec = minimum 100ms expiration lag
hz 100         # High precision: runs 100×/sec, lag drops to ~10ms, +1–3% CPU
dynamic-hz yes # Automatically increases hz when there are many clients (up to 500Hz)

Real measured data:


25.4 Expiration Handling on Replicas

Replica expiration handling has an important special case:

// Replicas do NOT proactively delete expired keys.
// They only delete a key when they receive a DEL/UNLINK command from the master.

// In expireIfNeeded on a replica:
if (server.masterhost != NULL) {
    // Report the key as expired to callers (return NULL/empty)
    // but do NOT actually delete it — wait for master's DEL
    if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return 1;
}

Practical implications:


25.5 Memory Eviction: When It Triggers

When used_memory reaches maxmemory, eviction is triggered before every write command:

// server.c — processCommand
int processCommand(client *c) {
    // ...
    if (server.maxmemory && !server.loading) {
        int out_of_memory = (performEvictions() == EVICT_FAIL);
        // If eviction fails (noeviction policy or no eligible keys):
        // return OOM error to the client
        if (out_of_memory && isCommandDeniedInOOM(c)) {
            addReply(c, shared.oomerr);
            return C_OK;
        }
    }
    // Proceed with command execution...
}

The OOM error response:

-OOM command not allowed when used memory > 'maxmemory'.

25.6 All 8 Memory Eviction Policies

Policy Overview

Policy Eviction Scope Algorithm Best Use Case
noeviction None (reject writes) Database-style apps where data loss is unacceptable
allkeys-lru All keys Approximate LRU General-purpose caching (recommended default)
allkeys-lfu All keys LFU Morris counter Caches with highly non-uniform hot-spot access
allkeys-random All keys Random Uniform access distribution, hot-spot irrelevant
volatile-lru Keys with TTL only Approximate LRU Mixing cached data (with TTL) and persistent data (no TTL)
volatile-lfu Keys with TTL only LFU Morris counter Same as volatile-lru, but hot-spots are concentrated
volatile-random Keys with TTL only Random Mixed storage, uniform access pattern
volatile-ttl Keys with TTL only Smallest TTL first Prefer evicting keys closest to expiry

performEvictions Core Logic

// evict.c
int performEvictions(void) {
    if (!isSafeToPerformEvictions()) return EVICT_OK;

    // Check if we actually need to evict
    size_t mem_reported, mem_tofree;
    if (getMaxmemoryState(&mem_reported, NULL, &mem_tofree, NULL) == C_OK)
        return EVICT_OK;  // Memory is within budget — no action needed

    int keys_freed = 0;
    while (mem_tofree > 0) {
        bestkey = NULL;

        for (int j = 0; j < server.dbnum; j++) {
            redisDb *db = server.db + j;
            dict *dict;

            // Select the candidate dictionary based on policy scope
            if (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) {
                dict = db->dict;    // Consider all keys
            } else {
                dict = db->expires; // Consider only keys with TTLs
            }
            if (dictSize(dict) == 0) continue;

            if (server.maxmemory_policy & MAXMEMORY_FLAG_LRU ||
                server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
                // Populate the eviction candidate pool
                evictionPoolPopulate(j, dict, db->dict, pool);
            } else if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM ||
                       server.maxmemory_policy == MAXMEMORY_VOLATILE_RANDOM) {
                dictEntry *de = dictGetRandomKey(dict);
                bestkey = dictGetKey(de);
                bestdbid = j;
            } else if (server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) {
                // Sample maxmemory_samples keys, pick the one with smallest TTL
                // (handled inside evictionPoolPopulate with ULLONG_MAX - TTL scoring)
                evictionPoolPopulate(j, dict, db->dict, pool);
            }
        }

        // Extract the best candidate from the pool
        if (bestkey == NULL && pool[EVPOOL_SIZE-1].key != NULL) {
            // Pool entry with highest idle score = best eviction candidate
            bestkey  = pool[EVPOOL_SIZE-1].key;
            bestdbid = pool[EVPOOL_SIZE-1].dbid;
        }

        if (bestkey) {
            robj *keyobj = createStringObject(bestkey, sdslen(bestkey));
            propagateExpire(db, keyobj, server.lazyfree_lazy_eviction);
            if (server.lazyfree_lazy_eviction) {
                dbAsyncDelete(db, keyobj);
            } else {
                dbSyncDelete(db, keyobj);
            }
            server.stat_evictedkeys++;
            mem_tofree -= estimatedSize;
            keys_freed++;
            decrRefCount(keyobj);
        }

        if (mem_tofree <= 0 || bestkey == NULL) break;
    }

    return keys_freed > 0 ? EVICT_OK : EVICT_FAIL;
}

25.7 Approximate LRU: Redis's Unique Design

Why Not Exact LRU?

A textbook LRU requires a doubly-linked list ordered by access time. Every access must reorder the list:

Per-key exact LRU list node: 2 pointers = 16 bytes
1 million keys: 16MB of additional overhead
Worse: pointer updates cause cache misses throughout the keyspace

Redis's Approximate LRU Solution

// Every redisObject has a 24-bit lru field
typedef struct redisObject {
    unsigned type:4;
    unsigned encoding:4;
    unsigned lru:24;   // LRU clock in seconds (low 24 bits, wraps after ~194 days)
    int refcount;
    void *ptr;
} robj;

// LRU clock: updated every serverCron tick
#define LRU_CLOCK_MAX ((1<<24)-1)      // 16,777,215 seconds
#define LRU_CLOCK_RESOLUTION 1000      // 1-second precision

unsigned int getLRUClock(void) {
    return (mstime()/LRU_CLOCK_RESOLUTION) & LRU_CLOCK_MAX;
}

// Estimate how long a key has been idle (in milliseconds)
unsigned long long estimateObjectIdleTime(robj *o) {
    unsigned long long lruclock = LRU_CLOCK();
    if (lruclock >= o->lru) {
        return (lruclock - o->lru) * LRU_CLOCK_RESOLUTION;
    } else {
        // Handle 24-bit clock wraparound (after ~194 days)
        return (lruclock + (LRU_CLOCK_MAX - o->lru)) * LRU_CLOCK_RESOLUTION;
    }
}

evictionPoolPopulate: The Sampling Core

// evict.c
// The eviction pool holds up to EVPOOL_SIZE (16) candidates, sorted by idle score
void evictionPoolPopulate(int dbid, dict *sampledict, dict *keydict,
                          struct evictionPoolEntry *pool) {
    int j, k, count;
    dictEntry *samples[server.maxmemory_samples];

    // Randomly sample maxmemory_samples (default: 5) keys from the dict
    count = dictGetSomeKeys(sampledict, samples, server.maxmemory_samples);

    for (j = 0; j < count; j++) {
        unsigned long long idle;
        sds key;
        robj *o;
        dictEntry *de;

        de  = samples[j];
        key = dictGetKey(de);

        // Look up the actual value in keydict (sampledict may be expires)
        if (sampledict != keydict) de = dictFind(keydict, key);
        o = dictGetVal(de);

        // Compute the "idle score" — higher = better eviction candidate
        if (server.maxmemory_policy & MAXMEMORY_FLAG_LRU) {
            idle = estimateObjectIdleTime(o);       // Seconds since last access
        } else if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
            // Lower frequency → higher idle score → evict first
            idle = 255 - LFUDecrAndReturn(o);
        } else if (server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) {
            // Smaller TTL → higher idle score → evict first
            idle = ULLONG_MAX - (long)dictGetVal(de);
        }

        // Insert into pool maintaining ascending idle order
        // Pool entry at index EVPOOL_SIZE-1 has the highest idle score
        k = 0;
        while (k < EVPOOL_SIZE && pool[k].key &&
               pool[k].idle < idle) k++;
        if (k == 0 && pool[EVPOOL_SIZE-1].key != NULL) continue;
        // Shift and insert...
    }
}

Accuracy vs. exact LRU:


25.8 LFU Morris Counter: Redis 4.0 Innovation

The Problem with Naive Counters

A pure access counter causes:

Redis's LFU Implementation

Redis reuses the 24-bit redisObject.lru field as an LFU counter:

// LFU field layout (24 bits):
// High 16 bits = last access time in minutes (16-bit, wraps after ~45 days)
// Low  8 bits  = Morris counter (0–255)

#define LFU_INIT_VAL 5  // Initial counter value for new keys

// Get the current LFU frequency (with time-based decay applied)
uint8_t LFUDecrAndReturn(robj *o) {
    unsigned long ldt     = o->lru >> 8;       // High 16 bits: last access minutes
    unsigned long counter = o->lru & 255;      // Low 8 bits: Morris counter

    // Calculate how many decay periods have elapsed
    unsigned long num_periods = LFUTimeElapsed(ldt) / server.lfu_decay_time;
    if (num_periods) {
        counter = (num_periods > counter) ? 0 : counter - num_periods;
    }
    return counter;
}

// How many minutes have elapsed since last access?
unsigned long LFUTimeElapsed(unsigned long ldt) {
    unsigned long now = LFUGetTimeInMinutes();
    if (now >= ldt) return now - ldt;
    return 65535 - ldt + now;  // Handle 16-bit wraparound (~45 days)
}

// Morris counter increment — probabilistic to prevent overflow
uint8_t LFULogIncr(uint8_t counter) {
    if (counter == 255) return 255;  // Already at maximum
    double r = (double)rand() / RAND_MAX;
    // Probability of incrementing = 1 / (counter * lfu_log_factor + 1)
    // Higher counter → lower probability → prevents counter from growing unboundedly
    double p = 1.0 / (counter * server.lfu_log_factor + 1);
    if (r < p) counter++;
    return counter;
}

Updating LFU on Access

// object.c — called after lookupKey
void updateLFU(robj *val) {
    unsigned long counter = LFUDecrAndReturn(val);    // Apply decay
    counter = LFULogIncr(counter);                     // Probabilistically increment
    val->lru = (LFUGetTimeInMinutes() << 8) | counter; // Write back: timestamp + counter
}

LFU Configuration

# redis.conf

# Decay rate: counter decrements by 1 every lfu-decay-time minutes (default: 1)
lfu-decay-time 1

# Log factor for the Morris counter (default: 10)
# Higher factor → slower growth → more accesses needed to reach high counter values
# Lower factor  → faster growth → counter saturates at 255 more quickly
lfu-log-factor 10

Access counts required to reach a given counter value (by lfu-log-factor):

lfu-log-factor Accesses to reach counter=10 Accesses to reach counter=100 Accesses to reach counter=200
0 1 ~10 ~100
1 10 ~1,000 ~100,000
10 1,000 ~100,000 ~10,000,000
100 10,000 ~10,000,000 ~1,000,000,000

With lfu-log-factor 10 (default): roughly 10,000 accesses are needed to reach counter=10, which is appropriate for distinguishing real hot-spots in production.

LFU vs. LRU: When to Choose Which

Scenario Better Policy Reason
Keys accessed uniformly or randomly LRU LFU offers no advantage
Small set of keys accessed very frequently LFU LFU keeps true hot-spots in cache longer
Workload with periodic scanning patterns LFU LRU would evict real hot-spots after a scan
Recently-accessed keys are most important LRU LRU naturally biases toward recency
Access pattern follows power-law (Zipfian) LFU LFU dramatically improves hit rate

25.9 Eviction Policy Selection Guide

Recommendations by Workload

# Scenario 1: Pure cache (all keys are evictable)
maxmemory-policy allkeys-lru
# Automatically evicts least recently used keys
# No need to set TTLs on keys

# Scenario 2: Cache with highly non-uniform hot-spots
maxmemory-policy allkeys-lfu
# LFU more accurately identifies truly hot keys
# Particularly effective when a small fraction of keys account for most hits

# Scenario 3: Mixed cache + persistent data
maxmemory-policy volatile-lru    # or volatile-lfu
# Cache keys: set TTL → eligible for eviction
# Persistent keys: no TTL → never evicted
# This protects persistent data while still allowing cache entries to be recycled

# Scenario 4: Database-style application (no data loss acceptable)
maxmemory-policy noeviction
# New writes fail with OOM error when memory is full
# Application must handle OOM gracefully (retry, circuit-break, etc.)

# Scenario 5: Short-lived temporary data (sessions, verification codes)
maxmemory-policy volatile-ttl
# Preferentially evicts keys closest to expiring
# Keeps more recently-created (longer-lived) entries in memory longer

maxmemory Sizing Guidelines

# Set to 70–80% of physical RAM (reserve for OS, fork COW, page cache)
maxmemory 6gb          # On an 8GB machine

# In cluster mode: set per shard (per master node)
# Recommended: total_ram * 0.75 / master_shard_count

# In master-replica mode: set maxmemory only on masters
# Replicas should have equal or larger memory (they hold output buffers too)
# Replica output buffers are not counted against maxmemory

# Check current memory status
redis-cli INFO memory | grep "used_memory:"
redis-cli INFO memory | grep "maxmemory:"
redis-cli INFO memory | grep "maxmemory_human:"

25.10 Monitoring and Tuning

Expiration Monitoring

# Cumulative expired key count
redis-cli INFO stats | grep expired_keys
# expired_keys:12345678

# Real-time expiration rate (using redis-cli --stat)
redis-cli --stat | awk '{print $6}'  # expired column

# Per-db expiration stats
redis-cli INFO keyspace
# db0:keys=1000000,expires=500000,avg_ttl=3600000

Eviction Monitoring

# Cumulative evicted key count
redis-cli INFO stats | grep evicted_keys
# evicted_keys:0  (non-zero means you're hitting maxmemory regularly)

# If evicted_keys is increasing: either raise maxmemory or optimize data usage

# Monitor eviction in real time
redis-cli MONITOR | grep '"DEL"\|"UNLINK"'

# How much memory would be freed if we evicted N keys?
redis-cli MEMORY USAGE keyname  # Estimate for individual keys

Tuning Parameters

# Increase expiration scan frequency (default: 10, can go up to 100)
hz 100
dynamic-hz yes

# Increase expiration scan aggressiveness (0–9, default: 0)
active-expire-effort 5

# Increase LRU/LFU sampling count for better eviction decisions
# Default: 5, doubling to 10 gives ~95% accuracy at ~2× CPU cost per eviction
maxmemory-samples 10

# Enable async freeing for eviction and expiration
lazyfree-lazy-expire yes
lazyfree-lazy-eviction yes

Prometheus Alert Rules (using redis_exporter)

# Memory utilization alerts
- alert: RedisMemoryUsageHigh
  expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.85
  severity: warning

- alert: RedisMemoryUsageCritical
  expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.95
  severity: critical

# Active eviction alerts (evictions should not occur in normal operation)
- alert: RedisEvictingKeys
  expr: rate(redis_evicted_keys_total[5m]) > 0
  severity: warning
  annotations:
    summary: "Redis is evicting keys — maxmemory may be too low"

# Fragmentation alerts
- alert: RedisHighFragmentation
  expr: redis_mem_fragmentation_ratio > 1.5
  severity: warning

# Lazyfree backlog alert
- alert: RedisLazyfreePending
  expr: redis_lazyfree_pending_objects > 10000
  severity: warning
  annotations:
    summary: "Large lazyfree backlog — many large objects pending async deletion"

Chapter Summary

Rate this chapter
4.7  / 5  (5 ratings)

💬 Comments