Skip to content

PostgreSQL 冗余索引识别与清理

冗余索引的概念与危害

什么是冗余索引

冗余索引是指在数据库表中存在的、对查询性能没有贡献或贡献极少,但会影响DML操作性能的索引。常见的冗余索引类型包括:

  • 完全冗余索引:一个索引的列完全包含在另一个索引中,且顺序相同
  • 前缀冗余索引:一个索引是另一个索引的前缀列
  • 功能冗余索引:两个索引在查询中提供类似的功能

冗余索引的危害

  1. 增加存储空间:冗余索引会占用额外的磁盘空间
  2. 降低写入性能:每次INSERT/UPDATE/DELETE操作都需要更新所有相关索引
  3. 增加维护成本:索引需要定期维护(VACUUM、REINDEX等)
  4. 影响查询优化器:过多的索引会使查询优化器难以选择最优索引
  5. 增加备份时间:备份时需要处理更多的数据

冗余索引识别方法

1. 使用系统视图识别

sql
-- 查看表的所有索引
SELECT 
    tablename,
    indexname,
    indexdef
FROM 
    pg_indexes
WHERE 
    schemaname = 'public'
ORDER BY 
    tablename, indexname;

-- 识别冗余索引(基于列前缀)
WITH index_columns AS (
    SELECT 
        i.relname AS index_name,
        t.relname AS table_name,
        array_agg(a.attname ORDER BY x.idx) AS columns
    FROM 
        pg_index ix
    JOIN 
        pg_class i ON i.oid = ix.indexrelid
    JOIN 
        pg_class t ON t.oid = ix.indrelid
    JOIN 
        pg_namespace n ON n.oid = t.relnamespace
    CROSS JOIN LATERAL (
        SELECT * FROM unnest(ix.indkey) WITH ORDINALITY AS x(attnum, idx)
    ) x
    JOIN 
        pg_attribute a ON a.attrelid = t.oid AND a.attnum = x.attnum
    WHERE 
        n.nspname = 'public'
    GROUP BY 
        i.relname, t.relname
)
SELECT 
    ic1.table_name,
    ic1.index_name AS redundant_index,
    ic2.index_name AS covering_index,
    ic1.columns AS redundant_columns,
    ic2.columns AS covering_columns
FROM 
    index_columns ic1
JOIN 
    index_columns ic2 ON ic1.table_name = ic2.table_name
                    AND ic1.index_name != ic2.index_name
                    AND ic1.columns <@ ic2.columns
                    AND array_length(ic1.columns, 1) < array_length(ic2.columns, 1);

2. 使用pg_stat_user_indexes分析索引使用情况

sql
-- 查看索引使用统计信息
SELECT 
    schemaname,
    relname AS table_name,
    indexrelname AS index_name,
    idx_scan AS scan_count,
    idx_tup_read AS tuples_read,
    idx_tup_fetch AS tuples_fetched,
    pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM 
    pg_stat_user_indexes
ORDER BY 
    schemaname, relname, indexrelname;

-- 查找从未使用的索引
SELECT 
    schemaname,
    relname AS table_name,
    indexrelname AS index_name,
    pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM 
    pg_stat_user_indexes
WHERE 
    idx_scan = 0
ORDER BY 
    pg_relation_size(indexrelid) DESC;

-- 查找使用频率低的索引
SELECT 
    schemaname,
    relname AS table_name,
    indexrelname AS index_name,
    idx_scan AS scan_count,
    pg_size_pretty(pg_relation_size(indexrelid)) AS index_size,
    idx_scan / GREATEST(1, (SELECT EXTRACT(EPOCH FROM (NOW() - stats_reset))/86400 FROM pg_stat_user_tables WHERE relid = pg_stat_user_indexes.relid)) AS scans_per_day
FROM 
    pg_stat_user_indexes
ORDER BY 
    scans_per_day ASC NULLS FIRST,
    pg_relation_size(indexrelid) DESC;

3. 使用pg_stattuple查看索引统计信息

sql
-- 安装pg_stattuple扩展(如果未安装)
CREATE EXTENSION IF NOT EXISTS pg_stattuple;

-- 查看索引的详细统计信息
SELECT 
    schemaname,
    tablename,
    indexname,
    pg_size_pretty(size) AS index_size,
    tuples,
    dead_tuples,
    n_distinct,
    correlation
FROM 
    pg_stat_user_indexes i
JOIN 
    pg_indexes ix ON i.schemaname = ix.schemaname AND i.relname = ix.tablename AND i.indexrelname = ix.indexname
CROSS JOIN LATERAL 
    pg_stattuple(i.indexrelid);

冗余索引清理策略

1. 清理前的准备工作

sql
-- 1. 备份数据库(可选,但建议在生产环境执行)
-- pg_dump -h localhost -U postgres -d your_database -f backup_before_index_cleanup.sql

-- 2. 收集索引使用统计信息(如果统计信息较旧)
ANALYZE VERBOSE;

-- 3. 记录当前索引状态
CREATE TABLE index_backup AS
SELECT 
    schemaname,
    tablename,
    indexname,
    indexdef,
    pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM 
    pg_indexes
JOIN 
    pg_class ON pg_class.relname = pg_indexes.indexname
WHERE 
    schemaname = 'public';

2. 安全的索引清理方法

sql
-- 方法1:使用DROP INDEX CONCURRENTLY(推荐)
-- 这种方法不会阻塞表的读写操作
DROP INDEX CONCURRENTLY IF EXISTS idx_table_column;

-- 方法2:先重命名索引,观察一段时间后再删除
ALTER INDEX idx_table_column RENAME TO idx_table_column_old;
-- 观察一段时间(如1-2周),确认没有问题后删除
DROP INDEX IF EXISTS idx_table_column_old;

-- 方法3:批量清理从未使用的小索引
DO $$
DECLARE
    rec RECORD;
BEGIN
    FOR rec IN 
        SELECT 
            schemaname,
            indexrelname AS index_name
        FROM 
            pg_stat_user_indexes
        WHERE 
            idx_scan = 0
            AND pg_relation_size(indexrelid) < 1024 * 1024 * 10 -- 小于10MB
            AND schemaname = 'public'
    LOOP
        EXECUTE format('DROP INDEX CONCURRENTLY %I.%I', rec.schemaname, rec.index_name);
        RAISE NOTICE 'Dropped unused index: %.%', rec.schemaname, rec.index_name;
    END LOOP;
END;
$$;

3. 清理后的验证

sql
-- 1. 验证索引已被删除
SELECT 
    indexname
FROM 
    pg_indexes
WHERE 
    schemaname = 'public'
    AND indexname = 'idx_table_column';

-- 2. 检查查询性能是否受到影响
-- 使用EXPLAIN ANALYZE查看查询计划
EXPLAIN ANALYZE SELECT * FROM your_table WHERE column = 'value';

-- 3. 监控写入性能变化
-- 可以通过pg_stat_user_tables查看写入统计
SELECT 
    relname,
    n_tup_ins,
    n_tup_upd,
    n_tup_del,
    last_vacuum,
    last_autovacuum
FROM 
    pg_stat_user_tables
WHERE 
    schemaname = 'public';

冗余索引预防措施

1. 建立索引设计规范

  • 只创建必要的索引:根据实际查询需求创建索引
  • 优先考虑复合索引:合理设计复合索引,覆盖多个查询场景
  • 遵循索引前缀原则:如果经常查询(a, b),则不需要单独为a创建索引
  • 定期审查索引:每季度或半年审查一次索引使用情况

2. 使用工具辅助索引设计

sql
-- 使用pg_stat_statements查看慢查询
-- 安装pg_stat_statements扩展
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;

-- 查看慢查询及其使用的索引
SELECT 
    queryid,
    query,
    calls,
    total_exec_time,
    mean_exec_time,
    rows
FROM 
    pg_stat_statements
ORDER BY 
    mean_exec_time DESC
LIMIT 20;

-- 使用EXPLAIN查看查询计划,确认索引使用情况
EXPLAIN SELECT * FROM your_table WHERE column1 = 'value' AND column2 > 100;

3. 监控索引使用情况

sql
-- 创建索引使用监控视图
CREATE OR REPLACE VIEW index_usage_monitor AS
SELECT 
    s.schemaname,
    s.relname AS table_name,
    s.indexrelname AS index_name,
    s.idx_scan AS scan_count,
    s.idx_tup_read AS tuples_read,
    s.idx_tup_fetch AS tuples_fetched,
    pg_size_pretty(pg_relation_size(s.indexrelid)) AS index_size,
    COALESCE(s.idx_scan / NULLIF((EXTRACT(EPOCH FROM (NOW() - t.stats_reset))/86400), 0), 0) AS scans_per_day,
    t.last_autoanalyze
FROM 
    pg_stat_user_indexes s
JOIN 
    pg_stat_user_tables t ON s.schemaname = t.schemaname AND s.relname = t.relname
ORDER BY 
    s.schemaname, s.relname, s.indexrelname;

-- 查看监控视图
SELECT * FROM index_usage_monitor;

冗余索引清理最佳实践

1. 生产环境清理建议

  1. 选择合适的时间窗口:在业务低峰期进行索引清理
  2. 分批清理:不要一次性清理大量索引,建议每次清理5-10个索引
  3. 使用CONCURRENTLY选项:避免阻塞业务操作
  4. 监控系统性能:清理过程中监控CPU、内存、I/O等指标
  5. 准备回滚计划:如果出现问题,能够快速恢复

2. 常见场景的处理

场景处理建议
完全冗余索引直接删除冗余索引
前缀冗余索引检查是否有查询只使用前缀列,如果没有则删除
低频使用索引观察1-2周,确认使用率低后删除
大表上的冗余索引优先使用CONCURRENTLY选项,或在维护窗口删除
主键相关的冗余索引谨慎处理,确保不会影响外键约束

3. 自动化清理脚本

sql
-- 创建自动清理冗余索引的函数
CREATE OR REPLACE FUNCTION cleanup_redundant_indexes()
RETURNS TABLE(schemaname TEXT, table_name TEXT, index_name TEXT, action TEXT)
AS $$
DECLARE
    rec RECORD;
BEGIN
    -- 识别并清理冗余索引
    FOR rec IN 
        WITH index_columns AS (
            SELECT 
                i.relname AS index_name,
                t.relname AS table_name,
                n.nspname AS schemaname,
                array_agg(a.attname ORDER BY x.idx) AS columns
            FROM 
                pg_index ix
            JOIN 
                pg_class i ON i.oid = ix.indexrelid
            JOIN 
                pg_class t ON t.oid = ix.indrelid
            JOIN 
                pg_namespace n ON n.oid = t.relnamespace
            CROSS JOIN LATERAL (
                SELECT * FROM unnest(ix.indkey) WITH ORDINALITY AS x(attnum, idx)
            ) x
            JOIN 
                pg_attribute a ON a.attrelid = t.oid AND a.attnum = x.attnum
            WHERE 
                n.nspname = 'public'
            GROUP BY 
                i.relname, t.relname, n.nspname
        )
        SELECT 
            ic1.schemaname,
            ic1.table_name,
            ic1.index_name AS redundant_index
        FROM 
            index_columns ic1
        JOIN 
            index_columns ic2 ON ic1.table_name = ic2.table_name
                            AND ic1.index_name != ic2.index_name
                            AND ic1.columns <@ ic2.columns
                            AND array_length(ic1.columns, 1) < array_length(ic2.columns, 1)
        WHERE
            NOT EXISTS (
                SELECT 1 FROM pg_stat_user_indexes 
                WHERE schemaname = ic1.schemaname 
                AND relname = ic1.table_name 
                AND indexrelname = ic1.index_name 
                AND idx_scan > 0
            )
    LOOP
        BEGIN
            -- 使用CONCURRENTLY删除索引
            EXECUTE format('DROP INDEX CONCURRENTLY %I.%I', rec.schemaname, rec.redundant_index);
            
            -- 返回结果
            schemaname := rec.schemaname;
            table_name := rec.table_name;
            index_name := rec.redundant_index;
            action := 'Dropped';
            RETURN NEXT;
            
        EXCEPTION
            WHEN OTHERS THEN
                -- 记录错误,但继续处理其他索引
                schemaname := rec.schemaname;
                table_name := rec.table_name;
                index_name := rec.redundant_index;
                action := 'Error: ' || SQLERRM;
                RETURN NEXT;
        END;
    END LOOP;
END;
$$ LANGUAGE plpgsql;

-- 执行自动清理
SELECT * FROM cleanup_redundant_indexes();

常见问题(FAQ)

Q1:如何确定一个索引是否真的冗余?

A1:判断索引是否冗余需要考虑以下因素:

  • 索引列是否被其他索引完全包含
  • 索引是否被查询实际使用
  • 索引的维护成本与查询收益比
  • 是否有特殊场景需要使用该索引

建议结合索引使用统计信息和实际查询需求来判断。

Q2:DROP INDEX CONCURRENTLY和DROP INDEX有什么区别?

A2:

  • DROP INDEX:会获取表的排他锁,阻塞所有对表的读写操作
  • DROP INDEX CONCURRENTLY:不会获取排他锁,允许表的读写操作继续进行
  • 使用CONCURRENTLY选项需要更多的时间和资源,但对业务影响更小

Q3:如何恢复误删的索引?

A3:

  • 如果有索引备份,可以直接重建
  • 从index_backup表中获取索引定义并重建
  • 使用pg_dump备份的索引定义重建

Q4:清理冗余索引后,查询性能下降怎么办?

A4:

  1. 立即重建被删除的索引
  2. 检查查询计划,确认是否需要调整查询或创建新的索引
  3. 分析性能下降的原因,可能需要优化其他方面

Q5:多久应该检查一次冗余索引?

A5:建议:

  • 开发环境:每月检查一次
  • 测试环境:每季度检查一次
  • 生产环境:每半年检查一次,或在业务高峰期前检查

对于频繁进行DML操作的表,建议更频繁地检查。