外观
PostgreSQL 冗余索引识别与清理
冗余索引的概念与危害
什么是冗余索引
冗余索引是指在数据库表中存在的、对查询性能没有贡献或贡献极少,但会影响DML操作性能的索引。常见的冗余索引类型包括:
- 完全冗余索引:一个索引的列完全包含在另一个索引中,且顺序相同
- 前缀冗余索引:一个索引是另一个索引的前缀列
- 功能冗余索引:两个索引在查询中提供类似的功能
冗余索引的危害
- 增加存储空间:冗余索引会占用额外的磁盘空间
- 降低写入性能:每次INSERT/UPDATE/DELETE操作都需要更新所有相关索引
- 增加维护成本:索引需要定期维护(VACUUM、REINDEX等)
- 影响查询优化器:过多的索引会使查询优化器难以选择最优索引
- 增加备份时间:备份时需要处理更多的数据
冗余索引识别方法
1. 使用系统视图识别
sql
-- 查看表的所有索引
SELECT
tablename,
indexname,
indexdef
FROM
pg_indexes
WHERE
schemaname = 'public'
ORDER BY
tablename, indexname;
-- 识别冗余索引(基于列前缀)
WITH index_columns AS (
SELECT
i.relname AS index_name,
t.relname AS table_name,
array_agg(a.attname ORDER BY x.idx) AS columns
FROM
pg_index ix
JOIN
pg_class i ON i.oid = ix.indexrelid
JOIN
pg_class t ON t.oid = ix.indrelid
JOIN
pg_namespace n ON n.oid = t.relnamespace
CROSS JOIN LATERAL (
SELECT * FROM unnest(ix.indkey) WITH ORDINALITY AS x(attnum, idx)
) x
JOIN
pg_attribute a ON a.attrelid = t.oid AND a.attnum = x.attnum
WHERE
n.nspname = 'public'
GROUP BY
i.relname, t.relname
)
SELECT
ic1.table_name,
ic1.index_name AS redundant_index,
ic2.index_name AS covering_index,
ic1.columns AS redundant_columns,
ic2.columns AS covering_columns
FROM
index_columns ic1
JOIN
index_columns ic2 ON ic1.table_name = ic2.table_name
AND ic1.index_name != ic2.index_name
AND ic1.columns <@ ic2.columns
AND array_length(ic1.columns, 1) < array_length(ic2.columns, 1);2. 使用pg_stat_user_indexes分析索引使用情况
sql
-- 查看索引使用统计信息
SELECT
schemaname,
relname AS table_name,
indexrelname AS index_name,
idx_scan AS scan_count,
idx_tup_read AS tuples_read,
idx_tup_fetch AS tuples_fetched,
pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM
pg_stat_user_indexes
ORDER BY
schemaname, relname, indexrelname;
-- 查找从未使用的索引
SELECT
schemaname,
relname AS table_name,
indexrelname AS index_name,
pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM
pg_stat_user_indexes
WHERE
idx_scan = 0
ORDER BY
pg_relation_size(indexrelid) DESC;
-- 查找使用频率低的索引
SELECT
schemaname,
relname AS table_name,
indexrelname AS index_name,
idx_scan AS scan_count,
pg_size_pretty(pg_relation_size(indexrelid)) AS index_size,
idx_scan / GREATEST(1, (SELECT EXTRACT(EPOCH FROM (NOW() - stats_reset))/86400 FROM pg_stat_user_tables WHERE relid = pg_stat_user_indexes.relid)) AS scans_per_day
FROM
pg_stat_user_indexes
ORDER BY
scans_per_day ASC NULLS FIRST,
pg_relation_size(indexrelid) DESC;3. 使用pg_stattuple查看索引统计信息
sql
-- 安装pg_stattuple扩展(如果未安装)
CREATE EXTENSION IF NOT EXISTS pg_stattuple;
-- 查看索引的详细统计信息
SELECT
schemaname,
tablename,
indexname,
pg_size_pretty(size) AS index_size,
tuples,
dead_tuples,
n_distinct,
correlation
FROM
pg_stat_user_indexes i
JOIN
pg_indexes ix ON i.schemaname = ix.schemaname AND i.relname = ix.tablename AND i.indexrelname = ix.indexname
CROSS JOIN LATERAL
pg_stattuple(i.indexrelid);冗余索引清理策略
1. 清理前的准备工作
sql
-- 1. 备份数据库(可选,但建议在生产环境执行)
-- pg_dump -h localhost -U postgres -d your_database -f backup_before_index_cleanup.sql
-- 2. 收集索引使用统计信息(如果统计信息较旧)
ANALYZE VERBOSE;
-- 3. 记录当前索引状态
CREATE TABLE index_backup AS
SELECT
schemaname,
tablename,
indexname,
indexdef,
pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM
pg_indexes
JOIN
pg_class ON pg_class.relname = pg_indexes.indexname
WHERE
schemaname = 'public';2. 安全的索引清理方法
sql
-- 方法1:使用DROP INDEX CONCURRENTLY(推荐)
-- 这种方法不会阻塞表的读写操作
DROP INDEX CONCURRENTLY IF EXISTS idx_table_column;
-- 方法2:先重命名索引,观察一段时间后再删除
ALTER INDEX idx_table_column RENAME TO idx_table_column_old;
-- 观察一段时间(如1-2周),确认没有问题后删除
DROP INDEX IF EXISTS idx_table_column_old;
-- 方法3:批量清理从未使用的小索引
DO $$
DECLARE
rec RECORD;
BEGIN
FOR rec IN
SELECT
schemaname,
indexrelname AS index_name
FROM
pg_stat_user_indexes
WHERE
idx_scan = 0
AND pg_relation_size(indexrelid) < 1024 * 1024 * 10 -- 小于10MB
AND schemaname = 'public'
LOOP
EXECUTE format('DROP INDEX CONCURRENTLY %I.%I', rec.schemaname, rec.index_name);
RAISE NOTICE 'Dropped unused index: %.%', rec.schemaname, rec.index_name;
END LOOP;
END;
$$;3. 清理后的验证
sql
-- 1. 验证索引已被删除
SELECT
indexname
FROM
pg_indexes
WHERE
schemaname = 'public'
AND indexname = 'idx_table_column';
-- 2. 检查查询性能是否受到影响
-- 使用EXPLAIN ANALYZE查看查询计划
EXPLAIN ANALYZE SELECT * FROM your_table WHERE column = 'value';
-- 3. 监控写入性能变化
-- 可以通过pg_stat_user_tables查看写入统计
SELECT
relname,
n_tup_ins,
n_tup_upd,
n_tup_del,
last_vacuum,
last_autovacuum
FROM
pg_stat_user_tables
WHERE
schemaname = 'public';冗余索引预防措施
1. 建立索引设计规范
- 只创建必要的索引:根据实际查询需求创建索引
- 优先考虑复合索引:合理设计复合索引,覆盖多个查询场景
- 遵循索引前缀原则:如果经常查询(a, b),则不需要单独为a创建索引
- 定期审查索引:每季度或半年审查一次索引使用情况
2. 使用工具辅助索引设计
sql
-- 使用pg_stat_statements查看慢查询
-- 安装pg_stat_statements扩展
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
-- 查看慢查询及其使用的索引
SELECT
queryid,
query,
calls,
total_exec_time,
mean_exec_time,
rows
FROM
pg_stat_statements
ORDER BY
mean_exec_time DESC
LIMIT 20;
-- 使用EXPLAIN查看查询计划,确认索引使用情况
EXPLAIN SELECT * FROM your_table WHERE column1 = 'value' AND column2 > 100;3. 监控索引使用情况
sql
-- 创建索引使用监控视图
CREATE OR REPLACE VIEW index_usage_monitor AS
SELECT
s.schemaname,
s.relname AS table_name,
s.indexrelname AS index_name,
s.idx_scan AS scan_count,
s.idx_tup_read AS tuples_read,
s.idx_tup_fetch AS tuples_fetched,
pg_size_pretty(pg_relation_size(s.indexrelid)) AS index_size,
COALESCE(s.idx_scan / NULLIF((EXTRACT(EPOCH FROM (NOW() - t.stats_reset))/86400), 0), 0) AS scans_per_day,
t.last_autoanalyze
FROM
pg_stat_user_indexes s
JOIN
pg_stat_user_tables t ON s.schemaname = t.schemaname AND s.relname = t.relname
ORDER BY
s.schemaname, s.relname, s.indexrelname;
-- 查看监控视图
SELECT * FROM index_usage_monitor;冗余索引清理最佳实践
1. 生产环境清理建议
- 选择合适的时间窗口:在业务低峰期进行索引清理
- 分批清理:不要一次性清理大量索引,建议每次清理5-10个索引
- 使用CONCURRENTLY选项:避免阻塞业务操作
- 监控系统性能:清理过程中监控CPU、内存、I/O等指标
- 准备回滚计划:如果出现问题,能够快速恢复
2. 常见场景的处理
| 场景 | 处理建议 |
|---|---|
| 完全冗余索引 | 直接删除冗余索引 |
| 前缀冗余索引 | 检查是否有查询只使用前缀列,如果没有则删除 |
| 低频使用索引 | 观察1-2周,确认使用率低后删除 |
| 大表上的冗余索引 | 优先使用CONCURRENTLY选项,或在维护窗口删除 |
| 主键相关的冗余索引 | 谨慎处理,确保不会影响外键约束 |
3. 自动化清理脚本
sql
-- 创建自动清理冗余索引的函数
CREATE OR REPLACE FUNCTION cleanup_redundant_indexes()
RETURNS TABLE(schemaname TEXT, table_name TEXT, index_name TEXT, action TEXT)
AS $$
DECLARE
rec RECORD;
BEGIN
-- 识别并清理冗余索引
FOR rec IN
WITH index_columns AS (
SELECT
i.relname AS index_name,
t.relname AS table_name,
n.nspname AS schemaname,
array_agg(a.attname ORDER BY x.idx) AS columns
FROM
pg_index ix
JOIN
pg_class i ON i.oid = ix.indexrelid
JOIN
pg_class t ON t.oid = ix.indrelid
JOIN
pg_namespace n ON n.oid = t.relnamespace
CROSS JOIN LATERAL (
SELECT * FROM unnest(ix.indkey) WITH ORDINALITY AS x(attnum, idx)
) x
JOIN
pg_attribute a ON a.attrelid = t.oid AND a.attnum = x.attnum
WHERE
n.nspname = 'public'
GROUP BY
i.relname, t.relname, n.nspname
)
SELECT
ic1.schemaname,
ic1.table_name,
ic1.index_name AS redundant_index
FROM
index_columns ic1
JOIN
index_columns ic2 ON ic1.table_name = ic2.table_name
AND ic1.index_name != ic2.index_name
AND ic1.columns <@ ic2.columns
AND array_length(ic1.columns, 1) < array_length(ic2.columns, 1)
WHERE
NOT EXISTS (
SELECT 1 FROM pg_stat_user_indexes
WHERE schemaname = ic1.schemaname
AND relname = ic1.table_name
AND indexrelname = ic1.index_name
AND idx_scan > 0
)
LOOP
BEGIN
-- 使用CONCURRENTLY删除索引
EXECUTE format('DROP INDEX CONCURRENTLY %I.%I', rec.schemaname, rec.redundant_index);
-- 返回结果
schemaname := rec.schemaname;
table_name := rec.table_name;
index_name := rec.redundant_index;
action := 'Dropped';
RETURN NEXT;
EXCEPTION
WHEN OTHERS THEN
-- 记录错误,但继续处理其他索引
schemaname := rec.schemaname;
table_name := rec.table_name;
index_name := rec.redundant_index;
action := 'Error: ' || SQLERRM;
RETURN NEXT;
END;
END LOOP;
END;
$$ LANGUAGE plpgsql;
-- 执行自动清理
SELECT * FROM cleanup_redundant_indexes();常见问题(FAQ)
Q1:如何确定一个索引是否真的冗余?
A1:判断索引是否冗余需要考虑以下因素:
- 索引列是否被其他索引完全包含
- 索引是否被查询实际使用
- 索引的维护成本与查询收益比
- 是否有特殊场景需要使用该索引
建议结合索引使用统计信息和实际查询需求来判断。
Q2:DROP INDEX CONCURRENTLY和DROP INDEX有什么区别?
A2:
- DROP INDEX:会获取表的排他锁,阻塞所有对表的读写操作
- DROP INDEX CONCURRENTLY:不会获取排他锁,允许表的读写操作继续进行
- 使用CONCURRENTLY选项需要更多的时间和资源,但对业务影响更小
Q3:如何恢复误删的索引?
A3:
- 如果有索引备份,可以直接重建
- 从index_backup表中获取索引定义并重建
- 使用pg_dump备份的索引定义重建
Q4:清理冗余索引后,查询性能下降怎么办?
A4:
- 立即重建被删除的索引
- 检查查询计划,确认是否需要调整查询或创建新的索引
- 分析性能下降的原因,可能需要优化其他方面
Q5:多久应该检查一次冗余索引?
A5:建议:
- 开发环境:每月检查一次
- 测试环境:每季度检查一次
- 生产环境:每半年检查一次,或在业务高峰期前检查
对于频繁进行DML操作的表,建议更频繁地检查。
