外观
PostgreSQL 慢查询风暴处理
慢查询风暴诊断
1. 识别慢查询风暴
sql
-- 查看当前慢查询数量
SELECT
COUNT(*) AS slow_query_count,
state,
wait_event_type,
wait_event
FROM
pg_stat_activity
WHERE
state = 'active'
AND query_start < NOW() - INTERVAL '1 second'
GROUP BY
state, wait_event_type, wait_event
ORDER BY
slow_query_count DESC;
-- 查看执行时间最长的查询
SELECT
pid,
usename,
datname,
application_name,
client_addr,
state,
query_start,
now() - query_start AS duration,
substr(query, 1, 200) AS query_sample
FROM
pg_stat_activity
WHERE
state = 'active'
ORDER BY
duration DESC
LIMIT 10;
-- 查看慢查询的资源使用情况
SELECT
pid,
usename,
datname,
application_name,
client_addr,
state,
query_start,
now() - query_start AS duration,
wait_event_type,
wait_event,
substr(query, 1, 200) AS query_sample
FROM
pg_stat_activity
WHERE
state = 'active'
AND query_start < NOW() - INTERVAL '5 seconds';2. 分析慢查询原因
sql
-- 查看共享缓冲区命中率
SELECT
round(100.0 * sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read)), 2) AS hit_rate
FROM
pg_statio_user_tables;
-- 查看锁等待情况
SELECT
blocked_locks.pid AS blocked_pid,
blocked_activity.usename AS blocked_user,
blocking_locks.pid AS blocking_pid,
blocking_activity.usename AS blocking_user,
blocked_activity.query AS blocked_query,
blocking_activity.query AS blocking_query
FROM
pg_catalog.pg_locks blocked_locks
JOIN
pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid
JOIN
pg_catalog.pg_locks blocking_locks ON blocking_locks.locktype = blocked_locks.locktype
AND blocking_locks.DATABASE IS NOT DISTINCT FROM blocked_locks.DATABASE
AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
AND blocking_locks.pid != blocked_locks.pid
JOIN
pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
WHERE
NOT blocked_locks.GRANTED;
-- 查看索引使用情况
SELECT
schemaname,
relname AS table_name,
seq_scan,
idx_scan,
seq_tup_read,
idx_tup_fetch,
round(CASE
WHEN seq_scan + idx_scan = 0 THEN 0
ELSE 100.0 * idx_scan / (seq_scan + idx_scan)
END, 2) AS idx_scan_ratio
FROM
pg_stat_user_tables
ORDER BY
seq_scan DESC
LIMIT 20;慢查询风暴应急处理
1. 终止慢查询
sql
-- 终止单个慢查询
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE pid = <slow_query_pid>;
-- 终止所有执行时间超过5分钟的查询
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE
state = 'active'
AND now() - query_start > INTERVAL '5 minutes'
AND usename != 'postgres';
-- 终止来自特定IP的所有查询
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE client_addr = '192.168.1.100';
-- 终止特定应用的所有查询
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE application_name = 'problematic_app';2. 限制查询资源
sql
-- 启用查询超时
ALTER SYSTEM SET statement_timeout = '30s';
-- 限制每个用户的连接数
ALTER ROLE problematic_user CONNECTION LIMIT 10;
-- 限制全局连接数
ALTER SYSTEM SET max_connections = '200';
-- 限制工作内存
ALTER SYSTEM SET work_mem = '4MB';
-- 重新加载配置
SELECT pg_reload_conf();3. 启用慢查询日志
sql
-- 启用慢查询日志
ALTER SYSTEM SET log_min_duration_statement = '100ms';
ALTER SYSTEM SET log_statement = 'mod';
ALTER SYSTEM SET log_destination = 'csvlog';
ALTER SYSTEM SET logging_collector = 'on';
-- 重新加载配置
SELECT pg_reload_conf();慢查询风暴根本原因分析
1. 查询优化问题
sql
-- 分析慢查询执行计划
EXPLAIN ANALYZE
SELECT * FROM large_table WHERE unindexed_column = 'value';
-- 查看缺少索引的表
SELECT
schemaname,
relname AS table_name,
seq_scan,
seq_tup_read,
idx_scan,
idx_tup_fetch
FROM
pg_stat_user_tables
WHERE
seq_scan > 0
AND idx_scan = 0
ORDER BY
seq_tup_read DESC;
-- 查看最耗资源的查询(使用pg_stat_statements)
SELECT
queryid,
substr(query, 1, 200) AS query,
calls,
total_time / 1000 AS total_time_seconds,
mean_time / 1000 AS mean_time_seconds,
rows,
100.0 * shared_blks_hit / nullif(shared_blks_hit + shared_blks_read, 0) AS hit_percent
FROM
pg_stat_statements
ORDER BY
total_time DESC
LIMIT 10;2. 资源瓶颈分析
sql
-- 查看CPU使用率高的查询
SELECT
pid,
usename,
datname,
application_name,
client_addr,
state,
query_start,
now() - query_start AS duration,
substr(query, 1, 200) AS query_sample
FROM
pg_stat_activity
WHERE
state = 'active'
AND wait_event_type IS NULL;
-- 查看IO等待高的查询
SELECT
pid,
usename,
datname,
application_name,
client_addr,
state,
query_start,
now() - query_start AS duration,
wait_event_type,
wait_event,
substr(query, 1, 200) AS query_sample
FROM
pg_stat_activity
WHERE
wait_event_type = 'IO';
-- 查看锁等待高的查询
SELECT
pid,
usename,
datname,
application_name,
client_addr,
state,
query_start,
now() - query_start AS duration,
wait_event_type,
wait_event,
substr(query, 1, 200) AS query_sample
FROM
pg_stat_activity
WHERE
wait_event_type = 'Lock';慢查询风暴预防措施
1. 配置优化
sql
-- 设置合理的查询超时
ALTER SYSTEM SET statement_timeout = '30s';
-- 配置自动清理
ALTER SYSTEM SET autovacuum = 'on';
ALTER SYSTEM SET autovacuum_vacuum_scale_factor = '0.05';
ALTER SYSTEM SET autovacuum_analyze_scale_factor = '0.02';
-- 配置连接池相关参数
ALTER SYSTEM SET tcp_keepalives_idle = '60';
ALTER SYSTEM SET tcp_keepalives_interval = '10';
ALTER SYSTEM SET tcp_keepalives_count = '6';
-- 重新加载配置
SELECT pg_reload_conf();2. 监控与告警
yaml
# Prometheus 慢查询风暴告警规则
- alert: PostgreSQLSlowQueryStorm
expr: count(pg_stat_activity_pid{state="active"}) > 50 and sum(pg_stat_activity_pid{state="active"}) by (instance) > 50
for: 2m
labels:
severity: critical
annotations:
summary: "PostgreSQL 慢查询风暴"
description: "实例 {{ $labels.instance }} 存在 {{ $value }} 个活跃慢查询,可能发生慢查询风暴"
- alert: PostgreSQLQueryTimeout
expr: pg_stat_activity_count_by_state{state="active"} > 10 and pg_stat_activity_query_start < (time() - 30)
for: 1m
labels:
severity: warning
annotations:
summary: "PostgreSQL 慢查询超时"
description: "实例 {{ $labels.instance }} 存在超过30秒的慢查询,当前活跃查询数: {{ $value }}"3. 应用层优化
- 添加合适的索引:根据查询模式添加索引
- 优化SQL语句:避免全表扫描,优化WHERE子句
- 使用连接池:限制应用连接数
- 实现请求限流:对应用请求进行限流
- 优化事务设计:减少长事务,避免持有锁时间过长
慢查询风暴恢复步骤
1. 紧急处理
- 终止慢查询:使用
pg_terminate_backend终止慢查询 - 限制资源使用:设置查询超时、限制连接数
- 启用慢查询日志:便于后续分析
- 监控恢复情况:持续监控数据库状态
2. 根本原因分析
- 分析慢查询日志:找出导致风暴的查询
- 检查执行计划:分析查询性能问题
- 检查资源瓶颈:CPU、内存、磁盘IO等
- 检查锁等待:是否存在锁竞争
3. 长期优化
- 优化查询:重写慢查询,添加索引
- 调整配置:根据实际情况调整PostgreSQL参数
- 优化应用:改进应用代码,减少慢查询
- 加强监控:配置更完善的监控和告警
- 定期维护:定期分析慢查询,优化数据库
常见问题(FAQ)
Q1:如何快速识别慢查询风暴?
A1:可以通过以下指标快速识别:
- 活跃连接数突然增加
- 数据库CPU使用率飙升
- 查询执行时间显著增长
- 锁等待数量增加
- 共享缓冲区命中率下降
Q2:慢查询风暴发生时,应该先做什么?
A2:紧急处理步骤:
- 首先终止执行时间最长的慢查询
- 限制问题用户或应用的连接数
- 启用查询超时
- 启用慢查询日志便于后续分析
- 监控数据库恢复情况
Q3:如何防止慢查询风暴再次发生?
A3:预防措施:
- 配置合理的查询超时
- 限制每个用户和应用的连接数
- 定期分析慢查询日志
- 添加合适的索引
- 优化应用代码
- 配置完善的监控和告警
Q4:慢查询风暴会导致数据丢失吗?
A4:一般不会直接导致数据丢失,但可能会:
- 导致数据库响应缓慢,影响业务
- 导致连接数耗尽,新连接无法建立
- 可能导致数据库崩溃(在极端情况下)
- 可能导致事务超时,影响数据一致性
Q5:如何区分慢查询风暴和其他性能问题?
A5:区分方法:
- 慢查询风暴:大量类似的慢查询同时执行
- 锁竞争:查询等待锁资源,锁等待数量增加
- 资源瓶颈:CPU、内存、磁盘IO使用率高
- 配置问题:参数配置不合理导致性能下降
Q6:使用pg_stat_statements需要注意什么?
A6:使用pg_stat_statements的注意事项:
- 需要安装扩展:`CREATE EXTENSION pg_stat_statements;
- 会消耗一定的系统资源,建议根据实际情况调整配置
- 需要定期重置统计数据:`SELECT pg_stat_statements_reset();
- 可以通过
pg_stat_statements.max参数调整保存的查询数量 - 建议将慢查询日志与pg_stat_statements结合使用,获得更全面的性能数据
