Skip to content

KingBaseES 监控与诊断命令

KingBaseES提供了丰富的监控与诊断命令,用于监控数据库性能、诊断故障和查看系统状态。这些命令帮助DBA实时了解数据库运行情况,及时发现并解决问题。本文将详细介绍KingBaseES的监控与诊断命令。

系统状态监控命令

数据库实例状态

生产环境监控脚本示例

bash
#!/bin/bash
# KingBaseES 实例状态监控脚本

# 配置
KB_BIN=/opt/Kingbase/ES/V8/Server/bin
KB_DATA=/opt/Kingbase/ES/V8/data
LOG_FILE=/opt/Kingbase/logs/instance_status_$(date +%Y%m%d).log

# 日志函数
log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> $LOG_FILE
}

# 检查实例状态
log "开始检查数据库实例状态"

# 查看数据库实例状态
$KB_BIN/sys_ctl status -D $KB_DATA
if [ $? -eq 0 ]; then
    log "实例状态:正常运行"
    
    # 查看数据库版本
    VERSION=$($KB_BIN/ksql -U system -d test -t -c "SELECT version();")
    log "数据库版本:$VERSION"
    
    # 查看数据库运行时间
    UPTIME=$($KB_BIN/ksql -U system -d test -t -c "SELECT now() - pg_postmaster_start_time() AS uptime;")
    log "运行时间:$UPTIME"
else
    log "错误:实例状态异常"
    # 发送告警邮件
    # mail -s "KingBaseES Instance Status Alert" dba@example.com < $LOG_FILE
    exit 1
fi

log "实例状态检查完成"

连接状态监控

生产环境连接监控示例

bash
#!/bin/bash
# KingBaseES 连接状态监控脚本

KB_BIN=/opt/Kingbase/ES/V8/Server/bin
ALERT_THRESHOLD=150

# 查看当前连接数
CURRENT_CONNECTIONS=$($KB_BIN/ksql -U system -d test -t -c "SELECT count(*) FROM sys_stat_activity;")
MAX_CONNECTIONS=$($KB_BIN/ksql -U system -d test -t -c "SHOW max_connections;")

echo "当前连接数:$CURRENT_CONNECTIONS / $MAX_CONNECTIONS"

# 检查连接数是否超过阈值
if [ $CURRENT_CONNECTIONS -gt $ALERT_THRESHOLD ]; then
    echo "警告:连接数超过阈值 $ALERT_THRESHOLD"
    
    # 查看活跃连接详情
    echo "\n活跃连接详情:"
    $KB_BIN/ksql -U system -d test -c "SELECT datname, usename, client_addr, application_name, state, query_start, query FROM sys_stat_activity WHERE state = 'active' ORDER BY query_start;"
    
    # 查看连接来源分布
    echo "\n连接来源分布:"
    $KB_BIN/ksql -U system -d test -c "SELECT client_addr, count(*) FROM sys_stat_activity GROUP BY client_addr ORDER BY count(*) DESC;"
fi

资源使用情况

生产环境资源监控示例

bash
#!/bin/bash
# KingBaseES 资源使用情况监控脚本

KB_BIN=/opt/Kingbase/ES/V8/Server/bin

# 查看CPU使用情况
echo "CPU使用情况:"
$KB_BIN/ksql -U system -d test -c "SELECT cpu_usage_percent, user_cpu_usage_percent, system_cpu_usage_percent FROM sys_os_usage;"

# 查看内存使用情况
echo "\n内存使用情况:"
$KB_BIN/ksql -U system -d test -c "SELECT total_memory, used_memory, free_memory, round(used_memory::numeric/total_memory::numeric*100, 2) AS usage_percent FROM sys_memory_usage;"

# 查看磁盘I/O情况
echo "\n磁盘I/O情况:"
$KB_BIN/ksql -U system -d test -c "SELECT read_bytes_per_second, write_bytes_per_second, read_latency_ms, write_latency_ms FROM sys_disk_usage;"

# V8 R7 新增:查看系统负载
if $KB_BIN/ksql -U system -d test -c "SELECT * FROM sys_system_load;" > /dev/null 2>&1; then
    echo "\n系统负载:"
    $KB_BIN/ksql -U system -d test -c "SELECT load_average_1min, load_average_5min, load_average_15min FROM sys_system_load;"
fi

性能监控命令

查询性能监控

生产环境慢查询监控脚本示例

bash
#!/bin/bash
# KingBaseES 慢查询监控脚本

# 配置
KB_BIN=/opt/Kingbase/ES/V8/Server/bin
LOG_FILE=/opt/Kingbase/logs/slow_query_monitor_$(date +%Y%m%d).log
THRESHOLD=5 # 慢查询阈值(秒)

# 日志函数
log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> $LOG_FILE
}

log "开始检查慢查询"

# 查看当前正在执行的慢查询
echo "当前正在执行的慢查询:"
$KB_BIN/ksql -U system -d test -c "SELECT pid, datname, usename, client_addr, query_start, now() - query_start AS duration, query FROM sys_stat_activity WHERE state = 'active' AND now() - query_start > interval '1 second' ORDER BY duration DESC;"

# 使用sys_stat_statements查看历史慢查询
echo "\n历史慢查询统计(Top 10):"
$KB_BIN/ksql -U system -d test -c "SELECT queryid, calls, total_time, mean_time, rows, left(query, 100) AS query_sample FROM sys_stat_statements WHERE mean_time > $THRESHOLD ORDER BY total_time DESC LIMIT 10;"

# 检查慢查询日志配置
log "慢查询日志配置:"
$KB_BIN/ksql -U system -d test -c "SHOW slow_query_log; SHOW long_query_time;" >> $LOG_FILE

# V8 R7 增强:使用pg_stat_monitor查看慢查询
echo "\n使用pg_stat_monitor查看慢查询(V8 R7):"
if $KB_BIN/ksql -U system -d test -c "SELECT * FROM pg_stat_monitor LIMIT 1;" > /dev/null 2>&1; then
    $KB_BIN/ksql -U system -d test -c "SELECT queryid, calls, total_time, mean_time, rows, left(query, 100) AS query_sample FROM pg_stat_monitor WHERE mean_time > $THRESHOLD ORDER BY total_time DESC LIMIT 10;"
fi

索引使用情况

生产环境索引分析脚本示例

bash
#!/bin/bash
# KingBaseES 索引使用情况分析脚本

KB_BIN=/opt/Kingbase/ES/V8/Server/bin
REPORT_FILE=/opt/Kingbase/reports/index_usage_$(date +%Y%m%d).txt

# 创建报告
cat > $REPORT_FILE << EOF
KingBaseES 索引使用情况分析报告
生成时间:$(date '+%Y-%m-%d %H:%M:%S')
========================================
EOF

# 1. 查看索引使用统计
echo "\n1. 索引使用统计:" >> $REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT schemaname, relname, indexrelname, idx_scan, idx_tup_read, idx_tup_fetch FROM sys_stat_user_indexes ORDER BY idx_scan ASC;" >> $REPORT_FILE

# 2. 查看未使用的索引
echo "\n2. 未使用的索引:" >> $REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT schemaname, relname, indexrelname FROM sys_stat_user_indexes WHERE idx_scan = 0;" >> $REPORT_FILE

# 3. 查看低效索引(扫描次数多但命中率低)
echo "\n3. 低效索引(扫描次数多但命中率低):" >> $REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT schemaname, relname, indexrelname, idx_scan, idx_tup_read, idx_tup_fetch, round(idx_tup_fetch::numeric/nullif(idx_tup_read, 0)::numeric * 100, 2) AS hit_rate FROM sys_stat_user_indexes WHERE idx_scan > 100 AND round(idx_tup_fetch::numeric/nullif(idx_tup_read, 0)::numeric * 100, 2) < 10 ORDER BY hit_rate ASC;" >> $REPORT_FILE

# 4. 查看索引大小
echo "\n4. 索引大小统计(Top 20):" >> $REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT schemaname, tablename, indexname, size_mb FROM sys_indexes_size ORDER BY size_mb DESC LIMIT 20;" >> $REPORT_FILE

# 5. 查看表的索引覆盖情况
echo "\n5. 表的索引覆盖情况:" >> $REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT schemaname, relname, count(indexrelname) AS index_count FROM sys_stat_user_indexes GROUP BY schemaname, relname ORDER BY index_count DESC;" >> $REPORT_FILE

echo "\n索引分析报告已生成:$REPORT_FILE"
# 发送报告邮件
# mail -s "KingBaseES Index Usage Report" dba@example.com < $REPORT_FILE

表性能统计

生产环境表性能分析脚本示例

bash
#!/bin/bash
# KingBaseES 表性能统计脚本

KB_BIN=/opt/Kingbase/ES/V8/Server/bin

# 查看表访问统计
echo "表访问统计(按扫描次数排序):"
$KB_BIN/ksql -U system -d test -c "SELECT schemaname, relname, seq_scan, idx_scan, n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd FROM sys_stat_user_tables ORDER BY seq_scan + idx_scan DESC LIMIT 10;"

# 查看表大小
echo "\n表大小统计(Top 10):"
$KB_BIN/ksql -U system -d test -c "SELECT schemaname, tablename, size_mb, table_size_mb, index_size_mb FROM sys_tables_size ORDER BY size_mb DESC LIMIT 10;"

# 查看表膨胀情况
echo "\n表膨胀情况(Top 10):"
$KB_BIN/ksql -U system -d test -c "SELECT schemaname, tablename, bloat_ratio, bloat_size_mb FROM sys_table_bloat WHERE bloat_ratio > 100 ORDER BY bloat_ratio DESC LIMIT 10;"

# 查看更新频繁的表
echo "\n更新频繁的表(Top 10):"
$KB_BIN/ksql -U system -d test -c "SELECT schemaname, relname, n_tup_upd + n_tup_del AS modification_count FROM sys_stat_user_tables ORDER BY modification_count DESC LIMIT 10;"

# 查看热点表(读写频繁)
echo "\n热点表(读写频繁):"
$KB_BIN/ksql -U system -d test -c "SELECT schemaname, relname, (seq_scan + idx_scan + n_tup_ins + n_tup_upd + n_tup_del) AS total_activity FROM sys_stat_user_tables ORDER BY total_activity DESC LIMIT 10;"

存储监控命令

表空间监控

生产环境表空间监控脚本示例

bash
#!/bin/bash
# KingBaseES 表空间监控脚本

# 配置
KB_BIN=/opt/Kingbase/ES/V8/Server/bin
LOG_FILE=/opt/Kingbase/logs/tablespace_monitor_$(date +%Y%m%d).log
ALERT_THRESHOLD=80 # 表空间使用率告警阈值(%)

# 日志函数
log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> $LOG_FILE
}

log "开始检查表空间使用情况"

# 查看表空间使用情况
echo "表空间使用情况:"
TABLESPACE_INFO=$($KB_BIN/ksql -U system -d test -t -c "SELECT spcname, size_mb, used_mb, free_mb, round(used_mb::numeric/size_mb::numeric*100, 2) AS usage_percent FROM sys_tablespace_usage;" | grep -v "spcname")
echo "$TABLESPACE_INFO"

echo "$TABLESPACE_INFO" >> $LOG_FILE

# 检查是否有表空间使用率超过阈值
echo "\n检查超过阈值的表空间:"
while read -r line; do
    SPCNAME=$(echo $line | awk '{print $1}')
    USAGE_PERCENT=$(echo $line | awk '{print $5}')
    
    if (( $(echo "$USAGE_PERCENT > $ALERT_THRESHOLD" | bc -l) )); then
        echo "警告:表空间 $SPCNAME 使用率为 $USAGE_PERCENT%,已超过阈值 $ALERT_THRESHOLD%"
        log "警告:表空间 $SPCNAME 使用率为 $USAGE_PERCENT%,已超过阈值 $ALERT_THRESHOLD%"
        
        # 查看该表空间下的大表
echo "\n表空间 $SPCNAME 下的大表(Top 10):"
$KB_BIN/ksql -U system -d test -c "SELECT schemaname, tablename, size_mb FROM sys_tables_size WHERE tablespace = '$SPCNAME' ORDER BY size_mb DESC LIMIT 10;"
    fi
done <<< "$TABLESPACE_INFO"

# 查看表空间大小变化趋势(V8 R7)
echo "\n表空间大小变化趋势(V8 R7):"
if $KB_BIN/ksql -U system -d test -c "SELECT * FROM sys_tablespace_usage_history LIMIT 1;" > /dev/null 2>&1; then
    $KB_BIN/ksql -U system -d test -c "SELECT spcname, date_trunc('day', sample_time) AS day, avg(used_mb) AS avg_used_mb FROM sys_tablespace_usage_history WHERE sample_time > now() - interval '7 days' GROUP BY spcname, day ORDER BY spcname, day;"
fi

数据文件监控

生产环境数据文件监控脚本示例

bash
#!/bin/bash
# KingBaseES 数据文件监控脚本

KB_BIN=/opt/Kingbase/ES/V8/Server/bin

# 查看数据文件列表
echo "数据文件列表:"
$KB_BIN/ksql -U system -d test -c "SELECT tablespace_name, file_name, size_mb FROM sys_data_files ORDER BY size_mb DESC;"

# 查看数据文件大小统计
echo "\n数据文件大小统计:"
$KB_BIN/ksql -U system -d test -c "SELECT tablespace_name, count(*) AS file_count, sum(size_mb) AS total_size_mb, avg(size_mb) AS avg_size_mb FROM sys_data_files GROUP BY tablespace_name ORDER BY total_size_mb DESC;"

# 查看增长迅速的数据文件(V8 R7)
echo "\n增长迅速的数据文件(V8 R7):"
if $KB_BIN/ksql -U system -d test -c "SELECT * FROM sys_data_files_growth LIMIT 1;" > /dev/null 2>&1; then
    $KB_BIN/ksql -U system -d test -c "SELECT file_name, growth_rate_mb_per_day FROM sys_data_files_growth WHERE growth_rate_mb_per_day > 100 ORDER BY growth_rate_mb_per_day DESC LIMIT 10;"
fi

# 检查数据文件碎片
echo "\n检查数据文件碎片:"
$KB_BIN/ksql -U system -d test -c "SELECT file_name, extent_count, round((1 - (min_extent_size::numeric/max_extent_size::numeric))*100, 2) AS fragmentation_percent FROM sys_data_files WHERE extent_count > 100 ORDER BY fragmentation_percent DESC LIMIT 10;"

日志文件监控

生产环境日志文件监控脚本示例

bash
#!/bin/bash
# KingBaseES 日志文件监控脚本

KB_BIN=/opt/Kingbase/ES/V8/Server/bin
LOG_DIR=$($KB_BIN/ksql -U system -d test -t -c "SHOW log_directory;")
LOG_DIR=${LOG_DIR:-/opt/Kingbase/ES/V8/data/sys_log}

# 查看日志文件配置
echo "日志文件配置:"
$KB_BIN/ksql -U system -d test -c "SHOW log_destination; SHOW logging_collector; SHOW log_directory; SHOW log_filename; SHOW log_rotation_size; SHOW log_rotation_age;"

# 查看日志文件大小
echo "\n日志文件大小:"
ls -lh $LOG_DIR/ | grep -E "\.(log|csv)$"

# 检查日志文件增长速度
echo "\n日志文件增长速度(过去24小时):"
find $LOG_DIR -name "*.log" -mtime -1 -exec du -h {} \;

# 查看日志文件数量
echo "\n日志文件数量:"
find $LOG_DIR -name "*.log" | wc -l
find $LOG_DIR -name "*.csv" | wc -l

# 检查错误日志中的严重错误
echo "\n错误日志中的严重错误(过去24小时):"
grep -i -E "(error|fatal|panic)" $LOG_DIR/kingbase.log | tail -20

# 清理旧日志文件(保留最近7天)
echo "\n清理7天前的日志文件:"
find $LOG_DIR -name "*.log" -mtime +7 -delete
find $LOG_DIR -name "*.csv" -mtime +7 -delete

事务与锁监控

事务监控

生产环境事务监控脚本示例

bash
#!/bin/bash
# KingBaseES 事务监控脚本

# 配置
KB_BIN=/opt/Kingbase/ES/V8/Server/bin
LOG_FILE=/opt/Kingbase/logs/transaction_monitor_$(date +%Y%m%d).log
LONG_TRANS_THRESHOLD=5 # 长时间事务阈值(分钟)

# 日志函数
log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> $LOG_FILE
}

log "开始检查事务状态"

# 查看当前事务
echo "当前活跃事务:"
$KB_BIN/ksql -U system -d test -c "SELECT pid, datname, usename, client_addr, state, backend_xid, query_start, now() - query_start AS duration, query FROM sys_stat_activity WHERE backend_xid IS NOT NULL OR state = 'idle in transaction' ORDER BY duration DESC;"

# 查看长时间运行的事务
echo "\n长时间运行的事务(超过 $LONG_TRANS_THRESHOLD 分钟):"
LONG_TRANS=$($KB_BIN/ksql -U system -d test -t -c "SELECT pid, datname, usename, client_addr, state, backend_xid, query_start, now() - query_start AS duration, query FROM sys_stat_activity WHERE state IN ('idle in transaction', 'active') AND query_start < now() - interval '$LONG_TRANS_THRESHOLD minutes' ORDER BY duration DESC;")

echo "$LONG_TRANS"

if [ -n "$LONG_TRANS" ]; then
    log "发现长时间运行的事务:"
    echo "$LONG_TRANS" >> $LOG_FILE
    
    # 显示事务详情
echo "\n事务详情:"
while read -r line; do
    PID=$(echo $line | awk '{print $1}')
    log "长时间事务 PID:$PID"
    # 可以在这里添加自动终止长时间事务的逻辑(谨慎使用)
done <<< "$LONG_TRANS"
fi

# 查看事务统计
echo "\n事务统计:"
$KB_BIN/ksql -U system -d test -c "SELECT datname, xact_commit, xact_rollback, round(xact_rollback::numeric/nullif(xact_commit + xact_rollback, 0)::numeric*100, 2) AS rollback_ratio FROM sys_stat_database WHERE datname IS NOT NULL ORDER BY xact_commit + xact_rollback DESC;"

# V8 R7 新增:查看事务历史统计
echo "\n事务历史统计(V8 R7):"
if $KB_BIN/ksql -U system -d test -c "SELECT * FROM sys_transaction_log LIMIT 1;" > /dev/null 2>&1; then
    $KB_BIN/ksql -U system -d test -c "SELECT date_trunc('hour', commit_time) AS hour, count(*) AS transaction_count, sum(CASE WHEN is_rollback THEN 1 ELSE 0 END) AS rollback_count FROM sys_transaction_log WHERE commit_time > now() - interval '24 hours' GROUP BY hour ORDER BY hour;"
fi

锁监控

生产环境锁监控脚本示例

bash
#!/bin/bash
# KingBaseES 锁监控脚本

# 配置
KB_BIN=/opt/Kingbase/ES/V8/Server/bin
LOG_FILE=/opt/Kingbase/logs/lock_monitor_$(date +%Y%m%d).log
LOCK_WAIT_THRESHOLD=10 # 锁等待阈值(秒)

# 日志函数
log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> $LOG_FILE
}

log "开始检查锁状态"

# 查看锁等待情况
echo "锁等待情况:"
LOCK_WAITS=$($KB_BIN/ksql -U system -d test -t -c "SELECT blocked_locks.pid AS blocked_pid, blocked_activity.usename AS blocked_user, blocking_locks.pid AS blocking_pid, blocking_activity.usename AS blocking_user, blocked_locks.relation::regclass AS table_name, blocked_locks.mode AS blocked_mode, blocking_locks.mode AS blocking_mode, now() - blocked_activity.query_start AS wait_time, blocked_activity.query AS blocked_query, blocking_activity.query AS blocking_query FROM sys_locks blocked_locks JOIN sys_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid JOIN sys_locks blocking_locks ON blocking_locks.locktype = blocked_locks.locktype AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid AND blocking_locks.pid != blocked_locks.pid JOIN sys_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid WHERE NOT blocked_locks.granted;"

if [ -n "$LOCK_WAITS" ]; then
    echo "$LOCK_WAITS"
    log "发现锁等待:"
    echo "$LOCK_WAITS" >> $LOG_FILE
    
    # 检查是否有长时间锁等待
    while read -r line; do
        WAIT_TIME=$(echo $line | awk '{print $10}')
        BLOCKED_PID=$(echo $line | awk '{print $1}')
        BLOCKING_PID=$(echo $line | awk '{print $3}')
        
        # 简单判断等待时间是否超过阈值(实际需要更复杂的时间解析)
        if [[ $WAIT_TIME == *"min"* ]] || [[ $WAIT_TIME == *"hour"* ]]; then
            echo "\n警告:PID $BLOCKED_PID 已等待锁超过 $LOCK_WAIT_THRESHOLD 秒,阻塞PID:$BLOCKING_PID"
            log "警告:PID $BLOCKED_PID 已等待锁超过 $LOCK_WAIT_THRESHOLD 秒,阻塞PID:$BLOCKING_PID"
        fi
    done <<< "$LOCK_WAITS"
else
    echo "无锁等待情况"
    log "无锁等待情况"
fi

# 查看当前锁
echo "\n当前锁持有情况:"
$KB_BIN/ksql -U system -d test -c "SELECT locktype, database, relation::regclass, page, tuple, virtualxid, transactionid, mode, granted, pid, usename, query FROM sys_locks WHERE granted = true ORDER BY relation, mode;"

# 查看死锁信息
echo "\n死锁信息:"
DEADLOCKS=$($KB_BIN/ksql -U system -d test -t -c "SELECT * FROM sys_deadlocks ORDER BY deadlock_time DESC LIMIT 10;")
if [ -n "$DEADLOCKS" ]; then
    echo "$DEADLOCKS"
    log "发现死锁:"
    echo "$DEADLOCKS" >> $LOG_FILE
else
    echo "无死锁记录"
    log "无死锁记录"
fi

# V8 R7 新增:查看死锁历史
echo "\n死锁历史记录(V8 R7):"
if $KB_BIN/ksql -U system -d test -c "SELECT * FROM sys_deadlock_history LIMIT 1;" > /dev/null 2>&1; then
    $KB_BIN/ksql -U system -d test -c "SELECT deadlock_time, deadlock_graph FROM sys_deadlock_history WHERE deadlock_time > now() - interval '7 days' ORDER BY deadlock_time DESC LIMIT 5;"
fi

复制状态监控

主备复制状态

生产环境主备复制监控脚本示例

bash
#!/bin/bash
# KingBaseES 主备复制状态监控脚本

# 配置
KB_BIN=/opt/Kingbase/ES/V8/Server/bin
LOG_FILE=/opt/Kingbase/logs/replication_monitor_$(date +%Y%m%d).log
ALERT_THRESHOLD=30 # 复制延迟告警阈值(秒)

# 日志函数
log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> $LOG_FILE
}

log "开始检查复制状态"

# 检查是否为主节点
IS_PRIMARY=$($KB_BIN/ksql -U system -d test -t -c "SELECT pg_is_in_recovery();")

if [ "$IS_PRIMARY" = "f" ]; then
    log "当前节点为主节点"
    
    # 主节点查看复制状态
    echo "主节点复制状态:"
    REPLICATION_STATUS=$($KB_BIN/ksql -U system -d test -c "SELECT client_addr, application_name, state, sync_state, write_lag, flush_lag, replay_lag FROM sys_stat_replication;")
    echo "$REPLICATION_STATUS"
    log "$REPLICATION_STATUS"
    
    # 检查是否有备库连接
    REPL_COUNT=$($KB_BIN/ksql -U system -d test -t -c "SELECT count(*) FROM sys_stat_replication;")
    if [ $REPL_COUNT -eq 0 ]; then
        log "警告:未发现备库连接"
        echo "警告:未发现备库连接"
    fi
else
    log "当前节点为备节点"
    
    # 备节点查看复制状态
    echo "备节点复制状态:"
    WAL_RECEIVER=$($KB_BIN/ksql -U system -d test -c "SELECT status, receive_start_lsn, received_lsn, last_msg_send_time, last_msg_receipt_time FROM sys_stat_wal_receiver;")
    echo "$WAL_RECEIVER"
    log "$WAL_RECEIVER"
    
    # 查看复制延迟
    echo "\n复制延迟:"
    REPL_DELAY=$($KB_BIN/ksql -U system -d test -t -c "SELECT extract(epoch FROM (now() - pg_last_xact_replay_timestamp())) AS replication_delay_seconds;")
    echo "$REPL_DELAY 秒"
    
    # 检查复制延迟是否超过阈值
    if (( $(echo "$REPL_DELAY > $ALERT_THRESHOLD" | bc -l) )); then
        log "警告:复制延迟为 $REPL_DELAY 秒,已超过阈值 $ALERT_THRESHOLD 秒"
        echo "警告:复制延迟为 $REPL_DELAY 秒,已超过阈值 $ALERT_THRESHOLD 秒"
    fi
    
    # 检查备库同步状态
    echo "\n备库同步状态:"
    SYNC_STATUS=$($KB_BIN/ksql -U system -d test -t -c "SELECT pg_is_in_recovery(), pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn();")
    echo "$SYNC_STATUS"
    log "$SYNC_STATUS"
fi

WAL日志监控

生产环境WAL日志监控脚本示例

bash
#!/bin/bash
# KingBaseES WAL日志监控脚本

KB_BIN=/opt/Kingbase/ES/V8/Server/bin
LOG_FILE=/opt/Kingbase/logs/wal_monitor_$(date +%Y%m%d).log

# 日志函数
log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> $LOG_FILE
}

log "开始检查WAL日志状态"

# 查看WAL日志配置
echo "WAL日志配置:"
WAL_CONFIG=$($KB_BIN/ksql -U system -d test -c "SHOW wal_level; SHOW max_wal_size; SHOW min_wal_size; SHOW checkpoint_timeout; SHOW archive_mode; SHOW archive_command;")
echo "$WAL_CONFIG"
log "$WAL_CONFIG"

# 查看WAL日志使用情况
echo "\nWAL日志使用情况:"
WAL_USAGE=$($KB_BIN/ksql -U system -d test -c "SELECT wal_used_mb, wal_total_mb, round(wal_used_mb::numeric/wal_total_mb::numeric*100, 2) AS usage_percent, checkpoint_count FROM sys_wal_usage;")
echo "$WAL_USAGE"
log "$WAL_USAGE"

# 查看WAL归档状态
echo "\nWAL归档状态:"
ARCHIVE_STATUS=$($KB_BIN/ksql -U system -d test -c "SELECT archived_count, failed_count, last_archived_wal, last_archived_time FROM sys_archive_status;")
echo "$ARCHIVE_STATUS"
log "$ARCHIVE_STATUS"

# 检查归档失败情况
FAILED_COUNT=$($KB_BIN/ksql -U system -d test -t -c "SELECT failed_count FROM sys_archive_status;")
if [ $FAILED_COUNT -gt 0 ]; then
    log "警告:发现 $FAILED_COUNT 个WAL归档失败"
    echo "警告:发现 $FAILED_COUNT 个WAL归档失败"
fi

# 查看WAL文件数量
echo "\nWAL文件数量:"
WAL_DIR=$($KB_BIN/ksql -U system -d test -t -c "SHOW data_directory;")/pg_wal
if [ -d "$WAL_DIR" ]; then
    WAL_FILE_COUNT=$(ls -la $WAL_DIR/000000* | wc -l)
    echo "$WAL_FILE_COUNT 个WAL文件"
    log "WAL文件数量:$WAL_FILE_COUNT"
fi

性能诊断命令

执行计划分析

生产环境执行计划分析脚本示例

bash
#!/bin/bash
# KingBaseES 执行计划分析脚本

# 配置
KB_BIN=/opt/Kingbase/ES/V8/Server/bin
REPORT_DIR=/opt/Kingbase/reports

# 确保报告目录存在
mkdir -p $REPORT_DIR

# 函数:分析SQL执行计划
analyze_sql() {
    local SQL="$1"
    local REPORT_FILE="$REPORT_DIR/explain_$(date +%Y%m%d_%H%M%S).txt"
    
    echo "KingBaseES 执行计划分析报告" > $REPORT_FILE
    echo "生成时间:$(date '+%Y-%m-%d %H:%M:%S')" >> $REPORT_FILE
    echo "========================================" >> $REPORT_FILE
    echo "原始SQL:" >> $REPORT_FILE
    echo "$SQL" >> $REPORT_FILE
    echo "========================================" >> $REPORT_FILE
    
    echo "\n1. 预估执行计划:" >> $REPORT_FILE
    $KB_BIN/ksql -U system -d test -c "EXPLAIN $SQL" >> $REPORT_FILE
    
    echo "\n2. 实际执行计划:" >> $REPORT_FILE
    $KB_BIN/ksql -U system -d test -c "EXPLAIN ANALYZE $SQL" >> $REPORT_FILE
    
    echo "\n3. 详细JSON执行计划:" >> $REPORT_FILE
    $KB_BIN/ksql -U system -d test -c "EXPLAIN (FORMAT JSON, ANALYZE, VERBOSE, BUFFERS, TIMING) $SQL" >> $REPORT_FILE
    
    echo "\n执行计划报告已生成:$REPORT_FILE"
    return $REPORT_FILE
}

# 示例使用
# analyze_sql "SELECT * FROM employees WHERE department_id = 10;"

# 批量分析慢查询TOP 5
echo "正在分析慢查询TOP 5的执行计划..."
SLOW_QUERIES=$($KB_BIN/ksql -U system -d test -t -c "SELECT query FROM sys_stat_statements ORDER BY total_time DESC LIMIT 5;")

while read -r QUERY; do
    if [ -n "$QUERY" ]; then
        echo "\n分析SQL:$(echo $QUERY | head -c 100)..."
        analyze_sql "$QUERY"
    fi
done <<< "$SLOW_QUERIES"

慢查询分析

生产环境慢查询分析脚本示例

bash
#!/bin/bash
# KingBaseES 慢查询分析脚本

# 配置
KB_BIN=/opt/Kingbase/ES/V8/Server/bin
LOG_DIR=$($KB_BIN/ksql -U system -d test -t -c "SHOW log_directory;")
LOG_DIR=${LOG_DIR:-/opt/Kingbase/ES/V8/data/sys_log}
REPORT_DIR=/opt/Kingbase/reports
THRESHOLD=1 # 慢查询阈值(秒)

# 确保报告目录存在
mkdir -p $REPORT_DIR

# 日志函数
log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1"
}

log "开始分析慢查询"

# 查看慢查询配置
log "慢查询配置:"
$KB_BIN/ksql -U system -d test -c "SHOW slow_query_log; SHOW long_query_time; SHOW log_min_duration_statement;"

# 生成慢查询报告
REPORT_FILE="$REPORT_DIR/slow_query_analysis_$(date +%Y%m%d).txt"

cat > $REPORT_FILE << EOF
KingBaseES 慢查询分析报告
生成时间:$(date '+%Y-%m-%d %H:%M:%S')
慢查询阈值:${THRESHOLD}秒
========================================
EOF

# 1. 从pg_stat_statements获取慢查询统计
log "从pg_stat_statements获取慢查询统计..."
echo "\n1. pg_stat_statements慢查询统计(TOP 20):" >> $REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT queryid, calls, total_time, mean_time, rows, shared_blks_hit, shared_blks_read, left(query, 200) AS query_sample FROM sys_stat_statements WHERE mean_time > $THRESHOLD ORDER BY total_time DESC LIMIT 20;" >> $REPORT_FILE

# 2. 分析慢查询日志
log "分析慢查询日志..."
echo "\n2. 慢查询日志分析:" >> $REPORT_FILE
if [ -f "$LOG_DIR/slowquery.log" ]; then
    echo "慢查询日志位置:$LOG_DIR/slowquery.log" >> $REPORT_FILE
    echo "近24小时慢查询数量:$(grep -c -i "duration:" $LOG_DIR/slowquery.log 2>/dev/null || echo 0)" >> $REPORT_FILE
    echo "\n最近10条慢查询:" >> $REPORT_FILE
    tail -n 200 $LOG_DIR/slowquery.log | grep -A 5 "duration:" | head -n 50 >> $REPORT_FILE
else
    echo "慢查询日志未启用或不存在" >> $REPORT_FILE
fi

# 3. 分析慢查询TOP SQL的执行计划
echo "\n3. 慢查询TOP 5执行计划分析:" >> $REPORT_FILE
SLOW_QUERIES=$($KB_BIN/ksql -U system -d test -t -c "SELECT query FROM sys_stat_statements WHERE mean_time > $THRESHOLD ORDER BY total_time DESC LIMIT 5;")

i=1
while read -r QUERY; do
    if [ -n "$QUERY" ]; then
        echo "\n--- TOP $i ---" >> $REPORT_FILE
        echo "SQL:$(echo $QUERY | head -c 150)..." >> $REPORT_FILE
        echo "执行计划:" >> $REPORT_FILE
        $KB_BIN/ksql -U system -d test -c "EXPLAIN $QUERY" >> $REPORT_FILE
        i=$((i+1))
    fi
done <<< "$SLOW_QUERIES"

log "慢查询分析报告已生成:$REPORT_FILE"

性能报告生成

生产环境性能报告脚本示例

bash
#!/bin/bash
# KingBaseES 性能报告生成脚本

# 配置
KB_BIN=/opt/Kingbase/ES/V8/Server/bin
REPORT_DIR=/opt/Kingbase/reports
LOG_FILE=/opt/Kingbase/logs/performance_report_$(date +%Y%m%d).log

# 确保报告目录存在
mkdir -p $REPORT_DIR

# 日志函数
log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> $LOG_FILE
}

log "开始生成性能报告"

# 生成性能报告
REPORT_ID=$($KB_BIN/ksql -U system -d test -t -c "SELECT generate_performance_report();")
log "性能报告生成成功,报告ID:$REPORT_ID"

# 查看性能报告摘要
echo "性能报告摘要:"
$KB_BIN/ksql -U system -d test -c "SELECT * FROM sys_performance_report WHERE report_id = '$REPORT_ID';"

# 保存完整性能报告
FULL_REPORT_FILE="$REPORT_DIR/performance_report_${REPORT_ID}_$(date +%Y%m%d).txt"
log "保存完整性能报告到:$FULL_REPORT_FILE"

# 生成自定义性能报告
cat > $FULL_REPORT_FILE << EOF
KingBaseES 自定义性能报告
报告ID:$REPORT_ID
生成时间:$(date '+%Y-%m-%d %H:%M:%S')
========================================
EOF

# 1. 系统状态
echo "\n1. 系统状态:" >> $FULL_REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT datname, numbackends, xact_commit, xact_rollback, blks_read, blks_hit, tup_returned, tup_fetched, tup_inserted, tup_updated, tup_deleted FROM sys_stat_database WHERE datname IS NOT NULL;" >> $FULL_REPORT_FILE

# 2. 资源使用情况
echo "\n2. 资源使用情况:" >> $FULL_REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT * FROM sys_os_usage;" >> $FULL_REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT * FROM sys_memory_usage;" >> $FULL_REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT * FROM sys_disk_usage;" >> $FULL_REPORT_FILE

# 3. 慢查询统计
echo "\n3. 慢查询统计:" >> $FULL_REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT calls, total_time, mean_time, rows, left(query, 200) AS query_sample FROM sys_stat_statements ORDER BY total_time DESC LIMIT 10;" >> $FULL_REPORT_FILE

# 4. 锁状态
echo "\n4. 锁状态:" >> $FULL_REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT locktype, database, relation::regclass, mode, granted, count(*) FROM sys_locks GROUP BY locktype, database, relation, mode, granted;" >> $FULL_REPORT_FILE

# 5. 连接状态
echo "\n5. 连接状态:" >> $FULL_REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT state, count(*) FROM sys_stat_activity GROUP BY state;" >> $FULL_REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT usename, count(*) FROM sys_stat_activity GROUP BY usename ORDER BY count(*) DESC;" >> $FULL_REPORT_FILE

log "性能报告生成完成"
echo "\n性能报告已生成:$FULL_REPORT_FILE"

故障诊断命令

错误日志查看

生产环境错误日志分析脚本示例

bash
#!/bin/bash
# KingBaseES 错误日志分析脚本

# 配置
KB_BIN=/opt/Kingbase/ES/V8/Server/bin
LOG_DIR=$($KB_BIN/ksql -U system -d test -t -c "SHOW log_directory;")
LOG_DIR=${LOG_DIR:-/opt/Kingbase/ES/V8/data/sys_log}
REPORT_DIR=/opt/Kingbase/reports
ERROR_LOG=$LOG_DIR/kingbase.log

# 确保报告目录存在
mkdir -p $REPORT_DIR

# 日志函数
log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1"
}

log "开始分析错误日志"

# 生成错误日志分析报告
REPORT_FILE="$REPORT_DIR/error_log_analysis_$(date +%Y%m%d).txt"

cat > $REPORT_FILE << EOF
KingBaseES 错误日志分析报告
生成时间:$(date '+%Y-%m-%d %H:%M:%S')
日志文件:$ERROR_LOG
========================================
EOF

# 1. 错误统计
echo "\n1. 错误类型统计(近24小时):" >> $REPORT_FILE
tail -n 10000 $ERROR_LOG | grep -i -E "(error|fatal|panic|warning|notice)" | grep -o -E "(ERROR|FATAL|PANIC|WARNING|NOTICE)" | sort | uniq -c | sort -nr >> $REPORT_FILE

# 2. 最近30条严重错误
echo "\n2. 最近30条严重错误(ERROR/FATAL/PANIC):" >> $REPORT_FILE
tail -n 5000 $ERROR_LOG | grep -i -E "(error|fatal|panic)" | tail -n 30 >> $REPORT_FILE

# 3. 连接相关错误
echo "\n3. 连接相关错误(近24小时):" >> $REPORT_FILE
tail -n 10000 $ERROR_LOG | grep -i -E "(connection|auth|login)" | grep -i error | head -n 20 >> $REPORT_FILE

# 4. 锁相关错误
echo "\n4. 锁相关错误(近24小时):" >> $REPORT_FILE
tail -n 10000 $ERROR_LOG | grep -i -E "(deadlock|lock)" | head -n 20 >> $REPORT_FILE

# 5. 复制相关错误
echo "\n5. 复制相关错误(近24小时):" >> $REPORT_FILE
tail -n 10000 $ERROR_LOG | grep -i -E "(replication|wal|standby)" | grep -i error | head -n 20 >> $REPORT_FILE

# 6. 资源相关错误
echo "\n6. 资源相关错误(近24小时):" >> $REPORT_FILE
tail -n 10000 $ERROR_LOG | grep -i -E "(out of memory|disk full|too many connections)" | head -n 20 >> $REPORT_FILE

log "错误日志分析报告已生成:$REPORT_FILE"

# 检查是否有严重错误需要告警
FATAL_ERRORS=$(tail -n 1000 $ERROR_LOG | grep -i -c "(fatal|panic)")
if [ $FATAL_ERRORS -gt 0 ]; then
    log "警告:发现 $FATAL_ERRORS 个严重错误(FATAL/PANIC)"
    # 发送告警邮件
    # mail -s "KingBaseES Error Alert" dba@example.com < $REPORT_FILE
fi

数据库一致性检查

生产环境数据库一致性检查脚本示例

bash
#!/bin/bash
# KingBaseES 数据库一致性检查脚本

# 配置
KB_BIN=/opt/Kingbase/ES/V8/Server/bin
KB_DATA=/opt/Kingbase/ES/V8/data
LOG_FILE=/opt/Kingbase/logs/consistency_check_$(date +%Y%m%d).log
REPORT_DIR=/opt/Kingbase/reports

# 确保报告目录存在
mkdir -p $REPORT_DIR

# 日志函数
log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> $LOG_FILE
}

log "开始数据库一致性检查"

# 1. 检查数据库一致性(需要停库或进入维护模式)
log "1. 数据库校验和检查:"
# 注意:pg_checksums 命令需要数据库处于关闭状态或进入维护模式
# 如果数据库正在运行,可以跳过此步骤或在维护窗口执行
# $KB_BIN/pg_checksums -c -D $KB_DATA >> $LOG_FILE 2>&1

# 2. 检查表结构一致性
log "\n2. 表结构一致性检查:"
TABLES=$($KB_BIN/ksql -U system -d test -t -c "SELECT schemaname || '.' || tablename FROM sys_tables WHERE schemaname NOT IN ('sys', 'sys_catalog', 'information_schema');")

for TABLE in $TABLES; do
    log "检查表格:$TABLE"
    $KB_BIN/ksql -U system -d test -c "SELECT check_table_consistency('$TABLE');" >> $LOG_FILE 2>&1
    if [ $? -ne 0 ]; then
        log "警告:表格 $TABLE 结构一致性检查失败"
    fi
    
    # 检查索引一致性
    log "检查索引:$TABLE"
    $KB_BIN/ksql -U system -d test -c "SELECT check_index_consistency('$TABLE');" >> $LOG_FILE 2>&1
    if [ $? -ne 0 ]; then
        log "警告:表格 $TABLE 索引一致性检查失败"
    fi
done

# 3. 检查数据库对象有效性
log "\n3. 数据库对象有效性检查:"
INVALID_OBJECTS=$($KB_BIN/ksql -U system -d test -t -c "SELECT schemaname || '.' || proname FROM sys_proc WHERE pronamespace NOT IN (11, 99) AND proisagg = false AND NOT provalid;")

if [ -n "$INVALID_OBJECTS" ]; then
    log "发现无效对象:"
    echo "$INVALID_OBJECTS" >> $LOG_FILE
    # 可以在这里添加修复无效对象的逻辑
    # 例如:$KB_BIN/ksql -U system -d test -c "REINDEX TABLE <table_name>;"
else
    log "未发现无效对象"
fi

log "数据库一致性检查完成"
echo "一致性检查日志:$LOG_FILE"

系统资源诊断

生产环境系统资源诊断脚本示例

bash
#!/bin/bash
# KingBaseES 系统资源诊断脚本

# 配置
KB_BIN=/opt/Kingbase/ES/V8/Server/bin
LOG_FILE=/opt/Kingbase/logs/system_resource_diagnostic_$(date +%Y%m%d).log
REPORT_DIR=/opt/Kingbase/reports

# 确保报告目录存在
mkdir -p $REPORT_DIR

# 日志函数
log() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> $LOG_FILE
}

log "开始系统资源诊断"

# 生成系统资源诊断报告
REPORT_FILE="$REPORT_DIR/system_resource_diagnostic_$(date +%Y%m%d).txt"

cat > $REPORT_FILE << EOF
KingBaseES 系统资源诊断报告
生成时间:$(date '+%Y-%m-%d %H:%M:%S')
========================================
EOF

# 1. 系统资源限制
echo "\n1. 系统资源限制:" >> $REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT * FROM sys_resource_limits;" >> $REPORT_FILE

# 2. 系统内核参数
echo "\n2. 系统内核参数(关键参数):" >> $REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT * FROM sys_kernel_parameters WHERE name IN ('shared_buffers', 'work_mem', 'maintenance_work_mem', 'effective_cache_size', 'max_connections', 'wal_buffers');" >> $REPORT_FILE

# 3. 系统负载和资源使用情况
echo "\n3. 系统负载:" >> $REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT * FROM sys_system_load;" >> $REPORT_FILE

echo "\n4. CPU 使用情况:" >> $REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT * FROM sys_os_usage;" >> $REPORT_FILE

echo "\n5. 内存使用情况:" >> $REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT * FROM sys_memory_usage;" >> $REPORT_FILE

echo "\n6. 磁盘 I/O 情况:" >> $REPORT_FILE
$KB_BIN/ksql -U system -d test -c "SELECT * FROM sys_disk_usage;" >> $REPORT_FILE

# 4. 操作系统级资源检查
echo "\n7. 操作系统级资源检查:" >> $REPORT_FILE

# 磁盘空间
echo "\n7.1 磁盘空间:" >> $REPORT_FILE
df -h >> $REPORT_FILE

# 内存使用
echo "\n7.2 内存使用:" >> $REPORT_FILE
free -h >> $REPORT_FILE

# CPU 负载
echo "\n7.3 CPU 负载:" >> $REPORT_FILE
uptime >> $REPORT_FILE

# 进程数量
echo "\n7.4 进程数量:" >> $REPORT_FILE
ps aux | wc -l >> $REPORT_FILE

echo "\n7.5 数据库相关进程:" >> $REPORT_FILE
ps aux | grep -i kingbase | grep -v grep >> $REPORT_FILE

log "系统资源诊断报告已生成:$REPORT_FILE"
echo "系统资源诊断报告:$REPORT_FILE"

# 检查资源使用告警
MEMORY_USAGE=$($KB_BIN/ksql -U system -d test -t -c "SELECT round(used_memory::numeric/total_memory::numeric*100, 2) FROM sys_memory_usage;")
if (( $(echo "$MEMORY_USAGE > 85" | bc -l) )); then
    log "警告:内存使用率过高,当前为 $MEMORY_USAGE%"
fi

DISK_USAGE=$($KB_BIN/ksql -U system -d test -t -c "SELECT round(used_disk::numeric/total_disk::numeric*100, 2) FROM sys_disk_usage LIMIT 1;")
if (( $(echo "$DISK_USAGE > 80" | bc -l) )); then
    log "警告:磁盘使用率过高,当前为 $DISK_USAGE%"
fi

动态性能视图

KingBaseES提供了丰富的动态性能视图,用于监控和诊断数据库:

常用动态性能视图

视图名称描述
sys_stat_activity当前数据库连接和会话信息
sys_stat_database数据库级别的统计信息
sys_stat_user_tables用户表的访问统计
sys_stat_user_indexes用户索引的访问统计
sys_stat_statementsSQL语句执行统计
sys_locks当前锁信息
sys_lock_waits锁等待信息
sys_stat_replication复制状态信息
sys_stat_wal_receiver备库WAL接收状态
sys_os_usage操作系统资源使用情况
sys_memory_usage内存使用情况
sys_disk_usage磁盘I/O使用情况

动态性能视图查询示例

bash
# 查看当前活跃会话
ksql -U system -d test -c "SELECT datname, usename, client_addr, application_name, state, query FROM sys_stat_activity WHERE state = 'active';"

# 查看Top 10消耗CPU的SQL
ksql -U system -d test -c "SELECT query, calls, total_time, mean_time FROM sys_stat_statements ORDER BY total_time DESC LIMIT 10;"

# 查看表空间使用情况
ksql -U system -d test -c "SELECT spcname, size_mb, used_mb, (used_mb/size_mb)*100 AS usage_percent FROM sys_tablespace_usage;"

监控工具命令

pg_stat_monitor

pg_stat_monitor是KingBaseES提供的增强型统计监控工具,提供更详细的性能统计信息:

bash
# 查看pg_stat_monitor配置
ksql -U system -d test -c "SHOW pg_stat_monitor.pgsm_max;"

# 查看pg_stat_monitor统计信息
ksql -U system -d test -c "SELECT * FROM pg_stat_monitor;"

# 重置pg_stat_monitor统计
ksql -U system -d test -c "SELECT pg_stat_monitor_reset();"

auto_explain

auto_explain是KingBaseES提供的自动解释工具,用于自动记录慢查询的执行计划:

bash
# 配置auto_explain
ksql -U system -d test -c "ALTER SYSTEM SET auto_explain.log_min_duration = '1000ms';"
ksql -U system -d test -c "ALTER SYSTEM SET auto_explain.log_analyze = on;"
ksql -U system -d test -c "ALTER SYSTEM SET auto_explain.log_format = 'json';"
ksql -U system -d test -c "SELECT sys_reload_conf();"

# 查看自动记录的执行计划
cat /opt/Kingbase/ES/V8/data/sys_log/kingbase.log | grep -A 20 "auto_explain"

诊断包生成

生成诊断包

KingBaseES提供了诊断包生成工具,用于收集数据库的各种诊断信息:

bash
# 生成诊断包
kb_diagnose -D /opt/Kingbase/ES/V8/data -o /tmp/kb_diagnose

# 生成包含特定时间范围的诊断包
kb_diagnose -D /opt/Kingbase/ES/V8/data -o /tmp/kb_diagnose --start-time="2023-05-01 10:00:00" --end-time="2023-05-01 11:00:00"

# 生成包含特定组件的诊断包
kb_diagnose -D /opt/Kingbase/ES/V8/data -o /tmp/kb_diagnose --components=database,system,logs

诊断包内容

诊断包包含以下内容:

  • 数据库配置文件
  • 日志文件
  • 系统状态信息
  • 性能统计数据
  • 锁和事务信息
  • 复制状态信息
  • 操作系统信息

版本差异说明

功能V8 R6V8 R7
动态性能视图基础视图集,包括sys_stat_activity、sys_locks、sys_stat_replication等新增30+细粒度性能视图,如sys_transaction_log、sys_deadlock_history、sys_tablespace_usage_history、sys_data_files_growth等
监控工具仅支持pg_stat_statements,基础的SQL执行统计新增pg_stat_monitor,提供更详细的统计维度(如计划缓存、绑定变量、时间维度统计);增强auto_explain,支持更多执行计划选项
事务监控基础的事务状态视图,无历史记录新增sys_transaction_log视图,记录事务历史和详细执行信息;支持长时间事务自动告警配置
锁监控基础的sys_locks和sys_lock_waits视图新增sys_deadlock_history视图,记录死锁历史和死锁图;增强锁等待分析,支持锁等待链可视化
复制监控基础的复制状态视图,无延迟统计新增复制延迟细粒度统计,支持write_lag、flush_lag、replay_lag等延迟指标;支持复制性能统计视图
存储监控基础的表空间和数据文件视图新增表空间使用历史、数据文件增长趋势、数据文件碎片统计等视图
性能报告简单的性能报告,包含基础指标增强的性能报告,支持多维度分析(CPU、内存、I/O、SQL性能等);支持自定义报告模板
诊断工具基础的诊断包生成,包含核心信息增强的kb_diagnose工具,支持更多组件诊断(系统资源、网络、存储);支持时间范围筛选和自定义诊断项
系统资源监控基础的OS资源视图新增sys_system_load、sys_network_usage等视图,提供更全面的系统资源监控
慢查询分析基础的慢查询日志和pg_stat_statements增强的慢查询分析,支持按时间分布、计划类型、绑定变量等维度分析;支持自动执行计划捕获

最佳实践

监控命令使用建议

  1. 定期执行监控命令:建立定期执行监控命令的习惯,及时发现问题
  2. 结合监控工具使用:将命令行监控与图形化监控工具结合使用,提高效率
  3. 保存监控历史:定期保存监控数据,用于趋势分析和问题回溯
  4. 设置告警阈值:根据监控数据设置合理的告警阈值,及时收到告警通知
  5. 熟悉动态性能视图:深入了解动态性能视图的含义和用法,提高诊断效率

诊断命令使用建议

  1. 系统正常时建立基准:在系统正常运行时收集基准数据,便于故障时对比分析
  2. 故障时收集完整信息:故障发生时,收集完整的诊断信息,包括日志、状态和性能数据
  3. 逐步缩小问题范围:使用诊断命令逐步缩小问题范围,定位根本原因
  4. 记录诊断过程:记录诊断过程和结果,便于后续分析和总结
  5. 定期进行健康检查:定期执行健康检查命令,预防潜在问题

常见问题(FAQ)

Q1: 如何查看数据库当前的连接数和连接状态?

A: 使用以下命令查看当前连接数和连接状态:

bash
# 查看总连接数
ksql -U system -d test -c "SELECT count(*) FROM sys_stat_activity;"

# 查看连接状态分布
ksql -U system -d test -c "SELECT state, count(*) FROM sys_stat_activity GROUP BY state;"

# 查看活跃连接详情
ksql -U system -d test -c "SELECT datname, usename, client_addr, application_name, state, query FROM sys_stat_activity WHERE state = 'active' ORDER BY query_start;"

Q2: 如何查找消耗CPU和I/O最多的SQL?

A: 使用以下命令查找消耗资源最多的SQL:

bash
# 查找消耗CPU最多的SQL
ksql -U system -d test -c "SELECT queryid, calls, total_time, mean_time, rows, left(query, 100) AS query_sample FROM sys_stat_statements ORDER BY total_time DESC LIMIT 10;"

# 查找消耗I/O最多的SQL(逻辑读+物理读)
ksql -U system -d test -c "SELECT queryid, calls, shared_blks_hit + shared_blks_read AS total_io, left(query, 100) AS query_sample FROM sys_stat_statements ORDER BY total_io DESC LIMIT 10;"

Q3: 如何查看数据库的锁等待情况和锁等待链?

A: 使用以下命令查看锁等待情况和锁等待链:

bash
# 查看锁等待基本信息
ksql -U system -d test -c "SELECT * FROM sys_lock_waits;"

# 查看详细的锁等待链
ksql -U system -d test -c "SELECT blocked_locks.pid AS blocked_pid, blocked_activity.usename AS blocked_user, blocking_locks.pid AS blocking_pid, blocking_activity.usename AS blocking_user, blocked_locks.relation::regclass AS table_name, blocked_locks.mode AS blocked_mode, blocking_locks.mode AS blocking_mode, now() - blocked_activity.query_start AS wait_time, blocked_activity.query AS blocked_query, blocking_activity.query AS blocking_query FROM sys_locks blocked_locks JOIN sys_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid JOIN sys_locks blocking_locks ON blocking_locks.locktype = blocked_locks.locktype AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid AND blocking_locks.pid != blocked_locks.pid JOIN sys_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid WHERE NOT blocked_locks.granted;"

Q4: 如何生成数据库诊断包并分析诊断结果?

A: 使用kb_diagnose工具生成诊断包,并分析诊断结果:

bash
# 生成完整诊断包
kb_diagnose -D /opt/Kingbase/ES/V8/data -o /tmp/kb_diagnose

# 生成指定时间范围的诊断包
kb_diagnose -D /opt/Kingbase/ES/V8/data -o /tmp/kb_diagnose --start-time="2023-05-01 10:00:00" --end-time="2023-05-01 11:00:00"

# 生成指定组件的诊断包
kb_diagnose -D /opt/Kingbase/ES/V8/data -o /tmp/kb_diagnose --components=database,system,logs

# 分析诊断包(解压后查看)
tar -zxvf /tmp/kb_diagnose/kb_diagnose_*.tar.gz -C /tmp/kb_diagnose
ls -la /tmp/kb_diagnose/

Q5: 如何查看主备复制延迟和复制状态?

A: 在主库和备库上执行以下命令查看复制状态:

bash
# 主库查看复制状态
ksql -U system -d test -c "SELECT client_addr, application_name, state, sync_state, write_lag, flush_lag, replay_lag FROM sys_stat_replication;"

# 备库查看复制状态
ksql -U system -d test -c "SELECT status, receive_start_lsn, received_lsn, last_msg_send_time, last_msg_receipt_time FROM sys_stat_wal_receiver;"

# 备库查看复制延迟
ksql -U system -d test -c "SELECT extract(epoch FROM (now() - pg_last_xact_replay_timestamp())) AS replication_delay_seconds;"

Q6: 如何分析慢查询并优化?

A: 可以通过以下步骤分析和优化慢查询:

  1. 查看慢查询配置:SHOW slow_query_log; SHOW long_query_time;
  2. 查看慢查询日志:tail -n 100 /opt/Kingbase/ES/V8/data/sys_log/slowquery.log
  3. 使用pg_stat_statements分析:SELECT * FROM sys_stat_statements ORDER BY total_time DESC LIMIT 10;
  4. 查看执行计划:EXPLAIN ANALYZE <slow_query>;
  5. 优化建议:添加索引、重写SQL、调整参数等

Q7: 如何检查数据库一致性和修复数据损坏?

A: 使用以下命令检查和修复数据库一致性:

bash
# 检查数据库校验和(需要停库)
pg_checksums -c -D /opt/Kingbase/ES/V8/data

# 检查表结构一致性
ksql -U system -d test -c "SELECT check_table_consistency('public');"

# 检查索引一致性
ksql -U system -d test -c "SELECT check_index_consistency('public');"

# 修复损坏的表(如果需要)
ksql -U system -d test -c "REINDEX TABLE public.employees;"
ksql -U system -d test -c "VACUUM ANALYZE public.employees;"

Q8: 如何查看表空间使用情况和增长趋势?

A: 使用以下命令查看表空间使用情况和增长趋势:

bash
# 查看表空间当前使用情况
ksql -U system -d test -c "SELECT spcname, size_mb, used_mb, free_mb, round(used_mb::numeric/size_mb::numeric*100, 2) AS usage_percent FROM sys_tablespace_usage;"

# 查看表空间下的大表
ksql -U system -d test -c "SELECT schemaname, tablename, size_mb FROM sys_tables_size ORDER BY size_mb DESC LIMIT 10;"

# V8 R7: 查看表空间使用历史
ksql -U system -d test -c "SELECT spcname, date_trunc('day', sample_time) AS day, avg(used_mb) AS avg_used_mb FROM sys_tablespace_usage_history WHERE sample_time > now() - interval '7 days' GROUP BY spcname, day ORDER BY spcname, day;"

Q9: 如何监控和处理长时间运行的事务?

A: 使用以下命令监控和处理长时间运行的事务:

bash
# 查看长时间运行的事务(超过5分钟)
ksql -U system -d test -c "SELECT pid, datname, usename, client_addr, state, backend_xid, query_start, now() - query_start AS duration, query FROM sys_stat_activity WHERE state IN ('idle in transaction', 'active') AND query_start < now() - interval '5 minutes' ORDER BY duration DESC;"

# 终止长时间运行的事务(谨慎使用)
ksql -U system -d test -c "SELECT pg_terminate_backend(<pid>);"

# V8 R7: 查看事务历史记录
ksql -U system -d test -c "SELECT * FROM sys_transaction_log WHERE start_time > now() - interval '1 hour' ORDER BY duration DESC LIMIT 10;"

Q10: 如何查看和分析死锁?

A: 使用以下命令查看和分析死锁:

bash
# 查看当前死锁信息
ksql -U system -d test -c "SELECT * FROM sys_deadlocks;"

# V8 R7: 查看死锁历史记录和死锁图
ksql -U system -d test -c "SELECT deadlock_time, deadlock_graph FROM sys_deadlock_history WHERE deadlock_time > now() - interval '7 days' ORDER BY deadlock_time DESC;"

# 分析死锁原因:查看死锁日志
cat /opt/Kingbase/ES/V8/data/sys_log/kingbase.log | grep -A 20 "DEADLOCK DETECTED"

Q11: 如何监控数据库的内存和CPU使用情况?

A: 使用以下命令监控数据库的内存和CPU使用情况:

bash
# 查看CPU使用情况
ksql -U system -d test -c "SELECT cpu_usage_percent, user_cpu_usage_percent, system_cpu_usage_percent FROM sys_os_usage;"

# 查看内存使用情况
ksql -U system -d test -c "SELECT total_memory, used_memory, free_memory, round(used_memory::numeric/total_memory::numeric*100, 2) AS usage_percent FROM sys_memory_usage;"

# 查看数据库进程的内存使用
ps aux --sort=-rss | grep -i kingbase | head -n 10

Q12: 如何查看和分析WAL日志使用情况?

A: 使用以下命令查看和分析WAL日志使用情况:

bash
# 查看WAL配置
ksql -U system -d test -c "SHOW wal_level; SHOW max_wal_size; SHOW min_wal_size;"

# 查看WAL使用情况
ksql -U system -d test -c "SELECT wal_used_mb, wal_total_mb, round(wal_used_mb::numeric/wal_total_mb::numeric*100, 2) AS usage_percent, checkpoint_count FROM sys_wal_usage;"

# 查看WAL归档状态
ksql -U system -d test -c "SELECT archived_count, failed_count, last_archived_wal, last_archived_time FROM sys_archive_status;"

# 查看WAL文件数量
ls -la /opt/Kingbase/ES/V8/data/pg_wal/000000* | wc -l

Q13: 如何生成和分析数据库性能报告?

A: 使用以下命令生成和分析数据库性能报告:

bash
# 生成性能报告
ksql -U system -d test -c "SELECT generate_performance_report();"

# 查看性能报告摘要
ksql -U system -d test -c "SELECT * FROM sys_performance_report;"

# 查看详细性能报告(如果支持)
ksql -U system -d test -c "SELECT get_performance_report_detail(<report_id>);"

Q14: 如何监控数据库的连接池使用情况?

A: 使用以下命令监控数据库的连接池使用情况:

bash
# 查看连接池基本信息(如果使用连接池)
ksql -U system -d test -c "SELECT datname, usename, client_addr, application_name, state, query FROM sys_stat_activity WHERE application_name LIKE '%pool%' ORDER BY state;"

# 查看连接池相关参数(如果配置了)
ksql -U system -d test -c "SHOW max_connections; SHOW superuser_reserved_connections;"

Q15: 如何使用pg_stat_monitor进行高级性能监控?

A: 使用以下命令配置和使用pg_stat_monitor:

bash
# 查看pg_stat_monitor配置
ksql -U system -d test -c "SHOW pg_stat_monitor.pgsm_max; SHOW pg_stat_monitor.pgsm_track_planning;"

# 查看pg_stat_monitor统计信息
ksql -U system -d test -c "SELECT queryid, userid, calls, total_time, mean_time, rows, left(query, 100) AS query_sample FROM pg_stat_monitor ORDER BY total_time DESC LIMIT 10;"

# 按时间段查看统计信息
ksql -U system -d test -c "SELECT bucket, count(*) FROM pg_stat_monitor GROUP BY bucket ORDER BY bucket;"

# 重置pg_stat_monitor统计
ksql -U system -d test -c "SELECT pg_stat_monitor_reset();"

总结

KingBaseES提供了丰富的监控与诊断命令,帮助DBA实时了解数据库运行情况,及时发现并解决问题。本文介绍了KingBaseES的系统状态监控、性能监控、存储监控、事务与锁监控、复制状态监控、性能诊断、故障诊断等命令。在实际使用中,DBA应根据具体情况选择合适的命令,结合监控工具和动态性能视图,建立完善的监控和诊断体系,确保数据库的稳定运行。