生产运维规范
一、权限管理与账号安全
1.1 用户创建与权限分配
sql
-- 最小权限原则:只给业务所需的最小权限
-- 应用账号:只读写业务库
CREATE USER 'app_read'@'192.168.1.%' IDENTIFIED BY 'password';
GRANT SELECT ON mydb.* TO 'app_read'@'192.168.1.%';
CREATE USER 'app_write'@'192.168.1.%' IDENTIFIED BY 'password';
GRANT SELECT, INSERT, UPDATE, DELETE ON mydb.* TO 'app_write'@'192.168.1.%';
-- 运维账号:管理权限
CREATE USER 'dba'@'192.168.1.%' IDENTIFIED BY 'strong_password';
GRANT ALL PRIVILEGES ON *.* TO 'dba'@'192.168.1.%' WITH GRANT OPTION;
-- 备份账号:备份所需的最小权限
CREATE USER 'backup'@'localhost' IDENTIFIED BY 'password';
GRANT SELECT, RELOAD, LOCK TABLES, REPLICATION CLIENT, SHOW VIEW, EVENT, TRIGGER
ON *.* TO 'backup'@'localhost';
-- 监控账号:只读
CREATE USER 'monitor'@'%' IDENTIFIED BY 'password';
GRANT PROCESS, REPLICATION CLIENT, SHOW DATABASES ON *.* TO 'monitor'@'%';
GRANT SELECT ON performance_schema.* TO 'monitor'@'%';
1.2 账号安全规范
sql
-- 1. 删除默认用户
DROP USER IF EXISTS ''@'localhost';
DROP USER IF EXISTS 'root'@'%'; -- root 只允许本地登录
-- 2. 限制登录主机
-- 不要使用 '%'(任意主机),指定具体 IP 段
CREATE USER 'app'@'192.168.1.%' IDENTIFIED BY 'password';
-- 3. 密码策略
SHOW VARIABLES LIKE 'validate_password%';
SET GLOBAL validate_password_policy = STRONG;
SET GLOBAL validate_password_length = 12;
-- 4. 定期修改密码
ALTER USER 'app'@'192.168.1.%' IDENTIFIED BY 'new_password';
-- 5. 查看用户权限
SHOW GRANTS FOR 'app'@'192.168.1.%';
1.3 SQL 审计
sql
-- MySQL 企业版审计插件
-- 或使用 general_log(生产慎用,影响性能)
-- 开启 general_log(临时排查用)
SET GLOBAL general_log = ON;
SET GLOBAL general_log_file = '/var/log/mysql/general.log';
-- 查询审计日志
SELECT * FROM mysql.general_log WHERE command_type = 'Query'
AND argument NOT LIKE '%information_schema%'
ORDER BY event_time DESC LIMIT 100;
-- 使用 performance_schema 审计(MySQL 5.7+)
-- 查看执行的 SQL
SELECT * FROM performance_schema.events_statements_summary_by_digest
ORDER BY SUM_TIMER_WAIT DESC
LIMIT 20;
-- 第三方审计:使用 pt-query-digest 分析慢查询
pt-query-digest /var/log/mysql/slow.log
二、监控指标
2.1 核心监控指标
sql
-- 1. 连接数
SELECT
VARIABLE_VALUE AS current_connections
FROM performance_schema.global_status
WHERE VARIABLE_NAME = 'Threads_connected';
-- 2. QPS(每秒查询数)
SHOW GLOBAL STATUS LIKE 'Questions';
-- 两次采样相除 / 时间间隔
-- 3. TPS(每秒事务数)
SHOW GLOBAL STATUS LIKE 'Com_commit';
SHOW GLOBAL STATUS LIKE 'Com_rollback';
-- 4. 慢查询
SHOW GLOBAL STATUS LIKE 'Slow_queries';
-- 5. 锁等待
SHOW STATUS LIKE 'Innodb_row_lock_current_waits';
SHOW STATUS LIKE 'Innodb_row_lock_time_avg';
-- 6. InnoDB 状态
SHOW ENGINE INNODB STATUS\G
2.2 监控 SQL 汇总
sql
-- QPS 监控(每秒查询次数)
SELECT
ROUND(SUM(IF(VARIABLE_NAME REGEXP 'Com_(select|insert|update|delete)',
VARIABLE_VALUE, 0)) /
(SELECT VARIABLE_VALUE FROM performance_schema.global_status
WHERE VARIABLE_NAME = 'Uptime'), 2) AS avg_qps
FROM performance_schema.global_status
WHERE VARIABLE_NAME REGEXP 'Com_(select|insert|update|delete)';
-- 活跃连接数
SELECT COUNT(*) AS active_connections
FROM information_schema.PROCESSLIST
WHERE command != 'Sleep';
-- 锁等待
SELECT
waiting_pid AS waiting_thread_id,
waiting_query AS waiting_query,
blocking_pid AS blocking_thread_id,
blocking_query AS blocking_query
FROM sys.innodb_lock_waits;
-- 表大小排名
SELECT
table_schema AS database_name,
table_name,
ROUND(data_length / 1024 / 1024, 2) AS data_size_mb,
ROUND(index_length / 1024 / 1024, 2) AS index_size_mb,
ROUND((data_length + index_length) / 1024 / 1024, 2) AS total_size_mb,
table_rows
FROM information_schema.tables
WHERE table_schema NOT IN ('mysql', 'performance_schema', 'sys', 'information_schema')
ORDER BY total_size_mb DESC
LIMIT 20;
-- Buffer Pool 使用率
SELECT
(1 - (SELECT VARIABLE_VALUE FROM performance_schema.global_status
WHERE VARIABLE_NAME = 'Innodb_buffer_pool_pages_free')
/
(SELECT VARIABLE_VALUE FROM performance_schema.global_status
WHERE VARIABLE_NAME = 'Innodb_buffer_pool_pages_total')) * 100 AS buffer_pool_usage_pct;
-- 复制延迟
SHOW SLAVE STATUS\G
-- Seconds_Behind_Master
-- 监控大事务
SELECT
trx_id,
TIME_TO_SEC(TIMEDIFF(NOW(), trx_started)) AS trx_running_seconds,
trx_state,
trx_isolation_level,
trx_rows_locked,
trx_rows_modified
FROM information_schema.INNODB_TRX
WHERE TIME_TO_SEC(TIMEDIFF(NOW(), trx_started)) > 60;
2.3 告警阈值
| 指标 | 告警阈值 | 严重级别 |
|---|---|---|
| 连接数使用率 | > 80% | Warning |
| 连接数使用率 | > 90% | Critical |
| 慢查询 | > 100/分钟 | Warning |
| Buffer Pool 命中率 | < 95% | Warning |
| 复制延迟 | > 30秒 | Warning |
| 复制延迟 | > 300秒 | Critical |
| 锁等待 | > 10个并发 | Warning |
| 磁盘使用率 | > 80% | Warning |
| 磁盘使用率 | > 90% | Critical |
三、大事务处理
3.1 大事务的危害
sql
-- 1. 占用大量连接资源
-- 2. 锁定大量行,阻塞其他事务
-- 3. 产生大量 undo log,影响 purge 线程
-- 4. 主从复制延迟(从库要执行同样的大事务)
3.2 大事务的识别
sql
-- 查找运行超过 60 秒的事务
SELECT
trx_id,
trx_mysql_thread_id AS process_id,
trx_started,
TIME_TO_SEC(TIMEDIFF(NOW(), trx_started)) AS trx_running_seconds,
trx_rows_locked,
trx_rows_modified,
(SELECT PROCESSLIST_INFO FROM information_schema.PROCESSLIST
WHERE ID = trx_mysql_thread_id) AS current_query
FROM information_schema.INNODB_TRX
WHERE TIME_TO_SEC(TIMEDIFF(NOW(), trx_started)) > 60
ORDER BY trx_running_seconds DESC;
-- 查找导致大事务的 SQL
SELECT
thd.PROCESSLIST_ID,
thd.PROCESSLIST_INFO AS sql_text,
trx.trx_started,
trx.trx_rows_modified
FROM performance_schema.threads thd
JOIN information_schema.INNODB_TRX trx
ON thd.PROCESSLIST_ID = trx.trx_mysql_thread_id
ORDER BY trx.trx_rows_modified DESC;
3.3 大事务的处理
sql
-- 处理方案
-- 1. 分批处理
-- 坏:一次修改所有数据
UPDATE orders SET status = 2 WHERE create_time < '2023-01-01';
-- 好:分批处理
DO $$
DECLARE
affected_rows INT;
BEGIN
LOOP
UPDATE orders SET status = 2
WHERE create_time < '2023-01-01'
LIMIT 1000;
GET DIAGNOSTICS affected_rows = ROW_COUNT;
IF affected_rows = 0 THEN
EXIT;
END IF;
COMMIT; -- 每批提交一次
END LOOP;
END $$;
-- 2. 设置事务超时
SET SESSION innodb_lock_wait_timeout = 5; -- 5秒超时
四、线上风险操作规避
4.1 DDL 操作规范
sql
-- 坏:大表直接 ALTER TABLE(会锁表,阻塞读写)
ALTER TABLE orders ADD COLUMN remark VARCHAR(500);
-- 好:使用 pt-online-schema-change(在线 DDL)
pt-online-schema-change --alter "ADD COLUMN remark VARCHAR(500)" \
D=mydb,t=orders \
--execute
-- 或使用 MySQL 8.0 的 INSTANT DDL(加列不重建表)
ALTER TABLE orders ADD COLUMN remark VARCHAR(500), ALGORITHM=INSTANT;
4.2 DELETE 操作规范
sql
-- 坏:一次性删除大量数据
DELETE FROM logs WHERE create_time < '2023-01-01'; -- 可能删几百万行
-- 好:分批删除
DELIMITER $$
CREATE PROCEDURE sp_batch_delete_logs()
BEGIN
DECLARE v_affected INT DEFAULT 1;
WHILE v_affected > 0 DO
DELETE FROM logs
WHERE create_time < '2023-01-01'
LIMIT 1000;
SET v_affected = ROW_COUNT();
COMMIT;
-- 间歇,减少主库压力
DO SLEEP(1);
END WHILE;
END$$
DELIMITER ;
4.3 索引操作规范
sql
-- 坏:无索引时的全表扫描查询
SELECT * FROM big_table WHERE unindexed_col = 'value';
-- 会锁大量行,影响并发
-- 好:创建索引操作注意
-- 1. 低峰期执行
-- 2. 使用 pt-online-schema-change 或 INPLACE 算法
-- 3. 监控主从延迟
-- 检查索引使用率
SELECT
OBJECT_SCHEMA,
OBJECT_NAME,
INDEX_NAME,
COUNT_STAR AS accesses,
COUNT_READ AS reads
FROM performance_schema.table_io_waits_summary_by_index_usage
WHERE INDEX_NAME IS NOT NULL
ORDER BY accesses ASC
LIMIT 20; -- 使用少的索引可考虑删除
4.4 线上操作检查清单
DDL 操作前:
[ ] 在测试环境验证
[ ] 选择业务低峰期执行
[ ] 评估影响(预估执行时间、磁盘空间)
[ ] 准备回滚方案
[ ] 通知相关团队
DML 操作前:
[ ] 确认 WHERE 条件正确(先 SELECT 确认影响行数)
[ ] 开启事务,不自动提交
[ ] 先备份受影响数据
[ ] 分批执行,避免大事务
数据修复前:
[ ] 备份原始数据
[ ] 在测试环境验证修复 SQL
[ ] 准备回滚脚本
[ ] 通知业务方停服(如果需要)
紧急操作:
[ ] 评估影响范围
[ ] 通知相关方
[ ] 执行后验证
[ ] 记录故障时间线和原因
五、实战案例
案例:完整的运维脚本
bash
#!/bin/bash
# MySQL 运维健康检查脚本
MYSQL_USER="monitor"
MYSQL_PASS="password"
MYSQL_HOST="127.0.0.1"
MYSQL_PORT="3306"
ALERT_EMAIL="dba@company.com"
# 检查函数
check_mysql_alive() {
mysql -u$MYSQL_USER -p$MYSQL_PASS -h$MYSQL_HOST -P$MYSQL_PORT -e "SELECT 1" > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "CRITICAL: MySQL is not responding" | mail -s "MySQL DOWN" $ALERT_EMAIL
return 1
fi
}
check_connection_count() {
CONN_COUNT=$(mysql -u$MYSQL_USER -p$MYSQL_PASS -h$MYSQL_HOST -P$MYSQL_PORT \
-e "SELECT COUNT(*) FROM information_schema.PROCESSLIST" 2>/dev/null | tail -1)
MAX_CONN=$(mysql -u$MYSQL_USER -p$MYSQL_PASS -h$MYSQL_HOST -P$MYSQL_PORT \
-e "SHOW VARIABLES LIKE 'max_connections'" 2>/dev/null | awk '{print $2}')
USAGE=$(echo "scale=2; $CONN_COUNT / $MAX_CONN * 100" | bc)
if [ $(echo "$USAGE > 80" | bc) -eq 1 ]; then
echo "WARNING: Connection usage at $USAGE%" | mail -s "MySQL Connection Alert" $ALERT_EMAIL
fi
}
check_replication() {
SLAVE_STATUS=$(mysql -u$MYSQL_USER -p$MYSQL_PASS -h$MYSQL_HOST -P$MYSQL_PORT \
-e "SHOW SLAVE STATUS\G" 2>/dev/null)
IO_RUNNING=$(echo "$SLAVE_STATUS" | grep "Slave_IO_Running:" | awk '{print $2}')
SQL_RUNNING=$(echo "$SLAVE_STATUS" | grep "Slave_SQL_Running:" | awk '{print $2}')
SECONDS_BEHIND=$(echo "$SLAVE_STATUS" | grep "Seconds_Behind_Master:" | awk '{print $2}')
if [ "$IO_RUNNING" != "Yes" ] || [ "$SQL_RUNNING" != "Yes" ]; then
echo "CRITICAL: Replication is not running" | mail -s "MySQL Replication Down" $ALERT_EMAIL
fi
if [ "$SECONDS_BEHIND" -gt 300 ]; then
echo "CRITICAL: Replication delay $SECONDS_BEHIND seconds" | mail -s "MySQL Replication Delay" $ALERT_EMAIL
fi
}
check_disk_usage() {
DISK_USAGE=$(df -h /var/lib/mysql | tail -1 | awk '{print $5}' | sed 's/%//')
if [ "$DISK_USAGE" -gt 80 ]; then
echo "WARNING: Disk usage at $DISK_USAGE%" | mail -s "MySQL Disk Alert" $ALERT_EMAIL
fi
}
# 执行检查
check_mysql_alive || exit 1
check_connection_count
check_replication
check_disk_usage
echo "Health check completed at $(date)"
总结
| 领域 | 核心规范 |
|---|---|
| 权限管理 | 最小权限、限制登录主机、强密码策略 |
| 监控指标 | 连接数、QPS/TPS、慢查询、锁等待、复制延迟 |
| 大事务处理 | 分批处理、设置超时、及时提交 |
| DDL 规范 | pt-online-schema-change、低峰期、备份 |
| DML 规范 | 先 SELECT 确认、分批处理、备份原始数据 |
| 紧急操作 | 评估影响、通知相关方、记录故障时间线 |