← 返回 MYSQL 列表

生产运维规范

生产运维规范

一、权限管理与账号安全

1.1 用户创建与权限分配

sql
-- 最小权限原则:只给业务所需的最小权限

-- 应用账号:只读写业务库
CREATE USER 'app_read'@'192.168.1.%' IDENTIFIED BY 'password';
GRANT SELECT ON mydb.* TO 'app_read'@'192.168.1.%';

CREATE USER 'app_write'@'192.168.1.%' IDENTIFIED BY 'password';
GRANT SELECT, INSERT, UPDATE, DELETE ON mydb.* TO 'app_write'@'192.168.1.%';

-- 运维账号:管理权限
CREATE USER 'dba'@'192.168.1.%' IDENTIFIED BY 'strong_password';
GRANT ALL PRIVILEGES ON *.* TO 'dba'@'192.168.1.%' WITH GRANT OPTION;

-- 备份账号:备份所需的最小权限
CREATE USER 'backup'@'localhost' IDENTIFIED BY 'password';
GRANT SELECT, RELOAD, LOCK TABLES, REPLICATION CLIENT, SHOW VIEW, EVENT, TRIGGER
ON *.* TO 'backup'@'localhost';

-- 监控账号:只读
CREATE USER 'monitor'@'%' IDENTIFIED BY 'password';
GRANT PROCESS, REPLICATION CLIENT, SHOW DATABASES ON *.* TO 'monitor'@'%';
GRANT SELECT ON performance_schema.* TO 'monitor'@'%';

1.2 账号安全规范

sql
-- 1. 删除默认用户
DROP USER IF EXISTS ''@'localhost';
DROP USER IF EXISTS 'root'@'%';  -- root 只允许本地登录

-- 2. 限制登录主机
-- 不要使用 '%'(任意主机),指定具体 IP 段
CREATE USER 'app'@'192.168.1.%' IDENTIFIED BY 'password';

-- 3. 密码策略
SHOW VARIABLES LIKE 'validate_password%';
SET GLOBAL validate_password_policy = STRONG;
SET GLOBAL validate_password_length = 12;

-- 4. 定期修改密码
ALTER USER 'app'@'192.168.1.%' IDENTIFIED BY 'new_password';

-- 5. 查看用户权限
SHOW GRANTS FOR 'app'@'192.168.1.%';

1.3 SQL 审计

sql
-- MySQL 企业版审计插件
-- 或使用 general_log(生产慎用,影响性能)

-- 开启 general_log(临时排查用)
SET GLOBAL general_log = ON;
SET GLOBAL general_log_file = '/var/log/mysql/general.log';

-- 查询审计日志
SELECT * FROM mysql.general_log WHERE command_type = 'Query'
  AND argument NOT LIKE '%information_schema%'
ORDER BY event_time DESC LIMIT 100;

-- 使用 performance_schema 审计(MySQL 5.7+)
-- 查看执行的 SQL
SELECT * FROM performance_schema.events_statements_summary_by_digest
ORDER BY SUM_TIMER_WAIT DESC
LIMIT 20;

-- 第三方审计:使用 pt-query-digest 分析慢查询
pt-query-digest /var/log/mysql/slow.log

二、监控指标

2.1 核心监控指标

sql
-- 1. 连接数
SELECT
    VARIABLE_VALUE AS current_connections
FROM performance_schema.global_status
WHERE VARIABLE_NAME = 'Threads_connected';

-- 2. QPS(每秒查询数)
SHOW GLOBAL STATUS LIKE 'Questions';
-- 两次采样相除 / 时间间隔

-- 3. TPS(每秒事务数)
SHOW GLOBAL STATUS LIKE 'Com_commit';
SHOW GLOBAL STATUS LIKE 'Com_rollback';

-- 4. 慢查询
SHOW GLOBAL STATUS LIKE 'Slow_queries';

-- 5. 锁等待
SHOW STATUS LIKE 'Innodb_row_lock_current_waits';
SHOW STATUS LIKE 'Innodb_row_lock_time_avg';

-- 6. InnoDB 状态
SHOW ENGINE INNODB STATUS\G

2.2 监控 SQL 汇总

sql
-- QPS 监控(每秒查询次数)
SELECT
    ROUND(SUM(IF(VARIABLE_NAME REGEXP 'Com_(select|insert|update|delete)',
                 VARIABLE_VALUE, 0)) /
          (SELECT VARIABLE_VALUE FROM performance_schema.global_status
           WHERE VARIABLE_NAME = 'Uptime'), 2) AS avg_qps
FROM performance_schema.global_status
WHERE VARIABLE_NAME REGEXP 'Com_(select|insert|update|delete)';

-- 活跃连接数
SELECT COUNT(*) AS active_connections
FROM information_schema.PROCESSLIST
WHERE command != 'Sleep';

-- 锁等待
SELECT
    waiting_pid AS waiting_thread_id,
    waiting_query AS waiting_query,
    blocking_pid AS blocking_thread_id,
    blocking_query AS blocking_query
FROM sys.innodb_lock_waits;

-- 表大小排名
SELECT
    table_schema AS database_name,
    table_name,
    ROUND(data_length / 1024 / 1024, 2) AS data_size_mb,
    ROUND(index_length / 1024 / 1024, 2) AS index_size_mb,
    ROUND((data_length + index_length) / 1024 / 1024, 2) AS total_size_mb,
    table_rows
FROM information_schema.tables
WHERE table_schema NOT IN ('mysql', 'performance_schema', 'sys', 'information_schema')
ORDER BY total_size_mb DESC
LIMIT 20;

-- Buffer Pool 使用率
SELECT
    (1 - (SELECT VARIABLE_VALUE FROM performance_schema.global_status
          WHERE VARIABLE_NAME = 'Innodb_buffer_pool_pages_free')
    /
    (SELECT VARIABLE_VALUE FROM performance_schema.global_status
     WHERE VARIABLE_NAME = 'Innodb_buffer_pool_pages_total')) * 100 AS buffer_pool_usage_pct;

-- 复制延迟
SHOW SLAVE STATUS\G
-- Seconds_Behind_Master

-- 监控大事务
SELECT
    trx_id,
    TIME_TO_SEC(TIMEDIFF(NOW(), trx_started)) AS trx_running_seconds,
    trx_state,
    trx_isolation_level,
    trx_rows_locked,
    trx_rows_modified
FROM information_schema.INNODB_TRX
WHERE TIME_TO_SEC(TIMEDIFF(NOW(), trx_started)) > 60;

2.3 告警阈值

指标 告警阈值 严重级别
连接数使用率 > 80% Warning
连接数使用率 > 90% Critical
慢查询 > 100/分钟 Warning
Buffer Pool 命中率 < 95% Warning
复制延迟 > 30秒 Warning
复制延迟 > 300秒 Critical
锁等待 > 10个并发 Warning
磁盘使用率 > 80% Warning
磁盘使用率 > 90% Critical

三、大事务处理

3.1 大事务的危害

sql
-- 1. 占用大量连接资源
-- 2. 锁定大量行,阻塞其他事务
-- 3. 产生大量 undo log,影响 purge 线程
-- 4. 主从复制延迟(从库要执行同样的大事务)

3.2 大事务的识别

sql
-- 查找运行超过 60 秒的事务
SELECT
    trx_id,
    trx_mysql_thread_id AS process_id,
    trx_started,
    TIME_TO_SEC(TIMEDIFF(NOW(), trx_started)) AS trx_running_seconds,
    trx_rows_locked,
    trx_rows_modified,
    (SELECT PROCESSLIST_INFO FROM information_schema.PROCESSLIST
     WHERE ID = trx_mysql_thread_id) AS current_query
FROM information_schema.INNODB_TRX
WHERE TIME_TO_SEC(TIMEDIFF(NOW(), trx_started)) > 60
ORDER BY trx_running_seconds DESC;

-- 查找导致大事务的 SQL
SELECT
    thd.PROCESSLIST_ID,
    thd.PROCESSLIST_INFO AS sql_text,
    trx.trx_started,
    trx.trx_rows_modified
FROM performance_schema.threads thd
JOIN information_schema.INNODB_TRX trx
  ON thd.PROCESSLIST_ID = trx.trx_mysql_thread_id
ORDER BY trx.trx_rows_modified DESC;

3.3 大事务的处理

sql
-- 处理方案

-- 1. 分批处理
-- 坏:一次修改所有数据
UPDATE orders SET status = 2 WHERE create_time < '2023-01-01';

-- 好:分批处理
DO $$
DECLARE
    affected_rows INT;
BEGIN
    LOOP
        UPDATE orders SET status = 2
        WHERE create_time < '2023-01-01'
        LIMIT 1000;

        GET DIAGNOSTICS affected_rows = ROW_COUNT;

        IF affected_rows = 0 THEN
            EXIT;
        END IF;

        COMMIT;  -- 每批提交一次
    END LOOP;
END $$;

-- 2. 设置事务超时
SET SESSION innodb_lock_wait_timeout = 5;  -- 5秒超时

四、线上风险操作规避

4.1 DDL 操作规范

sql
-- 坏:大表直接 ALTER TABLE(会锁表,阻塞读写)
ALTER TABLE orders ADD COLUMN remark VARCHAR(500);

-- 好:使用 pt-online-schema-change(在线 DDL)
pt-online-schema-change --alter "ADD COLUMN remark VARCHAR(500)" \
    D=mydb,t=orders \
    --execute

-- 或使用 MySQL 8.0 的 INSTANT DDL(加列不重建表)
ALTER TABLE orders ADD COLUMN remark VARCHAR(500), ALGORITHM=INSTANT;

4.2 DELETE 操作规范

sql
-- 坏:一次性删除大量数据
DELETE FROM logs WHERE create_time < '2023-01-01';  -- 可能删几百万行

-- 好:分批删除
DELIMITER $$
CREATE PROCEDURE sp_batch_delete_logs()
BEGIN
    DECLARE v_affected INT DEFAULT 1;

    WHILE v_affected > 0 DO
        DELETE FROM logs
        WHERE create_time < '2023-01-01'
        LIMIT 1000;

        SET v_affected = ROW_COUNT();
        COMMIT;

        -- 间歇,减少主库压力
        DO SLEEP(1);
    END WHILE;
END$$
DELIMITER ;

4.3 索引操作规范

sql
-- 坏:无索引时的全表扫描查询
SELECT * FROM big_table WHERE unindexed_col = 'value';
-- 会锁大量行,影响并发

-- 好:创建索引操作注意
-- 1. 低峰期执行
-- 2. 使用 pt-online-schema-change 或 INPLACE 算法
-- 3. 监控主从延迟

-- 检查索引使用率
SELECT
    OBJECT_SCHEMA,
    OBJECT_NAME,
    INDEX_NAME,
    COUNT_STAR AS accesses,
    COUNT_READ AS reads
FROM performance_schema.table_io_waits_summary_by_index_usage
WHERE INDEX_NAME IS NOT NULL
ORDER BY accesses ASC
LIMIT 20;  -- 使用少的索引可考虑删除

4.4 线上操作检查清单

DDL 操作前:
  [ ] 在测试环境验证
  [ ] 选择业务低峰期执行
  [ ] 评估影响(预估执行时间、磁盘空间)
  [ ] 准备回滚方案
  [ ] 通知相关团队

DML 操作前:
  [ ] 确认 WHERE 条件正确(先 SELECT 确认影响行数)
  [ ] 开启事务,不自动提交
  [ ] 先备份受影响数据
  [ ] 分批执行,避免大事务

数据修复前:
  [ ] 备份原始数据
  [ ] 在测试环境验证修复 SQL
  [ ] 准备回滚脚本
  [ ] 通知业务方停服(如果需要)

紧急操作:
  [ ] 评估影响范围
  [ ] 通知相关方
  [ ] 执行后验证
  [ ] 记录故障时间线和原因

五、实战案例

案例:完整的运维脚本

bash
#!/bin/bash
# MySQL 运维健康检查脚本

MYSQL_USER="monitor"
MYSQL_PASS="password"
MYSQL_HOST="127.0.0.1"
MYSQL_PORT="3306"
ALERT_EMAIL="dba@company.com"

# 检查函数
check_mysql_alive() {
    mysql -u$MYSQL_USER -p$MYSQL_PASS -h$MYSQL_HOST -P$MYSQL_PORT -e "SELECT 1" > /dev/null 2>&1
    if [ $? -ne 0 ]; then
        echo "CRITICAL: MySQL is not responding" | mail -s "MySQL DOWN" $ALERT_EMAIL
        return 1
    fi
}

check_connection_count() {
    CONN_COUNT=$(mysql -u$MYSQL_USER -p$MYSQL_PASS -h$MYSQL_HOST -P$MYSQL_PORT \
        -e "SELECT COUNT(*) FROM information_schema.PROCESSLIST" 2>/dev/null | tail -1)
    MAX_CONN=$(mysql -u$MYSQL_USER -p$MYSQL_PASS -h$MYSQL_HOST -P$MYSQL_PORT \
        -e "SHOW VARIABLES LIKE 'max_connections'" 2>/dev/null | awk '{print $2}')

    USAGE=$(echo "scale=2; $CONN_COUNT / $MAX_CONN * 100" | bc)
    if [ $(echo "$USAGE > 80" | bc) -eq 1 ]; then
        echo "WARNING: Connection usage at $USAGE%" | mail -s "MySQL Connection Alert" $ALERT_EMAIL
    fi
}

check_replication() {
    SLAVE_STATUS=$(mysql -u$MYSQL_USER -p$MYSQL_PASS -h$MYSQL_HOST -P$MYSQL_PORT \
        -e "SHOW SLAVE STATUS\G" 2>/dev/null)

    IO_RUNNING=$(echo "$SLAVE_STATUS" | grep "Slave_IO_Running:" | awk '{print $2}')
    SQL_RUNNING=$(echo "$SLAVE_STATUS" | grep "Slave_SQL_Running:" | awk '{print $2}')
    SECONDS_BEHIND=$(echo "$SLAVE_STATUS" | grep "Seconds_Behind_Master:" | awk '{print $2}')

    if [ "$IO_RUNNING" != "Yes" ] || [ "$SQL_RUNNING" != "Yes" ]; then
        echo "CRITICAL: Replication is not running" | mail -s "MySQL Replication Down" $ALERT_EMAIL
    fi

    if [ "$SECONDS_BEHIND" -gt 300 ]; then
        echo "CRITICAL: Replication delay $SECONDS_BEHIND seconds" | mail -s "MySQL Replication Delay" $ALERT_EMAIL
    fi
}

check_disk_usage() {
    DISK_USAGE=$(df -h /var/lib/mysql | tail -1 | awk '{print $5}' | sed 's/%//')
    if [ "$DISK_USAGE" -gt 80 ]; then
        echo "WARNING: Disk usage at $DISK_USAGE%" | mail -s "MySQL Disk Alert" $ALERT_EMAIL
    fi
}

# 执行检查
check_mysql_alive || exit 1
check_connection_count
check_replication
check_disk_usage

echo "Health check completed at $(date)"

总结

领域 核心规范
权限管理 最小权限、限制登录主机、强密码策略
监控指标 连接数、QPS/TPS、慢查询、锁等待、复制延迟
大事务处理 分批处理、设置超时、及时提交
DDL 规范 pt-online-schema-change、低峰期、备份
DML 规范 先 SELECT 确认、分批处理、备份原始数据
紧急操作 评估影响、通知相关方、记录故障时间线