业务报警订单提交异常,页面一直没有反应,排查后发现是事务没有提交或者回滚导致的。想到如果能及时监控事务的运行状态并报警出来,那么就可以及时排查出问题所在。
序言:
业务报警订单提交异常,页面一直没有反应,排查后是事务没有提交或者回滚导致,想到如果及时监控事务的运行状态报警出来,那么就可以及时排查出问题所在,方便运营处理,所以自己就弄了一个shell脚本放在nagios来处理事务报警情况。
1,编写事务监控脚本
#!/bin/bash
# author: tim.man
# version: 1.0
# desc: Nagios check for long-running InnoDB transactions. It reports how many
#       transactions have been open for more than time_trx seconds and the age
#       of the oldest one, compared against the -w/-c thresholds.

# Script name shown in the usage line printed by print_help.
progname=${0##*/}

# Exit codes handed back to Nagios.
st_ok=0 # OK
st_wr=1 # WARNING
st_cr=2 # CRITICAL
st_uk=3 # UNKNOWN (usage errors)

# Transactions younger than this many seconds are ignored by the SQL filters.
time_trx=10
#######################################
# Print the usage text and terminate with the UNKNOWN status.
# Globals:   progname (read; falls back to the script name when unset)
#            st_uk    (read; falls back to 3 when unset)
# Outputs:   usage text on stdout
# Exits:     always, with $st_uk
#######################################
print_help() {
  printf '%s -w int -c int\n' "${progname:-${0##*/}}"
  # Every line is quoted: the bare ")" in the option labels would otherwise be
  # parsed as shell syntax, and printf keeps "-w..." from being read as a flag.
  printf '%s\n' \
    'options:' \
    '-w/--warning)' \
    'sets a warning number' \
    '-c/--critical)' \
    'sets a critical level for transaction run time'
  exit "${st_uk:-3}"
}
#######################################
# Parse the command line into the threshold globals.
# Globals:   warning, critical (written); st_uk (read, defaults to 3)
# Arguments: the script's own arguments ("$@")
# Outputs:   an error line plus the usage text on stdout for bad input
# Exits:     with $st_uk on -h/--help, on an unknown argument, or when an
#            option is given without a value
#######################################
parse_args() {
  # Loop on the argument count: an unquoted `test -n $1` is still true once
  # the arguments are exhausted, which made every run end in "unknown argument".
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --help | -help | -h)
        print_help
        exit "${st_uk:-3}"
        ;;
      --warning | -w)
        if [ "$#" -lt 2 ]; then
          echo "missing value for $1"
          print_help
          exit "${st_uk:-3}"
        fi
        warning=$2
        shift
        ;;
      --critical | -c)
        if [ "$#" -lt 2 ]; then
          echo "missing value for $1"
          print_help
          exit "${st_uk:-3}"
        fi
        critical=$2
        shift
        ;;
      *)
        echo "unknown argument: $1"
        print_help
        exit "${st_uk:-3}"
        ;;
    esac
    shift
  done
}

parse_args "$@"
#######################################
# Classify the warning/critical threshold combination.
# Globals:   warning, critical (read)
#            wclvls (written: 1 when both thresholds are present)
#            wcdiff (written):
#              0 - thresholds are usable, or none were given
#              1 - warning is greater than critical
#              2 - warning given without critical
#              3 - critical given without warning
#######################################
get_wcdiff() {
  # Always start from a defined value so val_wcdiff never compares an empty
  # string.
  wcdiff=0
  if [[ -n "${warning:-}" && -n "${critical:-}" ]]; then
    wclvls=1
    if [ "$warning" -gt "$critical" ]; then
      wcdiff=1
    fi
  elif [[ -n "${warning:-}" ]]; then
    wcdiff=2
  elif [[ -n "${critical:-}" ]]; then
    wcdiff=3
  fi
}
#######################################
# Validate the threshold classification stored in wcdiff.
# Globals:   wcdiff (read; unset or empty is treated as 0)
#            st_uk  (read; falls back to 3 when unset)
# Outputs:   a hint on stdout when the thresholds are unusable
# Returns:   0 when wcdiff is not 1, 2 or 3
# Exits:     with $st_uk when wcdiff is 1, 2 or 3
#######################################
val_wcdiff() {
  case "${wcdiff:-0}" in
    1)
      echo 'please adjust your warning/critical thresholds. the warning must be lower than the critical level!'
      ;;
    2)
      echo 'please also set a critical value when you want to use warning/critical thresholds!'
      ;;
    3)
      echo 'please also set a warning value when you want to use warning/critical thresholds!'
      ;;
    *)
      return 0
      ;;
  esac
  exit "${st_uk:-3}"
}
# Classify the parsed thresholds into wcdiff, then stop with a hint if they
# are unusable (warning above critical, or only one of the two supplied).
get_wcdiff
val_wcdiff
# Collect statistics about long-running InnoDB transactions:
#   max_over_time - age in seconds of the oldest transaction that has been
#                   running for more than time_trx seconds (0 when none)
#   num_trx       - number of transactions running longer than time_trx seconds
#
# NOTE(review): the password is hard-coded and passed on the command line,
# where it is visible in `ps`; consider a client option file
# (--defaults-extra-file) readable only by the nagios user.
mysql_bin=/usr/local/mysql/bin/mysql
mysql_sock=/usr/local/mysql/mysql.sock

#######################################
# Run one SQL statement and print the first column of its result.
# Globals:   mysql_bin, mysql_sock (read)
# Arguments: $1 - SQL statement
# Outputs:   first field of each result row on stdout
# Returns:   1 when the mysql client fails
#######################################
query_trx_scalar() {
  local sql=$1
  local rows
  # Capture first so a client failure is not hidden behind awk's exit status.
  # --skip-column-names keeps the header row out of the value.
  rows=$("$mysql_bin" --user=nagios --password=nagiosq@xxx \
    --skip-column-names --socket="$mysql_sock" -e "$sql") || return 1
  printf '%s\n' "$rows" | awk '{print $1}'
}

# Age of the oldest transaction above the cutoff.
if ! max_over_time=$(query_trx_scalar "select time_to_sec(timediff(now(),t.trx_started)) from information_schema.innodb_trx t where time_to_sec(timediff(now(),t.trx_started))>${time_trx:-10} order by time_to_sec(timediff(now(),t.trx_started)) desc limit 1;"); then
  echo "unknown - could not query information_schema.innodb_trx"
  exit "${st_uk:-3}"
fi
# No transaction above the cutoff yields an empty result; use 0 so the numeric
# comparisons further down do not fail.
max_over_time=${max_over_time:-0}

# Number of transactions above the cutoff.
if ! num_trx=$(query_trx_scalar "select count(1) from information_schema.innodb_trx t where time_to_sec(timediff(now(),t.trx_started))>${time_trx:-10};"); then
  echo "unknown - could not query information_schema.innodb_trx"
  exit "${st_uk:-3}"
fi
num_trx=${num_trx:-0}
#######################################
# Compare the oldest transaction age with the thresholds and report the
# result to Nagios.
# Globals:   warning, critical (read) - thresholds in seconds
#            max_over_time (read)     - oldest transaction age; empty means 0
#            num_trx (read)           - number of long-running transactions
#            st_ok, st_wr, st_cr, st_uk (read) - exit codes
# Outputs:   one status line on stdout
# Returns:   0 without output when the two thresholds are not both set
#            (behaviour of the original script)
# Exits:     st_wr when warning < age <= critical, st_cr when age > critical,
#            st_ok otherwise, st_uk when a value is not a non-negative integer
#######################################
evaluate_trx_status() {
  if [[ -z "${warning:-}" || -z "${critical:-}" ]]; then
    return 0
  fi

  local age=${max_over_time:-0}
  local count=${num_trx:-0}
  local value

  # Reject anything non-numeric up front; `[ -gt ]` would otherwise error out
  # and the check would fall through to "ok".
  for value in "$age" "$warning" "$critical"; do
    case "$value" in
      *[!0-9]*)
        echo "unknown - non-numeric value in transaction check: $value"
        exit "${st_uk:-3}"
        ;;
    esac
  done

  if [ "$age" -gt "$warning" ] && [ "$age" -le "$critical" ]; then
    echo "warning - $count transactions running,go over for $age seconds"
    exit "${st_wr:-1}"
  elif [ "$age" -gt "$critical" ]; then
    echo "critical- $count transactions runnning,go over for $age seconds"
    exit "${st_cr:-2}"
  else
    echo "ok- transactions ran successfully."
    exit "${st_ok:-0}"
  fi
}

evaluate_trx_status