Article Listing 1 Listing 2 Listing 3 oct2004.tar

Listing 2 load monitoring script

#!/usr/bin/ksh

###########################################################
#10/01/2003
###########################################################
#load-mon.sh - command to monitor load from cron (TEMPORARY)
#
#This script checks whether idle is under 10% for the last 200 sec
#and does alerting; it runs at minutes 11,21,31,41,51.
#This has to be done through RMC
#
# ARGUMENTS: 
#
# QUIRKS
# it is just an improvisation ...
#
# FUNCTIONS:
# mesgLoad 
# mesgHog 
# eventLoad 
# eventHog 
#
###########################################################
# 
# REVISIONS 
# Wed Oct  1 10:53:07 DFT 2003
# 1.0 First version, tested and rushed into production
# 1.1  speed up - load was oscillating
# Fri Oct 24 09:49:59 DFT 2003
# 1.1   hog monitoring too - CPU cumulative for oracle >4%
# just not a very good fix
# 1.2 Mon Jan 26 08:25:22 NFT 2004
# L modified to be able to fire oracle snap 
#
###########################################################

#action on load over limit event

mesgLoad ()
{
    logger "Load $L"

mailx -s "Load on $HST over 90%" $MAILS << EOF
Load on $HST over 90%
EOF

    exit 0

}

#action on hog event
#mail is commented out
mesgHog ()
{
    logger "Hog $H"
    exit 0

#mailx -s "Possible CPU hog on $HST " $MAILS << EOF
#Possible CPU hog on $HST
#$H
#EOF

}

#load detection
eventLoad ()
{
    sar -u -i$TIME | tail -3 | head -1 | awk '$5+0< '$IDLE ' { print }'
}

#hog detection
eventHog ()
{
    ps -ekF "%u %p %C %x %c %a" | sort -rn +2 | awk '$3+0>'$HOGL' && \
      $1~/oracle/ { print }'| sed 's/  *$//'
}

##########################################################
#main part

export PATH=/usr/bin:/etc:/usr/sbin:/usr/ucb:/usr/bin/X11:/sbin:/usr/java131/ \
  jre/bin:/usr/java131/bin:/usr/lib/instl:/usr/local/bin:/usr/symcli/ \
  bin:/usr/lpp/Symmetrix/bin

##########################################################

### configuration part

VERSION="1.2"

#where messages are going
MAILS="root"

#if monitoring is enabled
/rootop/control_enabled.sh || exit 0

DT=$(date "+%y.%m.%d.%H:%M:%S")

HST=$(hostname)

#sar with 10 minutes resolution -i600, changed to 5 minutes
#load idle less than 10%
IDLE=10    ;    #idle 
TIME=200;       #200 seconds

## sensor part and condition part - there is flodding control here

L=$(eventLoad)

#long term hog - CPU over 7 - by experinece
HOGL=7    ;#cumulative CPU load

H=$(eventHog)

## event and action part

#it is event fire snapshot oracle and send message exit because all is done
test -n "$L" && mesgLoad && /rootop/fireOracleSnap.sh && exit 0

#it is event fire snapshot oracle and send message
test -n "$H" &&  mesgHog && /rootop/fireOracleSnap.sh 

exit 0