head node monitor

The following script has been edited to be benign, but please still handle it with care 🙂. It was essentially written to get disallowed processes scrubbed from the head nodes at PSU RCC. Passwordless ssh and sendmail need to be configured for it to work.

#!/bin/bash
# Head node monitoring - WJB 10/13
#
# Purpose:
#   Watch the head nodes listed in `machs` for disallowed applications
#   (listed in `procs`) that are not owned by root and show CPU usage > 1.
#   An entry that is present in two snapshots taken 10 minutes apart is
#   logged and its owner is warned by mail (once per day per entry).
#   Entries still present 30 minutes later are printed as termination
#   candidates. Nothing is killed by this script; it only reports.
#
# Requirements:
#   Passwordless ssh to <cluster>.my.dom.edu and a working sendmail.
#
# Files written in the current directory, per cluster:
#   <cluster>.zero_mins / .ten_mins / .30_mins   sorted process snapshots
#   <cluster>.kill_maybe / .kill_def             candidates (removed each cycle)
#   <cluster>.nasty_grams                        entries already mailed today

declare -a machs=('clusterA' 'clusterB' 'clusterC')
declare -a procs=('matlab' 'gimp' 'amber12' 'STAR' 'molden' 'lsdyna' 'MATLAB')

# Recipient of the warning mails.
# TODO: substitute the offending uid in the address eventually.
mail_to='bill@localhost'

# Day of month on which the current nasty-gram logs were started.
day=$(date +%d)

#######################################
# Take one process snapshot of a head node.
# Arguments: $1 - cluster name, $2 - output file
# Outputs:   writes sorted "field3 field5 field15" lines of `ps -Alf`
#            to $2 for every non-root process whose field 6 is > 1.
#            On ssh failure the file is left empty and a warning goes
#            to stderr, so an unreachable node yields no candidates.
#######################################
snapshot() {
  local cluster=$1 outfile=$2 listing

  # -n: keep ssh from reading our stdin (needed when run in the background).
  if ! listing=$(ssh -n "$cluster.my.dom.edu" ps -Alf); then
    printf 'warning: could not query %s\n' "$cluster" >&2
    listing=''
  fi

  # NOTE(review): with procps `ps -Alf`, field 3 is UID, field 4 is PID,
  # field 5 is PPID, field 6 is C (CPU) and field 15 is the first word of
  # CMD. Field 5 is kept exactly as in the original -- confirm whether PID
  # (field 4) was intended before ever wiring this up to `kill`.
  #
  # The output is sorted because comm(1) requires sorted input.
  awk '{if(($6 > 1) && ($3!="root") && (NR>1)){print $3,$5,$15}}' \
    <<<"$listing" | LC_ALL=C sort > "$outfile"
}

#######################################
# List entries present in both snapshots that run a disallowed program.
# Globals:   procs (read)
# Arguments: $1 - earlier snapshot file, $2 - later snapshot file
# Outputs:   unique "field1 field2" lines on stdout (may be empty)
#######################################
persistent_offenders() {
  local earlier=$1 later=$2 name

  for name in "${procs[@]}"; do
    # Match the program name against the command field only, so a user
    # name that happens to contain e.g. "gimp" is not a false positive.
    LC_ALL=C comm -12 "$earlier" "$later" \
      | awk -v pat="$name" 'index($3, pat) {print $1,$2}'
  done | LC_ALL=C sort -u
}

#######################################
# Mail the policy warning for one cluster.
# Globals:   mail_to (read)
# Arguments: $1 - cluster name
# Returns:   exit status of sendmail
#######################################
send_warning() {
  local cluster=$1
  local subject="Your process(es) running on $cluster"

  # sendmail has no -s option; the subject must be a message header.
  # -t makes sendmail take the recipient from the To: header.
  {
    printf 'To: %s\n' "$mail_to"
    printf 'Subject: %s\n' "$subject"
    printf '\n'
    cat <<EOF
Dear User;
You have one or more compute processes set for deletion on the head node of $cluster in 30mins,
please remove it before then in keeping with policy. Recall that applications may only
be run interactively on the appropriate cluster eg., hammer, or through PBS on the lion clusters.
EOF
  } | sendmail -t
}

#######################################
# Warn about every candidate that has not been mailed yet today.
# Arguments: $1 - cluster name, $2 - candidate file ("field1 field2" lines)
# Outputs:   appends newly mailed entries to <cluster>.nasty_grams
#######################################
notify_offenders() {
  local cluster=$1 candidates=$2
  local log="$cluster.nasty_grams" entry

  while IFS= read -r entry; do
    [[ -n "$entry" ]] || continue

    # We don't want users getting multiple nasty grams per day for the
    # same entry. Exact whole-line match: "bob 12" must not match "bob 123".
    if [[ -f "$log" ]] && grep -Fxq -- "$entry" "$log"; then
      continue
    fi

    # Record the entry only after the mail went out, so a failed send is
    # retried on the next cycle.
    if send_warning "$cluster"; then
      printf '%s\n' "$entry" >> "$log"
    else
      printf 'warning: could not mail about "%s" on %s\n' "$entry" "$cluster" >&2
    fi
  done < "$candidates"
}

#######################################
# Drop the nasty-gram logs when the day of month has changed.
# Globals:   day (read/written), machs (read)
#######################################
reset_logs_on_new_day() {
  local today x
  today=$(date +%d)

  if [[ "$today" != "$day" ]]; then
    day=$today
    for x in "${machs[@]}"; do
      rm -f -- "$x.nasty_grams"
    done
  fi
}

while true; do

  # Get some data.
  for x in "${machs[@]}"; do
    snapshot "$x" "$x.zero_mins"
  done

  # Wait a bit.
  sleep 10m

  # Check again: entries seen in both snapshots are possible candidates
  # for process deletion, and their owners get warned.
  for x in "${machs[@]}"; do
    snapshot "$x" "$x.ten_mins"
    persistent_offenders "$x.zero_mins" "$x.ten_mins" > "$x.kill_maybe"
    notify_offenders "$x" "$x.kill_maybe"
  done

  # Wait a bit.
  sleep 30m

  # Check finally: entries still present are candidates for termination.
  for x in "${machs[@]}"; do
    snapshot "$x" "$x.30_mins"
    persistent_offenders "$x.zero_mins" "$x.30_mins" > "$x.kill_def"

    echo "On cluster: $x"
    cat -- "$x.kill_def"

    rm -f -- "$x.kill_def" "$x.kill_maybe"
  done

  # Must run inside the loop: the loop never terminates, so anything placed
  # after it would never execute.
  reset_logs_on_new_day

done
Advertisements

2 comments

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s