#!/usr/local/bin/perl

##################################################
#
# NodeWatch - Monitor and react to TCP/IP network nodes.
#
#     Version 1.6
#     Last revised on September 23, 1998.
#
# Copyright (C) 1998 Patrick Ryan.  All rights reserved.
#
# Authors:
#
#     Patrick Ryan <pryan@fhcrc.org>
#     Stuart Kendrick <skendrc@fhcrc.org>
#     Ron Hood <rhood@fhcrc.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
##################################################

##################################################
#
# Usage:
#
#      nodewatch
#
# Synopsis:
#
#      NodeWatch pings a list of nodes obtained from
#      a database and notes, through the execution
#      of actions which have been passed messages
#      and entries to the syslog, if a node has gone
#      down or up.
#
#      It is designed to support the management of an
#      IP network in which a small group of people
#      manage the net and a larger collection of people
#      are interested in the health of specific nodes.
#
#      Status reports are issued at predefined times.
#      The status message indicates nodewatch is
#      operational, and whether any nodes are down at
#      the time of issue.  If any nodes are down, then
#      a list of the down nodes is reported as well.
#      The status reports are communicated by executing
#      the special action.
#
#      All options are specified in the option database.
#      NodeWatch will automatically detect changes in the
#      option database and reread it.  This applies to the
#      node, action, and time period databases as well.
#
# NodeWatch relies on Time::Period.  It can be obtained on CPAN at
# <URL=http://www.perl.com/CPAN/modules/by-module/Time/>.
#
# History (current version is 1.6):
#
#      Version 1.6:
#           Patrick Ryan:
#            - Added the scheduled down time feature.
#            - Redirected fping stderr to the syslog.
#            - Added period validity checking in read_period_db().
#      Version 1.5:
#           Patrick Ryan:
#            - Fixed a bug dealing with network partitioning.
#            - Added the ability to use the "none" action.
#            - Fixed some code which set nodes to the unknown status.
#      Version 1.4:
#           Patrick Ryan:
#            - Added differentiation between syslog operational messages and
#              syslog node messages.
#            - Changed the name to NodeWatch.
#            - Added allow multiple instances option.
#      Version 1.3:
#           Patrick Ryan:
#            - Pulled out qpage code from pingpage and made an action field
#              in the nodes file.  The action field is a command to run when
#              pingpage wants to send a message.  A set of symbols is defined
#              which, when encountered in the action field, are replaced with
#              what the symbols represent.
#      Version 1.2:
#           Patrick Ryan:
#            - Merged Pingpage 1.1 with Pingpage 1.1bug2.  Pingpage 1.2
#              is designed to use the Quick Page alphanumeric paging client.
#            - Cleaned up source code.
#            - Put configuration variables which were previously configurable
#              only from the source into the configuration file mechanism.
#      Version 1.1:
#           Patrick Ryan:
#            - Fixed a bug with pinging.  If $result returned null after
#              executing the pager, the error wouldn't be dected.  It is now.
#      Version 1.0:
#           Stuart Kendrick:
#	     - Production
#      Version 0.9:
#	    Stuart Kendrick:
#           - Bug fix to $check_partition
#	    - Migrated monoprocess code to do_lockfile().
#           - Give up current UID and become $pingpage_user
#	    - Removed location of /usr/bin/ps and pingpage.lock
#	      from options list
#      Version 0.8:
#           Patrick Ryan:
#            - Added %node_hist to keep track of time at which a node
#              last changed states.
#      Version 0.7:
#           Patrick Ryan:
#            - Complete rewrite.
#            - Major changes:
#               - Multiple pinging using fping.
#               - Dumps current node status to a well known file when
#                 given a signal.
#               - Makes the crier times independant of the start and end
#                 of the day.
#      Version 0.1:
#           Patrick Ryan:
#            - Changed source code formatting.
#            - Implemented simple changes.
#      Version 95Jun15
#           Ron Hood:
#            - Added on/off switches.
#      Version 94Jun08
#           Ron Hood:
#            - Added paging for other organizations.
#      Version 94Jun06
#           Ron Hood:
#            - Added crier function.
#      Version 94Jun02
#           Ron Hood:
#            - First Version.
#
##################################################

##################################################
# Location of the option database and option defaults.

$option_db = "nodewatch.options";

# Default values for the options stored in $option_db.
%default_options = (
  'n_rpts_act' => 1,                        # Node Reports via Action
  'n_rpts_sys' => 1,                        # Node Reports via Syslog
  'n_flux_sys' => 1,                        # Node Flux in Syslog
  's_rpts_sact' => 1,                       # Status Reports via Special Action
  's_rpts_sys' => 1,                        # Status Reports via Syslog
  's_rpt_times' => '700 1800',              # Status Report Times
  'rrc_sact' => 1,                          # Report Report Changes via Special
                                            # Action
  'rrc_sys' => 1,                           # Report Report Changes via Syslog
  'n_per' => 6,                             # Nodes per Action Message
  'lockfile' =>                             # Lockfile location
    '/home/netops/etc/nodewatch.lockfile',  #
  'run_as' => 'netops',                     # Run as User
  'mult_int' => 1,                          # Allow Multiple Instances?
  'node_db' =>                              # Node Database
    '/usr/local/etc/nodewatch.nodes',       #
  'action_db' =>                            # Action Database
    '/usr/local/etc/nodewatch.actions',     #
  'period_db' =>                            # Time Period Database
    '/usr/local/etc/nodewatch.periods',     #
  'stats_dump_file' =>                      # Path to stats dump file.
    '/usr/local/etc/nodewatch.node.stats',  #
  'ps' => '/usr/bin/ps -A -o pid,args',     # PS command
  'pinger' => '/usr/local/bin/fping',       # Pinger command
  'p_name' => 'nodewatch',                  # Process Name
  'p_type' => 'local0',                     # Process Type
  'node_m_type' => 'notice',                # Node Message Type
  'm_type' => 'info',                       # Message Type
  'err_m_type' => 'err',                    # Error Message Type
  'pid' => 0                                # Show PID
);

#
##################################################

##################################################
# Table of variable semantics.
#
# $option_db       - The option database.
# $mtime_option_db - The last time the option database was modified.
# $mtime_node_db   - The last time the node database was modified.
# $mtime_action_db - The last time the the action database was modified.
# $mtime_period_db - The last time the time period database was modified.
# $loop_time       - The current time for this cycle.
# $loop_time_then  - The time for the pervious cycle.
# %options         - Global options.
# %old_options     - Global options from the last time they were read.
# @nodes           - Array of nodes to watch.
# %class           - The class of the nodes.
# %count           - The count of the nodes.
# %old_count       - The count of the nodes from the last node db reading.
# %timeout         - Timeout associated with each node.
# %act_pers        - Action and associated periods for each node.
# %actions         - A mapping from action identifiers to definitions.
# %periods         - A mapping from period identifiers to definitions.
# %ping_result     - The list of pinger responses from this cycle.
# %old_ping_result - The list of pinger responses from one cycle ago.
# %status          - The status of the nodes.
# %old_status      - The status of the nodes from the previous cycle.
# %in_a_row        - The number of consecutive same type responses.
# %in_sdt_then     - Whether a node was in a SDT during the previous cycle.
# %sdt_been_dn     - Whether a node has entered a SDT down and stayed down.
# %sdt             - How a node's cheduled down times affect it, if at all.
# $partitioned     - Flag which indicates whether any routers are down.
# @up              - Array of nodes currently up.
# @dn              - Array of nodes currently down.
# @unknown         - Array of nodes currently unknown.
# @gone_up         - Array of nodes that have just gone up.
# @gone_dn         - Array of nodes that have just gone down.
# @up_flux         - Array of nodes that are up but in flux.
# @dn_flux         - Array of nodes that are down but in flux.
# $r               - Right delimiter for a node list in a message.
# $l               - Left delimiter for a node list in a message.
# $dump_stats      - Flag to dump node statistics to file.
# %node_hist       - Time each node last changed states.
#
##################################################

##################################################
# What the functions do.
#
##################################################

##################################################
# The main program.

use Time::Period;
use IPC::Open3;
use Sys::Syslog;

to_syslog("Starting up.");

# The signal that we'll use to dump node statistics.
$SIG{USR1} = \&caught_dump_sig;

# The stop process signals that we'll use to run shut_down().
$SIG{HUP} = \&shut_down;
$SIG{INT} = \&shut_down;
$SIG{TERM} = \&shut_down;

# This allows us to harvest child processes.
$SIG{CHLD} = sub { wait; };

# Remember the last modification time for the option database.
$mtime_option_db = (stat($option_db))[9];

# Read the option database.
read_option_db();

# Become user $options{run_as} if possible.
change_id() if ($> == 0);

# Check for presence of another process, initialize lockfile.
do_lockfile();

# Loop forever.
while () {

  # Check to see if the option, node, action, or period databases have changed.
  # If so, reread.  If this is the first time through, the node status gets
  # initialized to unknown.

  check_files();

  # At this point, we are virtually gauranteed up-to-date database data.


  # Make a copy of the %ping_result array.

  %old_ping_result = %ping_result;

  # Ping all the nodes.  The return values are: 1 for echo, -1 for no echo,
  # and 0 for bad address.  This subroutine sets the %ping_result hash.

  ping_nodes();

  # If the ping_result hash is null, then the pinger must not be executable.

  if (!defined(%ping_result)) {
    sleep($options{sleep_time});
    next;
  }

  # If any of the nodes were bad addresses, remove them from %ping_result
  # and complain.

  check_for_bad_addresses();


  # Update %in_a_row to account for the new ping data (%ping_result).

  in_a_row();


  # %in_a_row keeps track of the number of consecutive pings for a node.
  # A positve number indicates the number of pings that have come back.
  # A negative number for how many pings a node has missed in a row.
  # Zero means that a node entry was reset.


  # Save a copy of the status hash just before updating it so that we can
  # detect state changes.

  %old_status = %status;


  # Now that we have current ping information, let's go though and see if
  # the status of any nodes need to change because of this ping.
  # This routine also adds nodes that are in flux to either @up_flux or
  # @dn_flux.  Flux is defined as a node that is up and missing pings,
  # or a node that is down and returning pings.  Nodes that are up and
  # in flux belong in @up_flux and nodes which are down and in flux belong
  # in @dn_flux.

  update_status();


  # %status works like this: 1 for up, 0 for unknown, and -1 for down.
  # This indicates the current state of the node.  A node goes up if
  # it was previously down or unknown and it returns n consecutive pings.
  # A node goes down if it was previously up or unknown and it misses
  # n consecutive pings.  A node is up if it has gone up and has not gone
  # down or been set to unknown.  A node is down if it has gone down and
  # not gone up or been set to unknown.  Unknown is used to control reporting.
  # A message won't go out when a node goes from unknown to either up or down
  # or from down or up to unknown.


  # Check to see if any of the routers are down.  If they are, then the
  # network is partitioned.  If the network is partitioned, then only the
  # special action is invoked and the only node events which are noticed
  # are router state changes or fluctuations.  This is accomplished by
  # erasing non-router entries from @up_flux and @dn_flux and by setting
  # all non-router nodes to unknown.
  # $partitioned gets set to 1 if the network is partitioned and 0 if not.

  partitioning();


  # Updates %sdt to indicate which nodes should be suppressed due to
  # scheduled down times associated with nodes.  If a node has an undefined
  # value in %sdt, it means it is unaffected by any scheduled down time and
  # will behave according to the other actions and non-sdt periods associated
  # with it.  If it is defined, but contains the empty string, then all
  # actions are suppressed for the node, including the one associated with
  # the sdt.  However, if the entry in %sdt for a node contains something
  # other than "" then it is assumed to be a list of actions to take for that
  # node.

  update_sdt();


  # Set up the various arrays to be used by the messaging code.  Those
  # arrays are @up, @dn, @unknown, @gone_up, and @gone_dn.  This routine
  # also records the time at which a node last changed states.  This
  # information is recorded in %node_hist.

  to_arrays();


  # Ok, we have the seven arrays set up.  Those arrays are: @up, @dn,
  # @unknown, @gone_up, @gone_dn, @up_flux, and @dn_flux.  Let's take
  # the appropriate action.


  # Make syslog entries for nodes which are in flux.

  node_flux_syslog() if ($options{n_flux_sys});


  # This routine sets the delimiters for messages.  It hinges on
  # whether the network is partitioned or not.  Braces are used "{" for
  # a partitioned network and brackets "[" for a non-partitioned network.

  set_delimiters();


  # Now, make a syslog entry for any nodes that have gone up or down.

  node_syslog() if ($options{n_rpts_sys});


  # Dump the current node statistics if we've received a signal to do so.

  dump_stats() if $dump_stats;


  # Store the current time for use by the status reporting routines.
  # This line bumps the hour over two orders of magnitude (to the thousands
  # place) and then adds the hour, that way we get "hhmm".

  $loop_time = (localtime)[2]*100 + (localtime)[1];


  # Now, let's talk taking action.  Only execute the action for hosts that
  # have gone up or down.

  node_action() if ($options{n_rpts_act});


  # Status reporting.

  do_status();


  sleep($options{sleep_time});
}

#
##################################################

##################################################
# Function definitions, nothing else below this point.

sub read_option_db {
  # Reads option database named by $option_db and sets an
  # associative array of all the options in the file.  If the file
  # can't be read and the options are unitialized, or if a particular
  # option can't be set, then the defaults are used.
  # Depends on $option_db, %default_options, %options, and error().

  my($option, $value);

  if (! open(DB, $option_db) ) {
    # Okay, we can't read the option database, is %options empty (first time?)?
    if (!defined(%options)) {
      error("The option database cannot be opened, using defaults.");

      # Use the default values.
      %options = %default_options;
      return();
    } else {
      error("The option database cannot be opened, options unchanged.");

      # Okay, %options holds stuff.  Let's assume that simply means
      # %options has been initialized with the proper values.  Meaning
      # at one point, either the defaults were used, or we could read
      # $option_db.
      return();
    }
  }

  %options = ();

  while(<DB>) {
    # Skip blank lines.
    next if (/^\s+$/);
    # Skip comments.
    next if (/^\s*#/);
    # Strip off trailing comments
    s/#.*//;

    # Is this line a "option = value"?
    if (/^\s*(.+)=\s*(.+)/) {
      $option = $1;
      $value = $2;

      # Chop off the white space that may be trailing...
      $option =~ s/\s*$//;
      $value =~ s/\s*$//;

      # Change whitespace to one space.
      $option =~ s/\s+/ /g;

      # Match for each option.
      if ($option =~ /^Node Reports via Action$/i) {
        $options{n_rpts_act} = $value;
      } elsif ($option =~ /^Node Reports via Syslog$/i) {
        $options{n_rpts_sys} = $value;
      } elsif ($option =~ /^Node Flux in Syslog$/i) {
        $options{n_flux_sys} = $value;
      } elsif ($option =~ /^Status Reports via Special Action$/i) {
        $options{s_rpts_sact} = $value;
      } elsif ($option =~ /^Status Reports via Syslog$/i) {
        $options{s_rpts_sys} = $value;
      } elsif ($option =~ /^Status Report Times$/i) {
        $options{s_rpt_times} = $value;
      } elsif ($option =~ /^Report Report Changes via Special Action$/i) {
        $options{rrc_sact} = $value;
      } elsif ($option =~ /^Report Report Changes via Syslog$/i) {
        $options{rrc_sys} = $value;
      } elsif ($option =~ /^Nodes per Message$/i) {
        $options{n_per} = $value;
      } elsif ($option =~ /^Lockfile$/i) {
        $options{lockfile} = $value;
      } elsif ($option =~ /^Sleep Time$/i) {
        $options{sleep_time} = $value;
      } elsif ($option =~ /^Run as User$/i) {
        $options{run_as} = $value;
      } elsif ($option =~ /^Multiple Instances$/i) {
        $options{mult_int} = $value;
      } elsif ($option =~ /^Node Database$/i) {
        $options{node_db} = $value;
      } elsif ($option =~ /^Action Database$/i) {
        $options{action_db} = $value;
      } elsif ($option =~ /^Time Period Database$/i) {
        $options{period_db} = $value;
      } elsif ($option =~ /^Statistics Dump File$/i) {
        $options{stats_dump_file} = $value;
      } elsif ($option =~ /^PS$/i) {
        $options{ps} = $value;
      } elsif ($option =~ /^Pinger$/i) {
        $options{pinger} = $value;
      } elsif ($option =~ /^Process Name$/i) {
        $options{p_name} = $value;
      } elsif ($option =~ /^Process Type$/i) {
        $options{p_type} = $value;
      } elsif ($option =~ /^Node Message Type$/i) {
        $options{node_m_type} = $value;
      } elsif ($option =~ /^Message Type$/i) {
        $options{m_type} = $value;
      } elsif ($option =~ /^Error Message Type$/i) {
        $options{err_m_type} = $value;
      } elsif ($option =~ /^Show PID$/i) {
        $options{pid} = $value;
      } else {
        error("Invalid entry at line $. of option database $option_db.  " .
              "Ignoring entry.");
      }
    }
  }

  close(DB);

  # Go through each option and see if it was missed.  If so, fill in
  # the default value and make a note of the discrepancy.
  foreach $option (keys %default_options) {
    if (!defined($options{$option})) {
      $options{$option} = $default_options{$option};
      error("Option $option is broken in the option database, using default ". 
             "($default_options{$option}).");
    }
  }
}

sub change_id {
  # Try to become the user specified.
  # Depends on $options{run_as} and error().

  my(@pw);

  @pw = (getpwnam($options{run_as}))[2, 3];

  if (@pw != 2) {
    error("Cannot find user $options{run_as}.\n");
    shut_down();
  }

  # Set real & effective uids and gids.  Note that this only
  # sets the primary group from the password file.
  ($(, $)) = ($pw[1], $pw[1]);
  ($<, $>) = ($pw[0], $pw[0]);
}

sub do_lockfile {
  # If we aren't supposed to allow multiple instances of NodeWatch, then
  # check to see if any NodeWatchs are running.  If so, croak.  Regardless
  # of whether we allow multiple instances or not, write the pid of this
  # process to the lock file.
  # Depends on $options{ps}, $options{lockfile}, $options{mult_int}, and
  # error().

  my($pid, @ps, $process);

  if (!$options{mult_int}) {

    # First of all, does $options{lockfile} exist?  If not, dump the
    # PID into it and bail...
    if (! -e $options{lockfile}) {
      if (!open(FILE, ">$options{lockfile}")) {
        error("Couldn't open $options{lockfile}.");
        shut_down();
      }
      printf(FILE "%d\n", $$);
      close(FILE);
      return();
    }

    # Okay, $options{lockfile} exists.  Let's try to get a pid out of it.
    if ( ! open(FILE, $options{lockfile}) ) {
      error("The lock file ($options{lockfile}) cannot be opened.");
      shut_down();
    }
    $pid = <FILE>;
    close(FILE);

    # Remove the newline from $pid, if there is one.
    chop($pid) if (substr($pid, -1, 1) eq "\n");

    # Okay, we have a pid.  Is there a nodewatch process associated with it?

    # Check to see if ps is executable.
    $process = $options{ps};
    $process =~ s/^(\S+).*/$1/;
    if (! -x $process) {
      error("$process isn't executable, assuming another $0 isn't running.");
      @ps = ();
    } else {
      @ps = `$options{ps}`;
    }

    foreach $process (@ps) {
      if ($process =~ /^\s*$pid.+$0/) {
        error("Another instance of $0 is already running.");
        shut_down();
      }
    }

    # Cool, there isn't a nodewatch process running with the pid listed in the 
    # lockfile.
  }

  # Let's write out pid to the lockfile and be done with it.

  if ( ! open(FILE, ">$options{lockfile}") ) {
    error("The lock file ($options{lockfile}) cannot be opened.");
    shut_down();
  }
  printf(FILE "%d\n", $$);
  close(FILE);
}


sub check_files {
  # Checks to see if any of four files have changed: the option database,
  # the node database, the action database, and the period database.  If the
  # modification time for any of those files have changed, the ones with
  # the differing modification times are reread.  Also, update %status or
  # initialize it if it is the first time.
  # Depends on $mtime_option_db, $option_db, %options, read_option_db(),
  # $mtime_node_db, %old_count, %count, read_node_db(), @nodes, %status,
  # $mtime_action_db, $mtime_period_db, fix_status(), %in_a_row, %old_options,
  # to_syslog(), check_options(), check_report_changes(), and clean_hashes().

  # If $option_db has changed, read it again.  $mtime_option_db will
  # be unititialized if this is the first iteration, so the conditional
  # will evaulate as true.
  if (!defined($mtime_option_db) || $mtime_option_db != (stat($option_db))[9]) {

    # The option database was read when NodeWatch started.
    to_syslog("Reading option database.");

    $mtime_option_db = (stat($option_db))[9];

    %old_options = %options;

    # Read in the option database.
    read_option_db();

    # Try to make sure dependencies work, such as pinger and ps.
    check_options();

    # Do we need to report report changes?  In other words, have
    # report options changed?
    check_report_changes();
  }

  # Same thing for the node database (stored in $options{node_db}).
  if (!defined($mtime_node_db) ||
      $mtime_node_db != (stat($options{node_db}))[9]) {

    to_syslog("Reading node database.") if (defined($mtime_node_db));

    $mtime_node_db = (stat($options{node_db}))[9];

    # Stick the current %count into %old_count.  This is done because we
    # want to know how %count changed after reading the node list.  This is
    # used by fix_status().
    %old_count = %count;

    # Read in the node list.  This dumps node names into @nodes.
    # Four associative arrays are outputted to: %class, %count, %timeout,
    # and %act_pers.
    read_node_db();

    # Recalculates the status associative array to account for changes in
    # the %count associative array.
    fix_status();

    # Erases entries for nodes that aren't in @nodes from %status, %node_hist,
    # %in_a_row, %in_sdt_then, %sdt_been_dn, and %sdt.  This prevents a
    # buildup of ghost nodes that were erased from the node database.
    clean_hashes();
  }

  # Same thing for the action database (stored in $options{action_db}).
  if (!defined($mtime_action_db) ||
      $mtime_action_db != (stat($options{action_db}))[9]) {

    to_syslog("Reading action database.") if (defined($mtime_action_db));

    $mtime_action_db = (stat($options{action_db}))[9];
    read_action_db();
  }

  # Same thing for the time period database (stored in $options{period_db}).
  if (!defined($mtime_period_db) ||
      $mtime_period_db != (stat($options{period_db}))[9]) {

    to_syslog("Reading time period database.") if (defined($mtime_period_db));

    $mtime_period_db = (stat($options{period_db}))[9];
    read_period_db();
  }
}

sub check_options {
  # Check to see if ps and pinger are executable.
  # Depends on $options{pinger} and error().

  my($temp);

  # See if the executable $options{pinger} is actually executable.
  # $options{ps} isn't checked because it was checked when we first
  # started and it's only executed when we first start up.

  $temp = $options{pinger};
  $temp =~ s/^(\S+).*/$1/;
  if (! -x $temp) {
    error("$temp isn't executable, and it must be.");
    shut_down();
  }
}

sub check_report_changes {
  # Checks to see if report options have changed.  If this is the
  # first time through, the %old_options hash will be null and changes
  # won't get reported.  Four options are watched, those are:
  # $options{n_rpts_act}, $options{s_rpts_sact}, $options{s_rpt_times},
  # and $options{rrc_sact}.  Any changes are noted through the special.
  # Depends on do_action(), to_syslog(), $actions{special}, and the options
  # from both %options and %old_options: rrc_sact, rrc_sys, n_rpts_act,
  # s_rpts_sact, and s_rpt_times.

  return() if (!defined(%old_options));

  my($msg);

  if ($old_options{rrc_sact} != $options{rrc_sact}) {
    if ($options{rrc_sact}) {
      $msg = "It is $loop_time and the report report changes option " .
              "has been enabled.";
      do_action("special", $msg, "special");
      to_syslog($msg) if $options{rrc_sys};
    } else {
      $msg = "It is $loop_time and the report report changes option " .
             "has been disabled.";
      do_action("special", $msg, "special");
      to_syslog($msg) if $options{rrc_sys};
    }
  }

  return() if (!$options{rrc_sact});

  if ($old_options{n_rpts_act} != $options{n_rpts_act}) {
    if ($options{n_rpts_act}) {
      $msg = "It is $loop_time and node reports via action have been enabled.";
      do_action("special", $msg, "special");
      to_syslog($msg) if $options{rrc_sys};
    } else {
      $msg = "It is $loop_time and node reports via action has been disabled.";
      do_action("special", $msg, "special");
      to_syslog($msg) if $options{rrc_sys};
    }
  }

  if ($old_options{s_rpts_sact} != $options{s_rpts_sact}) {
    if ($options{s_rpts_sact}) {
      $msg = "It is $loop_time and status reports via special action have " .
             "been enabled.";
      do_action("special", $msg, "special");
      to_syslog($msg) if $options{rrc_sys};
    } else {
      $msg = "It is $loop_time and status reports via special action have ".
             "been disabled.";
      do_action("special", $msg, "special");
      to_syslog($msg) if $options{rrc_sys};
    }
  }

  if ($old_options{s_rpt_times} ne $options{s_rpt_times}) {
    $msg = "It is $loop_time and the status reporting times have been changed.";
    do_action("special", $msg, "special");
    to_syslog($msg) if $options{rrc_sys};
  }
}

sub read_node_db {
  # Read in the node database.  This dumps node names into @nodes.
  # Four associative arrays are outputted to: %class, %count, %timeout,
  # and %act_pers.
  # Depends on $options{node_db} and error().

  my($node, $class, $count, $timeout, $act_pers_data, $act_per, $action,
     $period, $malformed);

  # Clear the node data arrays.
  @nodes = ();
  %class = ();
  %count = ();
  %timeout = ();
  %act_pers = ();

  # If the node database can't be opened, see if we've already read it in
  # before.  If yes, then say so, otherwise, choke.
  if (! open(DB, $options{node_db}) ) {
    if (@nodes == 0) {
      error("The node database ($options{node_db}) couldn't be opened.");
      shut_down();
    } else {
      error("The node database ($options{node_db}) couldn't be opened, " .
            "update unsuccessful.");
      return();
    }
  }

  while(<DB>) {

    # Skip blank lines.
    next if (/^\s+$/);
    # Skip comments.
    next if (/^\s*#/);
    # Strip off trailing comments
    s/#.*//;

    # The format for a node entry is:
    # node class count timeout action[:period[(d)]][[,action[:period[(d)]]] ...]
    if (/^\s*(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)/) {
      ($node, $class, $count, $timeout, $act_pers_data) = ($1, $2, $3, $4, $5);

      # Scream if the fields don't look okay.
      if ( ($class !~ /^[rch]$/i) || ($count !~ /^\d+$/) || ($count <= 0) ||
           ($timeout !~ /^\d+$/) || ($timeout <= 0) ) {
        error("The entry for $node in $options{node_db} at line $. is " .
              "screwed up.  Ignoring it.");
        next;
      }

      # Strip off trailing whitespace.
      $act_pers_data =~ s/\s*$//;

      # Strip off whitespace from commas.
      $act_pers_data =~ s/,\s*/,/g;
      $act_pers_data =~ s/\s*,/,/g;

      # Strip off whitespace from colons.
      $act_pers_data =~ s/\s*:/:/g;
      $act_pers_data =~ s/:\s*/:/g;

      $malformed = 0;

      # Try and make sure the action and valid time periods field is okay.
      foreach $act_per (split(/,/, $act_pers_data)) {
        if ( $act_per =~ /:/ && $act_per !~ /^[^:]+:[^:]+$/ ||
             $act_per =~ /:\s*\(\s*[Dd]\s*\)$/ ) {
          $malformed = 1;
          last;
        } elsif ($act_per !~ /\S/) {
          $malformed = 1;
          last;
        }
      }

      if ($malformed) {
        error("The entry for $node in $options{node_db} at line $. is " .
              "screwed up.  Ignoring it.");
        next;
      }

      # Add entries to arrays and hashes for $node.
      push(@nodes, $node);
      $class{$node} = $class;
      $count{$node} = $count;
      $timeout{$node} = $timeout;
      $act_pers{$node} = $act_pers_data;
    }
  }

  close(DB);
}

sub read_action_db {
  # Reads the action database and stores the data in %actions.
  # Depends on $options{action_db}, %actions, and error().

  my($action, $action_def);

  if (!open(DB, $options{action_db})) {
    if (!defined(%actions)) {
      error("The action database ($options{action_db}) couldn't be opened.");
      shut_down();
    } else {
      error("The action database ($options{action_db}) couldn't be opend, " .
            "update unsuccessful.");
      return();
    }
  }

  %actions = ();

  while(<DB>) {

    # Skip blank lines.
    next if (/^\s+$/);
    # Skip comments.
    next if (/^\s*#/);
    # Strip off trailing comments
    s/#.*//;

    if (/^\s*(.+)=\s*(.+)/) {
      $action = $1;
      $action_def = $2;

      $action =~ s/\s*$//;
      $action_def =~ s/\s*$//;

      if ($action =~ /:/ || $action =~ /,/) {
        error("Line $. in the action database ($options{action_db}) is " .
              "malformed.  Ignoring it.");
        next;
      }

      if ($action eq "none") {
         error("Line $. in the action database ($options{action_db}) defined " .
               "the action \"none\", which is internally defined.");
      }

      $actions{$action} = $action_def;
    }
  }

  close(DB);

  if (!defined($actions{special})) {
    error("No special action identifier in the action database " .
          "($options{action_db}).");
    shut_down();
  }
}

sub read_period_db {
  # Reads the time period database and stores the data in %periods.
  # Depends on $options{period_db}, %periods, and error().

  my($now, $period, $period_def);

  if (!open(DB, $options{period_db})) {
    if (!defined(%periods)) {
      error("The period database ($options{period_db}) couldn't be opened.");
      return();
    } else {
      error("The period database ($options{period_db}) couldn't be opened, " .
            "update unsuccessful.");
      return();
    }
  }

  %periods = ();

  $now = time();

  while(<DB>) {

    # Skip blank lines.
    next if (/^\s+$/);
    # Skip comments.
    next if (/^\s*#/);
    # Strip off trailing comments
    s/#.*//;

    if (/^\s*(.+)=\s*(.+)/) {
      $period = $1;
      $period_def = $2;

      $period =~ s/\s*$//;
      $period_def =~ s/\s*$//;

      if ($period =~ /,/) {
        error("Line $. in the period database ($options{period_db}) is " .
              "malformed.  Ignoring it.");
        next;
      }

      if (inPeriod($now, $period_def) == -1) {
        error("Line $. in the period database ($options{$period_db}) " .
              "contains an invalid period.  Ignoring it.");
        next;
      }

      $periods{$period} = $period_def;
    }
  }

  close(DB);
}

sub fix_status {
  # Recalculates %status based on changes that may have occured in the
  # count after rereading the node database.
  # If this is the first time through, then %status will be null and it
  # will get initialized.  This means that the status of each node will
  # be 0 (unknown).  This prevents screaming when the script is first
  # started.  If the count for an entry changes, then put that node to
  # unknown so that it can sort itself out gracefully.
  # Depends on %status, %count, %old_count, %in_a_row, and %node_hist.

  my($node, $now);

  $now = time();

  # Is this the first time through?  If so, set the status of all the nodes
  # to unknown, set the history hash, and return.
  if (!defined(%status)) {
    foreach $node (@nodes) {
      $status{$node} = 0;

      # Record the current time.
      $node_hist{$node} = $now;
    }
    return();
  }

  # Go through each node.
  foreach $node (@nodes) {
    if (!defined($old_count{$node}) || ($count{$node} != $old_count{$node})) {
      # If the count changes, set the status of the node to unknown.
      $status{$node} = 0;

      # Set %in_a_row to 0 for that node.
      $in_a_row{$node} = 0;

      # Record the current time.
      $node_hist{$node} = $now;
    }
  }
}

sub clean_hashes {
  # Erases entries for nodes that aren't in @nodes from %status, %in_a_row,
  # %in_sdt_then, %sdt_been_dn, and %sdt.  This prevents a buildup of ghost
  # nodes that were erased from the node database.
  # Depends on @nodes, %status, %node_hist, %in_a_row, %in_sdt_then,
  # %sdt_been_dn, and %sdt.

  my($node, %fresh_status, %fresh_node_hist, %fresh_in_a_row,
     %fresh_in_sdt_then, %fresh_sdt_been_dn, %fresh_sdt);

  foreach $node (@nodes) {
    $fresh_status{$node} = $status{$node};
    $fresh_node_hist{$node} = $node_hist{$node};
    $fresh_in_a_row{$node} = $in_a_row{$node};
    $fresh_in_sdt_then{$node} = $in_sdt_then{$node};
    $fresh_sdt_been_dn{$node} = $sdt_been_dn{$node};
    $fresh_sdt{$node} = $sdt{$node};
  }

  %status = %fresh_status;
  %node_hist = %fresh_node_hist;
  %in_a_row = %fresh_in_a_row;
  %in_sdt_then = %fresh_in_sdt_then;
  %sdt_been_dn = %fresh_sdt_been_dn;
  %sdt = %fresh_sdt;
}

sub ping_nodes {
  # Pings a list of nodes and sets an associative array called %ping_result,
  # keyed by the node name, that contains the result: 1 for ping reply, -1
  # for no reply, and 0 for bad address.  If a node didn't get assigned one
  # of those responses, then a complaint is registered.
  # If the pinger couldn't be run, then %ping_result is set to null.
  # Depends on @nodes, %timeout, $options{pinger}, %ping_result, and error().

  my($node, %by_timeout, $value, $temp, $timeout);

  %ping_result = ();

  # Build an associative array that is keyed by timeout values and
  # contains the nodes that use that timeout value.
  foreach $node (@nodes) {
    if (!defined($by_timeout{$timeout{$node}})) {
      $by_timeout{$timeout{$node}} = $node;
    } else {
      $by_timeout{$timeout{$node}} .= " $node";
    }
  }

  # Do a seperate ping for each different timeout value.
  foreach $value (keys %by_timeout) {

    # Can we execute the pinger?
    $temp = $options{pinger};
    $temp =~ s/^(\S+).*/$1/;
    if (! -x $temp) {
      error("Cannot execute the pinger ($temp).");
      %ping_result = ();
      return();
    }

    # Get the timeout in milliseconds.  Remember, $value is the timeout,
    # but in seconds, and fping needs it in milliseconds.
    $timeout = $value * 1000;

    # Do the pings.  Expected format: pinger -t timeout hosts
    open3(INPUT, OUTPUT, ERROR,
          "$options{pinger} -t $timeout $by_timeout{$value}");

    while(<OUTPUT>) {

      # Go through the results and assign a status code to each node.
      # Three different responses are assumed: node is alive, node is
      # unreachable, and node address not found.

      if (/^(\S+) is alive/) {
        $ping_result{$1} = 1;
      } elsif (/^(\S+) is unreachable/) {
        $ping_result{$1} = -1;
      } elsif (/^(\S+) address not found/) {
        $ping_result{$1} = 0;
      } else {
        error("Strange response from pinger ($_).");
      }
    }

    while(<ERROR>) {

      # Go through the errors and see if any are "x address not found".
      # If so, set the ping_result for x to 0.  If not, send it to the
      # syslog.

      if (/^(\S+) address not found/) {
        $ping_result{$1} = 0;
      } else {
        chop($_) if (/\n$/);
        error("$options{pinger} error: $_");
      }
    }
  }
}

sub check_for_bad_addresses {
  # Goes through the ping results.  A node is deleted from %ping_result if
  # the last ping came back with a bad address.  It also complains.
  # Depends on %ping_result and error().

  my($node);

  foreach $node (keys %ping_result) {
    if ($ping_result{$node} == 0) {
      delete($ping_result{$node});
      error("$node address not found.");
    }
  }
}

sub in_a_row {
  # Recalculates %in_a_row to account for the most recent ping data (stored
  # in %ping_result).
  # Depends on %ping_result, %old_ping_result, and %in_a_row.

  my($node);

  # %in_a_row records the number of consecutive pings (or misses) a node
  # has attributed to it, as of the last ping.

  foreach $node (keys %ping_result) {
    # If this is the first time through for this node...
    if (!defined($old_ping_result{$node})) {
      $in_a_row{$node} = $ping_result{$node};
      next;
    }

    if ($old_ping_result{$node} == $ping_result{$node}) {
        $in_a_row{$node}++ if ($ping_result{$node} == 1);
        $in_a_row{$node}-- if ($ping_result{$node} == -1);
    } else {
      $in_a_row{$node} = $ping_result{$node};
    }
  }
}

sub update_status {
  # Figures out the current status.  It relies on the simple rules for node
  # status.  1 for up, 0 for unknown, and -1 for down.  This routine also
  # sets up the @up_flux and @dn_flux arrays.
  # Depends on %ping_result, %in_a_row, %status, and %count.

  my($node);

  @up_flux = ();
  @dn_flux = ();

  foreach $node (keys %ping_result) {
    if ($status{$node} == 0) {      # Node status unknown?
      if ($in_a_row{$node} == $count{$node}) {       # Is the node going up?
        $status{$node} = 1;
      } elsif ($in_a_row{$node} == -$count{$node}) { # Is the node going down?
        $status{$node} = -1;
      }
    } elsif ($status{$node} == 1) { # Node status up?
      if ($in_a_row{$node} < 0) {                    # Is the node in flux?
        push(@up_flux, $node);
      }
      if ($in_a_row{$node} == -$count{$node}) {      # Is the node going down?
        $status{$node} = -1;
      }
    } elsif ($status{$node} == -1) { # Node status down?
      if ($in_a_row{$node} > 0) {                    # Is the node in flux?
        push(@dn_flux, $node);
      }
      if ($in_a_row{$node} == $count{$node}) {       # Is the node going up?
        $status{$node} = 1;
      }
    }
  }
}

sub partitioning {
  # Go through each node in the status hash and see if it is a router
  # and whether it is down.  If so, then erase all non-router entries
  # from @up_flux and @dn_flux.  This way, the only action that will be
  # executed regarding nodes will be actions corresponding to routers.
  # Depends on %ping_result, %in_a_row, @up_flux, @dn_flux, and %class.

  my($node, @tmp_up_flux, @tmp_dn_flux);

  $partitioned = 0;

  foreach $node (keys %ping_result) {
    if ($status{$node} == -1 && $class{$node} eq "r") {
      $partitioned = 1;
      last;
    }
  }

  # If the network is partitioned, then set the status of all the
  # non-router nodes to unknown.

  if ($partitioned) {
    foreach $node (keys %ping_result) {
      if ($class{$node} ne "r") {
        $status{$node} = 0;
        $in_a_row{$node} = 0;
      }
    }

    foreach $node (@up_flux) {
      push(@tmp_up_flux, $node) if ($class{$node} eq "r");
    }
    foreach $node (@dn_flux) {
      push(@tmp_dn_flux, $node) if ($class{$node} eq "r");
    }

    @up_flux = @tmp_up_flux;
    @dn_flux = @tmp_dn_flux;
  }
}

sub update_sdt {
  # Updates %sdt.  %sdt determines whether a node should be suppressed
  # due to a scheduled down time for that node.
  # Depends on %ping_result, %periods, %status, %old_status, %in_sdt_then,
  # %sdt_been_dn, and %sdt.

  my($now, $node, $in_sdt, $act_per, $action, $period);

  $now = time();

  # Go through each node that's been updated.
  foreach $node (keys %ping_result) {

    # This will get set to "" or a list of actions if it is effected by a
    # scheduled down time.
    undef($sdt{$node});

    # This won't be defined upon startup or if a node gets inserted into the
    # database while it is running.  In either case, the node will be
    # in the unknown state, therefore it is okay to skip.
    if (!defined($in_sdt_then{$node})) {
      $sdt_been_dn{$node} = 0;
      $in_sdt_then{$node} = 0;
      next;
    }

    # $in_sdt determines if $node is in a scheduled down time.
    $in_sdt = 0;

    # Determine if any of the action-period associations for $node is a sdt.
    foreach $act_per (split(/,/, $act_pers{$node})) {

      if ($act_per =~ /:/) {
        $action = $`;
        $period = $';
        if ($period =~ /\s*\(\s*[Dd]\s*\)$/) {
          $period = $`;
        } else {
          next;
        }
      } else {
        next;
      }

      # If we got this far, then $period is a sdt for $node.  But is $node
      # in a sdt?

      if (!defined($periods{$period})) {
        error("For $node, action $action has an undefined period $period.");
        next;
      }

      # We have a defined period.  Check it.

      $in_sdt = inPeriod($now, $periods{$period});

      if ($in_sdt == -1) {
        error("For $node, action $action has an invalid period $period.");
        next;
      }

      # If we are in a scheduled down time, suppress actions and stop
      # searching the list.
      if ($in_sdt == 1) {

        # Actions are suppressed while in a scheduled down time except
        # when we've been down since entering the sdt and it goes up.
        $sdt{$node} = "";

        # Update %sdt_been_dn.
        if (!$in_sdt_then{$node}) {
          if ($old_status{$node} == -1 && $status{$node} == -1) {
            # We entered a scheduled down time down, start the tracker.
            $sdt_been_dn{$node} = 1;
          }
        } else {
          if ($sdt_been_dn{$node} && $status{$node} == 1) {
            # We entered a scheduled down time down and this is the first
            # time going up, so unsuppress the other actions, if any.
            $sdt_been_dn{$node} = 0;

            undef($sdt{$node});
          }
        }

        $in_sdt_then{$node} = 1;
        last;
      }

      # Since $node is not in a sdt, is it just emerging from one?

      if ($in_sdt_then{$node}) {
        # It is emerging, so let's check to see if it is down.  If so,
        # then the action associated with this scheduled down time should
        # be executed.

        # Only add the action if it didn't enter the sdt down and stay down.
        if ($status{$node} == -1 && !$sdt_been_dn{$node}) {
          if ($sdt{$node} eq "") {
            $sdt{$node} = $action;
          } else {
            $sdt{$node} .= ",$action";
          }
        }

        # It is no longer emerging nor it is in an sdt.
        $in_sdt_then{$node} = 0;
        $sdt_been_dn{$node} = 0;
      }
    }
  }
}

sub to_arrays {
  # This assigns nodes to the status arrays.  This is simply a collection
  # of lists grouping nodes based on status.  The arrays are @up, @dn,
  # @gone_up, @gone_dn, and @unknown.  A node that has gone up or down
  # also belongs with the up or dn arrays (respectively) because it is now
  # up or down (respectively).  The time a node changed states is recorded
  # in %node_hist.
  # Depends on %ping_result, %status, %old_status, %sdt, %in_a_row,
  # and %node_hist.

  my($now, $node);

  @up = ();
  @dn = ();
  @gone_up = ();
  @gone_dn = ();
  @unknown = ();

  $now = time();

  # A node will not get added to any arrays if it goes from unknown to
  # either up or down, or down or up to unknown.

  foreach $node (keys %ping_result) {
    # If a node has changed states...
    if ($status{$node} != $old_status{$node}) {
      # Did the node go from down to up or up to down?
      if ( ($old_status{$node} == -1) && ($status{$node} == 1) ) {
        push(@gone_up, $node);
        push(@up, $node);
      } elsif ( ($old_status{$node} == 1) && ($status{$node} == -1) ) {
        push(@gone_dn, $node);
        push(@dn, $node);
      }

      # Record the current time.
      $node_hist{$node} = $now;
    } else {
      push(@up, $node) if ($status{$node} == 1);
      push(@unknown, $node) if ($status{$node} == 0);
      push(@dn, $node) if ($status{$node} == -1);
    }
  }
}

sub node_flux_syslog {
  # Sends a message to the syslog for each node that is up and is missing
  # pings or that is down and hitting pings.
  # Depends on @up_flux, @dn_flux, %in_a_row, $r, $l, $optins{node_m_type},
  # and to_syslog().

  my($node);

  # These two foreach loops make an entry to the syslog if a node is
  # up and it is missing pings, or if a node is down and is hitting pings.

  foreach $node (@up_flux) {
    to_syslog("$node $l$in_a_row{$node}$r", $options{node_m_type});
  }

  foreach $node (@dn_flux) {
    to_syslog("$node $l+$in_a_row{$node}$r", $options{node_m_type});
  }
}

sub set_delimiters {
  # Sets the message delimiters.  Braces for a partitioned network and
  # brackets for a non-partitioned network.
  # Depends on $partitioned.

  if ($partitioned) {
    $l = "{";
    $r = "}";
  } else {
    $l = "[";
    $r = "]";
  }
}

sub node_syslog {
  # Sends a message to the syslog for each node that has gone up or down.
  # Depends on @gone_up, @gone_dn, $r, $l, $options{node_m_type},
  # and to_syslog().

  my($node);

  foreach $node (@gone_up) {
    to_syslog("$node $l" . "up" . "$r", $options{node_m_type});
  }

  foreach $node (@gone_dn) {
    to_syslog("$node $l" . "dn" . "$r", $options{node_m_type});
  }
}

sub node_action {
  # Takes action for nodes that have gone up or down.
  # Depends on @gone_up, @gone_dn, %act_pers, %periods, %actions, %sdt,
  # do_action(), inPeriod(), and error().

  my($now, $node, $act_per, $action, $period, $result, %gone_up_by_action,
     %gone_dn_by_action, @action_nodes, $num_down_nodes, $msg);

  $now = time();

  # Build two hashes (associative arrays).  One hash will be for
  # up nodes, and another for down nodes.  The hashes will be keyed
  # by action and will contain up or down nodes that belong to each
  # action.

  # At this point, the actions used in these two foreach loops are just
  # the piecemeal entries from the node database.  That means we can expect
  # a format like this: "action:period", where the action is, well, some action,
  # and period is a time period used to limit the use of the action to.
  # If there is no colon and period, then any period is assumed.  This is
  # controled by not adding an entry to the associative arrays
  # (%gone_up_by_action and %gone_dn_by_action) for the actions not to be
  # invoked.  Also, the colon and decimal is stripped from the $action
  # variable before making an entry to the hash.

  foreach $node (@gone_up) {

    # Skip if actions are effected a scheduled down time.
    next if (defined($sdt{$node}));

    foreach $act_per (split(/,/, $act_pers{$node})) {

      if ($act_per =~ /:/) {
        $action = $`;
        $period = $';

        # If this is a SDT period, simply skip it.  Because of the way SDT
        # works, no action will ever be taken for a node affected by SDT
        # when it goes up.  It can be suppressed, however.
        next if ($period =~ /\s*\(\s*[Dd]\s*\)$/);

        next if ($action eq "none");

        if (!defined($periods{$period})) {
          error("For $node, action $action has an invalid period $period.");
          next;
        } else {
          next if (inPeriod($now, $periods{$period}) == 0);
        }
      } else {
        next if ($act_per eq "none");
        $action = $act_per;
      }

      if (!defined($gone_up_by_action{$action})) {
        $gone_up_by_action{$action} = $node;
      } else {
        $gone_up_by_action{$action} .= ",$node";
      }
    }
  }

  foreach $node (@gone_dn) {

    # Skip if actions are suppressed due to a scheduled down time.
    next if (defined($sdt{$node}));

    foreach $act_per (split(/,/, $act_pers{$node})) {

      if ($act_per =~ /:/) {
        $action = $`;
        $period = $';

        # If we got here, then actions aren't suppressed for this node by its
        # SDTs.  Simply skip the SDTed periods.
        if ($period =~ /\s*\(\s*[Dd]\s*\)$/) {
          next;
        }

        next if ($action eq "none");

        if (!defined($periods{$period})) {
          if ($action ne "special") {
             error("No time period definition for $period.");
          }
        } else {
          next if (inPeriod($now, $periods{$period}) == 0);
        }
      } else {
        next if ($act_per eq "none");
        $action = $act_per;
      }

      if (!defined($gone_dn_by_action{$action})) {
        $gone_dn_by_action{$action} = $node;
      } else {
        $gone_dn_by_action{$action} .= ",$node";
      }
    }
  }

  # Go through %sdt and add entries to %gone_dn_by_action for those nodes
  # having at least one action in %sdt.
  foreach $node (keys %sdt) {
    next if (!defined($sdt{$node}));

    foreach $action (split(/,/, $sdt{$node})) {

      next if ($action eq "none");

      if (!defined($gone_dn_by_action{$action})) {
        $gone_dn_by_action{$action} = $node;
      } else {
        $gone_dn_by_action{$action} .= ",$node";
      }
    }
  }

  # For each action, go through and spool off the nodes for that action.
  # We have to pay attention to the message node limit ($options{n_per}).
  # That means a message can have up to $options{n_per} nodes in the message.

  # Figure out how many down nodes there currently are.
  $num_down_nodes = @dn;

  foreach $action (keys %gone_up_by_action) {

    @action_nodes = split(/,/, $gone_up_by_action{$action});

    while (@action_nodes != 0) {
      $msg = "up:$num_down_nodes$l" .
             join(", ", splice(@action_nodes, 0, $options{n_per})) . "$r";
      do_action($action, $msg);
    }
  }

  foreach $action (keys %gone_dn_by_action) {

    @action_nodes = split(/,/, $gone_dn_by_action{$action});

    while (@action_nodes != 0) {
      $msg = "dn:$num_down_nodes$l" .
             join(", ", splice(@action_nodes, 0, $options{n_per})) . "$r";
      do_action($action, $msg);
    }
  }
}

sub do_status {
  # Sends out the status reports, if it is time to do so, that is.
  # Depends on $loop_time_then, $loop_time, $options{s_rpt_times},
  # $options{n_rpts_act}, $options{s_rpts_sys}, $options{s_rpts_sact},
  # to_syslog(), $actions{special}, do_action(), $l, $r, and @dn.

  my($a_time, $its_time, $msg, @tmp, $num_down_nodes);

  # Initialize $loop_time_then if it hasn't been.
  $loop_time_then = $loop_time if (!defined($loop_time_then));

  $its_time = 0;

  # Is it time?
  foreach $a_time (split(/\s+/, $options{s_rpt_times})) {
    if ( ($loop_time >= $a_time) && ($loop_time_then < $a_time) ) {
      $loop_time_then = $loop_time;
      $its_time = 1;
    }
  }

  # If it's not time, bail from this subroutine.
  if ($its_time == 0) {
    $loop_time_then = $loop_time;
    return();
  }

  # So, we are supposed to do a status report now...  Well, let's put together
  # the message.

  # If node actions are to be executed, make this a normal status report.  If
  # not, then say node actions are disabled.
  if ($options{n_rpts_act}) {
    # Are hosts down at this time?
    if (@dn > 0) {
      $msg = "It is $loop_time and hosts are down.";
    } else {
      $msg = "It is $loop_time and all is well.";
    }
  } else {
    if (@dn > 0) {
      $msg = "It is $loop_time, node actions are disabled, and hosts are down.";
    } else {
      $msg = "It is $loop_time, node actions are disabled, and all is well.";
    }
  }

  # Should we shoot the message to the syslog?
  if ($options{s_rpts_sys}) {
    to_syslog($msg, $options{node_m_type});
  }

  # Should we shoot the message to the special action?
  if ($options{s_rpts_sact}) {
    do_action("special", $msg, "special");

    # If nodes are down, build up a message which contains a list of
    # the down nodes and pass it on to the special action.
    if (@dn > 0) {
      @tmp = @dn;
      $num_down_nodes = @tmp;

      while (@tmp != 0) {
        $msg = "dn:$num_down_nodes$l" .
               join(", ", splice(@tmp, 0, $options{n_per})) . "$r";
        do_action("special", $msg, "special");
      }
    }
  }
}

sub do_action {
  # Invokes the specified action.  Expects two to three arguments: the
  # action identifier, the message to replace _MSG_ with in the action
  # definition, time period identifier.
  # Depends on %actions, %periods, inPeriod(), and error().

  my($action, $msg, $period) = @_;
  my($cmd, $result);

  if (!defined($actions{$action})) {
    error("No definition for $action.");
    return();
  }

  if (defined($period)) {
    # Since special has no period, it goes right through.
    # If there is no definition for non special period identifiers, any time
    # is also assumed, but it is considered an abnormal condition.
    if (!defined($periods{$period})) {
      if ($period ne "special") {
        error("No time period definition for $period, assuming all times.");
      }
    } else {
      # If the time period is malformed, then process the action anyway.
      $result = inPeriod(time(), $periods{$period});
      return() if ($result == 0);
      if ($result == -1) {
        error("The period $period for action $action is malformed.");
      }
    }
  }

  $cmd = $actions{$action};
  $cmd =~ /^(\S+).*/;
  if (! -x $1) {
    error("$1 is not executable.");
    return();
  }

  $cmd =~ s/_MSG_/\'$msg\'/g;

  exec($cmd) if (fork() == 0);
}

sub caught_dump_sig {
  # Sets the dump stats flag.
  # Depends on to_syslog().

  $dump_stats = 1;
  to_syslog("Received signal to dump node statistics.");
}

sub dump_stats {
  # Dumps out current node stats to $options{stats_dump_file}.  Gets
  # statistics from %status and %in_a_row.
  # Depends on $options{stats_dump_file}, to_syslog(), %status,
  # %node_hist, $dump_stats, and the file "ctime.pl" from the perl library.

  my($node, $status);

  to_syslog("Dumping node statistics.");

  if (!open(FILE, ">$options{stats_dump_file}")) {
    error("Cannot dump statistics to $options{stats_dump_file}.");
    return();
  }

  foreach $node (keys %status) {
    if ($status{$node} == 1) {
      $status = "up";
    } elsif ($status{$node} == -1) {
      $status = "down";
    } else {
      $status = "unknown";
    }
    # localtime() will generate a syslog type date string when used in a
    # scalar context.
    printf( FILE "%s\t%s\t%s\n", $node, $status,
            scalar localtime($node_hist{$node}));
  }

  close(FILE);

  # Set $dump_stats back to zero.
  $dump_stats = 0;
}

sub shut_down {
  # Sends a message to the syslog and exits.

  to_syslog("Shutting down.");

  exit(1);
}

sub syslog_ready {
  # Make sure the syslog related options are set up.
  # Depends on the %options and %default_options elements named p_name,
  # p_type, m_type, and pid.

  my($p_name, $p_type, $m_type, $pid);

  if (!defined($options{p_name})) {
    $p_name = $default_options{p_name};
  } else {
    $p_name = $options{p_name};
  }

  if (!defined($options{p_type})) {
    $p_type = $default_options{p_type};
  } else {
    $p_type = $options{p_type};
  }

  if (!defined($options{m_type})) {
    $m_type = $default_options{m_type};
  } else {
    $m_type = $options{m_type};
  }

  if (!defined($options{pid})) {
    $pid = $default_options{pid};
  } else {
    $pid = $options{pid};
  }

  return ($p_name, $p_type, $m_type, $pid);
}

sub to_syslog {
  # Sends messages to syslog.  Takes two arguments: the message and the
  # type.  The second argument is optional.  If it isn't present, then
  # $options{m_type} is used.
  # Depends on $options{pid}, $options{p_name}, $options{p_type},
  # $options{m_type}, openlog(), syslog(), and closelog() (those three
  # from the Sys::Syslog namespace).

  my($msg, $msg_m_type) = @_;
  my($p_name, $p_type, $m_type, $pid) = syslog_ready();

  # Set up $msg_m_type.
  if (!defined($msg_m_type)) {
    $msg_m_type = $m_type;
  }

  # Display the process id in the syslog message if it is set to do so.
  if ($pid) {
    openlog($p_name, 'pid', $p_type);
  } else {
    openlog($p_name, '', $p_type);
  }

  syslog($msg_m_type, $msg);

  closelog();
}

sub error {
  # Dumps out errors.
  # Depends on $options{err_m_type} and to_syslog().

  my($msg) = $_[0];

  to_syslog($msg, $options{err_m_type});
}

#
##################################################

