require 'rcs-common/trace'
require_release 'rcs-db/db_layer'
require_release 'rcs-db/shard'
require_relative 'mailer'

module RCS
  module Monitor
    # Checks the mongoDB endpoints and the registered components, and sends a
    # mail alert when something starts failing or becomes healthy again.
    #
    # The last known state (:ok / :error) of every checked item is kept in an
    # in-memory cache, so an alert is produced only on a state transition.
    # The cache is dropped once it is older than CACHE_TIMEOUT; after that,
    # anything still failing is reported again.
    class ComponentChecker
      include RCS::Tracer

      attr_reader :mailer

      # Maximum age (in seconds) of the state cache before it is cleared.
      CACHE_TIMEOUT = 3600 * 3 # 3h

      # @raise [RuntimeError] if the configured CN (hostname) is missing or empty
      def initialize
        @mailer = Mailer.new
        reset_cache
        reset
        raise "Invalid cn: #{hostname}" if hostname.to_s.empty?
      end

      # Runs every check and mails the newly failed / newly restored items.
      #
      # The whole run is wrapped in #no_raise: errors are traced, not raised
      # (process-termination exceptions excepted).
      def check_all
        no_raise do
          reset

          check_local_mongo

          # NOTE(review): @failed only holds failures *newly detected* in this
          # run. An endpoint already cached as :error that is still down does
          # not show up here, so these guards pass even while it is failing.
          # Confirm whether that is intended.
          check_shards if @failed.empty?     # If there are no errors with mongoDB
          check_components if @failed.empty? # If there are no errors with mongoDB in any shard

          # The alerts are kept inline on purpose: the current binding (locals
          # `subject` / `short_message` and the instance variables) is handed
          # to the mailer — presumably for the mail template; verify against
          # Mailer#alert before moving this code.
          if @failed.any?
            subject = "Some errors were reported"
            short_message = subject + ": " + @failed.map { |hash| hash[:name] }.join(", ")
            mailer.alert(subject, :component_failed, binding: binding, short_message: short_message)
          end

          if @restored.any?
            subject = "Components restored"
            short_message = subject + ": " + @restored.map { |hash| hash[:name] }.join(", ")
            mailer.alert(subject, :component_restored, binding: binding, short_message: short_message)
          end
        end
      end

      private

      # Empties the state cache and records when that happened.
      def reset_cache
        @cache = {}
        @cache_at = Time.now
      end

      # Clears the per-run result lists and expires the state cache when it is
      # older than CACHE_TIMEOUT.
      def reset
        @failed = []
        @restored = []
        reset_cache if Time.now - @cache_at > CACHE_TIMEOUT
      end

      # Yields to the given block, tracing errors instead of propagating them.
      def no_raise
        yield
      rescue Moped::Errors::ConnectionFailure, Moped::Errors::QueryFailure => error
        trace(:error, "There was a problem querying mongoDB: [#{error.class}] #{error.message}")
      rescue SystemExit, SignalException
        # Never swallow a shutdown request (exit, Ctrl-C, kill): let it through.
        raise
      rescue Exception => error
        # Deliberately broad: a failed check run must not take the monitor down.
        # TODO: try to send a mail with the error
        trace(:fatal, "[#{error.class}] #{error.message} #{error.backtrace}")
      end

      # @return [String, nil] the 'CN' value of the global configuration (memoized)
      def hostname
        @hostname ||= RCS::DB::Config.instance.global['CN']
      end

      # Checks the three mongoDB endpoints on the configured hostname.
      def check_local_mongo
        check_mongo("mongoDB router", "#{hostname}:27017")
        check_mongo("mongoDB shard", "#{hostname}:27018")
        check_mongo("mongoDB config", "#{hostname}:27019")
      end

      # Checks ports 27017 and 27018 on the host of every known shard.
      def check_shards
        RCS::DB::Shard.sorted.each do |shard|
          address = shard['host'].split(':').first
          check_mongo(shard['_id'], "#{address}:27017")
          check_mongo(shard['_id'], "#{address}:27018")
        end
      end

      # Checks every ::Status document.
      def check_components
        ::Status.all.each { |status| check_component(status) }
      end

      # Opens a session on the 'admin' database of host:port and runs the
      # buildInfo command.
      #
      # @return [Object, nil] the command result; nil when the block passed to
      #   `open` was never run (presumably because the connection failed, as
      #   `raise: false` is requested — confirm against RCS::DB::DB#open)
      def check_connection(host, port)
        build_info = nil

        RCS::DB::DB.instance.open(host, port, 'admin', raise: false) do |session|
          build_info = session.command(buildInfo: 1)
        end

        build_info
      end

      # Checks one mongoDB endpoint and records a state transition, if any.
      #
      # @param name [String] label used in traces and alerts
      # @param address [String] endpoint in "host:port" form; also the cache key
      # @return [Symbol] the new cached state, :ok or :error
      def check_mongo(name, address)
        host, port = address.split(':')

        trace(:debug, "Checking mongoDB connection to #{address}")

        connection_available = check_connection(host, port)
        hash = {name: "#{name} @ #{address}"}

        if @cache[address] != :error && !connection_available
          # Not known as failing (or never seen) -> failing
          trace(:error, "#{name} @ #{host} has failed")
          @failed << hash.merge(message: "Unable to connect (attempted from #{hostname})")
        elsif @cache[address] == :error && connection_available
          # Known as failing -> healthy
          trace(:info, "#{name} @ #{host} is healthy again")
          @restored << hash
        end

        @cache[address] = connection_available ? :ok : :error
      end

      # Runs the check of one component and records a state transition, if any.
      #
      # @param status [Object] a ::Status item; must respond to check, error?,
      #   id, name, version, address, info and time
      # @return [Symbol] the new cached state, :ok or :error
      def check_component(status)
        trace(:debug, "Checking component #{status.name}")

        status.check

        hash = {
          name: " #{status.name} (ver. #{status.version}) @ #{status.address}",
          message: status.info,
          time: status.time
        }

        if @cache[status.id] != :error && status.error?
          trace(:error, "Component #{status.name} #{status.version} has failed: #{status.info}")
          @failed << hash
        elsif @cache[status.id] == :error && !status.error?
          trace(:info, "Component #{status.name} #{status.version} is healthy again")
          @restored << hash
        end

        @cache[status.id] = status.error? ? :error : :ok
      end
    end
  end
end