summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Robert Newson <robert.newson@cloudant.com> 2013-11-11 16:04:08 +0000
committer: Robert Newson <robert.newson@cloudant.com> 2013-11-20 12:51:48 +0000
commit: 1c1b48e19d0eb36691d98f2194c19fe898cb55db (patch)
tree: 81ed80923d6f7034d7c1a06aeab93e25f29ce233
parent: 8b01c3af6ae6761b472547e43fb65d6a042bf080 (diff)
download: couchdb-1c1b48e19d0eb36691d98f2194c19fe898cb55db.tar.gz
Send sensu events for everything
-rw-r--r-- src/custodian/src/custodian_db_checker.erl 22
-rw-r--r-- src/custodian/src/custodian_server.erl 66
2 files changed, 48 insertions, 40 deletions
diff --git a/src/custodian/src/custodian_db_checker.erl b/src/custodian/src/custodian_db_checker.erl
index 02bd0d486..2dd770674 100644
--- a/src/custodian/src/custodian_db_checker.erl
+++ b/src/custodian/src/custodian_db_checker.erl
@@ -151,20 +151,16 @@ get_bacon_db() ->
send_missing_db_alert(DbName) ->
twig:log(notice, "Missing system database ~s", [DbName]),
Command = [
- "send_snmptrap",
- "--trap",
- "CLOUDANT-DBCORE-MIB::cloudantDbcoreMissingDbEvent",
- "-o",
- "'cloudantDbcoreDbName:STRING:" ++ binary_to_list(DbName) ++ "'"
- ],
- os:cmd(string:join(Command, " ")).
-
+ "send-sensu-event --standalone --critical",
+ " --output=\"Missing system database ",
+ binary_to_list(DbName),
+ "\" --handler=default custodian-missing-db-check"],
+ os:cmd(lists:concat(Command)).
clear_missing_dbs_alert() ->
twig:log(notice, "All system databases exist.", []),
Command = [
- "send_snmptrap",
- "--trap",
- "CLOUDANT-DBCORE-MIB::cloudantDbcoreAllDbsAvailableEvent"
- ],
- os:cmd(string:join(Command, " ")).
+ "send-sensu-event --standalone --ok",
+ " --output=\"All system databases exist\"",
+ " --handler=default custodian-missing-db-check"],
+ os:cmd(lists:concat(Command)).
diff --git a/src/custodian/src/custodian_server.erl b/src/custodian/src/custodian_server.erl
index 61885bb79..188d3713f 100644
--- a/src/custodian/src/custodian_server.erl
+++ b/src/custodian/src/custodian_server.erl
@@ -106,31 +106,43 @@ handle_db_event(_DbName, _Event, _St) ->
{ok, nil}.
check_shards() ->
- Summary = custodian:summary(),
- send_conflicted_alert(proplists:get_value(conflicted, Summary)),
- send_unavailable_alert(proplists:get_value(unavailable, Summary)),
- send_one_copy_alert(proplists:get_value(one_copy, Summary)),
- send_impaired_alert(proplists:get_value(impaired, Summary)).
-
-%% specific alert functions
-send_conflicted_alert(Count) ->
- send_snmp_alert(Count, "partition tables conflicted", "NoPartitionTablesConflictedEvent", "PartitionTablesConflictedEvent").
-
-send_impaired_alert(Count) ->
- send_snmp_alert(Count, "shards impaired", "AllShardsUnimpairedEvent", "ShardsImpairedEvent").
-
-send_unavailable_alert(Count) ->
- send_snmp_alert(Count, "unavailable shards", "AllShardsAvailableEvent", "ShardsUnavailableEvent").
-
-send_one_copy_alert(Count) ->
- send_snmp_alert(Count, "shards with only one copy", "AllShardsMultipleCopiesEvent", "ShardsOneCopyEvent").
-
-%% generic SNMP alert functions
-send_snmp_alert(undefined, AlertType, ClearMib, _) ->
- twig:log(notice, "No ~s in this cluster", [AlertType]),
- Cmd = lists:concat(["send_snmptrap --trap CLOUDANT-DBCORE-MIB::cloudantDbcore", ClearMib]),
- os:cmd(Cmd);
-send_snmp_alert(Count, AlertType, _, AlertMib) when is_integer(Count) ->
- twig:log(crit, "~B ~s in this cluster", [Count, AlertType]),
- Cmd = lists:concat(["send_snmptrap --trap CLOUDANT-DBCORE-MIB::cloudantDbcore", AlertMib," -o cloudantDbcoreShardCount:INTEGER:", Count]),
+ [send_sensu_event(Item) || Item <- custodian:summary()].
+
+send_sensu_event({_, Count} = Item) ->
+ if Count > 0 -> twig:log(crit, "~s", [describe(Item)]); true -> ok end,
+ Cmd = lists:concat(["send-sensu-event --standalone ",
+ level(Item),
+ " --output=\"",
+ describe(Item),
+ "\" ",
+ check_name(Item)]),
os:cmd(Cmd).
+
+level({_, 0}) ->
+ "--ok";
+level(_) ->
+ "--critical".
+
+describe({{safe, N}, Count}) ->
+ lists:concat([Count, " ", shards(Count), " in cluster with only ", N,
+ " ", copies(N), " on nodes that are currently up"]);
+describe({{live, N}, Count}) ->
+ lists:concat([Count, " ", shards(Count), " in cluster with only ",
+ N, " ", copies(N), " on nodes not in maintenance mode"]);
+describe({conflicted, Count}) ->
+ lists:concat([Count, " conflicted ", shards(Count), " in cluster"]).
+
+check_name({{Type, N}, _}) ->
+ lists:concat(["custodian-", N, "-", Type, "-shards-check"]);
+check_name({Type, _}) ->
+ lists:concat(["custodian-", Type, "-shards-check"]).
+
+shards(1) ->
+ "shard";
+shards(_) ->
+ "shards".
+
+copies(1) ->
+ "copy";
+copies(_) ->
+ "copies".