I found a Nagios monitor that alerts when the PRIMARY of a MongoDB replica set changes – but how do you determine which server actually is the primary?
Well, this can be done on the command line…
root@server:/opt/mongodb/bin# ./mongo 192.168.1.1 --eval "printjson(rs.status())"
MongoDB shell version: 2.0.4
connecting to: 192.168.1.1/test
{
    "set" : "px_mongo",
    "date" : ISODate("2013-04-01T21:13:56Z"),
    "myState" : 2,
    "syncingTo" : "server3:27017",
    "members" : [
        {
            "_id" : 0,
            "name" : "server2:27017",
            "health" : 1,
            "state" : 2,
            "stateStr" : "SECONDARY",
            "uptime" : 405575,
            "optime" : { "t" : 1364846364000, "i" : 1 },
            "optimeDate" : ISODate("2013-04-01T19:59:24Z"),
            "lastHeartbeat" : ISODate("2013-04-01T21:13:54Z"),
            "pingMs" : 1
        },
        {
            "_id" : 1,
            "name" : "server1:27017",
            "health" : 1,
            "state" : 2,
            "stateStr" : "SECONDARY",
            "uptime" : 405590,
            "optime" : { "t" : 1364846364000, "i" : 1 },
            "optimeDate" : ISODate("2013-04-01T19:59:24Z"),
            "lastHeartbeat" : ISODate("2013-04-01T21:13:55Z"),
            "pingMs" : 1
        },
        {
            "_id" : 3,
            "name" : "server4:27017",
            "health" : 1,
            "state" : 2,
            "stateStr" : "SECONDARY",
            "optime" : { "t" : 1364846364000, "i" : 1 },
            "optimeDate" : ISODate("2013-04-01T19:59:24Z"),
            "self" : true
        },
        {
            "_id" : 5,
            "name" : "server3:27017",
            "health" : 1,
            "state" : 1,
            "stateStr" : "PRIMARY",
            "uptime" : 405576,
            "optime" : { "t" : 1364846364000, "i" : 1 },
            "optimeDate" : ISODate("2013-04-01T19:59:24Z"),
            "lastHeartbeat" : ISODate("2013-04-01T21:13:55Z"),
            "pingMs" : 1
        },
        {
            "_id" : 6,
            "name" : "server5:27017",
            "health" : 1,
            "state" : 2,
            "stateStr" : "SECONDARY",
            "uptime" : 280908,
            "optime" : { "t" : 1364846364000, "i" : 1 },
            "optimeDate" : ISODate("2013-04-01T19:59:24Z"),
            "lastHeartbeat" : ISODate("2013-04-01T21:13:54Z"),
            "pingMs" : 0
        }
    ],
    "ok" : 1
}
I added a message to the Nagios check that tells me the name of the current primary server when the primary HASN’T changed:
def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
    """Report whether the replica set's primary differs from the last one recorded.

    The primary name found in the server status is compared with the name
    returned by get_stored_primary_server_name() for the "nagios" database.
    If they differ, the new name is upserted into the last_primary_server
    collection and status 1 is passed to check_levels(); if they match, the
    status stays 0 and the message names the current primary. A replica set
    name that differs from `replicaset` is reported with status 2.

    `host` and `perf_data` are accepted but not used by this check.
    Returns whatever check_levels() returns for the computed status.
    """
    # With no thresholds given at all, warn at 1; any threshold that is still
    # unset (or zero) afterwards falls back to 2.
    if warning is None and critical is None:
        warning = 1
    warning = warning or 2
    critical = critical or 2

    status_code = 0
    summary = "Primary server has not changed"
    nagios_db = con["nagios"]
    repl_info = get_server_status(con)['repl']

    # Guard: refuse to compare primaries when this is not the requested set.
    found_set = repl_info.get('setName')
    if replicaset != found_set:
        summary = "Replica set requested: %s differs from the one found: %s" % (replicaset, found_set)
        status_code = 2
        return check_levels(status_code, warning, critical, summary)

    primary_now = repl_info.get('primary')
    primary_before = get_stored_primary_server_name(nagios_db)

    # A missing name is represented by the string "None" so that it can be
    # compared, stored and printed like any other server name.
    primary_now = "None" if primary_now is None else primary_now
    primary_before = "None" if primary_before is None else primary_before

    if primary_now != primary_before:
        # Remember the new primary so the next run compares against it.
        nagios_db.last_primary_server.update(
            {"_id": "last_primary"},
            {"$set": {"server": primary_now}},
            upsert=True,
            safe=True
        )
        summary = "Primary server has changed from %s to %s" % (primary_before, primary_now)
        status_code = 1

    # The two lines added for this post (1066-1067 in the plugin source):
    # when nothing changed, include the primary's name in the message.
    if primary_now == primary_before:
        summary = "Primary server has not changed and is %s" % (primary_now)

    return check_levels(status_code, warning, critical, summary)
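The two lines I added (1066 and 1067 in my copy of the plugin) are the final check that sets the “Primary server has not changed and is …” message, so the OK result now also returns the name of the primary server…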
—doug