This website requires JavaScript.
dcd39d098fb004e0c143e9b75ec08d9e189dacd1
swift /bin /swift-recon
351 lines
11 KiB
Plaintext
2011年07月27日 10:41:07 -05:00
#! /usr/bin/env python
"""Command-line utility to perform cluster reconnaissance."""
from eventlet.green import urllib2
from swift.common.ring import Ring
import simplejson as json
2011年08月12日 15:01:28 -05:00
import sys
2011年07月27日 10:41:07 -05:00
import os
# TODO: filter by zone[s]
ring_file = "/etc/swift/object.ring.gz"
ring_data = Ring(ring_file)
ips = set((n['ip'], n['port']) for n in ring_data.devs)
def scout(base_url, recon_type):
    """Fetch one recon endpoint and decode its JSON body.

    :param base_url: recon base URL; ``recon_type`` is appended to it by
                     plain string concatenation
    :param recon_type: name of the recon check to request
    :returns: tuple of ``(url, content, status)``. On success ``content``
              is the decoded JSON body and ``status`` is 200. On an HTTP
              error ``content`` is the exception and ``status`` is its
              HTTP code. On any other URL error ``content`` is the
              exception and ``status`` is -1.
    """
    global VERBOSE, SUPPRESS_ERRORS
    url = base_url + recon_type
    try:
        body = urllib2.urlopen(url).read()
        content = json.loads(body)
        # NOTE(review): printed unconditionally, as in the visible source;
        # confirm whether this was meant to be gated on VERBOSE.
        # A single parenthesised argument prints the same text under the
        # Python 2 print statement.
        print("-> %s: %s" % (url, content))
        status = 200
    # HTTPError is a subclass of URLError, so it has to be caught first.
    except urllib2.HTTPError as e:
        if not SUPPRESS_ERRORS or VERBOSE:
            print("-> %s: %s" % (url, e))
        content = e
        status = e.code
    except urllib2.URLError as e:
        if not SUPPRESS_ERRORS or VERBOSE:
            print("-> %s: %s" % (url, e))
        content = e
        # No HTTP response was received, so there is no real code to report.
        status = -1
    return url, content, status
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "ringmd5")
return url, content, status
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "async")
return url, content, status
def scout_replication(host):
    """Query the "replication" recon check on a single host.

    :param host: indexable pair whose first item is the address and whose
                 second item is the port
    :returns: the (url, content, status) tuple produced by scout()
    """
    address = host[0]
    port = host[1]
    recon_root = "http://%s:%s/recon/" % (address, port)
    endpoint, payload, code = scout(recon_root, "replication")
    return endpoint, payload, code
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "load")
return url, content, status
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "diskusage")
return url, content, status
base_url = "http://%s:%s/recon/" % (host[0], host[1])
url, content, status = scout(base_url, "unmounted")
return url, content, status
2011年08月12日 15:01:28 -05:00
def scout_quarantine(host):
    """Query the "quarantined" recon check on a single host.

    :param host: indexable pair whose first item is the address and whose
                 second item is the port
    :returns: the (url, content, status) tuple produced by scout()
    """
    address = host[0]
    port = host[1]
    recon_root = "http://%s:%s/recon/" % (address, port)
    endpoint, payload, code = scout(recon_root, "quarantined")
    return endpoint, payload, code
2011年08月12日 16:29:13 -05:00
2011年07月27日 10:41:07 -05:00
def get_ringmd5(ringfile):
with open(ringfile, 'rb') as f:
ring_sum = md5sum.hexdigest()
pool = eventlet.GreenPool(20)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking ring md5sum's on %s hosts..." % (now, len(hosts))
print "-> On disk md5sum: %s" % ring_sum
for url, response, status in pool.imap(scout_md5, hosts):
#fixme - need to grab from config
stats[url] = response[ringfile]
if response[ringfile] != ring_sum:
print "!! %s (%s) doesn't match on disk md5sum" % \
(url, response[ringfile])
print "-> %s matches." % url
print "%s/%s hosts matched, %s error[s] while checking hosts." % \
(matches, len(hosts), errors)
pool = eventlet.GreenPool(20)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking async pendings on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_async, hosts):
stats[url] = response['async_pending']
low = min(stats.values())
high = max(stats.values())
total = sum(stats.values())
average = total / len(stats)
print "Async stats: low: %d, high: %d, avg: %d, total: %d" % (low,
2011年08月12日 15:01:28 -05:00
print "Error: No hosts available or returned valid information."
2011年07月27日 10:41:07 -05:00
print "=" * 79
pool = eventlet.GreenPool(20)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Getting unmounted drives from %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_umount, hosts):
print "Not mounted: %s on %s" % (stats[host], host)
pool = eventlet.GreenPool(20)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking replication times on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_replication, hosts):
stats[url] = response['object_replication_time']
low = min(stats.values())
high = max(stats.values())
total = sum(stats.values())
average = total / len(stats)
print "[Replication Times] shortest: %s, longest: %s, avg: %s" % \
2011年08月12日 15:01:28 -05:00
print "Error: No hosts available or returned valid information."
2011年07月27日 10:41:07 -05:00
print "=" * 79
pool = eventlet.GreenPool(20)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking load avg's on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_load, hosts):
load1[url] = response['1m']
load5[url] = response['5m']
load15[url] = response['15m']
stats = {"1m": load1, "5m": load5, "15m": load15}
low = min(stats[item].values())
high = max(stats[item].values())
total = sum(stats[item].values())
average = total / len(stats[item])
print "[%s load average] lowest: %s, highest: %s, avg: %s" % \
(item, low, high, average)
2011年08月12日 15:01:28 -05:00
print "Error: No hosts available or returned valid information."
2011年08月12日 16:29:13 -05:00
2011年08月12日 15:01:28 -05:00
def quarantine_check():
pool = eventlet.GreenPool(20)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking quarantine dirs on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_quarantine, hosts):
objq[url] = response['objects']
conq[url] = response['containers']
acctq[url] = response['accounts']
stats = {"objects": objq, "containers": conq, "accounts": acctq}
low = min(stats[item].values())
high = max(stats[item].values())
total = sum(stats[item].values())
average = total / len(stats[item])
2011年08月12日 16:29:13 -05:00
print "[Quarantined %s] low: %d, high: %d, avg: %d, total: %d" % \
2011年08月12日 15:01:28 -05:00
(item, low, high, average, total)
print "Error: No hosts available or returned valid information."
2011年07月27日 10:41:07 -05:00
print "=" * 79
pool = eventlet.GreenPool(20)
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print "[%s] Checking disk usage on %s hosts..." % (now, len(hosts))
for url, response, status in pool.imap(scout_du, hosts):
used = float(entry['used']) / float(entry['size']) * 100.0
hostusage.append(round(used, 2))
# TODO: per-host highs/lows are left for another day
average = total / len(stats[url])
for percent in stats[url]:
percents[percent] = percents.get(percent, 0) + 1
print "-> %s: Error. No drive info available." % url
average = sum(averages) / len(averages)
#distrib graph shamelessly stolen from https://github.com/gholt/tcod
print "Distribution Graph:"
mul = 69.0 / max(percents.values())
for percent in sorted(percents):
print '% 3d%% % 4d %s' % (percent, percents[percent], \
'*' * int(percents[percent] * mul))
print "Disk usage: lowest: %s%%, highest: %s%%, avg: %s%%" % \
2011年08月12日 15:01:28 -05:00
print "Error: No hosts available or returned valid information."
2011年07月27日 10:41:07 -05:00
print "=" * 79
global VERBOSE, SUPPRESS_ERRORS, swift_dir, pool
2011年08月12日 15:01:28 -05:00
usage: %prog [-v] [--suppress] [-a] [-r] [-u] [-d] [-l] [--objmd5]
2011年07月27日 10:41:07 -05:00
'''
args = optparse.OptionParser(usage)
args.add_option('--verbose', '-v', action="store_true",
help="Print verbose info")
args.add_option('--suppress', action="store_true",
help="Suppress most connection related errors")
args.add_option('--async', '-a', action="store_true",
args.add_option('--replication', '-r', action="store_true",
help="Get replication stats")
args.add_option('--unmounted', '-u', action="store_true",
help="Check cluster for unmounted devices")
args.add_option('--diskusage', '-d', action="store_true",
help="Get disk usage stats")
args.add_option('--loadstats', '-l', action="store_true",
help="Get cluster load average stats")
2011年08月12日 15:01:28 -05:00
args.add_option('--quarantined', '-q', action="store_true",
help="Get cluster quarantine stats")
2011年07月27日 10:41:07 -05:00
args.add_option('--objmd5', action="store_true",
help="Get md5sums of object.ring.gz and compare to local copy")
args.add_option('--swiftdir', default="/etc/swift",
help="Default = /etc/swift")
options, arguments = args.parse_args()
2011年08月12日 16:29:13 -05:00
2011年08月12日 15:01:28 -05:00
if len(sys.argv) <= 1:
2011年07月27日 10:41:07 -05:00
swift_dir = options.swiftdir
VERBOSE = options.verbose
SUPPRESS_ERRORS = options.suppress
get_ringmd5(os.path.join(swift_dir, 'object.ring.gz'))
2011年08月12日 15:01:28 -05:00
if options.quarantined:
2011年07月27日 10:41:07 -05:00
if __name__ == '__main__':
except KeyboardInterrupt: