loadall 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
#!/usr/bin/python
# Get a snapshot of the cluster load by means of the ps command, launched from an ssh session on each node, here using Python threads.
# See the sshcmd2node module for details.
# FIXME: the ssh command is fine if you are root on a server. A user-side tool should use a telnet service to avoid 'Too many logins' problems and the password prompt when someone does not have keys.
# created: Andrea Silva (08-04-2017)
# last edit: Andrea Silva (25-05-2017)
  7. from time import time
  8. import argparse
  9. import subprocess, sys, os, re
  10. from threading import Thread
  11. from re import sub
  12. sys.path.append("/home/150/scratch")
  13. from sshcmd2node import Node
  14. def print_progressbar(index, num) :
  15. sys.stderr.write('\r ['
  16. + '='*index
  17. + '>'*(1-int(index/num))
  18. + ' '*(num-index-1) + ']')
  19. sys.stderr.flush()
  20. # Long output header...
  21. psauxheader="USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND"
  22. ####################
  23. # COMMAND DEFINITION
  24. ####################
  25. cputhold='50'
  26. checkload="ps aux | awk '$3>"+cputhold+"{print}' | sort -rk 3"
  27. ## Main program
  28. desc="""Get cluster load by ssh connection and ps aux query (a calculation is defined as cpuload>"""+cputhold+"""%)
  29. If the nodes aren't in the ~/.ssh/know_hosts, you will be asked to continue
  30. (e.g. it's the first time you run this command)
  31. Use the -s flag to run in serial and answer ssh prompts"""
  32. ## Argument Parser definition
  33. parser = argparse.ArgumentParser(description=desc)
  34. # optional arguments
  35. parser.add_argument( '-n', nargs='+', dest='node', default=[],
  36. help='select one or more nodes by hostname (at least one)' )
  37. parser.add_argument( '-l', '--long', action='store_true', dest='long',
  38. help='be verbose: returns ps output for selected nodes' )
  39. parser.add_argument( '-p', '--disable-progbar', action='store_false', dest='progbar',
  40. help='disable the progression bar.' )
  41. parser.add_argument( '-s', '--serial', action='store_true', dest='serial',
  42. help="""Do the queries to nodes in serial rather than parallel (default).
  43. Hint: useful when connecting for the first time.
  44. Imply -p option.""" )
  45. # End arg parser definition
  46. args = parser.parse_args()
  47. if args.serial : args.progbar=False
  48. ### HOST LIST ###
  49. # Only edit here to add/remove/change hostlist
  50. Hosts = [
  51. ('abe', 'LCM1'),
  52. ('crash', 'LCM1'),
  53. ('duke', 'LCM1'),
  54. ('glados', 'LCM1'),
  55. ('lara', 'LCM1'),
  56. ('link', 'LCM1'),
  57. ('king', 'LCM1'),
  58. ('pang', 'LCM1'),
  59. ('pong', 'LCM1'),
  60. ('snake', 'LCM1'),
  61. ('sonic', 'LCM1'),
  62. ('spyro', 'LCM1'),
  63. ('yoshi', 'LCM1'),
  64. ('actarus', 'LCM2'),
  65. ('elwood', 'LCM2'),
  66. ('gex', 'LCM2'),
  67. ('gin', 'LCM2'),
  68. ('jake', 'LCM2'),
  69. ('kirk', 'LCM2'),
  70. ('martini', 'LCM2'),
  71. ('picard', 'LCM2'),
  72. ('q', 'LCM2'),
  73. ('raziel', 'LCM2'),
  74. ('sarek', 'LCM2'),
  75. ('spock', 'LCM2'),
  76. ('tron', 'LCM2'),
  77. ('worf', 'LCM2'),
  78. ('zombie', 'LCM2'),
  79. ]
  80. # Create nodes list according to options.
  81. ## All nodes
  82. nodes=[ Node(x[0],x[1], checkload) for x in Hosts ]
  83. ## Select given nodes
  84. if len(args.node) :
  85. nodes=[ x for x in nodes if x.hostname in args.node ]
  86. # Start time from here, when the threads are created
  87. start = time()
  88. # Start threads
  89. for i in nodes :
  90. i.start()
  91. if args.serial: i.join()
  92. # Get results: rejoin threads when their work is done
  93. num=len(nodes)
  94. index=0
  95. print ' Querying ' + str(num) + ' hosts...'
  96. for i in nodes:
  97. i.join()
  98. index += 1
  99. if args.progbar : print_progressbar(index, num)
  100. # New line after progress bar
  101. print '\n Done... (%(t).3f s)' % {'t': (time() - start)}
  102. for n in nodes :
  103. if not n.up :
  104. print "==>", n.hostname, "is not up"
  105. continue
  106. exitcode,output,error = n.cmdresult
  107. if exitcode == 0 :
  108. if len(output)>1 :
  109. print "==>", n.hostname, ": Running", len(output), "calculation(s)"
  110. # If long output is required, print all the matcher ps aux lines
  111. if args.long:
  112. print '\t',psauxheader
  113. for process in output:
  114. print '\t', process
  115. else :
  116. # Ssh "bug" can't log in twice on the same node. Should use Telnet instead
  117. if re.search("Too many logins", '\n'.join(output)) :
  118. print "==> Too many logins on", n.hostname
  119. else :
  120. print >> sys.stderr, "Query to host", n.hostname,"exited with", exitcode
  121. if args.long:
  122. print >> sys.stderr, "stderr:", error, '\n', \
  123. "stdout", '\n'.join(output)