runme

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Script to run one simulation.
Example command to run an ensemble using 'jobrun' via the runner module:
jobrun ./runme -r -- -o output/run -p ctl.nyears=5000
'''
import subprocess as subp 
import sys, os, argparse, shutil, glob, json

# Check whether the optional runner library is installed or not.
# `except Exception` (rather than a bare `except`) still covers a missing
# or improperly installed module, but no longer swallows KeyboardInterrupt
# or SystemExit raised while the import is in progress.
try:
    from runner.ext.namelist import param_summary, param_write_to_files
except Exception:
    runner_is_installed = False
else:
    runner_is_installed = True

class DictAction(argparse.Action):
    '''Parse a list of parameters ['key=val',...] into a dict.

    Each item is split at its first '=' sign, so the value part may itself
    contain '=' characters. Values are converted to the most specific type:
    int for whole numbers written without a '.', float for other numbers,
    and str for anything that is not a number.

    Raises ValueError if an item contains no '=' sign, and Exception if a
    value contains a comma (only one value per parameter is allowed).

    Adapted from: https://sumit-ghosh.com/articles/parsing-dictionary-key-value-pairs-kwargs-argparse-python/
    '''

    @staticmethod
    def _convert(valstr):
        '''Convert a value string to int, float or str (in that order of preference).'''
        try:
            number = float(valstr)
        except ValueError:
            # Not a number: keep the raw string
            return valstr
        # '5' and '1e3' become int, while '5.0' stays a float
        if number.is_integer() and '.' not in valstr:
            return int(number)
        return number

    def __call__(self, parser, namespace, values, option_string=None):
        params = dict()
        setattr(namespace, self.dest, params)
        for item in values:
            # Split at the first '=' only, so that values may contain '='
            key, sep, valstr = item.partition('=')
            if not sep:
                raise ValueError('Parameter must be given as KEY=VALUE: {}'.format(item))
            if ',' in valstr:
                raise Exception('Only one value allowed for each parameter using -p: {}={}'.format(key,valstr))
            params[key] = self._convert(valstr)

def _load_json(path):
    '''Read the json file at `path` and return the decoded object.

    The file is opened in a `with` block so that the handle is closed
    as soon as the content has been parsed.
    '''
    with open(path) as fobj:
        return json.load(fobj)


def main():
    '''Main subroutine to run one simulation.

    Steps, in order:
      1. Load the json configuration (hpc defaults, queue definitions and
         setup info) and parse the command-line arguments.
      2. Make the job: create the run directory (unless called by runner
         with -x), copy files, special directories, the executable and
         the parameter files, make links, apply parameter changes given
         via -p and/or by runner, and write run_info.txt.
      3. With -s write the job script; with -r additionally submit it
         (if -s was given) or run the executable in the background.

    Raises Exception if the config file is missing, if the queue options
    are incomplete or the queue alias is unknown, or if options -x/-p are
    used while the runner module cannot be imported.
    Exits with status 1 if the executable or the parameter file given on
    the command line does not exist.
    '''

    # Define configuration filenames
    config_file = "." + os.path.basename(__file__) + "_config"

    # The config file has to be available in the main directory.
    # If it is not, tell the user how to create it and stop.
    if not os.path.isfile(config_file):
        error_msg = ("Required json file containing hpc job defaults not found: {} \n".format(config_file) +
                     "This is probably the first time you are running this script. \n" +
                     "Copy the default file from the .runme config directory using the following command, \n" +
                     "check the settings and then try again: \n\n cp .runme/runme_config {} \n".format(config_file))

        raise Exception(error_msg)

    hpc_config = _load_json(config_file)

    # Load all queue information and extract the relevant one for the current hpc
    queues_all = _load_json(hpc_config["queues_file"])
    hpc_queues = queues_all[hpc_config["hpc"]]

    # Load configuration info for the current setup (relevant paths, links, program aliases, etc)
    info = _load_json(hpc_config["info_file"])

    # Does the executable take the parameter file as a command-line argument?
    par_path_as_arg = info["par_path_as_argument"] is True

    # Make exe_aliases string for printing help
    exe_aliases_str = "; ".join(["{}={}".format(key,val)
                        for key,val in info["exe_aliases"].items()])


    ### Manage command-line arguments ############################

    # Initialize argument parser
    parser = argparse.ArgumentParser()

    # Add options
    parser.add_argument('-e','--exe',type=str,default=info['exe_default'],
        help="Define the executable file to use here. " \
             "Shortcuts: " + exe_aliases_str)
    parser.add_argument('-r','--run',action="store_true",
        help='Run the executable after preparing the job?')
    parser.add_argument('-s','--submit',action="store_true",
        help='Run the executable after preparing the job by submitting to the queue?')
    parser.add_argument('-q','--queue', type=str, default=None,
        help='Alias of the queue the job should be submitted to.')
    parser.add_argument('-w','--wall', type=str, default=None,
        help='HPC wall time "hh:mm:ss" specification to allow for submitted jobs (overrides settings given by queue alias)')
    parser.add_argument('--part',type=str, metavar="PARTITION", default=None,
        help='HPC partition specification (overrides settings given by queue alias)')
    parser.add_argument('--qos',type=str, metavar="QOS", default=None,
        help='HPC quality of service specification (overrides settings given by queue alias)')
    parser.add_argument('--omp', type=int, metavar="OMP", default=hpc_config["omp"],
        help='Specify number of threads for running with OpenMP (default = 1 implies no parallel computation)')
    parser.add_argument('--email', type=str, default=hpc_config["email"],
        help='Email address to send job notifications from cluster (overrides config settings).')
    parser.add_argument('--account', type=str, default=hpc_config["account"],
        help='HPC account associated with job, often set for resource permissions (overrides config settings).')
    parser.add_argument('-x',action="store_true",
        help='This argument is used when the script called by jobrun')
    parser.add_argument('-v',action="store_true",help='Verbose script output?')

    # Note: DictAction converts numeric values to int/float, so the help
    # text states that (it previously claimed values stay strings).
    parser.add_argument("-p",metavar="KEY=VALUE",nargs='+',action=DictAction,
                        help="Set a number of key-value pairs "
                             "(do not put spaces before or after the = sign). "
                             "If a value contains spaces, you should define "
                             "it with double quotes: "
                             'foo="this is a sentence". Note that '
                             "numeric values are converted to int or float, "
                             "all other values are treated as strings.")

    # Required arguments:
    # Note: Given the `-p` argument above with multiple parameters possible,
    # `rundir` cannot be a positional argument. Thus it is specified as
    # required with the `-o` prefix:
    requiredNamed = parser.add_argument_group('required named arguments')
    requiredNamed.add_argument('-o',dest='rundir',metavar='RUNDIR',type=str,required=True,
         help='Path where simulation will run and store output.')

    if par_path_as_arg:
        requiredNamed.add_argument('-n',dest='par_path',metavar='PAR_PATH',type=str,required=True,
            help='Path to input parameter file/folder.')

    # Parse the arguments
    args = parser.parse_args()

    ### Manage user options and arguments ############################

    # Options
    exe_path    = args.exe       # Path relative to current working directory (cwd)
    run         = args.run
    submit      = args.submit
    wall        = args.wall
    queue       = args.queue
    qos         = args.qos
    partition   = args.part
    omp         = args.omp
    email       = args.email
    account     = args.account
    with_runner = args.x
    verbose     = args.v
    par         = args.p
    rundir      = args.rundir

    # The `-n` option only exists when the executable expects a parameter file
    par_path    = args.par_path if par_path_as_arg else None

    # Define some variables based on default values (not parameters for now)
    mail_type               = hpc_config["mail_type"]
    jobname                 = hpc_config["jobname"]
    path_jobscript_template = hpc_queues["job_template"]

    # Could maybe add this as a parameter choice:
    with_profiler = False

    # Copy the executable file to the output directory, or
    # call it from its compiled location?
    copy_exec = True

    # Note: 'submit' deliberately does not imply 'run'. One has to
    # explicitly also choose to run. That way, all directories can be
    # generated including job.submit scripts, and then jobs can be run
    # another way.

    if submit:
        # Make sure queue choices make sense

        if wall is None or qos is None or partition is None:
            if queue is None:
                error_msg = "At least the queue alias QUEUE, or all individual queue options " \
                            "(WALL,QOS,PARTITION) must be specified. See help (-h) for details."
                raise Exception(error_msg)

            # Report an unknown alias clearly instead of failing with a bare KeyError
            queues = hpc_queues["queues"]
            if queue not in queues:
                error_msg = "Unknown queue alias '{}'. Available aliases: {}".format(
                                queue, ", ".join(sorted(queues)))
                raise Exception(error_msg)

            # Populate queue options from alias as needed
            queue_opts = queues[queue]
            if qos is None: qos = queue_opts["qos"]
            if partition is None: partition = queue_opts["partition"]
            if wall is None: wall = queue_opts["wall"]

    # Additional options, consistency checks

    if with_runner and not runner_is_installed:
        error_msg = "The Python module 'runner' is not installed, or not installed properly. Do not use option -x. " \
                    "This option should only be used by the runner::jobrun script internally. " \
                    "To install runner, see: https://github.com/alex-robinson/runner"
        raise Exception(error_msg)

    if par is not None and not runner_is_installed:
        error_msg = "The Python module 'runner' is not installed, or not installed properly. Do not use option -p. " \
                    "To install runner, see: https://github.com/alex-robinson/runner"
        raise Exception(error_msg)

    # Expand executable path alias if defined, otherwise exe_path remains unchanged.
    exe_path = info["exe_aliases"].get(exe_path, exe_path)

    # Also extract executable filename
    exe_fname = os.path.basename(exe_path)

    if par_path_as_arg:
        # Executable assumed to take parameter file as command line argument
        # Define argument depending on whether running from rundir (copy_exec==True)
        # or from central directory.
        if copy_exec:
            exe_args = os.path.basename(par_path)
        else:
            exe_args = os.path.join(rundir,os.path.basename(par_path))
    else:
        # No arguments expected
        exe_args = ""

    # Define prefix part of command for running with a profiler (eg vtune)
    if with_profiler:
        # Use prefix for vtune profiler
        if copy_exec:
            profiler_prefix = "amplxe-cl -c hotspots -r {} -- ".format("./")
        else:
            profiler_prefix = "amplxe-cl -c hotspots -r {} -- ".format(rundir)
    else:
        profiler_prefix = ""

    # Make sure input file(s) exist. Exit with a non-zero status so that
    # a calling program (eg jobrun) can detect the failure.
    if not os.path.isfile(exe_path):
        print("Input file does not exist: {}".format(exe_path))
        sys.exit(1)

    if par_path_as_arg:
        # Make sure input parameter file exists
        if not os.path.isfile(par_path):
            print("Input file does not exist: {}".format(par_path))
            sys.exit(1)


    ### Start the script to make the job, and then run it ############################

    ##########################################
    # 1. Make the job (output directory,
    #    parameter files, additional
    #    data files/links)
    ##########################################

    # Using `jobrun` (runner), the rundir has already been created,
    # if not, it needs to be created here (removing existing nc and nml files):
    if not with_runner:
        makedirs(rundir,remove=True)

    # Copy files as needed
    copy_files(info["files"],rundir)

    # Copy special dir that need a different destination
    for src, dst in info["dir-special"].items():
        copy_dir(src,rundir,dst)

    ## Generate symbolic links to input data folders
    for link in info["links"]:
        make_link(link,rundir)

    # Copy exe file to rundir (even if running from cwd)
    shutil.copy(exe_path,rundir)

    ### PARAMETER FILES ###

    # Get list of parameter files to manage
    par_paths = list(info["par_paths"].values())

    if par_path_as_arg:
        par_paths.append(par_path)   # add included parameter file provided at command line too

    # Eliminate empty entries
    par_paths = [entry for entry in par_paths if entry and entry != "None"]

    # Copy the default parameter files to the rundir
    copy_files(par_paths,rundir)

    # Get list of new parameter file destinations
    # (the loop name differs from `par_path` so that value is not overwritten)
    par_paths_rundir = [os.path.join(rundir,os.path.basename(path_now))
                            for path_now in par_paths]

    # First modify parameters according to command-line values
    # "-p key=val key=val ..." argument.
    if par is not None:

        # Write parameters to files (in rundir!)
        param_write_to_files(par,par_paths_rundir,par_paths_rundir,info["grp_aliases"])

    # Next modify parameters according to command-line values specified
    # by runner (via `jobrun` "-p key=val [key=val ...]" argument)
    if with_runner:

        # Read param file runner.json always written by runner to rundir
        par_runner = _load_json(os.path.join(rundir, 'runner.json'))['params']

        # Summarize what is to be done
        param_summary(par_runner,rundir,verbose)

        # Write parameters to files (in rundir!)
        param_write_to_files(par_runner,par_paths_rundir,par_paths_rundir,info["grp_aliases"])

    ### DONE WITH PARAMETER FILES ###

    ## Write summary file to rundir

    run_info = {}
    run_info['Command'] = " ".join(sys.argv)
    run_info['Called via runner'] = with_runner

    # Write the current git revision information to output directory
    if os.path.isdir(".git"):
        run_info['git hash'] = get_git_revision_hash()
    else:
        run_info['git hash'] = "Not under git version control."

    with open(os.path.join(rundir,"run_info.txt"), 'w') as fobj:
        for key, val in run_info.items():
            fobj.write("{} : {}\n".format(key,val))

    ##########################################
    # 2. Run the job
    ##########################################

    # Determine where job will be running from
    if copy_exec:
        # Assume executable is running from rundir
        exe_rundir = "."
    else:
        # Assume executable will run from current working directory
        exe_rundir = os.getcwd()

    # Generate the appropriate executable command to run program
    executable = "{}{}/{} {}".format(profiler_prefix,exe_rundir,exe_fname,exe_args)

    if submit:

        # Write the job.submit script to rundir
        preparejob(path_jobscript_template,rundir,executable,qos,
                            wall,partition,account,omp,jobname,email,mail_type)

        if run:
            # Submit job to queue
            submitjob(rundir)

    elif run:
        # Run job in background
        runjob(rundir,executable,omp)

    return


######### Helper functions ############### 


def runjob(rundir,cmd,omp):
    '''Run a command in the background from within the run directory.

    rundir : directory to change into before starting the command
    cmd    : command string (executable plus arguments); it is passed to
             the shell as-is, so it has to be valid shell syntax already
    omp    : if > 0, OMP_NUM_THREADS is exported with this value and the
             command is started via `mpiexec -n omp`; otherwise the
             command is started directly with `exec`

    The standard output of the command is redirected to the file out.out
    (relative to rundir). Returns the bytes captured from the shell's own
    standard output. Exits with status 1 if the shell reports a failure.
    '''
    # Local import: shlex is needed by this function only.
    import shlex

    # Quote the directory so that paths with spaces or shell special
    # characters reach `cd` as one single argument.
    rundir_quoted = shlex.quote(rundir)

    if omp > 0:
        # && export OMP_THREAD_LIMIT=2"
        cmd_job = "cd {} && export OMP_NUM_THREADS={} && mpiexec -n {} {} > {} &".format(rundir_quoted,omp,omp,cmd,"out.out")
    else:
        cmd_job = "cd {} && exec {} > {} &".format(rundir_quoted,cmd,"out.out")

    print("Running job in background: {}".format(cmd_job))

    # Run the command (ie, change to output directory and start the job)
    # Note: the argument `shell=True` can be a security hazard, but should
    # be ok in this context, see https://docs.python.org/2/library/subprocess.html#frequently-used-arguments
    try:
        jobstatus = subp.check_output(cmd_job,shell=True,stdin=None,stderr=None)
    except subp.CalledProcessError as error:
        print(error)
        # Non-zero status, so that callers can detect the failure
        sys.exit(1)

    return jobstatus

def preparejob(path_template,rundir,cmd,qos,wall,partition,account,omp,jobname,email,mail_type):
    '''Prepare and write job script for submitting a job to a HPC queue.

    The script text is produced by generate_jobscript() from the template
    at `path_template` and the remaining arguments, and is written to the
    file job.submit inside `rundir`. Returns None.
    '''

    # Create the jobscript using current info
    nm_jobscript   = 'job.submit'
    path_jobscript = "{}/{}".format(rundir,nm_jobscript)

    # Generate a job submission script
    script = generate_jobscript(path_template,cmd,jobname,account,qos,wall,partition,omp,email,mail_type)

    # Write it to path; the `with` block makes sure the file is flushed
    # and closed before a later step tries to submit it.
    with open(path_jobscript,'w') as jobfile:
        jobfile.write(script)

    return

def submitjob(rundir):
    '''Submit the job script job.submit found in `rundir` using sbatch.

    sbatch is called with `rundir` as its working directory. Its output
    (stdout and stderr combined) is printed and returned as a stripped
    string. If sbatch fails, cannot be found, or `rundir` cannot be
    entered, the error is printed and the script exits with status 1.
    '''

    nm_jobscript = 'job.submit'

    # Call sbatch directly with an argument list and `cwd` (no shell
    # involved), so that a rundir containing spaces or shell special
    # characters is handled correctly.
    try:
        out = subp.check_output(["sbatch",nm_jobscript],cwd=rundir,stderr=subp.STDOUT)
    except subp.CalledProcessError as error:
        print(error)
        # Also show what sbatch itself reported, if anything
        if error.output:
            print(error.output.decode("utf-8",errors="replace").strip())
        sys.exit(1)
    except OSError as error:
        # sbatch is not available, or rundir does not exist
        print(error)
        sys.exit(1)

    jobstatus = out.decode("utf-8").strip()
    print(jobstatus)

    return jobstatus

def make_link(srcname,rundir,target=None):
    '''Make a symbolic link in the output dir.

    srcname : path of an existing symbolic link or directory
    rundir  : directory in which the new link is created
    target  : name of the new link relative to rundir. Defaults to
              `srcname`, or to its last path component if `srcname` is
              an absolute path.

    If `srcname` is itself a symbolic link, the new link points to the
    same place as `srcname` does; if it is a directory, the new link
    points to its absolute path. Otherwise only a warning is printed.
    An existing link at the destination is replaced.
    '''

    if target is None:
        if os.path.isabs(srcname):
            # Joining rundir with an absolute path would give back the
            # source itself, which would then be unlinked below.
            target = os.path.basename(srcname.rstrip(os.sep))
        else:
            target = srcname

    # Define destination in rundir
    dstname = os.path.join(rundir,target)
    if os.path.islink(dstname): os.unlink(dstname)

    # Make link from srcname to dstname
    if os.path.islink(srcname):
        linkto = os.readlink(srcname)
        if not os.path.isabs(linkto):
            # A relative link target is relative to the directory holding
            # srcname; anchor it there, since the new link lives elsewhere.
            srcdir = os.path.realpath(os.path.dirname(srcname))
            linkto = os.path.join(srcdir,linkto)
        os.symlink(linkto, dstname)
    elif os.path.isdir(srcname):
        srcpath = os.path.abspath(srcname)
        os.symlink(srcpath,dstname)
    else:
        print("Warning: path does not exist {}".format(srcname))

    return

def copy_dir(path,rundir,target):
    '''Copy a directory tree into the run directory.

    path   : source directory; nothing is done if it is empty, None
             or the string "None"
    rundir : run directory
    target : name of the copy relative to rundir; files are merged into
             it if it already exists
    '''

    # Treat None (json null) like the other "no entry" markers
    if path and path != "None":
        dst = os.path.join(rundir,target)
        shutil.copytree(path,dst,dirs_exist_ok=True)  # Only v3.8+

    return

def copy_file(path,rundir,target):
    '''Copy one file into the run directory under a given name.

    path   : source file; nothing is done if it is empty, None
             or the string "None"
    rundir : run directory
    target : destination name relative to rundir
    '''

    # Treat None (json null) like the other "no entry" markers
    if path and path != "None":
        dst = os.path.join(rundir,target)
        shutil.copy(path,dst)

    return

def copy_files(paths,rundir):
    '''Bulk copy file(s) to run directory.

    paths  : iterable of source file paths; entries that are empty, None
             or the string "None" are skipped
    rundir : run directory; each file keeps its own name
    '''

    for pnow in paths:
        # Treat None (json null) like the other "no entry" markers
        if pnow and pnow != "None":
            shutil.copy(pnow,rundir)

    return

def makedirs(dirname,remove):
    '''
    Create the directory `dirname` together with any missing parents.

    If creation fails because the directory is already there, this is
    reported and, when `remove` is true, the *.nc and *.nml files directly
    inside it are deleted. Any other creation error is passed on to the
    caller.
    '''

    try:
        os.makedirs(dirname)
        print('Directory created: {}'.format(dirname))
    except OSError:
        # Anything other than "the directory is already there" is a real error
        if not os.path.isdir(dirname):
            raise

        print('Directory already exists: {}'.format(dirname))

        if remove:
            # Clear out old output, netcdf files first, then namelists
            for pattern in ("{}/*.nc", "{}/*.nml"):
                for old_file in glob.glob(pattern.format(dirname)):
                    os.remove(old_file)

    return

def autofolder(params,outfldr0):
    '''Build an output folder name from a list of parameters.

    The result of calling short() on each parameter is joined with '.',
    appended to the base output dir `outfldr0`, and closed with a '/'.
    '''

    # One name part per parameter, separated by dots
    autofldr = '.'.join([p.short() for p in params])

    return outfldr0 + autofldr + '/'

def get_git_revision_hash():
    '''Return the output of `git rev-parse HEAD`, run in the current
    working directory, as a string without surrounding whitespace.'''
    raw = subp.check_output(['git', 'rev-parse', 'HEAD'])
    githash = raw.strip()
    return githash.decode("ascii")

def generate_jobscript(template,cmd,jobname,account,qos,wall,partition,omp,email,mail_type):
    '''Definition of the job script based on a template file and
    fields in < > that can be substituted.

    template  : path of the job script template. If omp > 0, the file
                `template` + "_omp" is read as well and used for the
                <OMPSECTION> field (after replacing <OMP> in it).
    cmd, jobname, account, qos, wall, partition, omp :
                values for the fields <CMD>, <JOBNAME>, <ACCOUNT>, <QOS>,
                <WALL>, <PARTITION> and <OMP>
    email     : address for the <EMAILSECTION> field; if empty or None
                the section is left empty
    mail_type : one mail type (string) or a list of them; one
                "#SBATCH --mail-type=" line is written for each

    Returns the job script as a string.
    '''

    # If omp has been set, generate a jobscript string with appropriate settings
    if omp > 0:
        # Use openmp settings: read in omp section template
        with open(template+"_omp",'r') as fobj:
            omp_script = fobj.read()

        # Now make substitutions to match argument choices
        omp_script = omp_script.replace('<OMP>',"{}".format(omp))

    else:
        # No openmp
        omp_script = ""

    # Email settings
    if email:
        # A single mail type given as a plain string counts as one entry
        # (iterating over the string would give one line per character)
        if isinstance(mail_type,str):
            mail_type = [mail_type]

        email_lines = ["#SBATCH --mail-user=" + email]
        email_lines.extend("#SBATCH --mail-type=" + mt for mt in (mail_type or []))
        email_script = "\n".join(email_lines)

    else:
        email_script = ""

    # Read in jobscript template
    with open(template,'r') as fobj:
        job_script = fobj.read()

    # Substitute in email section as needed to make full job script
    job_script = job_script.replace('<EMAILSECTION>',email_script)

    # Substitute in omp section as needed to make full job script
    job_script = job_script.replace('<OMPSECTION>',omp_script)

    # Make additional substitutions to match argument choices
    job_script = job_script.replace('<PARTITION>',partition)
    job_script = job_script.replace('<QOS>',qos)
    job_script = job_script.replace('<WALL>',"{}".format(wall))
    job_script = job_script.replace('<OMP>',"{}".format(omp))
    job_script = job_script.replace('<JOBNAME>',jobname)
    job_script = job_script.replace('<ACCOUNT>',account)
    job_script = job_script.replace('<CMD>',cmd)

    return job_script


if __name__ == "__main__":
    # Executed as a script (not imported): hand over to main()
    main()