forked from PanDAWMS/pilot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
JobLog.py
1264 lines (1070 loc) · 59.9 KB
/
JobLog.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import os
import re
import commands
import traceback
from time import localtime
from glob import glob
from shutil import copy2, rmtree
import Mover as mover
from PilotErrors import PilotErrors
from pUtil import tolog, readpar, isLogfileCopied, isAnalysisJob, removeFiles, getFileGuid, PFCxml, createLockFile, \
getMetadata, returnLogMsg, removeLEDuplicates, getPilotlogFilename, remove, getExeErrors, updateJobState, \
makeJobReport, chdir, addSkippedToPFC, updateMetadata, getJobReport, filterJobReport, timeStamp, \
getPilotstderrFilename, safe_call, updateXMLWithSURLs, putMetadata, getCmtconfig, getExperiment, getSiteInformation, \
getGUID, timedCommand, updateXMLWithEndpoints
from FileHandling import addToOSTransferDictionary, getOSTransferDictionaryFilename, getOSTransferDictionary, \
getWorkDirSizeFilename, getDirSize, storeWorkDirSize
from JobState import JobState
from FileState import FileState
from FileStateClient import updateFileState, dumpFileStates
from JobRecovery import JobRecovery
from Configuration import Configuration
class JobLog:
"""
Methods for handling the job log (e.g. postJobLog, updatePandaServer)
"""
# private data members
__error = PilotErrors() # PilotErrors object
def __init__(self):
    """ Default initialization """
    # shared pilot configuration; accessed dict-style elsewhere in this class
    # (e.g. self.__env['pilot_initdir'], self.__env['jobrec'])
    self.__env = Configuration()
def getLogFileGuid(self, tarFileGuid, logFile, jobId, workdir):
    """ Return the proper log file guid.

    If for some reason the log file guid is not known (e.g. in a problematic
    lost job) the guid should not be generated by PFCxml later, but be
    extracted from metadata-<jobId>.xml so the registered guid stays
    consistent with what was already written there.

    :param tarFileGuid: guid currently known by the job object (may be "")
    :param logFile: log tarball file name
    :param jobId: PanDA job id (used to locate metadata-<jobId>.xml)
    :param workdir: directory containing the metadata file
    :return: the guid to use for the log file (possibly "")
    """
    _filename = os.path.join(workdir, "metadata-%s.xml" % (jobId))
    fileGuid = getFileGuid(_filename, logFile)
    if tarFileGuid != fileGuid:
        if fileGuid == "":
            # keep the known guid rather than clobbering it with an empty one
            tolog("!!WARNING!!1500!! Log file guid could not be found in %s" % (_filename))
        else:
            # fixed typo in warning message: "disprepancy" -> "discrepancy"
            tolog("!!WARNING!!1500!! Encountered a discrepancy between job.tarFileGuid (value: %s) and %s (value: %s)" %
                  (tarFileGuid, _filename, fileGuid))
            # the metadata file is considered authoritative
            tarFileGuid = fileGuid
    else:
        tolog("Log guid same as in metadata file")
    if tarFileGuid == "":
        tolog("!!WARNING!!1500!! Encountered an empty log file guid")
    else:
        tolog("Using log file guid: %s" % (tarFileGuid))
    return tarFileGuid
def copyLogFile(self, dest, workdir, logFile, newDirNM):
    """ Copy the log file to a specific directory (instead of transferring it to an SE).

    :param dest: destination directory, or the string "None" to skip the copy
    :param workdir: directory containing the log file
    :param logFile: log tarball file name
    :param newDirNM: tarball work directory; removed if it still exists
    :return: True if the copy succeeded, False otherwise
    """
    status = False
    if dest == "None":
        tolog("Log file will not be copied to neither SE nor any other directory")
    else:
        tolog("Log file will not be copied to SE, but to directory: %s" % (dest))
        try:
            copy2("%s/%s" % (workdir, logFile), dest)
        except Exception as e:
            # fixed message direction: the copy goes *to* dest, not from it
            tolog("!!WARNING!!1500!! Exception caught: Could not copy log file %s/%s to %s: %s" %
                  (workdir, logFile, dest, str(e)))
            status = False
        else:
            status = True
            tolog("Successfully copied log file to destination")
            # best-effort removal of the local copy; failure does not change status
            try:
                os.remove(logFile)
            except Exception as e:
                tolog("!!WARNING!!1500!! Exception caught: Could not remove %s: %s (ignore)" % (logFile, str(e)))
                # ignore, return status True anyway
    # clean up the tarball directory if it is still around
    if os.path.exists(newDirNM):
        self.removeTree(newDirNM)
    return status
def removeTree(self, _dir):
    """ Remove a non-empty directory """
    # safe_call wraps rmtree; a truthy return presumably means the removal
    # succeeded (failures are handled inside safe_call) - only log on success
    if safe_call(rmtree, _dir):
        tolog("Removed directory: %s" % (_dir))
def transferLogFile(self, job, site, experiment, dest=None, jr=False):
    """ Transfer the log file to storage.

    First performs an optional "special" transfer to an objectstore (when the
    experiment requests it), then always attempts the normal transfer to the
    primary SE. Returns (status, job); status is False only if the primary
    transfer failed - a failed special transfer is logged but ignored.
    NOTE(review): indentation restored from a whitespace-mangled source.
    """
    status = True
    # transfer log file to special log SE (CERN via xrdcp)
    # get the experiment object
    thisExperiment = getExperiment(experiment)
    if thisExperiment.doSpecialLogFileTransfer(eventService=job.eventService, putLogToOS=job.putLogToOS):
        tolog("Preparing for log file transfer to special SE")
        # get the site information object
        si = getSiteInformation(experiment)
        # first backup some schedconfig fields that need to be modified for the secondary transfer
        # NOTE: copytool_org is currently unused since the replaceQueuedataField
        # calls below are commented out; kept for when they are re-enabled
        copytool_org = readpar('copytool')
        # temporarily modify the schedconfig fields with values for the secondary SE
        tolog("Temporarily modifying queuedata for log file transfer to special SE")
        #ec = si.replaceQueuedataField("copytool", "objectstore")
        # do log transfer
        tolog("Attempting log file transfer to special SE")
        ret, job = self.transferActualLogFile(job, site, experiment, dest=dest, jr=jr, specialTransfer=True, copytool="objectstore")
        if not ret:
            tolog("!!WARNING!!1600!! Could not transfer log file to special SE")
            #status = False
        else:
            # Update the OS transfer dictionary
            # Get the OS name identifier and bucket endpoint
            os_bucket_id = job.logBucketID
            os_ddmendpoint = si.getObjectstoreDDMEndpointFromBucketID(os_bucket_id)
            # Add the transferred file to the OS transfer file
            addToOSTransferDictionary(job.logFile, self.__env['pilot_initdir'], os_bucket_id, os_ddmendpoint)
        # finally restore the modified schedconfig fields
        tolog("Restoring queuedata fields")
        #ec = si.replaceQueuedataField("copytool", copytool_org)
    else:
        tolog("Special log file transfer not required")
    # register/copy log file
    tolog("Attempting log file transfer to primary SE")
    ret, job = self.transferActualLogFile(job, site, experiment, dest=dest, jr=jr)
    if not ret:
        tolog("!!%s!!1600!! Could not transfer log file to primary SE" % (self.__env['errorLabel']))
        status = False
    return status, job
def transferActualLogFile(self, job, site, experiment, dest=None, jr=False, specialTransfer=False, copytool=None):
    """
    Save log tarball in DDM and register it to catalog, or copy it to 'dest'.
    the job recovery will use the current site info known by the current pilot

    Returns (status, job): status is False when the transfer/copy failed; the
    job object is updated in place (result code, pilotErrorDiag, logBucketID).
    NOTE(review): indentation restored from a whitespace-mangled source -
    verify block nesting against the original file before relying on it.
    """
    status = True
    pilotErrorDiag = ""
    N_filesNormalStageOut = 0
    N_filesAltStageOut = 0
    # without job recovery a failed transfer is final, hence the FAILED label
    if not self.__env['jobrec']:
        self.__env['errorLabel'] = "FAILED"
    # only check for performed log transfer for normal stage-out (not for any special transfers)
    if isLogfileCopied(site.workdir, job.jobId) and not specialTransfer:
        tolog("Log file already transferred")
        return status, job
    # only copy log file to dest dir if specified
    if dest:
        status = self.copyLogFile(dest, site.workdir, job.logFile, job.newDirNM)
        # update the current file state
        if status:
            updateFileState(job.logFile, site.workdir, job.jobId, mode="file_state", state="transferred")
        else:
            updateFileState(job.logFile, site.workdir, job.jobId, mode="file_state", state="not_transferred")
        dumpFileStates(site.workdir, job.jobId)
        return status, job
    # see if it's an analysis job or not
    analyJob = isAnalysisJob(job.trf.split(",")[0])
    # remove any lingering input files from the work dir
    if len(job.inFiles) > 0:
        ec = removeFiles(job.workdir, job.inFiles)
    # get the log file guid (if not set already)
    job.tarFileGuid = self.getLogFileGuid(job.tarFileGuid, job.logFile, job.jobId, site.workdir)
    # the cmtconfig is needed by at least the xrdcp site mover
    cmtconfig = getCmtconfig(job.cmtconfig)
    # create the xml needed for the registration if it doesn't exist already (for a secondary log transfer)
    WDTxml = "%s.xml" % (job.newDirNM)
    if not os.path.exists(WDTxml):
        guids_status = PFCxml(job.experiment, WDTxml, fntag="pfn", alog=job.logFile, alogguid=job.tarFileGuid, jr=jr)
    else:
        tolog("Log XML already exists: %s" % (WDTxml))
    # dataset name for the log: use the log dblock when set, otherwise a
    # date-based placeholder
    dblock = job.logDblock
    if dblock and dblock != 'NULL' and dblock != ' ':
        dsname = dblock
    else:
        dsname = "%s-%s-%s" % (localtime()[0:3]) # pass it a random name
    # rmflag == 1: remove tarball dir + xml after a successful transfer;
    # rmflag == 0: keep them on the worker node for recovery/debugging
    rmflag = 1
    ec = 0
    _state = ""
    _msg = ""
    latereg = False
    # determine the file path for special log transfers (can be overwritten in mover_put_data() in case of failure in transfer to primary OS)
    if specialTransfer:
        logPath, os_bucket_id = self.getLogPath(job.jobId, job.logFile, job.experiment)
        if logPath == "":
            tolog("!!WARNING!!4444!! Can not continue with special transfer since logPath is not set")
            return False, job
        tolog("Special log transfer: %s" % (logPath))
    else:
        logPath = ""
        os_bucket_id = -1
    try:
        rc, pilotErrorDiag, rf, rs, N_filesNormalStageOut, N_filesAltStageOut, os_bucket_id = mover.mover_put_data("xmlcatalog_file:%s" % (WDTxml),
                                                                                                                  dsname,
                                                                                                                  site.sitename,
                                                                                                                  site.computingElement,
                                                                                                                  analysisJob = analyJob,
                                                                                                                  testLevel = self.__env['testLevel'],
                                                                                                                  proxycheck = self.__env['proxycheckFlag'],
                                                                                                                  pinitdir = self.__env['pilot_initdir'],
                                                                                                                  datasetDict = None,
                                                                                                                  outputDir = self.__env['outputDir'],
                                                                                                                  stageoutTries = self.__env['stageoutretry'],
                                                                                                                  cmtconfig = cmtconfig,
                                                                                                                  recoveryWorkDir = site.workdir,
                                                                                                                  logPath = logPath,
                                                                                                                  os_bucket_id = os_bucket_id,
                                                                                                                  copytool=copytool,
                                                                                                                  job = job,
                                                                                                                  log_transfer = True # new sitemovers required integration parameter
                                                                                                                  )
    except Exception, e:
        rmflag = 0 # don't remove the tarball
        status = False
        import traceback
        if 'format_exc' in traceback.__all__:
            trace = traceback.format_exc()
            pilotErrorDiag = "Exception caught when saving the log tarball: %s, %s" % (str(e), trace)
        else:
            tolog("traceback.format_exc() not available in this python version")
            pilotErrorDiag = "Exception caught when saving the log tarball: %s" % (str(e))
        tolog("!!%s!!1500!! %s" % (self.__env['errorLabel'], pilotErrorDiag))
    else:
        tolog("mover_put_data finished with EC = %s" % str(rc))
        # update transfer numbers in case alt stage-out has been used
        if N_filesAltStageOut > 0:
            job.filesNormalStageOut += N_filesNormalStageOut # only reported to jobMetrics in case of alt stage-out
            job.filesAltStageOut += N_filesAltStageOut
            tolog("Updated stage-out numbers:")
            tolog("..filesNormalStageOut = %d" % (job.filesNormalStageOut))
            tolog(".....filesAltStageOut = %d" % (job.filesAltStageOut))
        if rc != 0:
            # remove any trailing "\r" or "\n" (there can be two of them)
            if rs != None:
                rs = rs.rstrip()
                tolog("Error string: %s" % (rs))
            # ignore failed OS log transfers (this might change if we only store logs in OS:s)
            if os_bucket_id != -1 and specialTransfer:
                tolog("Ignoring failed special log transfer to OS (resetting log bucket id)")
                os_bucket_id = -1
                rc = 0
            rmflag = 0 # don't remove the tarball
            job.result[0] = "holding"
            # is the job recoverable?
            if self.__error.isRecoverableErrorCode(rc):
                _state = "holding"
                _msg = "WARNING"
            else:
                _state = "failed"
                _msg = self.__env['errorLabel']
            # look for special error in the error string
            if rs == "Error: string Limit exceeded 250":
                tolog("!!%s!!3000!! Put error: file name string limit exceeded 250" % (_msg))
                ec = self.__error.ERR_LRCREGSTRSIZE
            else:
                ec = rc
        else:
            # create a weak lock file for the log transfer (but not for any special transfer, ie the log transfer to the special/secondary log area)
            if not specialTransfer:
                createLockFile(self.__env['jobrec'], site.workdir, lockfile="LOGFILECOPIED_%s" % job.jobId)
        # to which OS bucket id was the file transferred to?
        if os_bucket_id != -1:
            # get the site information object
            #si = getSiteInformation(experiment)
            job.logBucketID = os_bucket_id #si.getBucketID(os_id, "logs")
            tolog("Stored log bucket ID: %s" % (job.logBucketID))
    # set the error code for the log transfer only if there was no previous error (e.g. from the get-operation)
    if job.result[2] == 0:
        job.result[2] = ec
        job.pilotErrorDiag = pilotErrorDiag
    else:
        # there was a previous error
        if ec != 0:
            # is the new log transfer error of the same type as the earlier error?
            if ec == job.result[2]:
                tolog("!!WARNING!!1105!! Previous error same as new error: %d" % (ec))
            else:
                tolog("!!WARNING!!1105!! Previous error (%d) will not be overwritten by the new error (%d)" % (job.result[2], ec))
        # ignore holding state for log transfer if previous earlier error was a get error
        if job.result[0] == "holding" and not self.__error.isRecoverableErrorCode(job.result[2]):
            tolog("!!WARNING!!1105!! Resetting HOLDING to FAILED since the previous error is not recoverable")
            job.result[0] = "failed"
    # in case the log file could not be registered, store the relevant info in the job state file
    # NOTE(review): latereg is never set True in this method - dead branch?
    if latereg:
        job.log_latereg = "True"
        job.log_field = rf
    else:
        job.log_latereg = "False"
        job.log_field = None
    # tarball is saved to DDM successfully, so remove everything except the log file which might
    # still be needed (for creating metadata for failed jobs)
    if rmflag == 1:
        if os.path.isdir(job.newDirNM):
            self.removeTree(job.newDirNM)
        try:
            os.remove(WDTxml)
        except Exception, e:
            tolog("!!WARNING!!1500!! Could not remove %s: %s" % (WDTxml, str(e)))
            #status = False
        else:
            tolog("%s removed" % (WDTxml))
    elif rmflag == 0: # something bad happened during put, save the tarball on worker node for further debugging
        if job.result[0] == 'holding':
            tolog("Will leave log file %s for later recovery" % (job.logFile))
            status = False
            if os.path.isdir(job.newDirNM):
                self.removeTree(job.newDirNM)
        elif os.path.isdir(job.workdir) and (not job.logFile or job.logFile == ''):
            try:
                rmtree(job.workdir)
            except Exception, e:
                tolog("!!WARNING!!1500!! Could not remove %s: %s" % (job.workdir, str(e)))
                pass
    # do not overwrite any existing pilotErrorDiag (from a get operation e.g.)
    if job.pilotErrorDiag != "" and job.pilotErrorDiag != None:
        if pilotErrorDiag != "" and pilotErrorDiag != None:
            # add pilotErrorDiag to the end of the existing string but do not add the log put error identifier to save space
            job.pilotErrorDiag += "|Log put error: " + pilotErrorDiag
    else:
        if pilotErrorDiag != "" and pilotErrorDiag != None:
            job.pilotErrorDiag = "Log put error: " + pilotErrorDiag
    return status, job
def buildLogExtracts(self, job, workdir, analyJob):
    """ Build the bulk of the log extracts.

    Collects, in order: the panda tracer log (analysis jobs only), any special
    payload log messages, recent pilotlog warnings/errors, and per-trf payload
    stdout extracts (job report or grep-based fallback). Duplicates are removed
    at the end. Returns the combined extract string.
    NOTE(review): indentation restored from a whitespace-mangled source.
    """
    error = PilotErrors()
    tolog("Building log extracts..")
    logMsg = ''
    # look for the pandatracerlog.txt file, produced if the user payload attempted any outgoing connections
    tracerlog = os.path.join(job.workdir, "pandatracerlog.txt")
    if analyJob:
        if os.path.exists(tracerlog):
            # only add if file is not empty
            if os.path.getsize(tracerlog) > 0:
                msg = "!!WARNING!!1010!! PandaID=%s had outbound connections" % (job.jobId)
                tolog(msg)
                logMsg += msg
                try:
                    f = open(tracerlog, "r")
                except Exception, e:
                    tolog("!!WARNING!!1010!! Failed to open log file: %s, %s" % (tracerlog, e))
                else:
                    logMsg += f.read()
                    f.close()
            else:
                tolog("Panda tracer log has zero size (no outbound connections detected)")
        else:
            tolog("Panda tracer log does not exist: %s (ignoring)" % (tracerlog))
    # are there any special log messages from the subprocess/payload?
    for thisf in job.logMsgFiles:
        logMsg += returnLogMsg(logf=thisf) + "\n"
    # grep for !!FAILED/WARNING!!NR!! messages in pilotlog.txt
    ret = commands.getoutput('grep -e "\!\![A-Z]\+\!\![0-9]\+\!\!" %s | tail -20' % (getPilotlogFilename()))
    if ret != "":
        logMsg += "- %s -\n" % os.path.basename(getPilotlogFilename())
        logMsg += ret + "\n"
    # is this a multi-trf job? (job parameters are newline separated per trf)
    nJobs = job.jobPars.count("\n") + 1
    # loop over all payload stdout files
    for _i in range(nJobs):
        _stdout = job.stdout
        if nJobs > 1:
            _stdout = _stdout.replace(".txt", "_%d.txt" % (_i + 1))
        fname = os.path.join(workdir, _stdout)
        if os.path.isfile(fname):
            # use the job reports for production jobs
            if job.payload == "athena" and not analyJob:
                # leave a filtered extracts from the first subjobs only
                if _i < nJobs - 1:
                    # only get the error summary for
                    jobReport = filterJobReport(getJobReport(fname))
                else:
                    # get the full job report for the last trf
                    jobReport = getJobReport(fname)
            else:
                jobReport = ""
            if jobReport != "":
                logMsg += jobReport
            else:
                # old style log extracts
                logMsg += '\n\n- Errors from %s (no jobReport) -\n' % (_stdout)
                logMsg += commands.getoutput('grep -i error %s | tail -20' % (fname))
                tmp = commands.getoutput('grep -i \"Running %s failed\" %s | tail -20' % (job.payload, fname))
                if len(tmp) > 0:
                    logMsg += '\n\n- %s errors from %s -\n' % (job.payload, _stdout)
                    logMsg += tmp
                if job.payload == "athena":
                    # first/last processed event markers from the athena event loop
                    evts = commands.getoutput('grep AthenaEventLoopMgr %s | grep end' % (fname))
                    evtslist = evts.split("\n")
                    if len(evtslist) > 1:
                        logMsg += '\n\n- First event -\n'
                        logMsg += evtslist[0]
                        logMsg += '\n\n- Last event -\n'
                        logMsg += evtslist[-1]
            # if payload stdout file is too big (ec 1106), remove the file at this point
            if job.result[2] == error.ERR_STDOUTTOOBIG:
                try:
                    os.remove(fname)
                except Exception, e:
                    tolog("!!WARNING!!1999!! Failed to remove file %s: %s" % (fname, str(e)))
                else:
                    tolog("Too large payload stdout file has been removed")
        else:
            logMsg += "\n(%s/%s does not exist)" % (workdir, _stdout)
    # remove duplicated warning/error messages
    logMsg = removeLEDuplicates(logMsg)
    return logMsg
def getXMLAndWorkdir(self, jr, siteWorkdir, jobWorkdir, newDirNM, jobId):
    """ Get the metadata and the relevant workdir """
    if jr:
        # lost-but-recoverable job: its workdir was already renamed to
        # newDirNM, and the metadata lives in the site workdir
        workdir = newDirNM
        tolog("Post job task (job recovery mode) using dir: %s" % (workdir))
        xmlDir = siteWorkdir
    else:
        # normal mode: preliminary metadata only (log file size and checksum
        # are not yet filled in)
        workdir = jobWorkdir
        tolog("Post job task (normal mode) using dir: %s" % (workdir))
        xmlDir = workdir
    return getMetadata(xmlDir, jobId), workdir
def isAnalyJob(self, sitename):
    """ Determine if the job is a user analysis job using the site name """
    # analysis queues carry the ANALY substring in their site name
    return "ANALY" in sitename
def removeCoreDumps(self, siteWorkdir, workdir):
    """ Remove any remaining core dumps so they do not end up in the log tarball """
    foundCoreDump = False
    # collect both core.<pid> style and plain 'core' files from the pilot
    # and job work dirs (same order as: site core.*, job core.*, site core, job core)
    coreDumps = []
    for pattern in ("core.*", "core"):
        for directory in (siteWorkdir, workdir):
            coreDumps += glob("%s/%s" % (directory, pattern))
    for coreDump in coreDumps:
        tolog("Trying to remove core dump: %s" % str(coreDump))
        if not remove([coreDump]):
            tolog("!!WARNING!!1600!! Failed to remove core dump")
        else:
            tolog("Core dump removed")
            foundCoreDump = True
    return foundCoreDump
def removeSoftLink(self, jobPars, stdout, siteWorkdir):
    """ Remove the soft link to the payload stdout """
    # multi-trf jobs have one stdout per sub-job (job parameters are
    # newline separated, one line per trf)
    nJobs = jobPars.count("\n") + 1
    for trfNr in range(nJobs):
        linkName = stdout if nJobs == 1 else stdout.replace(".txt", "_%d.txt" % (trfNr + 1))
        lnfilename = os.path.join(siteWorkdir, linkName)
        if not os.path.exists(lnfilename):
            continue
        try:
            os.remove(lnfilename)
        except Exception as e:
            tolog("Failed to remove soft link %s: %s" % (lnfilename, str(e)))
        else:
            tolog("Removed soft link: %s" % (lnfilename))
def removeUnwantedFiles(self, workdir, inFiles, outFiles):
    """ Remove unwanted files from work dir prior to tarball creation """
    tolog("Removing unwanted files prior to job log creation")
    # clear out any lingering input/output files still present in the work dir
    for fileList in (inFiles, outFiles):
        if len(fileList) > 0:
            ec = removeFiles(workdir, fileList)
    # a leftover athena workDir would bloat the tarball - delete it
    userWorkDir = os.path.join(workdir, 'workDir')
    if os.path.exists(userWorkDir):
        tolog("Removing user workDir prior to tarball creation")
        try:
            rmtree(userWorkDir)
        except Exception as e:
            tolog("Failed to remove workDir: %s" % str(e))
def addWantedFiles(self, jobWorkdir, siteWorkdir, jobId, outputFilesXML):
    """ Add wanted files to work dir prior to tarball creation """
    skippedName = os.path.join(jobWorkdir, "skipped.xml")
    updatedName = os.path.join(jobWorkdir, "metadata-%s.xml" % (jobId))
    if not os.path.exists(skippedName):
        tolog("No skipped input files (non DBRelease)")
    else:
        # merge skipped input file info into the metadata
        ec = addSkippedToPFC(updatedName, skippedName)
        # keep a copy in the site dir so it can be reached in
        # updatePandaServer after log creation if necessary
        try:
            copy2(skippedName, siteWorkdir)
        except Exception as e:
            tolog("!!WARNING!!1600!! Exception caught: Could not copy skipped metadata file to site work dir: %s" % str(e))
        else:
            tolog("Successfully copied skipped metadata file to site work dir")
    # special NG/CERNVM metadata file, copied to the site dir so it can be
    # reached after the log has been created
    specialName = os.path.join(jobWorkdir, outputFilesXML)
    if os.path.exists(specialName):
        try:
            copy2(specialName, siteWorkdir)
        except Exception as e:
            tolog("!!WARNING!!1600!! Exception caught: Could not copy NG/CERNVM metadata file to site work dir: %s" % str(e))
        else:
            tolog("Successfully copied NG/CERNVM metadata file to site work dir: %s" % (siteWorkdir))
def createMetadataForOutput(self, workdir, filename, jobId, newDirNM, outputFilesXML):
    """ Create the final metadata with file size and checksum of the log tarball.

    Updates metadata-<jobId>.xml (falling back to a copy from the job work dir
    when it is missing from the site dir), and additionally the NG/CERNVM
    OutputFiles.xml when present. Returns the updated metadata XML string,
    or "" if it could not be produced.
    NOTE(review): indentation restored from a whitespace-mangled source.
    """
    # add metadata about log file to metadata.xml
    from SiteMover import SiteMover
    from SiteMoverFarm import getSiteMover
    sitemover = getSiteMover(readpar('copytool'), "")
    _date = "None"
    strXML = ""
    tolog("Preparing to create metadata for output files")
    # get the file info for the log file and, if needed, for the CERNVM outputFilesXML file
    ec, pilotErrorDiag, _fsize, _checksum = \
        SiteMover.getLocalFileInfo(os.path.join(workdir, filename), csumtype=sitemover.getChecksumCommand(), date=_date)
    if ec != 0:
        tolog("!!WARNING!!2995!! Failed while trying to get the log file info: %d" % (ec))
    tolog("fsize=%s" % (_fsize))
    tolog("checksum=%s" % (_checksum))
    # additional file info for the job state file (used by CERNVM)
    JS = JobState()
    _filename = JS.getFilename(workdir, jobId)
    if os.path.exists(_filename):
        ec, pilotErrorDiag, _fsizeAdditional, _checksumAdditional = \
            SiteMover.getLocalFileInfo(_filename, csumtype=sitemover.getChecksumCommand(), date=_date)
        if ec != 0:
            tolog("!!WARNING!!2995!! Failed while trying to get the additional file (%s) info: %d" % (os.path.basename(_filename), ec))
            _fsizeAdditional = None
            _checksumAdditional = None
    else:
        _fsizeAdditional = None
        _checksumAdditional = None
    fname = "%s/metadata-%s.xml" % (workdir, jobId)
    if os.path.exists(fname):
        tolog("Found metadata in site dir: %s" % (workdir))
    else:
        # backup solution in case metadata has not already been copied into the site work dir
        tolog("Metadata not found in site work dir, looking for it in job work dir instead..")
        _fname = "%s/metadata-%s.xml" % (newDirNM, jobId)
        if os.path.exists(_fname):
            tolog("Found metadata in job work dir: %s" % (newDirNM))
            try:
                copy2(_fname, workdir)
            except Exception, e:
                tolog("!!WARNING!!2999!! Failed to copy metadata file from job work dir to site work dir: %s" % str(e))
            else:
                tolog("Successfully copied metadata from job work dir to site work dir")
        else:
            tolog("!!WARNING!! Metadata not found in job work dir either: %s" % (fname))
    # try to read the metadata from the site work dir
    if os.path.exists(fname):
        ec, _strXML = updateMetadata(fname, _fsize, _checksum)
        if ec == 0:
            tolog("Added (%s, %s) to metadata file: %s" % (_fsize, _checksum, fname))
            if len(_strXML) != 0:
                # replace preliminary XML
                strXML = _strXML
                # strXML now contains all the xml for all output files and log
            else:
                tolog("!!WARNING!!1601!! updateMetadata() did not return any xml")
        else:
            tolog("!!WARNING!!1600!! Failed to add metadata: %d" % (ec))
    else:
        tolog("!!WARNING!!2999!! Failed to find metadata file, expect job to eventually fail with ddm: Adder._updateOutputs() could not get GUID/LFN/MD5/FSIZE")
    # add the metadata about log file to special NG/CERNVM file
    # NOTE: fname is reused here for a different file
    fname = os.path.join(workdir, outputFilesXML)
    if os.path.exists(fname):
        # add checksum and file size of log file to the metadata file (OutputFiles.xml) and then transfer it
        ec, _strXML = updateMetadata(fname, _fsize, _checksum, format='NG', fsizeAdditional=_fsizeAdditional, checksumAdditional=_checksumAdditional)
        if ec == 0:
            tolog("Added (%s, %s) to metadata file: %s" % (_fsize, _checksum, fname))
            if _fsizeAdditional and _checksumAdditional:
                tolog("Added (%s, %s) to metadata file: %s" % (_fsizeAdditional, _checksumAdditional, fname))
            # OutputFiles.xml now contains all the xml for all output files and log (and additional file info for CERNVM)
            # copy it to the init dir (only necessary for NG not for CERNVM)
            # (actually it can be transferred with the mv site mover just like it is done for CERNVM, skip for now)
            if os.environ.has_key('Nordugrid_pilot'):
                try:
                    copy2(fname, self.__env['pilot_initdir'])
                except Exception, e:
                    tolog("!!WARNING!!1600!! Exception caught: Could not copy NG metadata file to init dir: %s" % str(e))
                else:
                    tolog("Successfully copied NG metadata file to pilot init dir: %s" % (self.__env['pilot_initdir']))
        else:
            tolog("updateMetadata returned: %d" % (ec))
    return strXML
def addTimingInfo(self, logMsg, timeGetJob, timeStageIn, timeExe, timeStageOut, timeCleanUp):
    """ Add timing info to log message """
    # timeGetJob is set in pilot.py; timeStageIn/timeExe/timeStageOut in
    # runJob.py (timeStageOut also in moveLostOutputFiles() for recovered jobs)
    walltimes = '\n\n- Walltime -\n'
    walltimes += 'JobRetrival=%s, StageIn=%s, Execution=%s, StageOut=%s, CleanUp=%s\n' % \
                 (timeGetJob, timeStageIn, timeExe, timeStageOut, timeCleanUp)
    # the walltime summary must always survive: truncate the extracts when
    # the combined message would exceed 2048 characters
    limit = 2048 - len(walltimes)
    if len(logMsg) >= limit:
        return logMsg[:limit] + walltimes
    return logMsg + walltimes
def transferLogExtracts(self, logMsg):
    """ Write and transfer log extracts to pilot init dir for Nordugrid """
    fname = "log_extracts.txt"
    # write the extracts locally; on failure there is nothing to transfer
    try:
        extractsFile = open(fname, 'w')
        extractsFile.write(logMsg)
        extractsFile.close()
    except Exception as e:
        tolog("Failed to write log extracts to file: %s" % str(e))
        return
    # copy the file where the NG harvester picks it up
    try:
        copy2(fname, self.__env['pilot_initdir'])
    except Exception as e:
        tolog("!!WARNING!!1600!! Exception caught: Could not copy log extracts file to init dir for NG: %s" % str(e))
    else:
        tolog("Successfully copied log extracts file to pilot init dir for NG: %s" % (self.__env['pilot_initdir']))
def transferAdditionalCERNVMFiles(self, job, site, experiment):
    """ Transfer additional files for CERNVM """
    fname = os.path.join(site.workdir, job.outputFilesXML)
    if not os.path.exists(fname):
        return
    # first the OutputFiles.xml, then the job state file; abort on failure
    ret, job = self.transferAdditionalFile(job, site, experiment, fname)
    if not ret:
        tolog("!!WARNING!!2994!! Additional CERNVM transfer failed: %s" % (job.pilotErrorDiag))
        return
    JS = JobState()
    ret, job = self.transferAdditionalFile(job, site, experiment, JS.getFilename(site.workdir, job.jobId))
    if not ret:
        tolog("!!WARNING!!2994!! Additional CERNVM transfer failed: %s" % (job.pilotErrorDiag))
    else:
        tolog("Transferred additional CERNVM files")
def postJobTask(self, job, site, experiment, workerNode, jr=False, ra=0, stdout_tail=None, stdout_path=None):
"""
Update Panda server with output info (xml) and make/save the tarball of the job workdir,
only for finished or failed jobs.
jr = job recovery
ra = recovery attempt
"""
tc_0 = os.times()
transferAdditional = False
# get the metadata and the relevant workdir
strXML, workdir = self.getXMLAndWorkdir(jr, site.workdir, job.workdir, job.newDirNM, job.jobId)
# set any holding job to failed for sites that do not use job recovery (e.g. sites with LSF, that immediately
# removes any work directory after the LSF job finishes which of course makes job recovery impossible)
if not self.__env['jobrec']:
if job.result[0] == 'holding':
job.result[0] = 'failed'
tolog("This site does not support job recovery: HOLDING state reset to FAILED")
# is it a user analysis job?
analyJob = self.isAnalyJob(site.sitename)
# build log extracts
logMsg = self.buildLogExtracts(job, workdir, analyJob)
# get the experiment object
thisExperiment = getExperiment(experiment)
# remove known redundant files and directories
thisExperiment.removeRedundantFiles(workdir)
# remove the soft link to the payload stdout
self.removeSoftLink(job.jobPars, job.stdout, site.workdir)
# make the job workdir tarball
chdir(site.workdir) # into pilot workdir, one level above job workdir
# remove the core dump file first, since it's considered as useless
foundCoreDump = self.removeCoreDumps(site.workdir, workdir)
# tar the workdir using the Panda jobId index and move it to the log dir
if os.path.isdir(workdir) and job.logFile and job.logFile != '':
# dump all directories for a failed job to the log
if job.result[0] == "failed":
cmd = 'ls -altrR %s' % workdir
tolog("%s: %s" % (cmd + '\n', commands.getoutput(cmd)))
# use the jobInfo.xml to get the trf errors (unless they were read and set already)
# (running a testEvgen job will not produce any exeError messages)
if job.exeErrorCode == 0 and job.exeErrorDiag == "":
try:
job.exeErrorCode, job.exeErrorDiag = getExeErrors(workdir, "jobInfo.xml")
except Exception, e:
tolog("!!WARNING!!1600!! Could not get the exeErrors: %s" % str(e))
job.exeErrorCode, job.exeErrorDiag = 0, ""
else:
tolog("Skipping old style trf error XML file (jobInfo.xml) since TRF errors are already set")
if not jr:
# Make the job summary report
makeJobReport(job, logMsg, foundCoreDump, self.__env['version'], self.__env['jobIds'])
# overwrite any pilotErrorDiag at this point with exeErrorDiag if set
# (for the job page error info)
if job.exeErrorDiag != "" and job.exeErrorDiag != "OK":
# this is probably useless since pilotErrorDiag might be overwritten again later
tolog("Overwriting pilotErrorDiag (\'%s\') with exeErrorDiag (\'%s\')" % (job.pilotErrorDiag, job.exeErrorDiag))
job.pilotErrorDiag = job.exeErrorDiag
# reset the trf errors since the monitor refuses to display them at the moment
#job.exeErrorDiag = ""
#job.exeErrorCode = 0
# remove unwanted files from work dir prior to tarball creation
self.removeUnwantedFiles(job.workdir, job.inFiles, job.outFiles)
# add wanted files to work dir prior to tarball creation
self.addWantedFiles(job.workdir, site.workdir, job.jobId, job.outputFilesXML)
if not job.newDirNM:
job.newDirNM = "tarball_PandaJob_%s_%s" % (job.jobId, site.sitename)
# # restore the hidden proxy if necessary
# try:
# restoreProxy()
# except Exception, e:
# tolog("Pilot failed to restore the proxy: %s" % str(e))
tolog("Preparing to create log file")
# protect the work dir until the log has been registered
createLockFile(self.__env['jobrec'], site.workdir)
# create log file and register it
if not self.createLogFile(job):
tolog("!!WARNING!!1600!! Could not create log file")
else:
# update the current file state
updateFileState(job.logFile, site.workdir, job.jobId, mode="file_state", state="created")
dumpFileStates(site.workdir, job.jobId)
# create the final metadata.xml
if not jr and job.result[0] != "failed":
strXML = self.createMetadataForOutput(site.workdir, job.logFile, job.jobId, job.newDirNM, job.outputFilesXML)
# create metadata later (in updatePandaServer) for the log at least, if it doesn't exist already
if (strXML == "" or strXML == None) and job.result[0] == 'failed':
tolog("metadata will be created for the log only in updatePandaServer")
# update the job state file
JR = JobRecovery()
if job.jobState != "stageout":
job.jobState = "stageout"
_retjs = JR.updateJobStateTest(job, site, workerNode, mode="test")
# register/copy log file
try:
ret, job = self.transferLogFile(job, site, experiment, dest=self.__env['logFileDir'], jr=jr)
except:
tolog("Failed to transfer log file: %s" % traceback.format_exc())
ret = False
if not ret:
tolog("!!%s!!1600!! Could not transfer log file" % (self.__env['errorLabel']))
job.result[0] = "holding"
else:
# the log file has been created and transferred, so it's now safe to remove the lock file
# as long as output files have been moved to local SE. It will also be removed for
# non-recoverable errors (such as 1150 = looping job, etc)
error = PilotErrors()
if not error.isPutErrorCode(job.result[2]):
self.removeLockFile(site.workdir)
else:
tolog("!!WARNING!!1600!! Job failed with EC %d - lock file will not be removed (job might be recovered by a later pilot)" % job.result[2])
# transfer additional files for CERNVM (below, after the final server update which update the job state file with the metadata XML)
# note: only needed in CoPilot mode
fname = os.path.join(site.workdir, job.outputFilesXML)
if os.path.exists(fname) and ("CERNVM" in site.sitename and self.__env['useCoPilot']):
transferAdditional = True
# update the job state file
job.jobState = job.result[0]
_retjs = JR.updateJobStateTest(job, site, workerNode, mode="test")
tc_1 = os.times()
job.timeCleanUp = int(round(tc_1[4]-tc_0[4]))
# add timing info to log message
logMsg = self.addTimingInfo(logMsg, job.timeGetJob, job.timeStageIn, job.timeExe, job.timeStageOut, job.timeCleanUp)
# write and transfer log extracts to pilot init dir for Nordugrid
if os.environ.has_key('Nordugrid_pilot') and job.result[0] == 'failed':
self.transferLogExtracts(logMsg)
# update the SURLs info
if strXML and strXML != "":
tolog("Updating metadata XML with SURLs prior to PanDA server update")
strXML = updateXMLWithSURLs(experiment, strXML, site.workdir, job.jobId, self.__env['jobrec']) # do not use format 'NG' here (even for NG)
# was the log file transferred to an OS? check in the OS transfer dictionary
if job.logBucketID != -1:
# get the corresponding ddm endpoint
si = getSiteInformation(experiment)
os_ddmendpoint = si.getObjectstoreDDMEndpointFromBucketID(job.logBucketID)
strXML = updateXMLWithEndpoints(strXML, [job.logFile], [os_ddmendpoint])
tolog("Updated XML:\n%s" % (strXML))
# replace the metadata-<jobId>.xml file
if putMetadata(site.workdir, job.jobId, strXML):
tolog("Successfully updated metadata file")
# get the experiment object
thisExperiment = getExperiment(experiment)
# get experiment specific metadata
try:
expSpecificMetadata = thisExperiment.getExpSpecificMetadata(job, workdir)
except Exception, e:
tolog("!!WARNING!!1211!! Caught exception in getAdditionalMetadata: %s" % (e))
expSpecificMetadata = ""
# update panda server
ret, retNode = self.updatePandaServer(job, site, workerNode, self.__env['psport'],
xmlstr = strXML, log = logMsg, ra = ra, jr = jr,
schedulerID = self.__env['jobSchedulerId'],
pilotID = self.__env['pilotId'],
updateServer = self.__env['updateServerFlag'],
stdout_tail = stdout_tail,
stdout_path = stdout_path,
# stdout_tail = self.__env['stdout_tail'],
# stdout_path = self.__env['stdout_path'],
additionalMetadata = expSpecificMetadata)
if ret == 0:
tolog("Successfully updated panda server at %s" % timeStamp())
if not (os.environ.has_key('Nordugrid_pilot') or site.sitename == 'CERNVM'):
# remove the job state file for finished and failed jobs (recovery will never be necessary for them)
error = PilotErrors()
recoverable = error.isRecoverableErrorCode(job.result[2])
if job.result[0] == "finished" or (job.result[0] == "failed" and not recoverable) or \
job.result[1] != 0 or job.finalstate == "failed":
JS = JobState()
if JS.remove(site, job):
tolog("Removed job state file")
if retNode:
# store the metadata xml
retNode['xml'] = strXML
tolog("Stored XML in retNode structure")
_retjs = updateJobState(job, site, retNode)
if _retjs:
tolog("Backed up XML in job state file")
else:
tolog("updatePandaServer did not return a node structure. XML is assumed to have been sent to the server.")
else:
# if there is a server update problem at this point the job will eventually loose its heartbeat
tolog("!!WARNING!!1600!! updatePandaServer returned a %d" % (ret))
# protect the work dir until the next pilot picks up the job state file
# and properly updates the job status
# create a weak lock file to prevent cleanup from deleting the work directory
createLockFile(self.__env['jobrec'], site.workdir)
if retNode:
# store the metadata xml
retNode['xml'] = strXML
tolog("Stored XML in retNode structure")
# update the job state file with the new state information
job.result[0] = "lostheartbeat"
_retjs = updateJobState(job, site, retNode)
else:
tolog("updatePandaServer did not return a node structure. XML is assumed to have been sent to the server.")
# transfer additional files for CERNVM
if transferAdditional:
self.transferAdditionalCERNVMFiles(job, site, experiment)
# add the log extracts to the batch log
if logMsg != "":
tolog("Begin log extracts.......................................................................................")
tolog(logMsg)
tolog(".........................................................................................end log extracts")
else:
tolog("No available log extracts")
def updatePandaServer(self, job, site, workerNode, port, xmlstr = None, spaceReport = False,
                      log = None, ra = 0, jr = False, schedulerID = None, pilotID = None,
                      updateServer = True, stdout_tail = "", stdout_path = "", additionalMetadata = None):
    """
    Update the PanDA server with the current job status.

    Thin wrapper: configures a PandaServerClient from the pilot environment
    and forwards the update request to it, returning the client's result
    (status code plus optional node structure).
    """
    # delegate the actual server communication to a dedicated client object
    from PandaServerClient import PandaServerClient

    env = self.__env
    # NOTE(review): the schedulerID, pilotID and updateServer arguments are
    # not used here - the client is always configured from the pilot
    # environment instead; confirm whether any caller relies on passing
    # different values
    client = PandaServerClient(pilot_version = env['version'],
                               pilot_version_tag = env['pilot_version_tag'],
                               pilot_initdir = env['pilot_initdir'],
                               jobSchedulerId = env['jobSchedulerId'],
                               pilotId = env['pilotId'],
                               updateServer = env['updateServerFlag'],
                               jobrec = env['jobrec'],
                               pshttpurl = env['pshttpurl'])

    # hand the update off to the client and return its verdict unchanged
    return client.updatePandaServer(job, site, workerNode, port,
                                    xmlstr = xmlstr, spaceReport = spaceReport, log = log, ra = ra, jr = jr,
                                    useCoPilot = env['useCoPilot'],
                                    stdout_tail = stdout_tail, stdout_path = stdout_path,
                                    additionalMetadata = additionalMetadata)
def transferAdditionalFile(self, job, site, experiment, fileName):
"""
Transfer additional CERNVM files for CERNVM to the intermediate storage location
where it will be read by special tool responsible for final SE transfers
"""