forked from PanDAWMS/pilot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
JobLog.py
1264 lines (1070 loc) · 59.9 KB
/
JobLog.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import os
import re
import commands
import traceback
from time import localtime
from glob import glob
from shutil import copy2, rmtree
import Mover as mover
from PilotErrors import PilotErrors
from pUtil import tolog, readpar, isLogfileCopied, isAnalysisJob, removeFiles, getFileGuid, PFCxml, createLockFile, \
getMetadata, returnLogMsg, removeLEDuplicates, getPilotlogFilename, remove, getExeErrors, updateJobState, \
makeJobReport, chdir, addSkippedToPFC, updateMetadata, getJobReport, filterJobReport, timeStamp, \
getPilotstderrFilename, safe_call, updateXMLWithSURLs, putMetadata, getCmtconfig, getExperiment, getSiteInformation, \
getGUID, timedCommand, updateXMLWithEndpoints
from FileHandling import addToOSTransferDictionary, getOSTransferDictionaryFilename, getOSTransferDictionary, \
getWorkDirSizeFilename, getDirSize, storeWorkDirSize
from JobState import JobState
from FileState import FileState
from FileStateClient import updateFileState, dumpFileStates
from JobRecovery import JobRecovery
from Configuration import Configuration
class JobLog:
"""
Methods for handling the job log (e.g. postJobLog, updatePandaServer)
"""
# private data members
__error = PilotErrors() # PilotErrors object
def __init__(self):
    """ Default initialization """
    # shared pilot configuration; accessed dict-style elsewhere in this class
    # (e.g. self.__env['pilot_initdir'], self.__env['jobrec'])
    self.__env = Configuration()
def getLogFileGuid(self, tarFileGuid, logFile, jobId, workdir):
    """ Return the proper log file guid.

    If for some reason the log file guid is not known (e.g. in a problematic
    lost job) the guid should not be generated by PFCxml later, but be
    extracted from metadata-<jobId>.xml so the registered guid stays
    consistent with what was already written there.

    :param tarFileGuid: guid currently known by the job object (may be "")
    :param logFile: log tarball file name
    :param jobId: PanDA job id (used to locate metadata-<jobId>.xml)
    :param workdir: directory containing the metadata file
    :return: the guid to use for the log file (possibly "")
    """
    _filename = os.path.join(workdir, "metadata-%s.xml" % (jobId))
    fileGuid = getFileGuid(_filename, logFile)
    if tarFileGuid != fileGuid:
        if fileGuid == "":
            # keep the known guid rather than clobbering it with an empty one
            tolog("!!WARNING!!1500!! Log file guid could not be found in %s" % (_filename))
        else:
            # fixed typo in warning message: "disprepancy" -> "discrepancy"
            tolog("!!WARNING!!1500!! Encountered a discrepancy between job.tarFileGuid (value: %s) and %s (value: %s)" %
                  (tarFileGuid, _filename, fileGuid))
            # the metadata file is considered authoritative
            tarFileGuid = fileGuid
    else:
        tolog("Log guid same as in metadata file")
    if tarFileGuid == "":
        tolog("!!WARNING!!1500!! Encountered an empty log file guid")
    else:
        tolog("Using log file guid: %s" % (tarFileGuid))
    return tarFileGuid
def copyLogFile(self, dest, workdir, logFile, newDirNM):
    """ Copy the log file to a specific directory (instead of transferring it to an SE).

    :param dest: destination directory, or the string "None" to skip the copy
    :param workdir: directory containing the log file
    :param logFile: log tarball file name
    :param newDirNM: tarball work directory; removed if it still exists
    :return: True if the copy succeeded, False otherwise
    """
    status = False
    if dest == "None":
        tolog("Log file will not be copied to neither SE nor any other directory")
    else:
        tolog("Log file will not be copied to SE, but to directory: %s" % (dest))
        try:
            copy2("%s/%s" % (workdir, logFile), dest)
        except Exception as e:
            # fixed message direction: the copy goes *to* dest, not from it
            tolog("!!WARNING!!1500!! Exception caught: Could not copy log file %s/%s to %s: %s" %
                  (workdir, logFile, dest, str(e)))
            status = False
        else:
            status = True
            tolog("Successfully copied log file to destination")
            # best-effort removal of the local copy; failure does not change status
            try:
                os.remove(logFile)
            except Exception as e:
                tolog("!!WARNING!!1500!! Exception caught: Could not remove %s: %s (ignore)" % (logFile, str(e)))
                # ignore, return status True anyway
    # clean up the tarball directory if it is still around
    if os.path.exists(newDirNM):
        self.removeTree(newDirNM)
    return status
def removeTree(self, _dir):
    """ Remove a non-empty directory """
    # safe_call wraps rmtree; a truthy return presumably means the removal
    # succeeded (failures are handled inside safe_call) - only log on success
    if safe_call(rmtree, _dir):
        tolog("Removed directory: %s" % (_dir))
def transferLogFile(self, job, site, experiment, dest=None, jr=False):
    """ Transfer the log file to storage.

    First performs an optional "special" transfer to an objectstore (when the
    experiment requests it), then always attempts the normal transfer to the
    primary SE. Returns (status, job); status is False only if the primary
    transfer failed - a failed special transfer is logged but ignored.
    NOTE(review): indentation restored from a whitespace-mangled source.
    """
    status = True
    # transfer log file to special log SE (CERN via xrdcp)
    # get the experiment object
    thisExperiment = getExperiment(experiment)
    if thisExperiment.doSpecialLogFileTransfer(eventService=job.eventService, putLogToOS=job.putLogToOS):
        tolog("Preparing for log file transfer to special SE")
        # get the site information object
        si = getSiteInformation(experiment)
        # first backup some schedconfig fields that need to be modified for the secondary transfer
        # NOTE: copytool_org is currently unused since the replaceQueuedataField
        # calls below are commented out; kept for when they are re-enabled
        copytool_org = readpar('copytool')
        # temporarily modify the schedconfig fields with values for the secondary SE
        tolog("Temporarily modifying queuedata for log file transfer to special SE")
        #ec = si.replaceQueuedataField("copytool", "objectstore")
        # do log transfer
        tolog("Attempting log file transfer to special SE")
        ret, job = self.transferActualLogFile(job, site, experiment, dest=dest, jr=jr, specialTransfer=True, copytool="objectstore")
        if not ret:
            tolog("!!WARNING!!1600!! Could not transfer log file to special SE")
            #status = False
        else:
            # Update the OS transfer dictionary
            # Get the OS name identifier and bucket endpoint
            os_bucket_id = job.logBucketID
            os_ddmendpoint = si.getObjectstoreDDMEndpointFromBucketID(os_bucket_id)
            # Add the transferred file to the OS transfer file
            addToOSTransferDictionary(job.logFile, self.__env['pilot_initdir'], os_bucket_id, os_ddmendpoint)
        # finally restore the modified schedconfig fields
        tolog("Restoring queuedata fields")
        #ec = si.replaceQueuedataField("copytool", copytool_org)
    else:
        tolog("Special log file transfer not required")
    # register/copy log file
    tolog("Attempting log file transfer to primary SE")
    ret, job = self.transferActualLogFile(job, site, experiment, dest=dest, jr=jr)
    if not ret:
        tolog("!!%s!!1600!! Could not transfer log file to primary SE" % (self.__env['errorLabel']))
        status = False
    return status, job
def transferActualLogFile(self, job, site, experiment, dest=None, jr=False, specialTransfer=False, copytool=None):
    """
    Save log tarball in DDM and register it to catalog, or copy it to 'dest'.
    the job recovery will use the current site info known by the current pilot

    Returns (status, job): status is False when the transfer/copy failed; the
    job object is updated in place (result code, pilotErrorDiag, logBucketID).
    NOTE(review): indentation restored from a whitespace-mangled source -
    verify block nesting against the original file before relying on it.
    """
    status = True
    pilotErrorDiag = ""
    N_filesNormalStageOut = 0
    N_filesAltStageOut = 0
    # without job recovery a failed transfer is final, hence the FAILED label
    if not self.__env['jobrec']:
        self.__env['errorLabel'] = "FAILED"
    # only check for performed log transfer for normal stage-out (not for any special transfers)
    if isLogfileCopied(site.workdir, job.jobId) and not specialTransfer:
        tolog("Log file already transferred")
        return status, job
    # only copy log file to dest dir if specified
    if dest:
        status = self.copyLogFile(dest, site.workdir, job.logFile, job.newDirNM)
        # update the current file state
        if status:
            updateFileState(job.logFile, site.workdir, job.jobId, mode="file_state", state="transferred")
        else:
            updateFileState(job.logFile, site.workdir, job.jobId, mode="file_state", state="not_transferred")
        dumpFileStates(site.workdir, job.jobId)
        return status, job
    # see if it's an analysis job or not
    analyJob = isAnalysisJob(job.trf.split(",")[0])
    # remove any lingering input files from the work dir
    if len(job.inFiles) > 0:
        ec = removeFiles(job.workdir, job.inFiles)
    # get the log file guid (if not set already)
    job.tarFileGuid = self.getLogFileGuid(job.tarFileGuid, job.logFile, job.jobId, site.workdir)
    # the cmtconfig is needed by at least the xrdcp site mover
    cmtconfig = getCmtconfig(job.cmtconfig)
    # create the xml needed for the registration if it doesn't exist already (for a secondary log transfer)
    WDTxml = "%s.xml" % (job.newDirNM)
    if not os.path.exists(WDTxml):
        guids_status = PFCxml(job.experiment, WDTxml, fntag="pfn", alog=job.logFile, alogguid=job.tarFileGuid, jr=jr)
    else:
        tolog("Log XML already exists: %s" % (WDTxml))
    # dataset name for the log: use the log dblock when set, otherwise a
    # date-based placeholder
    dblock = job.logDblock
    if dblock and dblock != 'NULL' and dblock != ' ':
        dsname = dblock
    else:
        dsname = "%s-%s-%s" % (localtime()[0:3]) # pass it a random name
    # rmflag == 1: remove tarball dir + xml after a successful transfer;
    # rmflag == 0: keep them on the worker node for recovery/debugging
    rmflag = 1
    ec = 0
    _state = ""
    _msg = ""
    latereg = False
    # determine the file path for special log transfers (can be overwritten in mover_put_data() in case of failure in transfer to primary OS)
    if specialTransfer:
        logPath, os_bucket_id = self.getLogPath(job.jobId, job.logFile, job.experiment)
        if logPath == "":
            tolog("!!WARNING!!4444!! Can not continue with special transfer since logPath is not set")
            return False, job
        tolog("Special log transfer: %s" % (logPath))
    else:
        logPath = ""
        os_bucket_id = -1
    try:
        rc, pilotErrorDiag, rf, rs, N_filesNormalStageOut, N_filesAltStageOut, os_bucket_id = mover.mover_put_data("xmlcatalog_file:%s" % (WDTxml),
                                                                                                                  dsname,
                                                                                                                  site.sitename,
                                                                                                                  site.computingElement,
                                                                                                                  analysisJob = analyJob,
                                                                                                                  testLevel = self.__env['testLevel'],
                                                                                                                  proxycheck = self.__env['proxycheckFlag'],
                                                                                                                  pinitdir = self.__env['pilot_initdir'],
                                                                                                                  datasetDict = None,
                                                                                                                  outputDir = self.__env['outputDir'],
                                                                                                                  stageoutTries = self.__env['stageoutretry'],
                                                                                                                  cmtconfig = cmtconfig,
                                                                                                                  recoveryWorkDir = site.workdir,
                                                                                                                  logPath = logPath,
                                                                                                                  os_bucket_id = os_bucket_id,
                                                                                                                  copytool=copytool,
                                                                                                                  job = job,
                                                                                                                  log_transfer = True # new sitemovers required integration parameter
                                                                                                                  )
    except Exception, e:
        rmflag = 0 # don't remove the tarball
        status = False
        import traceback
        if 'format_exc' in traceback.__all__:
            trace = traceback.format_exc()
            pilotErrorDiag = "Exception caught when saving the log tarball: %s, %s" % (str(e), trace)
        else:
            tolog("traceback.format_exc() not available in this python version")
            pilotErrorDiag = "Exception caught when saving the log tarball: %s" % (str(e))
        tolog("!!%s!!1500!! %s" % (self.__env['errorLabel'], pilotErrorDiag))
    else:
        tolog("mover_put_data finished with EC = %s" % str(rc))
        # update transfer numbers in case alt stage-out has been used
        if N_filesAltStageOut > 0:
            job.filesNormalStageOut += N_filesNormalStageOut # only reported to jobMetrics in case of alt stage-out
            job.filesAltStageOut += N_filesAltStageOut
            tolog("Updated stage-out numbers:")
            tolog("..filesNormalStageOut = %d" % (job.filesNormalStageOut))
            tolog(".....filesAltStageOut = %d" % (job.filesAltStageOut))
        if rc != 0:
            # remove any trailing "\r" or "\n" (there can be two of them)
            if rs != None:
                rs = rs.rstrip()
                tolog("Error string: %s" % (rs))
            # ignore failed OS log transfers (this might change if we only store logs in OS:s)
            if os_bucket_id != -1 and specialTransfer:
                tolog("Ignoring failed special log transfer to OS (resetting log bucket id)")
                os_bucket_id = -1
                rc = 0
            rmflag = 0 # don't remove the tarball
            job.result[0] = "holding"
            # is the job recoverable?
            if self.__error.isRecoverableErrorCode(rc):
                _state = "holding"
                _msg = "WARNING"
            else:
                _state = "failed"
                _msg = self.__env['errorLabel']
            # look for special error in the error string
            if rs == "Error: string Limit exceeded 250":
                tolog("!!%s!!3000!! Put error: file name string limit exceeded 250" % (_msg))
                ec = self.__error.ERR_LRCREGSTRSIZE
            else:
                ec = rc
        else:
            # create a weak lock file for the log transfer (but not for any special transfer, ie the log transfer to the special/secondary log area)
            if not specialTransfer:
                createLockFile(self.__env['jobrec'], site.workdir, lockfile="LOGFILECOPIED_%s" % job.jobId)
        # to which OS bucket id was the file transferred to?
        if os_bucket_id != -1:
            # get the site information object
            #si = getSiteInformation(experiment)
            job.logBucketID = os_bucket_id #si.getBucketID(os_id, "logs")
            tolog("Stored log bucket ID: %s" % (job.logBucketID))
    # set the error code for the log transfer only if there was no previous error (e.g. from the get-operation)
    if job.result[2] == 0:
        job.result[2] = ec
        job.pilotErrorDiag = pilotErrorDiag
    else:
        # there was a previous error
        if ec != 0:
            # is the new log transfer error of the same type as the earlier error?
            if ec == job.result[2]:
                tolog("!!WARNING!!1105!! Previous error same as new error: %d" % (ec))
            else:
                tolog("!!WARNING!!1105!! Previous error (%d) will not be overwritten by the new error (%d)" % (job.result[2], ec))
        # ignore holding state for log transfer if previous earlier error was a get error
        if job.result[0] == "holding" and not self.__error.isRecoverableErrorCode(job.result[2]):
            tolog("!!WARNING!!1105!! Resetting HOLDING to FAILED since the previous error is not recoverable")
            job.result[0] = "failed"
    # in case the log file could not be registered, store the relevant info in the job state file
    # NOTE(review): latereg is never set True in this method - dead branch?
    if latereg:
        job.log_latereg = "True"
        job.log_field = rf
    else:
        job.log_latereg = "False"
        job.log_field = None
    # tarball is saved to DDM successfully, so remove everything except the log file which might
    # still be needed (for creating metadata for failed jobs)
    if rmflag == 1:
        if os.path.isdir(job.newDirNM):
            self.removeTree(job.newDirNM)
        try:
            os.remove(WDTxml)
        except Exception, e:
            tolog("!!WARNING!!1500!! Could not remove %s: %s" % (WDTxml, str(e)))
            #status = False
        else:
            tolog("%s removed" % (WDTxml))
    elif rmflag == 0: # something bad happened during put, save the tarball on worker node for further debugging
        if job.result[0] == 'holding':
            tolog("Will leave log file %s for later recovery" % (job.logFile))
            status = False
            if os.path.isdir(job.newDirNM):
                self.removeTree(job.newDirNM)
        elif os.path.isdir(job.workdir) and (not job.logFile or job.logFile == ''):
            try:
                rmtree(job.workdir)
            except Exception, e:
                tolog("!!WARNING!!1500!! Could not remove %s: %s" % (job.workdir, str(e)))
                pass
    # do not overwrite any existing pilotErrorDiag (from a get operation e.g.)
    if job.pilotErrorDiag != "" and job.pilotErrorDiag != None:
        if pilotErrorDiag != "" and pilotErrorDiag != None:
            # add pilotErrorDiag to the end of the existing string but do not add the log put error identifier to save space
            job.pilotErrorDiag += "|Log put error: " + pilotErrorDiag
    else:
        if pilotErrorDiag != "" and pilotErrorDiag != None:
            job.pilotErrorDiag = "Log put error: " + pilotErrorDiag
    return status, job
def buildLogExtracts(self, job, workdir, analyJob):
    """ Build the bulk of the log extracts.

    Collects, in order: the panda tracer log (analysis jobs only), any special
    payload log messages, recent pilotlog warnings/errors, and per-trf payload
    stdout extracts (job report or grep-based fallback). Duplicates are removed
    at the end. Returns the combined extract string.
    NOTE(review): indentation restored from a whitespace-mangled source.
    """
    error = PilotErrors()
    tolog("Building log extracts..")
    logMsg = ''
    # look for the pandatracerlog.txt file, produced if the user payload attempted any outgoing connections
    tracerlog = os.path.join(job.workdir, "pandatracerlog.txt")
    if analyJob:
        if os.path.exists(tracerlog):
            # only add if file is not empty
            if os.path.getsize(tracerlog) > 0:
                msg = "!!WARNING!!1010!! PandaID=%s had outbound connections" % (job.jobId)
                tolog(msg)
                logMsg += msg
                try:
                    f = open(tracerlog, "r")
                except Exception, e:
                    tolog("!!WARNING!!1010!! Failed to open log file: %s, %s" % (tracerlog, e))
                else:
                    logMsg += f.read()
                    f.close()
            else:
                tolog("Panda tracer log has zero size (no outbound connections detected)")
        else:
            tolog("Panda tracer log does not exist: %s (ignoring)" % (tracerlog))
    # are there any special log messages from the subprocess/payload?
    for thisf in job.logMsgFiles:
        logMsg += returnLogMsg(logf=thisf) + "\n"
    # grep for !!FAILED/WARNING!!NR!! messages in pilotlog.txt
    ret = commands.getoutput('grep -e "\!\![A-Z]\+\!\![0-9]\+\!\!" %s | tail -20' % (getPilotlogFilename()))
    if ret != "":
        logMsg += "- %s -\n" % os.path.basename(getPilotlogFilename())
        logMsg += ret + "\n"
    # is this a multi-trf job? (job parameters are newline separated per trf)
    nJobs = job.jobPars.count("\n") + 1
    # loop over all payload stdout files
    for _i in range(nJobs):
        _stdout = job.stdout
        if nJobs > 1:
            _stdout = _stdout.replace(".txt", "_%d.txt" % (_i + 1))
        fname = os.path.join(workdir, _stdout)
        if os.path.isfile(fname):
            # use the job reports for production jobs
            if job.payload == "athena" and not analyJob:
                # leave a filtered extracts from the first subjobs only
                if _i < nJobs - 1:
                    # only get the error summary for
                    jobReport = filterJobReport(getJobReport(fname))
                else:
                    # get the full job report for the last trf
                    jobReport = getJobReport(fname)
            else:
                jobReport = ""
            if jobReport != "":
                logMsg += jobReport
            else:
                # old style log extracts
                logMsg += '\n\n- Errors from %s (no jobReport) -\n' % (_stdout)
                logMsg += commands.getoutput('grep -i error %s | tail -20' % (fname))
                tmp = commands.getoutput('grep -i \"Running %s failed\" %s | tail -20' % (job.payload, fname))
                if len(tmp) > 0:
                    logMsg += '\n\n- %s errors from %s -\n' % (job.payload, _stdout)
                    logMsg += tmp
                if job.payload == "athena":
                    # first/last processed event markers from the athena event loop
                    evts = commands.getoutput('grep AthenaEventLoopMgr %s | grep end' % (fname))
                    evtslist = evts.split("\n")
                    if len(evtslist) > 1:
                        logMsg += '\n\n- First event -\n'
                        logMsg += evtslist[0]
                        logMsg += '\n\n- Last event -\n'
                        logMsg += evtslist[-1]
            # if payload stdout file is too big (ec 1106), remove the file at this point
            if job.result[2] == error.ERR_STDOUTTOOBIG:
                try:
                    os.remove(fname)
                except Exception, e:
                    tolog("!!WARNING!!1999!! Failed to remove file %s: %s" % (fname, str(e)))
                else:
                    tolog("Too large payload stdout file has been removed")
        else:
            logMsg += "\n(%s/%s does not exist)" % (workdir, _stdout)
    # remove duplicated warning/error messages
    logMsg = removeLEDuplicates(logMsg)
    return logMsg
def getXMLAndWorkdir(self, jr, siteWorkdir, jobWorkdir, newDirNM, jobId):
    """ Get the metadata and the relevant workdir """
    if jr:
        # lost-but-recoverable job: its workdir was already renamed to
        # newDirNM, and the metadata lives in the site workdir
        workdir = newDirNM
        tolog("Post job task (job recovery mode) using dir: %s" % (workdir))
        xmlDir = siteWorkdir
    else:
        # normal mode: preliminary metadata only (log file size and checksum
        # are not yet filled in)
        workdir = jobWorkdir
        tolog("Post job task (normal mode) using dir: %s" % (workdir))
        xmlDir = workdir
    return getMetadata(xmlDir, jobId), workdir
def isAnalyJob(self, sitename):
    """ Determine if the job is a user analysis job using the site name """
    # analysis queues carry the ANALY substring in their site name
    return "ANALY" in sitename
def removeCoreDumps(self, siteWorkdir, workdir):
    """ Remove any remaining core dumps so they do not end up in the log tarball """
    foundCoreDump = False
    # collect both core.<pid> style and plain 'core' files from the pilot
    # and job work dirs (same order as: site core.*, job core.*, site core, job core)
    coreDumps = []
    for pattern in ("core.*", "core"):
        for directory in (siteWorkdir, workdir):
            coreDumps += glob("%s/%s" % (directory, pattern))
    for coreDump in coreDumps:
        tolog("Trying to remove core dump: %s" % str(coreDump))
        if not remove([coreDump]):
            tolog("!!WARNING!!1600!! Failed to remove core dump")
        else:
            tolog("Core dump removed")
            foundCoreDump = True
    return foundCoreDump
def removeSoftLink(self, jobPars, stdout, siteWorkdir):
    """ Remove the soft link to the payload stdout """
    # multi-trf jobs have one stdout per sub-job (job parameters are
    # newline separated, one line per trf)
    nJobs = jobPars.count("\n") + 1
    for trfNr in range(nJobs):
        linkName = stdout if nJobs == 1 else stdout.replace(".txt", "_%d.txt" % (trfNr + 1))
        lnfilename = os.path.join(siteWorkdir, linkName)
        if not os.path.exists(lnfilename):
            continue
        try:
            os.remove(lnfilename)
        except Exception as e:
            tolog("Failed to remove soft link %s: %s" % (lnfilename, str(e)))
        else:
            tolog("Removed soft link: %s" % (lnfilename))
def removeUnwantedFiles(self, workdir, inFiles, outFiles):
    """ Remove unwanted files from work dir prior to tarball creation """
    tolog("Removing unwanted files prior to job log creation")
    # clear out any lingering input/output files still present in the work dir
    for fileList in (inFiles, outFiles):
        if len(fileList) > 0:
            ec = removeFiles(workdir, fileList)
    # a leftover athena workDir would bloat the tarball - delete it
    userWorkDir = os.path.join(workdir, 'workDir')
    if os.path.exists(userWorkDir):
        tolog("Removing user workDir prior to tarball creation")
        try:
            rmtree(userWorkDir)
        except Exception as e:
            tolog("Failed to remove workDir: %s" % str(e))
def addWantedFiles(self, jobWorkdir, siteWorkdir, jobId, outputFilesXML):
    """ Add wanted files to work dir prior to tarball creation """
    skippedName = os.path.join(jobWorkdir, "skipped.xml")
    updatedName = os.path.join(jobWorkdir, "metadata-%s.xml" % (jobId))
    if not os.path.exists(skippedName):
        tolog("No skipped input files (non DBRelease)")
    else:
        # merge skipped input file info into the metadata
        ec = addSkippedToPFC(updatedName, skippedName)
        # keep a copy in the site dir so it can be reached in
        # updatePandaServer after log creation if necessary
        try:
            copy2(skippedName, siteWorkdir)
        except Exception as e:
            tolog("!!WARNING!!1600!! Exception caught: Could not copy skipped metadata file to site work dir: %s" % str(e))
        else:
            tolog("Successfully copied skipped metadata file to site work dir")
    # special NG/CERNVM metadata file, copied to the site dir so it can be
    # reached after the log has been created
    specialName = os.path.join(jobWorkdir, outputFilesXML)
    if os.path.exists(specialName):
        try:
            copy2(specialName, siteWorkdir)
        except Exception as e:
            tolog("!!WARNING!!1600!! Exception caught: Could not copy NG/CERNVM metadata file to site work dir: %s" % str(e))
        else:
            tolog("Successfully copied NG/CERNVM metadata file to site work dir: %s" % (siteWorkdir))
def createMetadataForOutput(self, workdir, filename, jobId, newDirNM, outputFilesXML):
    """ Create the final metadata with file size and checksum of the log tarball.

    Updates metadata-<jobId>.xml (falling back to a copy from the job work dir
    when it is missing from the site dir), and additionally the NG/CERNVM
    OutputFiles.xml when present. Returns the updated metadata XML string,
    or "" if it could not be produced.
    NOTE(review): indentation restored from a whitespace-mangled source.
    """
    # add metadata about log file to metadata.xml
    from SiteMover import SiteMover
    from SiteMoverFarm import getSiteMover
    sitemover = getSiteMover(readpar('copytool'), "")
    _date = "None"
    strXML = ""
    tolog("Preparing to create metadata for output files")
    # get the file info for the log file and, if needed, for the CERNVM outputFilesXML file
    ec, pilotErrorDiag, _fsize, _checksum = \
        SiteMover.getLocalFileInfo(os.path.join(workdir, filename), csumtype=sitemover.getChecksumCommand(), date=_date)
    if ec != 0:
        tolog("!!WARNING!!2995!! Failed while trying to get the log file info: %d" % (ec))
    tolog("fsize=%s" % (_fsize))
    tolog("checksum=%s" % (_checksum))
    # additional file info for the job state file (used by CERNVM)
    JS = JobState()
    _filename = JS.getFilename(workdir, jobId)
    if os.path.exists(_filename):
        ec, pilotErrorDiag, _fsizeAdditional, _checksumAdditional = \
            SiteMover.getLocalFileInfo(_filename, csumtype=sitemover.getChecksumCommand(), date=_date)
        if ec != 0:
            tolog("!!WARNING!!2995!! Failed while trying to get the additional file (%s) info: %d" % (os.path.basename(_filename), ec))
            _fsizeAdditional = None
            _checksumAdditional = None
    else:
        _fsizeAdditional = None
        _checksumAdditional = None
    fname = "%s/metadata-%s.xml" % (workdir, jobId)
    if os.path.exists(fname):
        tolog("Found metadata in site dir: %s" % (workdir))
    else:
        # backup solution in case metadata has not already been copied into the site work dir
        tolog("Metadata not found in site work dir, looking for it in job work dir instead..")
        _fname = "%s/metadata-%s.xml" % (newDirNM, jobId)
        if os.path.exists(_fname):
            tolog("Found metadata in job work dir: %s" % (newDirNM))
            try:
                copy2(_fname, workdir)
            except Exception, e:
                tolog("!!WARNING!!2999!! Failed to copy metadata file from job work dir to site work dir: %s" % str(e))
            else:
                tolog("Successfully copied metadata from job work dir to site work dir")
        else:
            tolog("!!WARNING!! Metadata not found in job work dir either: %s" % (fname))
    # try to read the metadata from the site work dir
    if os.path.exists(fname):
        ec, _strXML = updateMetadata(fname, _fsize, _checksum)
        if ec == 0:
            tolog("Added (%s, %s) to metadata file: %s" % (_fsize, _checksum, fname))
            if len(_strXML) != 0:
                # replace preliminary XML
                strXML = _strXML
                # strXML now contains all the xml for all output files and log
            else:
                tolog("!!WARNING!!1601!! updateMetadata() did not return any xml")
        else:
            tolog("!!WARNING!!1600!! Failed to add metadata: %d" % (ec))
    else:
        tolog("!!WARNING!!2999!! Failed to find metadata file, expect job to eventually fail with ddm: Adder._updateOutputs() could not get GUID/LFN/MD5/FSIZE")
    # add the metadata about log file to special NG/CERNVM file
    # NOTE: fname is reused here for a different file
    fname = os.path.join(workdir, outputFilesXML)
    if os.path.exists(fname):
        # add checksum and file size of log file to the metadata file (OutputFiles.xml) and then transfer it
        ec, _strXML = updateMetadata(fname, _fsize, _checksum, format='NG', fsizeAdditional=_fsizeAdditional, checksumAdditional=_checksumAdditional)
        if ec == 0:
            tolog("Added (%s, %s) to metadata file: %s" % (_fsize, _checksum, fname))
            if _fsizeAdditional and _checksumAdditional:
                tolog("Added (%s, %s) to metadata file: %s" % (_fsizeAdditional, _checksumAdditional, fname))
            # OutputFiles.xml now contains all the xml for all output files and log (and additional file info for CERNVM)
            # copy it to the init dir (only necessary for NG not for CERNVM)
            # (actually it can be transferred with the mv site mover just like it is done for CERNVM, skip for now)
            if os.environ.has_key('Nordugrid_pilot'):
                try:
                    copy2(fname, self.__env['pilot_initdir'])
                except Exception, e:
                    tolog("!!WARNING!!1600!! Exception caught: Could not copy NG metadata file to init dir: %s" % str(e))
                else:
                    tolog("Successfully copied NG metadata file to pilot init dir: %s" % (self.__env['pilot_initdir']))
        else:
            tolog("updateMetadata returned: %d" % (ec))
    return strXML
def addTimingInfo(self, logMsg, timeGetJob, timeStageIn, timeExe, timeStageOut, timeCleanUp):
    """ Add timing info to log message """
    # timeGetJob is set in pilot.py; timeStageIn/timeExe/timeStageOut in
    # runJob.py (timeStageOut also in moveLostOutputFiles() for recovered jobs)
    walltimes = '\n\n- Walltime -\n'
    walltimes += 'JobRetrival=%s, StageIn=%s, Execution=%s, StageOut=%s, CleanUp=%s\n' % \
                 (timeGetJob, timeStageIn, timeExe, timeStageOut, timeCleanUp)
    # the walltime summary must always survive: truncate the extracts when
    # the combined message would exceed 2048 characters
    limit = 2048 - len(walltimes)
    if len(logMsg) >= limit:
        return logMsg[:limit] + walltimes
    return logMsg + walltimes
def transferLogExtracts(self, logMsg):
    """ Write and transfer log extracts to pilot init dir for Nordugrid """
    fname = "log_extracts.txt"
    # write the extracts locally; on failure there is nothing to transfer
    try:
        extractsFile = open(fname, 'w')
        extractsFile.write(logMsg)
        extractsFile.close()
    except Exception as e:
        tolog("Failed to write log extracts to file: %s" % str(e))
        return
    # copy the file where the NG harvester picks it up
    try:
        copy2(fname, self.__env['pilot_initdir'])
    except Exception as e:
        tolog("!!WARNING!!1600!! Exception caught: Could not copy log extracts file to init dir for NG: %s" % str(e))
    else:
        tolog("Successfully copied log extracts file to pilot init dir for NG: %s" % (self.__env['pilot_initdir']))
def transferAdditionalCERNVMFiles(self, job, site, experiment):
    """ Transfer additional files for CERNVM """
    fname = os.path.join(site.workdir, job.outputFilesXML)
    if not os.path.exists(fname):
        return
    # first the OutputFiles.xml, then the job state file; abort on failure
    ret, job = self.transferAdditionalFile(job, site, experiment, fname)
    if not ret:
        tolog("!!WARNING!!2994!! Additional CERNVM transfer failed: %s" % (job.pilotErrorDiag))
        return
    JS = JobState()
    ret, job = self.transferAdditionalFile(job, site, experiment, JS.getFilename(site.workdir, job.jobId))
    if not ret:
        tolog("!!WARNING!!2994!! Additional CERNVM transfer failed: %s" % (job.pilotErrorDiag))
    else:
        tolog("Transferred additional CERNVM files")
def postJobTask(self, job, site, experiment, workerNode, jr=False, ra=0, stdout_tail=None, stdout_path=None):
"""
Update Panda server with output info (xml) and make/save the tarball of the job workdir,
only for finished or failed jobs.
jr = job recovery
ra = recovery attempt
"""
tc_0 = os.times()
transferAdditional = False
# get the metadata and the relevant workdir
strXML, workdir = self.getXMLAndWorkdir(jr, site.workdir, job.workdir, job.newDirNM, job.jobId)
# set any holding job to failed for sites that do not use job recovery (e.g. sites with LSF, that immediately
# removes any work directory after the LSF job finishes which of course makes job recovery impossible)
if not self.__env['jobrec']:
if job.result[0] == 'holding':
job.result[0] = 'failed'
tolog("This site does not support job recovery: HOLDING state reset to FAILED")
# is it a user analysis job?
analyJob = self.isAnalyJob(site.sitename)
# build log extracts
logMsg = self.buildLogExtracts(job, workdir, analyJob)
# get the experiment object
thisExperiment = getExperiment(experiment)
# remove known redundant files and directories
thisExperiment.removeRedundantFiles(workdir)
# remove the soft link to the payload stdout
self.removeSoftLink(job.jobPars, job.stdout, site.workdir)
# make the job workdir tarball
chdir(site.workdir) # into pilot workdir, one level above job workdir
# remove the core dump file first, since it's considered as useless
foundCoreDump = self.removeCoreDumps(site.workdir, workdir)
# tar the workdir using the Panda jobId index and move it to the log dir
if os.path.isdir(workdir) and job.logFile and job.logFile != '':
# dump all directories for a failed job to the log
if job.result[0] == "failed":
cmd = 'ls -altrR %s' % workdir
tolog("%s: %s" % (cmd + '\n', commands.getoutput(cmd)))
# use the jobInfo.xml to get the trf errors (unless they were read and set already)
# (running a testEvgen job will not produce any exeError messages)
if job.exeErrorCode == 0 and job.exeErrorDiag == "":
try:
job.exeErrorCode, job.exeErrorDiag = getExeErrors(workdir, "jobInfo.xml")
except Exception, e:
tolog("!!WARNING!!1600!! Could not get the exeErrors: %s" % str(e))
job.exeErrorCode, job.exeErrorDiag = 0, ""
else:
tolog("Skipping old style trf error XML file (jobInfo.xml) since TRF errors are already set")
if not jr:
# Make the job summary report
makeJobReport(job, logMsg, foundCoreDump, self.__env['version'], self.__env['jobIds'])
# overwrite any pilotErrorDiag at this point with exeErrorDiag if set
# (for the job page error info)
if job.exeErrorDiag != "" and job.exeErrorDiag != "OK":
# this is probably useless since pilotErrorDiag might be overwritten again later
tolog("Overwriting pilotErrorDiag (\'%s\') with exeErrorDiag (\'%s\')" % (job.pilotErrorDiag, job.exeErrorDiag))
job.pilotErrorDiag = job.exeErrorDiag
# reset the trf errors since the monitor refuses to display them at the moment
#job.exeErrorDiag = ""
#job.exeErrorCode = 0
# remove unwanted files from work dir prior to tarball creation
self.removeUnwantedFiles(job.workdir, job.inFiles, job.outFiles)
# add wanted files to work dir prior to tarball creation
self.addWantedFiles(job.workdir, site.workdir, job.jobId, job.outputFilesXML)
if not job.newDirNM:
job.newDirNM = "tarball_PandaJob_%s_%s" % (job.jobId, site.sitename)
# # restore the hidden proxy if necessary
# try:
# restoreProxy()
# except Exception, e:
# tolog("Pilot failed to restore the proxy: %s" % str(e))
tolog("Preparing to create log file")
# protect the work dir until the log has been registered
createLockFile(self.__env['jobrec'], site.workdir)
# create log file and register it
if not self.createLogFile(job):
tolog("!!WARNING!!1600!! Could not create log file")
else:
# update the current file state
updateFileState(job.logFile, site.workdir, job.jobId, mode="file_state", state="created")
dumpFileStates(site.workdir, job.jobId)
# create the final metadata.xml
if not jr and job.result[0] != "failed":
strXML = self.createMetadataForOutput(site.workdir, job.logFile, job.jobId, job.newDirNM, job.outputFilesXML)
# create metadata later (in updatePandaServer) for the log at least, if it doesn't exist already
if (strXML == "" or strXML == None) and job.result[0] == 'failed':
tolog("metadata will be created for the log only in updatePandaServer")
# update the job state file
JR = JobRecovery()
if job.jobState != "stageout":
job.jobState = "stageout"
_retjs = JR.updateJobStateTest(job, site, workerNode, mode="test")
# register/copy log file
try:
ret, job = self.transferLogFile(job, site, experiment, dest=self.__env['logFileDir'], jr=jr)
except:
tolog("Failed to transfer log file: %s" % traceback.format_exc())
ret = False
if not ret:
tolog("!!%s!!1600!! Could not transfer log file" % (self.__env['errorLabel']))
job.result[0] = "holding"
else:
# the log file has been created and transferred, so it's now safe to remove the lock file
# as long as output files have been moved to local SE. It will also be removed for
# non-recoverable errors (such as 1150 = looping job, etc)
error = PilotErrors()
if not error.isPutErrorCode(job.result[2]):
self.removeLockFile(site.workdir)
else:
tolog("!!WARNING!!1600!! Job failed with EC %d - lock file will not be removed (job might be recovered by a later pilot)" % job.result[2])
# transfer additional files for CERNVM (below, after the final server update which update the job state file with the metadata XML)
# note: only needed in CoPilot mode
fname = os.path.join(site.workdir, job.outputFilesXML)
if os.path.exists(fname) and ("CERNVM" in site.sitename and self.__env['useCoPilot']):
transferAdditional = True
# update the job state file
job.jobState = job.result[0]
_retjs = JR.updateJobStateTest(job, site, workerNode, mode="test")
tc_1 = os.times()
job.timeCleanUp = int(round(tc_1[4]-tc_0[4]))
# add timing info to log message
logMsg = self.addTimingInfo(logMsg, job.timeGetJob, job.timeStageIn, job.timeExe, job.timeStageOut, job.timeCleanUp)
# write and transfer log extracts to pilot init dir for Nordugrid
if os.environ.has_key('Nordugrid_pilot') and job.result[0] == 'failed':
self.transferLogExtracts(logMsg)
# update the SURLs info
if strXML and strXML != "":
tolog("Updating metadata XML with SURLs prior to PanDA server update")
strXML = updateXMLWithSURLs(experiment, strXML, site.workdir, job.jobId, self.__env['jobrec']) # do not use format 'NG' here (even for NG)
# was the log file transferred to an OS? check in the OS transfer dictionary
if job.logBucketID != -1:
# get the corresponding ddm endpoint
si = getSiteInformation(experiment)
os_ddmendpoint = si.getObjectstoreDDMEndpointFromBucketID(job.logBucketID)
strXML = updateXMLWithEndpoints(strXML, [job.logFile], [os_ddmendpoint])
tolog("Updated XML:\n%s" % (strXML))
# replace the metadata-<jobId>.xml file
if putMetadata(site.workdir, job.jobId, strXML):
tolog("Successfully updated metadata file")
# get the experiment object
thisExperiment = getExperiment(experiment)
# get experiment specific metadata
try:
expSpecificMetadata = thisExperiment.getExpSpecificMetadata(job, workdir)
except Exception, e:
tolog("!!WARNING!!1211!! Caught exception in getAdditionalMetadata: %s" % (e))
expSpecificMetadata = ""
# update panda server
ret, retNode = self.updatePandaServer(job, site, workerNode, self.__env['psport'],
xmlstr = strXML, log = logMsg, ra = ra, jr = jr,
schedulerID = self.__env['jobSchedulerId'],
pilotID = self.__env['pilotId'],
updateServer = self.__env['updateServerFlag'],
stdout_tail = stdout_tail,
stdout_path = stdout_path,
# stdout_tail = self.__env['stdout_tail'],
# stdout_path = self.__env['stdout_path'],
additionalMetadata = expSpecificMetadata)
if ret == 0:
tolog("Successfully updated panda server at %s" % timeStamp())
if not (os.environ.has_key('Nordugrid_pilot') or site.sitename == 'CERNVM'):
# remove the job state file for finished and failed jobs (recovery will never be necessary for them)
error = PilotErrors()
recoverable = error.isRecoverableErrorCode(job.result[2])
if job.result[0] == "finished" or (job.result[0] == "failed" and not recoverable) or \
job.result[1] != 0 or job.finalstate == "failed":
JS = JobState()
if JS.remove(site, job):
tolog("Removed job state file")
if retNode:
# store the metadata xml
retNode['xml'] = strXML
tolog("Stored XML in retNode structure")
_retjs = updateJobState(job, site, retNode)
if _retjs:
tolog("Backed up XML in job state file")
else:
tolog("updatePandaServer did not return a node structure. XML is assumed to have been sent to the server.")
else:
# if there is a server update problem at this point the job will eventually loose its heartbeat
tolog("!!WARNING!!1600!! updatePandaServer returned a %d" % (ret))
# protect the work dir until the next pilot picks up the job state file
# and properly updates the job status
# create a weak lock file to prevent cleanup from deleting the work directory
createLockFile(self.__env['jobrec'], site.workdir)
if retNode:
# store the metadata xml
retNode['xml'] = strXML
tolog("Stored XML in retNode structure")
# update the job state file with the new state information
job.result[0] = "lostheartbeat"
_retjs = updateJobState(job, site, retNode)
else:
tolog("updatePandaServer did not return a node structure. XML is assumed to have been sent to the server.")
# transfer additional files for CERNVM
if transferAdditional:
self.transferAdditionalCERNVMFiles(job, site, experiment)
# add the log extracts to the batch log
if logMsg != "":
tolog("Begin log extracts.......................................................................................")
tolog(logMsg)
tolog(".........................................................................................end log extracts")
else:
tolog("No available log extracts")
def updatePandaServer(self, job, site, workerNode, port, xmlstr = None, spaceReport = False,
                      log = None, ra = 0, jr = False, schedulerID = None, pilotID = None,
                      updateServer = True, stdout_tail = "", stdout_path = "", additionalMetadata = None):
    """
    Update the PanDA server with the current job status.

    Thin wrapper: configures a PandaServerClient from the pilot environment
    and forwards the update request to it, returning the client's result
    (status code plus optional node structure).
    """
    # delegate the actual server communication to a dedicated client object
    from PandaServerClient import PandaServerClient

    env = self.__env
    # NOTE(review): the schedulerID, pilotID and updateServer arguments are
    # not used here - the client is always configured from the pilot
    # environment instead; confirm whether any caller relies on passing
    # different values
    client = PandaServerClient(pilot_version = env['version'],
                               pilot_version_tag = env['pilot_version_tag'],
                               pilot_initdir = env['pilot_initdir'],
                               jobSchedulerId = env['jobSchedulerId'],
                               pilotId = env['pilotId'],
                               updateServer = env['updateServerFlag'],
                               jobrec = env['jobrec'],
                               pshttpurl = env['pshttpurl'])

    # hand the update off to the client and return its verdict unchanged
    return client.updatePandaServer(job, site, workerNode, port,
                                    xmlstr = xmlstr, spaceReport = spaceReport, log = log, ra = ra, jr = jr,
                                    useCoPilot = env['useCoPilot'],
                                    stdout_tail = stdout_tail, stdout_path = stdout_path,
                                    additionalMetadata = additionalMetadata)
def transferAdditionalFile(self, job, site, experiment, fileName):
"""
Transfer additional CERNVM files for CERNVM to the intermediate storage location
where it will be read by special tool responsible for final SE transfers
"""