Skip to content

Commit

Permalink
stale file handle logged and stored in box file and boxinfo index
Browse files Browse the repository at this point in the history
  • Loading branch information
smorovic committed Mar 5, 2015
1 parent 7f0d9df commit d381173
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 20 deletions.
5 changes: 4 additions & 1 deletion python/elasticbu.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,10 @@ def elasticize_box(self,infile):
document['instance']=self.conf.instance
#only here
document['host']=basename

try:
document['detectedStaleHandle']=bool(document['detectedStaleHandle'])
except:
pass
self.index_documents('boxinfo',[document])
except Exception as ex:
self.logger.warning('box info not injected: '+str(ex))
Expand Down
36 changes: 18 additions & 18 deletions python/hltd.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,27 +539,26 @@ def run(self):
if conf.role == 'fu':
try:
#check for NFS stale file handle
#this feature is disabled until investigation
# #which kind of error is thrown with unresponsive Force10 network
#trystat = bu_disk_list_ramdisk[0]
#mpstat = os.stat(trystat)
#trystat = bu_disk_list_output[0]
#mpstat = os.stat(trystat)
for disk in bu_disk_list_ramdisk:
mpstat = os.stat(disk)
for disk in bu_disk_list_output:
mpstat = os.stat(disk)
#no issue if we reached this point
fu_stale_counter = 0
except IOError as ex:
#TODO:which kind of error is thrown with unresponsive Force10 network
if ex.errno == 116:
#trigger ramdisk remount if detected more than 5 times in a row
logger.fatal('stale file handle: '+trystat)
if fu_stale_counter>=5:
fu_stale_counter=0
logger.exception(ex)
logger.fatal('initiating remount on stale file handle')
try:os.unlink(os.path.join(conf.watch_directory,'suspend0'))
except:pass
with open(os.path.join(conf.watch_directory,'suspend0'),'w') as fi:
pass
time.sleep(1)
continue
if fu_stale_counter==0 or fu_stale_counter%20==0:
logger.fatal('detected stale file handle: '+str(disk))
#if fu_stale_counter>=5:
# fu_stale_counter=0
# logger.fatal('initiating remount on stale file handle')
# try:os.unlink(os.path.join(conf.watch_directory,'suspend0'))
# except:pass
# with open(os.path.join(conf.watch_directory,'suspend0'),'w') as fi:
# pass
# time.sleep(1)
# continue
fu_stale_counter+=1

dirstat = os.statvfs(conf.watch_directory)
Expand Down Expand Up @@ -588,6 +587,7 @@ def run(self):
numQueuedLumis,maxCMSSWLumi=self.getLumiQueueStat()
fp.write('activeRunNumQueuedLS='+numQueuedLumis+'\n')
fp.write('activeRunCMSSWMaxLS='+maxCMSSWLumi+'\n')
fp.write('detectedStaleHandle='+str(fu_stale_counter>0))
fp.write('entriesComplete=True')
boxinfo_update_attempts=0
except IOError as ex:
Expand Down
3 changes: 2 additions & 1 deletion python/mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,8 @@
'totalOutput' :{'type':'integer'},
'activeRuns' :{'type':'string'},
'activeRunsErrors':{'type':'string',"index":"not_analyzed"},
'activeRunNumQueuedLS':{'type':'integer'}
'activeRunNumQueuedLS':{'type':'integer'},
'detectedStaleHandle':{'type':'boolean'}
},
'_timestamp' : {
'enabled' : True,
Expand Down

0 comments on commit d381173

Please sign in to comment.