Azure server #689
base: main
Changes from all commits
Diff: .gitignore
@@ -36,6 +36,7 @@ scratch/

 # csv database
 *.csv
+!basis_sets.csv

Review comment: Can you explain
Reply: So that whenever someone …

 # iPython files
 *.ipynb_*

@@ -52,6 +53,10 @@ timer.dat

 # .vscode
 .vscode

+# files created via testing
+nul
+run.out

 # .trunk folder
 .trunk
Diff: job adapter module (set_cpu_and_mem, _get_additional_job_info, troubleshoot_server)
@@ -772,10 +772,13 @@ def set_cpu_and_mem(self):

                     f'exceeds {100 * job_max_server_node_memory_allocation}% of the the maximum node memory on '
                     f'{self.server}. Setting it to {job_max_server_node_memory_allocation * max_mem:.2f} GB.')
             self.job_memory_gb = job_max_server_node_memory_allocation * max_mem
-            total_submit_script_memory = self.job_memory_gb * 1024 * 1.05  # MB
+            total_submit_script_memory = self.job_memory_gb * 1024 * 1.05 if (self.job_memory_gb * 1024 * 1.05) <= (max_mem * 1024) else max_mem * 1024  # MB

Review comment: Can you squash? I commented about this on the original commit

             self.job_status[1]['keywords'].append('max_total_job_memory')  # Useful info when troubleshooting.
         else:
-            total_submit_script_memory = self.job_memory_gb * 1024 * 1.1  # MB
+            if max_mem is None:
+                total_submit_script_memory = self.job_memory_gb * 1024 * 1.1
+            else:
+                total_submit_script_memory = self.job_memory_gb * 1024 * 1.1 if (self.job_memory_gb * 1024 * 1.1) <= (max_mem * 1024) else max_mem * 1024  # MB
         # Determine amount of memory in submit script based on cluster job scheduling system.
         cluster_software = servers[self.server].get('cluster_soft').lower() if self.server is not None else None
         if cluster_software in ['oge', 'sge', 'htcondor']:
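A side note on the capping logic above: the "request times a safety factor, capped at the node's physical memory" expression now appears three times with slightly different factors. Below is a minimal sketch of how it could be centralized; the helper name and signature are hypothetical and not part of this PR.

```python
import math
from typing import Optional


def capped_submit_memory_mb(job_memory_gb: float,
                            safety_factor: float,
                            max_node_memory_gb: Optional[float] = None) -> float:
    """Hypothetical helper: convert a job memory request (GB) to the submit-script
    value (MB), apply a safety factor, and cap it at the node's physical memory
    when that limit is known."""
    requested_mb = job_memory_gb * 1024 * safety_factor
    if max_node_memory_gb is None:
        return requested_mb
    return min(requested_mb, max_node_memory_gb * 1024)


# Example: a 50 GB request with a 5% overhead on a 48 GB node is capped at the node total.
print(math.ceil(capped_submit_memory_mb(50, 1.05, 48)))  # 49152 MB
```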
@@ -785,8 +788,8 @@ def set_cpu_and_mem(self):

             # In PBS, "#PBS -l select=1:ncpus=8:mem=12000000" specifies the memory for all cores to be 12 MB.
             self.submit_script_memory = math.ceil(total_submit_script_memory) * 1E6  # in Bytes
         elif cluster_software in ['slurm']:
-            # In Slurm, "#SBATCH --mem-per-cpu=2000" specifies the memory **per cpu/thread** to be 2000 MB.
-            self.submit_script_memory = math.ceil(total_submit_script_memory / self.cpu_cores)  # in MB
+            # In Slurm, "#SBATCH --mem=2000" specifies the memory to be 2000 MB.

Review comment: Please make the comment more explicit by specifying something like … If you change this behavior, do we need to make any changes to our submit scripts?

+            self.submit_script_memory = math.ceil(total_submit_script_memory)  # in MB
         self.set_input_file_memory()

     def as_dict(self) -> dict:
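To make the reviewer's question concrete: storing the job total instead of a per-core share presumably only stays consistent if the submit-script template also switches from --mem-per-cpu to --mem. The sketch below merely illustrates the two Slurm conventions quoted in the diff comments; the function is hypothetical, not part of this PR.

```python
import math


def slurm_memory_directive(total_memory_mb: float, cpu_cores: int, per_cpu: bool) -> str:
    """Render the same total memory request under the two Slurm conventions:
    --mem-per-cpu expects the per-core share, --mem expects the job total (both in MB)."""
    if per_cpu:
        return f'#SBATCH --mem-per-cpu={math.ceil(total_memory_mb / cpu_cores)}'
    return f'#SBATCH --mem={math.ceil(total_memory_mb)}'


# The same 14336 MB request on 8 cores, rendered both ways:
print(slurm_memory_directive(14336, 8, per_cpu=True))   # #SBATCH --mem-per-cpu=1792
print(slurm_memory_directive(14336, 8, per_cpu=False))  # #SBATCH --mem=14336
```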
@@ -942,18 +945,25 @@ def _get_additional_job_info(self):

         if cluster_soft in ['oge', 'sge', 'slurm', 'pbs', 'htcondor']:
             local_file_path_1 = os.path.join(self.local_path, 'out.txt')
             local_file_path_2 = os.path.join(self.local_path, 'err.txt')
-            local_file_path_3 = os.path.join(self.local_path, 'job.log')
-            if self.server != 'local' and self.remote_path is not None and not self.testing:
+            local_file_path_3 = None
+            for files in self.files_to_upload:
+                if 'job.sh' in files.values():
+                    local_file_path_3 = os.path.join(self.local_path, 'job.log')
+            if self.server != 'local' and self.remote_path is not None:
                 remote_file_path_1 = os.path.join(self.remote_path, 'out.txt')
                 remote_file_path_2 = os.path.join(self.remote_path, 'err.txt')
-                remote_file_path_3 = os.path.join(self.remote_path, 'job.log')
+                remote_file_path_3 = None
+                for files in self.files_to_upload:
+                    if 'job.sh' in files.values():
+                        remote_file_path_3 = os.path.join(self.remote_path, 'job.log')
                 with SSHClient(self.server) as ssh:
-                    for local_file_path, remote_file_path in zip([local_file_path_1,
-                                                                  local_file_path_2,
-                                                                  local_file_path_3],
-                                                                 [remote_file_path_1,
-                                                                  remote_file_path_2,
-                                                                  remote_file_path_3]):
+                    local_files_to_zip = [local_file_path_1, local_file_path_2]
+                    remote_files_to_zip = [remote_file_path_1, remote_file_path_2]
+                    if local_file_path_3 and remote_file_path_3:
+                        local_files_to_zip.append(local_file_path_3)
+                        remote_files_to_zip.append(remote_file_path_3)
+                    for local_file_path, remote_file_path in zip(local_files_to_zip, remote_files_to_zip):
                         try:
                             ssh.download_file(remote_file_path=remote_file_path,
                                               local_file_path=local_file_path)
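The optional job.log lookup above is now duplicated for the local and the remote directory. A possible refactor is sketched below; the helper name is hypothetical, and the list-of-dicts shape of files_to_upload is an assumption inferred from the files.values() check in the diff.

```python
import os
from typing import List, Optional


def optional_job_log_path(directory: str, files_to_upload: List[dict]) -> Optional[str]:
    """Return a job.log path under the given directory only if a 'job.sh' entry
    appears among the files to upload; otherwise return None."""
    for file_dict in files_to_upload:
        if 'job.sh' in file_dict.values():
            return os.path.join(directory, 'job.log')
    return None


# Example with a made-up upload list:
uploads = [{'file_name': 'submit.sub'}, {'file_name': 'job.sh'}]
print(optional_job_log_path('/tmp/run_1', uploads))  # /tmp/run_1/job.log
```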
@@ -963,10 +973,21 @@ def _get_additional_job_info(self):

                                            f'flags with stdout and stderr of out.txt and err.txt, respectively '
                                            f'(e.g., "#SBATCH -o out.txt"). Error message:')
                             logger.warning(e)
-            for local_file_path in [local_file_path_1, local_file_path_2, local_file_path_3]:
+            for local_file_path in [path for path in [local_file_path_1, local_file_path_2, local_file_path_3] if path]:

Review comment: Could be simpler to keep the above version and add to the condition below

                 if os.path.isfile(local_file_path):
-                    with open(local_file_path, 'r') as f:
-                        lines = f.readlines()
+                    with open(local_file_path, 'rb') as f:

Review comment: do you mind outsourcing this

+                        # Read the file
+                        first_bytes = f.read()
+                        # Check if the bytes contain a null byte
+                        has_null_byte = b'\x00' in first_bytes
+                        # Use the appropriate mode based on whether the file is binary or not
+                        mode = 'rb' if has_null_byte else 'r'
+                        # Read the file contents using the determined mode
+                        lines = first_bytes.decode('utf-8')
+                    if mode == 'r':
+                        with open(local_file_path, 'r') as f:
+                            lines = f.readlines()
                     content += ''.join([line for line in lines])
                     content += '\n'
                 else:
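Regarding the "outsourcing" request above, a self-contained sketch of what such a helper could look like. The name read_lines_tolerantly is hypothetical; unlike the diff, it uses a lossy decode so a genuinely binary file cannot raise UnicodeDecodeError. Treat it as one possible approach, not the PR's implementation.

```python
from typing import List


def read_lines_tolerantly(path: str) -> List[str]:
    """Read a possibly binary output file and return its lines; fall back to a
    lossy decode when a null byte suggests binary content."""
    with open(path, 'rb') as f:
        raw = f.read()
    if b'\x00' in raw:
        # Binary-looking content: decode what we can instead of failing.
        return raw.decode('utf-8', errors='ignore').splitlines(keepends=True)
    return raw.decode('utf-8').splitlines(keepends=True)


# In the loop above, the whole read-and-detect block would then collapse to:
# lines = read_lines_tolerantly(local_file_path)
```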
@@ -1346,6 +1367,14 @@ def troubleshoot_server(self):

         if run_job:
             # resubmit job
             self.execute()

+    def remove_remote_files(self):

Review comment: Where is this being called?
Reply: Here

+        """
+        Remove the remote files.
+        """
+        if (self.server != 'local' and self.server is not None):

Review comment: no need for the parenthesis

+            with SSHClient(self.server) as ssh:
+                ssh.remove_dir(self.remote_path)

     def troubleshoot_queue(self) -> bool:
         """Troubleshoot queue errors.
Diff: Molpro adapter (imports, attributes, set_input_file_memory)
@@ -9,6 +9,7 @@

 import os
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 import socket
+import re

Code scanning notice (CodeQL, Unused import): Import of 're' is not used.

 from mako.template import Template
@@ -156,6 +157,7 @@

         self.execution_type = execution_type or 'queue'
         self.command = 'molpro'
         self.url = 'https://www.molpro.net/'
+        self.core_change = None

         if species is None:
             raise ValueError('Cannot execute Molpro without an ARCSpecies object.')
@@ -326,7 +328,7 @@

         Set the input_file_memory attribute.
         """
         # Molpro's memory is per cpu core and in MW (mega word; 1000 MW = 7.45 GB on a 64-bit machine)
-        # The conversion from mW to GB was done using this (https://deviceanalytics.com/words-to-bytes-converter/)
+        # The conversion from mW to GB was done using this (c)

Review comment: ?

         # specifying a 64-bit architecture.
         #
         # See also:
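For reference, a worked example of the MW-per-core computation performed in the hunk below, using the 1000 MW ≈ 7.45 GB conversion stated above. The numbers are illustrative only.

```python
import math

# Illustrative numbers only: a 14 GB job spread over 8 cores.
job_memory_gb = 14
cpu_cores = 8
gb_per_megaword = 7.45e-3  # 1000 MW ~ 7.45 GB on a 64-bit machine, per the comment above

# Per-core memory in MW, matching the non-Zeus branch of the expression below.
input_file_memory = math.ceil(job_memory_gb / (gb_per_megaword * cpu_cores))
print(input_file_memory)  # 235 MW per core
```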
@@ -335,8 +337,37 @@

         # 800,000,000 bytes (800 mb).
         # Formula - (100,000,000 [Words]/( 800,000,000 [Bytes] / (job mem in gb * 1000,000,000 [Bytes])))/ 1000,000 [Words -> MegaWords]
         # The division by 1E6 is for converting into MWords
-        # Due to Zeus's configuration, there is only 1 nproc so the memory should not be divided by cpu_cores.
+        # Due to Zeus's configuration, there is only 1 nproc so the memory should not be divided by cpu_cores.
         self.input_file_memory = math.ceil(self.job_memory_gb / (7.45e-3 * self.cpu_cores)) if 'zeus' not in socket.gethostname() else math.ceil(self.job_memory_gb / (7.45e-3))
+        # We need to check if ess_trsh_methods=['cpu'] and ess_trsh_methods=['molpro_memory:] exists
+        # If it does, we need to reduce the cpu_cores
+        if self.ess_trsh_methods is not None:
+            if 'cpu' in self.ess_trsh_methods and any('molpro_memory:' in method for method in self.ess_trsh_methods):
+                current_cpu_cores = self.cpu_cores
+                max_memory = self.job_memory_gb
+                memory_values = []
+                for item in self.ess_trsh_methods:
+                    if 'molpro_memory:' in item:
+                        memory_value = item.split('molpro_memory:')[1]
+                        memory_values.append(float(memory_value))
+
+                if memory_values:
+                    min_memory_value = min(memory_values)
+                    required_cores = math.floor(max_memory / (min_memory_value * 7.45e-3))
+                    if self.core_change is None:
+                        self.core_change = required_cores
+                    elif self.core_change == required_cores:
+                        # We have already done this
+                        # Reduce the cores by 1
+                        required_cores -= 1
+                    if required_cores < current_cpu_cores:
+                        self.cpu_cores = required_cores
+                        logger.info(f'Changing the number of cpu_cores from {current_cpu_cores} to {self.cpu_cores}')
+                        self.input_file_memory = math.ceil(self.job_memory_gb / (7.45e-3 * self.cpu_cores)) if 'zeus' not in socket.gethostname() else math.ceil(self.job_memory_gb / (7.45e-3))
+
+
+

Review comment: please keep just one line break between methods

     def execute_incore(self):
         """
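A worked example of the new core-reduction logic in the hunk above, using made-up troubleshooting entries; it mirrors the arithmetic only and is not a drop-in replacement.

```python
import math

# Made-up troubleshooting history: Molpro previously reported it needs at least
# 280.5 MW per core, and the 'cpu' troubleshooting method was already attempted.
ess_trsh_methods = ['cpu', 'molpro_memory:280.5']
job_memory_gb = 14
cpu_cores = 8

memory_values = [float(m.split('molpro_memory:')[1])
                 for m in ess_trsh_methods if 'molpro_memory:' in m]
min_memory_value = min(memory_values)  # 280.5 MW per core, i.e. ~2.09 GB per core
required_cores = math.floor(job_memory_gb / (min_memory_value * 7.45e-3))
print(required_cores)  # 6: with 14 GB total, at most 6 cores can each get ~280.5 MW
```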
Review comment: can this commit be removed once it did its trick once?