generated from sehoffmann/python-template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from sehoffmann/feature/torch-distributed
Rewrite to torch distributed
- Loading branch information
Showing
41 changed files
with
1,816 additions
and
1,547 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,55 +1,10 @@ | ||
# Python Project Template | ||
# dmlcloud | ||
[![](https://img.shields.io/pypi/v/dmlcloud)](https://pypi.org/project/dmlcloud/) | ||
[![](https://img.shields.io/github/actions/workflow/status/sehoffmann/dmlcloud/run_tests.yml?logo=github)](https://github.com/sehoffmann/dmlcloud/actions/workflows/run_tests.yml) | ||
[![](https://img.shields.io/github/actions/workflow/status/sehoffmann/dmlcloud/run_linting.yml?label=lint&logo=github)](https://github.com/sehoffmann/dmlcloud/actions/workflows/run_linting.yml) | ||
|
||
This is a quickstart project template for Python that already comes attached with the following features: | ||
Flexible, easy-to-use, opinionated | ||
|
||
* Packaging and metadata support | ||
* Formatting and linting via *pre-commit*, *black*, *usort*, and *flake8* | ||
* Testing via *pytest* | ||
* CI via github-actions | ||
**dmlcloud** is a library for distributed training of deep learning models with torch. Its main aim is to do all these tiny little tedious things that everybody just copy pastes over and over again, while still giving you full control over the training loop and maximum flexibility. | ||
|
||
|
||
## Configuration | ||
|
||
To tailor this template to your needs, the following steps must be taken: | ||
|
||
1. Rename the *myproject* package folder to your project name | ||
2. Change metadata and project name in *setup.cfg*. | ||
3. Do not forget to change the version attribute to point to your new package name as well. | ||
4. Add dependencies to *requirements.txt* | ||
5. Adjust the *LICENSE* file to your liking. | ||
6. Adjust this *README.md* file to your liking. | ||
|
||
### Formatting and linting | ||
|
||
Install *pre-commit* and *pytest* via | ||
``` | ||
pip install -r ci_requirements.txt | ||
``` | ||
|
||
To format and lint the entire codebase run: | ||
``` | ||
pre-commit run --all-files | ||
``` | ||
|
||
To perform this step automatically during each commit (and fail on errors) run: | ||
``` | ||
pre-commit install | ||
``` | ||
|
||
### Testing | ||
To run the tests execute: | ||
``` | ||
pytest | ||
``` | ||
in the top-level directory. | ||
Tests can also be executed individually by running them as regular Python scripts. This requires you to add a small main function to them, cf. *test/test_myproject.py*. | ||
|
||
### Github Actions | ||
This project defines the following workflows: | ||
1. *run_linting.yml* will run `pre-commit run --all-files` on every push to develop and pull request | ||
2. *run_tests.yml* will run `pytest` on Windows, Ubuntu, and MacOS on every push to develop and pull_request | ||
3. *release_public.yml* and *release_test.yml* can be triggered manually to build a wheel distribution and publish it to PyPI or TestPyPI respectively | ||
|
||
For publishing to work, you need to add the PyPI API tokens as GitHub secrets: | ||
* *PYPI_TOKEN* for the official PyPI index | ||
* *TEST_PYPI_TOKEN* for the TestPyPI index | ||
Unlike other similar frameworks, such as *lightning*, dmlcloud tries to add as little additional complexity and abstraction as possible. Instead, it is tailored towards a carefully selected set of libraries and workflows and sticks with them. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,3 @@ | ||
from .config import ArgparseVar, BaseConfig, ConfigVar, DefaultConfig, SubConfig | ||
from .training import BaseTrainer, ClassificationTrainer | ||
__version__ = "0.3.0" | ||
|
||
__version__ = "0.1.1" | ||
|
||
__all__ = [ | ||
'ArgparseVar', | ||
'BaseConfig', | ||
'BaseTrainer', | ||
'ClassificationTrainer', | ||
'ConfigVar', | ||
'DefaultConfig', | ||
'SubConfig', | ||
] | ||
__all__ = [] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
import datetime | ||
import logging | ||
import secrets | ||
from pathlib import Path | ||
from typing import Optional | ||
|
||
from omegaconf import OmegaConf | ||
|
||
from dmlcloud.util.slurm import slurm_job_id | ||
|
||
|
||
def sanitize_filename(filename: str) -> str:
    """Return *filename* with every forward slash replaced by an underscore."""
    segments = filename.split('/')
    return '_'.join(segments)
|
||
|
||
def generate_id() -> str:
    """Return a short random identifier consisting only of letters and digits."""
    # The URL-safe alphabet may contain '-' and '_'; map them to 'a' and 'b'
    # so the identifier stays purely alphanumeric.
    substitutions = str.maketrans('-_', 'ab')
    token = secrets.token_urlsafe(5)
    return token.translate(substitutions)
|
||
|
||
def generate_checkpoint_path(
    root: Path | str, name: Optional[str] = None, creation_time: Optional[datetime.datetime] = None
) -> Path:
    """Build a unique checkpoint directory path below *root*.

    The last path component has the form ``<name>-<YYYY.MM.DD-HH:MM>-<id>``,
    where ``<id>`` is produced by ``generate_id()``. Nothing is created on disk.

    Args:
        root: Directory that will contain the checkpoint directory.
        name: Prefix of the directory name; defaults to ``'run'``. It is passed
            through ``sanitize_filename`` before use.
        creation_time: Timestamp embedded in the directory name; defaults to
            ``datetime.datetime.now()``.

    Returns:
        ``root`` joined with the generated directory name.
    """
    root = Path(root)

    if name is None:
        name = 'run'

    if creation_time is None:
        creation_time = datetime.datetime.now()

    # Format the requested timestamp; previously the clock was read a second
    # time here, which silently ignored a caller-supplied ``creation_time``.
    dt = creation_time.strftime('%Y.%m.%d-%H:%M')
    name = sanitize_filename(name)
    return root / f'{name}-{dt}-{generate_id()}'
|
||
|
||
def find_slurm_checkpoint(root: Path | str) -> Optional[Path]:
    """Find the checkpoint directory below *root* recorded for the current Slurm job.

    Args:
        root: Directory whose direct children are inspected.

    Returns:
        The first child that is a valid checkpoint directory and whose stored
        job id equals the value returned by ``slurm_job_id()``. ``None`` if
        ``slurm_job_id()`` returns ``None``, if *root* is not an existing
        directory, or if no child matches.
    """
    root = Path(root)

    job_id = slurm_job_id()
    if job_id is None:
        return None

    # Without this guard ``iterdir()`` raises when the root has not been
    # created yet; in that case there is simply no checkpoint to find.
    if not root.is_dir():
        return None

    for child in root.iterdir():
        # Build the handle once per child instead of once per condition.
        candidate = CheckpointDir(child)
        if candidate.is_valid and candidate.slurm_job_id == job_id:
            return child

    return None
|
||
|
||
class CheckpointDir:
    """Handle to a checkpoint directory and the well-known files inside it.

    Constructing an instance only resolves the path; the directory itself is
    created by ``create()``.
    """

    def __init__(self, path: Path | str):
        """Store the resolved *path* and the ``'dmlcloud'`` logger.

        Args:
            path: Location of the checkpoint directory. Strings are accepted
                as well, matching the module-level helpers that take
                ``Path | str``.
        """
        self.path = Path(path).resolve()
        self.logger = logging.getLogger('dmlcloud')

    @property
    def config_file(self) -> Path:
        """Path of ``config.yaml`` inside the checkpoint directory."""
        return self.path / 'config.yaml'

    @property
    def indicator_file(self) -> Path:
        """Path of the ``.dmlcloud`` marker file that ``is_valid`` checks for."""
        return self.path / '.dmlcloud'

    @property
    def log_file(self) -> Path:
        """Path of ``log.txt`` inside the checkpoint directory."""
        return self.path / 'log.txt'

    @property
    def slurm_file(self) -> Path:
        """Path of the ``.slurm-jobid`` file written by ``create()``."""
        return self.path / '.slurm-jobid'

    @property
    def exists(self) -> bool:
        """Whether the checkpoint path exists on disk."""
        return self.path.exists()

    @property
    def is_valid(self) -> bool:
        """Whether the path is an existing directory that contains the marker file."""
        if not self.exists or not self.path.is_dir():
            return False

        if not self.indicator_file.exists():
            return False

        return True

    @property
    def slurm_job_id(self) -> Optional[str]:
        """Job id stored in ``slurm_file``, or ``None`` if that file does not exist."""
        if not self.slurm_file.exists():
            return None

        with open(self.slurm_file) as f:
            # Strip surrounding whitespace so a trailing newline in the file
            # does not break equality checks against the live job id.
            return f.read().strip()

    def create(self):
        """Create the directory together with its marker and log files.

        The Slurm job id file is written only when the module-level
        ``slurm_job_id()`` returns a value other than ``None``.

        Raises:
            ValueError: If the checkpoint path already exists.
        """
        if self.exists:
            raise ValueError(f'Checkpoint directory already exists: {self.path}')

        self.path.mkdir(parents=True, exist_ok=True)
        self.indicator_file.touch()
        self.log_file.touch()

        # Query the job id once so the check and the write see the same value.
        job_id = slurm_job_id()
        if job_id is not None:
            with open(self.slurm_file, 'w') as f:
                f.write(job_id)

    def save_config(self, config: OmegaConf):
        """Write *config* to ``config_file`` via ``OmegaConf.save``.

        Raises:
            ValueError: If the checkpoint path does not exist.
        """
        if not self.exists:
            raise ValueError(f'Checkpoint directory does not exist: {self.path}')

        with open(self.config_file, 'w') as f:
            OmegaConf.save(config, f)

    def load_config(self) -> OmegaConf:
        """Load and return the configuration from ``config_file`` via ``OmegaConf.load``.

        Raises:
            ValueError: If ``is_valid`` is false for this directory.
        """
        if not self.is_valid:
            raise ValueError(f'Checkpoint directory is not valid: {self.path}')

        with open(self.config_file) as f:
            return OmegaConf.load(f)

    def __str__(self) -> str:
        return str(self.path)

    def __repr__(self) -> str:
        return f'CheckpointDir({self.path})'
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.