# Optimize memory when loading checkpoint #699

	name: Run FA2-related unit tests

	on:
	workflow_dispatch:
	push:
	branches: [ main ]
	# Only run tests if we modify the following files
	paths:
	- "src/*/.py"
	- "examples/*/.py"
	- "tests/*/.py"

	pull_request:
	branches: [ '**' ]
	paths:
	- "src/*/.py"
	- "examples/*/.py"
	- "tests/*/.py"

	jobs:
	tests:
	runs-on:
	group: aws-g5-4xlarge-plus
	container:
	image: runpod/pytorch:2.1.1-py3.10-cuda12.1.1-devel-ubuntu22.04
	ports:
	- 80
	options: --gpus all --shm-size "8G"
	steps:
	- uses: actions/checkout@v3

	- name: Python environment
	run: \|
	which python
	python --version

	- name: Check Pytorch version
	run: \|
	nvidia-smi
	python -c "import torch; print('torch:', torch.__version__, torch)"
	python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

	- name: Install nanotron
	run: \|
	python -m pip install --upgrade pip
	pip install packaging
	pip install wheel
	pip install "flash-attn>=2.5.0" --no-build-isolation
	pip install -e .
	pip install -e .[dev]
	pip install -e .[test]

	- name: Show installed libraries and their versions
	run: pip freeze \| tee installed.txt

	- name: Run tests
	# NOTE: -m fa2 will only run the unit tests that have the mark
	# "fa2" (these are FA2-related tests)
	run: pytest -m fa2 --color=yes --durations=0 --ignore tests/fp8 --ignore tests/nanoset --verbose tests/

# Provide feedback