From a99272a972ec72c0dc7a695870270fdd5f5813fd Mon Sep 17 00:00:00 2001
From: Johannes Linder
Date: Tue, 1 Oct 2024 11:36:55 -0700
Subject: [PATCH] Updated download/install helper scripts.

---
 data/training/download_dependencies.sh | 142 +++++++++++++++++++++++++
 model/train.sh                         |   2 +-
 2 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100755 data/training/download_dependencies.sh

diff --git a/data/training/download_dependencies.sh b/data/training/download_dependencies.sh
new file mode 100755
index 0000000..57424a1
--- /dev/null
+++ b/data/training/download_dependencies.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+#
+# Download the auxiliary files used to build the Borzoi training data.
+#
+# Required environment variables:
+#   BORZOI_HG38  hg38 data folder
+#   BORZOI_MM10  mm10 data folder
+#   BORZOI_DIR   folder whose examples/ subfolder receives the umap bed files
+#
+# Targets that already exist are skipped, so the script can be re-run.
+
+set -euo pipefail
+
+# Abort early when a path variable is missing; otherwise the mkdir calls
+# below would operate on "/assembly/ucsc" and similar root-level paths.
+: "${BORZOI_HG38:?BORZOI_HG38 must be set}"
+: "${BORZOI_MM10:?BORZOI_MM10 must be set}"
+: "${BORZOI_DIR:?BORZOI_DIR must be set}"
+
+readonly DEPS_URL="https://storage.googleapis.com/seqnn-share/helper/dependencies"
+
+# Download $1 to $2, decompressing on the fly unless $3 is "raw".
+# The payload is staged in "$2.part" and renamed only after every stage
+# succeeded, so a failed transfer never leaves a truncated file that a
+# later run would report as "already exists".
+fetch() {
+  local url=$1
+  local dest=$2
+  local mode=${3:-gunzip}
+  local tmp="${dest}.part"
+  local status=0
+
+  if [ "$mode" = "raw" ]; then
+    wget -O "$tmp" "$url" || status=$?
+  else
+    wget -O - "$url" | gunzip -c > "$tmp" || status=$?
+  fi
+
+  if [ "$status" -ne 0 ]; then
+    rm -f -- "$tmp"
+    echo "Failed to download $url" >&2
+    return "$status"
+  fi
+  mv -- "$tmp" "$dest"
+}
+
+# create additional folder in borzoi data folders
+mkdir -p "$BORZOI_HG38/assembly/ucsc"
+mkdir -p "$BORZOI_HG38/assembly/gnomad"
+mkdir -p "$BORZOI_HG38/mappability"
+mkdir -p "$BORZOI_HG38/blacklist"
+mkdir -p "$BORZOI_HG38/align"
+
+mkdir -p "$BORZOI_MM10/assembly/ucsc"
+mkdir -p "$BORZOI_MM10/mappability"
+mkdir -p "$BORZOI_MM10/blacklist"
+
+
+# download and uncompress auxiliary files required for Makefile (hg38)
+if [ -f "$BORZOI_HG38/assembly/ucsc/hg38_gaps.bed" ]; then
+  echo "hg38_gaps.bed already exists."
+else
+  fetch "$DEPS_URL/hg38_gaps.bed.gz" "$BORZOI_HG38/assembly/ucsc/hg38_gaps.bed"
+fi
+
+if [ -f "$BORZOI_HG38/mappability/umap_k36_t10_l32.bed" ]; then
+  echo "umap_k36_t10_l32.bed (hg38) already exists."
+else
+  fetch "$DEPS_URL/umap_k36_t10_l32_hg38.bed.gz" "$BORZOI_HG38/mappability/umap_k36_t10_l32.bed"
+fi
+
+if [ -f "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed" ]; then
+  echo "blacklist_hg38_all.bed already exists."
+else
+  fetch "$DEPS_URL/blacklist_hg38_all.bed.gz" "$BORZOI_HG38/blacklist/blacklist_hg38_all.bed"
+fi
+
+# the alignment net is kept gzip-compressed
+if [ -f "$BORZOI_HG38/align/hg38.mm10.syn.net.gz" ]; then
+  echo "hg38.mm10.syn.net.gz already exists."
+else
+  fetch "$DEPS_URL/hg38.mm10.syn.net.gz" "$BORZOI_HG38/align/hg38.mm10.syn.net.gz" raw
+fi
+
+
+# download and uncompress auxiliary files required for Makefile (mm10)
+if [ -f "$BORZOI_MM10/assembly/ucsc/mm10_gaps.bed" ]; then
+  echo "mm10_gaps.bed already exists."
+else
+  fetch "$DEPS_URL/mm10_gaps.bed.gz" "$BORZOI_MM10/assembly/ucsc/mm10_gaps.bed"
+fi
+
+if [ -f "$BORZOI_MM10/mappability/umap_k36_t10_l32.bed" ]; then
+  echo "umap_k36_t10_l32.bed (mm10) already exists."
+else
+  fetch "$DEPS_URL/umap_k36_t10_l32_mm10.bed.gz" "$BORZOI_MM10/mappability/umap_k36_t10_l32.bed"
+fi
+
+if [ -f "$BORZOI_MM10/blacklist/blacklist_mm10_all.bed" ]; then
+  echo "blacklist_mm10_all.bed already exists."
+else
+  fetch "$DEPS_URL/blacklist_mm10_all.bed.gz" "$BORZOI_MM10/blacklist/blacklist_mm10_all.bed"
+fi
+
+
+# download and uncompress pre-compiled umap bed files
+if [ -f "$BORZOI_DIR/examples/umap_human.bed" ]; then
+  echo "umap_human.bed already exists."
+else
+  fetch "$DEPS_URL/umap_human.bed.gz" "$BORZOI_DIR/examples/umap_human.bed"
+fi
+
+if [ -f "$BORZOI_DIR/examples/umap_mouse.bed" ]; then
+  echo "umap_mouse.bed already exists."
+else
+  fetch "$DEPS_URL/umap_mouse.bed.gz" "$BORZOI_DIR/examples/umap_mouse.bed"
+fi
+
+
+# download and index hg38 ml genome
+if [ -f "$BORZOI_HG38/assembly/ucsc/hg38.ml.fa" ]; then
+  echo "hg38.ml.fa already exists."
+else
+  fetch "$DEPS_URL/hg38.ml.fa.gz" "$BORZOI_HG38/assembly/ucsc/hg38.ml.fa"
+  idx_genome.py "$BORZOI_HG38/assembly/ucsc/hg38.ml.fa"
+fi
+
+# download and index hg38 ml genome (gnomad major alleles)
+if [ -f "$BORZOI_HG38/assembly/gnomad/hg38.ml.fa" ]; then
+  echo "hg38.ml.fa (gnomad) already exists."
+else
+  fetch "$DEPS_URL/hg38_gnomad.ml.fa.gz" "$BORZOI_HG38/assembly/gnomad/hg38.ml.fa"
+  idx_genome.py "$BORZOI_HG38/assembly/gnomad/hg38.ml.fa"
+fi
+
+# download and index mm10 ml genome
+if [ -f "$BORZOI_MM10/assembly/ucsc/mm10.ml.fa" ]; then
+  echo "mm10.ml.fa already exists."
+else
+  fetch "$DEPS_URL/mm10.ml.fa.gz" "$BORZOI_MM10/assembly/ucsc/mm10.ml.fa"
+  idx_genome.py "$BORZOI_MM10/assembly/ucsc/mm10.ml.fa"
+fi
diff --git a/model/train.sh b/model/train.sh
index 13943a2..b5d6337 100644
--- a/model/train.sh
+++ b/model/train.sh
@@ -1,3 +1,3 @@
 #!/bin/sh
 
-westminster_train_folds.py -e borzoi_py310 --f_list 3 -c 4 --identical_crosses -q rtx4090 --rc -o saved_models params.json data/hg38 data/mm10
+westminster_train_folds.py -e borzoi_py310 --f_list 3 -c 4 --identical_crosses -q rtx4090 -o saved_models params.json data/hg38 data/mm10