diff --git a/atlas-schemas b/atlas-schemas
index 33a9b2e..8054511 160000
--- a/atlas-schemas
+++ b/atlas-schemas
@@ -1 +1 @@
-Subproject commit 33a9b2e1d0ff05c22da81aa35cfb0dda7fe06b2c
+Subproject commit 80545112bf373ccacfe46a53787aa07a4ef0b971
diff --git a/bin/load_exp_design.sh b/bin/load_exp_design.sh
new file mode 100755
index 0000000..48bf5cf
--- /dev/null
+++ b/bin/load_exp_design.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+#
+# Load the experiment design of one experiment into the exp_design_column and
+# exp_design tables.
+#
+# Inputs, each read from an environment variable with a positional argument as
+# fallback:
+#   dbConnection        / $1  psql connection for the database; this includes
+#                             the database name
+#   CONDENSED_SDRF_FILE / $2  condensed SDRF file holding the experiment design data
+#   SDRF_FILE           / $3  SDRF file, used for the column sequence
+
+set -e
+
+scriptDir=$(cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+source "$scriptDir/db_scxa_common.sh"
+
+dbConnection=${dbConnection:-$1}
+condensed_sdrf_file=${CONDENSED_SDRF_FILE:-$2}
+sdrf_file=${SDRF_FILE:-$3}
+
+# Check that the necessary inputs are defined. The resolved values are tested
+# (not the raw environment variables) because the assignments above always set
+# the variables, possibly to an empty string, and because a value may have been
+# supplied as a positional argument instead.
+[ -z "$dbConnection" ] && echo "Env var dbConnection for the database connection needs to be defined. This includes the database name." >&2 && exit 1
+[ -z "$condensed_sdrf_file" ] && echo "Env var CONDENSED_SDRF_FILE for the experiment design data needs to be defined." >&2 && exit 1
+[ -z "$sdrf_file" ] && echo "Env var SDRF_FILE for column sequence of experiment design needs to be defined." >&2 && exit 1
+
+# The factor value column is spelled "Factor Value" in some SDRF files and
+# "FactorValue" in others, so both spellings are searched.
+FactorArray=( FactorValue "Factor Value" )
+
+# Print the 1-based position of every field of the SDRF file equal to $1.
+find_column_order() {
+  awk -v val="$1" -F '\t' '{for (i=1; i<=NF; i++) if ($i==val) {print i} }' "$sdrf_file"
+}
+
+# Experiment accession, column name and sample type are the primary key of the
+# experiment design column table, so only unique combinations are inserted.
+# Values are handed to psql as variables and interpolated with :'name', which
+# quotes them as SQL literals; a value containing a single quote therefore no
+# longer breaks (or alters) the statement.
+cut -f 1,4,5 "$condensed_sdrf_file" | sort | uniq | while IFS=$'\t' read -r exp_acc sample_type col_name
+do
+  column_order=""
+  if [ "$sample_type" == 'characteristic' ]
+  then
+    column_order=$(find_column_order "Characteristics[${col_name}]")
+  else
+    for element in "${FactorArray[@]}"; do
+      column_order=$(find_column_order "${element}[${col_name}]")
+      if [[ -n "${column_order}" ]]; then
+        break
+      fi
+    done
+  fi
+  echo "INSERT INTO exp_design_column (experiment_accession, column_name, sample_type, column_order) VALUES (:'exp_acc', :'col_name', :'sample_type', :'column_order');" \
+    | psql -v ON_ERROR_STOP=1 -v exp_acc="$exp_acc" -v col_name="$col_name" -v sample_type="$sample_type" -v column_order="$column_order" "$dbConnection"
+done
+
+# NOTE(review): tab is an IFS whitespace character, so read collapses
+# consecutive tabs; the six names below presumably rely on the second column of
+# the condensed SDRF being empty (cut above takes fields 1, 4 and 5) - confirm.
+while IFS=$'\t' read -r exp_acc sample sample_type col_name annot_value annot_url
+do
+  echo "INSERT INTO exp_design (sample, annot_value, annot_ont_uri, exp_design_column_id) VALUES (:'sample', :'annot_value', :'annot_url', (SELECT id FROM exp_design_column WHERE experiment_accession=:'exp_acc' AND column_name=:'col_name' AND sample_type=:'sample_type'));" \
+    | psql -v ON_ERROR_STOP=1 -v exp_acc="$exp_acc" -v sample="$sample" -v sample_type="$sample_type" -v col_name="$col_name" -v annot_value="$annot_value" -v annot_url="$annot_url" "$dbConnection"
+done < "$condensed_sdrf_file"
+
+echo "Experiment design data done loading for $condensed_sdrf_file"
diff --git a/tests/random-data-set.bats b/tests/random-data-set.bats
index 17da3db..f573918 100644
--- a/tests/random-data-set.bats
+++ b/tests/random-data-set.bats
@@ -377,6 +377,15 @@
   [ "$status" -eq 0 ]
 }
 
+@test "Exp_Design: Load exp_design data" {
+  export CONDENSED_SDRF_FILE=/tmp/fixtures/experiment_files/magetab/E-MTAB-2983/E-MTAB-2983.condensed-sdrf.tsv
+  export SDRF_FILE=/tmp/fixtures/experiment_files/magetab/E-MTAB-2983/E-MTAB-2983.sdrf.txt
+  run load_exp_design.sh
+
+  echo "output = ${output}"
+  [ "$status" -eq 0 ]
+}
+
 @test "Collections: Create X" {
   export COLL_ID=MYCOLLX
   export COLL_NAME="My collection X"