Skip to content

Commit

Permalink
Add loading script to load experiment design data into database (#63)
Browse files Browse the repository at this point in the history
* Add initial data loading script

* Add script to load exp_design data

* Add test for exp_design

* remove unnecessary symbol

* update bash path

* update file permissions

* Revert old change

* update submodule

* update submodule

* Handle factor value column discrepancies

* Add loop break

* resolve duplicate constraint issue

* add debug log

Co-authored-by: Pedro Madrigal <[email protected]>

---------

Co-authored-by: Pedro Madrigal <[email protected]>
  • Loading branch information
haideriqbal and pmb59 authored Jan 27, 2023
1 parent 3d64dc2 commit 0ad240f
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 1 deletion.
46 changes: 46 additions & 0 deletions bin/load_exp_design.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env bash
#
# Load experiment design data into the database.
#
# Usage:
#   load_exp_design.sh [dbConnection] [condensed_sdrf_file] [sdrf_file]
#
# Every input can also be supplied through the environment; a non-empty
# environment variable takes precedence over the positional argument:
#   dbConnection         connection target handed to psql (includes the
#                        database name)
#   CONDENSED_SDRF_FILE  tab-separated condensed SDRF with the experiment
#                        design data
#   SDRF_FILE            SDRF file whose column positions provide column_order
#
# Two passes are made over the condensed SDRF:
#   1. one exp_design_column row per distinct
#      (experiment accession, sample type, column name);
#   2. one exp_design row per input line, linked to its exp_design_column row.

set -e

scriptDir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck disable=SC1091
source "$scriptDir/db_scxa_common.sh"

dbConnection=${dbConnection:-$1}
condensed_sdrf_file=${CONDENSED_SDRF_FILE:-$2}
sdrf_file=${SDRF_FILE:-$3}

# Validate the *resolved* values, so that both the environment and the
# positional-argument forms are accepted and empty values are rejected.
if [[ -z "$dbConnection" ]]; then
  echo "Env var dbConnection for the database connection needs to be defined. This includes the database name." >&2
  exit 1
fi
if [[ -z "$condensed_sdrf_file" ]]; then
  echo "Env var CONDENSED_SDRF_FILE for the experiment design data needs to be defined." >&2
  exit 1
fi
if [[ -z "$sdrf_file" ]]; then
  echo "Env var SDRF_FILE for column sequence of experiment design needs to be defined." >&2
  exit 1
fi

# Fail early with a clear message instead of letting cut/awk complain later.
for input_file in "$condensed_sdrf_file" "$sdrf_file"; do
  if [[ ! -r "$input_file" ]]; then
    echo "Input file $input_file does not exist or is not readable." >&2
    exit 1
  fi
done

# SDRF files spell the factor column prefix either as "FactorValue" or as
# "Factor Value"; both spellings are tried, in this order.
FactorArray=( FactorValue "Factor Value" )

#######################################
# Render a value as a single-quoted SQL string literal, doubling any embedded
# single quote so the value cannot terminate the literal early.
# Arguments: $1 - raw value
# Outputs:   the quoted literal on stdout
#######################################
sql_literal() {
  local quote="'"
  printf "'%s'" "${1//$quote/$quote$quote}"
}

#######################################
# Print the 1-based field number of the first tab-separated cell in
# $sdrf_file that is exactly equal to the given column title.
# Prints nothing when no cell matches.
# Globals:   sdrf_file (read)
# Arguments: $1 - column title, e.g. "Characteristics[organism]"
#######################################
sdrf_column_index() {
  # Stop at the first hit so a repeated title cannot yield a multi-line value.
  awk -v val="$1" -F '\t' '
    { for (i = 1; i <= NF; i++) if ($i == val) { print i; exit } }
  ' "$sdrf_file"
}

#######################################
# Execute one SQL statement through psql, aborting on the first SQL error.
# Globals:   dbConnection (read)
# Arguments: $1 - SQL statement
#######################################
run_sql() {
  # dbConnection is deliberately left unquoted, as in the original call, in
  # case callers pass several psql arguments in it.
  # NOTE(review): quote it if it is always a single connection URI.
  # shellcheck disable=SC2086
  printf '%s\n' "$1" | psql -v ON_ERROR_STOP=1 $dbConnection
}

# Pass 1: exp_design_column.
# Experiment accession, column name and sample type must be unique for this
# table, so the condensed SDRF is reduced to its distinct
# (field 1, field 4, field 5) combinations before inserting.
cut -f 1,4,5 "$condensed_sdrf_file" | sort | uniq |
  while read -r exp_acc sample_type col_name; do
    column_order=
    if [[ "$sample_type" == 'characteristic' ]]; then
      column_order=$(sdrf_column_index "Characteristics[${col_name}]")
    else
      for element in "${FactorArray[@]}"; do
        column_order=$(sdrf_column_index "${element}[${col_name}]")
        if [[ -n "$column_order" ]]; then
          break
        fi
      done
    fi
    # NOTE(review): column_order is empty when the SDRF has no matching
    # column; the value is still passed through and the database decides.
    run_sql "INSERT INTO exp_design_column (experiment_accession, column_name, sample_type, column_order) VALUES ($(sql_literal "$exp_acc"), $(sql_literal "$col_name"), $(sql_literal "$sample_type"), $(sql_literal "$column_order"));"
  done

# Pass 2: exp_design.
# Every input line becomes one row that points at the exp_design_column row
# created above. The trailing "|| [[ -n ... ]]" keeps a final line that lacks
# a terminating newline.
while IFS=$'\t' read -r exp_acc sample sample_type col_name annot_value annot_url ||
  [[ -n "$exp_acc" ]]; do
  run_sql "INSERT INTO exp_design (sample, annot_value, annot_ont_uri, exp_design_column_id) VALUES ($(sql_literal "$sample"), $(sql_literal "$annot_value"), $(sql_literal "$annot_url"), (SELECT id FROM exp_design_column WHERE experiment_accession=$(sql_literal "$exp_acc") AND column_name=$(sql_literal "$col_name") AND sample_type=$(sql_literal "$sample_type")));"
done < "$condensed_sdrf_file"

echo "Experiment design data done loading for $condensed_sdrf_file"
9 changes: 9 additions & 0 deletions tests/random-data-set.bats
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,15 @@
[ "$status" -eq 0 ]
}

@test "Exp_Design: Load exp_design data" {
  # Hand the loader its two input files through the environment.
  # NOTE(review): the database connection is presumably exported elsewhere in
  # this test file -- confirm.
  export SDRF_FILE=/tmp/fixtures/experiment_files/magetab/E-MTAB-2983/E-MTAB-2983.sdrf.txt
  export CONDENSED_SDRF_FILE=/tmp/fixtures/experiment_files/magetab/E-MTAB-2983/E-MTAB-2983.condensed-sdrf.tsv

  run load_exp_design.sh

  # Print the captured output so it is available when diagnosing a failure.
  printf 'output = %s\n' "${output}"
  [ "${status}" -eq 0 ]
}

@test "Collections: Create X" {
export COLL_ID=MYCOLLX
export COLL_NAME="My collection X"
Expand Down

0 comments on commit 0ad240f

Please sign in to comment.