Skip to content

Commit

Permalink
Bugfix – Inferred cell type field not in SDRF breaks experiment design (
Browse files Browse the repository at this point in the history
#69)

* Move checkDatabaseConnection to common_routines.sh

* Remove unnecessary files

* Add fixtures with fields absent in the SDRF file

* Refactor load_exp_design.sh and AWK script with support for fields absent in the SDRF file

* Restore files needed for testing, but set them in sensible locations

* Use scratch directory for experiment design files

* Set new experiment accession

* Fix typos

* Update version of the image used to run tests

* Update version of the image used to run tests (!)

* Changed scripts to adjust to changes in 1a0c76e

* Use ${SCRATCH_DIR} if set to write the SQL file

* Prepend ${SCRIPT_DIR} to AWK file

* Remove duplicate line in fixture

* Remove duplicate line in fixture

* Remove unneeded configuration values for solr / zk

* Remove unused configuration values for solr / zk

* Add human experiment to tests

* Fix typo in species when asserting experiment load correctness

---------

Co-authored-by: Karoly Erdos <[email protected]>
  • Loading branch information
alfonsomunozpomer and ke4 authored Oct 10, 2023
1 parent 6326500 commit 908307e
Show file tree
Hide file tree
Showing 33 changed files with 315 additions and 98 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM quay.io/ebigxa/atlas-db-scxa-base:0.1.0
FROM quay.io/ebigxa/atlas-db-scxa-base:0.15.0.0
# debian

ADD bin/* /usr/local/bin/
Expand Down
1 change: 0 additions & 1 deletion bin/add_exps_to_collection.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
set -e

scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
source $scriptDir/db_scxa_common.sh

dbConnection=${dbConnection:-$1}
COLL_ID=${COLL_ID:-$2}
Expand Down
13 changes: 13 additions & 0 deletions bin/common_routines.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,16 @@ get_host_from_hostport() {
# Print the port component (the text after the first ':') of a HOST:PORT value.
# Prints an empty line when the value carries no port component.
get_port_from_hostport() {
  awk -F':' '{ print $2 }' <<<"$1"
}

# Verify that the PostgreSQL server named in a connection URL accepts connections.
# Arguments:
#   $1 - connection URL of the form postgresql://user[:password]@host[:port][/dbname]
# Outputs: a diagnostic on stderr when the server is unreachable.
# Returns: 0 when pg_isready succeeds, 1 otherwise (callers run under `set -e`,
#          so a non-zero return aborts the calling script).
# NOTE: the previous awk/sed parsing dropped the port whenever the URL carried a
# password (the common form), and lost the host entirely for user@host:port URLs;
# parameter expansion below handles both shapes.
checkDatabaseConnection() {
  local url="${1#postgresql://}"       # user[:password]@host[:port][/dbname]
  local credentials="${url%%@*}"       # user[:password]
  local pg_user="${credentials%%:*}"
  local host_port="${url#*@}"          # host[:port][/dbname]
  host_port="${host_port%%/*}"         # host[:port]
  local pg_host="${host_port%%:*}"
  local pg_port=""
  if [[ "$host_port" == *:* ]]; then
    pg_port="${host_port##*:}"
  fi

  if [ -n "$pg_port" ]; then
    pg_isready -U "$pg_user" -h "$pg_host" -p "$pg_port" \
      || { echo "No db connection." >&2; return 1; }
  else
    pg_isready -U "$pg_user" -h "$pg_host" \
      || { echo "No db connection." >&2; return 1; }
  fi
}
2 changes: 1 addition & 1 deletion bin/create_collection.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
set -e

scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
source $scriptDir/db_scxa_common.sh
source $scriptDir/common_routines.sh

dbConnection=${dbConnection:-$1}
COLL_ID=${COLL_ID:-$2}
Expand Down
14 changes: 0 additions & 14 deletions bin/db_scxa_common.sh

This file was deleted.

1 change: 0 additions & 1 deletion bin/delete_collection.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
set -e

scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
source $scriptDir/db_scxa_common.sh

dbConnection=${dbConnection:-$1}
COLL_ID=${COLL_ID:-$2}
Expand Down
2 changes: 1 addition & 1 deletion bin/delete_exp_from_collection.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
set -e

scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
source $scriptDir/db_scxa_common.sh
source $scriptDir/common_routines.sh

dbConnection=${dbConnection:-$1}
COLL_ID=${COLL_ID:-$2}
Expand Down
1 change: 0 additions & 1 deletion bin/get_experiment_info.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
set -e

scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
source $scriptDir/db_scxa_common.sh

postgres_scripts_dir=$scriptDir/../postgres_routines

Expand Down
2 changes: 1 addition & 1 deletion bin/load_db_scxa_analytics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
set -e

scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
source $scriptDir/db_scxa_common.sh
source $scriptDir/common_routines.sh

postgres_scripts_dir=$scriptDir/../postgres_routines

Expand Down
2 changes: 1 addition & 1 deletion bin/load_db_scxa_analytics_pg9.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
set -e

scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
source $scriptDir/db_scxa_common.sh
source $scriptDir/common_routines.sh

dbConnection=${dbConnection:-$1}
EXP_ID=${EXP_ID:-$2}
Expand Down
2 changes: 1 addition & 1 deletion bin/load_db_scxa_cell_clusters.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
set -e

scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
source $scriptDir/db_scxa_common.sh
source $scriptDir/common_routines.sh

dbConnection=${dbConnection:-$1}
EXP_ID=${EXP_ID:-$2}
Expand Down
3 changes: 1 addition & 2 deletions bin/load_db_scxa_dimred.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,13 @@
set -e

scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
source $scriptDir/db_scxa_common.sh
source $scriptDir/common_routines.sh

dbConnection=${dbConnection:-$1}
EXP_ID=${EXP_ID:-$2}
DIMRED_TYPE=${DIMRED_TYPE:-$3}
DIMRED_FILE_PATH=${DIMRED_FILE_PATH:-$4}
DIMRED_PARAM_JSON=${DIMRED_PARAM_JSON:-$5}
#SCRATCH_DIR=${SCRATCH_DIR:-"$(dirname ${DIMRED_FILE_PATH})"}
SCRATCH_DIR=${SCRATCH_DIR:-"$DIMRED_FILE_PATH"}

# Check that necessary environment variables are defined.
Expand Down
2 changes: 1 addition & 1 deletion bin/load_db_scxa_marker_genes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
set -e

scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
source $scriptDir/db_scxa_common.sh
source $scriptDir/common_routines.sh

dbConnection=${dbConnection:-$1}
EXP_ID=${EXP_ID:-$2}
Expand Down
13 changes: 13 additions & 0 deletions bin/load_exp_design.awk
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Print the 1-based index of the first field of the current record that matches
# `pattern` (supplied on the command line with -v), then stop reading input.
# If no field matches, print 0. Only the first input record is ever examined
# because both branches end with `exit`.
{
    match_index = 0;
    for (col = 1; col <= NF && match_index == 0; ++col) {
        if ($col ~ pattern) {
            match_index = col;
        }
    }

    print match_index;
    exit;
}
67 changes: 47 additions & 20 deletions bin/load_exp_design.sh
Original file line number Diff line number Diff line change
@@ -1,32 +1,59 @@
#!/usr/bin/env bash

set -e
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
source "${SCRIPT_DIR}/common_routines.sh"

scriptDir=$(cd "$( dirname "${BASH_SOURCE[0]:-$0}" )" && pwd )
source $scriptDir/db_scxa_common.sh

# Alfonso is bothered about dbConnection, it shouldn’t be camelCased because:
# 1. It’s a constant, it should be DB_CONNECTION
# 2. We use snake_case for Bash variables
dbConnection=${dbConnection:-$1}
condensed_sdrf_file=${CONDENSED_SDRF_FILE:-$2}
sdrf_file=${SDRF_FILE:-$3}
CONDENSED_SDRF_FILE=${CONDENSED_SDRF_FILE:-$2}
SDRF_FILE=${SDRF_FILE:-$3}

# Check that necessary environment variables are defined
require_env_var "dbConnection"
require_env_var "CONDENSED_SDRF_FILE"
require_env_var "SDRF_FILE"
checkDatabaseConnection "${dbConnection}"

EXPERIMENT_ACCESSION=$(head -1 "${CONDENSED_SDRF_FILE}" | cut -f 1)
DESTINATION_FILE=${SCRATCH_DIR:-${SCRIPT_DIR}}/${EXPERIMENT_ACCESSION}-exp-design.sql
# Remove DESTINATION_FILE if it exists
rm -f ${DESTINATION_FILE}

# Check that necessary environment variables are defined.
[ -z ${dbConnection+x} ] && echo "Env var dbConnection for the database connection needs to be defined. This includes the database name." && exit 1
[ -z ${CONDENSED_SDRF_FILE+x} ] && echo "Env var CONDENSED_SDRF_FILE for the experiment design data needs to be defined." && exit 1
[ -z ${SDRF_FILE+x} ] && echo "Env var SDRF_FILE for column sequence of experiment design needs to be defined." && exit 1
# Create the file and enclose all INSERT statements in a transaction
echo "BEGIN;" >> ${DESTINATION_FILE}

# for experiment design column table, we need to have a unique experiment accession, column name, and sample type
# as they are the primary key for the table, and we don't want to insert duplicate rows
cut -f 1,4,5 "$condensed_sdrf_file" | sort | uniq | while read exp_acc sample_type col_name; do
# In the experiment design column table we use the experiment accession, column name and sample type as the primary key
cut -f 1,4,5 "${CONDENSED_SDRF_FILE}" | sort | uniq | while read experiment_accession sample_type column_name; do
if [ "$sample_type" == 'characteristic' ]; then
column_order=$(awk -v val="$search_column" -v pattern="^Characteristics ?\\[${col_name}]$" -F '\t' '{for (i=1; i<=NF; i++) if ($i ~ pattern) {print i} }' "$sdrf_file")
sdrf_column_index=$(awk -F '\t' -v pattern="^Characteristics ?\\\[${column_name}\\\]$" -f ${SCRIPT_DIR}/load_exp_design.awk ${SDRF_FILE})
else
column_order=$(awk -v val="$search_column" -v pattern="^Factor ?Value ?\\[${col_name}]$" -F '\t' '{for (i=1; i<=NF; i++) if ($i ~ pattern) {print i} }' "$sdrf_file")
sdrf_column_index=$(awk -F '\t' -v pattern="^Factor ?Value ?\\\[${column_name}\\\]$" -f ${SCRIPT_DIR}/load_exp_design.awk ${SDRF_FILE})
fi
echo "INSERT INTO exp_design_column (experiment_accession, column_name, sample_type, column_order) VALUES ('$exp_acc', '$col_name', '$sample_type', '$column_order');" | psql -v ON_ERROR_STOP=1 "$dbConnection"
sql_statement="INSERT INTO exp_design_column (experiment_accession, sample_type, column_name, column_order) VALUES ('${experiment_accession}', '${sample_type}', '${column_name}', '${sdrf_column_index}');"
echo "${sql_statement}" >> ${DESTINATION_FILE}
done

while IFS=$'\t' read -r exp_acc sample sample_type col_name annot_value annot_url; do
echo "INSERT INTO exp_design (sample, annot_value, annot_ont_uri, exp_design_column_id) VALUES ('$sample', '$annot_value', '$annot_url', (SELECT id FROM exp_design_column WHERE experiment_accession='$exp_acc' AND column_name='$col_name' AND sample_type='$sample_type'));" | psql -v ON_ERROR_STOP=1 "$dbConnection"
done < "$condensed_sdrf_file"
# Add the columns from the condensed SDRF file.
# Fields in the condensed SDRF that aren’t in the SDRF are assigned a column_order value of 0 by the AWK script.
# We need to assign them a value that is greater than the maximum column_order value for the experiment.
# The column_order value is used to order the columns in the UI and is not used for the primary key, so it’s ok to have
# duplicates; we can order the fields with the same column_order by name if necessary.
sql_statement="UPDATE exp_design_column SET column_order=(SELECT MAX(column_order) FROM exp_design_column WHERE experiment_accession='${EXPERIMENT_ACCESSION}')+1 WHERE column_order=0 AND experiment_accession='${EXPERIMENT_ACCESSION}';"
echo "${sql_statement}" >> ${DESTINATION_FILE}

# Insert the experiment design data.
while IFS=$'\t' read -r experiment_accession sample sample_type column_name annotation_value annotation_url; do
sql_statement="INSERT INTO exp_design (sample, annot_value, annot_ont_uri, exp_design_column_id) VALUES ('${sample}', '${annotation_value}', '${annotation_url}', (SELECT id FROM exp_design_column WHERE experiment_accession='${experiment_accession}' AND column_name='${column_name}' AND sample_type='${sample_type}'));"
echo "${sql_statement}" >> ${DESTINATION_FILE}
done < "$CONDENSED_SDRF_FILE"

# Finish the transaction
echo "COMMIT;" >> ${DESTINATION_FILE}

PSQL_CMD="psql -qv ON_ERROR_STOP=1 ${dbConnection} -f ${DESTINATION_FILE}"
echo ${PSQL_CMD}
eval ${PSQL_CMD}

echo "Experiment design data done loading for $condensed_sdrf_file"
echo "$CONDENSED_SDRF_FILE: finished loading experiment design"
22 changes: 6 additions & 16 deletions bin/load_experiment_web_cli.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#!/usr/bin/env bash
#
# This script:
# - Checks if the experiment is loaded and stops it is already loaded.
# - Adds the appropiate line to the experiments table if it doesn't exist.
# - Generates the experiment design file from condensed SDRF and SDRF files in $EXPERIMENT_FILES/expdesign
# - Checks if the experiment is loaded and stops if it is already loaded.
# - Adds the appropriate line to the experiments table if it doesn't exist.
# - Generates the experiment design file from condensed SDRF and SDRF files in $EXPERIMENT_DESIGN_FILES
#
# Most of the variables required for this are usually defined in the environment file for each setup (test, prod, etc).
# The experiment designs file might need to be synced to an appropiate location at the web application instance disk
# The experiment designs file might need to be synced to an appropriate location at the web application instance disk
# depending on how the setup's disk is laid out.

jar_dir=$CONDA_PREFIX/share/atlas-cli
Expand All @@ -16,29 +16,19 @@ source $scriptDir/common_routines.sh

echo "CONDA_PREFIX: $CONDA_PREFIX"

require_env_var "SOLR_HOST"
require_env_var "ZK_HOST"
require_env_var "ZK_PORT"
require_env_var "BIOENTITIES"
require_env_var "EXPERIMENT_FILES"
require_env_var "EXPERIMENT_DESIGN_FILES"
require_env_var "jdbc_url"
require_env_var "jdbc_username"
require_env_var "jdbc_password"

# Either ACCESSIONS or PRIVATE_ACCESSIONS need to be provided
#require_env_var "ACCESSIONS"

SOLR_PORT=$(get_port_from_hostport $SOLR_HOST)
SOLR_HOST=$(get_host_from_hostport $SOLR_HOST)

require_env_var "SOLR_PORT"

java_opts="-Dsolr.host=$SOLR_HOST"
java_opts="$java_opts -Dsolr.port=$SOLR_PORT"
java_opts="$java_opts -Dzk.host=$ZK_HOST"
java_opts="$java_opts -Dzk.port=$ZK_PORT"
java_opts="$java_opts -Ddata.files.location=$BIOENTITIES"
java_opts="$java_opts -Dexperiment.files.location=$EXPERIMENT_FILES"
java_opts="$java_opts -Dexperiment.design.location=$EXPERIMENT_DESIGN_FILES"
java_opts="$java_opts -Djdbc.url=$jdbc_url"
java_opts="$java_opts -Djdbc.username=$jdbc_username"
java_opts="$java_opts -Djdbc.password=$jdbc_password"
Expand Down
1 change: 0 additions & 1 deletion bin/modify_collection.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
set -e

scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
source $scriptDir/db_scxa_common.sh

dbConnection=${dbConnection:-$1}
COLL_ID=${COLL_ID:-$2}
Expand Down
14 changes: 2 additions & 12 deletions bin/update_experiment_web_cli.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,27 +16,17 @@ source $scriptDir/common_routines.sh

echo "CONDA_PREFIX: $CONDA_PREFIX"

require_env_var "SOLR_HOST"
require_env_var "ZK_HOST"
require_env_var "ZK_PORT"
require_env_var "BIOENTITIES"
require_env_var "EXPERIMENT_FILES"
require_env_var "EXPERIMENT_DESIGN_FILES"
require_env_var "jdbc_url"
require_env_var "jdbc_username"
require_env_var "jdbc_password"
require_env_var "ACCESSIONS"

SOLR_PORT=$(get_port_from_hostport $SOLR_HOST)
SOLR_HOST=$(get_host_from_hostport $SOLR_HOST)

require_env_var "SOLR_PORT"

java_opts="-Dsolr.host=$SOLR_HOST"
java_opts="$java_opts -Dsolr.port=$SOLR_PORT"
java_opts="$java_opts -Dzk.host=$ZK_HOST"
java_opts="$java_opts -Dzk.port=$ZK_PORT"
java_opts="$java_opts -Ddata.files.location=$BIOENTITIES"
java_opts="$java_opts -Dexperiment.files.location=$EXPERIMENT_FILES"
java_opts="$java_opts -Dexperiment.design.location=$EXPERIMENT_DESIGN_FILES"
java_opts="$java_opts -Djdbc.url=$jdbc_url"
java_opts="$java_opts -Djdbc.username=$jdbc_username"
java_opts="$java_opts -Djdbc.password=$jdbc_password"
Expand Down
15 changes: 5 additions & 10 deletions run_tests_with_containers.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
#!/usr/bin/env bash

export SOLR_HOST=my_solr:8983
export ZK_HOST=gxa-zk-1
export ZK_PORT=2181
export POSTGRES_HOST=postgres
export POSTGRES_DB=scxa-test
export POSTGRES_USER=scxa
Expand All @@ -21,17 +18,17 @@ docker stop postgres && docker rm postgres
docker network rm mynet
docker network create mynet

echo "Start ZK"
docker run --rm --net mynet --name $ZK_HOST -d -p $ZK_PORT:$ZK_PORT -e ZOO_MY_ID=1 -e ZOO_SERVERS='server.1=0.0.0.0:2888:3888' -t zookeeper:3.5.8
echo "Start ZooKeeper"
docker run --rm --net mynet -d -p 2181:2181 -e ZOO_MY_ID=1 -e ZOO_SERVERS='server.1=0.0.0.0:2888:3888' -t zookeeper:3.8
echo "Start Solr"
docker run --rm --net mynet --name my_solr -d -p 8983:8983 -e ZK_HOST=$ZK_HOST:$ZK_PORT -t solr:7.7.1-alpine -DzkRun -Denable.runtime.lib=true -m 2g
docker run --rm --net mynet -d -p 8983:8983 -t solr:8-slim -DzkRun -Denable.runtime.lib=true -m 2g

echo "Start postgresql"
echo "Start PostgreSQL"
docker run --rm --name postgres --net mynet \
-e POSTGRES_PASSWORD=$POSTGRES_PASSWORD \
-e POSTGRES_USER=$POSTGRES_USER \
-e POSTGRES_DB=$POSTGRES_DB \
-p $POSTGRES_PORT:$POSTGRES_PORT -d postgres:10-alpine3.15
-p $POSTGRES_PORT:$POSTGRES_PORT -d postgres:11-alpine3.17

sleep 20

Expand All @@ -49,8 +46,6 @@ docker run --net mynet -i $docker_arch_line \
-v $( pwd )/tests:/usr/local/tests:rw \
-v $( pwd )/atlas-schemas:/atlas-schemas:rw \
-v $( pwd )/bin:/usr/local/bin:rw \
-v $( pwd )/fixtures:/fixtures:rw \
-e SOLR_HOST=$SOLR_HOST -e ZK_HOST=$ZK_HOST -e ZK_PORT=$ZK_PORT \
-e POSTGRES_USER=$POSTGRES_USER \
-e POSTGRES_PASSWORD=$POSTGRES_PASSWORD \
-e jdbc_username=$POSTGRES_USER \
Expand Down
Loading

0 comments on commit 908307e

Please sign in to comment.