Merge branch 'develop' into bugfix/conform-to-v19-migration
alfonsomunozpomer authored Oct 13, 2023
2 parents bc40ee7 + 908307e commit b8a45e4
Showing 35 changed files with 528 additions and 211 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,4 +1,4 @@
-FROM quay.io/ebigxa/atlas-db-scxa-base:0.1.0
+FROM quay.io/ebigxa/atlas-db-scxa-base:0.15.0.0
 # debian
 
 ADD bin/* /usr/local/bin/
3 changes: 1 addition & 2 deletions bin/add_exps_to_collection.sh
@@ -2,8 +2,7 @@
 
 set -e
 
-scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-source $scriptDir/db_scxa_common.sh
+scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
 
 dbConnection=${dbConnection:-$1}
 COLL_ID=${COLL_ID:-$2}
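The same two-line change recurs in most of the scripts below: ${BASH_SOURCE[0]:-$0} falls back to $0 in shells where BASH_SOURCE is unset (zsh, for instance), and the source of the now-deleted bin/db_scxa_common.sh is either dropped or replaced with bin/common_routines.sh. A minimal sketch of the pattern:

    # Resolve the directory holding this script, falling back to $0 when
    # BASH_SOURCE is unavailable
    scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
    echo "Scripts live in $scriptDir"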
13 changes: 13 additions & 0 deletions bin/common_routines.sh
@@ -12,3 +12,16 @@ get_host_from_hostport() {
 get_port_from_hostport() {
   echo $(echo $1 | awk -F':' '{ print $2 }')
 }
+
+checkDatabaseConnection() {
+  pg_user=$(echo $1 | sed s+postgresql://++ | awk -F':' '{ print $1}')
+  pg_host_port=$(echo $1 | awk -F':' '{ print $3}' \
+    | awk -F'@' '{ print $2}' | awk -F'/' '{ print $1 }')
+  pg_host=$(echo $pg_host_port | awk -F':' '{print $1}')
+  pg_port=$(echo $pg_host_port | awk -F':' '{print $2}')
+  if [ ! -z "$pg_port" ]; then
+    pg_isready -U $pg_user -h $pg_host -p $pg_port || (echo "No db connection." && exit 1)
+  else
+    pg_isready -U $pg_user -h $pg_host || (echo "No db connection" && exit 1)
+  fi
+}
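A hedged usage sketch for the new helper and the existing host/port splitters (every connection detail below is made up; dbConnection is expected as a postgresql:// URI, the format the loader scripts pass in):

    source bin/common_routines.sh

    # Made-up credentials and hosts, for illustration only
    dbConnection="postgresql://atlas:secret@pg.example.org:5432/gxpscxadev"
    checkDatabaseConnection "$dbConnection"         # runs pg_isready; fails if the server is unreachable

    get_host_from_hostport "solr.example.org:8983"  # prints solr.example.org
    get_port_from_hostport "solr.example.org:8983"  # prints 8983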
4 changes: 2 additions & 2 deletions bin/create_collection.sh
@@ -2,8 +2,8 @@
 
 set -e
 
-scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-source $scriptDir/db_scxa_common.sh
+scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
+source $scriptDir/common_routines.sh
 
 dbConnection=${dbConnection:-$1}
 COLL_ID=${COLL_ID:-$2}
14 changes: 0 additions & 14 deletions bin/db_scxa_common.sh

This file was deleted.

3 changes: 1 addition & 2 deletions bin/delete_collection.sh
@@ -2,8 +2,7 @@
 
 set -e
 
-scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-source $scriptDir/db_scxa_common.sh
+scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
 
 dbConnection=${dbConnection:-$1}
 COLL_ID=${COLL_ID:-$2}
4 changes: 2 additions & 2 deletions bin/delete_exp_from_collection.sh
@@ -2,8 +2,8 @@
 
 set -e
 
-scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-source $scriptDir/db_scxa_common.sh
+scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
+source $scriptDir/common_routines.sh
 
 dbConnection=${dbConnection:-$1}
 COLL_ID=${COLL_ID:-$2}
3 changes: 1 addition & 2 deletions bin/get_experiment_info.sh
@@ -2,8 +2,7 @@
 
 set -e
 
-scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-source $scriptDir/db_scxa_common.sh
+scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
 
 postgres_scripts_dir=$scriptDir/../postgres_routines
 
4 changes: 2 additions & 2 deletions bin/load_db_scxa_analytics.sh
@@ -12,8 +12,8 @@
 # - Postprocess table and attach it to the main scxa-analytics table.
 set -e
 
-scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-source $scriptDir/db_scxa_common.sh
+scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
+source $scriptDir/common_routines.sh
 
 postgres_scripts_dir=$scriptDir/../postgres_routines
 
4 changes: 2 additions & 2 deletions bin/load_db_scxa_analytics_pg9.sh
@@ -10,8 +10,8 @@
 # PG10, which loads each experiment into a different partition.
 set -e
 
-scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-source $scriptDir/db_scxa_common.sh
+scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
+source $scriptDir/common_routines.sh
 
 dbConnection=${dbConnection:-$1}
 EXP_ID=${EXP_ID:-$2}
4 changes: 2 additions & 2 deletions bin/load_db_scxa_cell_clusters.sh
@@ -5,8 +5,8 @@
 # scxa_cell_group_membership table of AtlasProd.
 set -e
 
-scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-source $scriptDir/db_scxa_common.sh
+scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
+source $scriptDir/common_routines.sh
 
 dbConnection=${dbConnection:-$1}
 EXP_ID=${EXP_ID:-$2}
6 changes: 3 additions & 3 deletions bin/load_db_scxa_dimred.sh
@@ -5,15 +5,15 @@
 # parameterisations, and loads it into the scxa_coords table of AtlasProd.
 set -e
 
-scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-source $scriptDir/db_scxa_common.sh
+scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
+source $scriptDir/common_routines.sh
 
 dbConnection=${dbConnection:-$1}
 EXP_ID=${EXP_ID:-$2}
 DIMRED_TYPE=${DIMRED_TYPE:-$3}
 DIMRED_FILE_PATH=${DIMRED_FILE_PATH:-$4}
 DIMRED_PARAM_JSON=${DIMRED_PARAM_JSON:-$5}
-SCRATCH_DIR=${SCRATCH_DIR:-"$(dirname ${DIMRED_FILE_PATH})"}
+SCRATCH_DIR=${SCRATCH_DIR:-"$DIMRED_FILE_PATH"}
 
 # Check that necessary environment variables are defined.
 [ -n ${dbConnection+x} ] || (echo "Env var dbConnection for the database connection needs to be defined. This includes the database name." && exit 1)
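A hedged invocation sketch for this loader, passing everything positionally as the parameter block above allows (the connection string, accession, file path and JSON are all made up):

    bin/load_db_scxa_dimred.sh \
      "postgresql://atlas:secret@pg.example.org:5432/gxpscxadev" \
      "E-MTAB-0000" \
      "umap" \
      "/path/to/E-MTAB-0000.umap.tsv" \
      '{"n_neighbors": 15}'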
4 changes: 2 additions & 2 deletions bin/load_db_scxa_marker_genes.sh
@@ -6,8 +6,8 @@
 # scxa_cell_groups_marker_genes table of AtlasProd.
 set -e
 
-scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-source $scriptDir/db_scxa_common.sh
+scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
+source $scriptDir/common_routines.sh
 
 dbConnection=${dbConnection:-$1}
 EXP_ID=${EXP_ID:-$2}
13 changes: 13 additions & 0 deletions bin/load_exp_design.awk
@@ -0,0 +1,13 @@
+# Return the index of the first field that matches the given pattern, or 0 if it’s not found
+{
+  for (i = 1; i <= NF; ++i) {
+    field = $i;
+    if (field ~ pattern) {
+      print i;
+      exit;
+    }
+  }
+
+  print 0;
+  exit;
+}
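load_exp_design.sh below drives this helper over the SDRF file; because the program exits on the first record, only the header line is ever inspected. A stand-alone sketch (the file name and column are made up):

    # Prints the 1-based index of the first tab-separated header column
    # matching the pattern, or 0 if none does
    awk -F '\t' -v pattern='^Characteristics ?\\[organism\\]$' \
      -f bin/load_exp_design.awk E-MTAB-0000.sdrf.txt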
91 changes: 52 additions & 39 deletions bin/load_exp_design.sh
@@ -1,46 +1,59 @@
 #!/usr/bin/env bash
 
 set -e
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
+source "${SCRIPT_DIR}/common_routines.sh"
 
-scriptDir=$(cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
-source $scriptDir/db_scxa_common.sh
-
+# Alfonso is bothered about dbConnection, it shouldn’t be camelCased because:
+# 1. It’s a constant, it should be DB_CONNECTION
+# 2. We use snake_case for Bash variables
 dbConnection=${dbConnection:-$1}
-condensed_sdrf_file=${CONDENSED_SDRF_FILE:-$2}
-sdrf_file=${SDRF_FILE:-$3}
-
-# Check that necessary environment variables are defined.
-[ -z ${dbConnection+x} ] && echo "Env var dbConnection for the database connection needs to be defined. This includes the database name." && exit 1
-[ -z ${CONDENSED_SDRF_FILE+x} ] && echo "Env var CONDENSED_SDRF_FILE for the experiment design data needs to be defined." && exit 1
-[ -z ${SDRF_FILE+x} ] && echo "Env var SDRF_FILE for column sequence of experiment design needs to be defined." && exit 1
-
-# Reason for creating this array is to search factor value column
-# In some sdrf files this column is mentioned as "Factor Value" and in some as "FactorValue"
-FactorArray=( FactorValue "Factor Value" )
-
-# for experiment design column table we need to have unique experiment accession, column name and sample type
-# as they are the primary key for the table and we don't want to insert duplicate rows
-cut -f 1,4,5 $condensed_sdrf_file | sort | uniq | while read exp_acc sample_type col_name;
-do
-  if [ $sample_type == 'characteristic' ]
-  then
-    search_column="Characteristics[${col_name}]"
-    column_order=$(awk -v val="$search_column" -F '\t' '{for (i=1; i<=NF; i++) if ($i==val) {print i} }' $sdrf_file)
-  else
-    for element in "${FactorArray[@]}"; do
-      search_column="$element[${col_name}]"
-      column_order=$(awk -v val="$search_column" -F '\t' '{for (i=1; i<=NF; i++) if ($i==val) {print i} }' $sdrf_file)
-      if [[ -n "${column_order}" ]]; then
-        break
-      fi
-    done
-  fi
-  echo "INSERT INTO exp_design_column (experiment_accession, column_name, sample_type, column_order) VALUES ('$exp_acc', '$col_name', '$sample_type', '$column_order');" | psql -v ON_ERROR_STOP=1 $dbConnection
+CONDENSED_SDRF_FILE=${CONDENSED_SDRF_FILE:-$2}
+SDRF_FILE=${SDRF_FILE:-$3}
+
+# Check that necessary environment variables are defined
+require_env_var "dbConnection"
+require_env_var "CONDENSED_SDRF_FILE"
+require_env_var "SDRF_FILE"
+checkDatabaseConnection "${dbConnection}"
+
+EXPERIMENT_ACCESSION=$(head -1 "${CONDENSED_SDRF_FILE}" | cut -f 1)
+DESTINATION_FILE=${SCRATCH_DIR:-${SCRIPT_DIR}}/${EXPERIMENT_ACCESSION}-exp-design.sql
+# Remove DESTINATION_FILE if it exists
+rm -f ${DESTINATION_FILE}
+
+# Create the file and enclose all INSERT statements in a transaction
+echo "BEGIN;" >> ${DESTINATION_FILE}
+
+# In the experiment design column table we use the experiment accession, column name and sample type as the primary key
+cut -f 1,4,5 "${CONDENSED_SDRF_FILE}" | sort | uniq | while read experiment_accession sample_type column_name; do
+  if [ "$sample_type" == 'characteristic' ]; then
+    sdrf_column_index=$(awk -F '\t' -v pattern="^Characteristics ?\\\[${column_name}\\\]$" -f ${SCRIPT_DIR}/load_exp_design.awk ${SDRF_FILE})
+  else
+    sdrf_column_index=$(awk -F '\t' -v pattern="^Factor ?Value ?\\\[${column_name}\\\]$" -f ${SCRIPT_DIR}/load_exp_design.awk ${SDRF_FILE})
+  fi
+  sql_statement="INSERT INTO exp_design_column (experiment_accession, sample_type, column_name, column_order) VALUES ('${experiment_accession}', '${sample_type}', '${column_name}', '${sdrf_column_index}');"
+  echo "${sql_statement}" >> ${DESTINATION_FILE}
 done
 
-while IFS=$'\t' read exp_acc sample sample_type col_name annot_value annot_url
-do
-  echo "INSERT INTO exp_design (sample, annot_value, annot_ont_uri, exp_design_column_id) VALUES ('$sample', '$annot_value', '$annot_url', (SELECT id FROM exp_design_column WHERE experiment_accession='$exp_acc' AND column_name='$col_name' AND sample_type='$sample_type'));" | psql -v ON_ERROR_STOP=1 $dbConnection
-done < $condensed_sdrf_file
+# Add the columns from the condensed SDRF file.
+# Fields in the condensed SDRF that aren’t in the SDRF are assigned a column_order value of 0 by the AWK script.
+# We need to assign them a value that is greater than the maximum column_order value for the experiment.
+# The column_order value is used to order the columns in the UI and is not used for the primary key, so it’s ok to have
+# duplicates; we can order the fields with the same column_order by name if necessary.
+sql_statement="UPDATE exp_design_column SET column_order=(SELECT MAX(column_order) FROM exp_design_column WHERE experiment_accession='${EXPERIMENT_ACCESSION}')+1 WHERE column_order=0 AND experiment_accession='${EXPERIMENT_ACCESSION}';"
+echo "${sql_statement}" >> ${DESTINATION_FILE}
+
+# Insert the experiment design data.
+while IFS=$'\t' read -r experiment_accession sample sample_type column_name annotation_value annotation_url; do
+  sql_statement="INSERT INTO exp_design (sample, annot_value, annot_ont_uri, exp_design_column_id) VALUES ('${sample}', '${annotation_value}', '${annotation_url}', (SELECT id FROM exp_design_column WHERE experiment_accession='${experiment_accession}' AND column_name='${column_name}' AND sample_type='${sample_type}'));"
+  echo "${sql_statement}" >> ${DESTINATION_FILE}
+done < "$CONDENSED_SDRF_FILE"
+
+# Finish the transaction
+echo "COMMIT;" >> ${DESTINATION_FILE}
+
+PSQL_CMD="psql -qv ON_ERROR_STOP=1 ${dbConnection} -f ${DESTINATION_FILE}"
+echo ${PSQL_CMD}
+eval ${PSQL_CMD}
-
-echo "Experiment design data done loading for $condensed_sdrf_file"
+echo "$CONDENSED_SDRF_FILE: finished loading experiment design"
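For reference, a hedged sketch of how the rewritten loader might be driven (paths, accession and credentials are all made up; the experiment accession is actually read from column 1 of the condensed SDRF):

    export dbConnection="postgresql://atlas:secret@pg.example.org:5432/gxpscxadev"
    export CONDENSED_SDRF_FILE=/path/to/E-MTAB-0000.condensed-sdrf.tsv
    export SDRF_FILE=/path/to/E-MTAB-0000.sdrf.txt
    export SCRATCH_DIR=/tmp

    # Writes /tmp/E-MTAB-0000-exp-design.sql, then executes it in one psql transaction
    bin/load_exp_design.sh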
24 changes: 7 additions & 17 deletions bin/load_experiment_web_cli.sh
@@ -1,44 +1,34 @@
 #!/usr/bin/env bash
 #
 # This script:
-# - Checks if the experiment is loaded and stops it is already loaded.
-# - Adds the appropiate line to the experiments table if it doesn't exist.
-# - Generates the experiment design file from condensed SDRF and SDRF files in $EXPERIMENT_FILES/expdesign
+# - Checks if the experiment is loaded and stops if it is already loaded.
+# - Adds the appropriate line to the experiments table if it doesn't exist.
+# - Generates the experiment design file from condensed SDRF and SDRF files in $EXPERIMENT_DESIGN_FILES
 #
 # Most of the variables required for this are usually defined in the environment file for each setup (test, prod, etc).
-# The experiment designs file might need to be synced to an appropiate location at the web application instance disk
+# The experiment designs file might need to be synced to an appropriate location at the web application instance disk
 # depending on how the setup disk layout.
 
 jar_dir=$CONDA_PREFIX/share/atlas-cli
 
-scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
 source $scriptDir/common_routines.sh
 
 echo "CONDA_PREFIX: $CONDA_PREFIX"
 
 require_env_var "SOLR_HOST"
-require_env_var "ZK_HOST"
-require_env_var "ZK_PORT"
 require_env_var "BIOENTITIES"
-require_env_var "EXPERIMENT_FILES"
+require_env_var "EXPERIMENT_DESIGN_FILES"
 require_env_var "jdbc_url"
 require_env_var "jdbc_username"
 require_env_var "jdbc_password"
 
 # Either ACCESSIONS or PRIVATE_ACCESSIONS need to be provided
 #require_env_var "ACCESSIONS"
 
-SOLR_PORT=$(get_port_from_hostport $SOLR_HOST)
-SOLR_HOST=$(get_host_from_hostport $SOLR_HOST)
-
-require_env_var "SOLR_PORT"
-
 java_opts="-Dsolr.host=$SOLR_HOST"
-java_opts="$java_opts -Dsolr.port=$SOLR_PORT"
-java_opts="$java_opts -Dzk.host=$ZK_HOST"
-java_opts="$java_opts -Dzk.port=$ZK_PORT"
 java_opts="$java_opts -Ddata.files.location=$BIOENTITIES"
-java_opts="$java_opts -Dexperiment.files.location=$EXPERIMENT_FILES"
+java_opts="$java_opts -Dexperiment.design.location=$EXPERIMENT_DESIGN_FILES"
 java_opts="$java_opts -Djdbc.url=$jdbc_url"
 java_opts="$java_opts -Djdbc.username=$jdbc_username"
 java_opts="$java_opts -Djdbc.password=$jdbc_password"
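A hedged sketch of the environment such a run expects after this change (every value below is made up):

    # Hypothetical environment file for a test setup
    export SOLR_HOST=solr.example.org:8983
    export BIOENTITIES=/nfs/production/bioentity_properties
    export EXPERIMENT_DESIGN_FILES=/nfs/production/scxa/expdesign
    export jdbc_url=jdbc:postgresql://pg.example.org:5432/gxpscxadev
    export jdbc_username=atlas
    export jdbc_password=secret
    export ACCESSIONS=E-MTAB-0000    # or PRIVATE_ACCESSIONS

    bin/load_experiment_web_cli.sh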
3 changes: 1 addition & 2 deletions bin/modify_collection.sh
@@ -2,8 +2,7 @@
 
 set -e
 
-scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-source $scriptDir/db_scxa_common.sh
+scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
 
 dbConnection=${dbConnection:-$1}
 COLL_ID=${COLL_ID:-$2}
16 changes: 3 additions & 13 deletions bin/update_experiment_web_cli.sh
@@ -11,32 +11,22 @@
 
 jar_dir=$CONDA_PREFIX/share/atlas-cli
 
-scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+scriptDir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )
 source $scriptDir/common_routines.sh
 
 echo "CONDA_PREFIX: $CONDA_PREFIX"
 
 require_env_var "SOLR_HOST"
-require_env_var "ZK_HOST"
-require_env_var "ZK_PORT"
 require_env_var "BIOENTITIES"
-require_env_var "EXPERIMENT_FILES"
+require_env_var "EXPERIMENT_DESIGN_FILES"
 require_env_var "jdbc_url"
 require_env_var "jdbc_username"
 require_env_var "jdbc_password"
 require_env_var "ACCESSIONS"
 
-SOLR_PORT=$(get_port_from_hostport $SOLR_HOST)
-SOLR_HOST=$(get_host_from_hostport $SOLR_HOST)
-
-require_env_var "SOLR_PORT"
-
 java_opts="-Dsolr.host=$SOLR_HOST"
-java_opts="$java_opts -Dsolr.port=$SOLR_PORT"
-java_opts="$java_opts -Dzk.host=$ZK_HOST"
-java_opts="$java_opts -Dzk.port=$ZK_PORT"
 java_opts="$java_opts -Ddata.files.location=$BIOENTITIES"
-java_opts="$java_opts -Dexperiment.files.location=$EXPERIMENT_FILES"
+java_opts="$java_opts -Dexperiment.design.location=$EXPERIMENT_DESIGN_FILES"
 java_opts="$java_opts -Djdbc.url=$jdbc_url"
 java_opts="$java_opts -Djdbc.username=$jdbc_username"
 java_opts="$java_opts -Djdbc.password=$jdbc_password"
2 changes: 1 addition & 1 deletion fixtures/generate-fixtures.sh
@@ -22,7 +22,7 @@
 
 # https://stackoverflow.com/questions/59895/how-can-i-get-the-source-directory-of-a-bash-script-from-within-the-script-itsel
 # https://stackoverflow.com/a/246128
-SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )"
 
 export POSTGRES_HOST=${POSTGRES_HOST:-localhost}
 export POSTGRES_PORT=${POSTGRES_PORT:-5432}
2 changes: 1 addition & 1 deletion fixtures/generate-tsv-fixture.sh
@@ -10,7 +10,7 @@
 # We choose 100 genes that are expressed in those cells via analytics table
 
 set -e
-SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]:-$0}" )" &> /dev/null && pwd )"
 
 source ${SCRIPT_DIR}/utils.sh
 