Biomart mappings testable by species and less of Atlas Prod env var (#6)

* Initial changes, wip still * Makes bioentities properties configurable for ensemblUpdate * Allows reuse of ontology files for faster testing (not by default) * Allows setting interpro version * More ATLAS_PROD removal * Removes unused imports * Avoids ATLAS_PROD variable usage in PropertiesAdequate * Avoids hardcoded paths for Directories, makes experiments paths configurable. * Adds example testing directory structure * Some documentation * Avoids ATLAS_PROD env var * More documentation * Makes annotation sources configurable * Main method for mapping validation only * Adds env dockerfile * Missing default y for apt-get install * Jenkins k8s integration seems to be mounting stuff on /usr/local/bin (ouch!) * Exits with error code on bad validation test. * Revert "Jenkins k8s integration seems to be mounting stuff on /usr/local/bin (ouch!)" This reverts commit 7214272. * Change connection protocol of E! sites to HTTPS * Updates documentation a bit. * Exports PATH_BIOENTITTY.... * Add TODO to handle correctly missing dataset error. * Adds explanation on how to use ANNOTATION_SOURCES
ebi-gene-expression-group · Oct 19, 2018 · 8f0da81 · 8f0da81
1 parent 1fbe68f
commit 8f0da81
Show file tree

Hide file tree

Showing 15 changed files with 302 additions and 68 deletions.
diff --git a/Dockerfile_base b/Dockerfile_base
@@ -0,0 +1,3 @@
+FROM lolhens/ammonite:latest
+
+RUN apt-get update -y && apt-get install -y git
diff --git a/README.md b/README.md
@@ -10,6 +10,9 @@ Version 1.0.0 was used for the August/September 2018 Atlas (bulk and single cell
 src - only java and [Ammonite](http://www.lihaoyi.com/Ammonite/)
 ensemblUpdate.sh - various bash utilities,mysql, environment variable $ATLAS_PROD (see util/create_test_env.sh to work with this script)
 
+We are in the process of detaching this from our direct filesystem dependencies. As such, the use of $ATLAS_PROD is being replaced
+everywhere to point more specifically to the exact needs of each script.
+
 ### Entry points
 
 `sh/ensembl/ensemblUpdate.sh`
@@ -18,6 +21,11 @@ the entry point to the annotations update process
 `sh/atlas_species.sh`
 Regenerate the species file based on annotation sources config
 
+`amm -s src/pipeline/retrieve/Retrieve.sc`
+Runs only the BioMart mapping verification for defined organisms (depends on the organisms file inside either
+`annsrcs` or the overriding `$ANNOTATION_SOURCES` path). These tests are automated in our internal Jenkins setup (http://193.62.52.166:30752/jenkins)
+under the `Ensembl Update` tab.
+
 ### Structure
 
 #### ./annsrcs

diff --git a/sh/ensembl/ensemblUpdate.sh b/sh/ensembl/ensemblUpdate.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 # I used to source this script from the same (prod or test) Atlas environment as this script
 # scriptDir=$(cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
@@ -16,29 +16,33 @@ NEW_ENSEMBL_REL=$1
 NEW_ENSEMBLGENOMES_REL=$2
 NEW_WBPS_REL=$3
 
+export PATH_BIOENTITY_PROPERTIES=${PATH_BIOENTITY_PROPERTIES:-$ATLAS_PROD/bioentity_properties}
+
 function symlinkAndArchive() {
+    # Repoints the symbolic link $1 (the current bioentity property placeholder) at the new release
+    # directory $2, creating $2 if needed. -L is tested too because -e is false for a dangling link.
     mkdir -p "$2"
     if [[ -e "$1" || -L "$1" ]] ; then
         rm "$1"
     fi
     ln -s "$2" "$1"
 }
 echo "Shifting the symlinks to new versions of Ensembl, Ensembl Genomes and WBPS"
-symlinkAndArchive $ATLAS_PROD/bioentity_properties/ensembl $ATLAS_PROD/bioentity_properties/archive/ensembl_${NEW_ENSEMBL_REL}_${NEW_ENSEMBLGENOMES_REL}
-symlinkAndArchive $ATLAS_PROD/bioentity_properties/reactome $ATLAS_PROD/bioentity_properties/archive/reactome_ens${NEW_ENSEMBL_REL}_${NEW_ENSEMBLGENOMES_REL}
-symlinkAndArchive $ATLAS_PROD/bioentity_properties/go $ATLAS_PROD/bioentity_properties/archive/go_ens${NEW_ENSEMBL_REL}_${NEW_ENSEMBLGENOMES_REL}
-symlinkAndArchive $ATLAS_PROD/bioentity_properties/wbps $ATLAS_PROD/bioentity_properties/archive/wbps_${NEW_WBPS_REL}
-symlinkAndArchive $ATLAS_PROD/bioentity_properties/array_designs/current $ATLAS_PROD/bioentity_properties/archive/array_designs_${NEW_ENSEMBL_REL}_${NEW_ENSEMBLGENOMES_REL}_${NEW_WBPS_REL}
-symlinkAndArchive $ATLAS_PROD/bioentity_properties/annotations/ensembl $ATLAS_PROD/bioentity_properties/archive/annotations_ensembl_${NEW_ENSEMBL_REL}_${NEW_ENSEMBLGENOMES_REL}
-symlinkAndArchive $ATLAS_PROD/bioentity_properties/annotations/wbps $ATLAS_PROD/bioentity_properties/archive/annotations_wbps_${NEW_WBPS_REL}
+symlinkAndArchive $PATH_BIOENTITY_PROPERTIES/ensembl $PATH_BIOENTITY_PROPERTIES/archive/ensembl_${NEW_ENSEMBL_REL}_${NEW_ENSEMBLGENOMES_REL}
+symlinkAndArchive $PATH_BIOENTITY_PROPERTIES/reactome $PATH_BIOENTITY_PROPERTIES/archive/reactome_ens${NEW_ENSEMBL_REL}_${NEW_ENSEMBLGENOMES_REL}
+symlinkAndArchive $PATH_BIOENTITY_PROPERTIES/go $PATH_BIOENTITY_PROPERTIES/archive/go_ens${NEW_ENSEMBL_REL}_${NEW_ENSEMBLGENOMES_REL}
+symlinkAndArchive $PATH_BIOENTITY_PROPERTIES/wbps $PATH_BIOENTITY_PROPERTIES/archive/wbps_${NEW_WBPS_REL}
+symlinkAndArchive $PATH_BIOENTITY_PROPERTIES/array_designs/current $PATH_BIOENTITY_PROPERTIES/archive/array_designs_${NEW_ENSEMBL_REL}_${NEW_ENSEMBLGENOMES_REL}_${NEW_WBPS_REL}
+symlinkAndArchive $PATH_BIOENTITY_PROPERTIES/annotations/ensembl $PATH_BIOENTITY_PROPERTIES/archive/annotations_ensembl_${NEW_ENSEMBL_REL}_${NEW_ENSEMBLGENOMES_REL}
+symlinkAndArchive $PATH_BIOENTITY_PROPERTIES/annotations/wbps $PATH_BIOENTITY_PROPERTIES/archive/annotations_wbps_${NEW_WBPS_REL}
 
 echo "Fetching the latest GO mappings..."
 # This needs to be done because we need to replace any alternative GO ids in Ensembl mapping files with their canonical equivalents
-$PROJECT_ROOT/sh/go/fetchGoIDToTermMappings.sh ${ATLAS_PROD}/bioentity_properties/go
+$PROJECT_ROOT/sh/go/fetchGoIDToTermMappings.sh $PATH_BIOENTITY_PROPERTIES/go
 
 echo "Fetching the latest Interpro mappings..."
 # I've only put it here for symmetry - we currently do not transform on Interpro file output
-$PROJECT_ROOT/sh/interpro/fetchInterproIDToTypeTermMappings.sh ${ATLAS_PROD}/bioentity_properties/interpro
+$PROJECT_ROOT/sh/interpro/fetchInterproIDToTypeTermMappings.sh $PATH_BIOENTITY_PROPERTIES/interpro
 
 pushd $PROJECT_ROOT
 echo "Obtain the mapping files from biomarts based on annotation sources"
@@ -50,86 +54,86 @@ echo "Fetching the synonyms from biomart databases..."
 $PROJECT_ROOT/sh/ensembl/fetchGeneSynonyms.sh
 
 echo "Merge all individual Ensembl property files into matrices"
-for species in $(find -L ${ATLAS_PROD}/bioentity_properties/ensembl -name '*tsv' -type f | xargs -n 1 basename | awk -F"." '{print $1}' | sort -u ); do
+for species in $(find -L $PATH_BIOENTITY_PROPERTIES/ensembl -name '*tsv' -type f | xargs -n 1 basename | awk -F"." '{print $1}' | sort -u ); do
     for bioentity in ensgene enstranscript ensprotein; do
-        mergedFile=${ATLAS_PROD}/bioentity_properties/annotations/ensembl/$species.$bioentity.tsv
+        mergedFile=$PATH_BIOENTITY_PROPERTIES/annotations/ensembl/$species.$bioentity.tsv
         [[ -s $mergedFile ]] \
         || $PROJECT_ROOT/sh/ensembl/mergePropertiesIntoMatrix.pl \
-            -indir ${ATLAS_PROD}/bioentity_properties/ensembl \
+            -indir $PATH_BIOENTITY_PROPERTIES/ensembl \
             -species $species -bioentity $bioentity \
         > $mergedFile
     done
 done
 
 # Do the same for WBPS.
 echo "Merge all individual WBPS property files into matrices"
-for species in $(find -L ${ATLAS_PROD}/bioentity_properties/wbps -name '*tsv' -type f | xargs -n 1 basename | awk -F"." '{print $1}' | sort -u ); do
+for species in $(find -L $PATH_BIOENTITY_PROPERTIES/wbps -name '*tsv' -type f | xargs -n 1 basename | awk -F"." '{print $1}' | sort -u ); do
     for bioentity in wbpsgene wbpsprotein wbpstranscript; do
-        mergedFile=${ATLAS_PROD}/bioentity_properties/annotations/wbps/$species.$bioentity.tsv
+        mergedFile=$PATH_BIOENTITY_PROPERTIES/annotations/wbps/$species.$bioentity.tsv
         [[ -s $mergedFile ]] \
         || $PROJECT_ROOT/sh/ensembl/mergePropertiesIntoMatrix.pl \
-            -indir ${ATLAS_PROD}/bioentity_properties/wbps \
+            -indir $PATH_BIOENTITY_PROPERTIES/wbps \
             -species $species -bioentity $bioentity \
         > $mergedFile
     done
 done
 
 # Create files that will be loaded into the database.
-echo "Generate ${ATLAS_PROD}/bioentity_properties/bioentityOrganism.dat file"
-$PROJECT_ROOT/sh/prepare_bioentityorganisms_forloading.sh ${ATLAS_PROD}/bioentity_properties
+echo "Generate $PATH_BIOENTITY_PROPERTIES/bioentityOrganism.dat file"
+$PROJECT_ROOT/sh/prepare_bioentityorganisms_forloading.sh $PATH_BIOENTITY_PROPERTIES
 
 # Apply sanity test
-size=`wc -l ${ATLAS_PROD}/bioentity_properties/bioentityOrganism.dat | awk '{print $1}'`
+size=`wc -l $PATH_BIOENTITY_PROPERTIES/bioentityOrganism.dat | awk '{print $1}'`
 if [ "$size" -lt 200 ]; then
     echo "ERROR: Something went wrong with populating bioentityOrganism.dat file - should have more than 200 rows"
     exit 1
 fi
 
 
-echo "Generate ${ATLAS_PROD}/bioentity_properties/bioentityName.dat file"
+echo "Generate $PATH_BIOENTITY_PROPERTIES/bioentityName.dat file"
 echo "... Generate miRBase component"
-rm -rf ${ATLAS_PROD}/bioentity_properties/mirbase/miRNAName.dat
+rm -rf $PATH_BIOENTITY_PROPERTIES/mirbase/miRNAName.dat
 $PROJECT_ROOT/sh/mirbase/prepare_mirbasenames_forloading.sh
 
 echo "... Generate Ensembl component"
-find -L $ATLAS_PROD/bioentity_properties/ensembl -name '*ensgene.symbol.tsv' \
-| xargs $PROJECT_ROOT/sh/ensembl/prepare_names_for_loading.sh $ATLAS_PROD/bioentity_properties/bioentityOrganism.dat \
-> ${ATLAS_PROD}/bioentity_properties/ensembl/geneName.dat
+find -L $PATH_BIOENTITY_PROPERTIES/ensembl -name '*ensgene.symbol.tsv' \
+| xargs $PROJECT_ROOT/sh/ensembl/prepare_names_for_loading.sh $PATH_BIOENTITY_PROPERTIES/bioentityOrganism.dat \
+> $PATH_BIOENTITY_PROPERTIES/ensembl/geneName.dat
 
 echo "... Generate WBPS component"
-find -L $ATLAS_PROD/bioentity_properties/wbps -name '*wbpsgene.symbol.tsv' \
-| xargs $PROJECT_ROOT/sh/ensembl/prepare_names_for_loading.sh $ATLAS_PROD/bioentity_properties/bioentityOrganism.dat \
-> ${ATLAS_PROD}/bioentity_properties/wbps/wbpsgeneName.dat
+find -L $PATH_BIOENTITY_PROPERTIES/wbps -name '*wbpsgene.symbol.tsv' \
+| xargs $PROJECT_ROOT/sh/ensembl/prepare_names_for_loading.sh $PATH_BIOENTITY_PROPERTIES/bioentityOrganism.dat \
+> $PATH_BIOENTITY_PROPERTIES/wbps/wbpsgeneName.dat
 
 echo "Merge miRNAName.dat, geneName.dat and wbpsgeneName.dat into bioentityName.dat"
-cp ${ATLAS_PROD}/bioentity_properties/mirbase/miRNAName.dat ${ATLAS_PROD}/bioentity_properties/bioentityName.dat
-cat ${ATLAS_PROD}/bioentity_properties/ensembl/geneName.dat >> ${ATLAS_PROD}/bioentity_properties/bioentityName.dat
-cat ${ATLAS_PROD}/bioentity_properties/wbps/wbpsgeneName.dat >> ${ATLAS_PROD}/bioentity_properties/bioentityName.dat
+cp $PATH_BIOENTITY_PROPERTIES/mirbase/miRNAName.dat $PATH_BIOENTITY_PROPERTIES/bioentityName.dat
+cat $PATH_BIOENTITY_PROPERTIES/ensembl/geneName.dat >> $PATH_BIOENTITY_PROPERTIES/bioentityName.dat
+cat $PATH_BIOENTITY_PROPERTIES/wbps/wbpsgeneName.dat >> $PATH_BIOENTITY_PROPERTIES/bioentityName.dat
 # Apply sanity test
-size=`wc -l ${ATLAS_PROD}/bioentity_properties/bioentityName.dat | awk '{print $1}'`
+size=`wc -l $PATH_BIOENTITY_PROPERTIES/bioentityName.dat | awk '{print $1}'`
 if [ "$size" -lt 1000000 ]; then
     echo "ERROR: Something went wrong with populating bioentityName.dat file - should have more than 1mln rows, only had $size"
     exit 1
 fi
 
 nonuniqueArrayDesignFiles=$(
-    find -L $ATLAS_PROD/bioentity_properties/array_designs -name '*A-*.tsv' \
+    find -L $PATH_BIOENTITY_PROPERTIES/array_designs -name '*A-*.tsv' \
     | xargs -n 1 basename \
     | sort \
     | uniq -d )
 
 if [[ -n "$nonuniqueArrayDesignFiles" ]] ; then
-    echo "ERROR: Check $ATLAS_PROD/bioentity_properties/array_designs/backfill - no need to backfill for: " $nonuniqueArrayDesignFiles
+    echo "ERROR: Check $PATH_BIOENTITY_PROPERTIES/array_designs/backfill - no need to backfill for: " $nonuniqueArrayDesignFiles
     exit 1
 fi
 
-echo "Generate ${ATLAS_PROD}/bioentity_properties/designelementMapping.dat file"
-find -L $ATLAS_PROD/bioentity_properties/array_designs -name '*A-*.tsv' \
+echo "Generate $PATH_BIOENTITY_PROPERTIES/designelementMapping.dat file"
+find -L $PATH_BIOENTITY_PROPERTIES/array_designs -name '*A-*.tsv' \
     | xargs $PROJECT_ROOT/sh/prepare_array_designs_for_loading.sh  \
-    > ${ATLAS_PROD}/bioentity_properties/designelementMapping.dat
+    > $PATH_BIOENTITY_PROPERTIES/designelementMapping.dat
 
 # Apply sanity test
-size=`wc -l ${ATLAS_PROD}/bioentity_properties/designelementMapping.dat | awk '{print $1}'`
+size=`wc -l $PATH_BIOENTITY_PROPERTIES/designelementMapping.dat | awk '{print $1}'`
 if [ "$size" -lt 2000000 ]; then
     echo "ERROR: Something went wrong with populating designelementMapping.dat file - should have more than 2mln rows"
     exit 1
@@ -138,7 +142,7 @@ fi
 echo "Fetching the latest Reactome mappings..."
 # This needs to be done because some of Reactome's pathways are mapped to UniProt accessions only, hence so as to map them to
 # gene ids - we need to use the mapping files we've just retrieved from Ensembl
-$PROJECT_ROOT/sh/reactome/fetchAllReactomeMappings.sh $ATLAS_PROD/bioentity_properties/reactome/
+$PROJECT_ROOT/sh/reactome/fetchAllReactomeMappings.sh $PATH_BIOENTITY_PROPERTIES/reactome/
 
 echo "Downloading gtfs..."
 $PROJECT_ROOT/sh/gtf/download_gtfs.sh "$NEW_ENSEMBL_REL" "$NEW_ENSEMBLGENOMES_REL" "$NEW_WBPS_REL"
diff --git a/sh/go/fetchGoIDToTermMappings.sh b/sh/go/fetchGoIDToTermMappings.sh
@@ -1,9 +1,15 @@
-#!/bin/bash
+#!/usr/bin/env bash
 # This script retrieves the latest mapping between GO ids and terms
 # Author: [email protected], [email protected]
 set -euo pipefail
+
 PROJECT_ROOT=`dirname $0`/../..
 export JAVA_OPTS=-Xmx3000M
+
+# To avoid using the depths functionality, set env var $GET_GO_DEPTHS to something different to "yes".
+GET_GO_DEPTHS=${GET_GO_DEPTHS:-"yes"}
+# To reuse the go.owl/po.owl files already present in the output directory instead of downloading
+# them, set env var $USE_EXISTING_ONTOLOGY_FILES to something different to "no".
+USE_EXISTING_ONTOLOGY_FILES=${USE_EXISTING_ONTOLOGY_FILES:-"no"}
+
 IFS="
 "
 outputDir=$1
@@ -13,8 +19,10 @@ if [[ -z "$outputDir" ]]; then
 fi
 
 echo "Fetching GO and PO owl files"
-curl -s "http://geneontology.org/ontology/go.owl" > $outputDir/go.owl
-curl -s "http://palea.cgrb.oregonstate.edu/viewsvn/Poc/tags/live/plant_ontology.owl?view=co" > $outputDir/po.owl
+if [ $USE_EXISTING_ONTOLOGY_FILES == "no" ]; then
+   curl -s "http://geneontology.org/ontology/go.owl" > $outputDir/go.owl
+   curl -s "http://palea.cgrb.oregonstate.edu/viewsvn/Poc/tags/live/plant_ontology.owl?view=co" > $outputDir/po.owl
+fi
 
 echo "Extracting GO id -> term"
 amm -s $PROJECT_ROOT/src/go/PropertiesFromOwlFile.sc terms $outputDir/go.owl \
@@ -62,7 +70,13 @@ from
 group by go_id, ancestor_id;" | sqlplus goselect/selectgo@goapro | grep '^GO:'  | sort -t$'\t' -rk2,2  | awk -F"\t" '{ print $1"\t"$2+1 }' | sort -buk1,1
 }
 
-get_ontology_id2Depth_mappings > $outputDir/goIDToDepth.tsv
+if [ $GET_GO_DEPTHS == "yes" ]; then
+  get_ontology_id2Depth_mappings > $outputDir/goIDToDepth.tsv
+else
+  # write file with just zero depths
+  echo "As requested, ignoring GO depths and writing dummy file with all depths 0 based on $outputDir/goIDToTerm.tsv ."
+  awk -F'\t' '{ print $1"\t0" }' $outputDir/goIDToTerm.tsv > $outputDir/goIDToDepth.tsv
+fi
 
 
 # Append Plant Ontology terms at the end of the Gene Ontology file (Ensembl provides Plant Ontology (PO) and Gene Ontology (GO) terms - as GO terms)

diff --git a/sh/interpro/fetchInterproIDToTypeTermMappings.sh b/sh/interpro/fetchInterproIDToTypeTermMappings.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
 # This script retrieves the latest mapping between Interpro ids and their types (family/domain) and terms
 # Author: [email protected]
 PROJECT_ROOT=`dirname $0`/../..
@@ -8,8 +10,10 @@ if [[ -z "$outputDir" ]]; then
     exit 1
 fi
 
-curl -s ftp://ftp.ebi.ac.uk/pub/databases/interpro/62.0/interpro.xml.gz | zcat > $outputDir/interpro.xml
-curl -s ftp://ftp.ebi.ac.uk/pub/databases/interpro/62.0/interpro.dtd > $outputDir/interpro.dtd
+INTERPRO_VERSION=${INTERPRO_VERSION:-"62.0"}
+
+curl -s ftp://ftp.ebi.ac.uk/pub/databases/interpro/$INTERPRO_VERSION/interpro.xml.gz | zcat > $outputDir/interpro.xml
+curl -s ftp://ftp.ebi.ac.uk/pub/databases/interpro/$INTERPRO_VERSION/interpro.dtd > $outputDir/interpro.dtd
 
 pushd $PROJECT_ROOT
 echo "Parse the file we obtained from Interpro's FTP site"

diff --git a/src/Directories.sc b/src/Directories.sc
@@ -5,18 +5,24 @@ if (!(PROJECT_ROOT/"annsrcs").isDir){
   throw new RuntimeException("Annotations directory not found, possibly ammonite calls this from a wrong place: "+(PROJECT_ROOT/"annsrcs"))
 }
 
-lazy val ATLAS_PROD = Option(System.getenv.get("ATLAS_PROD")).map(Path(_)).filter(_.isDir) match {
+
+lazy val PATH_BIOENTITY_PROPERTIES = Option(System.getenv.get("PATH_BIOENTITY_PROPERTIES")).map(Path(_)).filter(_.isDir) match {
   case Some(path)
-    => path
+  => path
   case None
-    => {
-      throw new RuntimeException("export $ATLAS_PROD as an environment variable")
-      null
-    }
+  => {
+    throw new RuntimeException("export $PATH_BIOENTITY_PROPERTIES as an environment variable")
+    null
+  }
 }
 
+/*
+ TODO within $PATH_BIOENTITY_PROPERTIES we should check that certain directories exist,
+ such as go, interpro, etc.
+ */
+
 lazy val alternativeToCanonicalGoTermMapping = {
-  (read.lines!(ATLAS_PROD / "bioentity_properties" /"go" / "go.alternativeID2CanonicalID.tsv"))
+  (read.lines!(PATH_BIOENTITY_PROPERTIES /"go" / "go.alternativeID2CanonicalID.tsv"))
   .flatMap{ case line =>
       line.split("\t").toList match {
         case List(mapping, mapped)
@@ -27,18 +33,62 @@ lazy val alternativeToCanonicalGoTermMapping = {
   }.toMap
 }
 
+/**
+  * Default paths for annotation sources, within the project.
+  * If you wish to override them, set the env var ANNOTATION_SOURCES
+  * to a list of paths separated by colon (:). This is useful to
+  * run the analysis for a single organism. For instance, if you copy
+  * cp annsrcs/ensembl/homo_sapiens my_directory/ensembl/homo_sapiens
+  * cp annsrcs/wbps/c_elegans my_directory/wbps_v2/c_elegans
+  * and then set up:
+  * export ANNOTATION_SOURCES=my_directory/ensembl:my_directory/wbps_v2
+  * and then run the code, it will only run for those organisms.
+  * You could of course add multiple organisms to a single directory and give it without :
+  * to ANNOTATION_SOURCES.
+  */
 val annsrcsPath = PROJECT_ROOT/"annsrcs"/"ensembl"
 val wbpsAnnsrcsPath = PROJECT_ROOT/"annsrcs"/"wbps"
 
+// Candidate annotation source entries: the contents of every existing directory named in the
+// colon-separated ANNOTATION_SOURCES env var or, when that variable is unset, the contents of
+// the default wbps and ensembl directories inside the project.
+lazy val ANNOTATION_SOURCES: Seq[Path] = {
+  val overridingDirs = Option(System.getenv.get("ANNOTATION_SOURCES"))
+    .map(_.split(":").map(Path(_)).filter(exists).filter(_.isDir))
+  overridingDirs match {
+    case Some(dirs) => dirs.toList.flatMap(dir => ls! dir)
+    case None => (ls! wbpsAnnsrcsPath) ++ (ls! annsrcsPath)
+  }
+}
+
 def annotationSources: Seq[Path] =
-  ((ls! wbpsAnnsrcsPath) ++ (ls! annsrcsPath))
+  ANNOTATION_SOURCES
   .filter{ case path =>
     path.isFile && path.segments.last.matches("[a-z]+_[a-z]+")
   }
 
-lazy val ANALYSIS_EXPERIMENTS = List(
-  ls (Directories.ATLAS_PROD / "analysis" / "baseline" / "proteomics" / "experiments"),
-  ls (Directories.ATLAS_PROD / "analysis" / "baseline" / "rna-seq" / "experiments"),
-  ls (Directories.ATLAS_PROD / "analysis" / "differential" / "microarray" / "experiments"),
-  ls (Directories.ATLAS_PROD / "analysis" / "differential" / "rna-seq" / "experiments")
-).flatten.filter(_.name startsWith "E-")
+/**
+ EXPERIMENT_SOURCES should be defined as an environment variable as a list of
+ colon (:) delimited paths where one would expect to find experiment directories
+
+ In particular for production, one would expect here paths to baseline proteomics experiments,
+ baseline rna-seq experiments, differential microarray experiments, differential rna-seq experiments, etc.
+
+ TODO add a warning when a given path is not considered valid.
+
+ WARNING: some classes use elements in the path to make certain decisions,
+ for instance array design related classes expect to find "microarray" in certain
+ paths. TODO move away from such path-based decisions.
+ */
+val EXPERIMENT_SOURCES: List[Path] = Option(System.getenv.get("EXPERIMENT_SOURCES"))
+  .map(_.split(":").map(Path(_)).filter(exists).filter(_.isDir))
+match {
+  case Some(paths) => paths.toList
+  case None => {
+    throw new RuntimeException("export $EXPERIMENT_SOURCES as an environment variable, where each directory with experiments is separated by a colon :")
+    null
+  }
+}
+
+// Report the experiment source directories in use. foreach rather than map, because only the
+// printing side effect is wanted and map would build a List[Unit] that is thrown away.
+println("Using the following paths for sources of experiments:")
+EXPERIMENT_SOURCES.foreach(println(_))
+
+/*
+ Every entry found directly under an experiment source path whose name starts with 'E-'
+ */
+lazy val ANALYSIS_EXPERIMENTS =
+  for {
+    sourceDir <- EXPERIMENT_SOURCES
+    entry <- ls(sourceDir)
+    if entry.name.startsWith("E-")
+  } yield entry
diff --git a/src/atlas/AtlasSpecies.sc b/src/atlas/AtlasSpecies.sc
@@ -1,14 +1,10 @@
 import $ivy.`org.json4s:json4s-native_2.12:3.5.0`
-import org.json4s._
 import org.json4s.native.JsonMethods._
 import org.json4s.JsonDSL._
 import $file.^.property.AnnotationSource
 import AnnotationSource.AnnotationSource
 import $file.^.Directories
 import $file.^.util.Combinators
-import java.nio.file.{Paths, Files}
-import java.nio.charset.StandardCharsets
-import ammonite.ops._
 
 case class AtlasSpecies(species: String, defaultQueryFactorType: String, kingdom: String, resources: List[(String, List[(String, String)])]) {
   val json =
@@ -49,7 +45,7 @@ object AtlasSpeciesFactory {
           "metazoa" -> List(("Ensembl Genomes", "https://metazoa.ensembl.org/")),
           "fungi" -> List(("Ensembl Genomes", "https://fungi.ensembl.org/")),
           "parasite" -> List(("Wormbase ParaSite", "https://parasite.wormbase.org/")),
-          "plants" -> List(("Gramene", "https://ensembl.gramene.org/"),("Ensembl Genomes", "https://plants.ensembl.org/")),
+          "plants" -> List(("Gramene", "http://ensembl.gramene.org/"),("Ensembl Genomes", "https://plants.ensembl.org/")),
           "protists" -> List(("Ensembl Genomes", "https://protists.ensembl.org/"))
         )
       )
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		FROM lolhens/ammonite:latest

		RUN apt-get update -y && apt-get install -y git