Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
* Reinstate changes from ebi-gene-expression-group/atlas-prod#119

* Make ontology lookup a class variable

* Get plants list from Ensembl file

* Add plants file

* plants file made default only

* Fix typo

* Source zooma ontologies via improved config mechanism

* Syntax fixes

* Belated sanitisation of configs
  • Loading branch information
pinin4fjords authored Aug 3, 2021
1 parent d853eea commit e48fc51
Show file tree
Hide file tree
Showing 5 changed files with 236 additions and 53 deletions.
95 changes: 85 additions & 10 deletions perl_modules/Atlas/ZoomaClient.pm
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,13 @@ use URL::Encode qw( url_encode_utf8 );
use JSON::Parse qw( parse_json );
use Array::Compare;
use Log::Log4perl;

use Atlas::Common qw( make_http_request );
use File::Basename;
use Atlas::Common qw(
make_http_request
get_supporting_file
);
use Atlas::ZoomaClient::MappingResult;

use HTTP::Request;

=head1 ATTRIBUTES
Expand Down Expand Up @@ -108,6 +111,8 @@ has cutoff_proportion => (
=cut

my $logger = Log::Log4perl::get_logger;
my $ontology_lookup = zooma_ontology_lookup();
my $plant_species = get_plants_species();

=head1 METHODS
Expand All @@ -122,20 +127,20 @@ more ontology URIs.

sub map_term {

my ( $self, $propertyType, $propertyValue ) = @_;
my ( $self, $propertyType, $propertyValue, $organism ) = @_;

# Step 1: Get the results from Zooma, an array ref of hash refs
# representing the JSON from the Zooma API.
$logger->debug( "Querying zooma for $propertyType : $propertyValue ..." );
$logger->debug( "Querying zooma for $propertyType : $propertyValue : $organism ..." );

my $queryResults = $self->_query_zooma( $propertyType, $propertyValue );
my $queryResults = $self->_query_zooma( $propertyType, $propertyValue, $organism );

unless( $queryResults ) {

$logger->warn( "No results found for $propertyType : $propertyValue" );

my $mappingResult = Atlas::ZoomaClient::MappingResult->new(
zooma_error => "No results found for $propertyType : $propertyValue"
zooma_error => "No results found for $propertyType : $propertyValue : $organism"
);

return $mappingResult;
Expand All @@ -150,7 +155,7 @@ sub map_term {
# If there are any results, create the mapping result.
if( @{ $queryResults } ) {

$logger->debug( "Getting mapping result for $propertyType : $propertyValue ..." );
$logger->debug( "Getting mapping result for $propertyType : $propertyValue : $organism ..." );

my $mappingResult = Atlas::ZoomaClient::MappingResult->new(
zooma_results => $queryResults,
Expand Down Expand Up @@ -191,22 +196,27 @@ Given a property type and value, return a hash containing the query results.

sub _query_zooma {

my ( $self, $propertyType, $propertyValue ) = @_;
my ( $self, $propertyType, $propertyValue, $organism ) = @_;

my ( $propertyType4url, $propertyValue4url ) = ( $propertyType, $propertyValue );

# First we need to make sure the property type and value are in the right
# format for a URL.
$_ = url_encode_utf8( $_ ) for ( $propertyType4url, $propertyValue4url );

my $ontology = lc ( get_ontology_for_type( $organism, $propertyType ) );
$ontology =~ s/\s+$//;

# Get the data sources string.
my $dataSourcesString = $self->_data_sources_to_string;
my $ontologiesString = $self->_ontologies_to_string;

$logger->info("Mapping property type '", $propertyType, "' for species '", $organism, "' with ontology db - ", $ontology);

# Build the query URL.
my $queryURL = $self->get_zooma_api_base
. "services/annotate?propertyValue=$propertyValue4url&propertyType=$propertyType4url"
. "&filter=required:[$dataSourcesString],preferred:[$dataSourcesString],ontologies:[$ontologiesString]";
. "&filter=required:[$dataSourcesString],preferred:[$dataSourcesString],ontologies:[$ontology]";

my $zoomaJSON = make_http_request( $queryURL, "json" );

Expand Down Expand Up @@ -244,6 +254,71 @@ sub _query_zooma {
}


# Load the lookup table that maps a property type and organism to the
# ontologies string used when querying Zooma.
#
# Reads the tab-separated supporting file 'zooma_ontologies.tsv' (located via
# get_supporting_file). Each line is split into three columns: property type,
# organism, ontologies.
#
# Takes no arguments; anything passed in is ignored.
#
# Returns a hash ref of the form
#   $lookup->{ $property_type }{ $organism } = $ontologies
#
# Dies if the file cannot be opened.
sub zooma_ontology_lookup {

    my $zoomaOntologyLookupFile = get_supporting_file( 'zooma_ontologies.tsv' );

    open( my $in_fh, '<', $zoomaOntologyLookupFile )
        or die "Could not open file $zoomaOntologyLookupFile: $!";

    my %ontology_lookup;

    while ( my $line = <$in_fh> ) {

        # Drop the line ending so the last column does not carry a newline.
        $line =~ s/[\r\n]+\z//;

        # Blank lines carry no mapping.
        next unless $line =~ /\S/;

        my ( $property_type, $organism, $ontologies ) = split /\t/, $line;

        # Ignore lines that do not have all three columns, rather than
        # creating entries with undefined keys or values.
        next unless defined $property_type
            && defined $organism
            && defined $ontologies;

        $ontology_lookup{$property_type}{$organism} = $ontologies;
    }

    close( $in_fh );

    return \%ontology_lookup;
}

# Build the list of plant species from the bundled Ensembl Plants species
# file.
#
# Reads the supporting file 'species_EnsemblPlants.txt' (located via
# get_supporting_file), skips any line containing '#name' (the header), and
# collects the second tab-separated column of every remaining line.
#
# Returns an array ref of those second-column values.
#
# Dies if the file cannot be opened.
sub get_plants_species {

    # Plants file comes from
    # http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-51/plants/species_EnsemblPlants.txt,
    # but the Ensembl FTP is a bit unreliable, so we bundle it here.

    my $plants_file = get_supporting_file( 'species_EnsemblPlants.txt' );

    open( my $plants_fh, '<', $plants_file )
        or die( "Could not open file $plants_file: $!" );

    my @plants_species_list;

    # Read line by line instead of slurping the whole file into a list.
    while ( my $line = <$plants_fh> ) {

        next if $line =~ m/#name/;

        # Drop the line ending so a short line does not yield "name\n".
        $line =~ s/[\r\n]+\z//;

        my @plants_species = split( /\t/, $line );

        # Skip blank or truncated lines that have no second column.
        next unless defined $plants_species[1] && length( $plants_species[1] );

        push( @plants_species_list, $plants_species[1] );
    }

    close( $plants_fh );

    return \@plants_species_list;
}


# Pick the ontologies string to use for a property type and organism.
#
# Arguments:
#   $organism      - organism name; may be undef or empty.
#   $property_type - property type to look up; may be undef or empty.
#
# Uses the file-level lookup table ($ontology_lookup) and plant species list
# ($plant_species).
#
# Resolution order:
#   1. If the organism occurs within any entry of the plant species list, it
#      is looked up under the key 'plants'.
#   2. The property type is matched against the lookup keys: an exact match is
#      preferred, otherwise the first key (in sorted order) that contains it.
#   3. Within that property type, the organism entry is preferred (exact key,
#      otherwise the first key containing it), then 'other', then 'any'.
#   4. If nothing matches, 'EFO' is returned.
#
# Returns the ontologies string exactly as stored in the lookup table, or
# 'EFO' as the default.
sub get_ontology_for_type {
    my ( $organism, $property_type ) = @_;

    my $default_ontology = 'EFO';

    # Without a property type there is nothing to look up.
    return $default_ontology
        unless defined($property_type) && length($property_type);

    # An undefined organism is treated as "no organism given"; it must not be
    # allowed to match every entry.
    $organism = '' unless defined($organism);

    ## for plants specific
    # NOTE(review): this is a case-sensitive substring test against the plant
    # species list; confirm callers pass the organism in the same form as the
    # entries of that list.
    if ( length($organism)
        && grep { defined($_) && index( $_, $organism ) >= 0 } @{ $plant_species } )
    {
        $logger->debug( "plant species - $organism" );

        # Assign to the existing variable; declaring a new lexical here would
        # discard the override as soon as this block ends.
        $organism = 'plants';
    }

    ## identify the lookup entry for this property type
    my @known_types = sort keys %{ $ontology_lookup };

    my ( $matched_type ) = grep { $_ eq $property_type } @known_types;

    unless ( defined($matched_type) ) {
        ( $matched_type )
            = grep { index( $_, $property_type ) >= 0 } @known_types;
    }

    return $default_ontology unless defined($matched_type);

    my $ontology_for = $ontology_lookup->{$matched_type};

    ## identify the most specific organism entry, then the generic fallbacks
    my @candidate_keys;

    if ( length($organism) ) {

        my @species = grep { $_ ne 'any' && $_ ne 'other' }
            sort keys %{ $ontology_for };

        my ( $matched_species ) = grep { $_ eq $organism } @species;

        unless ( defined($matched_species) ) {
            ( $matched_species )
                = grep { index( $_, $organism ) >= 0 } @species;
        }

        push( @candidate_keys, $matched_species ) if defined($matched_species);
    }

    # 'other' is tried before 'any', matching the relative precedence these
    # two keys had previously when both were present.
    push( @candidate_keys, 'other', 'any' );

    foreach my $key ( @candidate_keys ) {
        my $ontology = $ontology_for->{$key};
        return $ontology if defined($ontology) && $ontology =~ /\S/;
    }

    return $default_ontology;
}

=item _data_sources_to_string
Converts data sources array ref into a comma-separated string and returns it.
Expand Down
48 changes: 23 additions & 25 deletions supporting_files/ArrayExpressSiteConfig.yml.default
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,33 @@
# This file contains definitions for variables used throughout ArrayExpress and
# Expression Atlas production code.

MX_DSN: DBI:mysql:prod_miamexpress:mysql-prod-miamexpress.ebi.ac.uk:4054
MX_DSN: xxxx
MX_USERNAME: xxxx
MX_PASSWORD: xxxx

AE2_INSTANCE_NAME: AE2PRD
AE2_DSN: DBI:Oracle:host=ora-vm5-022.ebi.ac.uk;sid=AE2PRO;port=1531
AE2_INSTANCE_NAME: xxxx
AE2_DSN: xxxx
AE2_USERNAME: xxxx
AE2_PASSWORD: xxxx

AEDW_DSN: DBI:Oracle:host=ora-vm-064.ebi.ac.uk;sid=ATLASPRO;port=1531
AEDW_DSN: xxxx
AEDW_USERNAME: xxxx
AEDW_PASSWORD: xxxx

AE_PG_DSN: dbi:Pg:dbname=gxpatlaspro;host=pgsql-hxvm-002.ebi.ac.uk;port=5432
AE_PG_DSN: xxx
AE_PG_USERNAME: xxxx
AE_PG_PASSWORD: xxxx

# Submissions Tracking MySQL back-end
AUTOSUBS_DSN: DBI:mysql:ae_autosubs:mysql-ae-autosubs-prod.ebi.ac.uk:4091
AUTOSUBS_DSN: xxxx
AUTOSUBS_USERNAME: xxxx
AUTOSUBS_PASSWORD: xxxx

AUTOSUBMISSIONS_FILEBASE: /nfs/ma/ma-exp/AutoSubmissions/
AUTOSUBMISSIONS_TARGET: /nfs/ma/home/arrayexpress/ae2_production/data/EXPERIMENT
GEO_SUBMISSIONS_TARGET: /nfs/production3/ma/home/atlas3-production/GEO_import
ENA_SUBMISSIONS_TARGET: /nfs/production3/ma/home/atlas3-production/ENA_import
AUTOSUBMISSIONS_ARRAY_TARGET: /nfs/production3/ma/home/atlas3-production/GEO_import/microarray/ARRAY
AUTOSUBMISSIONS_FILEBASE:
AUTOSUBMISSIONS_TARGET:
GEO_SUBMISSIONS_TARGET:
ENA_SUBMISSIONS_TARGET:
AUTOSUBMISSIONS_ARRAY_TARGET:

AUTOSUBS_ADMIN: [email protected]
AUTOSUBS_ADMIN_USERNAME: xxxx
Expand All @@ -41,48 +41,46 @@ AUTOSUBS_ADMIN_USERNAME: xxxx

AUTOSUBS_CURATOR_EMAIL: [email protected]

HTTP_PROXY: 'http://www-proxy.ebi.ac.uk:3128'
HTTP_PROXY:

AE_ARRAYDESIGN_LIST: 'http://peach.ebi.ac.uk:8480/api/all-arrays.txt'

MAX_LWP_DOWNLOAD: 10104857600

ENA_FTP_URI: ftp://ftp.sra.ebi.ac.uk/vol1/fastq/

AE2_LOAD_DIR: /nfs/ma/home/arrayexpress/ae2_production/data
AE2_LOAD_DIR:

BIOPORTAL_API_KEY: xxxx
ONTO_TERMS_LIST: /nfs/production3/ma/home/atlas3-production/sw/configs/onto_terms.yml
ONTO_TERMS_LIST:


# file used to process GEO experiments.
#GEO_IMPORT_COMMAND: 'PERL5LIB=/nfs/ma/home/fgpt/sw/lib/perl/CentOS_prod/lib64/perl5/site_perl:/nfs/ma/home/fgpt/sw/lib/perl/CentOS_prod/lib:/nfs/ma/home/fgpt/sw/lib/perl/FGPT_CentOS_prod/lib /usr/bin/perl /nfs/ma/home/fgpt/sw/lib/perl/FGPT_CentOS_prod/new_soft2magetab.pl'

GEO_IMPORT_COMMAND: '/nfs/production3/ma/home/atlas3-production/sw/atlasinstall_prod/atlasprod/geo_import/new_soft2magetab.pl'
GEO_IMPORT_COMMAND: 'new_soft2magetab.pl'

GEO_STAGING_DIR: /nfs/production3/ma/home/atlas3-production/GEO_import/GEOImportDownload/
GEO_STAGING_DIR: GEOImportDownload/

GSE_GDS_MAP: /nfs/production3/ma/home/atlas3-production/GEO_import/geo_import_supporting_files/gse2gds.txt
GEO_PLATFORM_MAP: /nfs/production3/ma/home/atlas3-production/GEO_import/geo_import_supporting_files/platforms.txt
GSE_GDS_MAP: geo_import_supporting_files/gse2gds.txt
GEO_PLATFORM_MAP: geo_import_supporting_files/platforms.txt

# New: location of EFO and ontology mapping script for use in GEO import
EFO_LOCATION: http://www.ebi.ac.uk/efo/efo.owl
EFO_OWL_FILE: /nfs/production3/ma/home/atlas3-production/GEO_import/geo_import_supporting_files/efo.owl
#OE_MAPPING_SCRIPT: /nfs/ma/ma-subs/AE/subs/PERL_SCRIPTS/local/bin/add_ontology_refs_to_magetab.pl
EFO_OWL_FILE: geo_import_supporting_files/efo.owl

ENA_ACC_MAP: /nfs/production3/ma/home/atlas3-production/GEO_import/geo_import_supporting_files/fastqFileReport
ENA_ACC_MAP: geo_import_supporting_files/fastqFileReport

VALIDATION_SCRIPT:


# Location of ADF and Experiments checked in Atlas.pm
ADF_CHECKED_LIST: /nfs/production3/ma/home/atlas3-production/sw/configs/adfs_not_in_atlas.txt
ATLAS_EXPT_CHECKED_LIST: /nfs/production3/ma/home/atlas3-production/sw/configs/expts_checked_for_atlas.txt
ADF_CHECKED_LIST: configs/adfs_not_in_atlas.txt
ATLAS_EXPT_CHECKED_LIST: configs/expts_checked_for_atlas.txt
SKIP_CHECKED_LIST_FILES: true

PRIVATE_ADF_USERNAME: xxxx
PRIVATE_ADF_PASSWORD: xxxxx
PRIVATE_ADF_URI_BASE: http://www.ebi.ac.uk/arrayexpress/files/

ADF_DB_PATTERN_FILE: /nfs/production3/ma/home/atlas3-production/sw/configs/adf_db_patterns.txt
ADF_DB_PATTERN_FILE: configs/adf_db_patterns.txt

31 changes: 13 additions & 18 deletions supporting_files/AtlasSiteConfig.yml.default
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
atlasinstall: prod

# annotare import
arrayexpress_experiment_load_dir: /ebi/microarray/home/arrayexpress/ae2_production/data/EXPERIMENT
arrayexpress_experiment_load_dir:

# geo import
geo_experiment_load_dir: /GEO_import
Expand All @@ -22,7 +22,7 @@ ena_sc_experiment_load_dir: /singlecell/experiment

#File containing factor values to use as references in differential
# experiments and factor types to ignore.
references_ignore_file: sw/atlasinstall_prod/atlasprod/analysis/differential/mapped_reference_assay_group_factor_values.xml
references_ignore_file: mapped_reference_assay_group_factor_values.xml

# Allowed experiment types in XML configuration file.
allowed_xml_experiment_types:
Expand Down Expand Up @@ -58,7 +58,7 @@ mirna_array_design_accessions:
mirbase_mappings_write_directory: arraydesigns/microRNA

# URL base for ArrayExpress files.
array_designs_ftp_site: /net/nasP/ftp_public/databases/microarray/data/array
array_designs_ftp_site:

# File mapping abbreviations used for species in miRBase.
mirbase_species_abbreviations: bioentity_properties/mirbase/idprefix_to_organism.tsv
Expand All @@ -67,28 +67,25 @@ mirbase_species_abbreviations: bioentity_properties/mirbase/idprefix_to_organism
contrast_details_url: http://wwwdev.ebi.ac.uk/gxa/api/contrastdetails.tsv

# File containing mappings of property types to EFO URIs.
property_types_efo_mappings: sw/atlasinstall_prod/atlasprod/db/scripts/property_types_efo_mappings.tsv
property_types_efo_mappings: property_types_efo_mappings.tsv

# Directory where Ensembl annotation source files live.
annotation_source_dir: sw/atlasinstall_prod/atlasprod/bioentity_annotations/ensembl/annsrcs
annotation_source_dir: annsrcs

# Genome references config.
genome_references_config: sw/islinstall_prod/isl_genomes/genome_references.conf
genome_references_config: genome_references.conf

# R installation.
atlas_r_installation: sw/atlasinstall_prod/R_install
atlas_r_installation: R_install

# Script for querying CTTV EFO terms.
cttv_efo_sparql_script: sw/atlasinstall_prod/atlasprod/db/scripts/get_efo_in_cttv_validation_data.sh

# File containing property types and values to exclude from Zooma mapping.
zooma_exclusions_file: <CONDA_PREFIX>/atlasprod/supporting_files/zooma_exclusions.yml
cttv_efo_sparql_script: get_efo_in_cttv_validation_data.sh

# File containing accessions of experiments that have been assessed by a curator.
atlas_curated_accessions_file: sw/configs/atlas_curated_accessions.yml
atlas_curated_accessions_file: atlas_curated_accessions.yml

# File containing accessions of experiments that don't have R experiment summaries.
no_r_data_accessions_file: sw/configs/no_r_object
no_r_data_accessions_file: no_r_object

# URL to get ADF info from ArrayExpress.
arrayexpress_adf_info_url: http://peach.ebi.ac.uk:8480/api/array.txt?acc=
Expand Down Expand Up @@ -188,13 +185,11 @@ atlas_supported_adfs:
A-GEOD-11534: Rattus norvegicus
A-MEXP-1414: Oryza sativa

failed_qc_dir: /nfs/production3/ma/home/atlas3-production/failedQC

min_pval_dir: /nfs/production3/ma/home/atlas3-production/minPValGt0.5
failed_qc_dir: failedQC

failed_curation_dir: /nfs/production3/ma/home/atlas3-production/failedCuration
min_pval_dir: atlas3-production/minPValGt0.5

non_standard_experiments_file: <CONDA_PREFIX>/atlasprod/supporting_files/non_standard_experiments.yml
failed_curation_dir: failedCuration

atlasprd3_dsn: DBI:Oracle:host=ora-vm-xxx.ebi.ac.uk;sid=ATLASPRO;port=1531
atlasprd3_user: xxxx
Expand Down
Loading

0 comments on commit e48fc51

Please sign in to comment.