Skip to content

Commit

Permalink
Remove usage of AE database for curation checks (#16)
Browse files Browse the repository at this point in the history
* Comment out usage of AE database for curation checks

* Remove check_for_previously_loaded_files entirely

* Decrease severity for file check skipping (it's making the output hard to read)

* Better solution for identifying and skipping Illumina matrix files

* Remove ADF name retrieval from AE DB; it seems to be used only for the error message

* Reinstate warning about GEO arrays without features

* Fix how variable is passed to get_array_design_name

* Remove debug-print
  • Loading branch information
anjaf authored Apr 29, 2021
1 parent 1997696 commit 920a56c
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 163 deletions.
142 changes: 11 additions & 131 deletions perl_modules/EBI/FGPT/CheckSet/Curation.pm
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,16 @@
EBI::FGPT::CheckSet::Curation
=head1 SYNOPSIS
use EBI::FGPT;
my $check_sets = {
'EBI::FGPT::CheckSet::Curation' => 'curator_checks',
};
my $idf = $ARGV[0];
my $checker = EBI::FGPT::Reader::MAGETAB->new(
'idf' => $idf,
my $checker = EBI::FGPT::Reader::MAGETAB->new(
'idf' => $idf,
'check_sets' => $check_sets,
);
$checker->parse();
Expand Down Expand Up @@ -87,8 +87,6 @@ MD5, compression checks and previously loaded file checks are only run on non-ma
For files with a Comment[MD5] their actual MD5 sum must match that specified (ERROR)
File MD5 is checked against those stored in AE to check for previously loaded (WARN)
The following checks are only run on files which are associated with an array design:
Affymetrix CHP file must be listed as Derived data (ERROR)
Expand All @@ -107,7 +105,7 @@ If file contains exactly 65535 rows it may have been truncated by old version of
All design elements in the file must be described on the array design (ERROR)
We skip MD5, compression checks and previously loaded file checks for raw seq files
We skip MD5, compression checks for raw seq files
=cut

Expand Down Expand Up @@ -227,7 +225,6 @@ augment 'run_sdrf_checks' => sub {
# Only run on non-matrix files
$self->run_data_md5_check();
$self->check_compressed_file_integrity();
$self->check_for_previously_loaded_files();

$self->check_for_derived_data();
};
Expand Down Expand Up @@ -311,7 +308,7 @@ sub check_compressed_file_integrity {
$is_seq = grep { $_ =~ /sequencing/i } @tech_types;

if ( $is_seq and ( $type eq "raw" ) ) {
$self->warn( "Skipping compression check for",
$self->info( "Skipping compression check for",
$name . " (assuming it is sequencing data)" );
next;
}
Expand Down Expand Up @@ -427,7 +424,7 @@ sub run_data_md5_check {
grep { $_ and $_->get_name eq "MD5" } $file->get_comments;

if ( $is_seq and ( $type eq "raw" ) ) {
$self->warn( "Skipping MD5 checks for ",
$self->info( "Skipping MD5 checks for ",
$name . " (assuming it is sequencing data)" );

# Should throw error if no MD5s provided by submitter
Expand Down Expand Up @@ -483,116 +480,6 @@ sub run_data_md5_check {
}
}

# Warn about data files whose content has already been loaded under another
# experiment.
#
# For every data file of the parsed MAGE-TAB, the MD5 digest of the file
# content is computed and looked up with check_md5_in_database; every hit is
# reported through $self->warn. Raw files whose assay technology type
# matches /sequencing/i are skipped. The whole check is skipped when
# get_skip_data_checks is true.
#
# Takes only the invocant. Problems are reported through $self->warn and
# $self->error; there is no meaningful return value.
sub check_for_previously_loaded_files {
    my ($self) = @_;

    if ( $self->get_skip_data_checks ) {
        $self->warn("Skipping check for previously loaded files");
        return;
    }

    # MD5 of loaded files is stored in AE data table
    foreach my $file ( $self->get_magetab->get_dataFiles ) {
        my ( $path, $name ) = $self->_get_file_path($file);

        # Ignore raw seq files, to do this we determine if assay
        # attached to file has technology type "sequencing"
        my $type        = $file->get_dataType->get_value;
        my @input_edges = $file->get_inputEdges;
        my @tech_types;

        # Store technology type associated with raw file
        if ( $type eq "raw" ) {
            foreach my $input_edge (@input_edges) {
                my $input_node = $input_edge->get_inputNode;

                # If file input is an assay get technology type
                if ( $input_node->isa("Bio::MAGETAB::Assay") ) {
                    push @tech_types,
                      $input_node->get_technologyType->get_value;
                }

                # If file input is a scan, look one step further back for
                # the assays feeding that scan
                elsif ( $input_node->isa("Bio::MAGETAB::DataAcquisition") ) {
                    foreach my $scan_edge ( $input_node->get_inputEdges ) {
                        my $scan_input = $scan_edge->get_inputNode;
                        if ( $scan_input->isa("Bio::MAGETAB::Assay") ) {
                            push @tech_types,
                              $scan_input->get_technologyType->get_value;
                        }
                    }
                }

                else {
                    $self->warn(
                        "Cannot determine technology type for " . $name );
                }
            }
        }

        # grep in scalar context: number of sequencing technology types
        my $is_seq = grep { $_ =~ /sequencing/i } @tech_types;

        if ( $is_seq and ( $type eq "raw" ) ) {
            $self->warn(
                "Skipping previously loaded file check for",
                $name . " (assuming it is sequencing data)"
            );
            next;
        }

        open( my $fh, "<", $path )
          or (
            $self->error(
                "Could not open $path to check for previously loaded files"),
            next
          );

        # Hash the raw bytes: without binmode an I/O layer (e.g. CRLF
        # translation) could alter what is digested.
        binmode($fh);

        # addfile() croaks on a read error, whereas a bare read() loop stops
        # silently and would leave us with the digest of a truncated file.
        my $md5 = Digest::MD5->new();
        unless ( eval { $md5->addfile($fh); 1 } ) {
            $self->error(
                "Could not read $path to check for previously loaded files");
            close($fh);
            next;
        }
        close($fh);

        my $actual_md5 = $md5->hexdigest();

        # Connect to AE to check MD5. The connection is deliberately created
        # per file, as in the original code: hashing a large file can take a
        # while and SOURCE does not show whether a connection survives that.
        my $ae_db    = EBI::FGPT::Resource::Database::ArrayExpress->new();
        my $md5_info = $ae_db->check_md5_in_database($actual_md5);

        # Each hit is used as [ accession, file name, ... ]; an undef result
        # is treated as "no hits".
        foreach my $ae_md5_info ( @{ $md5_info || [] } ) {
            my ( $accs, $ae_file_name ) = @{$ae_md5_info};

            # NOTE(review): this pattern removes a literal backslash and
            # everything after it; confirm that is the intended trimming.
            $accs =~ s/\\.+//g;

            $self->warn(
"$name has been previously loaded for experiment: $accs with name $ae_file_name"
            );
        }

    }    # End checking of files

    return;
}

sub _get_file_path {

Expand Down Expand Up @@ -678,11 +565,9 @@ sub run_data_checks {

# Skip checking Illumina BeadChip files

my $ae_db = EBI::FGPT::Resource::Database::ArrayExpress->new();
my $array_design_name =
$ae_db->get_array_design_name_by_acc( $file->{array} )
if ($ae_db);
if ( ($array_design_name) && ( $array_design_name =~ /Illumina/ ) ) {
my $array_design_name = $self->get_ae_rest->get_array_design_name( $file->{array} );

if ( ($array_design_name ) && ( $array_design_name =~ /Illumina/ ) ) {
$self->warn(
"Recognised Illumina array using ADF name \'$array_design_name\', skipping checking file "
. $file->{name} );
Expand Down Expand Up @@ -1233,13 +1118,8 @@ sub check_features_match_array {
}

elsif ( !$identifiers && $acc =~ /^A-GEOD-/ ) {

# getting ADF name from DB and not from parsed ADF, in case ADF parsing failed and parser is undef

my $ae_db = EBI::FGPT::Resource::Database::ArrayExpress->new();
my $adf_name = $ae_db->get_array_design_name_by_acc($acc);
$self->warn(
"No $heading found for GEO array $acc ($adf_name), ",
"No $heading found for GEO array $acc, ",
"skipping identifier checks for file ",
$file->get_name
);
Expand Down
75 changes: 43 additions & 32 deletions perl_modules/EBI/FGPT/Resource/ArrayExpressREST.pm
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
#
# EBI/FGPT/Resource/ArrayExpressREST
#
#
# Anna Farne 2012 ArrayExpress team, EBI
#
# $Id: ArrayExpressREST.pm 21742 2012-11-19 12:55:13Z amytang $
Expand All @@ -25,19 +25,19 @@ use EBI::FGPT::Config qw($CONFIG);
has 'array_list' => (is => 'rw', isa => 'HashRef', builder => '_load_array_list' , lazy => 1);

sub _load_array_list{

my ($self) = @_;

# Set the array list to empty hash so the builder is
# not called again if array list fails to load
$self->set_array_list({});

# FIXME: do we have external version of this uri for public arrays only?
my $uri = $CONFIG->get_AE_ARRAYDESIGN_LIST
or croak("AE_ARRAYDESIGN_LIST URI not set in Config file - cannot load array design list");

my $ua = $self->get_user_agent;

my $response = $ua->get($uri);
if ($response->is_success) {
my @lines = split /\n/, $response->content;
Expand All @@ -51,40 +51,51 @@ sub _load_array_list{
}
else {
croak("Could not get array design list from $uri - ".$response->status_line);
}
}
}

sub get_affy_design_id{
# Look up the name stored for an array design accession in the array list.
#
# Parameters: $acc - key to look up in the array_list hash (callers in this
#                    file pass an array design accession).
# Returns:    the name stored for $acc.
# Croaks:     if the array list is empty, or if no true value is stored
#             for $acc.
sub get_array_design_name{

    my ($self, $acc) = @_;

    # Fetch the list once and reuse it for both checks below.
    my $array_list = $self->get_array_list;

    unless(scalar %{ $array_list }){
        croak("Array list not loaded");
    }

    my $name = $array_list->{$acc};

    if ($name){
        return $name;
    }
    else{
        croak("Array accession $acc not found in ArrayExpress");
    }
}

# Extract the design id from an array design name.
#
# The name is obtained from get_array_design_name($acc); the design id is
# the text between the first pair of square brackets in that name.
#
# Parameters: $acc - passed straight through to get_array_design_name.
# Returns:    the bracketed text, or undef when the name has no
#             "[...]" part.
# Croaks:     if get_array_design_name returns a false value.
sub get_affy_design_id{

    my ($self, $acc) = @_;

    my $array_name = $self->get_array_design_name($acc);

    croak("Array accession $acc not found in ArrayExpress")
        unless $array_name;

    # List-context match: $design_id stays undef when nothing matches.
    my ($design_id) = $array_name =~ m/\[ ([^\]]+) \]/xms;

    return $design_id;
}


sub get_adf{

my $uri_base = $CONFIG->get_PRIVATE_ADF_URI_BASE;
my $uri_base = $CONFIG->get_PRIVATE_ADF_URI_BASE;
# e.g. "http://www.ebi.ac.uk/arrayexpress/files/", to be appended by "A-AFFY-1/A-AFFY-1.adf.txt" later

my ($self,$acc) = @_;

my $cookie_jar = HTTP::Cookies->new();


Expand All @@ -95,44 +106,44 @@ sub get_adf{
# Creating the user agent directly using LWP::UserAgent has solved the problem, hence this change.

#my $ua = $self->get_user_agent();
my $ua = LWP::UserAgent->new();
my $ua = LWP::UserAgent->new();

$ua->cookie_jar($cookie_jar); #empty jar, no cookies yet. User agent also not associated with any proxy.

# We are logging in with username and password to retrieve the ADF. This is
# not strictly required for public ADFs, but at this point we don't know
# whether the ADF is public or private, so it's safer to treat
# all ADFs as private.

# Fire the first HTTP request to get the login token cookie:

my $username = $CONFIG->get_PRIVATE_ADF_USERNAME;
my $password = $CONFIG->get_PRIVATE_ADF_PASSWORD;

my $verify_site = 'http://www.ebi.ac.uk/arrayexpress/verify-login.txt?u='.$username.'&p='.$password;
my $verify_response = $ua->get($verify_site);

my $uri = $uri_base."$acc/$acc.adf.txt";

# Assign the two required cookies to the user agent object

$cookie_jar->set_cookie(0,'AeLoginToken', $verify_response->content, '/','www.ebi.ac.uk');
$cookie_jar->set_cookie(0,'AeLoggedUser', 'curator','/','www.ebi.ac.uk');

# print "Set Cookie Jar?\n", $ua->cookie_jar->as_string, "\n"; # DEBUG

# Fire the second HTTP request from the same user agent to get the ADF

my $response = $ua->get($uri);

my $adf;
if ($response->is_success) {
$adf = $response->content;
}
else {
croak("Could not get ADF from $uri - ".$response->status_line);
}

return $adf;
}
1;

0 comments on commit 920a56c

Please sign in to comment.