diff --git a/perl_modules/EBI/FGPT/CheckSet/Curation.pm b/perl_modules/EBI/FGPT/CheckSet/Curation.pm index 18ba245..4dd6b5c 100644 --- a/perl_modules/EBI/FGPT/CheckSet/Curation.pm +++ b/perl_modules/EBI/FGPT/CheckSet/Curation.pm @@ -14,16 +14,16 @@ EBI::FGPT::CheckSet::Curation =head1 SYNOPSIS - + use EBI::FGPT; - + my $check_sets = { 'EBI::FGPT::CheckSet::Curation' => 'curator_checks', }; my $idf = $ARGV[0]; - my $checker = EBI::FGPT::Reader::MAGETAB->new( - 'idf' => $idf, + my $checker = EBI::FGPT::Reader::MAGETAB->new( + 'idf' => $idf, 'check_sets' => $check_sets, ); $checker->parse(); @@ -87,8 +87,6 @@ MD5, compression checks and previously loaded file checks are only run on non-ma For files with a Comment[MD5] their actual MD5 sum must match that specified (ERROR) -File MD5 is checked against those stored in AE to check for previously loaded (WARN) - The following checks are only run on files which are assoicated with an array design: Affymetrix CHP file must be listed as Derived data (ERROR) @@ -107,7 +105,7 @@ If file contains exactly 65535 rows it may have been truncated by old version of All design elements in the file must be described on the array design (ERROR) -We skip MD5, compression checks and previously loaded file checks for raw seq files +We skip MD5, compression checks for raw seq files =cut @@ -227,7 +225,6 @@ augment 'run_sdrf_checks' => sub { # Only run on non-matrix files $self->run_data_md5_check(); $self->check_compressed_file_integrity(); - $self->check_for_previously_loaded_files(); $self->check_for_derived_data(); }; @@ -311,7 +308,7 @@ sub check_compressed_file_integrity { $is_seq = grep { $_ =~ /sequencing/i } @tech_types; if ( $is_seq and ( $type eq "raw" ) ) { - $self->warn( "Skipping compression check for", + $self->info( "Skipping compression check for", $name . " (assuming it is sequencing data)" ); next; } @@ -427,7 +424,7 @@ sub run_data_md5_check { grep { $_ and $_->get_name eq "MD5" } $file->get_comments; if ( $is_seq and ( $type eq "raw" ) ) { - $self->warn( "Skipping MD5 checks for ", + $self->info( "Skipping MD5 checks for ", $name . " (assuming it is sequencing data)" ); # Should throw error if no MD5s provided by submitter @@ -483,116 +480,6 @@ sub run_data_md5_check { } } -sub check_for_previously_loaded_files { - my ($self) = @_; - - if ( $self->get_skip_data_checks ) { - $self->warn("Skipping check for previously loaded files"); - return; - } - - # MD5 of loaded files is stored in AE data table - foreach my $file ( $self->get_magetab->get_dataFiles ) { - my ( $path, $name ) = $self->_get_file_path($file); - - # Ignore raw seq files, to do this we determine if assay - # attached to file has technology type "sequencing" - - my $type = $file->get_dataType->get_value; - my @inputEdges = $file->get_inputEdges; - my $assay; - my @tech_types; - - # Store technology type associated with raw file - if ( $type eq "raw" ) { - foreach my $inputEdge (@inputEdges) { - - # If file input is an assay get technology type - if ( $inputEdge->get_inputNode->isa("Bio::MAGETAB::Assay") ) { - $assay = $inputEdge->get_inputNode; - push @tech_types, $assay->get_technologyType->get_value; - } - - # If file input is a scan - elsif ( - $inputEdge->get_inputNode->isa( - "Bio::MAGETAB::DataAcquisition") - ) - { - - my $scan = $inputEdge->get_inputNode; - my @scanEdges = $scan->get_inputEdges; - - foreach my $scanEdge (@scanEdges) { - if ( - $scanEdge->get_inputNode->isa( - "Bio::MAGETAB::Assay") - ) - { - $assay = $scanEdge->get_inputNode; - push @tech_types, - $assay->get_technologyType->get_value; - } - } - } - - else { - $self->warn( - "Cannot determine technology type for " . $name ); - } - - } - } - - my $is_seq; - $is_seq = grep { $_ =~ /sequencing/i } @tech_types; - - if ( $is_seq and ( $type eq "raw" ) ) { - $self->warn( - "Skipping previously loaded file check for", - $name . " (assuming it is sequencing data)" - ); - next; - } - - open( my $fh, "<", $path ) - or ( - $self->error( - "Could not open $path to check for previously loaded files"), - next - ); - - my $md5 = Digest::MD5->new(); - my $chunk; - my $chunksize = - 65536; # 64k for reasonable efficiency (untested though). - while ( my $bytes = read( $fh, $chunk, $chunksize ) ) { - $md5->add($chunk); - } - - my $actual_md5 = $md5->hexdigest(); - - # Connect to AE to check MD5 - my $ae_db = EBI::FGPT::Resource::Database::ArrayExpress->new(); - my $md5_info = $ae_db->check_md5_in_database($actual_md5); - my @md5_info = @{$md5_info}; - - if (@md5_info) { - foreach my $ae_md5_info (@md5_info) { - my @ae_md5_info = @{$ae_md5_info}; - my $accs = $ae_md5_info[0]; - $accs =~ s/\\.+//g; - my $ae_file_name = $ae_md5_info[1]; - my $ae_md5 = $ae_md5_info[2]; - $self->warn( -"$name has been previously loaded for experiment: $accs with name $ae_file_name" - ); - } - } - - } # End checking of files - -} sub _get_file_path { @@ -678,11 +565,9 @@ sub run_data_checks { # Skip checking Illumina BeadChip files - my $ae_db = EBI::FGPT::Resource::Database::ArrayExpress->new(); - my $array_design_name = - $ae_db->get_array_design_name_by_acc( $file->{array} ) - if ($ae_db); - if ( ($array_design_name) && ( $array_design_name =~ /Illumina/ ) ) { + my $array_design_name = $self->get_ae_rest->get_array_design_name( $file->{array} ); + + if ( ($array_design_name ) && ( $array_design_name =~ /Illumina/ ) ) { $self->warn( "Recognised Illumina array using ADF name \'$array_design_name\', skipping checking file " . $file->{name} ); @@ -1233,13 +1118,8 @@ sub check_features_match_array { } elsif ( !$identifiers && $acc =~ /^A-GEOD-/ ) { - -# getting ADF name from DB and not from parsed ADF, in case ADF parsing failed and parser is undef - - my $ae_db = EBI::FGPT::Resource::Database::ArrayExpress->new(); - my $adf_name = $ae_db->get_array_design_name_by_acc($acc); $self->warn( - "No $heading found for GEO array $acc ($adf_name), ", + "No $heading found for GEO array $acc, ", "skipping identifier checks for file ", $file->get_name ); diff --git a/perl_modules/EBI/FGPT/Resource/ArrayExpressREST.pm b/perl_modules/EBI/FGPT/Resource/ArrayExpressREST.pm index 0880b4f..833d36e 100644 --- a/perl_modules/EBI/FGPT/Resource/ArrayExpressREST.pm +++ b/perl_modules/EBI/FGPT/Resource/ArrayExpressREST.pm @@ -1,7 +1,7 @@ #!/usr/bin/env perl # # EBI/FGPT/Resource/ArrayExpressREST -# +# # Anna Farne 2012 ArrayExpress team, EBI # # $Id: ArrayExpressREST.pm 21742 2012-11-19 12:55:13Z amytang $ @@ -25,19 +25,19 @@ use EBI::FGPT::Config qw($CONFIG); has 'array_list' => (is => 'rw', isa => 'HashRef', builder => '_load_array_list' , lazy => 1); sub _load_array_list{ - + my ($self) = @_; - + # Set the array list to empty hash so the builder is # not called again if array list fails to load $self->set_array_list({}); - + # FIXME: do we have external version of this uri for public arrays only? my $uri = $CONFIG->get_AE_ARRAYDESIGN_LIST or croak("AE_ARRAYDESIGN_LIST URI not set in Config file - cannot load array design list"); my $ua = $self->get_user_agent; - + my $response = $ua->get($uri); if ($response->is_success) { my @lines = split /\n/, $response->content; @@ -51,40 +51,51 @@ sub _load_array_list{ } else { croak("Could not get array design list from $uri - ".$response->status_line); - } + } } -sub get_affy_design_id{ - +sub get_array_design_name{ + my ($self, $acc) = @_; - + unless(scalar %{ $self->get_array_list }){ - croak("Array list not loaded"); + croak("Array list not loaded"); } - + my $name = $self->get_array_list->{$acc}; - + + if ($name){ + return $name; + } + else{ + croak("Array accession $acc not found in ArrayExpress"); + } +} + +sub get_affy_design_id{ + + my ($self, $acc) = @_; + + my $name = $self->get_array_design_name($acc); + if ($name){ my $design_id; - + if ($name =~ m/\[ ([^\]]+) \]/xms){ $design_id = $1; } - return $design_id; } - else{ - croak("Array accession $acc not found in ArrayExpress"); - } } + sub get_adf{ - my $uri_base = $CONFIG->get_PRIVATE_ADF_URI_BASE; + my $uri_base = $CONFIG->get_PRIVATE_ADF_URI_BASE; # e.g. "http://www.ebi.ac.uk/arrayexpress/files/", to be appended by "A-AFFY-1/A-AFFY-1.adf.txt" later - + my ($self,$acc) = @_; - + my $cookie_jar = HTTP::Cookies->new(); @@ -95,36 +106,36 @@ sub get_adf{ # Creating the user agent directl using LWP::UserAgent has solved the problem, hence this change. #my $ua = $self->get_user_agent(); - my $ua = LWP::UserAgent->new(); + my $ua = LWP::UserAgent->new(); $ua->cookie_jar($cookie_jar); #empty jar, no cookies yet. User agent also not associated with any proxy. - + # We are logging in with username and password to retrieve ADF. This is not # really required for public ADFs but at this point we don't really # know whether the ADF is public or private, so it's better to treat # all ADFs are private. - + # Fire the first HTTP request to get the login token cookie: - + my $username = $CONFIG->get_PRIVATE_ADF_USERNAME; my $password = $CONFIG->get_PRIVATE_ADF_PASSWORD; - + my $verify_site = 'http://www.ebi.ac.uk/arrayexpress/verify-login.txt?u='.$username.'&p='.$password; my $verify_response = $ua->get($verify_site); - + my $uri = $uri_base."$acc/$acc.adf.txt"; - + # Assign the two required cookies to the user agent object $cookie_jar->set_cookie(0,'AeLoginToken', $verify_response->content, '/','www.ebi.ac.uk'); $cookie_jar->set_cookie(0,'AeLoggedUser', 'curator','/','www.ebi.ac.uk'); - + # print "Set Cookie Jar?\n", $ua->cookie_jar->as_string, "\n"; # DEBUG - + # Fire the second HTTP request from the same user agent to get the ADF - + my $response = $ua->get($uri); - + my $adf; if ($response->is_success) { $adf = $response->content; @@ -132,7 +143,7 @@ sub get_adf{ else { croak("Could not get ADF from $uri - ".$response->status_line); } - + return $adf; } 1;