Skip to content

Commit

Permalink
Merge pull request #13 from chadlaing/genbank_name_change
Browse files Browse the repository at this point in the history
Genbank name change
  • Loading branch information
chadlaing authored Jun 20, 2017
2 parents bf8db82 + 6dd9afe commit 6021f28
Show file tree
Hide file tree
Showing 10 changed files with 164 additions and 113 deletions.
1 change: 1 addition & 0 deletions Build.PL
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ my $build=Module::Build->new(
'Test2::Suite'=>0,
'Getopt::Long'=>0,
'Digest::MD5'=>0,
'Digest::SHA'=>0,
'File::Which'=>0,
'File::Path'=>0,
'File::Copy'=>0,
Expand Down
11 changes: 7 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,6 @@ If you find _**Panseq**_ useful please cite:
> Laing C, Buchanan C, Taboada EN, Zhang Y, Kropinski A, Villegas A, Thomas JE, Gannon VP.
> BMC Bioinformatics. 2010 Sep 15;11:461.
> Identification of Salmonella enterica species- and subgroup-specific genomic regions using Panseq 2.0.
> Laing C, Villegas A, Taboada EN, Kropinski A, Thomas JE, Gannon VP.
> Infect Genet Evol. 2011 Dec;11(8):2151-61.

## USAGE

Expand Down Expand Up @@ -98,6 +94,7 @@ Advanced Options
nucG 90
nucL 20
cdhit 1
sha1 1

### Settings and their [DEFAULTS]

Expand Down Expand Up @@ -156,6 +153,12 @@ Advanced Options

* `cdhit` [0] determines whether or not `cd-hit-est` is run on the pan-genome before identifying the distribution of the pan-genome (and SNPs among core regions) among the input sequences. Percent identity cutoff for `cd-hit-est` is taken from `percentIdentityCutoff`.

* `sha1` [0] sets the header and ID for all analyses as the SHA1 hash of the
sequence. In `novel` mode this will give the novel regions with the fasta
header as the hash. In `pan` mode this will also set the fasta headers as
the SHA1 hash of the sequence, and the ID column in the outputs will use
the SHA1 hash.

## Format of multi-fasta files ##

_**Panseq**_ currently only accepts fasta or multi-fasta formatted files. More than one genome may be in a single file, but for all genomes consisting of more than one contig, a distinct identifier must be present in the fasta header of each contig belonging to the same genome. For example, you have just assembled a new genome and are eager to analyze it. Your file consists of a number of contigs, similar to:
Expand Down
6 changes: 6 additions & 0 deletions lib/Modules/Fasta/SequenceName.pm
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,12 @@ sub _getName{
elsif($originalName =~ m/lcl\|([\w-]*)\|/){
$newName = $1;
}
elsif($originalName =~ m/([A-Za-z]{2}\d{6})\.\d/){
$newName = $1;
}
elsif($originalName =~ m/([A-Za-z]{4}\d{2})\d{6}\.\d/){
$newName = $1;
}
elsif($originalName =~ m/(ref\|\w\w_\w\w\w\w\w\w|gb\|\w\w\w\w\w\w\w\w|emb\|\w\w\w\w\w\w\w\w|dbj\|\w\w\w\w\w\w\w\w)/){
$newName = $1;
}
Expand Down
32 changes: 4 additions & 28 deletions lib/Modules/NovelRegion/NovelIterator.pm
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,10 @@ sub run{

my $numberOfGenomes = scalar(@{$self->settings->orderedGenomeNames});
$self->logger->info("We have " . $numberOfGenomes . " genomes this run");
foreach my $ogn (@{$self->settings->orderedGenomeNames}){
$self->logger->debug($ogn);
}

if($numberOfGenomes ==0){
$self->logger->fatal("Input error. Require at least 1 genome");
exit(1);
Expand Down Expand Up @@ -384,34 +388,6 @@ sub _combineNovelRegionsAndReferenceFile{
return $outputFile;
}


=head2 _setRefFileToIgnore
Need to ensure the reference file is not used for generating new novel regions
under mode "unique". By setting the lcl|_ignore| flag, when used as a query, the novel
region finder will ignore it. When used as a reference, will still show the query matches.
=cut

sub _setRefFileToIgnore{
my $self = shift;
my $refFileOriginal = shift;

my $fileName = $refFileOriginal . '_ignore';
my $refInFH = IO::File->new('<' . $refFileOriginal) or die "$!";
my $refOutFH = IO::File->new('>' . $fileName) or die "$!";
while(my $line = $refInFH->getline){
if($line =~ m/^>/){
$line =~ s/\W/_/g;
$line = '>lcl|_ignore|' . $line . "\n";
}
$refOutFH->print($line);
}
$refInFH->close();
$refOutFH->close();
return $fileName;
}

=head2 _printNovelRegionsFromQueue
Takes in the coords file, the queryFile, and an outputFile.
Expand Down
17 changes: 15 additions & 2 deletions lib/Modules/NovelRegion/NovelRegionFinder.pm
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/usr/bin/env perl

use Digest::SHA qw(sha1_hex);

=pod
=head1 NAME
Expand Down Expand Up @@ -157,7 +159,7 @@ sub coordsFile{
=head3 queryFile
The file containing all combined query sequences.
File is in mult-fasta foramt.
File is in multi-fasta foramt.
=cut

Expand Down Expand Up @@ -305,8 +307,19 @@ sub printNovelRegions{
my($relId, $relStart, $relEnd) = $self->_getNewIdStartEnd($id,$start,$end);
my $relLength = ($relEnd - $relStart +1);

my $sequence = $retriever->extractRegion($id,$start,$end);
my $header = '>';
if($self->settings->runMode eq 'novel' && $self->settings->sha1){
$header .= Digest::SHA::sha1_hex($sequence);
$self->logger->debug("$header");
}
else{
$header .= $relId . '_(' . $relStart . '..' . $relEnd .
')'. "\n"
}

$self->logger->debug("original: start:$start, end:$end, length: $length\nnew : start:$relStart, end:$relEnd, length:$relLength" );
$outFH->print('>' . $relId . '_(' . $relStart . '..' . $relEnd . ')'. "\n" . $retriever->extractRegion($id,$start,$end) . "\n");
$outFH->print($header . "\n" . $sequence . "\n");
}# end while
}# end foreach
$outFH->close();
Expand Down
50 changes: 25 additions & 25 deletions lib/Modules/PanGenome/PanGenome.pm
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/usr/bin/env perl

use Digest::SHA qw(sha1_hex);

=pod
=head1 NAME
Expand Down Expand Up @@ -415,10 +417,16 @@ sub _processBlastXML {
$self->logger->debug("no queryFile, using $qsn as qsn, queryName: $queryName");
}

my $id = $counter;
my $sequence = $result->{$qsn}->[0]->[11];
if($self->settings->sha1){
$id = Digest::SHA::sha1_hex($sequence);
}

my %locusInformation = (
id=>$counter,
id=>$id,
name=>$result->{$qsn}->[0]->[1],
sequence=>$result->{$qsn}->[0]->[11],
sequence=>$sequence,
pan=>$coreOrAccessory
);
$self->logger->debug("locusInformation: " . Dumper(%locusInformation));
Expand Down Expand Up @@ -554,29 +562,21 @@ sub _printPanGenomeFastaFiles{
my $self = shift;
my $locusInformation = shift;

#fragment files
if($locusInformation->{pan} eq 'core'){
$self->_printFH->{coreFH}->print('>lcl|'
.$locusInformation->{id}
. '|'
. $locusInformation->{name}
. "\n"
. $locusInformation->{sequence}
. "\n");
}
elsif($locusInformation->{pan} eq 'accessory'){
$self->_printFH->{accessoryFH}->print('>lcl|'
. $locusInformation->{id}
. '|'
. $locusInformation->{name}
. "\n"
. $locusInformation->{sequence}
. "\n");
}
else{
$self->logger->fatal("Unknown pan-genome fragment type! " . $locusInformation->{pan});
exit(1);
}
my $header = '>';

if($self->settings->sha1){
$header .= $locusInformation->{id} . "\n";
}
else{
$header .= 'lcl|'
.$locusInformation->{id}
.'|'
.$locusInformation->{name}
."\n";
}

$self->_printFH->{$locusInformation->{pan} . 'FH'}->print(
$header . $locusInformation->{sequence} . "\n");
}


Expand Down
21 changes: 16 additions & 5 deletions lib/Modules/Setup/Panseq.pm
Original file line number Diff line number Diff line change
Expand Up @@ -133,10 +133,18 @@ sub _externalProgramTest{
my $self = shift;

if ($self->settings->cdhit){
unless(-e $self->settings->cdhitDirectory . 'cd-hit-est'){
unless(-e $self->settings->cdhitDirectory . 'cd-hit-est' || -e $self->settings->cdhitDirectory . 'cdhit-est'){
$self->logger->fatal($self->_missingExternalMessage($self->settings->cdhitDirectory, 'cdhitDirectory'));
exit(1);
}
else{
if(-e $self->settings->cdhitDirectory . 'cd-hit-est'){
$self->settings->_cdhitExecutable($self->settings->cdhitDirectory . 'cd-hit-est');
}
elsif(-e $self->settings->cdhitDirectory . 'cdhit-est'){
$self->settings->_cdhitExecutable($self->settings->cdhitDirectory . 'cdhit-est');
}
}
}

unless(-e $self->settings->blastDirectory . 'blastn'){
Expand Down Expand Up @@ -286,8 +294,8 @@ sub _runCdhit{

my $decimalPercentIdentityCutoff = $self->settings->percentIdentityCutoff / 100;
my $outputFile = $self->settings->baseDirectory . 'cdhit.fasta';
my $cdhitLine =
'cd-hit-est -M 0 -T ' .
my $cdhitLine = $self->settings->_cdhitExecutable .
' -M 0 -T ' .
$self->settings->numberOfCores .
' -c ' .
$decimalPercentIdentityCutoff .
Expand Down Expand Up @@ -371,7 +379,9 @@ sub _cleanUp{
($file =~ m/lastNovelRegionsFile/) ||
($file =~ m/uniqueNovelRegions/) ||
($file =~ m/\.temp/) ||
($file =~ m/_NR$/)
($file =~ m/_NR$/) ||
($file =~ m/Temp_out/) ||
($file =~ m/Temp_in/)
){
unlink $file;
}
Expand Down Expand Up @@ -402,14 +412,15 @@ sub _performPanGenomeAnalyses{
my $segmenter;

if($self->settings->fragmentationSize > 0){

$segmenter = Modules::Fasta::SegmentMaker->new(
'inputFile'=>$panGenomeFile,
'outputFile'=>$self->settings->baseDirectory . 'pangenome_fragments.fasta',
'segmentSize'=>$self->settings->fragmentationSize
);
$segmenter->segmentTheSequence;
$panGenomeFile = $segmenter->outputFile;
}
}

#run cdhit if so desired
my $postCdhit;
Expand Down
26 changes: 21 additions & 5 deletions lib/Modules/Setup/Settings.pm
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,12 @@ sub cdhitDirectory{
$self->{'_cdhitDirectory'} = shift // return $self->{'_cdhitDirectory'}
}


sub _cdhitExecutable{
my $self = shift;
$self->{'__cdhitExecutable'} = shift // return $self->{'__cdhitExecutable'}
}

sub cdhit{
my $self = shift;
$self->{'_cdhit'} = shift // return $self->{'_cdhit'}
Expand Down Expand Up @@ -264,6 +270,11 @@ sub overwrite{
$self->{'_overwrite'} = shift // return $self->{'_overwrite'};
}

sub sha1{
my $self=shift;
$self->{'_sha1'} = shift // return $self->{'_sha1'};
}

#methods
sub _getSettingsFromConfigurationFile {
my ($self) = shift;
Expand All @@ -277,11 +288,12 @@ sub _getSettingsFromConfigurationFile {

$line =~ s/\R//g;
my @la = split(/\s+/,$line);

my $setting = $la[0] // $self->_noValueSet('name');
my $value = $la[1] // $self->_noValueSet($la[0]);

$self->_setSettingValue($setting,$value);
if(defined $la[0] && defined $la[1]){
$self->_setSettingValue($la[0],$la[1]);
}
else{
$self->logger->debug("$line cannot be parsed");
}
}
$inFile->close();
}
Expand Down Expand Up @@ -465,6 +477,10 @@ sub _setDefaults{
unless(defined $self->percentIdentityCutoff){
$self->percentIdentityCutoff(85);
}

unless(defined $self->sha1){
$self->sha1(0);
}
}


Expand Down
22 changes: 11 additions & 11 deletions t/SequenceName.t
Original file line number Diff line number Diff line change
Expand Up @@ -41,34 +41,34 @@ my %testValues=(
},
'ref'=>{
'string'=>'ref|NC_123456|gi|148566106|gb|CP000711.1| Enterobacteria phage CUS-3, complete genome',
'correct'=>'ref|NC_123456'
'correct'=>'CP000711'
},
'gb'=>{
'string'=>'gi|148566106|gb|CP000711.1| Enterobacteria phage CUS-3, complete genome',
'correct'=>'gb|CP000711'
'correct'=>'CP000711'
},
'emb'=>{
'string'=>'gi|148566106|emb|CP000711.1| Enterobacteria phage CUS-3, complete genome',
'correct'=>'emb|CP000711'
'correct'=>'CP000711'
},
'dbj'=>{
'string'=>'gi|148566106|dbj|CP000711.1| Enterobacteria phage CUS-3, complete genome',
'correct'=>'dbj|CP000711'
'correct'=>'CP000711'
},
'Segment'=>{
'string'=>'gi_148566106_dbj_CP000711.1| Enterobacteria phage CUS-3, complete genome|Segment=125',
'correct'=>'gi_148566106_dbj_CP000711.1| Enterobacteria phage CUS-3, complete genome'
'string'=>'gi_148566106_dbj_000711.1| Enterobacteria phage CUS-3, complete genome|Segment=125',
'correct'=>'gi_148566106_dbj_000711.1| Enterobacteria phage CUS-3, complete genome'
},
'Length'=>{
'string'=>'gi_148566106_dbj_CP000711.1| Enterobacteria phage CUS-3, complete genome|Length=1245555',
'correct'=>'gi_148566106_dbj_CP000711.1| Enterobacteria phage CUS-3, complete genome'
'string'=>'gi_148566106_dbj_000711.1| Enterobacteria phage CUS-3, complete genome|Length=1245555',
'correct'=>'gi_148566106_dbj_000711.1| Enterobacteria phage CUS-3, complete genome'
},
'original'=>{
'string'=>'gi_148566106_dbj_CP000711_1_ Enterobacteria phage CUS-3, complete genome_Length=1245555',
'correct'=>'gi_148566106_dbj_CP000711_1_ Enterobacteria phage CUS-3, complete genome_Length=1245555'
'string'=>'gi_148566106_dbj_000711_1_ Enterobacteria phage CUS-3, complete genome_Length=1245555',
'correct'=>'gi_148566106_dbj_000711_1_ Enterobacteria phage CUS-3, complete genome_Length=1245555'
},
'gi'=>{
'string'=>'gi|148566106|gret|CP000711.1| Enterobacteria phage CUS-3, complete genome',
'string'=>'gi|148566106|gret|000711.1| Enterobacteria phage CUS-3, complete genome',
'correct'=>'gi|148566106'
}
);
Expand Down
Loading

0 comments on commit 6021f28

Please sign in to comment.