-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFastq_ReadExtract_byReadName.pl
115 lines (90 loc) · 2.87 KB
/
Fastq_ReadExtract_byReadName.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/perl
# New name: Fastq_Read_Extract_byReadName.pl
# Based upon: fastqConverterHash2.pl
# Author: Tovah Markowitz
# Date: 10/14/14
# Description: extract specific queries from a fastq file
# Input: name of fastq file, query file, output file name
# Output: query in fastQ format
use strict;
use Data::Dumper;
use Getopt::Long;
use Pod::Usage;
my $help=0;
my ($fastqFile, $queryFile, $outputFile);
# set commandline options
GetOptions ("input=s" => \$fastqFile,
"query=s" => \$queryFile,
"output=s" => \$outputFile,,
"help" => \$help,) or pod2usage(2);
pod2usage(1) if $help;
#############################################
my $sequences=LoadFastq($fastqFile);
# open query file and read in queries, save only first column
# if only one column, everything becomes a query
open (QUERY, "<$queryFile") or die ("Can not find your query file $queryFile: No such file or directory\n");
my $seqCounter = -1;
my @queries;
my $query;
# read in queries
while (<QUERY>){
chomp;
$seqCounter++;
$query= $_;
$query =~ /^\S+/;
#print Dumper ($&);
$queries[$seqCounter] = $&;
}
close QUERY;
# open output file, search and write concurrently
open (OUT, ">$outputFile") or die ("Can not open $outputFile for writing.\n");
# look through each query
# ensure that query is in the fastq file
# write output
for (my $q=0; $q < scalar(@queries); $q++) {
my $fastaName = $queries[$q];
if ($sequences -> {$fastaName}) {
print OUT
"@" . "$fastaName\n$sequences->{$fastaName}->{'seq'}" . "+" . "$fastaName\n$sequences->{$fastaName}->{'quals'}";
}
}
close OUT;
#############################################
sub LoadFastq {
# read FASTQ file into a hash of hashes
my $fastqFile=shift;
# ensure fastq file exists else die
open (IN, "<$fastqFile") or die ("Can not find your input file $fastqFile: No such file or directory\n");
# set up necessary arrays
my $names;
my %sequences;
my $list;
# for as long as the file has more data:
while (<IN>){
chomp;
# place every line in one of the two hashes,
# using the @ symbol to identify the start of each sequence
if ($_ =~ /@/) {
$names = $_;
$names=~ s/@//;
$sequences{$names} -> {'seq'} = <IN>;
$list = <IN>;
$sequences{$names} -> {'quals'} = <IN>;
}
}
close IN;
return \%sequences;
}
############
__END__
=head1 SYNOPSIS
Fastq_Read_Extract_byReadName.pl [Options]
Purpose: To find a list of reads based upon their read names.
The reads will be listed in the order of the queries within the
query file. Query names should not start with "@", ">", or "+".
Options:
-help/-h brief help message
-input/-i input fastq file
-query/-q query file [ each read name should be on a separate line ]
-output/-o output fastq file name
=cut