-
Notifications
You must be signed in to change notification settings - Fork 0
/
cross_contamination.pl
126 lines (111 loc) · 4.44 KB
/
cross_contamination.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/perl -w
use strict;
my $top = 10;
my (%matrix_75, %matrix_57, %rg, %rg_p7, %rg_p5, %all_p7, %all_p5, %all_rg, %cross, %cross2, %sum, %top, %expected_rg);
&help unless $ARGV[0];
open READ, $ARGV[0] or die "no input file was provided\n";
while (my $line = <READ>) {
chomp $line;
next unless $line =~ /^\d/;
my ($number, $p7seq, $p7, $p5seq, $p5, $rg) = split /\t/, $line;
next unless $p7 =~ /[^\-]/; #Removes sequences with unknown p7 index
next unless $p5 =~ /[^\-]/; #Removes sequences with unknown p5 index
next if $p7 =~ /phix[acgt]/i; #Removes sequences matching multi-base PhiX index (MPI-specific)
next if $p5 =~ /phix[acgt]/i; #Removes sequences matching multi-base PhiX index (MPI-specific)
$matrix_75{$p7}{$p5} = $number;
$matrix_57{$p5}{$p7} = $number;
if ($rg eq "unknown" || $rg eq "unexpected" || $rg eq "" || $rg eq "-") {
$all_p5{$p5}++;
$all_p7{$p7}++;
} else {
$expected_rg{$rg} = $number;
$rg{$p7}{$p5} = $rg;
$all_p5{$p5}++;
$all_p7{$p7}++;
$rg_p5{$p5}++;
$rg_p7{$p7}++;
$all_rg{$rg}{"p7"} = $p7;
$all_rg{$rg}{"p5"} = $p5;
}
}
foreach my $rg (keys %all_rg) {
my $rg_p7 = $all_rg{$rg}{"p7"};
my $rg_p5 = $all_rg{$rg}{"p5"};
my $rg_count = $matrix_75{$rg_p7}{$rg_p5};
foreach my $other_p5 (keys %{$matrix_75{$rg_p7}}) {
my ($corner1, $corner2, $min_corner, $cont_name);
next if $other_p5 eq $rg_p5;
foreach my $other_p7 (keys %{$matrix_57{$other_p5}}) {
next if $other_p7 eq $rg_p7;
next unless exists $matrix_75{$other_p7}{$rg_p5};
$corner1 = $matrix_75{$rg_p7}{$other_p5};
$corner2 = $matrix_75{$other_p7}{$rg_p5};
$min_corner = $corner1;
$min_corner = $corner2 if $corner2 < $corner1;
my $cont = (($min_corner/$rg_count)**2 * $rg_count);
my $cont_name = $rg{$other_p7}{$other_p5} || "$other_p7/$other_p5";
$cross{"$cont_name into $rg"} = sprintf ("%0.3f", $cont);
$cross2{"$cont_name into $rg"} = $rg_count;
$sum{$rg} += $cont if $cont >= 0.5;
if (exists $top{$rg}{"num"}) {
my $current = $top{$rg}{"num"};
if ($cont > $current) {
$top{$rg}{"num"} = $cont if $cont > $current;
$top{$rg}{"source"} = $cont_name;
}
}
}
}
}
my $counter = 0;
foreach my $element (sort {$cross{$b} <=> $cross{$a}} keys %cross) {
$counter++;
my $value = sprintf ("%.2f", $cross{$element});
my $total = $cross2{$element};
my $percent = sprintf ("%.5f", $value / $total * 100);
print "#$element\t$value reads (of a total of ", $total, "), or $percent%\n";
if ($counter >= $top) {
last if $value < 0.5;
}
}
print "\n#\#RG\tcross_cont_readsum\tcross_cont_percent\n";
foreach my $rg (sort keys %expected_rg) {
print "$rg\t";
if (exists $sum{$rg}) {
my $total = $expected_rg{$rg};
my $sum = sprintf ("%0.1f", $sum{$rg});
print "$sum\t";
my $percent = sprintf ("%0.4f", $sum / $total * 100);
print "$percent\n";
} else {
print "0\t0\n";
}
}
print "#\n";
sub help {
print "
This script requires a demultiplexing report to estimate possible cross-contamination between samples based on index swapping during library amplification or sequencing.
The demultiplexing report has to be provided in the following format:
#number p7seq p7index p5seq p5index readgroup
833843 CGATAGTT 298 TATGACGA 338 L4
693049 GTAGAATC 127 AGCATCGA 167 L1
583948 GGCCTGAC 9 CAACCTTA 49 L3
...
2324 AGGTCGTC - CAACCTTA 49 <-- p7 index sequence unrecognized ('-'), readgroup unassigned
1839 ACTTAAGC 19 GAGGAGTC 372 <-- unexpected indexcombination, readgroup unassigned
Fields:
number number of sequenced clusters, sorted by occurence of p7 and p5 index sequences
p7seq observed p7 index sequence
p7index Name/Number of index ('-' or empty if index is unrecognized)
p5seq observed p5 index sequence
p5index Name/Number of index ('-' or empty if index is unrecognized)
readgroup Library/sample name ('-','unknown','unexpected' or empty if index combination does not match an expected readgroup)
[usage]
./cross-contamination.pl demultiplexing-report.txt
[output]
STDOUT
1. List of all cross-contamination events with an estimated contribution of >0.5 reads (but at least the top 10 events, even if estimated to below 0.5 reads)
2. For each of the expected readgroups: sum of contaminating reads (only >0.5 per event), contribution of cross contamination in percent
";
exit;
}