-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalignToBed.pl
executable file
·186 lines (149 loc) · 4.38 KB
/
alignToBed.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env perl
##---------------------------------------------------------------------------##
## File:
## @(#) alignToBed.pl
## Author:
## Robert M. Hubley [email protected]
## Description:
## A simple script to convert RepeatMasker *.align files to
## BED format.
##
#******************************************************************************
#* This software is provided ``AS IS'' and any express or implied *
#* warranties, including, but not limited to, the implied warranties of *
#* merchantability and fitness for a particular purpose, are disclaimed. *
#* In no event shall the authors or the Institute for Systems Biology *
#* liable for any direct, indirect, incidental, special, exemplary, or *
#* consequential damages (including, but not limited to, procurement of *
#* substitute goods or services; loss of use, data, or profits; or *
#* business interruption) however caused and on any theory of liability, *
#* whether in contract, strict liability, or tort (including negligence *
#* or otherwise) arising in any way out of the use of this software, even *
#* if advised of the possibility of such damage. *
#* *
#******************************************************************************
#
# ChangeLog
#
# $Log$
#
###############################################################################
#
# To Do:
#
=head1 NAME
alignToBed.pl - Convert RepeatMasker *.align to BED format
=head1 SYNOPSIS
alignToBed.pl [-version] [-noSimple] [-fullAlign]
*.align
=head1 DESCRIPTION
A simple script to convert RepeatMasker's default alignment
format ( *.align ) to BED format. For more details on BED
see: http://genome.ucsc.edu/FAQ/FAQformat.html#format1
The *.align fields are converted as:
=begin text
BED Field RepeatMasker Field
========= ==================
chrom Query Sequence Identifier
chromStart Query Start - 1
chromEnd Query End
name The alignment summary line (or the full alignment with -fullAlign)
=end text
The options are:
=over 4
=item -version
Displays the version of the program
=item -noSimple
Filter out repeats with "Simple_repeat" or "Low_complexity"
repeat classes.
=item -fullAlign
Encode the full alignment data in the BED 'name' field. Line
terminations are encoded using the '$' character.
=back
=head1 SEE ALSO
=head1 COPYRIGHT
Copyright 2014 Robert Hubley, Institute for Systems Biology
=head1 LICENSE
This code may be used in accordance with the Open Source License v. 3.0
http://opensource.org/licenses/OSL-3.0
=head1 AUTHOR
Robert Hubley <[email protected]>
=cut
#
# Module Dependence
#
use strict;
use Getopt::Long;
use Data::Dumper;
my $Version = "1.0";
#
# Magic numbers/constants here
# ie. my $PI = 3.14159;
#
my $DEBUG = 0;
#
# Option processing
# e.g.
# -t: Single letter binary option
# -t=s: String parameters
# -t=i: Number parameters
#
# Command-line flags understood by this script.  All three are plain
# boolean switches; GetOptions stores them in %options under the name
# without the leading dash (e.g. $options{'noSimple'}).
my @getopt_args = (
    '-version',      # print out the version and exit
    '-noSimple',     # filter out Simple_repeat / Low_complexity repeats
    '-fullAlign'     # encode the full alignment in the BED 'name' field
);
my %options = ();

# Configure() is the documented interface; config() is only kept by
# Getopt::Long as a deprecated alias for it.
Getopt::Long::Configure( "noignorecase", "bundling_override" );

# On an unrecognised option show the usage text; usage() does not return.
unless ( GetOptions( \%options, @getopt_args ) ) {
    usage();
}
# usage()
#
# Print the program name and version, then replace this process with
# pod2text so that the POD embedded in this script is shown as the help
# text.  Never returns to the caller.
sub usage {
    print "$0 - $Version\n\n";

    # List-form exec bypasses the shell, so a script path containing
    # spaces or shell metacharacters is handed to pod2text verbatim
    # instead of being re-parsed as part of a command line.
    exec( 'pod2text', $0 )
        or warn "$0: unable to run pod2text: $!\n";

    # Only reached when exec itself failed (e.g. pod2text is not in PATH).
    exit 1;
}
# -version: report the script version on standard output and stop
# before any input is read.
if ( $options{'version'} ) {
    print STDOUT "$Version\n";
    exit 0;
}
#
# ARGV Processing
#
#if ( !@ARGV ) {
# usage();
#}
#
# Main conversion loop.
#
# Reads alignment records from the files named on the command line (or
# from STDIN when none are given) and prints one tab-separated BED line
# per record:
#
#     <query id> <query start - 1> <query end> <name>
#
# A record begins at its summary line (an integer followed by three
# decimal numbers, then the remaining columns) and is emitted when the
# line starting with "Gap_init" is reached.  <name> is the normalised
# summary line, or with -fullAlign the whole record text with every
# newline replaced by '$'.
#
my $align_str = "";    # raw text of the record being collected
my $summ_str  = "";    # space-joined summary fields of that record
my $bed_id    = "";
my $bed_start = 0;
my $bed_end   = 0;
my $in_record = 0;     # true only while an accepted record is open

while (<>) {
    if ( /^\s*(\d+\s+\d+\.\d+\s+\d+\.\d+\s+\d+\.\d+.*)/ ) {
        my $summary = $1;
        $align_str = "";

        # With -noSimple the whole record must be dropped.  Clearing
        # $in_record keeps the record's later "Gap_init" line from
        # re-printing the coordinates and summary of the previous record.
        if ( $options{'noSimple'} && /Simple_repeat|Low_complexity/ ) {
            $in_record = 0;
            next;
        }

        my @fields = split( /\s+/, $summary );

        # When column 8 is not an orientation marker ("C" or "+"),
        # insert an explicit "+" so that every summary line has the
        # same column layout.
        if ( $fields[8] ne "C" && $fields[8] ne "+" ) {
            splice( @fields, 8, 0, "+" );
        }
        $summ_str = join( " ", @fields );

        # Zero Based Half Open Coords
        $bed_id    = $fields[4];
        $bed_start = $fields[5] - 1;
        $bed_end   = $fields[6];
        $in_record = 1;
    }

    # Ignore anything outside an accepted record: text before the first
    # summary line, the body of a filtered record, and lines that follow
    # an already emitted record.
    next unless $in_record;

    $align_str .= $_;

    # Gap_init rate = 0.01 (3 / 223), avg. gap size = 1.67 (5 / 3)
    if ( /^Gap_init/ ) {
        if ( $options{'fullAlign'} ) {
            $align_str =~ s/\n/\$/g;
            print "$bed_id\t$bed_start\t$bed_end\t$align_str\n";
        }
        else {
            print "$bed_id\t$bed_start\t$bed_end\t$summ_str\n";
        }

        # The record is complete; wait for the next summary line so a
        # stray "Gap_init" line cannot emit the same record twice.
        $in_record = 0;
        $align_str = "";
    }
}