-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
143 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,14 @@ | ||
--normalize_gene_expression.pl | ||
The gene expression/quantification data files given out by the `rsem_calculate_expression` is normalized using the script `quartile_norm.pl` downloaded from the https://webshare.bioinf.unc.edu/public/mRNAseq_TCGA | ||
--quartile_norm.pl | ||
The gene expression/quantification data files given out by the `rsem_calculate_expression` is normalized using the script `quartile_norm.pl` downloaded from the https://webshare.bioinf.unc.edu/public/mRNAseq_TCGA | ||
|
||
--filter_genes.py | ||
Usage: python filter_genes.py <resources/tcga_boxplot/genes_of_interest.txt> <{sample}.genes.results.normalized.header.txt> > {sample}.genes.results.normalized.genes_filtered.txt | ||
Purpose: Changes the ENSEMBL geneID in the file {sample}.genes.results.normalized.header.txt to HUGO symbol. | ||
The resources/tcga_boxplot/genes_of_interest.txt has the 'HUGO_symbol->ENSEMBL_ID' for our genes of interest | ||
The 'rule filter_genes' in rules/tcga_boxplot.smk uses this script | ||
|
||
--genename_to_HUGO.py | ||
Purpose: Changes the ENSEMBL geneID to HUGO symbol. | ||
Usage scenario: This script is used in RNASeq snakemake pipeline. The 'rule filter_genes' in rules/tcga_boxplot.smk uses this script | ||
Usage example: genename_to_HUGO.py <genes_of_interest.txt> <OB225.genes.results.normalized.header.txt> > OB225.genes.results.normalized.genes_filtered.txt | ||
--------------------- ----------------------------------------- ------------------------------------------------- | ||
genes_of_interest.txt | OB225.genes.results.normalized.header.txt | OB225.genes.results.normalized.genes_filtered.txt | ||
--------------------- ----------------------------------------- ------------------------------------------------- | ||
AKT1,ENSG00000142208 gene_id OB225 gene_id OB225 | ||
ENSG00000142208.11 1067.7043 AKT1 1067.7043 | ||
--parse_rna_tcga_pat_data.r | ||
Command line script for generating individual expression context figures from NEXUS | ||
Command line script for generating individual expression context figures from NEXUS | ||
|
||
--variant_expression_context.TCGA.list_of_genes.R | ||
Command line script for generating individual expression context figures from NEXUS | ||
--variant_expression_context.TCGA.list_of_genes.R | ||
Command line script for generating individual expression context figures from NEXUS |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/usr/bin/env python | ||
import sys | ||
|
||
""" | ||
Usage: python filter_genes.py <resources/tcga_boxplot/genes_of_interest.txt> <{sample}.genes.results.normalized.header.txt> > {sample}.genes.results.normalized.genes_filtered.txt | ||
Purpose: Changes the ENSEMBL geneID in the file {sample}.genes.results.normalized.header.txt to HUGO symbol. | ||
The resources/tcga_boxplot/genes_of_interest.txt has the 'HUGO_symbol->ENSEMBL_ID' for our genes of interest | ||
The 'rule filter_genes' in rules/tcga_boxplot.smk uses this script | ||
Author: Tinu Thomas | ||
Date: July, 2019 | ||
""" | ||
genes={} | ||
|
||
# Reading genes of interest file and storing genes in the dictionary 'genes' | ||
with open(sys.argv[1],'r') as fgene: | ||
for line in fgene: | ||
line=line.strip().split(',') | ||
genes[line[1]]=line[0] # stored HUGO symbol | ||
|
||
# Reading the normalized data file and extracting information for genes of interest | ||
with open(sys.argv[2],'r') as fnorm: | ||
for data in fnorm: | ||
data=data.strip().split('\t') | ||
if data[0] == 'gene_id': | ||
print('\t'.join(data)) | ||
else: | ||
if data[0].split('.')[0] in genes: | ||
print('\t'.join([ genes[data[0].split('.')[0]] ] + data[1:] )) # changes the ENSEMBL geneID in the normalized file to HUGO symbol |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
#!/usr/bin/env perl | ||
|
||
use strict; | ||
use Getopt::Long; | ||
|
||
my $out = '-'; | ||
my $q = 75; | ||
my @col; | ||
my @also; | ||
my $names = 1; | ||
my $target = 1000; | ||
my $skip = 0; | ||
my $min=1; | ||
GetOptions("quant=i"=>\$q, "target=i"=>\$target, "col=i@"=>\@col, "out=s"=>\$out, "also=i@"=>\@also, "skip=i"=>\$skip, "min=i"=>\$min); | ||
|
||
my $in = shift @ARGV; | ||
|
||
die usage() unless $in && @col; | ||
|
||
open(OUT, ($out eq '-') ? '<&STDOUT' : ">$out") || die "Can't open $out\n"; | ||
open(IN, ($in eq '-') ? '<&STDIN' : $in) || die "Can't open $in\n"; | ||
|
||
@also = (1) if !@also && !grep {$_ eq '1'} @col; | ||
|
||
map {$_--} @col; | ||
map {$_--} @also; | ||
|
||
my @d; | ||
my $cnt = 0; | ||
my $head =''; | ||
while(<IN>) { | ||
if ($skip) { | ||
--$skip; | ||
$head .= $_; | ||
next; | ||
} | ||
chomp; | ||
my @f = split /\t/; | ||
if ($col[0] eq '-2') { | ||
@col = (1..$#f); | ||
} | ||
for (@col) { | ||
push @{$d[$_]}, $f[$_]; | ||
} | ||
for (@also) { | ||
push @{$d[$_]}, $f[$_]; | ||
} | ||
++$cnt; | ||
} | ||
for (@col) { | ||
my @t = grep {$_>=$min} @{$d[$_]}; | ||
@t = sort {$a <=> $b} @t; | ||
my $t=quantile(\@t, $q/100); | ||
for (@{$d[$_]}) { | ||
$_= sprintf "%.4f", $target*$_/$t; | ||
} | ||
} | ||
|
||
my @out = (sort {$a <=> $b} (@col, @also)); | ||
|
||
print OUT $head; | ||
|
||
for (my $i=0;$i<$cnt;++$i) { | ||
for my $j (@out) { | ||
print OUT "\t" unless $j == $out[0]; | ||
print OUT $d[$j][$i]; | ||
} | ||
print OUT "\n"; | ||
} | ||
|
||
|
||
sub usage { | ||
<<EOF; | ||
Usage: $0 -c COL [opts] FILE | ||
Returns an upper quartile normalization of data in column(s) COL | ||
of file FILE. | ||
Col is 1-based, zeroes are ignores when calculating upper quartile | ||
Options: | ||
-c|col COL normalize this column of data (can specify more than once, or -1 for all but first col) | ||
-q|quant INT quantile to use (75) | ||
-t|target INT target to use (1000) | ||
-a|also COL output these columns also | ||
-o|out FILE output to this file instead of stdout | ||
-m|min INT minimum value (1) | ||
-s|skip INT skip header rows | ||
EOF | ||
} | ||
|
||
sub quantile { | ||
my ($a,$p) = @_; | ||
my $l = scalar(@{$a}); | ||
my $t = ($l-1)*$p; | ||
my $v=$a->[int($t)]; | ||
if ($t > int($t)) { | ||
return $v + $p * ($a->[int($t)+1] - $v); | ||
} else { | ||
return $v; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters