Skip to content

Commit

Permalink
added tcga boxplot
Browse files Browse the repository at this point in the history
  • Loading branch information
akahles committed Nov 20, 2020
1 parent f4e5cce commit d081c3a
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 15 deletions.
25 changes: 11 additions & 14 deletions scripts/tcga_boxplot/README
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
--normalize_gene_expression.pl
The gene expression/quantification data files given out by the `rsem_calculate_expression` is normalized using the script `quartile_norm.pl` downloaded from the https://webshare.bioinf.unc.edu/public/mRNAseq_TCGA
--quartile_norm.pl
The gene expression/quantification data files given out by the `rsem_calculate_expression` is normalized using the script `quartile_norm.pl` downloaded from the https://webshare.bioinf.unc.edu/public/mRNAseq_TCGA

--filter_genes.py
Usage: python filter_genes.py <resources/tcga_boxplot/genes_of_interest.txt> <{sample}.genes.results.normalized.header.txt> > {sample}.genes.results.normalized.genes_filtered.txt
Purpose: Changes the ENSEMBL geneID in the file {sample}.genes.results.normalized.header.txt to HUGO symbol.
The resources/tcga_boxplot/genes_of_interest.txt has the 'HUGO_symbol->ENSEMBL_ID' for our genes of interest
The 'rule filter_genes' in rules/tcga_boxplot.smk uses this script

--genename_to_HUGO.py
Purpose: Changes the ENSEMBL geneID to HUGO symbol.
Usage scenario: This script is used in RNASeq snakemake pipeline. The 'rule filter_genes' in rules/tcga_boxplot.smk uses this script
Usage example: genename_to_HUGO.py <genes_of_interest.txt> <OB225.genes.results.normalized.header.txt> > OB225.genes.results.normalized.genes_filtered.txt
--------------------- ----------------------------------------- -------------------------------------------------
genes_of_interest.txt | OB225.genes.results.normalized.header.txt | OB225.genes.results.normalized.genes_filtered.txt
--------------------- ----------------------------------------- -------------------------------------------------
AKT1,ENSG00000142208 gene_id OB225 gene_id OB225
ENSG00000142208.11 1067.7043 AKT1 1067.7043
--parse_rna_tcga_pat_data.r
Command line script for generating individual expression context figures from NEXUS
Command line script for generating individual expression context figures from NEXUS

--variant_expression_context.TCGA.list_of_genes.R
Command line script for generating individual expression context figures from NEXUS
--variant_expression_context.TCGA.list_of_genes.R
Command line script for generating individual expression context figures from NEXUS
28 changes: 28 additions & 0 deletions scripts/tcga_boxplot/filter_genes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env python
import sys

"""
Usage: python filter_genes.py <resources/tcga_boxplot/genes_of_interest.txt> <{sample}.genes.results.normalized.header.txt> > {sample}.genes.results.normalized.genes_filtered.txt
Purpose: Changes the ENSEMBL geneID in the file {sample}.genes.results.normalized.header.txt to HUGO symbol.
The resources/tcga_boxplot/genes_of_interest.txt has the 'HUGO_symbol->ENSEMBL_ID' for our genes of interest
The 'rule filter_genes' in rules/tcga_boxplot.smk uses this script
Author: Tinu Thomas
Date: July, 2019
"""
genes={}

# Reading genes of interest file and storing genes in the dictionary 'genes'
with open(sys.argv[1],'r') as fgene:
for line in fgene:
line=line.strip().split(',')
genes[line[1]]=line[0] # stored HUGO symbol

# Reading the normalized data file and extracting information for genes of interest
with open(sys.argv[2],'r') as fnorm:
for data in fnorm:
data=data.strip().split('\t')
if data[0] == 'gene_id':
print('\t'.join(data))
else:
if data[0].split('.')[0] in genes:
print('\t'.join([ genes[data[0].split('.')[0]] ] + data[1:] )) # changes the ENSEMBL geneID in the normalized file to HUGO symbol
102 changes: 102 additions & 0 deletions scripts/tcga_boxplot/quartile_norm.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/env perl

use strict;
use Getopt::Long;

my $out = '-';
my $q = 75;
my @col;
my @also;
my $names = 1;
my $target = 1000;
my $skip = 0;
my $min=1;
GetOptions("quant=i"=>\$q, "target=i"=>\$target, "col=i@"=>\@col, "out=s"=>\$out, "also=i@"=>\@also, "skip=i"=>\$skip, "min=i"=>\$min);

my $in = shift @ARGV;

die usage() unless $in && @col;

open(OUT, ($out eq '-') ? '<&STDOUT' : ">$out") || die "Can't open $out\n";
open(IN, ($in eq '-') ? '<&STDIN' : $in) || die "Can't open $in\n";

@also = (1) if !@also && !grep {$_ eq '1'} @col;

map {$_--} @col;
map {$_--} @also;

my @d;
my $cnt = 0;
my $head ='';
while(<IN>) {
if ($skip) {
--$skip;
$head .= $_;
next;
}
chomp;
my @f = split /\t/;
if ($col[0] eq '-2') {
@col = (1..$#f);
}
for (@col) {
push @{$d[$_]}, $f[$_];
}
for (@also) {
push @{$d[$_]}, $f[$_];
}
++$cnt;
}
for (@col) {
my @t = grep {$_>=$min} @{$d[$_]};
@t = sort {$a <=> $b} @t;
my $t=quantile(\@t, $q/100);
for (@{$d[$_]}) {
$_= sprintf "%.4f", $target*$_/$t;
}
}

my @out = (sort {$a <=> $b} (@col, @also));

print OUT $head;

for (my $i=0;$i<$cnt;++$i) {
for my $j (@out) {
print OUT "\t" unless $j == $out[0];
print OUT $d[$j][$i];
}
print OUT "\n";
}


sub usage {
<<EOF;
Usage: $0 -c COL [opts] FILE
Returns an upper quartile normalization of data in column(s) COL
of file FILE.
Col is 1-based, zeroes are ignores when calculating upper quartile
Options:
-c|col COL normalize this column of data (can specify more than once, or -1 for all but first col)
-q|quant INT quantile to use (75)
-t|target INT target to use (1000)
-a|also COL output these columns also
-o|out FILE output to this file instead of stdout
-m|min INT minimum value (1)
-s|skip INT skip header rows
EOF
}

sub quantile {
my ($a,$p) = @_;
my $l = scalar(@{$a});
my $t = ($l-1)*$p;
my $v=$a->[int($t)];
if ($t > int($t)) {
return $v + $p * ($a->[int($t)+1] - $v);
} else {
return $v;
}
}
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@
'scripts/hypoxia/tupro_pipeline_hypoxia.R',
'scripts/hypoxia/hypoxia_plots.r',
'scripts/tcga_boxplot/variant_expression_context.TCGA.list_of_genes.R',
'scripts/tcga_boxplot/parse_rna_tcga_pat_data.r'
'scripts/tcga_boxplot/parse_rna_tcga_pat_data.r',
'scripts/tcga_boxplot/quartile_norm.pl'
],
install_requires=requirements,
license="MIT license",
Expand Down

0 comments on commit d081c3a

Please sign in to comment.