added tcga boxplot

ratschlab · Nov 20, 2020 · d081c3a · d081c3a
1 parent f4e5cce
commit d081c3a
Show file tree

Hide file tree

Showing 4 changed files with 143 additions and 15 deletions.
diff --git a/scripts/tcga_boxplot/README b/scripts/tcga_boxplot/README
@@ -1,17 +1,14 @@
---normalize_gene_expression.pl
-    The gene expression/quantification data files given out by the `rsem_calculate_expression` is normalized using the script `quartile_norm.pl` downloaded from the https://webshare.bioinf.unc.edu/public/mRNAseq_TCGA 
+--quartile_norm.pl
+The gene expression/quantification data files given out by the `rsem_calculate_expression` is normalized using the script `quartile_norm.pl` downloaded from the https://webshare.bioinf.unc.edu/public/mRNAseq_TCGA 
+
+--filter_genes.py
+Usage:   python filter_genes.py <resources/tcga_boxplot/genes_of_interest.txt> <{sample}.genes.results.normalized.header.txt>  > {sample}.genes.results.normalized.genes_filtered.txt 
+Purpose: Changes the ENSEMBL geneID in the file {sample}.genes.results.normalized.header.txt to HUGO symbol.
+         The resources/tcga_boxplot/genes_of_interest.txt has the 'HUGO_symbol->ENSEMBL_ID' for our genes of interest
+         The 'rule filter_genes' in rules/tcga_boxplot.smk uses this script
 
---genename_to_HUGO.py
-    Purpose: Changes the ENSEMBL geneID to HUGO symbol.
-    Usage scenario: This script is used in RNASeq snakemake pipeline. The 'rule filter_genes' in rules/tcga_boxplot.smk uses this script
-    Usage example:   genename_to_HUGO.py <genes_of_interest.txt> <OB225.genes.results.normalized.header.txt>  > OB225.genes.results.normalized.genes_filtered.txt 
-    ---------------------       -----------------------------------------       -------------------------------------------------
-    genes_of_interest.txt   |   OB225.genes.results.normalized.header.txt   |   OB225.genes.results.normalized.genes_filtered.txt 
-    ---------------------       -----------------------------------------       -------------------------------------------------
-    AKT1,ENSG00000142208        gene_id OB225                                   gene_id OB225
-                                ENSG00000142208.11      1067.7043               AKT1    1067.7043
 --parse_rna_tcga_pat_data.r
-    Command line script for generating individual expression context figures from NEXUS
+Command line script for generating individual expression context figures from NEXUS
 
---variant_expression_context.TCGA.list_of_genes.R   
-    Command line script for generating individual expression context figures from NEXUS
+--variant_expression_context.TCGA.list_of_genes.R
+Command line script for generating individual expression context figures from NEXUS
diff --git a/scripts/tcga_boxplot/filter_genes.py b/scripts/tcga_boxplot/filter_genes.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+import sys
+
+"""
+Usage: python filter_genes.py <resources/tcga_boxplot/genes_of_interest.txt> <{sample}.genes.results.normalized.header.txt>  > {sample}.genes.results.normalized.genes_filtered.txt 
+Purpose: Changes the ENSEMBL geneID in the file {sample}.genes.results.normalized.header.txt to HUGO symbol.
+The resources/tcga_boxplot/genes_of_interest.txt has the 'HUGO_symbol->ENSEMBL_ID' for our genes of interest
+The 'rule filter_genes' in rules/tcga_boxplot.smk uses this script
+Author: Tinu Thomas
+Date: July, 2019
+"""
+genes={}
+
+# Reading genes of interest file and storing genes in the dictionary 'genes'
+with open(sys.argv[1],'r') as fgene:
+    for line in fgene:
+        line=line.strip().split(',')
+        genes[line[1]]=line[0] # stored HUGO symbol 
+
+# Reading the normalized data file and extracting information for genes of interest
+with open(sys.argv[2],'r') as fnorm:
+    for data in fnorm:
+        data=data.strip().split('\t')
+        if data[0] == 'gene_id':
+            print('\t'.join(data))
+        else:
+            if data[0].split('.')[0] in genes:
+                print('\t'.join([ genes[data[0].split('.')[0]] ] + data[1:] )) # changes the ENSEMBL geneID in the normalized file to HUGO symbol
diff --git a/scripts/tcga_boxplot/quartile_norm.pl b/scripts/tcga_boxplot/quartile_norm.pl
@@ -0,0 +1,102 @@
+#!/usr/bin/env perl
+
+use strict;
+use Getopt::Long;
+
+my $out = '-';
+my $q = 75;
+my @col;
+my @also;
+my $names = 1;
+my $target = 1000;
+my $skip = 0;
+my $min=1;
+GetOptions("quant=i"=>\$q, "target=i"=>\$target, "col=i@"=>\@col, "out=s"=>\$out, "also=i@"=>\@also, "skip=i"=>\$skip, "min=i"=>\$min);
+
+my $in = shift @ARGV;
+
+die usage() unless $in && @col;
+
+open(OUT, ($out eq '-') ? '<&STDOUT' : ">$out") || die "Can't open $out\n";
+open(IN, ($in eq '-') ? '<&STDIN' : $in) || die "Can't open $in\n";
+
+@also = (1) if !@also && !grep {$_ eq '1'} @col;
+
+map {$_--} @col;
+map {$_--} @also;
+
+my @d;
+my $cnt = 0;
+my $head ='';
+while(<IN>) {
+        if ($skip) {
+                --$skip;
+                $head .= $_;
+                next;
+        }
+        chomp;
+        my @f = split /\t/;
+        if ($col[0] eq '-2') {
+                @col = (1..$#f);
+        }
+        for (@col) {
+                push @{$d[$_]}, $f[$_];
+        }
+        for (@also) {
+                push @{$d[$_]}, $f[$_];
+        }
+        ++$cnt;
+}
+for (@col) {
+        my @t = grep {$_>=$min} @{$d[$_]};
+        @t = sort {$a <=> $b} @t;
+        my $t=quantile(\@t, $q/100);
+        for (@{$d[$_]}) {
+                $_= sprintf "%.4f", $target*$_/$t;
+        }
+}
+
+my @out = (sort {$a <=> $b} (@col, @also));
+
+print OUT $head;
+
+for (my $i=0;$i<$cnt;++$i) {
+        for my $j (@out) {
+                print OUT "\t" unless $j == $out[0];
+                print OUT $d[$j][$i];
+        }
+        print OUT "\n";
+}
+
+
+sub usage {
+<<EOF;
+Usage: $0 -c COL [opts] FILE
+
+Returns an upper quartile normalization of data in column(s) COL
+of file FILE.
+
+Col is 1-based, zeroes are ignores when calculating upper quartile
+
+Options:
+   -c|col COL    normalize this column of data (can specify more than once, or -1 for all but first col)
+   -q|quant INT  quantile to use (75)
+   -t|target INT target to use (1000)
+   -a|also COL   output these columns also
+   -o|out FILE   output to this file instead of stdout
+   -m|min INT    minimum value (1)
+   -s|skip INT   skip header rows
+EOF
+}
+
+sub quantile {
+        my ($a,$p) = @_;
+        my $l = scalar(@{$a});
+        my $t = ($l-1)*$p;
+        my $v=$a->[int($t)];
+        if ($t > int($t)) {
+                return $v + $p * ($a->[int($t)+1] - $v);
+        } else {
+                return $v;
+        }
+}
diff --git a/setup.py b/setup.py
@@ -48,7 +48,8 @@
         'scripts/hypoxia/tupro_pipeline_hypoxia.R',
         'scripts/hypoxia/hypoxia_plots.r',
         'scripts/tcga_boxplot/variant_expression_context.TCGA.list_of_genes.R',
-        'scripts/tcga_boxplot/parse_rna_tcga_pat_data.r'
+        'scripts/tcga_boxplot/parse_rna_tcga_pat_data.r',
+        'scripts/tcga_boxplot/quartile_norm.pl'
     ],
     install_requires=requirements,
     license="MIT license",