From 27a6d706a24962f0cc8d5ce011aa4b53c3c7f60d Mon Sep 17 00:00:00 2001
From: Ben Fulcher <ben.d.fulcher@gmail.com>
Date: Fri, 23 Oct 2020 17:59:49 +1100
Subject: [PATCH] Streamlining

---
 EnsembleEnrichment/ComputeAllCategoryNulls.m | 68 +++++++++++++-------
 EnsembleEnrichment/EnsembleEnrichment.m      | 11 +++-
 GetFilteredGOData.m                          | 19 ++++--
 3 files changed, 66 insertions(+), 32 deletions(-)

diff --git a/EnsembleEnrichment/ComputeAllCategoryNulls.m b/EnsembleEnrichment/ComputeAllCategoryNulls.m
index 5fa21e5..7a14590 100644
--- a/EnsembleEnrichment/ComputeAllCategoryNulls.m
+++ b/EnsembleEnrichment/ComputeAllCategoryNulls.m
@@ -53,7 +53,7 @@
 numNullSamples = enrichmentParams.numNullSamples;
 
 %-------------------------------------------------------------------------------
-% Get random vectors from real genes to use as null spatial maps:
+% Get randomized phenotypes to use as your null:
 switch enrichmentParams.whatEnsemble
 case 'customSpecified'
     % Specify a custom phenotype and just run the full calculation on this:
@@ -78,44 +78,62 @@
 end
 
 %-------------------------------------------------------------------------------
-% Correlation of genes with a given spatial map (or null ensemble of spatial maps):
+% Prepare for category-wise agglomeration by first computing the correlation of 
+% each gene with a given spatial map (or a null ensemble of spatial maps)
+allAnnotatedGenesEntrez = unique(vertcat(GOTable.annotations{:}));
+numAnnotatedTotal = length(allAnnotatedGenesEntrez);
+allAnnotatedGenesEntrez = intersect(allAnnotatedGenesEntrez,entrezIDs);
+numAnnotatedGenes = length(allAnnotatedGenesEntrez);
+fprintf(1,'Of %u annotated genes, %u match genes we have expression data for\n',...
+                    numAnnotatedTotal,numAnnotatedGenes);
+
+geneScores = nan(numAnnotatedGenes,numNullSamples);
+fprintf(1,'Computing null distributions to %u null phenotypes for all %u genes annotated to GO categories\n',...
+                            numNullSamples,numAnnotatedGenes);
+for g = 1:numAnnotatedGenes
+    % Get this gene's expression vector:
+    matchMe = (entrezIDs==allAnnotatedGenesEntrez(g));
+    if sum(matchMe)~=1
+        fprintf(1,'How the heck did I get %u matches for gene entrez %u?!\n',...
+                            sum(matchMe),allAnnotatedGenesEntrez(g));
+    end
+    expressionVector = geneData(:,matchMe);
+
+    % The correlation to compute for this gene:
+    theCorr_fun = @(x) corr(x,expressionVector,'type',whatCorr,'rows','pairwise');
+    if numNullSamples==1
+        geneScores(g) = theCorr_fun(nullMaps(:,1));
+    else
+        parfor n = 1:numNullSamples
+            geneScores(g,n) = theCorr_fun(nullMaps(:,n));
+        end
+    end
+end
+
+%-------------------------------------------------------------------------------
+% Agglomerate gene scores by GO category:
 categoryScores = cell(numGOCategories,1);
 for i = 1:numGOCategories
     if beVerbose
-        fprintf(1,'\n\n\nCategory %u/%u\n',i,numGOCategories);
-
-        fprintf(1,'Looking in at %s:%s (%u)\n',GOTable.GOIDlabel{i},...
-                            GOTable.GOName{i},GOTable.size(i));
+        fprintf(1,'\n\n\nLooking in at Category %u/%u. %s:%s (%u)\n',...
+            i,numGOCategories,GOTable.GOIDlabel{i},GOTable.GOName{i},GOTable.size(i));
     end
 
     % Match genes for this category:
-    theGenesEntrez = GOTable.annotations{i};
-    matchMe = find(ismember(entrezIDs,theGenesEntrez));
-    geneDataCategory = geneData(:,matchMe);
+    categoryEntrez = GOTable.annotations{i};
+    matchMe = find(ismember(allAnnotatedGenesEntrez,categoryEntrez));
     numGenesCategory = length(matchMe);
 
     if beVerbose
         fprintf(1,'%u/%u genes from this GO category have matching records in the expression data\n',...
-                            length(matchMe),length(theGenesEntrez));
+                    length(matchMe),length(theGenesEntrez));
     end
 
-    % Compute the distribution of gene-category scores for correlation with the null maps:
-    scoresHere = nan(numGenesCategory,numNullSamples);
-    for k = 1:numGenesCategory
-        expressionVector = geneDataCategory(:,k);
-        % The correlation to compute for this gene:
-        theCorr_fun = @(x) corr(x,expressionVector,'type',whatCorr,'rows','pairwise');
-        if numNullSamples==1
-            scoresHere(k) = theCorr_fun(nullMaps(:,1));
-        else
-            parfor j = 1:numNullSamples
-                scoresHere(k,j) = theCorr_fun(nullMaps(:,j));
-            end
-        end
-    end
+    % Retrieve the distribution of gene scores across phenotypes:
+    scoresHere = geneScores(matchMe,:);
 
     %---------------------------------------------------------------------------
-    % Aggregate gene-wise scores into an overall GO category score
+    % Aggregate gene-wise scores into an overall GO category score (for each phenotype)
     switch enrichmentParams.aggregateHow
     case 'mean'
         categoryScores{i} = nanmean(scoresHere,1);
diff --git a/EnsembleEnrichment/EnsembleEnrichment.m b/EnsembleEnrichment/EnsembleEnrichment.m
index bf90783..e1ea73b 100644
--- a/EnsembleEnrichment/EnsembleEnrichment.m
+++ b/EnsembleEnrichment/EnsembleEnrichment.m
@@ -10,7 +10,7 @@
 % - fileNullEnsembleResults: file where the precomputed nulls are stored
 % - phenotypeVector: a vector of the spatial phenotype map to be tested
 %
-% The params are taken from the results of the null ensemble enrichment file
+% The parameters are taken from the null ensemble enrichment file (params)
 %
 %---OUTPUT:
 % - GOTablePhenotype: a table with p-values estimated from the null ensemble
@@ -19,9 +19,12 @@
 % Process inputs and set defaults:
 %-------------------------------------------------------------------------------
 if nargin < 1
-    error('You must specify a file containing the precomputed ensemble nulls');
+    error('You must specify a file containing the gene data');
 end
 if nargin < 2
+    error('You must specify a file containing the precomputed ensemble nulls');
+end
+if nargin < 3
     error('You must provide a phenotype vector');
 end
 %-------------------------------------------------------------------------------
@@ -51,12 +54,14 @@
 %-------------------------------------------------------------------------------
 GOTablePhenotype = EstimatePVals(GOTableNull.categoryScores,...
                         [GOTablePhenotype.categoryScores{:}],'right',GOTablePhenotype);
+
+% Sort by Gaussian-approximation p-values:
 GOTablePhenotype = sortrows(GOTablePhenotype,'pValZ','ascend');
 
 %-------------------------------------------------------------------------------
 % Give a basic output about significance using pValZCorr:
 sigThresh = 0.05;
 numSig = sum(GOTablePhenotype.pValZCorr < sigThresh);
-fprintf(1,'%u significant categories at pZ_corr < %.2f\n',numSig,sigThresh);
+fprintf(1,'%u significant categories at p_Z_corr < %.2f\n',numSig,sigThresh);
 
 end
diff --git a/GetFilteredGOData.m b/GetFilteredGOData.m
index 1ad9b9c..3ce6829 100644
--- a/GetFilteredGOData.m
+++ b/GetFilteredGOData.m
@@ -6,7 +6,6 @@
 % Check Inputs:
 if nargin < 1
     whatSource = 'mouse-direct'; % Direct annotations from GO
-    % whatSource = 'mouse-GEMMA'; % Annotations derived from GEMMA
 end
 if nargin < 2
     whatFilter = 'biological_process';
@@ -20,7 +19,7 @@
 
 %-------------------------------------------------------------------------------
 % Load processed GO annotation data (i.e., direct annotations propagated up the hierarchy):
-% cf. propagateHierarchy to map files generated from ReadDirectAnnotationFile or ReadGEMMAAnnotationFile
+% cf. propagateHierarchy to map files generated from ReadDirectAnnotationFile (or ReadGEMMAAnnotationFile)
 switch whatSource
 case 'mouse-direct'
     fileNameLoad = sprintf('GOAnnotationDirect-mouse-%s-Prop.mat',whatFilter);
@@ -31,6 +30,7 @@
 otherwise
     error('Unknown annotation source: ''%s''',whatSource);
 end
+
 load(fileNameLoad,'GOTerms');
 fprintf(1,'Loaded annotated GO Terms from %s\n',fileNameLoad);
 
@@ -39,10 +39,11 @@
 GOTermsFile = 'GOTerms_BP.mat';
 if strcmp(whatFilter,'biological_process') && exist(GOTermsFile,'file')
     load(GOTermsFile,'GOTable');
-    fprintf(1,'Loaded biological process GO terms from %s\n',GOTermsFile);
+    fprintf(1,'Loaded biological process GO terms from file: %s\n',GOTermsFile);
 else
     % Retrieve from mySQL database:
     GOTable = GetGOTerms(whatFilter);
+    fprintf(1,'Loaded biological process GO terms from SQL database\n');
 end
 
 %-------------------------------------------------------------------------------
@@ -57,7 +58,7 @@
 % Filter by category size:
 numGOCategories = height(GOTerms);
 if isempty(ourEntrez)
-    fprintf(1,'Filtering on actual annotated size of GO category\n');
+    fprintf(1,'Filtering GO categories on their number of gene annotations\n');
     sizeGOCategories = GOTerms.size;
     fprintf(1,'%u GO categories have no annotations :-/\n',...
                     sum(sizeGOCategories==0));
@@ -81,4 +82,14 @@
 fprintf(1,'Filtered to %u GO categories with between %u and %u annotations\n',...
                 height(GOTable),sizeFilter(1),sizeFilter(2));
 
+
+%-------------------------------------------------------------------------------
+% Check that all annotations in GOTable are column vectors:
+isRowVector = find(cellfun(@(x)size(x,2),GOTable.annotations)~=1);
+numRowVectorAnnotation = length(isRowVector);
+for i = 1:numRowVectorAnnotation
+    GOTable.annotations{isRowVector(i)} = GOTable.annotations{isRowVector(i)}';
+end
+fprintf(1,'Transposed %u category annotation vectors from row-vector -> column vector\n',numRowVectorAnnotation);
+
 end