From 27a6d706a24962f0cc8d5ce011aa4b53c3c7f60d Mon Sep 17 00:00:00 2001 From: Ben Fulcher Date: Fri, 23 Oct 2020 17:59:49 +1100 Subject: [PATCH] Streamlining --- EnsembleEnrichment/ComputeAllCategoryNulls.m | 68 +++++++++++++------- EnsembleEnrichment/EnsembleEnrichment.m | 11 +++- GetFilteredGOData.m | 19 ++++-- 3 files changed, 66 insertions(+), 32 deletions(-) diff --git a/EnsembleEnrichment/ComputeAllCategoryNulls.m b/EnsembleEnrichment/ComputeAllCategoryNulls.m index 5fa21e5..7a14590 100644 --- a/EnsembleEnrichment/ComputeAllCategoryNulls.m +++ b/EnsembleEnrichment/ComputeAllCategoryNulls.m @@ -53,7 +53,7 @@ numNullSamples = enrichmentParams.numNullSamples; %------------------------------------------------------------------------------- -% Get random vectors from real genes to use as null spatial maps: +% Get randomized phenotypes to use as your null: switch enrichmentParams.whatEnsemble case 'customSpecified' % Specify a custom phenotype and just run the full calculation on this: @@ -78,44 +78,62 @@ end %------------------------------------------------------------------------------- -% Correlation of genes with a given spatial map (or null ensemble of spatial maps): +% Prepare for category-wise agglomeration by first computing the correlation of +% each gene with a given spatial map (or a null ensemble of spatial maps) +allAnnotatedGenesEntrez = unique(vertcat(GOTable.annotations{:})); +numAnnotatedTotal = length(allAnnotatedGenesEntrez); +allAnnotatedGenesEntrez = intersect(allAnnotatedGenesEntrez,entrezIDs); +numAnnotatedGenes = length(allAnnotatedGenesEntrez); +fprintf(1,'Of %u annotated genes, %u match genes we have expression data for\n',... + numAnnotatedTotal,numAnnotatedGenes); + +geneScores = nan(numAnnotatedGenes,numNullSamples); +fprintf(1,'Computing null distributions to %u null phenotypes for all %u genes annotated to GO categories\n',... + numNullSamples,numAnnotatedGenes); +for g = 1:numAnnotatedGenes + % Get this gene's expression vector: + matchMe = (entrezIDs==allAnnotatedGenesEntrez(g)); + if sum(matchMe)~=1 + fprintf(1,'How the heck did I get %u matches for gene entrez %u?!\n',... + sum(matchMe),allAnnotatedGenesEntrez(g)); + end + expressionVector = geneData(:,matchMe); + + % The correlation to compute for this gene: + theCorr_fun = @(x) corr(x,expressionVector,'type',whatCorr,'rows','pairwise'); + if numNullSamples==1 + geneScores(g) = theCorr_fun(nullMaps(:,1)); + else + parfor n = 1:numNullSamples + geneScores(g,n) = theCorr_fun(nullMaps(:,n)); + end + end +end + +%------------------------------------------------------------------------------- +% Agglomerate gene scores by GO category: categoryScores = cell(numGOCategories,1); for i = 1:numGOCategories if beVerbose - fprintf(1,'\n\n\nCategory %u/%u\n',i,numGOCategories); - - fprintf(1,'Looking in at %s:%s (%u)\n',GOTable.GOIDlabel{i},... - GOTable.GOName{i},GOTable.size(i)); + fprintf(1,'\n\n\nLooking in at Category %u/%u. %s:%s (%u)\n',... + i,numGOCategories,GOTable.GOIDlabel{i},GOTable.GOName{i},GOTable.size(i)); end % Match genes for this category: - theGenesEntrez = GOTable.annotations{i}; - matchMe = find(ismember(entrezIDs,theGenesEntrez)); - geneDataCategory = geneData(:,matchMe); + categoryEntrez = GOTable.annotations{i}; + matchMe = find(ismember(allAnnotatedGenesEntrez,categoryEntrez)); numGenesCategory = length(matchMe); if beVerbose fprintf(1,'%u/%u genes from this GO category have matching records in the expression data\n',... - length(matchMe),length(theGenesEntrez)); + length(matchMe),length(theGenesEntrez)); end - % Compute the distribution of gene-category scores for correlation with the null maps: - scoresHere = nan(numGenesCategory,numNullSamples); - for k = 1:numGenesCategory - expressionVector = geneDataCategory(:,k); - % The correlation to compute for this gene: - theCorr_fun = @(x) corr(x,expressionVector,'type',whatCorr,'rows','pairwise'); - if numNullSamples==1 - scoresHere(k) = theCorr_fun(nullMaps(:,1)); - else - parfor j = 1:numNullSamples - scoresHere(k,j) = theCorr_fun(nullMaps(:,j)); - end - end - end + % Retrieve the distribution of gene scores across phenotypes: + scoresHere = geneScores(matchMe,:); %--------------------------------------------------------------------------- - % Aggregate gene-wise scores into an overall GO category score + % Aggregate gene-wise scores into an overall GO category score (for each phenotype) switch enrichmentParams.aggregateHow case 'mean' categoryScores{i} = nanmean(scoresHere,1); diff --git a/EnsembleEnrichment/EnsembleEnrichment.m b/EnsembleEnrichment/EnsembleEnrichment.m index bf90783..e1ea73b 100644 --- a/EnsembleEnrichment/EnsembleEnrichment.m +++ b/EnsembleEnrichment/EnsembleEnrichment.m @@ -10,7 +10,7 @@ % - fileNullEnsembleResults: file where the precomputed nulls are stored % - phenotypeVector: a vector of the spatial phenotype map to be tested % -% The params are taken from the results of the null ensemble enrichment file +% The parameters are taken from the null ensemble enrichment file (params) % %---OUTPUT: % - GOTablePhenotype: a table with p-values estimated from the null ensemble @@ -19,9 +19,12 @@ % Process inputs and set defaults: %------------------------------------------------------------------------------- if nargin < 1 - error('You must specify a file containing the precomputed ensemble nulls'); + error('You must specify a file containing the gene data'); end if nargin < 2 + error('You must specify a file containing the precomputed ensemble nulls'); +end +if nargin < 3 error('You must provide a phenotype vector'); end %------------------------------------------------------------------------------- @@ -51,12 +54,14 @@ %------------------------------------------------------------------------------- GOTablePhenotype = EstimatePVals(GOTableNull.categoryScores,... [GOTablePhenotype.categoryScores{:}],'right',GOTablePhenotype); + +% Sort by Gaussian-approximation p-values: GOTablePhenotype = sortrows(GOTablePhenotype,'pValZ','ascend'); %------------------------------------------------------------------------------- % Give a basic output about significance using pValZCorr: sigThresh = 0.05; numSig = sum(GOTablePhenotype.pValZCorr < sigThresh); -fprintf(1,'%u significant categories at pZ_corr < %.2f\n',numSig,sigThresh); +fprintf(1,'%u significant categories at p_Z_corr < %.2f\n',numSig,sigThresh); end diff --git a/GetFilteredGOData.m b/GetFilteredGOData.m index 1ad9b9c..3ce6829 100644 --- a/GetFilteredGOData.m +++ b/GetFilteredGOData.m @@ -6,7 +6,6 @@ % Check Inputs: if nargin < 1 whatSource = 'mouse-direct'; % Direct annotations from GO - % whatSource = 'mouse-GEMMA'; % Annotations derived from GEMMA end if nargin < 2 whatFilter = 'biological_process'; @@ -20,7 +19,7 @@ %------------------------------------------------------------------------------- % Load processed GO annotation data (i.e., direct annotations propagated up the hierarchy): -% cf. propagateHierarchy to map files generated from ReadDirectAnnotationFile or ReadGEMMAAnnotationFile +% cf. propagateHierarchy to map files generated from ReadDirectAnnotationFile (or ReadGEMMAAnnotationFile) switch whatSource case 'mouse-direct' fileNameLoad = sprintf('GOAnnotationDirect-mouse-%s-Prop.mat',whatFilter); @@ -31,6 +30,7 @@ otherwise error('Unknown annotation source: ''%s''',whatSource); end + load(fileNameLoad,'GOTerms'); fprintf(1,'Loaded annotated GO Terms from %s\n',fileNameLoad); @@ -39,10 +39,11 @@ GOTermsFile = 'GOTerms_BP.mat'; if strcmp(whatFilter,'biological_process') && exist(GOTermsFile,'file') load(GOTermsFile,'GOTable'); - fprintf(1,'Loaded biological process GO terms from %s\n',GOTermsFile); + fprintf(1,'Loaded biological process GO terms from file: %s\n',GOTermsFile); else % Retrieve from mySQL database: GOTable = GetGOTerms(whatFilter); + fprintf(1,'Loaded biological process GO terms from SQL database\n'); end %------------------------------------------------------------------------------- @@ -57,7 +58,7 @@ % Filter by category size: numGOCategories = height(GOTerms); if isempty(ourEntrez) - fprintf(1,'Filtering on actual annotated size of GO category\n'); + fprintf(1,'Filtering GO categories on their number of gene annotations\n'); sizeGOCategories = GOTerms.size; fprintf(1,'%u GO categories have no annotations :-/\n',... sum(sizeGOCategories==0)); @@ -81,4 +82,14 @@ fprintf(1,'Filtered to %u GO categories with between %u and %u annotations\n',... height(GOTable),sizeFilter(1),sizeFilter(2)); + +%------------------------------------------------------------------------------- +% Check that all annotations in GOTable are column vectors: +isRowVector = find(cellfun(@(x)size(x,2),GOTable.annotations)~=1); +numRowVectorAnnotation = length(isRowVector); +for i = 1:numRowVectorAnnotation + GOTable.annotations{isRowVector(i)} = GOTable.annotations{isRowVector(i)}'; +end +fprintf(1,'Transposed %u category annotation vectors from row-vector -> column vector\n',numRowVectorAnnotation); + end