Skip to content

Commit

Permalink
Streamlining
Browse files Browse the repository at this point in the history
  • Loading branch information
benfulcher committed Oct 23, 2020
1 parent bc33737 commit 27a6d70
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 32 deletions.
68 changes: 43 additions & 25 deletions EnsembleEnrichment/ComputeAllCategoryNulls.m
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
numNullSamples = enrichmentParams.numNullSamples;

%-------------------------------------------------------------------------------
% Get random vectors from real genes to use as null spatial maps:
% Get randomized phenotypes to use as your null:
switch enrichmentParams.whatEnsemble
case 'customSpecified'
% Specify a custom phenotype and just run the full calculation on this:
Expand All @@ -78,44 +78,62 @@
end

%-------------------------------------------------------------------------------
% Correlation of genes with a given spatial map (or null ensemble of spatial maps):
% Prepare for category-wise agglomeration by first computing the correlation of
% each gene with a given spatial map (or a null ensemble of spatial maps)
allAnnotatedGenesEntrez = unique(vertcat(GOTable.annotations{:}));
numAnnotatedTotal = length(allAnnotatedGenesEntrez);
allAnnotatedGenesEntrez = intersect(allAnnotatedGenesEntrez,entrezIDs);
numAnnotatedGenes = length(allAnnotatedGenesEntrez);
fprintf(1,'Of %u annotated genes, %u match genes we have expression data for\n',...
numAnnotatedTotal,numAnnotatedGenes);

geneScores = nan(numAnnotatedGenes,numNullSamples);
fprintf(1,'Computing null distributions to %u null phenotypes for all %u genes annotated to GO categories\n',...
numNullSamples,numAnnotatedGenes);
for g = 1:numAnnotatedGenes
% Get this gene's expression vector:
matchMe = (entrezIDs==allAnnotatedGenesEntrez(g));
if sum(matchMe)~=1
fprintf(1,'How the heck did I get %u matches for gene entrez %u?!\n',...
sum(matchMe),allAnnotatedGenesEntrez(g));
end
expressionVector = geneData(:,matchMe);

% The correlation to compute for this gene:
theCorr_fun = @(x) corr(x,expressionVector,'type',whatCorr,'rows','pairwise');
if numNullSamples==1
geneScores(g) = theCorr_fun(nullMaps(:,1));
else
parfor n = 1:numNullSamples
geneScores(g,n) = theCorr_fun(nullMaps(:,n));
end
end
end

%-------------------------------------------------------------------------------
% Agglomerate gene scores by GO category:
categoryScores = cell(numGOCategories,1);
for i = 1:numGOCategories
if beVerbose
fprintf(1,'\n\n\nCategory %u/%u\n',i,numGOCategories);

fprintf(1,'Looking in at %s:%s (%u)\n',GOTable.GOIDlabel{i},...
GOTable.GOName{i},GOTable.size(i));
fprintf(1,'\n\n\nLooking in at Category %u/%u. %s:%s (%u)\n',...
i,numGOCategories,GOTable.GOIDlabel{i},GOTable.GOName{i},GOTable.size(i));
end

% Match genes for this category:
theGenesEntrez = GOTable.annotations{i};
matchMe = find(ismember(entrezIDs,theGenesEntrez));
geneDataCategory = geneData(:,matchMe);
categoryEntrez = GOTable.annotations{i};
matchMe = find(ismember(allAnnotatedGenesEntrez,categoryEntrez));
numGenesCategory = length(matchMe);

if beVerbose
fprintf(1,'%u/%u genes from this GO category have matching records in the expression data\n',...
length(matchMe),length(theGenesEntrez));
length(matchMe),length(theGenesEntrez));
end

% Compute the distribution of gene-category scores for correlation with the null maps:
scoresHere = nan(numGenesCategory,numNullSamples);
for k = 1:numGenesCategory
expressionVector = geneDataCategory(:,k);
% The correlation to compute for this gene:
theCorr_fun = @(x) corr(x,expressionVector,'type',whatCorr,'rows','pairwise');
if numNullSamples==1
scoresHere(k) = theCorr_fun(nullMaps(:,1));
else
parfor j = 1:numNullSamples
scoresHere(k,j) = theCorr_fun(nullMaps(:,j));
end
end
end
% Retrieve the distribution of gene scores across phenotypes:
scoresHere = geneScores(matchMe,:);

%---------------------------------------------------------------------------
% Aggregate gene-wise scores into an overall GO category score
% Aggregate gene-wise scores into an overall GO category score (for each phenotype)
switch enrichmentParams.aggregateHow
case 'mean'
categoryScores{i} = nanmean(scoresHere,1);
Expand Down
11 changes: 8 additions & 3 deletions EnsembleEnrichment/EnsembleEnrichment.m
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
% - fileNullEnsembleResults: file where the precomputed nulls are stored
% - phenotypeVector: a vector of the spatial phenotype map to be tested
%
% The params are taken from the results of the null ensemble enrichment file
% The parameters are taken from the null ensemble enrichment file (params)
%
%---OUTPUT:
% - GOTablePhenotype: a table with p-values estimated from the null ensemble
Expand All @@ -19,9 +19,12 @@
% Process inputs and set defaults:
%-------------------------------------------------------------------------------
if nargin < 1
error('You must specify a file containing the precomputed ensemble nulls');
error('You must specify a file containing the gene data');
end
if nargin < 2
error('You must specify a file containing the precomputed ensemble nulls');
end
if nargin < 3
error('You must provide a phenotype vector');
end
%-------------------------------------------------------------------------------
Expand Down Expand Up @@ -51,12 +54,14 @@
%-------------------------------------------------------------------------------
GOTablePhenotype = EstimatePVals(GOTableNull.categoryScores,...
[GOTablePhenotype.categoryScores{:}],'right',GOTablePhenotype);

% Sort by Gaussian-approximation p-values:
GOTablePhenotype = sortrows(GOTablePhenotype,'pValZ','ascend');

%-------------------------------------------------------------------------------
% Give a basic output about significance using pValZCorr:
sigThresh = 0.05;
numSig = sum(GOTablePhenotype.pValZCorr < sigThresh);
fprintf(1,'%u significant categories at pZ_corr < %.2f\n',numSig,sigThresh);
fprintf(1,'%u significant categories at p_Z_corr < %.2f\n',numSig,sigThresh);

end
19 changes: 15 additions & 4 deletions GetFilteredGOData.m
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
% Check Inputs:
if nargin < 1
whatSource = 'mouse-direct'; % Direct annotations from GO
% whatSource = 'mouse-GEMMA'; % Annotations derived from GEMMA
end
if nargin < 2
whatFilter = 'biological_process';
Expand All @@ -20,7 +19,7 @@

%-------------------------------------------------------------------------------
% Load processed GO annotation data (i.e., direct annotations propagated up the hierarchy):
% cf. propagateHierarchy to map files generated from ReadDirectAnnotationFile or ReadGEMMAAnnotationFile
% cf. propagateHierarchy to map files generated from ReadDirectAnnotationFile (or ReadGEMMAAnnotationFile)
switch whatSource
case 'mouse-direct'
fileNameLoad = sprintf('GOAnnotationDirect-mouse-%s-Prop.mat',whatFilter);
Expand All @@ -31,6 +30,7 @@
otherwise
error('Unknown annotation source: ''%s''',whatSource);
end

load(fileNameLoad,'GOTerms');
fprintf(1,'Loaded annotated GO Terms from %s\n',fileNameLoad);

Expand All @@ -39,10 +39,11 @@
GOTermsFile = 'GOTerms_BP.mat';
if strcmp(whatFilter,'biological_process') && exist(GOTermsFile,'file')
load(GOTermsFile,'GOTable');
fprintf(1,'Loaded biological process GO terms from %s\n',GOTermsFile);
fprintf(1,'Loaded biological process GO terms from file: %s\n',GOTermsFile);
else
% Retrieve from mySQL database:
GOTable = GetGOTerms(whatFilter);
fprintf(1,'Loaded biological process GO terms from SQL database\n');
end

%-------------------------------------------------------------------------------
Expand All @@ -57,7 +58,7 @@
% Filter by category size:
numGOCategories = height(GOTerms);
if isempty(ourEntrez)
fprintf(1,'Filtering on actual annotated size of GO category\n');
fprintf(1,'Filtering GO categories on their number of gene annotations\n');
sizeGOCategories = GOTerms.size;
fprintf(1,'%u GO categories have no annotations :-/\n',...
sum(sizeGOCategories==0));
Expand All @@ -81,4 +82,14 @@
fprintf(1,'Filtered to %u GO categories with between %u and %u annotations\n',...
height(GOTable),sizeFilter(1),sizeFilter(2));


%-------------------------------------------------------------------------------
% Check that all annotations in GOTable are column vectors:
isRowVector = find(cellfun(@(x)size(x,2),GOTable.annotations)~=1);
numRowVectorAnnotation = length(isRowVector);
for i = 1:numRowVectorAnnotation
GOTable.annotations{isRowVector(i)} = GOTable.annotations{isRowVector(i)}';
end
fprintf(1,'Transposed %u category annotation vectors from row-vector -> column vector\n',numRowVectorAnnotation);

end

0 comments on commit 27a6d70

Please sign in to comment.