-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathGenerateSuperpathway.m
180 lines (161 loc) · 7.85 KB
/
GenerateSuperpathway.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
function GenerateSuperpathway(SymbolFile, InteractionFile, PathwayListFile, HUGOFile, ScreenFile, OutputFile)
%Builds a unified pathway model by combining curated (NCI/Nature PID) and
%PPI screen protein interactions. The unified pathway is translated to
%a matrix format generated by 'BuildModel.m' for use with sparse linear
%solvers of underdetermined equations.
%NOTE(review): the original header also named a downstream consumer of the
%generated pathways, but the name was left blank.
%inputs:
%SymbolFile (string) - filename and path to an mSigDB file defining pathways and gene symbols.
%                      Each line is tab-delimited: the first field is the pathway name and
%                      every following non-empty field is treated as a gene symbol.
%                      (see Data folder for example file: PID.MSigDB.gmt.txt).
%InteractionFile (string) - filename and path for an interaction file defining the source and target
%                      genes of protein interactions. The first row is skipped as a header.
%                      Each remaining row is an interaction. Source gene is listed in
%                      column 4, target gene is listed in column 5. Rows where either
%                      gene is 'NOT_SPECIFIED' are ignored.
%                      (see Data folder for example file: PID.edge-attributes.txt).
%PathwayListFile (string) - filename and path to a single-column text file defining the pathways to
%                      include in the aggregate superpathway. Pass an empty value
%                      ([] or '') to include every pathway found in SymbolFile.
%                      (see Data folder for example file: PID.Pathways.txt).
%HUGOFile (string) - filename and path to a HUGO gene set file containing official gene symbols
%                      and past aliases. Example is available here: ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt
%ScreenFile (string) - filename and path to a .mat file defining additional interactions to
%                      include in the superpathway. These are typically obtained from a
%                      protein-protein interaction screen, and will be marked as "novel"
%                      in the superpathway. This file contains two cell arrays of strings:
%                      'ScreenSource' defining HUGO symbols for source genes in novel interactions
%                      and 'ScreenTarget' defining HUGO symbols for target genes in novel interactions.
%                      (see Data folder for example file: ScreenedInteractions.mat).
%OutputFile (string) - filename and path to store superpathway generated by this function.
%                      The output contains the following variables:
%                      Date - date of file generation.
%                      ScreenFile - the ScreenFile argument used for this run.
%                      PathwayNames - a 1xN cell array containing pathway names.
%                      PathwaySymbols - a 1xN cell array containing the sorted symbols of
%                                       each pathway.
%                      PathwayMapping - a cell array with one entry per model interaction,
%                                       listing the indices (into PathwayNames) of the
%                                       pathways that contain both its source and target.
%                      Proteins - cell array of the superpathway symbols that were
%                                 recognized by the HUGO lookup.
%                      Adjacency - superpathway adjacency matrix as returned by BuildModel.
%                      Connectivity - matrix representation of the adjacencies as returned
%                                     by BuildModel.
%                      Novel - indicates which edges in the matrix representation are novel.
%                      Screened - indicates which edges in the matrix representation were
%                                 screened.
%                      Undirected - indicates which edges in the matrix representation are
%                                   undirected (novel edges).
%                      Source, Target - symbols of the source and target protein of each
%                                       model interaction.
%Licensed to the Apache Software Foundation (ASF) under one
%or more contributor license agreements. See the NOTICE file
%distributed with this work for additional information
%regarding copyright ownership. The ASF licenses this file
%to you under the Apache License, Version 2.0 (the
%"License"); you may not use this file except in compliance
%with the License. You may obtain a copy of the License at
%
% http://www.apache.org/licenses/LICENSE-2.0
%
%Unless required by applicable law or agreed to in writing,
%software distributed under the License is distributed on an
%"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%KIND, either express or implied. See the License for the
%specific language governing permissions and limitations
%under the License.

%build HUGO structure
HUGO = ParseHUGO(HUGOFile);

%build NCI pathway member lists
[PathwayNames, PathwaySymbols] = localReadGeneSets(SymbolFile);

%build list of all proteins in superpathway
%An empty PathwayListFile means "use every pathway". The previous test,
%exist('PathwayListFile'), was always true for a supplied argument, so the
%all-pathways branch could never be reached.
Proteins = {};
UsePathwayList = nargin >= 3 && ~isempty(PathwayListFile);
if(UsePathwayList)
    Contents = text2cell(PathwayListFile, '\t');
    Mapping = StringMatch(PathwayNames, Contents);
    for i = 1:length(PathwaySymbols)
        if(~isempty(Mapping{i})) %pathway name appears in the list file
            Proteins = union(Proteins, PathwaySymbols{i});
        end
    end
else
    for i = 1:length(PathwaySymbols)
        Proteins = union(Proteins, PathwaySymbols{i});
    end
end

%map these proteins to HUGO ontology - remove proteins without standard
%symbols
%NOTE(review): if HUGOLookup can map two different inputs to the same symbol,
%Proteins will contain duplicates and the index mappings below will no longer
%be one-to-one - confirm against HUGOLookup.
Proteins = HUGOLookup(Proteins, HUGO, 'Symbol');
Missing = cellfun(@isempty, Proteins);
Proteins(Missing) = [];

%open NCI protein interaction definitions, get sources/targets
Contents = text2cell(InteractionFile, '\t');
Source = Contents(2:end, 4); %row 1 is skipped as a header
Target = Contents(2:end, 5);
Remove = strcmp(Source, 'NOT_SPECIFIED') | strcmp(Target, 'NOT_SPECIFIED');
Source(Remove) = [];
Target(Remove) = [];

%identify interactions where source, target are both in super pathway
SourceMapping = StringMatch(Proteins, Source);
Hits = unique([SourceMapping{:}]); %interactions whose source is in Proteins
TargetMapping = StringMatch(Proteins, Target(Hits));
Keep = Hits([TargetMapping{:}]); %...and whose target is in Proteins too
Source = Source(Keep);
Target = Target(Keep);

%build adjacency matrix
NumProteins = length(Proteins);
Adjacency = zeros(NumProteins);
SourceMapping = StringMatch(Source, Proteins);
TargetMapping = StringMatch(Target, Proteins);
Rows = [SourceMapping{:}];
Cols = [TargetMapping{:}];
if(numel(Rows) ~= numel(Cols))
    %sub2ind would fail here anyway; report the likely cause instead
    error('GenerateSuperpathway:ambiguousMapping', ...
        ['Source and target index lists differ in length (%d vs %d); ' ...
        'a symbol probably matches more than one entry of Proteins.'], ...
        numel(Rows), numel(Cols));
end
Indices = sub2ind([NumProteins NumProteins], Rows, Cols);
Adjacency(Indices) = 1; %1 = curated interaction

%zero diagonal (linear indices 1, N+2, 2N+3, ... address the diagonal)
Adjacency(1:NumProteins+1:end) = 0;

%complement pathways with PPI screen results
%Load into a struct so that variables stored in the .mat file cannot
%overwrite locals of this function (e.g. Proteins or Adjacency).
Screen = load(ScreenFile);
if(~isstruct(Screen) || ~isfield(Screen, 'ScreenSource') || ...
        ~isfield(Screen, 'ScreenTarget'))
    error('GenerateSuperpathway:badScreenFile', ...
        'Screen file "%s" must contain the variables ScreenSource and ScreenTarget.', ...
        ScreenFile);
end
SourceMapping = StringMatch(Screen.ScreenSource, Proteins);
TargetMapping = StringMatch(Screen.ScreenTarget, Proteins);

%identify interactions where source and target lie within pathway
Hits = find(~cellfun(@isempty, SourceMapping) & ...
    ~cellfun(@isempty, TargetMapping));
SourceMapping = cell2mat(SourceMapping(Hits));
TargetMapping = cell2mat(TargetMapping(Hits));

%augment adjacency matrix, indicate if interaction is novel
%entry codes: 1 = curated only, 2 = curated and screened, 3 = novel (screen only)
for k = 1:length(SourceMapping)
    s = SourceMapping(k);
    t = TargetMapping(k);
    if(Adjacency(s, t) == 0 && Adjacency(t, s) == 0) %novel PPI
        %novel edges are written in both directions
        Adjacency(s, t) = 3;
        Adjacency(t, s) = 3;
    end
    if(Adjacency(s, t) == 1) %PPI that was screened but also in canonical pathway
        Adjacency(s, t) = 2;
    end
    if(Adjacency(t, s) == 1) %PPI that was screened but also in canonical pathway
        Adjacency(t, s) = 2;
    end
end

%translate adjacency matrix format to model for linear solver
[Adjacency, Connectivity, Novel, Screened, Source, Target, Undirected] = ...
    BuildModel(Adjacency);
%BuildModel's Source/Target are used as indices into Proteins; convert to symbols
Source = Proteins(Source);
Target = Proteins(Target);

%build sub-pathway lists, indicating which interactions are in which pathways
PathwayMapping = cell(length(Source),1);
for i = 1:length(PathwayNames)
    %map source, target to pathway symbols
    SourceMapping = StringMatch(Source, PathwaySymbols{i});
    TargetMapping = StringMatch(Target, PathwaySymbols{i});
    Hits = find(~cellfun(@isempty, SourceMapping) & ...
        ~cellfun(@isempty, TargetMapping));
    %record matches to pathways
    for j = 1:length(Hits)
        PathwayMapping{Hits(j)} = [PathwayMapping{Hits(j)} i];
    end
end

%save pathway
Date = date;
save(OutputFile, 'Date', 'ScreenFile', 'PathwayNames', 'PathwaySymbols',...
    'PathwayMapping', 'Adjacency', 'Connectivity', 'Novel', 'Screened',...
    'Proteins', 'Source', 'Target', 'Undirected');


function [Names, Symbols] = localReadGeneSets(GeneSetFile)
%Reads a tab-delimited gene set file into set names and sorted member lists.
%Names - 1xN cell array holding the first field of each non-blank line.
%Symbols - 1xN cell array; entry k holds the sorted, non-empty remaining
%          fields of line k.
%NOTE(review): standard GMT files carry a description/URL in the second
%field; every field after the first is treated as a symbol here - confirm
%against the input file.
[fid, msg] = fopen(GeneSetFile, 'r');
if(fid == -1)
    error('GenerateSuperpathway:cannotOpenFile', ...
        'Unable to open symbol file "%s": %s', GeneSetFile, msg);
end
closeFile = onCleanup(@() fclose(fid)); %#ok<NASGU> closes the file even on error
Names = {};
Symbols = {};
while(~feof(fid))
    Line = fgetl(fid);
    %fgetl returns -1 at end of file; blank lines have no pathway name
    if(~ischar(Line) || isempty(strtrim(Line)))
        continue;
    end
    Fields = textscan(Line, '%s', 'Delimiter', '\t', 'EndOfLine', '\n');
    Fields = Fields{1};
    Names{end+1} = Fields{1}; %#ok<AGROW>
    Members = Fields(2:end);
    Symbols{end+1} = sort(Members(~cellfun(@isempty, Members))); %#ok<AGROW>
end