-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpfp_oabuild.m
167 lines (149 loc) · 5.39 KB
/
pfp_oabuild.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
function [oa] = pfp_oabuild(ont, afile, varargin)
%PFP_OABUILD Ontology annotation build
%
% [oa] = PFP_OABUILD(ont, afile[, afile_2, ...]);
%
%   Builds an ontology annotation structure from data file(s).
%
% Note
% ----
% Annotations with invalid terms will be ignored (a warning reports how many
% term IDs were invalid). Objects that end up with no annotation at all are
% removed from the output.
%
% Input
% -----
% (required)
% [struct]
% ont:      The ontology structure. See pfp_ontbuild.m
%
% [char]
% afile:    An annotation file, which has two columns split by TAB.
%           <object ID> <term ID>
%
% (optional)
% [cell]
% varargin: Additional annotation files (same format as 'afile').
%
% Output
% ------
% [struct]
% oa: The ontology annotation structure, which has the following fields:
%     .object     [cell]    Object ID array, of type "char". ID here could
%                           typically be strings that identify objects in a
%                           database: e.g., UniProt protein accessions.
%     .ontology   [struct]  The ontology structure passed through from the
%                           inputs.
%     .annotation [logical] A (sparse) binary matrix where annotation(i, j)
%                           = true indicates the association between object
%                           i and term j.
%     .date       [char]    The date when it's been built.
%
% Error
% -----
% pfp_oabuild:FileErr  An annotation file doesn't exist (or is a folder).
%
% Dependency
% ----------
% [>] pfp_getterm.m
% [>] pfp_annotprop.m
%
% See Also
% --------
% [>] pfp_ontbuild.m

  % check inputs {{{
  % ont
  validateattributes(ont, {'struct'}, {'nonempty'}, '', 'ont', 1);

  % afile and varargin: every annotation file goes through the same checks
  afiles = [{afile}, varargin];
  for i = 1 : numel(afiles)
    if i == 1
      argname = 'afile';
    else
      argname = 'varargin';
    end
    % file i sits at position (i + 1) of the argument list, so that error
    % messages point at the argument that is actually wrong
    validateattributes(afiles{i}, {'char'}, {'nonempty'}, '', argname, i + 1);
    % exist(..., 'file') is also non-zero for folders, which cannot be read
    % as annotation files, so they are rejected here as well
    if ~exist(afiles{i}, 'file') || exist(afiles{i}, 'dir') == 7
      error('pfp_oabuild:FileErr', 'File [%s] doesn''t exist.', afiles{i});
    end
  end
  % }}}

  % read ontology annotation data file(s) {{{
  plain_oa   = loc_oaread(afiles); % See below for local function definition
  valid_term = pfp_getterm(ont, plain_oa.term_id);

  % invalid terms will have an empty string, '', as its ID (place-holder)
  dummy = cellfun(@isempty, {valid_term.id});
  if any(dummy)
    warning('pfp_oabuild:InvalidID', 'Found [%d] invalid term ID(s).', nnz(dummy));

    % remove those dummy terms.
    valid_term(dummy) = [];

    % update 'plain_oa' accordingly
    plain_oa.term_id(dummy)  = [];
    plain_oa.annot(:, dummy) = [];
  end
  % }}}

  % build oa structure {{{
  oa.object     = plain_oa.obj_id;
  oa.ontology   = ont;
  oa.annotation = logical(sparse(numel(oa.object), numel(ont.term)));

  % map valid_term.id to the ontology term list; those terms have been mapped
  % already above, so all should be found
  % NOTE(review): this relies on pfp_getterm returning IDs that appear in
  % ont.term.id; an ID that is not found would give index 0 and fail below.
  [~, index] = ismember({valid_term.id}, {ont.term.id});

  % alternative term IDs will share the same index, in which case, we have to
  % take the union of annotations on those terms. Thus, the union will be saved
  % as exact copies (columns) corresponding to each of the alternative IDs.
  uindex = unique(index);
  for i = 1 : numel(uindex)
    alt_cols = find(index == uindex(i));
    if numel(alt_cols) > 1
      % take the union and overwrite
      col = any(plain_oa.annot(:, alt_cols), 2);
      plain_oa.annot(:, alt_cols) = repmat(col, 1, numel(alt_cols));
    end
  end
  % columns sharing an index are identical by now, so it does not matter
  % which of them is written last
  oa.annotation(:, index) = plain_oa.annot ~= 0;

  % remove objects with no annotations
  zero_annot = sum(oa.annotation, 2) == 0;
  oa.annotation(zero_annot, :) = [];
  oa.object(zero_annot)        = [];

  % hand the matrix to pfp_annotprop together with the ontology DAG
  % (presumably propagates annotations along the DAG -- see pfp_annotprop.m)
  oa.annotation = pfp_annotprop(oa.ontology.DAG, oa.annotation);
  oa.date       = datestr(now, 'mm/dd/yyyy HH:MM');
  % }}}
end
% function: loc_oaread {{{
function [plain_oa] = loc_oaread(afiles)
% [plain_oa] = LOC_OAREAD(afiles);
%
%   Reads plain ontology annotation file(s) into a plain oa structure.
%   Records from all files are pooled; repeated <object, term> pairs collapse
%   into a single true entry.
%
% Input
% -----
% [cell]
% afiles: plain ontology annotation file names. Each file has two columns
%         separated by TAB:
%         <object ID> <term ID>
%
% Output
% ------
% [struct]
% plain_oa: ontology annotation, which has
%           .obj_id  [cell]    gene product ID, having length n (sorted, unique).
%           .term_id [cell]    GO term ID, having length m (sorted, unique).
%           .annot   [logical] a sparse binary matrix, with size n-by-m.
%
% Error
% -----
% pfp_oabuild:FileErr   A file cannot be opened for reading.
% pfp_oabuild:InputErr  A file yields columns of different lengths.

  gp = {};
  tm = {};
  for i = 1 : numel(afiles)
    fid = fopen(afiles{i}, 'r');
    if fid == -1
      % fopen signals failure with -1; stop here with a clear message
      % instead of letting textscan fail on an invalid identifier
      error('pfp_oabuild:FileErr', 'Cannot open file [%s].', afiles{i});
    end

    % make sure the handle is released even if parsing fails
    try
      data = textscan(fid, '%s%s', 'Delimiter', '\t');
    catch err
      fclose(fid);
      rethrow(err);
    end
    fclose(fid);

    % both columns must pair up one-to-one; otherwise the index vectors
    % handed to sparse() below would differ in length
    if numel(data{1}) ~= numel(data{2})
      error('pfp_oabuild:InputErr', ...
        'File [%s] has an incomplete record: [%d] object ID(s) but [%d] term ID(s).', ...
        afiles{i}, numel(data{1}), numel(data{2}));
    end

    gp = [gp; data{1}]; %#ok<AGROW>
    tm = [tm; data{2}]; %#ok<AGROW>
  end

  % the third output of unique gives, for each record, the position of its
  % ID within the sorted unique list
  [obj_id,  ~, indexO] = unique(gp);
  [term_id, ~, indexT] = unique(tm);

  plain_oa.obj_id  = obj_id;
  plain_oa.term_id = term_id;

  % repeated pairs are summed by sparse(); logical() turns any non-zero
  % count into true
  plain_oa.annot = sparse(indexO, indexT, 1, numel(obj_id), numel(term_id));
  plain_oa.annot = logical(plain_oa.annot);
end
% }}}
% -------------
% Yuxiang Jiang ([email protected])
% Department of Computer Science
% Indiana University Bloomington
% Last modified: Wed 21 Sep 2016 01:04:09 PM E