#!/usr/bin/julia
###############################################################################
#
# Kpax3 - Bayesian cluster analysis of categorical data
#
# Copyright (c) 2016-2018 Alberto Pessia <[email protected]>
#
# Kpax3 is free software: you can redistribute it and/or modify it under the
# terms of the MIT License. Refer to LICENSE.md file for more info.
#
###############################################################################
###############################################################################
#
# INTRODUCTION
#
###############################################################################
#
# Welcome to the Kpax3 tutorial!
#
# In this tutorial you will learn how to:
# - load a multiple sequence alignment (MSA)
# - set up the parameters
# - run the algorithm
# - save the results
# - plot the results
#
# If you didn't do it already, copy this file and the provided dataset to a
# location of your choice.
#
# The tutorial has been designed so that you can simply copy/paste the commands
# into your Julia session. Alternatively, it is also possible to run the script
# directly from the command line by typing:
#
# julia Kpax3_fasta_tutorial.jl
#
# from within the directory where the file is located.
#
###############################################################################
###############################################################################
#
# NOTE FOR WINDOWS USERS
#
###############################################################################
#
# File paths should be written using the slash '/' character instead of the
# usual backslash '\' character. If you want to use the backslash, you have to
# escape it (i.e. use '\\'). Examples of valid file paths:
#
# "C:/Users/me/Documents/file.txt"
# "C:\\Users\\me\\Documents\\file.txt"
#
###############################################################################
###############################################################################
#
# LOAD THE PACKAGE
#
###############################################################################
#
# First of all, we need to load the package "Kpax3". If the package is not
# already installed in your system, please follow the installation instructions
# written in the README.md file.
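#
# As an example (the exact command depends on how the package is distributed,
# so check README.md first), a typical installation from the Julia REPL would
# be:
#
# using Pkg
# Pkg.add("Kpax3")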
#
# To load the package, open a julia session and run the command
using Kpax3;
###############################################################################
#
# INPUT/OUTPUT
#
###############################################################################
#
# The data we will use in this tutorial consist of 75 aligned protein sequences
# of Zika virus (human). Each sequence is 3425 amino acids long, but only 186
# sites are polymorphic. Once converted to binary format, the dataset will
# consist of 75 units and 382 binary variables.
#
# Data was retrieved from NCBI’s Virus Variation Resource database
# https://www.ncbi.nlm.nih.gov/genome/viruses/variation/Zika/
# on 2017-02-10 and aligned with MUSCLE
# http://www.drive5.com/muscle/
#
# Edit the following variable with the path to the dataset
input_file = "path_to_zika_dataset.fasta";
# Edit the following variable with the path to the output file.
# This file will contain the intermediate output generated by Kpax3. Note that
# a file extension is not necessary.
output_file = "path_to_output_file";
###############################################################################
#
# SETUP PARAMETERS
#
###############################################################################
#
# Kpax3 comes with default parameter values that should work in the majority
# of applications. At this point, you can already start analysing the dataset.
# Uncomment the following line if you do not want to customize your Kpax3 run
# and prefer to use the default values:
#settings = KSettings(input_file, output_file);
# -----------------------------------------------------------------------------
# If any setting is omitted, its default value will be used.
#
# - Missing values
# First of all, we need to specify which characters are to be considered
# missing data.
#
# If the dataset is contained in a fasta file (genetic data), then the variable
# is called `miss` and the format is a UInt8 vector representing the
# characters:
# Default value:
# miss = UInt8['?', '*', '#', '-', 'b', 'j', 'x', 'z']
#
# If the dataset is contained in a csv file (generic categorical data), then
# the variable is called `misscsv` and the format is a String vector
# representing the values:
# Default value:
# misscsv = [""]
#
# - Ewens-Pitman distribution for the partition
# alpha is a Float64 variable constrained in (0, 1).
# theta is a Float64 variable constrained in (-alpha, infinity).
#
# Default values:
# alpha = 0.5
# theta = -0.25
#
# - Column status prior probabilities
# This is a non-negative Float64 vector of length 3. If it does not sum to 1,
# it is automatically normalized.
# Probabilities contained in this variable represent an initial guess for the
# proportion of sites possessing the following statuses:
# - uninformative (noise)
# - informative but not characteristic for any cluster (weak signal)
# - informative and characteristic for a cluster (strong signal)
#
# Default value:
# gamma = [0.6; 0.35; 0.05]
#
# - Beta distribution hyper-parameter
# This is a Float64 variable and it is a bit tricky to explain and to properly
# set up. You can do it on your own, of course, but it requires a good
# understanding of the model. Read the Pessia and Corander (2017) paper to
# learn its usage. We suggest leaving it at its default value.
#
# Default value:
# r = log(0.001) / log(0.95)
#
# - General settings
# Boolean variable 'verbose' to print diagnostic messages during the run.
# Integer variable 'verbosestep' to print diagnostic messages every fixed
# number of steps.
#
# Default values:
# verbose = false
# verbosestep = 500
#
# - MCMC settings
# Integer variable 'T' to choose the length of the Markov Chain.
# Integer variable 'burnin' to choose the length of the burnin period.
# Integer variable 'tstep' to tune the thinning of the chain, i.e. samples will
# be collected every 'tstep' iterations.
# Float64 vector 'op' of length 3 with the proportions of the MCMC operators,
# i.e. [split-merge; Gibbs; biased random walk]. To save time, we suggest using
# a very long chain with only split-merge and biased random walk. For
# particular datasets, the Gibbs sampler might be needed to reach convergence.
#
# Default values:
# T = 100000
# burnin = 10000
# tstep = 1
# op = [0.5; 0.0; 0.5]
#
# - Genetic Algorithm settings
# Integer variable 'popsize' to choose the population size.
# Integer variable 'maxiter' to choose the total number of iterations.
# Integer variable 'maxgap' to choose how many iterations without a new
# solution we are willing to wait before stopping the search.
# Float64 variable 'xrate' for crossover rate.
# Float64 variable 'mrate' for mutation rate.
#
# Default values:
# popsize = 20
# maxiter = 20000
# maxgap = 5000
# xrate = 0.9
# mrate = 0.005
# -----------------------------------------------------------------------------
# To save time in this tutorial, we will simulate a short chain. For a real
# data analysis, choose a longer chain in order to reduce the effect of
# autocorrelation between observations.
settings = KSettings(input_file,
                     output_file,
                     miss=UInt8['?', '*', '#', '-', 'b', 'j', 'x', 'z'],
                     misscsv=[""],
                     alpha=0.5,
                     theta=-0.25,
                     gamma=[0.6; 0.35; 0.05],
                     r=log(0.001) / log(0.95),
                     verbose=true,
                     verbosestep=1000,
                     T=50000,
                     burnin=10000,
                     tstep=1,
                     op=[0.5; 0.0; 0.5],
                     popsize=20,
                     maxiter=20000,
                     maxgap=5000,
                     xrate=0.9,
                     mrate=0.005);
###############################################################################
#
# LOAD DATA
#
###############################################################################
#
# Loading data is straightforward.
zika_data = AminoAcidData(settings);
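# If you are curious about the structure of the loaded object, you can inspect
# it with Base Julia alone (uncomment if needed):
#typeof(zika_data)
#fieldnames(typeof(zika_data))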
###############################################################################
#
# INITIAL PARTITION
#
###############################################################################
#
# We need to start our search, or simulation, somewhere. Kpax3 offers two ways
# to input an initial partition of the units:
# 1) Provide a csv file with a known initial partition, for example a
# solution from a previous run;
# 2) Heuristically find a good initial partition.
#
# Here, we will opt for the second option.
#
initial_partition = initializepartition(settings);
# The csv file can be either a single column of integers (cluster indices)
# representing the partition, or a two-column file where the first column
# contains the unit ids and the second column the cluster indices.
#initial_partition = "path_to_initial_partition.csv";
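# As an illustration, a hypothetical two-column csv file could look like:
#
# sequence_1,1
# sequence_2,1
# sequence_3,2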
###############################################################################
#
# MAP ESTIMATE (OPTIMIZATION BY GENETIC ALGORITHM)
#
###############################################################################
#
# We will start by obtaining a point estimate equivalent to the maximum a
# posteriori (MAP) estimate. In general, MAP estimates provide good solutions
# to the problem of clustering genetic sequences. This is the recommended
# approach for very large datasets, or when we are only interested in
# clustering the sequences.
#
# Keep in mind that the solution might be a local optimum: multiple runs of the
# optimization algorithm increase the chance of finding the global optimum (see
# the commented-out sketch after the log-posterior check below).
#
# Fortunately for us, the genetic algorithm implemented by Kpax3 is good enough
# to get very close to the global optimum. Even if the solution is a local
# optimum, it will still be a good approximation to the exact solution.
map_solution = kpax3ga(zika_data, initial_partition, settings);
# Check the value of the log-posterior (plus a constant). Compare different
# solutions by this value and pick the one with the maximum value.
using Printf;
@printf("Log-posterior value (plus a constant): %.4f\n", map_solution.logpp);
###############################################################################
#
# POSTERIOR DISTRIBUTION (MCMC)
#
###############################################################################
#
# To obtain a measure of uncertainty regarding the parameters of interest, we
# need to sample realizations of such parameters from the posterior
# distribution. We will do so by simulating a Markov chain.
#
# NOTE
# This step is going to take a while.
#
# Depending on your dataset, you will need to customize the chain length and
# the operator proportions. Obviously, the longer the chain (number of
# simulations) the better, at the cost of longer simulation times and more disk
# space. If the dataset is too big for the MCMC approach, the MAP estimate (see
# previous section) is the recommended choice.
kpax3mcmc(zika_data, initial_partition, settings);
# Compute the estimate that minimizes the posterior expected loss
mcmc_solution = kpax3estimate(zika_data, settings);
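# As with the MAP estimate, we can check the log-posterior value of the MCMC
# point estimate
@printf("Log-posterior value (plus a constant): %.4f\n", mcmc_solution.logpp);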
# If the MAP estimate is a local optimum, it might happen that the MCMC
# estimate has a higher posterior probability
if mcmc_solution.logpp > map_solution.logpp
  # let's try again starting from a better solution. If no better solution is
  # found, map_solution will be equal to mcmc_solution
  map_solution = kpax3ga(zika_data, mcmc_solution.R, settings);
  nothing;
end
###############################################################################
#
# SAVE RESULTS
#
###############################################################################
#
# We will save both the MAP estimate and the state that minimizes the posterior
# expected loss. In general, the MAP estimate is the better point estimate.
#
# Edit the paths to the output files. No file extension is needed.
map_output_file = "path_to_MAP_estimate_output_file";
mcmc_output_file = "path_to_MCMC_estimate_output_file";
writeresults(zika_data, map_solution, map_output_file, what=4, verbose=true);
writeresults(zika_data, mcmc_solution, mcmc_output_file, what=4, verbose=true);
# Output description:
#
# If what = 1:
#   filename_summary.txt        = Text file with the log-posterior probability.
#                                 This can be used to compare different
#                                 parallel runs.
#   filename_partition.csv      = Best partition found by the algorithm.
#
# If what = 2, it writes all the previous files plus:
#   filename_attributes.csv     = Complete column classification matrix.
#
# If what = 3, it writes all the previous files plus:
#   filename_characteristic.csv = Characteristic sites and their corresponding
#                                 values.
#
# If what = 4, it writes all the previous files plus:
#   filename_dataset.txt        = Original dataset with highlighted clusters
#                                 and their corresponding characteristic
#                                 values.
#
# To immediately check whether the estimate is reliable, open
# "filename_characteristic.csv" and count how many characteristic sites have
# been found. If the ratio
#   (number of characteristic sites) / (total number of SNPs)
# is greater than 0.1 and each cluster has at least one characteristic amino
# acid (the more the better), then the solution can be considered reliable.
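#
# A minimal sketch for computing this ratio programmatically is shown below. It
# is commented out because it makes assumptions about the csv layout (a header
# row and the site index in the first column) and it uses the 186 polymorphic
# sites mentioned in the introduction:
#
#using DelimitedFiles;
#characteristic = readdlm(string(map_output_file, "_characteristic.csv"), ',');
#n_characteristic = length(unique(characteristic[2:end, 1]));
#@printf("Characteristic sites ratio: %.3f\n", n_characteristic / 186);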
#
# Suppose we are not satisfied with the current MAP solution and we think that
# units should be moved around, that a cluster should be split, or that two
# clusters should be merged. This can happen when metadata is available, so
# that we can immediately see if something is not right.
#
# After manually editing the partition file, we can load it again and check
# our hypothesis. No need to run the algorithm again. There is a function
# written specifically for this task.
#
# If needed, uncomment the following commands.
#my_partition = "path_to_manually_edited_partition.csv";
#my_solution = optimumstate(zika_data, my_partition, settings);
#@printf("Log-posterior value (plus a constant): %.4f\n", my_solution.logpp);
#
# You can save this solution the same way we did with MAP and MCMC estimates.
###############################################################################
#
# PLOT RESULTS
#
###############################################################################
#
# In the following, we will use the MAP estimate as the reference partition.
#
# Function signatures are shown in case you want to change the default values.
#
# Choose a plotting backend, such as GR. If Plots and GR are not installed yet,
# install them with
# ]add Plots GR
using Plots;
gr();
# For high quality figures, you can save any of these plots in SVG format with
# the following command:
#
# Plots.savefig(plotfunction(plot_parameters, size=(width, height)),
#               "path_to_figure.svg");
#
# where 'width' and 'height' are the figure dimensions in pixels, to be set
# according to your needs, and 'plotfunction' is the desired plot function.
#
# 'Plots.savefig' chooses the file type automatically from the extension.
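# For example, to save the dataset plot shown below (the output path here is
# just a placeholder), you could write:
#
# p = kpax3plotd(zika_data, map_solution);
# Plots.savefig(p, "path_to_figure.svg");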
# Plot the complete dataset, highlighting clusters and polymorphic sites
# function kpax3plotd(x::AminoAcidData,
#                     state::AminoAcidState;
#                     clusterorder=:auto,
#                     clusterlabel=:auto)
kpax3plotd(zika_data, map_solution)
# Load posterior probabilities:
(k, pk) = readposteriork(output_file);
(id, P) = readposteriorP(output_file);
(site, aa, freq, C) = readposteriorC(output_file);
# Let's check how the Markov Chain performed by looking at trace and jump
# plots. We will do it separately for R (partition of the units) and C
# (partition of the columns).
#
# Compute the entropy of each sample and the average distance of entropies at
# different lags.
(entropy_R, avgd_R) = traceR(output_file, maxlag=50);
(entropy_C, avgd_C) = traceC(output_file, maxlag=50);
# function kpax3plottrace(entropy::Vector{Float64};
#                         maxlag=200,
#                         M=20000,
#                         main="")
kpax3plottrace(entropy_R)
kpax3plottrace(entropy_C)
# function kpax3plotdensity(entropy::Vector{Float64};
#                           maxlag=200,
#                           main="")
kpax3plotdensity(entropy_R)
kpax3plotdensity(entropy_C)
# function kpax3plotjump(avgd::Vector{Float64};
#                        main="")
kpax3plotjump(avgd_R)
kpax3plotjump(avgd_C)
# Posterior distribution of the number of clusters
# function kpax3plotk(k::Vector{Int},
#                     pk::Vector{Float64})
kpax3plotk(k, pk)
# Posterior distribution of adjacency matrix
# function kpax3plotp(R::Vector{Int},
#                     P::Matrix{Float64};
#                     clusterorder=:auto,
#                     clusterlabel=:auto)
kpax3plotp(map_solution.R, P)
# Posterior distribution of feature types (not a very informative plot in this
# case, because there are few polymorphic sites compared to the length of the
# genome)
# function kpax3plotc(site::Vector{Int},
#                     freq::Vector{Float64},
#                     C::Matrix{Float64})
kpax3plotc(site, freq, C)