Merge branch 'julia_upgrade_1.0'

albertopessia · Aug 23, 2018 · 96c386e · 96c386e
2 parents 752d884 + 2cfb303
commit 96c386e
Show file tree

Hide file tree

Showing 50 changed files with 1,329 additions and 1,220 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,18 +1,25 @@
 language: julia
+
 os:
   - linux
   - osx
+
 julia:
-  - 0.6
+  - 1.0
+
 addons:
   apt:
     packages:
-    - hdf5-tools
+      - hdf5-tools
+      - qt5-default
+
 notifications:
   email: false
+
 script:
   - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
-  - julia -e 'Pkg.clone(pwd()); Pkg.build("Kpax3");'
-  - julia --check-bounds=yes -e 'Pkg.test("Kpax3", coverage=true);'
+  - julia -e 'import Pkg; Pkg.add(Pkg.PackageSpec(path=pwd())); Pkg.build("Kpax3");'
+  - julia --check-bounds=yes -e 'import Pkg; Pkg.test("Kpax3", coverage=true);'
+
 after_success:
-  - julia -e 'cd(Pkg.dir("Kpax3")); Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder());'
+  - julia -e 'import Pkg; import Kpax3; cd(joinpath(dirname(pathof(Kpax3)), "..")); Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder());'
diff --git a/LICENSE.md b/LICENSE.md
@@ -1,7 +1,7 @@
 The MIT License (MIT)
 =====================
 
-Copyright (c) 2016-2017 Alberto Pessia
+Copyright (c) 2016-2018 Alberto Pessia
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in

diff --git a/NEWS.md b/NEWS.md
@@ -1,12 +1,19 @@
 # Kpax3.jl Release Notes
 
+## Changes from v0.3.0 to v0.4.0
+
+* Upgrade to Julia 1.0
+* Switched plotting library from `Gadfly` to `Plots`
+
 ## Changes from v0.2.0 to v0.3.0
+
 * Added function plotD for plotting the dataset with highlighted amino acids
 * Modified function plotP for reducing its output and memory size
 * Added axis labels to plotP
 * Updated the tutorial with the new plotD function
 * Created a convenient command line script for running Kpax3. You can find it on [GitHub Gist](https://gist.github.com/albertopessia/fd9df11fb2bdb158ad91936c4638d6fd)
 
 ## Changes from v0.1.0 to v0.2.0
-* Updated code for Julia 0.6
+
+* Upgrade to Julia 0.6
 * It is now possible to load generic categorical data from _csv_ files with the `CategoricalData` function
diff --git a/README.md b/README.md
@@ -7,16 +7,16 @@ Kpax3 is a [Julia](http://julialang.org/) package for inferring the group struct
 ## Reference
 To know more about the underlying statistical model, refer to the following publications (the first is the primary citation if you use the package):
 
-Pessia, A. and Corander, J. (2018). Kpax3: Bayesian bi-clustering of large sequence datasets. Bioinformatics. _Submitted_
+Pessia, A. and Corander, J. (2018). Kpax3: Bayesian bi-clustering of large sequence datasets. *Bioinformatics*, **34**(12): 2132–2133. doi: [10.1093/bioinformatics/bty056](https://doi.org/10.1093/bioinformatics/bty056)
 
-Pessia, A., Grad, Y., Cobey, S., Puranen, J. S., and Corander, J. (2015) K-Pax2: Bayesian identification of cluster-defining amino acid positions in large sequence datasets. _Microbial Genomics_ **1**(1). doi: [10.1099/mgen.0.000025](http://doi.org/10.1099/mgen.0.000025)
+Pessia, A., Grad, Y., Cobey, S., Puranen, J. S., and Corander, J. (2015) K-Pax2: Bayesian identification of cluster-defining amino acid positions in large sequence datasets. *Microbial Genomics*, **1**(1). doi: [10.1099/mgen.0.000025](http://doi.org/10.1099/mgen.0.000025)
 
 ## Installation
-Kpax3 can be installed from within Julia by typing
+Kpax3 can be easily installed from within Julia:
 
-```julia
-Pkg.add("Kpax3")
-```
+- Enter the Pkg REPL-mode by pressing  `]` in the Julia REPL
+- Issue the command `add Kpax3`
+- Press the Backspace key to return to the Julia REPL
 
 ## Usage
 The best way to learn how to use Kpax3 is by following the instructions in the [fasta tutorial](tutorial/Kpax3_fasta_tutorial.jl) (for genetic sequences) or in the [csv tutorial](tutorial/Kpax3_csv_tutorial.jl) (for general categorical data).

diff --git a/REQUIRE b/REQUIRE
@@ -1,8 +1,10 @@
-julia 0.6 0.7-
-Clustering 0.5
-Distances 0.2.1
-FileIO 0.3
-Gadfly 0.6.4
-JLD 0.6.10
-Measures 0.0.1
-StatsBase 0.15
+julia 1.0 2.0-
+Clustering 0.10
+Distances 0.7
+FileIO 1.0
+GR 0.32
+JLD2 0.1.0
+Plots 0.19
+SpecialFunctions 0.7
+StatsBase 0.25
+StatPlots 0.8
diff --git a/src/Kpax3.jl b/src/Kpax3.jl
@@ -1,18 +1,18 @@
 # This file is part of Kpax3. License is MIT.
 
-__precompile__(true)
-
 module Kpax3
   #################
   # Load packages #
   #################
   import Clustering
+  import DelimitedFiles
   import Distances
   import FileIO
-  import JLD
-  import Gadfly
-  import Measures
+  import Plots
+  import Printf
+  import SpecialFunctions
   import StatsBase
+  import StatPlots
 
   ####################
   # Export functions #

diff --git a/src/data_processing/convert.jl b/src/data_processing/convert.jl
@@ -199,7 +199,7 @@ function categorical2binary(data::Matrix{String},
   end
 
   bindata = zeros(UInt8, m, n)
-  val = Array{String}(m)
+  val = Array{String}(undef, m)
   key = zeros(Int, m)
 
   j = 0
@@ -208,7 +208,7 @@ function categorical2binary(data::Matrix{String},
 
     for s in obs
       j += 1
-      bindata[j, data[row, :] .== s] = 0x01
+      bindata[j, data[row, :] .== s] .= 0x01
       val[j] = s
       key[j] = row
     end

diff --git a/src/data_processing/io.jl b/src/data_processing/io.jl
@@ -8,9 +8,18 @@ function save(ofile::String,
     mkpath(dirpath)
   end
 
-  JLD.save(FileIO.File(FileIO.@format_str("JLD"), ofile),
-           "data", x.data, "id", x.id, "ref", x.ref, "val", x.val, "key",
-           x.key, compress=true)
+  fpo = FileIO.File(FileIO.@format_str("JLD2"), ofile)
+  obj = Dict(
+    "data" => x.data,
+      "id" => x.id,
+     "ref" => x.ref,
+     "val" => x.val,
+     "key" => x.key
+  )
+
+  FileIO.save(fpo, obj)
+
+  nothing
 end
 
 function loadnt(ifile::String)
@@ -19,7 +28,9 @@ function loadnt(ifile::String)
   f = open(ifile, "r")
   close(f)
 
-  (d, id, ref, val, key) = JLD.load(ifile, "data", "id", "ref", "val", "key")
+  fpo = FileIO.File(FileIO.@format_str("JLD2"), ifile)
+  (d, id, ref, val, key) = FileIO.load(fpo, "data", "id", "ref", "val", "key")
+
   NucleotideData(d, id, ref, val, key)
 end
 
@@ -29,6 +40,8 @@ function loadaa(ifile::String)
   f = open(ifile, "r")
   close(f)
 
-  (d, id, ref, val, key) = JLD.load(ifile, "data", "id", "ref", "val", "key")
+  fpo = FileIO.File(FileIO.@format_str("JLD2"), ifile)
+  (d, id, ref, val, key) = FileIO.load(fpo, "data", "id", "ref", "val", "key")
+
   AminoAcidData(d, id, ref, val, key)
 end
diff --git a/src/data_processing/read.jl b/src/data_processing/read.jl
@@ -1,35 +1,30 @@
 # This file is part of Kpax3. License is MIT.
 
 """
-# Read a FASTA formatted file
-
-## Description
+  readfasta(ifile::String, protein::Bool, miss::Vector{UInt8}, l::Int,
+verbose::Bool, verbosestep::Int)
 
 Read data in FASTA format and convert it to an integer matrix. Sequences are
 required to be aligned. Only polymorphic columns are stored.
 
-## Usage
-
-readfasta(ifile, protein, miss, l, verbose, verbosestep)
-
 ## Arguments
 
-* `ifile` Path to the input data file
-* `protein` `true` if reading protein data or `false` if reading DNA data
-* `miss` Characters (as `UInt8`) to be considered missing. Use
+- `ifile` Path to the input data file
+- `protein` `true` if reading protein data or `false` if reading DNA data
+- `miss` Characters (as `UInt8`) to be considered missing. Use
 `miss = zeros(UInt8, 1)` if all characters are to be considered valid. Default
 characters for `miss` are:
 
-  - DNA data: _?, \*, #, -, b, d, h, k, m, n, r, s, v, w, x, y, j, z_
-  - Protein data: _?, \*, #, -, b, j, x, z_
+    - DNA data: _?, \\*, #, -, b, d, h, k, m, n, r, s, v, w, x, y, j, z_
+    - Protein data: _?, \\*, #, -, b, j, x, z_
 
-* `l` Sequence length. If unknown, it is better to choose a value which is
+- `l` Sequence length. If unknown, it is better to choose a value which is
 surely greater than the real sequence length. If `l` is found to be
 insufficient, the array size is dynamically increased (not recommended from a
 performance point of view). Default value should be sufficient for most
 datasets
-* `verbose` If `true`, print status reports
-* `verbosestep` Print a status report every `verbosestep` read sequences
+- `verbose` If `true`, print status reports
+- `verbosestep` Print a status report every `verbosestep` read sequences
 
 ## Details
 
@@ -105,9 +100,9 @@ converted to 't' when reading DNA data. Conversion tables are the following:
 
 A tuple containing the following variables:
 
-* `data` Multiple Sequence Alignment (MSA) encoded as a UInt8 matrix
-* `id` Units' ids
-* `ref` Reference sequence, i.e. a vector of the same length of the original
+- `data` Multiple Sequence Alignment (MSA) encoded as a UInt8 matrix
+- `id` Units' ids
+- `ref` Reference sequence, i.e. a vector of the same length of the original
 sequences storing the values of homogeneous sites. SNPs are instead represented
 by a value of 46 ('.')
 """
@@ -125,7 +120,7 @@ function readfasta(ifile::String,
   # note: this function has been written with a huge dataset in mind
   f = open(ifile, "r")
 
-  s = strip(readuntil(f, '>'))
+  s = strip(readuntil(f, '>', keep=true))
   if length(s) == 0
     close(f)
     throw(KFASTAError("No sequence was found."))
@@ -171,8 +166,8 @@ function readfasta(ifile::String,
 
         # do we have enough space to store the first sequence?
         if w > length(tmpseqref)
-          tmpseqref = copy!(zeros(UInt8, w + l), tmpseqref)
-          tmpmissseqref = copy!(falses(w + l), tmpmissseqref)
+          tmpseqref = copyto!(zeros(UInt8, w + l), tmpseqref)
+          tmpmissseqref = copyto!(falses(w + l), tmpmissseqref)
         end
 
         for c in s
@@ -206,8 +201,8 @@ function readfasta(ifile::String,
   # at least a sequence has been found
   n = 1
 
-  seqref = copy!(zeros(UInt8, seqlen), 1, tmpseqref, 1, seqlen)
-  missseqref = copy!(falses(seqlen), 1, tmpmissseqref, 1, seqlen)
+  seqref = copyto!(zeros(UInt8, seqlen), 1, tmpseqref, 1, seqlen)
+  missseqref = copyto!(falses(seqlen), 1, tmpmissseqref, 1, seqlen)
 
   if s[1] == '>'
     if length(s) > 1
@@ -334,7 +329,7 @@ function readfasta(ifile::String,
   end
 
   data = zeros(UInt8, m, n)
-  id = Array{String}(n)
+  id = Array{String}(undef, n)
   enc = zeros(UInt8, 127)
 
   h1 = 0
@@ -473,7 +468,7 @@ function readdata(ifile::String,
   pol = falses(p)
 
   # missing values
-  missobs = indexin(obsref, miss) .> 0
+  missobs = indexin(obsref, miss) .!= nothing
 
   # count the observations and check that they are all the same length
   for line in eachline(f)
@@ -522,8 +517,8 @@ function readdata(ifile::String,
             p - 1, " total columns.\nProcessing data...")
   end
 
-  data = Array{String}(m, n)
-  id = Array{String}(n)
+  data = Array{String}(undef, m, n)
+  id = Array{String}(undef, n)
 
   # go back at the beginning of the file and start again
   seekstart(f)

diff --git a/src/estimate/mcmc.jl b/src/estimate/mcmc.jl
@@ -6,7 +6,7 @@ function kpax3Restimate(fileroot::String)
 
   n = length(id)
 
-  D = 1.0 - P
+  D = 1.0 .- P
 
   R = zeros(Int, n)
   estimate = ones(Int, n)
@@ -16,10 +16,10 @@ function kpax3Restimate(fileroot::String)
   niter = 0
   for g in k
     try
-      copy!(estimate, Clustering.kmedoids(D, g).assignments)
+      copyto!(estimate, Clustering.kmedoids(D, g).assignments)
     catch
       StatsBase.sample!(1:g, estimate, replace=true)
-      estimate[StatsBase.sample(1:n, g, replace=false)] = collect(1:g)
+      estimate[StatsBase.sample(1:n, g, replace=false)] .= collect(1:g)
     end
 
     niter = 0
@@ -28,14 +28,14 @@ function kpax3Restimate(fileroot::String)
 
       if lossnew < lossold
         lossold = lossnew
-        copy!(R, estimate)
+        copyto!(R, estimate)
       end
 
       try
-        copy!(estimate, Clustering.kmedoids(D, g).assignments)
+        copyto!(estimate, Clustering.kmedoids(D, g).assignments)
       catch
         StatsBase.sample!(1:g, estimate, replace=true)
-        estimate[StatsBase.sample(1:n, g, replace=false)] = collect(1:g)
+        estimate[StatsBase.sample(1:n, g, replace=false)] .= collect(1:g)
       end
 
       niter += 1