Merge pull request #78 from colebrookson/main

docker image working
viralemergence · Oct 1, 2023 · 8b7fc29
2 parents e15b03f + 7afa539
commit 8b7fc29
Show file tree

Hide file tree

Showing 10 changed files with 27 additions and 66 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -8,67 +8,38 @@ on:
   pull_request:
     branches:
       - main
-  # run once a week
+  # run once a day at 00:00 UTC (use "0 0 * * 0" for a weekly run)
   schedule:
     - cron: "0 0 * * *"
-  workflow_dispatch:
+  workflow_dispatch: 
 jobs:
   GenBank:
     runs-on: ubuntu-latest
+    container: colebrookson/virion:latest   
     steps:
     - uses: actions/checkout@v3
-    - uses: gautamkrishnar/keepalive-workflow@v1
-    - name: Setup Julia
-      uses: julia-actions/setup-julia@v1
+    - uses: julia-actions/setup-julia@v1  
       with:
         version: 1.7
-    - name: Julia dependencies (DF)
-      run: julia -e 'using Pkg; Pkg.activate("."); Pkg.add("CSV"); Pkg.add("DataFrames")'
-    - name: NCBITaxonomy (version from MAIN branch!)
-      run: julia -e 'using Pkg; Pkg.activate("."); Pkg.add(PackageSpec(name="NCBITaxonomy", rev="main"))'
-    - name: Setup R
-      uses: r-lib/actions/setup-r@v2
-    - name: Libraries for tidyverse
-      run: sudo apt-get install -y libharfbuzz-dev libfribidi-dev
-    - name: dependencies!!!!
+    - name: Download GenBank 
       run: |
-        sudo apt-get install libcurl4-openssl-dev libarchive-dev
-        sudo Rscript -e 'install.packages(c("taxize", "tidyverse", "RCurl", "readr", "vroom", "magrittr", "fs", "data.table", "zip", "rglobi", "lubridate", "R.utils"), repos = "http://cran.us.r-project.org")'
-    - name: Download GenBank
-      run: |
-        Rscript -e 'source("Code/02_1a_Download GenBank.R")'
+        Rscript -e 'source("Code/02_1a_Download GenBank.R")' 
     - name: Digest GenBank
       run: |
         Rscript -e 'source("Code/02_1b_Digest GenBank.R")'
     - name: Format GenBank
       run: |
-        Rscript -e 'source("Code/02_1c_Format GenBank.R")'
+        Rscript -e 'source("Code/02_1c_Format GenBank.R")' 
     - name: Save artifacts
       uses: actions/upload-artifact@v2
       with:
         name: GenBankFormatted
         path: Intermediate/Formatted/GenbankFormatted.csv.gz
   Globi:
     runs-on: ubuntu-latest
+    container: colebrookson/virion:latest
     steps:
     - uses: actions/checkout@v3
-    - name: Setup R
-      uses: r-lib/actions/setup-r@v2
-    - name: Libraries for tidyverse
-      run: sudo apt-get install -y libharfbuzz-dev libfribidi-dev
-    - name: dependencies!!!!
-      run: |
-        sudo apt-get install libcurl4-openssl-dev libarchive-dev
-        sudo Rscript -e 'install.packages(c("taxize", "tidyverse", "RCurl", "readr", "vroom", "magrittr", "fs", "data.table", "zip", "devtools", "lubridate"), repos = "http://cran.us.r-project.org")'
-        sudo Rscript -e 'devtools::install_github("ropensci/rglobi")'
-    - name: Setup Julia
-      uses: julia-actions/setup-julia@v1
-      with:
-        version: 1.7
-    - name: Julia dependencies (DF)
-      run: julia -e 'using Pkg; Pkg.activate("."); Pkg.add("CSV"); Pkg.add("DataFrames")'
-    - name: NCBITaxonomy (version from MAIN branch!)
-      run: julia -e 'using Pkg; Pkg.activate("."); Pkg.add(PackageSpec(name="NCBITaxonomy", rev="main"))'
     - name: Download Globi
       run: |
         Rscript -e 'source("Code/02_3a_Download GLOBI.R")'
@@ -85,15 +56,10 @@ jobs:
         path: Intermediate/Formatted/GLOBIFormatted.csv
   Finish:
     runs-on: ubuntu-latest
+    container: colebrookson/virion:latest
     needs: [GenBank, Globi]
     steps:
     - uses: actions/checkout@v3
-    - name: Setup R
-      uses: r-lib/actions/setup-r@v2
-    - name: dependencies!!!
-      run: |
-        sudo apt-get install libcurl4-openssl-dev libarchive-dev libharfbuzz-dev libfribidi-dev
-        sudo Rscript -e 'install.packages(c("taxize", "tidyverse", "RCurl", "readr", "vroom", "magrittr", "fs", "data.table", "R.utils", "zip", "rglobi", "lubridate", "tidyft"), repos = "http://cran.us.r-project.org")'
     - name: Get GenBank
       uses: actions/download-artifact@v2
       with:

diff --git a/Code/001_Julia functions.R b/Code/001_Julia functions.R
@@ -7,8 +7,8 @@
 install.ncbi <- function() {
 
   # install the packages if needed 
-  JuliaCall::julia_install_package_if_needed("NCBITaxonomy")
+  JuliaCall::julia_install_package_if_needed("NCBITaxonomy") 
   JuliaCall::julia_install_package_if_needed("DataFrames")
   JuliaCall::julia_install_package_if_needed("CSV")
   JuliaCall::julia_install_package_if_needed("ProgressMeter")
-}
+}
diff --git a/Code/02_1a_Download GenBank.R b/Code/02_1a_Download GenBank.R
@@ -11,24 +11,17 @@ library(magrittr)
 
 url = paste0("https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNuclMetadata/",
              "AllNuclMetadata.csv.gz")
-d = tryCatch(utils::download.file(url, destfile = here::here(
-  "./Source/AllNuclMetadata.csv.gz")),
-  error = function(e){-999})
 
-if(d == -999) {
-  while (d == -999){
-    Sys.sleep(600)
-    d = tryCatch(utils::download.file(
-      url, destfile = here::here("./Source/AllNuclMetadata.csv.gz")), 
-      error = function(e){-999})
-  }
-}
+location <- here::here("./Source/") 
+system(paste0("wget ", url, " -P ", location))
 
 # reading this in - use data.table
 seq <- data.table::fread(here::here("./Source/AllNuclMetadata.csv.gz"),
                          select = c("#Accession", "Release_Date", "Species", 
                                     "Host", "Collection_Date"))
-seq %<>% dplyr::rename(Accession = "#Accession")
+print("readin")
+seq %<>% dplyr::rename(Accession = "#Accession")  
 
 # write out ==================================================================== 
 vroom::vroom_write(seq, here::here("./Source/sequences.csv"))
+print("written")
diff --git a/Code/02_1b_Digest GenBank.R b/Code/02_1b_Digest GenBank.R
@@ -11,13 +11,15 @@ library(magrittr)
 rentrez::set_entrez_key("ec345b39079e565bdfa744c3ef0d4b03ba08")
 
 # get the functions to do all the dictionary stuff
-if(!exists("vdict")) {source(here::here("./Code/001_TaxizeFunctions.R"))}
-if(!exists("jvdict")) {source(here::here("./Code/001_Julia functions.R"))}
+source(here::here("./Code/001_TaxizeFunctions.R"))
+source(here::here("./Code/001_Julia functions.R"))
 
 if(!file.exists(here::here("./Source/sequences.csv"))){
   zip::unzip(here::here("./Source/GenBank.zip"), exdir = "Source")
 }
 
+install.ncbi()
+
 gb <- data.table::fread(here::here("./Source/sequences.csv")) %>% 
   dplyr::as_tibble()
 

diff --git a/Code/02_2a_Digest PREDICT.R b/Code/02_2a_Digest PREDICT.R
@@ -30,7 +30,7 @@ predict.raw %>%
                              TRUE,
                              FALSE)) %>%
   mutate(Host = str_replace(Host, " \\*",""),
-         Host = str_replace(Host, "cf. ","")) %>% 
+         Host = str_replace(Host, "cf. ","")) %>%  
 
   # Back up the virus names before doing anything else
 

diff --git a/Code/02_3b_Digest GLOBI.R b/Code/02_3b_Digest GLOBI.R
@@ -1,12 +1,12 @@
-
-if(!exists('vdict')) {source('Code/001_TaxizeFunctions.R')}
-if(!exists('jvdict')) {source('Code/001_Julia functions.R')}
+source('Code/001_TaxizeFunctions.R')
+source('Code/001_Julia functions.R')
 rentrez::set_entrez_key("ec345b39079e565bdfa744c3ef0d4b03ba08")
 
 library(tidyverse)
 library(taxize)
 library(magrittr)
 library(vroom)
+install.ncbi()
 
 globi <- read_csv('Source/GLOBI-raw.csv')
 

diff --git a/Code/Code_Dev/host.jl b/Code/Code_Dev/host.jl
@@ -1,5 +1,5 @@
-using NCBITaxonomy
 using DataFrames
+using NCBITaxonomy
 import CSV
 
 include(joinpath(pwd(), "Code/Code_Dev/taxonomizer.jl"))

diff --git a/Code/Code_Dev/pathogen.jl b/Code/Code_Dev/pathogen.jl
@@ -1,5 +1,5 @@
-using NCBITaxonomy
 using DataFrames
+using NCBITaxonomy
 import CSV
 using ProgressMeter
 

diff --git a/Code/Code_Dev/virus.jl b/Code/Code_Dev/virus.jl
@@ -1,5 +1,5 @@
-using NCBITaxonomy
 using DataFrames
+using NCBITaxonomy
 import CSV
 
 include(joinpath(pwd(), "Code/Code_Dev/taxonomizer.jl"))

diff --git a/here b/here