### get_indicator3_data.R

###
### This R function takes as input the output of the Kobo form "International Genetic Indicator testing" 
### and reformats it into a data frame useful for estimating 
### Genetic Diversity Indicator 3 (the number of taxa with genetic monitoring).
### The function also creates new useful variables, 
### like the taxon name and whether the taxon was assessed only a single time or multiple times
### 

### If you use this script, please check https://github.com/AliciaMstt/GeneticIndicators 
### for citation guidelines


get_indicator3_data <- function(kobo_output = kobo_output) {

  ### Arguments:

  # kobo_output = a data frame object read into R from the `.csv` file
  # resulting from exporting the Kobotoolbox data from the form
  # "International Genetic Indicator testing" with the settings explained at
  # https://github.com/AliciaMstt/GeneticIndicators

  ### Value:

  # a data frame with one row per record of kobo_output, keeping only the
  # taxon/assessment columns, the indicator 3 columns (section 7 of the form),
  # the kobo validation status and uuid, plus two new variables:
  #   year_assesment  = first 4 characters of the `end` column
  #   multiassessment = "multiassessment" if the same taxon appears more than
  #                     once within a country, "single_assessment" otherwise
  # Empty strings ("") in character columns are converted to NA.

  ### Needed libraries:

  #  library(tidyr)
  #  library(dplyr)
  #  library(utile.tools)
  #  library(stringr)

  ###
  ### Function
  ###

  ### Check input

  # the default argument refers to itself, so omitting it would otherwise fail
  # with a cryptic "promise already under evaluation" error
  if (missing(kobo_output)) {
    stop("`kobo_output` must be supplied", call. = FALSE)
  }
  if (!is.data.frame(kobo_output)) {
    stop("`kobo_output` must be a data frame", call. = FALSE)
  }

  # stop early, with a readable message, if expected columns are absent
  check_columns <- function(needed) {
    absent <- setdiff(needed, colnames(kobo_output))
    if (length(absent) > 0) {
      stop(
        "`kobo_output` is missing required column(s): ",
        paste(absent, collapse = ", "),
        call. = FALSE
      )
    }
  }

  ### Separate data

  # create a variable with the full taxon name if this variable doesn't exist
  # already (raw kobo output doesn't include it, but it may exist in a "clean"
  # version of the output if ran through the quality check pipeline)

  if ("taxon" %in% colnames(kobo_output)) {
    message("the data already contained a taxon column, that was used instead of creating a new one")
  } else {
    check_columns(c("genus", "species", "subspecies_variety"))
    kobo_output <- kobo_output %>%
      dplyr::mutate(
        taxon = utile.tools::paste(
          genus, species, subspecies_variety,
          na.rm = TRUE
        ),
        # remove white space at the end of the name
        taxon = stringr::str_trim(taxon, "right")
      )
  }

  check_columns(c(
    "country_assessment", "taxonomic_group", "taxon", "scientific_authority",
    "genus", "end", "name_assessor", "email_assessor",
    "gen_studies", "temp_gen_monitoring", "gen_monitoring_years",
    "source_genetic_studies", "X_validation_status", "X_uuid"
  ))

  ## Add a variable to the metadata stating if the taxon was assessed
  ## multiple times or only a single time

  # duplicated() is run twice, the second time with fromLast = TRUE so that
  # the first occurrence is also accounted for, i.e. every record sharing its
  # taxon + country with another record is flagged.
  # The row-level flag is used directly (instead of matching on X_uuid) so the
  # tagging stays correct even if a uuid is missing or repeated.
  taxon_by_country <- kobo_output[c("taxon", "country_assessment")]
  is_repeated <- duplicated(taxon_by_country) |
    duplicated(taxon_by_country, fromLast = TRUE)

  kobo_output$multiassessment <- dplyr::if_else(
    is_repeated, "multiassessment", "single_assessment"
  )

  ##### Process data already including taxon column

  kobo_output %>%

    # create variable with year in which assessment was done
    # (based on date the form was completed)
    dplyr::mutate(year_assesment = substr(end, 1, 4)) %>%

    ## select relevant columns
    dplyr::select(
      # taxon and assessment info
      country_assessment, taxonomic_group, taxon, scientific_authority,
      genus, year_assesment, name_assessor, email_assessor,

      # indicator 3 data (section 7 of the form)
      gen_studies, temp_gen_monitoring, gen_monitoring_years,
      source_genetic_studies,

      # kobo validation status
      X_validation_status,
      # uuid, this is a unique id generated by kobo for each unique record.
      # It will be used to differentiate different records even if the same
      # species is evaluated twice
      X_uuid,
      # multiassessment
      multiassessment
    ) %>%

    # change all "" (empty) cells to NA. Only character columns can hold "",
    # and na_if() in dplyr >= 1.1.0 errors when asked to compare a numeric or
    # logical column against a string, so other column types are left as is.
    dplyr::mutate(
      dplyr::across(where(is.character), ~ dplyr::na_if(.x, ""))
    )

  # End of function
}