### get_metadata.R

###
### This R function takes as input the output of the Kobo form "International Genetic Indicator testing" 
### and extracts the metadata for taxa and indicators, in some cases creating new useful variables, 
### like taxon name and if the taxon was assessed only a single time or multiple times
### 

### If you use this script, please check https://github.com/AliciaMstt/GeneticIndicators 
### for citation guidelines


get_metadata <- function(kobo_output = kobo_output) {

  ### Purpose:

  # Extract the taxon- and indicator-level metadata columns from the Kobo
  # form output, add the derived variables `taxon` (if absent),
  # `year_assesment` and `multiassessment`, and turn empty strings into NA.

  ### Arguments:

  # kobo_output = a data frame object read into R from the `.csv` file
  # resulting from exporting the Kobotoolbox data from the form
  # "International Genetic Indicator testing" with the settings explained at
  # https://github.com/AliciaMstt/GeneticIndicators
  # (the default is kept only for backward compatibility of the signature;
  # the argument must always be supplied by the caller)

  ### Value:

  # A data frame with one row per record of `kobo_output`, holding the
  # metadata columns listed in `metadata_columns` below (in that order)
  # followed by `multiassessment`, which is "multiassessment" when the same
  # taxon appears more than once within a country and "single_assessment"
  # otherwise.

  ### Needed libraries (must be attached, the pipe `%>%` is used unqualified):

  #  library(tidyr)
  #  library(dplyr)
  #  library(utile.tools)
  #  library(stringr)

  ###
  ### Function
  ###

  ### Check input

  # The default `kobo_output = kobo_output` refers to itself, so without this
  # guard an omitted argument fails with an obscure "promise already under
  # evaluation" error.
  if (missing(kobo_output)) {
    stop("`kobo_output` must be supplied (data frame with the Kobo output)",
         call. = FALSE)
  }
  if (!is.data.frame(kobo_output)) {
    stop("`kobo_output` must be a data frame", call. = FALSE)
  }

  # Fail early, naming every absent column at once. This also prevents a bare
  # name such as `end` from silently resolving to a base R function when the
  # column does not exist.
  stop_if_missing <- function(needed) {
    absent <- setdiff(needed, colnames(kobo_output))
    if (length(absent) > 0) {
      stop("kobo_output is missing required column(s): ",
           base::paste(absent, collapse = ", "), call. = FALSE)
    }
  }

  ### Separate data

  # create a variable with the full taxon name if this variable doesn't exist
  # already (raw kobo output doesn't include it, but it may exist in a "clean"
  # version of the output if run through the quality check pipeline)

  if ("taxon" %in% colnames(kobo_output)) {
    message("the data already contained a taxon column, that was used instead of creating a new one")
  } else {
    stop_if_missing(c("genus", "species", "subspecies_variety"))
    kobo_output <- kobo_output %>%
      dplyr::mutate(
        taxon = utile.tools::paste(genus, species, subspecies_variety,
                                   na.rm = TRUE)
      ) %>%
      # remove white space at the end of the name
      dplyr::mutate(taxon = stringr::str_trim(taxon, "right"))
  }

  # Process data already including taxon column

  # Columns kept in the output, in output order.
  metadata_columns <- c(
    # taxon and assessment info
    "country_assessment", "taxonomic_group", "taxon", "scientific_authority",
    "genus", "year_assesment", "name_assessor", "email_assessor",
    "common_name", "kobo_tabular",
    # kobo validation status
    "X_validation_status",
    # uuid, this is a unique id generated by kobo for each unique record. It
    # differentiates records even if the same species is evaluated twice
    "X_uuid",
    # taxon in other databases details
    "GBIF_taxonID", "NCBI_taxonID", "national_taxonID",
    "source_national_taxonID",
    # indicator 2 metadata (section 3 of the form)
    "other_populations", "time_populations",
    "defined_populations", "source_definition_populations", "map_populations",
    "map_populations_URL", "habitat_decline_area", "source_populations",
    # indicator 1 metadata (section 4 of the form)
    "popsize_data", "ne_pops_exists", "nc_pops_exists", "ratio_exists",
    "species_related", "ratio_species_related",
    "ratio_year", "source_popsize_ratios", "species_comments",
    # risk assessments and natural history metadata
    "realm", "IUCN_habitat", "other_habitat", "national_endemic",
    "transboundary_type", "other_explain",
    "country_proportion", "species_range", "rarity", "occurrence_extent",
    "occurrence_area",
    "pop_fragmentation_level", "species_range_comments", "global_IUCN",
    "regional_redlist",
    "other_assessment_status", "other_assessment_name",
    "source_status_distribution",
    "fecundity", "semelparous_offpring", "reproductive_strategy",
    "reproductive_strategy_other",
    "adult_age_data", "other_reproductive_strategy", "longevity_max",
    "longevity_median", "longevity_maturity",
    "longevity_age", "life_history_based_on", "life_history_sp_basedon",
    "sources_life_history"
  )

  # `year_assesment` is derived below from `end`, so it is not required as
  # input, but `end` is.
  stop_if_missing(c("end", setdiff(metadata_columns, "year_assesment")))

  metadata <- kobo_output %>%

    # create variable with year in which assessment was done (based on the
    # date the form was completed: first four characters of `end`)
    dplyr::mutate(year_assesment = substr(end, 1, 4)) %>%

    # select relevant columns
    dplyr::select(dplyr::all_of(metadata_columns)) %>%

    # change all "" (empty) cells to NA. Only text-like columns can hold "",
    # and restricting to them avoids the type-cast error that
    # `na_if(<numeric>, "")` raises in dplyr >= 1.1.0.
    dplyr::mutate(dplyr::across(
      where(~ is.character(.x) || is.factor(.x)),
      ~ replace(.x, which(.x == ""), NA)
    )) %>%

    # change "" (now NA) in kobo_tabular to "kobo" ("" means that question was
    # not answered because the taxon had fewer populations than the minimum to
    # trigger tabular). as.character() keeps labels (not integer codes) when
    # the column is a factor.
    dplyr::mutate(
      kobo_tabular = dplyr::if_else(is.na(kobo_tabular), "kobo",
                                    as.character(kobo_tabular))
    )

  ## Add a variable to the metadata stating if the taxon was assessed multiple
  ## times or only a single time

  # duplicated() is run twice, the second time with fromLast = TRUE so that
  # the first occurrence is also accounted for, i.e. every record sharing its
  # taxon with another record of the same country is flagged. The flag is
  # computed per row, so it does not depend on X_uuid being unique and non-NA.
  assessment_key <- metadata[c("taxon", "country_assessment")]
  assessed_repeatedly <- duplicated(assessment_key) |
    duplicated(assessment_key, fromLast = TRUE)

  # if it is a duplicate then tag it as multiassessment, if it is not
  # duplicated within the country then single_assessment
  metadata <- metadata %>%
    dplyr::mutate(
      multiassessment = dplyr::if_else(assessed_repeatedly,
                                       "multiassessment", "single_assessment")
    )

  # Return the metadata visibly (ending on an assignment would return it
  # invisibly).
  metadata

  # End of function
}