-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_metadata.R
106 lines (79 loc) · 4.83 KB
/
get_metadata.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
###
### This R function takes as input the output of the Kobo form "International Genetic Indicator testing"
### and extracts the metadata for taxa and indicators, in some cases creating new useful variables,
### like taxon name and if the taxon was assessed only a single time or multiple times
###
### If you use this script, please check https://github.com/AliciaMstt/GeneticIndicators
### for citation guidelines
get_metadata <- function(kobo_output = kobo_output) {
  ### Extracts the taxon and indicator metadata columns from the Kobo export,
  ### adding the derived variables `taxon` (if absent), `year_assesment` and
  ### `multiassessment`.
  ###
  ### Arguments:
  # kobo_output = a data frame object read into R from the `.csv` file
  #               resulting from exporting the Kobotoolbox data from the form
  #               "International Genetic Indicator testing" with the settings
  #               explained at https://github.com/AliciaMstt/GeneticIndicators
  ### Value:
  # the selected metadata columns, one row per input record, with "" cells
  # in character columns replaced by NA, plus:
  #   year_assesment  = first 4 characters of `end`
  #   kobo_tabular    = "kobo" wherever the original cell was empty
  #   multiassessment = "multiassessment" if the same taxon appears more than
  #                     once within a country, otherwise "single_assessment"
  ### Needed libraries:
  # library(tidyr)
  # library(dplyr)
  # library(utile.tools)
  # library(stringr)

  ###
  ### Function
  ###

  ### Check input
  # The default value of the argument refers to itself and can therefore
  # never be evaluated, so fail early with a readable message.
  if (missing(kobo_output)) {
    stop("`kobo_output` must be supplied", call. = FALSE)
  }
  if (!is.data.frame(kobo_output)) {
    stop("`kobo_output` must be a data frame", call. = FALSE)
  }

  ### Separate data

  # create a variable with the full taxon name if this variable doesn't exist
  # already (raw kobo output doesn't include it, but it may exist in a "clean"
  # version of the output if run through the quality check pipeline)
  if ("taxon" %in% colnames(kobo_output)) {
    print("the data already contained a taxon column, that was used instead of creating a new one")
  } else {
    kobo_output <- kobo_output %>%
      # utile.tools::paste() is called with na.rm = TRUE so that missing name
      # parts (e.g. no subspecies) are left out of the pasted name
      mutate(
        taxon = utile.tools::paste(
          genus, species, subspecies_variety,
          na.rm = TRUE
        )
      ) %>%
      # remove white space at the end of the name
      mutate(taxon = str_trim(taxon, "right"))
  }

  # Process data already including taxon column
  metadata <- kobo_output %>%
    # create variable with year in which assessment was done
    # (based on date the form was completed)
    mutate(year_assesment = substr(end, 1, 4)) %>%
    ## select relevant columns
    dplyr::select(
      # taxon and assessment info
      country_assessment, taxonomic_group, taxon, scientific_authority,
      genus, year_assesment, name_assessor, email_assessor, common_name,
      kobo_tabular,
      # kobo validation status
      X_validation_status,
      # uuid, this is a unique id generated by kobo for each unique record.
      # It differentiates records even if the same species is evaluated twice
      X_uuid,
      # taxon in other databases details
      GBIF_taxonID, NCBI_taxonID, national_taxonID, source_national_taxonID,
      # indicator 2 metadata (section 3 of the form)
      other_populations, time_populations,
      defined_populations, source_definition_populations, map_populations,
      map_populations_URL,
      habitat_decline_area, source_populations,
      # indicator 1 metadata (section 4 of the form)
      popsize_data, ne_pops_exists, nc_pops_exists, ratio_exists,
      species_related, ratio_species_related,
      ratio_year, source_popsize_ratios, species_comments,
      # risk assessments and natural history metadata
      realm, IUCN_habitat, other_habitat, national_endemic,
      transboundary_type, other_explain,
      country_proportion, species_range, rarity, occurrence_extent,
      occurrence_area,
      pop_fragmentation_level, species_range_comments, global_IUCN,
      regional_redlist,
      other_assessment_status, other_assessment_name,
      source_status_distribution,
      fecundity, semelparous_offpring, reproductive_strategy,
      reproductive_strategy_other,
      adult_age_data, other_reproductive_strategy, longevity_max,
      longevity_median, longevity_maturity,
      longevity_age, life_history_based_on, life_history_sp_basedon,
      sources_life_history
    ) %>%
    # change all "" (empty) cells to NA.
    # Restricted to character columns: na_if() in dplyr >= 1.1.0 casts ""
    # to the column type and aborts on numeric or logical columns (e.g. a
    # column that was entirely empty in the csv and was read as logical NA).
    # NOTE(review): factor columns are left untouched; read the csv with
    # stringsAsFactors = FALSE (the default since R 4.0).
    mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
    # change "" in kobo_tabular to "kobo" ("" means that question was not
    # answered because the taxon had fewer populations than the minimum
    # needed to trigger tabular)
    mutate(kobo_tabular = ifelse(is.na(kobo_tabular), "kobo", kobo_tabular))

  ## Add a variable to the metadata stating if the taxon was assessed
  ## multiple times or only a single time

  # duplicated() is run twice, the second time with fromLast = TRUE, so that
  # the first occurrence is also accounted for, i.e. every record sharing its
  # taxon with another record of the same country is flagged
  taxon_by_country <- metadata[c("taxon", "country_assessment")]
  assessed_repeatedly <- duplicated(taxon_by_country) |
    duplicated(taxon_by_country, fromLast = TRUE)

  # tag each record directly from the row-wise flag; this does not depend on
  # X_uuid being present and unique
  metadata <- metadata %>%
    mutate(
      multiassessment = if_else(
        assessed_repeatedly, "multiassessment", "single_assessment"
      )
    )

  # return the metadata visibly (an assignment as last expression would
  # only return it invisibly)
  metadata
  # End of function
}