-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_pathogen_data.R
128 lines (97 loc) · 4.81 KB
/
build_pathogen_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# =============== Reads host-pathogen data from CLOVER and VIRION and turns into a zoonotic host edgelist ================
library(RCurl); library(vroom); library(dplyr); library(magrittr)
# CLOVER files are stored at: https://github.com/viralemergence/clover/blob/main/clover/clover_1.0_allpathogens/
# VIRION files are stored at: https://github.com/viralemergence/virion/tree/main/Virion
# --------- 1. CLOVER --------------
# Load the first three CSV files (in list.files() order) from the local CLOVER
# checkout and stack them row-wise into a single data frame.
# Per the original note these are the bacteria, fungi and helminth tables.
# NOTE(review): the [ 1:3 ] selection depends on directory listing order --
# confirm it still picks the intended files if the folder contents change.
clov_loc = "C:/Users/roryj/Documents/PhD/202011_clover/repos/clover/clover/clover_1.0_allpathogens/"
clover = list.files(clov_loc, pattern="csv", full.names = TRUE)[ 1:3 ] %>%
  lapply(read.csv) %>%
  do.call(what = rbind.data.frame)
# Viruses are taken from VIRION instead, so drop them here. Keep only records
# whose host and pathogen are both flagged as taxonomically resolved and whose
# host and pathogen names are present, then cut down to the shared edgelist
# columns and remove duplicate rows.
clover = clover %>%
  dplyr::filter(
    PathogenType != "virus",
    HostNCBIResolved == TRUE,
    PathogenNCBIResolved == TRUE,
    !is.na(Host),
    !is.na(Pathogen)
  ) %>%
  dplyr::select(
    Host,
    HostClass,
    HostOrder,
    HostFamily,
    HostGenus,
    Pathogen,
    PathogenType,
    PathogenClass,
    PathogenOrder,
    PathogenFamily,
    PathogenGenus,
    DetectionMethod
  ) %>%
  unique()
# -------- 2. VIRION -------------
# Read the VIRION table and bring it into the same shape as the CLOVER data:
#   * rename the Virus* columns to the generic Pathogen* names,
#   * tag every row as PathogenType "virus",
#   * keep viruses that are ICTV-ratified or whose name contains "predict_",
#   * keep rows with a taxonomically resolved, non-missing host and a
#     non-missing pathogen,
#   * reduce to the shared edgelist columns and drop duplicate rows.
virion = vroom::vroom("C:/Users/roryj/Documents/PhD/202104_virion/virion/Virion/Virion.csv.gz") %>%
  dplyr::rename(
    Pathogen = Virus,
    PathogenClass = VirusClass,
    PathogenOrder = VirusOrder,
    PathogenFamily = VirusFamily,
    PathogenGenus = VirusGenus,
    PathogenNCBIResolved = VirusNCBIResolved
  ) %>%
  dplyr::mutate(
    PathogenType = "virus",
    PredictFlag = stringr::str_detect(Pathogen, "predict\\_")
  ) %>%
  dplyr::filter(
    ICTVRatified | PredictFlag,
    HostNCBIResolved == TRUE,
    !is.na(Host),
    !is.na(Pathogen)
  ) %>%
  dplyr::select(
    Host,
    HostClass,
    HostOrder,
    HostFamily,
    HostGenus,
    Pathogen,
    PathogenType,
    PathogenClass,
    PathogenOrder,
    PathogenFamily,
    PathogenGenus,
    DetectionMethod
  ) %>%
  unique()
# ------- Combine into edgelist (host * pathogen * detection method) ----------
# Stack the CLOVER (non-viral) and VIRION (viral) records into one table.
hp = rbind(clover, virion)
# "Human pathogens" are those recorded in homo sapiens by direct detection
# (isolation/observation or PCR/sequencing).
zoono1 = dplyr::filter(
  hp,
  Host == "homo sapiens",
  DetectionMethod %in% c("Isolation/Observation", "PCR/Sequencing")
)
# Flag every record whose pathogen appears in that human-pathogen set, then
# drop the human rows so only non-human hosts remain.
hp = hp %>%
  dplyr::mutate(InfectsHumans = Pathogen %in% zoono1$Pathogen) %>%
  dplyr::filter(Host != "homo sapiens")
# ------- Subset to zoonotic hosts list -----------
# Broad detection criterion (serology, PCR or isolation): every record with a
# specified detection method counts. Detection method is dropped and duplicate
# host-pathogen rows are removed before collapsing to one row per host.
zh1 = hp %>%
  dplyr::filter(DetectionMethod != "Not specified") %>%
  dplyr::select(-DetectionMethod) %>%
  unique() %>%
  dplyr::group_by(Host) %>%
  dplyr::summarise(
    # host taxonomy: take the value from the host's first row
    HostClass = dplyr::first(HostClass),
    HostOrder = dplyr::first(HostOrder),
    HostFamily = dplyr::first(HostFamily),
    HostGenus = dplyr::first(HostGenus),
    # a host is zoonotic if any of its pathogens is a human pathogen
    ZoonoticHost = any(InfectsHumans),
    PathogenRichness = dplyr::n_distinct(Pathogen),
    ZoonoticPathogenRichness = dplyr::n_distinct(Pathogen[ InfectsHumans ]),
    # comma-separated pathogen names (all, and human-infecting only)
    Pathogens = paste(Pathogen, collapse=", "),
    ZoonoticPathogens = paste(Pathogen[ InfectsHumans ], collapse=", ")
  ) %>%
  dplyr::mutate(HostCriteria = "Broad (serology, sequencing or isolation)")
# Narrower detection criterion (sequencing or isolation only): as the broad
# table, but records detected only via antibodies are excluded as well.
#
# The exclusion must use %in%, not `!=` against a two-element vector: `!=`
# recycles c("Not specified", "Antibodies") down the column, so odd rows would
# be compared only to "Not specified" and even rows only to "Antibodies", and
# excluded methods would leak through depending on row position.
# Rows with a missing DetectionMethod are dropped explicitly, since `%in%`
# never returns NA; this matches how the broad-criterion filter treats them.
zh2 = hp %>%
  dplyr::filter(
    !is.na(DetectionMethod),
    !DetectionMethod %in% c("Not specified", "Antibodies")
  ) %>%
  dplyr::select(-DetectionMethod) %>%
  unique() %>%
  dplyr::group_by(Host) %>%
  dplyr::summarise(
    # host taxonomy: take the value from the host's first row
    HostClass = dplyr::first(HostClass),
    HostOrder = dplyr::first(HostOrder),
    HostFamily = dplyr::first(HostFamily),
    HostGenus = dplyr::first(HostGenus),
    # a host is zoonotic if any of its pathogens is a human pathogen
    ZoonoticHost = any(InfectsHumans),
    PathogenRichness = dplyr::n_distinct(Pathogen),
    ZoonoticPathogenRichness = dplyr::n_distinct(Pathogen[ InfectsHumans ]),
    # comma-separated pathogen names (all, and human-infecting only)
    Pathogens = paste(Pathogen, collapse=", "),
    ZoonoticPathogens = paste(Pathogen[ InfectsHumans ], collapse=", ")
  ) %>%
  dplyr::mutate(HostCriteria = "Stricter (sequencing or isolation)")
# Stack the broad- and stricter-criterion host tables; each contributes one row
# per host, distinguished by the HostCriteria column.
hosts = rbind(zh1, zh2)
# Write the combined table to disk without row names.
write.csv(
  x = hosts,
  file = "./output/zoonotichosts_edgelist_VIRIONplusCLOVER.csv",
  row.names = FALSE
)
# Output column metadata
# ZoonoticHost = whether the species is identified as a zoonotic host under the given criterion
# PathogenRichness = overall pathogen richness (number of distinct pathogens)
# ZoonoticPathogenRichness = zoonotic pathogen richness (number of distinct human-infecting pathogens)
# Pathogens and ZoonoticPathogens = comma-separated names of the pathogens
# HostCriteria = detection criterion used, either broad or stricter; filter to one of these before further processing