-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path60-topics.Rmd
240 lines (172 loc) · 8.96 KB
/
60-topics.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
# (PART) 研究热点 {-}
# 四大研究主题 {#research-topic}
我们都知道,对于肠道菌群研究而言,饮食、免疫、代谢、疾病是最重要的四大研究主题。在这一部分,我们将围绕这四大研究主题逐步展开。
```{r}
# Merge one field of several keyword sets into a single regex alternation.
#
# ...   keyword sets, each a list such as list(primary = ..., secondary = ...)
# name  the field to take from every set (defaults to "primary")
#
# Returns one string: every collected pattern joined with "|".
keywords_from <- function(..., name = "primary") {
  picked <- lapply(list(...), `[[`, name)
  paste0(unlist(picked), collapse = "|")
}
# Extract the papers that appear in a historical citation network.
#
# M  bibliographic data frame with a DI column (presumably the DOI of each
#    record; verify against the caller)
# g  graph whose vertex names (V(g)$name) embed DOIs
#
# Returns the rows of M whose DI value is among the DOIs found in the
# vertex names.
extract_from_hist_graph <- function(M = NULL, g = NULL) {
  # Pull every DOI-looking token ("10.<digits>/<non-space run>") out of the
  # vertex labels and flatten the per-vertex matches into one vector.
  doi <- unlist(str_extract_all(V(g)$name, "10\\.[0-9]+\\/\\S+"))
  filter(M, DI %in% doi)
}
```
```{r}
# Keyword sets for the diet, immunity and metabolism topics.
# Each set is a list with a `primary` and a `secondary` character vector of
# regular-expression fragments (stringr/ICU syntax, where `[:alpha:]` matches
# a letter).
#
# The diet and fat stems use `*` rather than `+` after the stem: with `+` at
# least one trailing letter was required, so the bare words "diet" and "fat"
# (e.g. "high-fat diet") could never match.
diet_keywords <- list(
  primary = c("\\bdiet[:alpha:]*"),
  secondary = c(
    "food", "nutrition", "supplement", "fasting", "calorie restriction",
    "fibre|fiber", "carbohydrate", "meat", "fish", "egg", "milk", "dairy",
    "fruit", "vegetable", "additives", "sweetener"
  )
)
immunity_keywords <- list(
  primary = c("immun[:alpha:]+"),
  secondary = c(
    "inflamm[:alpha:]+", "barrier", "defense", "T cell", "B cell",
    "lymphocyte", "macrophage", "cytokine", "dendritic", "neutrophil",
    "interleukin", "antibody"
  )
)
metabolism_keywords <- list(
  primary = c("\\bmetabol[:alpha:]+"),
  secondary = c(
    "obesity", "diabet[:alpha:]+", "insulin", "lipid", "\\bfat[:alpha:]*",
    "adipose", "NAFLD", "NASH", "nonalcoholic steatohepatitis", "SCFA"
  )
)
# Disease-specific keyword sets. Each is a list with a `primary` regex
# fragment (stringr/ICU syntax) and a `secondary` vector, NULL where no
# secondary keywords are defined.

# Cancer
cancer_keywords <- list(
  primary = c("cancer|\\btumo[:alpha:]+|oncology|melanoma|carcino[:alpha:]+"),
  secondary = NULL
)
# Cardiovascular disease
cardio_keywords <- list(
  primary = c("heart|cardi[:alpha:]+|hypertension"),
  secondary = c("stroke", "cardiac", "arterial", "athero[:alpha:]+", "blood pressure")
)
# Inflammatory bowel disease
# (the full name was previously misspelled "bowl", so it could never match)
ibd_keywords <- list(
  primary = c("IBD|inflammatory bowel disease|ulcerative colitis|crohn.{1,5}disease|\\bUC\\b"),
  secondary = NULL
)
# Irritable bowel syndrome
ibs_keywords <- list(
  primary = c("\\bIBS\\b|irritable bowel syndrome"),
  secondary = NULL
)
# Alzheimer's disease
alzheimer_keywords <- list(
  primary = "Alzheimer.{1,5}disease",
  secondary = NULL
)
# Autism
autism_keywords <- list(
  primary = "autism",
  secondary = NULL
)
# Liver disease
hepatology_keywords <- list(
  primary = c("NAFLD|non.?alcoholic fatty liver disease"),
  secondary = NULL
)
# Allergy
allergy_keywords <- list(
  primary = "\\ballerg.*\\b",
  secondary = NULL
)
# Obesity
obesity_keywords <- list(
  primary = "obesity",
  secondary = NULL
)
# Diabetes
diabetes_keywords <- list(
  primary = "diabetes",
  secondary = NULL
)
# Asthma
asthma_keywords <- list(
  primary = "asthma",
  secondary = NULL
)
# Diarrhea
diarrhea_keywords <- list(
  primary = "diarrhea",
  secondary = NULL
)
# Constipation
constipation_keywords <- list(
  primary = "\\bconstip.*\\b",
  secondary = NULL
)
## All diseases: the primary patterns of every disease-specific keyword set,
## merged by keywords_from() into one "|"-joined alternation, followed by the
## generic term "disease" as a second primary pattern.
disease_keywords <- list(
  primary = c(
    keywords_from(
      cancer_keywords, cardio_keywords, ibd_keywords, ibs_keywords,
      alzheimer_keywords, autism_keywords, hepatology_keywords,
      allergy_keywords, obesity_keywords, diabetes_keywords, asthma_keywords,
      diarrhea_keywords, constipation_keywords
    ),
    "disease"
  ),
  secondary = NULL
)
# Keyword sets for further themes (gut axis, metabolites, drugs, early life,
# cohorts, "dark matter") and for translational sub-topics. Same layout as
# the other keyword sets: regex fragments in `primary`, optional `secondary`.
gut_axis_keywords <- list(
  primary = "gut.{4,15}axis",
  secondary = NULL
)
metabolism_effect_keywords <- list(
  primary = "\\bmetabol[:alpha:]+",
  secondary = c(
    "SCFA", "short-chain fatty acids", "butyrate",
    "TMAO", "TMA", "secondary bile acids"
  )
)
# NOTE(review): `[:alpha:]+pharmaco` needs at least one letter before
# "pharmaco", so a word that starts with "pharmaco" does not match this
# alternative; confirm that is intended.
medicine_keywords <- list(
  primary = "medicine|medication|drug|[:alpha:]+pharmaco[:alpha:]+|xenobiotic",
  secondary = NULL
)
early_life_keywords <- list(
  primary = "early.{1,2}life|infan[:alpha:]+|maternal",
  secondary = c(
    "pregnan[:alpha:]+", "[:alpha:]+natal", "in utero",
    "uterus", "placent[:alpha:]+", "vertical transmission"
  )
)
cohort_keywords <- list(
  primary = "cohort",
  secondary = NULL
)
dark_matter_keywords <- list(
  primary = "dark matter|virome|mycobiome|culturomics",
  secondary = NULL
)

# Translational sub-topics (primary patterns only).
translation_keywords_precision <- list(primary = "precision|personal")
translation_keywords_biomarker <- list(primary = "diagnos|prognos|biomarker")
translation_keywords_engineer <- list(primary = "engineer")
translation_keywords_phage_therapy <- list(primary = "phage.{1,30}therapy")
translation_keywords_vaccine <- list(primary = "\\bvaccin")
translation_keywords_fmt <- list(primary = "FMT|fecal.{10,30}transplant")
translation_keywords_probiotics <- list(primary = "probiotic|prebiotic|dietary fib[er]{2}")
chinese_medicine_keywords <- list(primary = "chinese herbal medicine|traditional Chinese medicine")
# The four headline topics, always in the same order: their keyword sets,
# their English identifiers and their Chinese display labels.
topic_keywords <- list(
  diet_keywords,
  immunity_keywords,
  metabolism_keywords,
  disease_keywords
)
topic_en <- c("diet", "immunity", "metabolism", "disease")
# Wider topic list, commented out and kept for reference:
# topic_en <- c("diet","immunity","metabolism","cancer","cardiac","axis","medicine","early life","cohort","dark matter",
# "precision","biomarker","engineer","phage therapy","vaccine","FMT","probiotics","Chinese medicine")
topic_cn <- c("饮食", "免疫", "代谢", "疾病")
# Matching Chinese labels for the wider list, likewise commented out:
# topic_cn <- c("饮食","免疫","代谢","癌症","心血管疾病","轴","药物互作","生命早期","人群","暗物质",
# "精准医疗","生物标记物","工程菌","噬菌体疗法","疫苗","粪菌移植","益生菌益生元","中医药")

# Chinese labels named by their English identifiers, plus the topic count.
topic <- setNames(topic_cn, topic_en)
nTopic <- length(topic)
```
## 主题对应的核心论文数据
我们首先统计下各个研究主题相关的论文数量。
```{r}
# Load the key-article table that every topic statistic below is based on.
MC_HC_article <- readRDS("data/MC_HC_article.RDS")

# For each topic, keep the articles whose `content` column matches any of the
# topic's primary or secondary keywords (case-insensitive), and turn DT into
# a factor.
topic_articles_core <- lapply(topic_keywords, function(keyword){
  # All keywords of the topic joined into one alternation. `TRUE` is spelled
  # out because `T` is an ordinary variable that can be reassigned.
  topic_pattern <- regex(
    paste0(c(keyword$primary, keyword$secondary), collapse = "|"),
    ignore_case = TRUE
  )
  MC_HC_article %>%
    filter(str_detect(content, topic_pattern)) %>%
    mutate(DT = factor(DT))
})
names(topic_articles_core) <- topic_en

# Per topic: number of matching articles for each PY value
# (presumably the publication year; verify against the data source).
topic_nRecords_core <- lapply(topic_articles_core, function(article){
  article %>%
    group_by(PY) %>%
    summarise(nRecord = n())
})
```
接下来的分析将基于 `r nrow(MC_HC_article)` 篇**关键论文**进行。显而易见,每个研究主题都具有庞大的发文量(图 \@ref(fig:topic-nRecord-in-core-articles))。
```{r eval=FALSE}
# Export record identifiers (the UT column) as plain-text lists, restricted
# to rows whose DT is "ARTICLE" (research articles only).
all <- MC_HC_article %>% filter(DT == "ARTICLE")
# All research articles, regardless of topic
writeLines(all$UT, "all_article.id")
# One file per topic, named "<topic>.id" with spaces replaced by underscores.
# seq_len() is used instead of 1:nTopic so the loop body is skipped, rather
# than run for i = 1 and i = 0, if nTopic is ever 0.
for (i in seq_len(nTopic)){
  out <- paste0(topic_en[[i]], ".id")
  out <- gsub(" ", "_", out)
  writeLines(topic_articles_core[[i]] %>% filter(DT == "ARTICLE") %>% pull("UT"), out)
}
```
```{r topic-nRecord-in-core-articles, fig.cap="各研究主题关键论文数量对比",fig.width=4}
# Number of key articles matched by each topic.
# vapply() pins the result to one integer per topic; sapply() would silently
# change its return type on empty or irregular input.
count <- vapply(topic_articles_core, nrow, integer(1))
df <- data.frame(topic = topic, count = count)
# Reverse the factor levels; presumably so that, once the axes are flipped,
# the topics read top-to-bottom in their original order.
df$topic <- factor(df$topic, levels = rev(topic))
ggplot(df, aes(topic, count)) +
  geom_col(width = 0.8) +
  # geom_text(aes(label=topic),position = position_stack(0.5)) +
  labs(x = "", y = "") +
  coord_flip()
```
与此同时,也可以看到这四大研究主题都是历史比较悠久的研究领域(图 \@ref(fig:topic-trend-in-core-articles)),自 2000 年以来不断有关键的研究论文发表。
```{r topic-trend-in-core-articles, fig.cap="各研究主题发表的关键论文数量变化情况",fig.asp=1}
# One bar chart per topic: article count (nRecord) against PY, titled with
# the topic's Chinese label. seq_len() is used instead of 1:nTopic so that an
# empty topic list produces an empty plot list instead of indexing 1 and 0.
topic_trend_core_plot_list <- lapply(seq_len(nTopic), function(i){
  df <- topic_nRecords_core[[i]]
  ggplot(df, aes(PY, nRecord)) +
    geom_col(width = 0.8) +
    labs(x = "", y = "", title = topic_cn[[i]])
})
names(topic_trend_core_plot_list) <- topic_en
# Stack the per-topic charts in a single column.
plot_grid(plotlist = topic_trend_core_plot_list, ncol = 1)
```
四大研究主题间的重叠度也相当可观(图 \@ref(fig:overlap-of-topic-in-core-article))。其中,免疫和疾病的文献重叠度最高,免疫、代谢和疾病的重叠度次之,而四大研究主题共有的研究论文就有 511 篇,占全部**关键论文**的近六分之一。
```{r overlap-of-topic-in-core-article, fig.cap="各研究主题核心论文间的重叠度"}
# Overlap between topics: take the SR column of each topic's article table
# (presumably a per-article reference id; verify) and label the resulting
# sets with the Chinese topic names.
topic_articles_core_SR <- setNames(
  lapply(topic_articles_core, function(articles) articles$SR),
  topic
)
# Alternative UpSet plot, commented out and kept for reference:
# library(UpSetR)
# upset(fromList(topic_articles_core_SR),
# sets=rev(names(topic_articles_core_SR)),
# keep.order = FALSE,
# mb.ratio = c(0.7, 0.3),
# order.by = "freq")
ggVennDiagram::ggVennDiagram(topic_articles_core_SR)
```