forked from fivethirtyeight/data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsubredditVectorAnalysis.r
151 lines (130 loc) · 6.16 KB
/
subredditVectorAnalysis.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#######################################
#
# Program to analyze distance between
# Reddit subreddits using the cooccurrence
# of commentors across subreddits.
# Also implements "subreddit algebra"
# by adding and subtracting subreddit
# vectors.
# By @martintrevor_ for FiveThirtyEight
#
#######################################
library(reshape2)
library(lsa)
library(ggtern)
##### Part 1: Load in the data
# This CSV file was created by running the SQL code in processData.sql in Google's BigQuery
rawsubredditvecs = read.table("all_starting_2015_01_overlaps_top2200_no200_10com_allrank_mod_122716.csv",header=TRUE,sep=",")
##### Part 2: Format and clean data for analysis
castsubredditvecs = dcast(rawsubredditvecs,t1_subreddit~t2_subreddit,FUN="identity",fill=0)
subredditvecst = as.matrix(castsubredditvecs[,-1])
rownames(subredditvecst) = castsubredditvecs[,1]
subredditvecs = t(subredditvecst)
subredditvecssums = apply(subredditvecs,1,sum)
subredditvecsnorm = sweep(subredditvecs,1,subredditvecssums,"/")
subredditvecssumscontext = apply(subredditvecs,2,sum)
contextprobs = subredditvecssumscontext/sum(subredditvecssumscontext)
subredditvecspmi = log(sweep(subredditvecsnorm,2,contextprobs,"/")) # PMI version
subredditvecsppmi = subredditvecspmi
subredditvecsppmi[subredditvecspmi<0] = 0 # PPMI version
scalar1 <- function(x) {x / sqrt(sum(x^2))} # Function to normalize vectors to unit length
subredditvecsppminorm = t(apply(subredditvecsppmi,1,scalar1))
##### Part 3: Analysis of subreddit similarities
## Looking at which subreddits are closest to each other (and combinations of subreddits)
cursubmat = subredditvecsppminorm
cursubmatt = t(cursubmat)
currownameslc = tolower(rownames(cursubmat))
# Function to calculate subreddit similarities and perform algebra
# Note that curops always has a leading "+"
findrelsubreddit <- function(cursubs,curops,numret=20) {
cursubs = tolower(cursubs)
curvec = 0
for(i in 1:length(cursubs)) {
curvec = ifelse(curops[i]=="+",list(curvec + cursubmat[which(currownameslc==cursubs[i]),]),list(curvec - cursubmat[which(currownameslc==cursubs[i]),]))[[1]]
}
curclosesubs = cosine(x=curvec,y=cursubmatt)
curclosesubso = order(curclosesubs,decreasing=TRUE)
curclosesubsorder = curclosesubs[curclosesubso]
curclosesubsorderc = curclosesubsorder[-which(tolower(names(curclosesubsorder))%in%cursubs)]
return(head(curclosesubsorderc,numret))
}
## Political examples
# /r/The_Donald
cursubs = c("the_donald")
curops = c("+")
findrelsubreddit(cursubs,curops,5)
# /r/The_Donald - /r/politics
cursubs = c("the_donald","politics")
curops = c("+","-")
findrelsubreddit(cursubs,curops,5)
# /r/hillaryclinton - /r/politics
cursubs = c("hillaryclinton","politics")
curops = c("+","-")
findrelsubreddit(cursubs,curops,5)
# /r/The_Donald - /r/SandersforPresident
cursubs = c("the_donald","sandersforpresident")
curops = c("+","-")
findrelsubreddit(cursubs,curops,5)
# /r/SandersforPresident - /r/The_Donald
cursubs = c("sandersforpresident","the_donald")
curops = c("+","-")
findrelsubreddit(cursubs,curops,5)
# /r/fatpeoplehate + /r/CoonTown + /r/politics
cursubs = c("fatpeoplehate","coontown","politics")
curops = c("+","+","+")
findrelsubreddit(cursubs,curops,5)
## Validation examples
# /r/nba + /r/minnesota
cursubs = c("nba","minnesota")
curops = c("+","+")
findrelsubreddit(cursubs,curops,5)
# /r/personalfinance - /r/Frugal
cursubs = c("personalfinance","frugal")
curops = c("+","-")
findrelsubreddit(cursubs,curops,5)
# /r/Fitness + /r/TwoXChromosomes
cursubs = c("fitness","twoxchromosomes")
curops = c("+","+")
findrelsubreddit(cursubs,curops,5)
## Creating the ternary plot
# Similatrity to /r/The_Donald
cursubs = c("the_donald")
curops = c("+")
Dsubsims = findrelsubreddit(cursubs,curops,nrow(cursubmat))
# Similarity to /r/SandersforPresident
cursubs = c("sandersforpresident")
curops = c("+")
Ssubsims = findrelsubreddit(cursubs,curops,nrow(cursubmat))
# Similarity to /r/hillaryclinton
cursubs = c("hillaryclinton")
curops = c("+")
Hsubsims = findrelsubreddit(cursubs,curops,nrow(cursubmat))
# List of subreddits we're interested in
ternarysubs = c("theredpill","coontown","fatpeoplehate","politics","worldnews","news","sjwhate","thebluepill","feminism","books","political_revolution","basicincome")
Dternarysubsims = Dsubsims[tolower(names(Dsubsims))%in%ternarysubs]
Sternarysubsims = Ssubsims[tolower(names(Ssubsims))%in%ternarysubs]
Hternarysubsims = Hsubsims[tolower(names(Hsubsims))%in%ternarysubs]
# Normalizing the matrix
allternarysubsims = transform(merge(transform(merge(Sternarysubsims,Dternarysubsims,by="row.names"),row.names=Row.names,Row.names=NULL),Hternarysubsims,by="row.names"),row.names=Row.names,Row.names=NULL)
colnames(allternarysubsims) = c("S","D","H")
allternarysubsimssums = apply(allternarysubsims,1,sum)
allternarysubsimsnorm = sweep(allternarysubsims,1,allternarysubsimssums,"/")
# Creating the plot
pdf("./ternaryplotanno.pdf",height=10,width=10)
ggtern(data=allternarysubsimsnorm,aes(S,D,H)) + geom_point() + geom_text(label=rownames(allternarysubsimsnorm),hjust=0,vjust=0)
dev.off()
pdf("./ternaryplot.pdf",height=10,width=10)
ggtern(data=allternarysubsimsnorm,aes(S,D,H)) + geom_point() + theme_classic()
dev.off()
# Find subreddits that are particularly biased towards any of the three main candidate subreddits
allsubsims = transform(merge(transform(merge(Ssubsims,Dsubsims,by="row.names"),row.names=Row.names,Row.names=NULL),Hsubsims,by="row.names"),row.names=Row.names,Row.names=NULL)
colnames(allsubsims) = c("S","D","H")
chooseunique = c("H") # Set candidate subreddit of interest
curunique = 1/(allsubsims[,(!(colnames(allsubsims)==chooseunique))]/allsubsims[,chooseunique]) # Calculate fold enrichment of target candidate subreddit over other candidate subreddits for all other subreddits
allsubsimsmin = apply(allsubsims,1,min)
curuniquemin = apply(curunique,1,min)
curuniqueminc = curuniquemin[-which(allsubsimsmin==0)]
curuniquemat = data.frame(enrich=curuniqueminc,allsubsims[match(names(curuniqueminc),rownames(allsubsims)),])
curuniquemato = curuniquemat[order(curuniquemat$enrich,decreasing=TRUE),]
curuniquematoc = curuniquemato[which(curuniquemato[,chooseunique]>=0.25),] # Threshold for high enrichment and high raw similarity
head(curuniquematoc,20)