-
Notifications
You must be signed in to change notification settings - Fork 2
/
TE_landscape_draft.Rmd
6093 lines (4425 loc) · 439 KB
/
TE_landscape_draft.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "The epigenomic landscape of transposable elements across normal human development and anatomy"
author: "Erica Pehrsson"
date: "`r Sys.Date()`"
output: word_document
editor_options:
chunk_output_type: console
---
```{r setup, include=TRUE}
knitr::opts_chunk$set(dev = 'pdf',dev.args=list(pdf = list(useDingbats = FALSE)),fig.width=7, fig.height=8, fig.path='TE_landscape_draft_figures/', cache.path="TE_landscape_draft_cache/")
```
Includes code for producing figures and analysis for the manuscript.
# Setup
## Load required libraries
```{r load libraries}
library(plyr)
packageVersion("plyr")
library(reshape2)
packageVersion("reshape2")
library(ggplot2)
packageVersion("ggplot2")
library(gridExtra)
packageVersion("gridExtra")
library(RColorBrewer)
packageVersion("RColorBrewer")
library(scales)
packageVersion("scales")
library(mgcv)
packageVersion("mgcv")
library(extrafont)
packageVersion("extrafont")
library(rcompanion)
packageVersion("rcompanion")
```
## Set universal plot parameters
```{r set plot parameters}
theme_set(theme_bw(base_size=10) %+replace% theme(panel.background=element_rect(fill="white"),panel.border=element_rect(color="black",fill=NA),panel.grid=element_blank()))
```
## Load custom functions
```{r load functions}
# General
# Count the number of NAs per column for a dataframe
## Input: data_frame - any dataframe
## Output: vector with the number of NAs per column
count_na = function(data_frame){
counts = apply(data_frame,2,function(x) sum(is.na(x)))
return(counts)
}
# Plotting
# From https://stackoverflow.com/questions/11883844/inserting-a-table-under-the-legend-in-a-ggplot2-histogram
# Returns the legend for a ggplot object, to be included in a composite figure
## Input: myggplot - ggplot call
## Output: legend - legend component of a ggplot object
get_legend = function(myggplot){
tmp <- ggplot_gtable(ggplot_build(myggplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
return(legend)
}
# From https://stackoverflow.com/questions/8197559/emulate-ggplot2-default-color-palette
# Returns a vector of colors equally spaced along the color wheel
## Input: n - number of colors
## Output: vector of colors
gg_color_hue = function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
# Matrix processing
# Filter the metadata matrix to only samples with data for that epigenetic technique
## Input: metadata - metadata matrix produced by metadata.R
## metric - epigenetic technique ("WGBS","DNase","H3K27ac"); for all other input, the dataframe is not filtered (all samples have chromHMM data)
## Output: metadata matrix filtered to only samples with data for that technique
filter_metadata = function(metadata,metric){
if (metric == "WGBS") {
metadata_matrix = metadata[which(!is.na(metadata$WGBS)),]
} else if (metric == "DNase") {
metadata_matrix = metadata[which(!is.na(metadata$DNase)),]
} else if (metric == "H3K27ac") {
metadata_matrix = metadata[which(!is.na(metadata$H3K27ac)),]
} else{
metadata_matrix = metadata
}
}
# Convert TE classes as assigned by RepeatMasker to composite classes
# "Other" to "SVA", nine ambiguous classes to "Other"
## Input: class_vector - dataframe column listing TE class
## Output: vector listing updated TE class
convert_class = function(class_vector){
class_vector = factor(class_vector,levels=unique(c(levels(class_vector),"SVA","Other")))
class_vector[which(class_vector == "Other")] = "SVA"
class_vector[which(class_vector %in% c("DNA?","LINE?","LTR?","SINE?","Unknown","Unknown?","RC","Unconfident","Unconfident_RC"))] = "Other"
class_vector = factor(class_vector,levels=c("DNA","LINE","LTR","SINE","SVA","Other"))
return(class_vector)
}
# Split combined RefSeq genic feature names (Cohort) into feature and coding status
## Input: feature_matrix - dataframe with Cohort column
## new_features - additional features to include in Feature factor levels
## Output: dataframe with additional columns Feature and Coding
split_coding = function(feature_matrix,new_features=c()){
to_split = gsub("coding_exon","CDS",feature_matrix$Cohort)
feature_matrix$Feature = factor(unlist(lapply(to_split,function(x) unlist(strsplit(as.character(x),"_"))[1])),levels=c(new_features,features))
feature_matrix$Coding = unlist(lapply(to_split,function(x) unlist(strsplit(as.character(x),"_"))[2]))
# Non-genic cohorts are given the coding status "All"
feature_matrix[which(is.na(feature_matrix$Coding)),]$Coding = "All"
feature_matrix$Coding = factor(feature_matrix$Coding,levels=c("All","pc","nc"))
feature_matrix$Cohort = factor(feature_matrix$Cohort,levels=c(new_features,cohorts))
return(feature_matrix)
}
# Potential
# Given a matrix of the number of samples each individual feature is annotated with each epigenetic state,
# Creates a table with the number of TEs in each state (column) for each number of samples (rows)
## Input: TE_matrix - matrix output by potential.py, individual TEs/promoters (rows) versus the number of samples in each state (columns)
## columns - columns containing states
## samples - total number of samples in dataset
## Output: dist_TE - table with columns for number of samples ("Samples", 0-total) and each state
sample_distribution = function(TE_matrix,columns,samples){
# Create column with number of samples (0 to total)
dist_TE = data.frame(Samples = seq(0,samples))
# Process each state
for (i in columns){
dist_TE = merge(dist_TE,as.data.frame(table(as.integer(TE_matrix[,i]))),by.x="Samples",by.y="Var1",all=TRUE)
colnames(dist_TE)[length(colnames(dist_TE))] = colnames(TE_matrix)[i]
}
dist_TE[is.na(dist_TE)] = 0
return(dist_TE)
}
# From the number of TEs in each state in each number of samples, calculates TE epigenetic statistics
## Input: distribution - table output by sample_distribution
## states - number of states under consideration
## samples - total number of samples in dataset
## Output: stats - table by state (row) with the proportion of TEs ever in the state
## and the mean proportion of samples a TE is in the state (columns)
potential_stats = function(distribution,states,samples){
# Dataframe with state columns
if (states > 1){
distribution = distribution[,2:(states+1)]
}
else{
distribution = as.data.frame(distribution[,2])
}
# For each state, calculate 1) proportion of TEs in the state in at least one sample
# 2) mean/sd proportion of samples in which all TEs are annotated with the state
# 3) mean proportion of samples in which TEs in the state in at least one sample are annotated with the state
stats = as.data.frame(t(rbind(apply(distribution,2,function(x) sum(x[2:(samples+1)])/sum(x)),
apply(distribution,2,function(x) sum(x*seq(0,samples))/sum(x))/samples,
apply(distribution,2,function(x) sd(rep(seq(0,samples),x))/sqrt(sum(x))/samples),
apply(as.data.frame(distribution[2:(samples+1),]),2,function(x) sum(x*seq(1,samples))/sum(x))/samples)))
colnames(stats) = c("Proportion_ever","Samples_avg_all","Samples_SE_all","Samples_avg_ever")
return(stats)
}
# TE features
# Given a dataframe with several variables for a feature, calculates the Spearman correlation between each variable pair, for all TEs and by class
## Input: matrix - matrix of features (rows) versus feature variables (columns)
## indpt_var - list of variables to compare to response_vars variables
## response_vars - list of variables to compare to indpt_var variables
## Output: Dataframe with a row for each combination of variables from indpt_var and response_vars with Spearman's rho and p-value,
## for all TEs and by class
correlate_spearman = function(matrix, indpt_var, response_vars){
# All TEs
indv = as.data.frame(t(apply(matrix[,response_vars],2,function(x) unlist(cor.test(matrix[,indpt_var],x,method="spearman"))[c("p.value","estimate.rho")])))
indv$State = rownames(indv)
indv$class_update = rep("All",dim(indv)[1])
# By class
class = merge(melt(ddply(matrix,~class_update,function(y) apply(y[,response_vars],2,function(x) unlist(cor.test(y[,indpt_var],x,method="spearman"))["p.value"])),id.vars="class_update"),
melt(ddply(matrix,~class_update,function(y) apply(y[,response_vars],2,function(x) unlist(cor.test(y[,indpt_var],x,method="spearman"))["estimate.rho"])),id.vars="class_update"),
by=c("class_update","variable"))
colnames(class) = c("class_update","State","p.value","estimate.rho")
class = rbind(indv,class)
return(class)
}
# Enrichment
# Process the output from a prcomp() call to 1) add metadata to the rotated matrix and
# 2) convert the standard deviations of the principal components into variance per component
## Input: pca - prcomp object
## metadata - the metadata matrix produced by metadata.R
## metadata_col - the column of the metadata matrix corresponding to the rownames of the rotated matrix
## Output: prcomp object with two additional components:
## eigenvectors - rotation matrix with additional columns with sample metadata
## eigenvalues - variance explained by each principal component
format_pca = function(pca,metadata,metadata_col="Sample"){
pca$eigenvectors = cbind(as.data.frame(pca$x),metadata[match(rownames(pca$x),metadata[[metadata_col]]),])
pca$eigenvalues = 100*pca$sdev^2/sum(pca$sdev^2)
return(pca)
}
# Loads dataframe of individual TEs in a subfamily annotated with a state in any sample
## Input: subfamily - subfamily of interest
## state - state of interest
## metric - epigenetic technique
## Output: dataframe with individual TE coordinates, the sample, length of overlap with the state (value varies by technique), and the state
get_subfamily_in_state = function(subfamily,state,metric){
# Load dataframe of individual TEs in the state, location of file based on epigenetic technique
if (metric=="chromHMM"){
subfamily_in_state = read.table(paste("chromHMM/subfamily/by_state/",subfamily,"_",state,".txt",sep=""),sep='\t')[,1:10]
colnames(subfamily_in_state) = c(TE_coordinates[c(1:4,6,5,7)],"Sample","Overlap","State")
} else if (metric=="WGBS") {
subfamily_in_state = read.table(paste("WGBS/subfamily/by_state/",subfamily,"_",state,".txt",sep=""),sep='\t')
colnames(subfamily_in_state) = c(TE_coordinates[c(1:4,6,5,7)],"Sample","Overlap","State")
} else if (metric=="DNase" | metric=="H3K27ac"){
subfamily_in_state = read.table(paste(metric,"/subfamily/true_summit/",subfamily,"_",state,".txt",sep=""),sep='\t')
colnames(subfamily_in_state) = c(TE_coordinates[c(1:4,6,5,7)],"Sample","Overlap")
}
print("Loaded matrix")
return(subfamily_in_state)
}
# Loads list of samples where the subfamily is enriched in the state with LOR > threshold
## Input: subfamily - subfamily of interest
## state - state of interest
## enrichment - LOR threshold
## Output: List of samples where the subfamily is enriched in the state at the specified threshold
get_enriched_samples = function(subfamily,state,enrichment=THRESHOLD_LOR){
# From the matrix of enrichments, filtered by member thresholds (>30 members overall, >10 members in state)
samples = as.vector(unique(subfamily_state_sample_filter[which(subfamily_state_sample_filter$subfamily == subfamily
& subfamily_state_sample_filter$State == state
& subfamily_state_sample_filter$Enrichment > enrichment),]$Sample))
return(samples)
}
# For a given subfamily and state, writes out a file with all individual TEs in the state in samples where the subfamily is enriched
## Input: subfamily - subfamily of interest
## State - state of interest
## Output: subfamily_bedfile - dataframe of individual TEs in the state when the subfamily is enriched with the number of samples
## the TE is in the state, across all samples and only enriched samples.
## Written to a file in bed format
get_subfamily_enriched = function(subfamily,State){
# Identify the epigenetic technique associated with the specified state
metric = ifelse(State %in% chromHMM_states,"chromHMM",ifelse(State %in% meth_states,"WGBS",State))
# Load dataframe of individual TEs in the subfamily annotated with the state in any sample
subfamily_in_state = get_subfamily_in_state(subfamily,State,metric)
# Load list of samples where the subfamily is enriched in the state with LOR > 1.5
samples = get_enriched_samples(subfamily,State,THRESHOLD_LOR)
# Filter dataframe of individual TEs to only samples where the subfamily is enriched
subfamily_enriched = subfamily_in_state[which(subfamily_in_state$Sample %in% samples),]
# Total number of samples each TE is annotated with the state
subfamily_ubiq = aggregate(data=subfamily_in_state,Sample~chromosome+start+stop+subfamily+strand,length)
colnames(subfamily_ubiq)[6] = "Total_samples"
# Number of samples each TE is annotated with the state, enriched samples only
subfamily_bedfile = aggregate(data=subfamily_enriched,Sample~chromosome+start+stop+subfamily+strand,length)
# Combine and sort
subfamily_bedfile = merge(subfamily_bedfile,subfamily_ubiq,by=c("chromosome","start","stop","subfamily","strand"))
subfamily_bedfile = subfamily_bedfile[order(subfamily_bedfile$chromosome,subfamily_bedfile$start),]
# Write out individual TEs in bedfile format
write.table(subfamily_bedfile[,c("chromosome","start","stop","subfamily","Sample","strand")],
file=paste("enrichment/",subfamily,"_",State,"_enriched.bed",sep=""),
quote=FALSE,row.names = FALSE,col.names=FALSE,sep='\t')
return(subfamily_bedfile)
}
# Using permutation tests, calculates p-values for the likelihood of observing the number of samples belonging to each sample category/grouping
# among samples filtered by user-defined thresholds by chance, given the number of samples in each category/grouping overall
## Input: matrix - dataframe with Sample column, with 1-2 columns on which samples can be filtered and a State column (restricted to one state only)
## metric - variable to filter dataframe on
## direction - for p-value calculation, whether permuted results are tested for being above or below observed results
## threshold - threshold for filtering variable
## filtering - additional variable to filter dataframe on
## threshold2 - threshold for additional filtering variable
## Output: dataframe with each category/grouping with the number of samples overall and passing thresholds,
## plus a p-value for observing the number of samples passing the thresholds by chance
permute_by_sample = function(matrix,metric,direction,threshold=0,filtering,threshold2=0){
# Filter the metadata matrix output by metadata.R to match the state under consideration
metadata_matrix = filter_metadata(metadata,ifelse(unique(matrix$State) == "H3K27ac","H3K27ac",ifelse(unique(matrix$State) == "DNase","DNase",ifelse(unique(matrix$State) %in% meth_states,"WGBS","chromHMM"))))
# Creates a table with the number of samples in each grouping for each sample category for all samples
metadata_table = ldply(apply(metadata_matrix[,sample_categories],2,as.data.frame(table)))
colnames(metadata_table) = c("Category","Grouping","Samples")
print("Computing background")
# Creates a table with the number of samples per category/grouping for all samples and only those that pass the specified thresholds
threshold_matrix = function(matrix,metric,threshold,filtering,threshold2){
# Add sample categories for each sample
filter_matrix = merge(matrix,metadata_matrix[,c("Sample",sample_categories)],by="Sample")
# Filter matrix to those passing threshold(s)
filter_matrix = filter_matrix[which(filter_matrix[[metric]] > threshold & filter_matrix[[filtering]] > threshold2),]
# Create a table with the number of samples in each sample category/grouping for only those samples passing thresholds
table_matrix = ldply(apply(filter_matrix[,sample_categories],2,as.data.frame(table)))
colnames(table_matrix) = c("Category","Grouping","Samples")
# Expand table to all possible category/groupings for those samples
table_matrix = merge(table_matrix,metadata_table[,1:2],all.y=TRUE,by=c("Category","Grouping"))
table_matrix[is.na(table_matrix)] = 0
return(table_matrix)
}
# Create sample table for true results
real = threshold_matrix(matrix,metric,threshold,filtering,threshold2)
print("Computing real")
# Create sample table for 1000 permutations of sample labels
permuted = rdply(1000,function(x) {permute_matrix = matrix; permute_matrix$Sample = sample(permute_matrix$Sample);threshold_matrix(permute_matrix,metric,threshold,filtering,threshold2)})
colnames(permuted) = c("Replicate","Category","Grouping","Replicate_samples")
print("Computing permuted")
# Combine true and permuted results
permuted = merge(permuted,real,by=c("Category","Grouping"),all.x=TRUE)
print("Merging")
# Calculate the proportion of permutations for which the result is at or above/at or below the true result, for each category/grouping
# Convert into a p-value by dividing by the number of permutations
if(direction == "+"){
over = ddply(permuted,.(Category,Grouping),summarise,Pvalue=sum(Replicate_samples >= Samples)/1000)
} else if (direction == "-") {
over = ddply(permuted,.(Category,Grouping),summarise,Pvalue=sum(Replicate_samples <= Samples)/1000)
}
print("Computing p-values")
# Combine p-values with true results and table of all samples in each category/grouping
over = merge(over,real,by=c("Category","Grouping"),all.x=TRUE)
over = merge(over,metadata_table,by=c("Category","Grouping"),all.x=TRUE)
return(over)
}
# Mouse
# Counts the number of orthologous TEs with tissue-specific epigenetic activity by tissue,
# including only TEs in the state in <5 human tissues and in both samples of the specified tissue (All),
# and the number of tissues they are in the state in mouse:
# Specific, both samples of the specified tissue, <5 samples overall; On, >8 samples; Off, <2 samples
## Input: x - tissue of interest
## matrix - Dataframe of the number of samples each orthologous TE pair is in the state in each species, overall and by tissue
## Output: named vector with the number of orthologous TE pairs in each category
tissue_matrix = function(x,matrix){
c(All = dim(matrix[which(matrix$Human_samples < 5 & matrix[[paste(x,".x",sep="")]] == 2),])[1],
Specific = dim(matrix[which(matrix$Human_samples < 5 & matrix[[paste(x,".x",sep="")]] == 2 & matrix$Mouse_samples < 5 & matrix[[paste(x,".y",sep="")]] == 2),])[1],
On = dim(matrix[which(matrix$Human_samples < 5 & matrix[[paste(x,".x",sep="")]] == 2 & matrix$Mouse_samples > 8),])[1],
Off = dim(matrix[which(matrix$Human_samples < 5 & matrix[[paste(x,".x",sep="")]] == 2 & matrix$Mouse_samples < 2),])[1])
}
```
## Define color and label vectors
```{bash state lists, eval=FALSE}
# List of 15 chromHMM states
/bar/epehrsson/TE_landscape/sample_lists/chromHMM_states.txt
# List of 4 methylation states
/bar/epehrsson/TE_landscape/sample_lists/methylation_states.txt
```
```{r define vectors}
# Chromosomes
standard_chromosomes = c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX","chrY","chrM")
# TE coordinates
TE_coordinates = c("chromosome","start","stop","subfamily","family","class","strand")
hg19_coordinates = c("human_chr_hg19","human_start_hg19","human_stop_hg19","human_subfamily","human_class","human_family","human_strand_hg19")
mm10_coordinates = c("mouse_chr_mm10","mouse_start_mm10","mouse_stop_mm10","mouse_subfamily","mouse_class","mouse_family","mouse_strand_mm10")
# TE class colors
class_colors = setNames(c("#4A72E8","#FF6600","#006600","#cc0000","lightseagreen","#5C5C5C"),c("DNA","LINE","LTR","SINE","SVA","Other"))
# chromHMM states
## 15-state model
chromHMM_states = c("1_TssA","2_TssAFlnk","3_TxFlnk","4_Tx","5_TxWk","6_EnhG","7_Enh","8_ZNF/Rpts","9_Het","10_TssBiv","11_BivFlnk","12_EnhBiv","13_ReprPC","14_ReprPCWk","15_Quies")
chromHMM_colors = setNames(c(rgb(255,0,0,maxColorValue=255),rgb(255,69,0,maxColorValue=255),rgb(50,205,50,maxColorValue=255),rgb(0,128,0,maxColorValue=255),rgb(0,100,0,maxColorValue=255),rgb(194,225,5,maxColorValue=255),rgb(255,255,0,maxColorValue=255),rgb(102,205,170,maxColorValue=255),rgb(138,145,208,maxColorValue=255),rgb(205,92,92,maxColorValue=255),rgb(233,150,122,maxColorValue=255),rgb(189,183,107,maxColorValue=255),rgb(128,128,128,maxColorValue=255),rgb(192,192,192,maxColorValue=255),"grey90"),chromHMM_states)
## 18-state model
chromHMM_states_18 = setNames(c(rgb(255,0,0,maxColorValue=255),rgb(255,69,0,maxColorValue=255),rgb(255,69,0,maxColorValue=255),
rgb(255,69,0,maxColorValue=255),rgb(0,128,0,maxColorValue=255),rgb(0,100,0,maxColorValue=255),
rgb(194,225,5,maxColorValue=255),rgb(194,225,5,maxColorValue=255),rgb(255,195,77,maxColorValue=255),
rgb(255,195,77,maxColorValue=255),rgb(255,255,0,maxColorValue=255),rgb(102,205,170,maxColorValue=255),
rgb(138,145,208,maxColorValue=255),rgb(205,92,92,maxColorValue=255),rgb(189,183,107,maxColorValue=255),
rgb(128,128,128,maxColorValue=255),rgb(192,192,192,maxColorValue=255),rgb(255,255,255,maxColorValue=255)),
c("1_TssA","2_TssFlnk","3_TssFlnkU","4_TssFlnkD","5_Tx","6_TxWk","7_EnhG1","8_EnhG2","9_EnhA1","10_EnhA2","11_EnhWk",
"12_ZNF/Rpts","13_Het","14_TssBiv","15_EnhBiv","16_ReprPC","17_ReprPCWk","18_Quies"))
## Mouse 15-state chromHMM model
mouse_chromHMM_states = c("TssA","TssAFlnk1","TssAFlnk2","Tx1","Tx2","Enh","EnhLo1","EnhLo2","HetCons","HetFac","TssBiv","EnhPois1","EnhPois2","QuiesG","Quies")
mouse_chromHMM_colors = setNames(c("red","orangered2","orangered4","green4","darkgreen","yellow","gold1","goldenrod2","cornflowerblue","steelblue2","indianred3","burlywood1","lightgoldenrod2","grey","lightgrey"),mouse_chromHMM_states)
# WGBS/methylation states
meth_states = c("Hypomethylated","Intermediate","Hypermethylated","Missing")
meth_labels = setNames(c("Hypomethylated","Intermediate meth","Hypermethylated","Missing meth data"),meth_states)
meth_colors = setNames(c("#F8766D","#00BFC4","#7CAE00","#C77CFF"),meth_states)
# All epigenetic states
states = c(chromHMM_states,meth_states,"DNase","H3K27ac","Expressed_samples")
all_state_colors = c(chromHMM_colors,meth_colors,"aquamarine","tan1","black","black","black")
names(all_state_colors)[20:24] = c("DNase","H3K27ac","Expressed_samples","Bases","CpGs")
all_state_labels = setNames(c(chromHMM_states,meth_labels,"DHS","H3K27ac","RPKM > 1"),states)
all_pc_states = c(states[1:21],unlist(lapply(states[1:21],function(x) paste(x,"PC",sep="_"))))
# Technique labels
metric_labels = c("chromHMM","WGBS","DNase","H3K27ac","RPKM > 1")
names(metric_labels) = c("chromHMM","WGBS","DNase","H3K27ac","Expressed_samples")
## Alternative technique labels
mark_labels = setNames(c("chromHMM","WGBS","DHS","H3K27ac","RPKM > 1"),c("chromHMM","WGBS","DNase","H3K27ac","RNA"))
# Sample classification
sample_categories = c("Group","Anatomy","Type","Age","Cancer","Germline")
category_labels = setNames(c("Group","Anatomy","Type","Age","Cancer","Germ layer"),sample_categories)
group_colors = setNames(c("#924965","#4178AE","#E41A1C","#69608A","#B65C73","#FF9D0C","#678C69","#55A354","#E67326","#FFD924","#AF5B39","#D56F80","#999999","#C5912B","#C58DAA","#F182BC","#C2655D","#DAB92E","#000000"),c("ESC","ES-deriv","IMR90","iPSC","Mesench","Epithelial","HSC & B-cell","Blood & T-cell","Myosat","Neurosph","Adipose","Heart","Other","Brain","Digestive","Sm. Muscle","Muscle","Thymus","ENCODE2012"))
anatomy_colors = setNames(c("#B2DF8A","#33A02C","#FB9A99","#FFED6F","#80B1D3","#666666","#924965","#4178AE","#FDBF6F","#CAB2D6","#6A3D9A","#BEBADA","#FCCDE5","#BC80BD","#E7298A","#FF7F00","#69608A","#B3DE69","#CCEBC5","#000000","#FB8072","#FDB462","#8DD3C7","#D9D9D9","#E31A1C","#A6CEE3","#D95F02","#E6AB02","#66A61E","#B15928"),c("ADRENAL","BLOOD","BONE","BRAIN","BREAST","CERVIX","ESC","ESC_DERIVED","FAT","GI_COLON","GI_DUODENUM","GI_ESOPHAGUS","GI_INTESTINE","GI_RECTUM","GI_STOMACH","HEART","IPSC","KIDNEY","LIVER","LUNG","MUSCLE","MUSCLE_LEG","OVARY","PANCREAS","PLACENTA","SKIN","SPLEEN","STROMAL_CONNECTIVE","THYMUS","VASCULAR"))
type_colors = setNames(brewer.pal(5,"Greens"),c("PrimaryCulture","PrimaryCell","PrimaryTissue","CellLine","ESCDerived"))
age_colors = setNames(brewer.pal(4,"YlOrRd"),c("Cell_line","Unknown","Non_fetal","Fetal"))
cancer_colors = setNames(c("lightgrey","red"),c("No","Yes"))
germline_colors = setNames(brewer.pal(6,"Dark2"),c("Pluripotent","Mesoderm","Ectoderm","Endoderm","Mixed","Other"))
group_order = c("ESC","iPSC","ES-deriv","Blood & T-cell","HSC & B-cell","Thymus","Brain","Neurosph","Digestive","Epithelial","Adipose","Mesench","Myosat","Muscle","Sm. Muscle","Heart","Other","IMR90","ENCODE2012")
age_labels = setNames(c("Cell line","Fetal","Non-fetal","Unknown"),c("Cell_line","Fetal","Non_fetal","Unknown"))
# Genic features
features = c("cpgIsland","promoters","5UTR","CDS","3UTR","exons","introns","intergenic")
genic_labels = setNames(c("CpG island","Promoter","5'UTR","CDS","3'UTR","Exon","Intron","Intergenic","CDS","Vista enhancers"),c(features,"coding_exon","Vista_enhancers"))
cohorts = c("cpgIsland","promoters","promoters_pc","promoters_nc","5UTR","5UTR_pc","5UTR_nc","coding_exon","coding_exon_pc","3UTR","3UTR_pc","3UTR_nc",
"exons","exons_pc","exons_nc","introns","introns_pc","introns_nc","intergenic","blacklist","Vista_enhancers")
# Feature overlap labels
overlap_labels = c("% genome","% TEs (either strand)","% TEs (same strand)","% feature in TEs")
names(overlap_labels) = c("Genome_percent_genome","TEs_percent_TE","TEs_strand_percent_TE","Genome_percent_TE")
# Coding labels
coding_labels = c("All","Protein-coding","Non-coding")
names(coding_labels) = c("All","pc","nc")
# Column name vectors
measure_states_extra = c("States.chromHMM","Max_states_intra","States.WGBS","Max_expression")
measure_metrics = c("Length","mappability","JC_distance","CpGs","CpGs_per_length")
measure_metrics_subfam = c("Count","Total_length","Length","Mappability","Age","CpGs","Count_CpGs","CpGs_per_TE","CpGs_per_kbp")
enrichment_names = c("Length_ijk","Length_ik","Length_jk","Length_k","Enrichment","Length_percent_jk","Members","Count","Percent")
variable_labels = setNames(c("Length (bp)","Mappability","Jukes-Cantor evolutionary distance","CpGs","CpGs per bp"),measure_metrics)
measure_labels = setNames(c("Number of TEs","Total length (bp)","Median length (bp)","Median mappability","Median age (JC)","Total CpGs","TEs with CpGs","CpGs per TE","CpGs per kbp"),measure_metrics_subfam)
```
## Load metadata
Loads a data frame of metadata for each Roadmap epigenome, including: epigenome name; Group, Anatomy, and Type assignments from the REMC; Germ layer, Age, and Cancer assignments from this study; whether the epigenome has data for each techique, includes chrY, and is a cancer cell line/IMR90.
```{bash sample lists, eval=FALSE}
# Samples with chromHMM annotations
## All 127 samples
/bar/epehrsson/TE_landscape/sample_lists/mnemonics.txt
## 121 samples, excluding cancer cell lines and IMR90
/bar/epehrsson/TE_landscape/sample_lists/mnemonics_noCancer.txt
# Samples with WGBS data (37)
/bar/epehrsson/TE_landscape/sample_lists/WGBS_samples.txt
# Samples with DHS data (53)
/bar/epehrsson/TE_landscape/sample_lists/DNase_samples.txt
# Samples with H3K27ac data (98)
/bar/epehrsson/TE_landscape/sample_lists/H3K27ac_samples.txt
# Samples with RNA data, strand-agnostic (56)
/bar/epehrsson/TE_landscape/sample_lists/RNA_samples_agnostic.txt
```
```{r metadata}
source("R_scripts/metadata.R")
```
## Load sample statistics
Creates a matrix of the number of samples with data for each technique, for all samples, excluding cancer cell lines/IMR90, and excluding samples without chrY. Also creates labels for each technique.
```{r load sample stats}
# Samples with data for each technique: all, not a cancer cell line/IMR90, with chrY, both
sample_counts = rbind(apply(metadata[,c("Sample","WGBS","DNase","H3K27ac","RNA")],2,function(x) dim(metadata[which(!is.na(x)),])[1]),
apply(metadata[,c("Sample","WGBS","DNase","H3K27ac","RNA")],2,function(x) dim(metadata[which(!is.na(x) & metadata$Exclude == "Include"),])[1]),
apply(metadata[,c("Sample","WGBS","DNase","H3K27ac","RNA")],2,function(x) dim(metadata[which(!is.na(x) & metadata$chrY == "Yes"),])[1]),
apply(metadata[,c("Sample","WGBS","DNase","H3K27ac","RNA")],2,function(x) dim(metadata[which(!is.na(x) & metadata$chrY == "Yes" & metadata$Exclude == "Include"),])[1]))
colnames(sample_counts)[1] = "chromHMM"
rownames(sample_counts) = c("All","Include","chrY","Include.chrY")
```
## Create essential matrices
Creates matrices of features (TEs, promoters, etc.) and their intersection with epigenetic states.
### TEs
Filters the hg19 repeats identified by RepeatMasker to standard chromosomes (no contigs) and 12 TE classes.
```{bash TEs, eval=FALSE}
# RepeatMasker file for hg19
#/bar/epehrsson/TE_landscape/raw_data/rmsk.txt.gz -> /bar/genomes/hg19/rmsk/rmsk.txt.gz
#/bar/epehrsson/TE_landscape/features/rmsk.txt
# RepeatMasker files restricted to all TE classes, chr 1-22, X, Y, M
awk -v OFS='\t' '{if(($12 == "LTR" || $12 == "DNA" || $12 == "SINE" || $12 == "LINE") && ($6 !~ /_/)) print $6, $7, $8, $11, $12, $13, $10}' rmsk.txt > rmsk_TE.txt
awk -v OFS='\t' '{if(($12=="Unknown"||$12=="Unknown?"||$12=="DNA?"||$12=="LINE?"||$12=="SINE?"||$12=="LTR?"||$12=="Other"||$12=="RC") && ($6 !~ /_/))print $6, $7, $8, $11, $12, $13, $10}' rmsk.txt > rmsk_other.txt
cat rmsk_TE.txt rmsk_other.txt > rmsk_TEother.txt
## Output
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_TE.txt
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_other.txt
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_TEother.txt
```
Average 36bp mappability per repeat from Sundaram et al 2014 Genome Research. Filtered to TE subfamilies.
```{bash TE mappability, eval=FALSE}
# Rmsk 36bp mappability file, standard chromosomes
#/bar/epehrsson/TE_landscape/mappability/rmsk.txt.mapability.36mer -> /bar/genomes/hg19/rmsk/rmsk.txt.mapability.36mer
# TE 36bp mappability file
while read line; do awk -v subfam=$line '{if($4 == subfam) print $0}' rmsk.txt.mapability.36mer >> rmsk_TE_mappability_36mer.txt; done < subfamilies.txt
while read line; do awk -v subfam=$line '{if($4 == subfam) print $0}' rmsk.txt.mapability.36mer >> rmsk_other_mappability_36mer.txt; done < other_subfamilies.txt
## Output
#/bar/epehrsson/TE_landscape/mappability/rmsk_other_mappability_36mer.txt
#/bar/epehrsson/TE_landscape/mappability/rmsk_TE_mappability_36mer.txt
```
Calculates Jukes-Cantor evolutionary distance from subfamily consensus for all hg19 repeats, using the RepeatMasker file with substitution rate as input. Filtered to standard chromosomes and TE classes.
```{bash TE age, eval=FALSE}
# Python script takes *rmsk.txt.gz file in the same folder as input, calculates JC evolutionary distance for rmsk repeats
python calcAge_generalized.py rmsk.txt.gz.JCage
## Output
#/bar/epehrsson/TE_landscape/age/rmsk.txt.gz.JCage
# JC evolutionary matrix for TEs, standard chromosomes
awk -v OFS='\t' '{if(($6 == "LTR" || $6 == "DNA" || $6 == "SINE" || $6 == "LINE") && ($1 !~ /_/)) print $1, $2, $3, $4, $5, $6, $7, $8, $9}' rmsk.txt.gz.JCage > rmsk_TE_JCage.txt
awk -v OFS='\t' '{if(($6 == "LTR?" || $6 == "DNA?" || $6 == "SINE?" || $6 == "LINE?" || $6 == "Unknown?" || $6 == "Unknown" || $6 == "Other" || $6 == "RC") && ($1 !~ /_/)) print $1, $2, $3, $4, $5, $6, $7, $8, $9}' rmsk.txt.gz.JCage > rmsk_other_JCage.txt
## Output
#/bar/epehrsson/TE_landscape/age/rmsk_other_JCage.txt
#/bar/epehrsson/TE_landscape/age/rmsk_TE_JCage.txt
```
Downloaded hg19 chromosome sizes, RefSeq genic features, and CpG islands from the UCSC Table Browser.
```{bash download RefSeq features, eval=FALSE}
# Chromosome sizes
mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e "select chrom, size from hg19.chromInfo" > hg19.genome
## Output
#/bar/epehrsson/TE_landscape/raw_data/hg19.genome
# RefSeq features
#/bar/epehrsson/genic_features/RefSeq/refseq_3UTR.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_5UTR.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_coding_exon.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_exons.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_genes.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_introns.txt
# Regions 2000bp upstream of RefSeq gene TSS
#/bar/epehrsson/genic_features/RefSeq/refseq_up2000.txt
# CpG islands
#/bar/epehrsson/genic_features/CGI/cpgIslandExtUnmasked.txt
```
Filtered chromosome sizes to remove contigs and chrM. Expanded promoter regions (2000bp upstream of TSS) to 500bp downstream of TSS. Identified intergenic regions by taking the complement of RefSeq genic regions.
```{bash process RefSeq features, eval=FALSE}
# hg19 chromosome sizes from UCSC, filtered to chr 1-22, X, Y
head -n25 hg19.genome > hg19_standard.genome
## Output
#/bar/epehrsson/TE_landscape/features/hg19_standard.genome
# Expand 500bp downstream of RefSeq gene TSS
bedtools slop -i refseq_up2000.txt -g hg19.genome -l 0 -r 500 -s > refseq_promoters.txt
## Output
#/bar/epehrsson/genic_features/RefSeq/refseq_promoters.txt
# Intergenic regions
bedtools complement -i refseq_genes.txt -g hg19.genome.standard > refseq_intergenic.txt
## Output
#/bar/epehrsson/genic_features/RefSeq/refseq_intergenic.txt
```
Created files of unique regions overlapping each RefSeq feature (all, protein-coding, and non-coding) and CpG islands by merging individual features. Coding status was determined by RefSeq accession.
```{bash merge RefSeq features, eval=FALSE}
# Merged features
for file in refseq_*.txt ; do sort -k1,1 -k2,2n $file | bedtools merge -i - > $file\_merge; done
rename 's/.txt_merge/_merge.txt/' *.txt_merge
## Output
#/bar/epehrsson/genic_features/RefSeq/refseq_3UTR_merge.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_5UTR_merge.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_coding_exon_merge.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_exons_merge.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_introns_merge.txt
bedtools slop -i refseq_up2000.txt -g hg19.genome -l 0 -r 500 -s | sort -k1,1 -k2,2n - | bedtools merge -i - > refseq_promoters_merge.txt
## Output
#/bar/epehrsson/genic_features/RefSeq/refseq_promoters_merge.txt
sort -k1,1 -k2,2n cpgIslandExtUnmasked.txt | bedtools merge -i - > cpgIslandExtUnmasked_merge.txt
## Output
#/bar/epehrsson/genic_features/CGI/cpgIslandExtUnmasked_merge.txt
# Merged features, coding/non-coding
for file in refseq*.txt; do awk -v OFS='\t' '{if(substr($4,0,2) == "NM") print $1, $2, $3}' $file | sort -k1,1V -k2,2n - | bedtools merge -i - > $(basename "$file" .txt)\_pc_merge.txt; done
for file in refseq*.txt; do awk -v OFS='\t' '{if(substr($4,0,2) == "NR") print $1, $2, $3}' $file | sort -k1,1V -k2,2n - | bedtools merge -i - > $(basename "$file" .txt)\_nc_merge.txt; done
## Output
#/bar/epehrsson/genic_features/RefSeq/refseq_[feature]_pc_merge.txt [6 files]
#/bar/epehrsson/genic_features/RefSeq/refseq_[feature]_nc_merge.txt [5 files]
```
Intersected individual TEs with merged RefSeq features (all, protein-coding, and non-coding) and CpG islands and summed the length of overlap for each TE.
```{bash TE RefSeq, eval=FALSE}
# Intersection between TEs, RefSeq features
for file in ~/genic_features/refseq*merge*txt; do bedtools intersect -wo -a rmsk_TEother.txt -b $file | awk -v OFS='\t' '{a[$1, $2, $3, $4, $5, $6, $7]+=$11}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], a[i];}}' - > rmsk_TEother_$(basename "$file" .txt)\.txt; done
bedtools intersect -wo -a rmsk_TEother.txt -b ~/genic_features/refseq_intergenic.txt | awk -v OFS='\t' '{a[$1, $2, $3, $4, $5, $6, $7]+=$11}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], a[i];}}' - > rmsk_TEother_refseq_intergenic.txt
## Output
#/bar/epehrsson/TE_landscape/features/intersect_features/rmsk_TEother_refseq_[feature].txt [7 files]
# Intersection between TEs, Refseq features (coding/non-coding)
for file in ~/genic_features/RefSeq/refseq_*c_merge.txt; do bedtools intersect -wo -a ../TEs/rmsk_TEother.txt -b $file | awk -v OFS='\t' '{a[$1, $2, $3, $4, $5, $6, $7]+=$11}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], a[i];}}' - > rmsk_TEother_$(basename "$file" .txt)\.txt; done
## Output
#/bar/epehrsson/TE_landscape/features/intersect_features/rmsk_TEother_refseq_[feature]_pc_merge.txt [6 files]
#/bar/epehrsson/TE_landscape/features/intersect_features/rmsk_TEother_refseq_[feature]_nc_merge.txt [5 files]
# Intersection between TEs, CpG islands
bedtools intersect -wo -a rmsk_TEother.txt -b ~/genic_features/cpgIslandExtUnmasked_merge.txt | awk -v OFS='\t' '{a[$1, $2, $3, $4, $5, $6, $7]+=$11}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], a[i];}}' - > rmsk_TEother_cpgIsland.txt
## Output
#/bar/epehrsson/TE_landscape/features/intersect_features/rmsk_TEother_cpgIsland.txt
```
Identified TEs that overlap hg19 blacklist regions. No TE in the SVA or Other classes overlaps a blacklist region.
```{bash TE blacklist, eval=FALSE}
# TEs that overlap the hg19 blacklist
bedtools intersect -wao -a rmsk_TE.txt -b ../../genomes/hg19/blacklist.bed > TE_blacklist.bed
bedtools intersect -wao -a rmsk_other.txt -b ../../genomes/hg19/blacklist.bed > other_blacklist.bed
## Output
#/bar/epehrsson/TE_landscape/features/intersect_features/TE_blacklist.bed
#/bar/epehrsson/TE_landscape/features/intersect_features/other_blacklist.bed
```
Downloaded VISTA enhancers, filtered to those identified in human, converted to bed format, and identified those that are positively validated. There are 1,790 human VISTA enhancers, 934 of which positively validated. Intersected positively validated VISTA enhancers with TEs and identified TEs with overlap.
```{bash TE VISTA, eval=FALSE}
# VISTA enhancers
https://enhancer.lbl.gov/cgi-bin/imagedb3.pl?form=search&show=1&search.form=no&search.result=yes
## Output
#/bar/epehrsson/TE_landscape/features/vista_enhancers/vista_enhancers.txt
# Human VISTA enhancer headers
grep '^>Human' features/vista_enhancers/vista_enhancers.txt
## Output
#/bar/epehrsson/TE_landscape/features/vista_enhancers/vista_enhancers_human.txt
# All human VISTA enhancers in bed format
awk -v FS=':|-|\t' -v OFS='\t' '{print $1, $2, $3, $4, $5}' features/vista_enhancers/vista_enhancers_human.txt > features/vista_enhancers/vista_enhancers_human_all.bed
## Output
#/bar/epehrsson/TE_landscape/features/vista_enhancers/vista_enhancers_human_all.bed
# Human, positively validated VISTA enhancers in bed format
#/bar/epehrsson/TE_landscape/features/vista_enhancers/vista_enhancers_human.bed
wc -l features/vista_enhancers/vista_enhancers_human.bed
# 934
# Human, positively validated VISTA enhancers intersected with TEs
bedtools intersect -wo -a vista_enhancers_human.bed -b rmsk_TE.txt > vista_enhancers_rmsk_TE.txt
bedtools intersect -wo -a vista_enhancers_human.bed -b rmsk_other.txt > vista_enhancers_rmsk_other.txt
## Output
#/bar/epehrsson/TE_landscape/features/intersect_features/vista_enhancers_rmsk_TE.txt
#/bar/epehrsson/TE_landscape/features/intersect_features/vista_enhancers_rmsk_other.txt
cat features/intersect_features/vista_enhancers_rmsk_*.txt | awk -v OFS='\t' '{a[$4,$5,$6,$7,$8,$9,$10]+=1}END{for(i in a) {split (i, sep, SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], a[i];}}' - > features/intersect_features/vista_enhancers_TE.txt
## Output
#/bar/epehrsson/TE_landscape/features/intersect_features/vista_enhancers_TE.txt
# Number of VISTA enhancers overlapping TEs
cat features/intersect_features/vista_enhancers_rmsk_*.txt | awk '{print $1, $2, $3}' - | sort | uniq | wc -l
# 355
```
Creates a data frame of individual TEs, including: length, TE class (assigned by Roadmap and SVA/Other class designations from this study), mappability, Jukes-Cantor evolutionary distance, number of CpGs, CpG density (CpGs/bp), length of overlap with RefSeq genic features, intergenic regions, CpG islands, and blacklist regions, and VISTA enhancers. CpG density is loaded from ''TE_CpG_count.txt'', generated in the "TE CpG count" chunk below.
```{r create TEs, eval=FALSE}
source("R_scripts/TE_matrix.R")
```
### Epigenetic states
Downloaded epigenetic data from the Roadmap Epigenomics Project and mouseENCODE.
```{bash get epigenetic data, eval=FALSE}
# Roadmap
# chromHMM annotations (15-state)
http://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/coreMarks/jointModel/final/all.mnemonics.bedFiles.tgz
## Output
#/bar/epehrsson/TE_landscape/raw_data/chromHMM/all.mnemonics.bedFiles.tgz
#/bar/epehrsson/TE_landscape/raw_data/chromHMM/E#_15_coreMarks_mnemonics.bed [127 files]
# 18-state chromHMM bedfiles
wget https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/core_K27ac/jointModel/final/all.mnemonics.bedFiles.tgz
## Output
#/bar/epehrsson/TE_landscape/raw_data/chromHMM_18state/E#_18_core_K27ac_mnemonics.bed [98 files]
# Individual 50-state chromHMM bedfiles
for sample in E003 E004 E005 E006 E007 E008 E017; do wget https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/class1Models_50states/$sample/$sample\_50_segments.bed.gz .; done
## Output
#/bar/epehrsson/TE_landscape/raw_data/chromHMM_50state/E#_50_segments.bed [7 files]
# WGBS
# By-chromosome WGBS fractional methylation matrices of CpG x sample
#/bar/mchoudhary/2chromTE/Meth/FractionalMethylation_Removed_E027_E064_Fixed_E012/
# DNase narrow peak bedfiles
# DNase_peaks.txt is a list of all 53 files
while read line; do wget http://egg2.wustl.edu/roadmap/data/byFileType/peaks/consolidated/narrowPeak/$line\.gz; gunzip $line\.gz; done < DNase_peaks.txt
## Output
#/bar/epehrsson/TE_landscape/raw_data/DNase/DNase_narrow_peaks/E#-DNase.macs2.narrowPeak [53 files]
# H3K27ac narrow peak bedfiles
while read line; do wget http://egg2.wustl.edu/roadmap/data/byFileType/peaks/consolidated/narrowPeak/$line\-H3K27ac.narrowPeak.gz; done < H3K27ac_samples.txt
## Output
#/bar/epehrsson/TE_landscape/raw_data/H3K27ac/H3K27ac_narrow_peaks/E#-H3K27ac.narrowPeak [98 files]
# RNA-seq normalization factors
#/bar/epehrsson/TE_landscape/raw_data/RNAseq/all.EGID.N.readlength
# Mouse (ENCODE)
# chromHMM (mm10)
bash TE_landscape/bash_scripts/get_mouse_chromHMM.sh
## Output
#/bar/epehrsson/TE_landscape/raw_data/mouse/chromHMM/ENCFF#.bed.gz [12 files]
# WGBS (mm10)
bash TE_landscape/bash_scripts/get_mouse_meth.sh
## Output
#/bar/epehrsson/TE_landscape/raw_data/mouse/WGBS/ENCFF#.bed [9 files]
```
Divided the hg19 genome into 200bp windows (n=15,478,399; no contigs or chrM). Intersected with chromHMM to get state assignments across the genome and reformatted to bed files.
All of the intersections are 200bp. The total width of the 200bp windows (3,095,677,412 bp) is slightly larger than what is covered by chromHMM (3,095,675,000 bp). The missing 2,412 bp is at the end of the chromosomes and does not overlap any TEs (both windows and chromHMM start at base 0 for all chromosomes). Only 24 windows do not have chromHMM annotations, one per chromosome.
```{bash 200bp windows, eval=FALSE}
# Divided up the genome into 200bp windows
bedtools makewindows -g features/hg19_standard.genome -w 200 > features/hg19_standard.windows
## Output
#/bar/epehrsson/TE_landscape/features/hg19_standard.windows
# Intersected with chromHMM
while read line; do bedtools intersect -wo -b features/hg19_standard.windows -a raw_data/chromHMM/$line\_15_coreMarks_mnemonics.bed > chromHMM/genome/windows/windows_$line\.bed; done < sample_lists/mnemonics.txt
# Reformmated to bedfiles and sorted
for file in chromHMM/genome/windows/windows_E*.bed; do awk -v OFS='\t' '{print $5, $6, $7, $4}' $file | sort -k1,1 -k2,2n - > chromHMM/genome/windows/$( basename $file .bed ); done
## Output
#/bar/epehrsson/TE_landscape/chromHMM/genome/windows/windows_E#.bed [127 files]
# QC
## Total width of windows
awk '{sum+=$3-$2}END{print sum}' features/hg19_standard.windows
## Total width of chromHMM
awk '{if($1 != "chrM") sum+=$3-$2}END{print sum}' raw_data/chromHMM/E001_15_coreMarks_mnemonics.bed
```
### Intersection of TEs with epigenetic states
Assigned individual TEs to epigenetic states based on overlap with epigenetic annotations, using 5 techniques.
Assignment of individual TEs to chromHMM states. First, intersected TEs with 200bp windows and found TEs that overlap the center of a window. Then, identified those that do not.
```{bash split TEs summit, eval=FALSE}
# Intersect TEs with 200bp windows
bedtools intersect -wo -a features/TEs/rmsk_TEother.txt -b features/hg19_standard.windows > features/TEs/rmsk_TEother_windows.txt
## Output
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_TEother_windows.txt
# Identify TEs that span the center of a 200bp chromHMM annotation ("summit")
awk '{if(($9+99.5 >= $2) && ($9+99.5 < $3)) print $0}' features/TEs/rmsk_TEother_windows.txt > features/TEs/rmsk_TEother_summit.txt
cut -f1-7 features/TEs/rmsk_TEother_summit.txt | sort -k1,1 -k2,2n | uniq > features/TEs/rmsk_TEother_summit
mv features/TEs/rmsk_TEother_summit features/TEs/rmsk_TEother_summit.txt
## Output
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_TEother_summit.txt
# Identify TEs that do not ("majority")
comm -23 <(sort features/TEs/rmsk_TEother.txt) <(sort features/TEs/rmsk_TEother_summit.txt) > features/TEs/rmsk_TEother_majority.txt
sort -k1,1 -k2,2n features/TEs/rmsk_TEother_majority.txt > features/TEs/rmsk_TEother_majority
mv features/TEs/rmsk_TEother_majority features/TEs/rmsk_TEother_majority.txt
## Output
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_TEother_majority.txt
```
Intersected individual TEs with chromHMM bedfiles and found the total length of overlap between each TE and each chromHMM state per sample. From the intersection files, identified intersections where the TE overlaps the center of a 200bp window. Summed the total overlap of the TE with the state, including only intersections with 200bp window centers. For those that do not overlap a 200bp window center, found the state covering the majority of the TE (ties were randomly assigned). Combined all into one file and sorted.
```{bash TE chromHMM matrix, eval=FALSE}
# Intersect with individual TEs
for file in chromHMM_bedfiles/*.bed ; do bedtools intersect -wo -a rmsk_TE.txt -b $file > $file\_TE; done
for file in chromHMM_bedfiles/*.bed; do bedtools intersect -wo -a rmsk_other.txt -b $file > $file\_other; done
## Output
#/bar/epehrsson/TE_landscape/chromHMM/TEs/intersect/E#_15_coreMarks_mnemonics.bed_other [127 files]
#/bar/epehrsson/TE_landscape/chromHMM/TEs/intersect/E#_15_coreMarks_mnemonics.bed_TE [127 files]
# Original matrix creation
for file in chromHMM_bedfiles/*.bed_TE; do suffix=$(basename $file | cut -d '_' -f1); awk -v OFS='\t' '{a[$1, $2, $3, $4, $5, $6, $7, $11]+=$12;}END{for(i in a) {split (i, sep, SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sep[8], a[i];}}' $file | awk -v x=$suffix 'BEGIN{OFS="\t";}{print $0, x}' - ; done >> all_chromHMM_TE.txt
sort -k1,1V -k2,2n -k3,3n -k6,6 -k10,10 all_chromHMM_TE.txt > all_chromHMM_TE_sorted.txt
for file in chromHMM_bedfiles/*.bed_other; do suffix=$(basename $file | cut -d '_' -f1); awk -v OFS='\t' '{a[$1, $2, $3, $4, $5, $6, $7, $11]+=$12;}END{for(i in a) {split (i, sep, SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sep[8], a[i];}}' $file | awk -v x=$suffix 'BEGIN{OFS="\t";}{print $0, x}' - ; done >> all_chromHMM_other.txt
sort -k1,1V -k2,2n -k3,3n -k6,6 -k10,10 all_chromHMM_other.txt > all_chromHMM_other_sorted.txt
# Combined into one file
# Filtering by overlap with 200bp window centers ("summit")
while read line; do python ~/bin/TE_landscape/filter_summit.py chromHMM/TEs/intersect/$line\_15_coreMarks_mnemonics.bed_TE 1 8 chromHMM/summit/TEs/rmsk_TE_$line\_chromHMM.bed chromHMM; python ~/bin/TE_landscape/filter_summit.py chromHMM/TEs/intersect/$line\_15_coreMarks_mnemonics.bed_other 1 8 chromHMM/summit/TEs/rmsk_other_$line\_chromHMM.bed chromHMM; cat chromHMM/summit/TEs/rmsk_TE_$line\_chromHMM.bed chromHMM/summit/TEs/rmsk_other_$line\_chromHMM.bed > chromHMM/summit/TEs/rmsk_TEother_$line\_chromHMM.bed; rm chromHMM/summit/TEs/rmsk_TE_$line\_chromHMM.bed; rm chromHMM/summit/TEs/rmsk_other_$line\_chromHMM.bed; done < sample_lists/mnemonics.txt
## Output
#/bar/epehrsson/TE_landscape/chromHMM/summit/TEs/rmsk_TEother_E#_chromHMM.bed [127 files]
# Including those that do not overlap window center ("majority")
## From original TE x sample x state file, split by sample
while read line; do python identify_no_summit.py $line rmsk_TEother_$line\_chromHMM.bed rmsk_TEother_$line\_chromHMM_noSummit.txt 7 8; done < mnemonics.txt
## Output
#/bar/epehrsson/TE_landscape/chromHMM/summit/TEs/rmsk_TEother_E#_chromHMM_noSummit.txt [127 files]
# Matrix of TE x sample x state, including summit and majority TEs
while read line; do awk -v OFS='\t' -v sample=$line '{a[$1, $2, $3, $4, $5, $6, $7, $11]+=$12}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sample, a[i], sep[8], "summit"}}' rmsk_TEother_$line\_chromHMM.bed >> rmsk_TEother_chromHMM.txt; awk -v OFS='\t' -v sample=$line '{a[$1, $2, $3, $4, $5, $6, $7, $8]+=$9}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sample, a[i], sep[8], "majority"}}' rmsk_TEother_$line\_chromHMM_noSummit.txt >> rmsk_TEother_chromHMM.txt; done < mnemonics.txt
sort -k1,1V -k2,2n -k3,3n -k4,4 -k8,8 rmsk_TEother_chromHMM.txt > ~/TE_landscape/chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt
## Output
#/bar/epehrsson/TE_landscape/chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt
```
Using the chromHMM states assigned above, found the number of samples in which each TE is annotated with each state, for all samples and excluding cancer cell lines/IMR90.
```{bash TE chromHMM potential, eval=FALSE}
# All
python ~/bin/TE_landscape/potential.py chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt features/TEs/rmsk_TEother.txt chromHMM/chromHMM_states.txt sample_lists/mnemonics.txt chromHMM/potential/rmsk_TEother_chromHMM_summit_potential.txt 0 7 9 8
## Output
#/bar/epehrsson/TE_landscape/chromHMM/potential/rmsk_TEother_chromHMM_summit_potential.txt
# No cancer
python ~/bin/TE_landscape/potential.py chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt features/TEs/rmsk_TEother.txt chromHMM/chromHMM_states.txt sample_lists/mnemonics_noCancer.txt chromHMM/potential/rmsk_TEother_chromHMM_summit_potential_noCancer.txt 0 7 9 8
## Output
#/bar/epehrsson/TE_landscape/chromHMM/potential/rmsk_TEother_chromHMM_summit_potential_noCancer.txt
```
Found the maximum number of states with which each TE can be annotated in a single sample, for TEs overlapping 200bp window centers only.
```{bash TE chromHMM max intra, eval=FALSE}
# Maximum states per TE in any sample
python ~/bin/TE_landscape/state_sharing_intra_max.py chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt chromHMM/rmsk_TEother_chromHMM_summit_max.txt 7
## Output
#/bar/epehrsson/TE_landscape/chromHMM/rmsk_TEother_chromHMM_summit_max.txt
```
Assignment of individual TEs to methylation states. First, combined the CpG methylation level across all chromosomes and reformatted as a sorted bed file.
```{bash process WGBS, eval=FALSE}
# Bedfile of CpG methylation level across all chromosomes, both strands
for file in /bar/mchoudhary/2chromTE/Meth/FractionalMethylation_Removed_E027_E064_Fixed_E012/chr*.fm; do awk -v chr=$(basename "$file" .fm) -v OFS='\t' '{print chr, $1, $1+1, $0}' $file >> all_CpG_Meth.bedGraph; done &
cut -f1-3,5- all_CpG_Meth.bedGraph | sort -k1,1V -k2,2n -k3,3n - > all_CpG_Meth.bed
## Output
#/bar/epehrsson/TE_landscape/WGBS/all_CpG_Meth.bedGraph
#/bar/epehrsson/TE_landscape/WGBS/all_CpG_Meth.bed
```
Intersected the CpG methylation level bedfile with individual TEs and found the average methylation across all CpGs overlapping each TE.
```{bash TE WGBS matrix, eval=FALSE}
# Intersect with individual TEs
split -l 1000000 ~/TE_landscape/all_CpG_Meth.bed
for file in xa*; do echo $file; bedtools intersect -wo -a ~/TE_landscape/rmsk_TEother.txt -b $file >> TE_CpG_Meth_new.bed ; done
## Output
#/bar/epehrsson/TE_landscape/WGBS/TE_CpG_Meth_new.bed
# Average methylation level of each TE in each sample
python ~/bin/TE_landscape/average_methylation.py TE_CpG_Meth_new.bed ~/TE_landscape/rmsk_TEother.txt ~/TE_landscape/WGBS_samples.txt TE_CpG_Meth_new_average.txt
## Output
#/bar/epehrsson/TE_landscape/WGBS/TE_CpG_Meth_new_average.txt
```
Counted the number of CpGs overlapping each TE.
```{bash TE CpG count, eval=FALSE}
# Number of CpGs per TE
awk -v OFS='\t' '{a[$1, $2, $3, $4, $5, $6, $7]+=1}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], a[i];}}' TE_CpG_Meth_new.bed > TE_CpG_count.txt
## Output
#/bar/epehrsson/TE_landscape/WGBS/TE_CpG_count.txt
```
Overlap of individual TEs with DHS peaks. Intersected individual TEs with DHS peaks by sample, then filtered to only intersections where the TE overlaps the summit of the DHS peak (see https://genome.ucsc.edu/FAQ/FAQformat.html#format12). Counted the number of intersections per TE and combined into one file.
```{bash TE DHS matrix, eval=FALSE}
# Intersect with individual TEs
bash TE_landscape/DNase/intersect_sample_list_DNase_peak.sh
## Output
#/bar/epehrsson/TE_landscape/DNase/intersect/TEs/rmsk_TEother_E#-DNase.macs2.narrowPeak [53 files]
# Filtering to TEs overlapping peak summit
for file in DNase/intersect/TEs/rmsk_TEother_E*-DNase.macs2.narrowPeak; do awk '{summit=$9+$17; if((summit >= $2) && (summit < $3)) print $0}' $file > DNase/true_summit/$( basename $file -DNase.macs2.narrowPeak)_DNase_summit.txt; done
## Output
#/bar/epehrsson/TE_landscape/DNase/true_summit/rmsk_TEother_E#_DNase_summit.txt [53 files]
# Matrix of TE x sample x state
while read line; do awk -v OFS='\t' -v sample=$line '{a[$1, $2, $3, $4, $5, $6, $7]+=1}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sample, a[i]}}' DNase/true_summit/rmsk_TEother_$line\_DNase_summit.txt >> DNase/true_summit/rmsk_TEother_DNase_summit.txt; done < sample_lists/DNase_samples.txt
## Output
#/bar/epehrsson/TE_landscape/DNase/true_summit/rmsk_TEother_DNase_summit.txt
```
Overlap of individual TEs with H3K27ac peaks. Intersected individual TEs with H3K27ac peaks by sample, then filtered to only intersections where the TE overlaps the summit of the H3K27ac peak. Counted the number of intersections per TE and combined into one file.
```{bash TE H3K27ac matrix, eval=FALSE}
# Intersect with individual TEs
while read line; do bedtools intersect -wo -a ../rmsk_TEother.txt -b H3K27ac_narrow_peaks/$line\-H3K27ac.narrowPeak > H3K27ac_TEs/rmsk_TEother_$line\-H3K27ac.narrowPeak; done < H3K27ac_samples.txt
## Output
#/bar/epehrsson/TE_landscape/H3K27ac/intersect/TEs/rmsk_TEother_E#-H3K27ac.narrowPeak [98 files]
# Filtering to TEs overlapping peak summit
for file in H3K27ac/intersect/TEs/rmsk_TEother_E*-H3K27ac.narrowPeak; do awk '{summit=$9+$17; if((summit >= $2) && (summit < $3)) print $0}' $file > H3K27ac/true_summit/$( basename $file -H3K27ac.narrowPeak)_H3K27ac_summit.txt; done
## Output
#/bar/epehrsson/TE_landscape/H3K27ac/true_summit/rmsk_TEother_E#_H3K27ac_summit.txt [98 files]
# Matrix of TE x sample x state
while read line; do awk -v OFS='\t' -v sample=$line '{a[$1, $2, $3, $4, $5, $6, $7]+=1}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sample, a[i]}}' H3K27ac/true_summit/rmsk_TEother_$line\_H3K27ac_summit.txt >> H3K27ac/true_summit/rmsk_TEother_H3K27ac_summit.txt; done < sample_lists/H3K27ac_samples.txt