TE_landscape_draft.Rmd

---
title: "The epigenomic landscape of transposable elements across normal human development and anatomy"
author: "Erica Pehrsson"
date: "`r Sys.Date()`"
output: word_document
editor_options: 
  chunk_output_type: console
---

```{r setup, include=TRUE}
# Global chunk options: figures are rendered with the pdf device (useDingbats disabled),
# sized 7 x 8, and figure/cache files go to dedicated directories
knitr::opts_chunk$set(
  dev = 'pdf',
  dev.args = list(pdf = list(useDingbats = FALSE)),
  fig.width = 7,
  fig.height = 8,
  fig.path = 'TE_landscape_draft_figures/',
  cache.path = "TE_landscape_draft_cache/"
)
```

Includes code for producing figures and analysis for the manuscript. 

# Setup

## Load required libraries

```{r load libraries}
library(plyr)
packageVersion("plyr")
library(reshape2)
packageVersion("reshape2")
library(ggplot2)
packageVersion("ggplot2")
library(gridExtra)
packageVersion("gridExtra")
library(RColorBrewer)
packageVersion("RColorBrewer")
library(scales)
packageVersion("scales")
library(mgcv)
packageVersion("mgcv")
library(extrafont)
packageVersion("extrafont")
library(rcompanion)
packageVersion("rcompanion")
```

## Set universal plot parameters

```{r set plot parameters}
# Default theme for every plot: theme_bw at base size 10, with a white panel,
# a black panel border, and no grid lines
theme_set(
  theme_bw(base_size = 10) %+replace%
    theme(
      panel.background = element_rect(fill = "white"),
      panel.border = element_rect(color = "black", fill = NA),
      panel.grid = element_blank()
    )
)
```

## Load custom functions

```{r load functions}
# General

# Tally the missing values in every column of a dataframe
## Input: data_frame - any dataframe
## Output: vector with one entry per column giving the number of NAs in that column
count_na = function(data_frame){
  # Number of NA entries in a single column
  na_in_column = function(column) sum(is.na(column))
  apply(data_frame,2,na_in_column)
}

# Plotting

# From https://stackoverflow.com/questions/11883844/inserting-a-table-under-the-legend-in-a-ggplot2-histogram
# Extracts the legend of a ggplot object so it can be placed separately in a composite figure
## Input: myggplot - ggplot call
## Output: the grob of the built plot whose name is exactly "guide-box"
get_legend = function(myggplot){
  # Build the plot and convert it to its table of grobs
  plot_table = ggplot_gtable(ggplot_build(myggplot))
  # Locate the legend among the grobs by name
  grob_names = sapply(plot_table$grobs, function(grob) grob$name)
  legend_index = which(grob_names == "guide-box")
  plot_table$grobs[[legend_index]]
}

# From https://stackoverflow.com/questions/8197559/emulate-ggplot2-default-color-palette
# Builds a palette of n colors at evenly spaced hues (luminance 65, chroma 100)
## Input: n - number of colors
## Output: vector of n colors
gg_color_hue = function(n) {
  # n + 1 evenly spaced hue values from 15 to 375; only the first n are used
  hue_angles = seq(from = 15, to = 375, length.out = n + 1)
  palette = hcl(h = hue_angles, l = 65, c = 100)
  palette[1:n]
}

# Matrix processing

# Filter the metadata matrix to only samples with data for that epigenetic technique 
## Input: metadata - metadata matrix produced by metadata.R
## metric - epigenetic technique ("WGBS","DNase","H3K27ac"); for all other input, the dataframe is not filtered (all samples have chromHMM data)
## Output: metadata matrix filtered to only samples with data for that technique
filter_metadata = function(metadata,metric){
  # Each of these techniques has a metadata column of the same name;
  # NA in that column marks a sample without data for the technique
  if (metric %in% c("WGBS","DNase","H3K27ac")) {
    metadata_matrix = metadata[!is.na(metadata[[metric]]),]
  } else {
    metadata_matrix = metadata
  }
  # Return explicitly so the result is visible rather than the invisible value of an assignment
  return(metadata_matrix)
}

# Convert TE classes as assigned by RepeatMasker to composite classes
# "Other" to "SVA", nine ambiguous classes to "Other"
## Input: class_vector - dataframe column listing TE class (factor or character)
## Output: factor listing updated TE class, with levels DNA, LINE, LTR, SINE, SVA, Other;
## any class outside these levels after conversion becomes NA
convert_class = function(class_vector){
  # Work on character values so the conversion does not depend on the input's factor levels
  # (a character vector has no levels, which previously turned most classes into NA)
  classes = as.character(class_vector)
  
  # Identify both groups on the original labels before relabelling,
  # so the new "Other" entries are not confused with the RepeatMasker "Other" class
  is_sva = !is.na(classes) & classes == "Other"
  is_ambiguous = classes %in% c("DNA?","LINE?","LTR?","SINE?","Unknown","Unknown?","RC","Unconfident","Unconfident_RC")
  classes[is_sva] = "SVA"
  classes[is_ambiguous] = "Other"
  
  return(factor(classes,levels=c("DNA","LINE","LTR","SINE","SVA","Other")))
}

# Split combined RefSeq genic feature names (Cohort) into feature and coding status
# Uses the global vectors `features` and `cohorts` for factor levels
## Input: feature_matrix - dataframe with Cohort column
## new_features - additional features to include in Feature factor levels
## Output: dataframe with additional columns Feature and Coding
split_coding = function(feature_matrix,new_features=c()){
  # "coding_exon" contains the separator itself, so rename it to "CDS" before splitting on "_"
  to_split = gsub("coding_exon","CDS",feature_matrix$Cohort)
  pieces = strsplit(as.character(to_split),"_")
  # First piece is the feature, second piece (if any) is the coding status
  feature_names = vapply(pieces,function(x) x[1],character(1))
  coding_status = vapply(pieces,function(x) x[2],character(1))
  
  feature_matrix$Feature = factor(feature_names,levels=c(new_features,features))
  # Cohorts with no second piece are given the coding status "All".
  # Assigning into the vector (not into a row subset of the dataframe) also works
  # when every cohort already has a coding status.
  coding_status[is.na(coding_status)] = "All"
  feature_matrix$Coding = factor(coding_status,levels=c("All","pc","nc"))
  feature_matrix$Cohort = factor(feature_matrix$Cohort,levels=c(new_features,cohorts))
  
  return(feature_matrix)
}

# Potential

# Summarizes, for each epigenetic state, how many features are annotated with the state in each number of samples
# Produces a table with one row per number of samples and one column per state
## Input: TE_matrix - matrix output by potential.py, individual TEs/promoters (rows) versus the number of samples in each state (columns)
## columns - indices of the columns containing states
## samples - total number of samples in dataset
## Output: dist_TE - table with columns for number of samples ("Samples", 0-total) and each state
sample_distribution = function(TE_matrix,columns,samples){
  # Start from every possible number of samples, 0 through the total
  dist_TE = data.frame(Samples = seq(0,samples))
  
  for (state_col in columns){
    # Number of features observed at each sample count for this state
    state_counts = as.data.frame(table(as.integer(TE_matrix[,state_col])))
    dist_TE = merge(dist_TE,state_counts,by.x="Samples",by.y="Var1",all=TRUE)
    # The merged count column arrives last; name it after the state
    colnames(dist_TE)[ncol(dist_TE)] = colnames(TE_matrix)[state_col]
  }
  # Sample counts that were never observed for a state have zero features
  dist_TE[is.na(dist_TE)] = 0
  
  dist_TE
}

# From the number of TEs in each state in each number of samples, calculates TE epigenetic statistics
## Input: distribution - table output by sample_distribution (first column "Samples", then one column per state;
## row k holds the number of TEs in the state in k-1 samples)
## states - number of states under consideration
## samples - total number of samples in dataset
## Output: stats - table by state (row) with the columns
## Proportion_ever - proportion of TEs in the state in at least one sample
## Samples_avg_all - mean proportion of samples in the state, over all TEs
## Samples_SE_all - standard error of Samples_avg_all
## Samples_avg_ever - mean proportion of samples in the state, over TEs in the state in at least one sample
potential_stats = function(distribution,states,samples){
  # Dataframe with state columns (drops the leading "Samples" column)
  # NOTE(review): with a single state the column is extracted as a vector before being wrapped in a
  # dataframe, so the state's column name is not carried through to the row names of the result
  if (states > 1){
    distribution = distribution[,2:(states+1)]
  }
  else{
    distribution = as.data.frame(distribution[,2])
  }
  # For each state, calculate 1) proportion of TEs in the state in at least one sample
  # 2) mean/sd proportion of samples in which all TEs are annotated with the state
  # 3) mean proportion of samples in which TEs in the state in at least one sample are annotated with the state
  # Rows 2 to samples+1 are the counts for 1 to `samples` samples (row 1 is the zero-sample count);
  # rep(seq(0,samples),x) expands the counts back into one sample number per TE to take the sd
  stats = as.data.frame(t(rbind(apply(distribution,2,function(x) sum(x[2:(samples+1)])/sum(x)),
                                apply(distribution,2,function(x) sum(x*seq(0,samples))/sum(x))/samples,
                                apply(distribution,2,function(x) sd(rep(seq(0,samples),x))/sqrt(sum(x))/samples),
                                apply(as.data.frame(distribution[2:(samples+1),]),2,function(x) sum(x*seq(1,samples))/sum(x))/samples)))
  colnames(stats) = c("Proportion_ever","Samples_avg_all","Samples_SE_all","Samples_avg_ever")
  
  return(stats)
}

# TE features

# Computes Spearman correlations between one independent variable and a set of response variables,
# once across all TEs and once within each TE class
## Input: matrix - dataframe of features (rows) versus feature variables (columns), including a class_update column
## indpt_var - variable compared against each of the response_vars variables
## response_vars - variables compared against indpt_var
## Output: Dataframe with a row for each response variable (State) and class (class_update, "All" for all TEs),
## with the p-value and Spearman's rho as reported by cor.test
correlate_spearman = function(matrix, indpt_var, response_vars){ 
  # Runs the Spearman test of one response column against the independent variable in `data`
  # and returns the requested entries of the flattened test result
  spearman_fields = function(data,response,fields){
    unlist(cor.test(data[,indpt_var],response,method="spearman"))[fields]
  }
  
  # All TEs: one row per response variable
  overall = as.data.frame(t(apply(matrix[,response_vars],2,function(response) spearman_fields(matrix,response,c("p.value","estimate.rho")))))
  overall$State = rownames(overall)
  overall$class_update = rep("All",nrow(overall))
  
  # By class: p-values and rho are computed separately, reshaped to long format, then joined
  pvalue_by_class = melt(ddply(matrix,~class_update,function(class_rows) apply(class_rows[,response_vars],2,function(response) spearman_fields(class_rows,response,"p.value"))),id.vars="class_update")
  rho_by_class = melt(ddply(matrix,~class_update,function(class_rows) apply(class_rows[,response_vars],2,function(response) spearman_fields(class_rows,response,"estimate.rho"))),id.vars="class_update")
  by_class = merge(pvalue_by_class,rho_by_class,by=c("class_update","variable"))
  colnames(by_class) = c("class_update","State","p.value","estimate.rho")
  
  # Stack the overall rows on top of the per-class rows
  rbind(overall,by_class)
}

# Enrichment
# Post-processes a prcomp() result to 1) attach sample metadata to the matrix in pca$x and
# 2) convert the standard deviations of the principal components into percent variance per component
## Input: pca - prcomp object
## metadata - the metadata matrix produced by metadata.R
## metadata_col - the column of the metadata matrix corresponding to the rownames of pca$x
## Output: prcomp object with two additional components:
## eigenvectors - pca$x as a dataframe, with additional columns of sample metadata matched by row name
## eigenvalues - percentage of total variance explained by each principal component
format_pca = function(pca,metadata,metadata_col="Sample"){
  # pca$x as a dataframe, joined column-wise to the metadata row of each sample
  scores = as.data.frame(pca$x)
  pca$eigenvectors = cbind(scores,metadata[match(rownames(pca$x),metadata[[metadata_col]]),])
  # Variance of each component as a percentage of the total variance
  component_variance = pca$sdev^2
  pca$eigenvalues = 100*component_variance/sum(component_variance)
  pca
}

# Loads dataframe of individual TEs in a subfamily annotated with a state in any sample
# Reads the global vector TE_coordinates for column names
## Input: subfamily - subfamily of interest
## state - state of interest
## metric - epigenetic technique ("chromHMM", "WGBS", "DNase", or "H3K27ac"); any other value is an error
## Output: dataframe with individual TE coordinates, the sample, length of overlap with the state (value varies by technique), and the state
## (no State column for DNase and H3K27ac)
get_subfamily_in_state = function(subfamily,state,metric){
  # Fail early with a clear message rather than an "object not found" error at return
  if (!(metric %in% c("chromHMM","WGBS","DNase","H3K27ac"))){
    stop("Unsupported metric: ",metric," (expected chromHMM, WGBS, DNase, or H3K27ac)",call.=FALSE)
  }
  
  # Coordinate columns in file order: class comes before family, unlike TE_coordinates
  coordinate_columns = TE_coordinates[c(1:4,6,5,7)]
  
  # Load dataframe of individual TEs in the state, location of file based on epigenetic technique
  if (metric=="chromHMM"){
    # Only the first 10 columns of the chromHMM files are used
    subfamily_in_state = read.table(paste0("chromHMM/subfamily/by_state/",subfamily,"_",state,".txt"),sep='\t')[,1:10]
    colnames(subfamily_in_state) = c(coordinate_columns,"Sample","Overlap","State")
  } else if (metric=="WGBS") {
    subfamily_in_state = read.table(paste0("WGBS/subfamily/by_state/",subfamily,"_",state,".txt"),sep='\t')
    colnames(subfamily_in_state) = c(coordinate_columns,"Sample","Overlap","State")
  } else {
    # DNase or H3K27ac: files live under the technique's own directory
    subfamily_in_state = read.table(paste0(metric,"/subfamily/true_summit/",subfamily,"_",state,".txt"),sep='\t')
    colnames(subfamily_in_state) = c(coordinate_columns,"Sample","Overlap")
  }
  print("Loaded matrix")

  return(subfamily_in_state)
}

# Looks up the samples in which a subfamily is enriched in a state above an LOR threshold
# Reads the global enrichment table subfamily_state_sample_filter
# (per the original note, already filtered by member thresholds: >30 members overall, >10 members in state)
## Input: subfamily - subfamily of interest
## state - state of interest
## enrichment - LOR threshold (defaults to the global THRESHOLD_LOR)
## Output: vector of unique samples where the subfamily's Enrichment in the state exceeds the threshold
get_enriched_samples = function(subfamily,state,enrichment=THRESHOLD_LOR){
  enrichment_table = subfamily_state_sample_filter
  # Rows for this subfamily and state whose enrichment is strictly above the threshold
  passing_rows = which(enrichment_table$subfamily == subfamily
                       & enrichment_table$State == state
                       & enrichment_table$Enrichment > enrichment)
  as.vector(unique(enrichment_table$Sample[passing_rows]))
}

# For a given subfamily and state, collects the individual TEs in the state in samples where the subfamily is enriched
# and writes them to enrichment/<subfamily>_<State>_enriched.bed
# Reads the globals chromHMM_states, meth_states, and THRESHOLD_LOR
## Input: subfamily - subfamily of interest
## State - state of interest
## Output: subfamily_bedfile - dataframe of individual TEs in the state when the subfamily is enriched, with the number of samples
## the TE is in the state across enriched samples only (Sample) and across all samples (Total_samples).
## Also written to a file in bed format (without Total_samples)
get_subfamily_enriched = function(subfamily,State){
  # Columns that identify an individual TE
  te_key = c("chromosome","start","stop","subfamily","strand")
  # Count of rows (one per sample) for each individual TE
  count_formula = Sample~chromosome+start+stop+subfamily+strand
  
  # Epigenetic technique for the state: chromHMM or WGBS states map to their technique,
  # any other state is passed through as the technique name
  metric = ifelse(State %in% chromHMM_states,"chromHMM",ifelse(State %in% meth_states,"WGBS",State))
  
  # Individual TEs in the subfamily annotated with the state in any sample
  all_instances = get_subfamily_in_state(subfamily,State,metric)
  # Samples where the subfamily is enriched in the state with LOR above THRESHOLD_LOR
  enriched_samples = get_enriched_samples(subfamily,State,THRESHOLD_LOR)
  # Restrict the individual TEs to those enriched samples
  enriched_instances = all_instances[which(all_instances$Sample %in% enriched_samples),]
  
  # Number of samples each TE is annotated with the state, across all samples
  total_counts = aggregate(data=all_instances,count_formula,length)
  colnames(total_counts)[6] = "Total_samples"
  # Number of samples each TE is annotated with the state, enriched samples only
  subfamily_bedfile = aggregate(data=enriched_instances,count_formula,length)
  
  # Combine both counts per TE and sort by position
  subfamily_bedfile = merge(subfamily_bedfile,total_counts,by=te_key)
  row_order = order(subfamily_bedfile$chromosome,subfamily_bedfile$start)
  subfamily_bedfile = subfamily_bedfile[row_order,]
  
  # Write out individual TEs in bedfile format
  write.table(subfamily_bedfile[,c("chromosome","start","stop","subfamily","Sample","strand")],
              file=paste0("enrichment/",subfamily,"_",State,"_enriched.bed"),
              quote=FALSE,row.names = FALSE,col.names=FALSE,sep='\t')
  
  return(subfamily_bedfile)
}

# Using permutation tests, calculates p-values for the likelihood of observing the number of samples belonging to each sample category/grouping
# among samples filtered by user-defined thresholds by chance, given the number of samples in each category/grouping overall
# Reads the globals metadata, meth_states, and sample_categories. Results vary between runs because sample labels are shuffled with sample().
## Input: matrix - dataframe with Sample column, with 1-2 columns on which samples can be filtered and a State column (restricted to one state only)
## metric - variable to filter dataframe on
## direction - for p-value calculation, whether permuted results are tested for being above ("+") or below ("-") observed results
## threshold - threshold for filtering variable
## filtering - additional variable to filter dataframe on (no default; it is always used in the filter below)
## threshold2 - threshold for additional filtering variable
## Output: dataframe with each category/grouping with the number of samples overall and passing thresholds,
## plus a p-value for observing the number of samples passing the thresholds by chance
permute_by_sample = function(matrix,metric,direction,threshold=0,filtering,threshold2=0){
  # Filter the metadata matrix output by metadata.R to match the state under consideration
  # (H3K27ac and DNase map to themselves, methylation states to WGBS, everything else to chromHMM)
  metadata_matrix = filter_metadata(metadata,ifelse(unique(matrix$State) == "H3K27ac","H3K27ac",ifelse(unique(matrix$State) == "DNase","DNase",ifelse(unique(matrix$State) %in% meth_states,"WGBS","chromHMM"))))
  
  # Creates a table with the number of samples in each grouping for each sample category for all samples
  # NOTE(review): as.data.frame(table) is passed as the function to apply; presumably this relies on
  # plyr's as.data.frame method for functions - confirm
  metadata_table = ldply(apply(metadata_matrix[,sample_categories],2,as.data.frame(table)))
  colnames(metadata_table) = c("Category","Grouping","Samples")
  print("Computing background")
  
  # Creates a table with the number of samples per category/grouping for all samples and only those that pass the specified thresholds
  threshold_matrix = function(matrix,metric,threshold,filtering,threshold2){
    # Add sample categories for each sample
    filter_matrix = merge(matrix,metadata_matrix[,c("Sample",sample_categories)],by="Sample")
    # Filter matrix to those passing threshold(s); both comparisons are strict (>)
    filter_matrix = filter_matrix[which(filter_matrix[[metric]] > threshold & filter_matrix[[filtering]] > threshold2),]
    # Create a table with the number of samples in each sample category/grouping for only those samples passing thresholds
    table_matrix = ldply(apply(filter_matrix[,sample_categories],2,as.data.frame(table)))
    colnames(table_matrix) = c("Category","Grouping","Samples")
    # Expand table to all possible category/groupings for those samples
    table_matrix = merge(table_matrix,metadata_table[,1:2],all.y=TRUE,by=c("Category","Grouping"))
    # Category/groupings with no passing samples get a count of 0
    table_matrix[is.na(table_matrix)] = 0
    return(table_matrix)
  }
  
  # Create sample table for true results
  real = threshold_matrix(matrix,metric,threshold,filtering,threshold2)
  # Progress messages are printed after the corresponding step has finished
  print("Computing real")
  
  # Create sample table for 1000 permutations of sample labels
  # Each replicate shuffles the Sample column, which reassigns sample categories to the rows
  permuted = rdply(1000,function(x) {permute_matrix = matrix; permute_matrix$Sample = sample(permute_matrix$Sample);threshold_matrix(permute_matrix,metric,threshold,filtering,threshold2)})  
  colnames(permuted) = c("Replicate","Category","Grouping","Replicate_samples")
  print("Computing permuted")
  
  # Combine true and permuted results
  permuted = merge(permuted,real,by=c("Category","Grouping"),all.x=TRUE)
  print("Merging")
  
  # Calculate the proportion of permutations for which the result is at or above/at or below the true result, for each category/grouping
  # Convert into a p-value by dividing by the number of permutations
  # (the divisor 1000 must match the number of replicates above)
  # NOTE(review): any direction other than "+" or "-" leaves `over` undefined, so the merge below fails
  if(direction == "+"){
    over = ddply(permuted,.(Category,Grouping),summarise,Pvalue=sum(Replicate_samples >= Samples)/1000)
  } else if (direction == "-") {
    over = ddply(permuted,.(Category,Grouping),summarise,Pvalue=sum(Replicate_samples <= Samples)/1000)
  }
  
  print("Computing p-values")
  
  # Combine p-values with true results and table of all samples in each category/grouping
  over = merge(over,real,by=c("Category","Grouping"),all.x=TRUE)
  over = merge(over,metadata_table,by=c("Category","Grouping"),all.x=TRUE)
  return(over)
}

# Mouse

# For one tissue, counts orthologous TE pairs with tissue-specific epigenetic activity in human
# and classifies them by their activity in mouse.
# Only TEs in the state in <5 human samples and in 2 human samples of the specified tissue are counted (All):
# Specific - in 2 mouse samples of the specified tissue and <5 mouse samples overall
# On - in >8 mouse samples; Off - in <2 mouse samples
## Input: x - tissue of interest; the human and mouse per-tissue columns are named x with the suffixes ".x" and ".y"
## matrix - Dataframe of the number of samples each orthologous TE pair is in the state in each species, overall and by tissue
## Output: named vector with the number of orthologous TE pairs in each category
tissue_matrix = function(x,matrix){
  human_tissue_col = paste(x,".x",sep="")
  mouse_tissue_col = paste(x,".y",sep="")
  
  # Number of rows satisfying a condition; rows where the condition is NA are not counted
  count_rows = function(condition) length(which(condition))
  
  # Tissue-specific in human: the shared requirement for every category
  human_specific = matrix$Human_samples < 5 & matrix[[human_tissue_col]] == 2
  
  c(All = count_rows(human_specific),
    Specific = count_rows(human_specific & matrix$Mouse_samples < 5 & matrix[[mouse_tissue_col]] == 2),
    On = count_rows(human_specific & matrix$Mouse_samples > 8),
    Off = count_rows(human_specific & matrix$Mouse_samples < 2))
}
```

## Define color and label vectors

```{bash state lists, eval=FALSE}
# List of 15 chromHMM states
/bar/epehrsson/TE_landscape/sample_lists/chromHMM_states.txt

# List of 4 methylation states
/bar/epehrsson/TE_landscape/sample_lists/methylation_states.txt
```

```{r define vectors}
# Chromosomes: chr1-chr22, then chrX, chrY, and chrM
standard_chromosomes = paste0("chr",c(1:22,"X","Y","M"))

# TE coordinates
## Column names for individual TE tables; note that family comes before class here
TE_coordinates = c("chromosome","start","stop","subfamily","family","class","strand")
## Column names for human (hg19) and mouse (mm10) TE tables; in these, class comes before family
hg19_coordinates = c("human_chr_hg19","human_start_hg19","human_stop_hg19","human_subfamily","human_class","human_family","human_strand_hg19")
mm10_coordinates = c("mouse_chr_mm10","mouse_start_mm10","mouse_stop_mm10","mouse_subfamily","mouse_class","mouse_family","mouse_strand_mm10")

# TE class colors
## Named by the six composite TE classes (the factor levels produced by convert_class)
class_colors = setNames(c("#4A72E8","#FF6600","#006600","#cc0000","lightseagreen","#5C5C5C"),c("DNA","LINE","LTR","SINE","SVA","Other"))

# chromHMM states
## 15-state model
## chromHMM_states holds the state names; chromHMM_colors holds one color per state, named by state
chromHMM_states = c("1_TssA","2_TssAFlnk","3_TxFlnk","4_Tx","5_TxWk","6_EnhG","7_Enh","8_ZNF/Rpts","9_Het","10_TssBiv","11_BivFlnk","12_EnhBiv","13_ReprPC","14_ReprPCWk","15_Quies")
chromHMM_colors = setNames(c(rgb(255,0,0,maxColorValue=255),rgb(255,69,0,maxColorValue=255),rgb(50,205,50,maxColorValue=255),rgb(0,128,0,maxColorValue=255),rgb(0,100,0,maxColorValue=255),rgb(194,225,5,maxColorValue=255),rgb(255,255,0,maxColorValue=255),rgb(102,205,170,maxColorValue=255),rgb(138,145,208,maxColorValue=255),rgb(205,92,92,maxColorValue=255),rgb(233,150,122,maxColorValue=255),rgb(189,183,107,maxColorValue=255),rgb(128,128,128,maxColorValue=255),rgb(192,192,192,maxColorValue=255),"grey90"),chromHMM_states)

## 18-state model
## Despite its name, chromHMM_states_18 is a vector of colors named by the 18 states
## (unlike chromHMM_states, which holds only state names); use names(chromHMM_states_18) for the state names
chromHMM_states_18 = setNames(c(rgb(255,0,0,maxColorValue=255),rgb(255,69,0,maxColorValue=255),rgb(255,69,0,maxColorValue=255),
                            rgb(255,69,0,maxColorValue=255),rgb(0,128,0,maxColorValue=255),rgb(0,100,0,maxColorValue=255),
                            rgb(194,225,5,maxColorValue=255),rgb(194,225,5,maxColorValue=255),rgb(255,195,77,maxColorValue=255),
                            rgb(255,195,77,maxColorValue=255),rgb(255,255,0,maxColorValue=255),rgb(102,205,170,maxColorValue=255),
                            rgb(138,145,208,maxColorValue=255),rgb(205,92,92,maxColorValue=255),rgb(189,183,107,maxColorValue=255),
                            rgb(128,128,128,maxColorValue=255),rgb(192,192,192,maxColorValue=255),rgb(255,255,255,maxColorValue=255)),
                          c("1_TssA","2_TssFlnk","3_TssFlnkU","4_TssFlnkD","5_Tx","6_TxWk","7_EnhG1","8_EnhG2","9_EnhA1","10_EnhA2","11_EnhWk",
                            "12_ZNF/Rpts","13_Het","14_TssBiv","15_EnhBiv","16_ReprPC","17_ReprPCWk","18_Quies"))

## Mouse 15-state chromHMM model
## State names and one color per state, named by state
mouse_chromHMM_states = c("TssA","TssAFlnk1","TssAFlnk2","Tx1","Tx2","Enh","EnhLo1","EnhLo2","HetCons","HetFac","TssBiv","EnhPois1","EnhPois2","QuiesG","Quies")
mouse_chromHMM_colors = setNames(c("red","orangered2","orangered4","green4","darkgreen","yellow","gold1","goldenrod2","cornflowerblue","steelblue2","indianred3","burlywood1","lightgoldenrod2","grey","lightgrey"),mouse_chromHMM_states)

# WGBS/methylation states
## State names, plus display labels and colors named by state
meth_states = c("Hypomethylated","Intermediate","Hypermethylated","Missing")
meth_labels = setNames(c("Hypomethylated","Intermediate meth","Hypermethylated","Missing meth data"),meth_states)
meth_colors = setNames(c("#F8766D","#00BFC4","#7CAE00","#C77CFF"),meth_states)

# All epigenetic states
## 15 chromHMM states, 4 methylation states, then DNase, H3K27ac, and expression
states = c(chromHMM_states,meth_states,"DNase","H3K27ac","Expressed_samples")
## Colors for every state, plus black for the "Bases" and "CpGs" entries
all_state_colors = c(chromHMM_colors,meth_colors,
                     DNase="aquamarine",H3K27ac="tan1",Expressed_samples="black",Bases="black",CpGs="black")
## Display labels, named by state
all_state_labels = setNames(c(chromHMM_states,meth_labels,"DHS","H3K27ac","RPKM > 1"),states)
## The first 21 states (all but expression), followed by the same names with the suffix "_PC"
all_pc_states = c(states[1:21],paste(states[1:21],"PC",sep="_"))

# Technique labels
metric_labels = setNames(c("chromHMM","WGBS","DNase","H3K27ac","RPKM > 1"),
                         c("chromHMM","WGBS","DNase","H3K27ac","Expressed_samples"))

## Alternative technique labels
mark_labels = c(chromHMM="chromHMM",WGBS="WGBS",DNase="DHS",H3K27ac="H3K27ac",RNA="RPKM > 1")

# Sample classification
## Metadata columns used to classify samples, and their display labels (Germline is shown as "Germ layer")
sample_categories = c("Group","Anatomy","Type","Age","Cancer","Germline")
category_labels = setNames(c("Group","Anatomy","Type","Age","Cancer","Germ layer"),sample_categories)

## One named color vector per sample category; names are the groupings within the category
group_colors = setNames(c("#924965","#4178AE","#E41A1C","#69608A","#B65C73","#FF9D0C","#678C69","#55A354","#E67326","#FFD924","#AF5B39","#D56F80","#999999","#C5912B","#C58DAA","#F182BC","#C2655D","#DAB92E","#000000"),c("ESC","ES-deriv","IMR90","iPSC","Mesench","Epithelial","HSC & B-cell","Blood & T-cell","Myosat","Neurosph","Adipose","Heart","Other","Brain","Digestive","Sm. Muscle","Muscle","Thymus","ENCODE2012"))
anatomy_colors = setNames(c("#B2DF8A","#33A02C","#FB9A99","#FFED6F","#80B1D3","#666666","#924965","#4178AE","#FDBF6F","#CAB2D6","#6A3D9A","#BEBADA","#FCCDE5","#BC80BD","#E7298A","#FF7F00","#69608A","#B3DE69","#CCEBC5","#000000","#FB8072","#FDB462","#8DD3C7","#D9D9D9","#E31A1C","#A6CEE3","#D95F02","#E6AB02","#66A61E","#B15928"),c("ADRENAL","BLOOD","BONE","BRAIN","BREAST","CERVIX","ESC","ESC_DERIVED","FAT","GI_COLON","GI_DUODENUM","GI_ESOPHAGUS","GI_INTESTINE","GI_RECTUM","GI_STOMACH","HEART","IPSC","KIDNEY","LIVER","LUNG","MUSCLE","MUSCLE_LEG","OVARY","PANCREAS","PLACENTA","SKIN","SPLEEN","STROMAL_CONNECTIVE","THYMUS","VASCULAR"))
type_colors = setNames(brewer.pal(5,"Greens"),c("PrimaryCulture","PrimaryCell","PrimaryTissue","CellLine","ESCDerived"))
age_colors = setNames(brewer.pal(4,"YlOrRd"),c("Cell_line","Unknown","Non_fetal","Fetal"))
cancer_colors = setNames(c("lightgrey","red"),c("No","Yes"))
germline_colors = setNames(brewer.pal(6,"Dark2"),c("Pluripotent","Mesoderm","Ectoderm","Endoderm","Mixed","Other"))

## Ordering of the 19 Group values (same values as the names of group_colors, in a different order)
group_order = c("ESC","iPSC","ES-deriv","Blood & T-cell","HSC & B-cell","Thymus","Brain","Neurosph","Digestive","Epithelial","Adipose","Mesench","Myosat","Muscle","Sm. Muscle","Heart","Other","IMR90","ENCODE2012")

## Display labels for the Age groupings
age_labels = setNames(c("Cell line","Fetal","Non-fetal","Unknown"),c("Cell_line","Fetal","Non_fetal","Unknown"))

# Genic features
## Feature names, and display labels named by feature (plus "coding_exon" and "Vista_enhancers")
features = c("cpgIsland","promoters","5UTR","CDS","3UTR","exons","introns","intergenic")
genic_labels = c("CpG island","Promoter","5'UTR","CDS","3'UTR","Exon","Intron","Intergenic","CDS","Vista enhancers")
names(genic_labels) = c(features,"coding_exon","Vista_enhancers")

## Cohorts: features alone and combined with coding status (_pc, _nc), plus blacklist and Vista enhancers
cohorts = c("cpgIsland",
            "promoters","promoters_pc","promoters_nc",
            "5UTR","5UTR_pc","5UTR_nc",
            "coding_exon","coding_exon_pc",
            "3UTR","3UTR_pc","3UTR_nc",
            "exons","exons_pc","exons_nc",
            "introns","introns_pc","introns_nc",
            "intergenic","blacklist","Vista_enhancers")

# Feature overlap labels
overlap_labels = setNames(c("% genome","% TEs (either strand)","% TEs (same strand)","% feature in TEs"),
                          c("Genome_percent_genome","TEs_percent_TE","TEs_strand_percent_TE","Genome_percent_TE"))

# Coding labels
coding_labels = c(All="All",pc="Protein-coding",nc="Non-coding")

# Column name vectors
measure_states_extra = c("States.chromHMM","Max_states_intra","States.WGBS","Max_expression")
measure_metrics = c("Length","mappability","JC_distance","CpGs","CpGs_per_length")
measure_metrics_subfam = c("Count","Total_length","Length","Mappability","Age",
                           "CpGs","Count_CpGs","CpGs_per_TE","CpGs_per_kbp")

enrichment_names = c("Length_ijk","Length_ik","Length_jk","Length_k","Enrichment",
                     "Length_percent_jk","Members","Count","Percent")

## Display labels for the individual-TE and subfamily-level measures, named by column
variable_labels = c("Length (bp)","Mappability","Jukes-Cantor evolutionary distance","CpGs","CpGs per bp")
names(variable_labels) = measure_metrics
measure_labels = c("Number of TEs","Total length (bp)","Median length (bp)","Median mappability","Median age (JC)",
                   "Total CpGs","TEs with CpGs","CpGs per TE","CpGs per kbp")
names(measure_labels) = measure_metrics_subfam
```

## Load metadata

Loads a data frame of metadata for each Roadmap epigenome, including: epigenome name; Group, Anatomy, and Type assignments from the REMC; Germ layer, Age, and Cancer assignments from this study; whether the epigenome has data for each technique, includes chrY, and is a cancer cell line/IMR90.

```{bash sample lists, eval=FALSE}
# Samples with chromHMM annotations
## All 127 samples
/bar/epehrsson/TE_landscape/sample_lists/mnemonics.txt

## 121 samples, excluding cancer cell lines and IMR90
/bar/epehrsson/TE_landscape/sample_lists/mnemonics_noCancer.txt

# Samples with WGBS data (37)
/bar/epehrsson/TE_landscape/sample_lists/WGBS_samples.txt

# Samples with DHS data (53)
/bar/epehrsson/TE_landscape/sample_lists/DNase_samples.txt

# Samples with H3K27ac data (98)
/bar/epehrsson/TE_landscape/sample_lists/H3K27ac_samples.txt

# Samples with RNA data, strand-agnostic (56)
/bar/epehrsson/TE_landscape/sample_lists/RNA_samples_agnostic.txt
```

```{r metadata}
source("R_scripts/metadata.R")
```

## Load sample statistics

Creates a matrix of the number of samples with data for each technique, for all samples, excluding cancer cell lines/IMR90, and excluding samples without chrY. Also creates labels for each technique.

```{r load sample stats}
# Number of samples with data for each technique (columns), counted over four sample sets (rows):
# all samples, not a cancer cell line/IMR90, with chrY, and both restrictions together
sample_counts = local({
  # The Sample column stands in for chromHMM (renamed below)
  technique_columns = c("Sample","WGBS","DNase","H3K27ac","RNA")
  not_excluded = metadata$Exclude == "Include"
  has_chrY = metadata$chrY == "Yes"
  
  # Per technique, the number of samples with a non-NA entry that also satisfy `keep`
  count_samples = function(keep){
    apply(metadata[,technique_columns],2,function(x) length(which(!is.na(x) & keep)))
  }
  
  rbind(count_samples(TRUE),
        count_samples(not_excluded),
        count_samples(has_chrY),
        count_samples(has_chrY & not_excluded))
})
colnames(sample_counts)[1] = "chromHMM"
rownames(sample_counts) = c("All","Include","chrY","Include.chrY")
```

## Create essential matrices

Creates matrices of features (TEs, promoters, etc.) and their intersection with epigenetic states. 

### TEs

Filters the hg19 repeats identified by RepeatMasker to standard chromosomes (no contigs) and 12 TE classes. 

```{bash TEs, eval=FALSE}
# RepeatMasker file for hg19	 
#/bar/epehrsson/TE_landscape/raw_data/rmsk.txt.gz	-> /bar/genomes/hg19/rmsk/rmsk.txt.gz
#/bar/epehrsson/TE_landscape/features/rmsk.txt

# RepeatMasker files restricted to all TE classes, chr 1-22, X, Y, M
# Field 12 is matched against the TE class names. Field 6 (presumably the chromosome name) must not
# contain "_", which drops the non-standard sequences.
# Output columns, in order: fields 6, 7, 8, 11, 12 (class), 13, 10
# rmsk_TE.txt: the four main classes (LTR, DNA, SINE, LINE)
awk -v OFS='\t' '{if(($12 == "LTR" || $12 == "DNA" || $12 == "SINE" || $12 == "LINE") && ($6 !~ /_/)) print $6, $7, $8, $11, $12, $13, $10}' rmsk.txt > rmsk_TE.txt
# rmsk_other.txt: the eight remaining classes (Unknown, Unknown?, DNA?, LINE?, SINE?, LTR?, Other, RC)
awk -v OFS='\t' '{if(($12=="Unknown"||$12=="Unknown?"||$12=="DNA?"||$12=="LINE?"||$12=="SINE?"||$12=="LTR?"||$12=="Other"||$12=="RC") && ($6 !~ /_/))print $6, $7, $8, $11, $12, $13, $10}' rmsk.txt > rmsk_other.txt
# rmsk_TEother.txt: both sets combined (12 classes)
cat rmsk_TE.txt rmsk_other.txt > rmsk_TEother.txt

## Output
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_TE.txt
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_other.txt
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_TEother.txt
```

Average 36bp mappability per repeat from Sundaram et al 2014 Genome Research. Filtered to TE subfamilies. 

```{bash TE mappability, eval=FALSE}
# Rmsk 36bp mappability file, standard chromosomes
#/bar/epehrsson/TE_landscape/mappability/rmsk.txt.mapability.36mer -> /bar/genomes/hg19/rmsk/rmsk.txt.mapability.36mer

# TE 36bp mappability file
# For each subfamily name listed in subfamilies.txt / other_subfamilies.txt, keeps the rows of the
# mappability file whose 4th field equals that name.
# `read -r` keeps backslashes literal and "$line" is quoted so a name is always passed to awk as one argument.
# Output is appended (>>): remove any existing output file before re-running to avoid duplicated rows.
while read -r line; do awk -v subfam="$line" '{if($4 == subfam) print $0}' rmsk.txt.mapability.36mer >> rmsk_TE_mappability_36mer.txt; done < subfamilies.txt
while read -r line; do awk -v subfam="$line" '{if($4 == subfam) print $0}' rmsk.txt.mapability.36mer >> rmsk_other_mappability_36mer.txt; done < other_subfamilies.txt

## Output
#/bar/epehrsson/TE_landscape/mappability/rmsk_other_mappability_36mer.txt
#/bar/epehrsson/TE_landscape/mappability/rmsk_TE_mappability_36mer.txt
```

Calculates Jukes-Cantor evolutionary distance from subfamily consensus for all hg19 repeats, using the RepeatMasker file with substitution rate as input. Filtered to standard chromosomes and TE classes.

```{bash TE age, eval=FALSE}
# Python script takes *rmsk.txt.gz file in the same folder as input, calculates JC evolutionary distance for rmsk repeats
# The argument is the name of the output file
python calcAge_generalized.py rmsk.txt.gz.JCage

## Output
#/bar/epehrsson/TE_landscape/age/rmsk.txt.gz.JCage

# JC evolutionary matrix for TEs, standard chromosomes
# Field 6 is matched against the TE class names. Field 1 (presumably the chromosome name) must not
# contain "_". The first nine fields of each matching row are kept.
# rmsk_TE_JCage.txt: the four main classes; rmsk_other_JCage.txt: the eight remaining classes

awk -v OFS='\t' '{if(($6 == "LTR" || $6 == "DNA" || $6 == "SINE" || $6 == "LINE") && ($1 !~ /_/)) print $1, $2, $3, $4, $5, $6, $7, $8, $9}' rmsk.txt.gz.JCage > rmsk_TE_JCage.txt
awk -v OFS='\t' '{if(($6 == "LTR?" || $6 == "DNA?" || $6 == "SINE?" || $6 == "LINE?" || $6 == "Unknown?" || $6 == "Unknown" || $6 == "Other" || $6 == "RC") && ($1 !~ /_/)) print $1, $2, $3, $4, $5, $6, $7, $8, $9}' rmsk.txt.gz.JCage > rmsk_other_JCage.txt

## Output
#/bar/epehrsson/TE_landscape/age/rmsk_other_JCage.txt
#/bar/epehrsson/TE_landscape/age/rmsk_TE_JCage.txt
```

Downloaded hg19 chromosome sizes, RefSeq genic features, and CpG islands from the UCSC Table Browser. 

```{bash download RefSeq features, eval=FALSE}
# Chromosome sizes
# Query the chromInfo table of the hg19 database on the UCSC MySQL server
# and save the chrom/size columns.
mysql \
  --user=genome \
  --host=genome-mysql.cse.ucsc.edu \
  --no-auto-rehash \
  --execute="select chrom, size from hg19.chromInfo" \
  > hg19.genome

## Output
#/bar/epehrsson/TE_landscape/raw_data/hg19.genome

# RefSeq features (no download command is recorded for these files)
#/bar/epehrsson/genic_features/RefSeq/refseq_3UTR.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_5UTR.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_coding_exon.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_exons.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_genes.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_introns.txt

# Regions 2000bp upstream of RefSeq gene TSS
#/bar/epehrsson/genic_features/RefSeq/refseq_up2000.txt

# CpG islands
#/bar/epehrsson/genic_features/CGI/cpgIslandExtUnmasked.txt
```

Filtered chromosome sizes to remove contigs and chrM. Expanded promoter regions (2000bp upstream of TSS) to 500bp downstream of TSS. Identified intergenic regions by taking the complement of RefSeq genic regions.

```{bash process RefSeq features, eval=FALSE}
# hg19 chromosome sizes from UCSC, filtered to chr 1-22, X, Y
# Keeps the first 25 lines of the chromInfo dump.
# NOTE(review): this relies on the row order returned by the query (and on
# whether a header row is present) - confirm the 25 lines are exactly the
# standard chromosomes.
head -n25 hg19.genome > hg19_standard.genome

## Output
#/bar/epehrsson/TE_landscape/features/hg19_standard.genome

# Expand 500bp downstream of RefSeq gene TSS
# -l 0 -r 500 -s: add nothing on the left and 500bp on the right, where
# left/right are defined relative to the strand of each feature.
bedtools slop -i refseq_up2000.txt -g hg19.genome -l 0 -r 500 -s > refseq_promoters.txt

## Output
#/bar/epehrsson/genic_features/RefSeq/refseq_promoters.txt

# Intersect-free ("intergenic") regions: everything in the standard genome
# not covered by a RefSeq gene.
# Uses hg19_standard.genome, the file created at the top of this chunk
# (the previous "hg19.genome.standard" is never created anywhere).
# NOTE(review): bedtools complement expects -i sorted in the same
# chromosome order as the genome file - confirm refseq_genes.txt is sorted.
bedtools complement -i refseq_genes.txt -g hg19_standard.genome > refseq_intergenic.txt

## Output
#/bar/epehrsson/genic_features/RefSeq/refseq_intergenic.txt
```

Created files of unique regions overlapping each RefSeq feature (all, protein-coding, and non-coding) and CpG islands by merging individual features. Coding status was determined by RefSeq accession. 

```{bash merge RefSeq features, eval=FALSE}
# Merged features
# Sort each RefSeq feature file by chromosome and start, merge overlapping
# intervals, then rename [feature].txt_merge to [feature]_merge.txt.
for file in refseq_*.txt ; do sort -k1,1 -k2,2n $file | bedtools merge -i - > $file\_merge; done
rename 's/.txt_merge/_merge.txt/' *.txt_merge

## Output
#/bar/epehrsson/genic_features/RefSeq/refseq_3UTR_merge.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_5UTR_merge.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_coding_exon_merge.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_exons_merge.txt
#/bar/epehrsson/genic_features/RefSeq/refseq_introns_merge.txt

# Promoters: extend 500bp downstream (strand-aware), then sort and merge
bedtools slop -i refseq_up2000.txt -g hg19.genome -l 0 -r 500 -s | sort -k1,1 -k2,2n - | bedtools merge -i - > refseq_promoters_merge.txt

## Output
#/bar/epehrsson/genic_features/RefSeq/refseq_promoters_merge.txt

# CpG islands: sort and merge
sort -k1,1 -k2,2n cpgIslandExtUnmasked.txt | bedtools merge -i - > cpgIslandExtUnmasked_merge.txt

## Output
#/bar/epehrsson/genic_features/CGI/cpgIslandExtUnmasked_merge.txt

# Merged features, coding/non-coding
# Coding status comes from the first two characters of the accession in
# column 4: "NM" = protein-coding, "NR" = non-coding.
# awk string positions start at 1. The previous substr($4,0,2) is
# implementation-dependent and in some awks returns only one character,
# so the comparison with a two-letter prefix never matched.
# NOTE(review): if run in the same directory as the step above, the
# refseq*.txt glob also picks up the *_merge.txt files; they have no
# column 4, so they only yield empty outputs.
for file in refseq*.txt; do awk -v OFS='\t' '{if(substr($4,1,2) == "NM") print $1, $2, $3}' "$file" | sort -k1,1V -k2,2n - | bedtools merge -i - > "$(basename "$file" .txt)_pc_merge.txt"; done
for file in refseq*.txt; do awk -v OFS='\t' '{if(substr($4,1,2) == "NR") print $1, $2, $3}' "$file" | sort -k1,1V -k2,2n - | bedtools merge -i - > "$(basename "$file" .txt)_nc_merge.txt"; done

## Output
#/bar/epehrsson/genic_features/RefSeq/refseq_[feature]_pc_merge.txt [6 files]
#/bar/epehrsson/genic_features/RefSeq/refseq_[feature]_nc_merge.txt [5 files]
```

Intersected individual TEs with merged RefSeq features (all, protein-coding, and non-coding) and CpG islands and summed the length of overlap for each TE. 

```{bash TE RefSeq, eval=FALSE}
# Helper shared by every intersection below.
# Reads bedtools "intersect -wo" rows on stdin, sums column 11 for each
# unique combination of columns 1-7, and prints one tab-separated row per
# combination: the seven columns followed by the summed value.
sum_overlap_per_TE() {
  awk -v OFS='\t' '{a[$1, $2, $3, $4, $5, $6, $7]+=$11}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], a[i];}}' -
}

# Intersection between TEs, RefSeq features
for file in ~/genic_features/refseq*merge*txt; do
  bedtools intersect -wo -a rmsk_TEother.txt -b $file \
    | sum_overlap_per_TE \
    > rmsk_TEother_$(basename "$file" .txt).txt
done

bedtools intersect -wo -a rmsk_TEother.txt -b ~/genic_features/refseq_intergenic.txt \
  | sum_overlap_per_TE \
  > rmsk_TEother_refseq_intergenic.txt

## Output
#/bar/epehrsson/TE_landscape/features/intersect_features/rmsk_TEother_refseq_[feature].txt [7 files]

# Intersection between TEs, Refseq features (coding/non-coding)
for file in ~/genic_features/RefSeq/refseq_*c_merge.txt; do
  bedtools intersect -wo -a ../TEs/rmsk_TEother.txt -b $file \
    | sum_overlap_per_TE \
    > rmsk_TEother_$(basename "$file" .txt).txt
done

## Output
#/bar/epehrsson/TE_landscape/features/intersect_features/rmsk_TEother_refseq_[feature]_pc_merge.txt [6 files]
#/bar/epehrsson/TE_landscape/features/intersect_features/rmsk_TEother_refseq_[feature]_nc_merge.txt [5 files]

# Intersection between TEs, CpG islands
bedtools intersect -wo -a rmsk_TEother.txt -b ~/genic_features/cpgIslandExtUnmasked_merge.txt \
  | sum_overlap_per_TE \
  > rmsk_TEother_cpgIsland.txt

## Output
#/bar/epehrsson/TE_landscape/features/intersect_features/rmsk_TEother_cpgIsland.txt
```

Identified TEs that overlap hg19 blacklist regions. No TE in the SVA or Other classes overlaps a blacklist region. 

```{bash TE blacklist, eval=FALSE}
# TEs that overlap the hg19 blacklist
# Run the same intersection for both repeat sets (rmsk_TE.txt and
# rmsk_other.txt). With -wao every input row is reported, including rows
# that overlap no blacklist region.
for repeat_set in TE other; do
  bedtools intersect -wao -a rmsk_${repeat_set}.txt -b ../../genomes/hg19/blacklist.bed > ${repeat_set}_blacklist.bed
done

## Output
#/bar/epehrsson/TE_landscape/features/intersect_features/TE_blacklist.bed
#/bar/epehrsson/TE_landscape/features/intersect_features/other_blacklist.bed
```

Downloaded VISTA enhancers, filtered to those identified in human, converted to bed format, and identified those that were positively validated. There are 1,790 human VISTA enhancers, 934 of which were positively validated. Intersected positively validated VISTA enhancers with TEs and identified TEs with overlap. 

```{bash TE VISTA, eval=FALSE}
# VISTA enhancers
# Source URL. Kept as a comment: a bare URL is not a shell command, and the
# unquoted "&" characters would split it into background jobs.
# https://enhancer.lbl.gov/cgi-bin/imagedb3.pl?form=search&show=1&search.form=no&search.result=yes

## Output
#/bar/epehrsson/TE_landscape/features/vista_enhancers/vista_enhancers.txt

# Human VISTA enhancer headers
# Keep the lines starting with ">Human" and save them to the file that the
# bed conversion below reads (previously the matches were only printed).
grep '^>Human' features/vista_enhancers/vista_enhancers.txt > features/vista_enhancers/vista_enhancers_human.txt

## Output
#/bar/epehrsson/TE_landscape/features/vista_enhancers/vista_enhancers_human.txt

# All human VISTA enhancers in bed format
# Split each header on ":", "-" or tab and print the first five fields,
# tab-separated.
awk -v FS=':|-|\t' -v OFS='\t' '{print $1, $2, $3, $4, $5}' features/vista_enhancers/vista_enhancers_human.txt > features/vista_enhancers/vista_enhancers_human_all.bed

## Output
#/bar/epehrsson/TE_landscape/features/vista_enhancers/vista_enhancers_human_all.bed

# Human, positively validated VISTA enhancers in bed format
# NOTE(review): no command is recorded for how this file was derived.
#/bar/epehrsson/TE_landscape/features/vista_enhancers/vista_enhancers_human.bed

wc -l features/vista_enhancers/vista_enhancers_human.bed
# 934

# Human, positively validated VISTA enhancers intersected with TEs
bedtools intersect -wo -a vista_enhancers_human.bed -b rmsk_TE.txt > vista_enhancers_rmsk_TE.txt
bedtools intersect -wo -a vista_enhancers_human.bed -b rmsk_other.txt > vista_enhancers_rmsk_other.txt

## Output
#/bar/epehrsson/TE_landscape/features/intersect_features/vista_enhancers_rmsk_TE.txt
#/bar/epehrsson/TE_landscape/features/intersect_features/vista_enhancers_rmsk_other.txt

# Number of intersection rows per unique combination of columns 4-10,
# printed as those seven columns followed by the count.
cat features/intersect_features/vista_enhancers_rmsk_*.txt | awk -v OFS='\t' '{a[$4,$5,$6,$7,$8,$9,$10]+=1}END{for(i in a) {split (i, sep, SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], a[i];}}' -  > features/intersect_features/vista_enhancers_TE.txt

## Output
#/bar/epehrsson/TE_landscape/features/intersect_features/vista_enhancers_TE.txt

# Number of VISTA enhancers overlapping TEs
# Counts the distinct values of columns 1-3 across both intersection files.
cat features/intersect_features/vista_enhancers_rmsk_*.txt | awk '{print $1, $2, $3}' - | sort | uniq | wc -l
# 355
```

Creates a data frame of individual TEs, including: length, TE class (assigned by Roadmap and SVA/Other class designations from this study), mappability, Jukes-Cantor evolutionary distance, number of CpGs, CpG density (CpGs/bp), length of overlap with RefSeq genic features, intergenic regions, CpG islands, and blacklist regions, and overlap with VISTA enhancers. CpG density is loaded from `TE_CpG_count.txt`, generated in the "TE CpG count" chunk below.

```{r create TEs, eval=FALSE}
# Run the script that builds the per-TE data frame described above.
# The path is relative to the working directory.
# NOTE(review): the script itself is not shown in this file.
source("R_scripts/TE_matrix.R")
```

### Epigenetic states

Downloaded epigenetic data from the Roadmap Epigenomics Project and mouseENCODE. 

```{bash get epigenetic data, eval=FALSE}
# Roadmap

# chromHMM annotations (15-state)
# Fetched with wget like the 18-state archive below (the bare URL that was
# recorded here is not a runnable command).
# NOTE(review): this archive has the same file name as the 18-state one;
# download them in separate directories.
wget http://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/coreMarks/jointModel/final/all.mnemonics.bedFiles.tgz

## Output
#/bar/epehrsson/TE_landscape/raw_data/chromHMM/all.mnemonics.bedFiles.tgz
#/bar/epehrsson/TE_landscape/raw_data/chromHMM/E#_15_coreMarks_mnemonics.bed [127 files]

# 18-state chromHMM bedfiles
wget https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/core_K27ac/jointModel/final/all.mnemonics.bedFiles.tgz

## Output
#/bar/epehrsson/TE_landscape/raw_data/chromHMM_18state/E#_18_core_K27ac_mnemonics.bed [98 files]

# Individual 50-state chromHMM bedfiles
# The trailing "." was removed: wget treats every argument as a URL, so it
# only produced a failed extra request.
for sample in E003 E004 E005 E006 E007 E008 E017; do wget https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/class1Models_50states/$sample/$sample\_50_segments.bed.gz; done

## Output
#/bar/epehrsson/TE_landscape/raw_data/chromHMM_50state/E#_50_segments.bed [7 files]

# WGBS
# By-chromosome WGBS fractional methylation matrices of CpG x sample
#/bar/mchoudhary/2chromTE/Meth/FractionalMethylation_Removed_E027_E064_Fixed_E012/

# DNase narrow peak bedfiles
# DNase_peaks.txt is a list of all 53 files
# Each listed file is downloaded as [name].gz and decompressed in place.
while read line; do wget http://egg2.wustl.edu/roadmap/data/byFileType/peaks/consolidated/narrowPeak/$line\.gz; gunzip $line\.gz; done < DNase_peaks.txt

## Output
#/bar/epehrsson/TE_landscape/raw_data/DNase/DNase_narrow_peaks/E#-DNase.macs2.narrowPeak [53 files]

# H3K27ac narrow peak bedfiles
# NOTE(review): unlike the DNase loop, no gunzip step is recorded here,
# although the documented outputs are uncompressed.
while read line; do wget http://egg2.wustl.edu/roadmap/data/byFileType/peaks/consolidated/narrowPeak/$line\-H3K27ac.narrowPeak.gz; done < H3K27ac_samples.txt

## Output
#/bar/epehrsson/TE_landscape/raw_data/H3K27ac/H3K27ac_narrow_peaks/E#-H3K27ac.narrowPeak [98 files]

# RNA-seq normalization factors
#/bar/epehrsson/TE_landscape/raw_data/RNAseq/all.EGID.N.readlength

# Mouse (ENCODE)

# chromHMM (mm10)
bash TE_landscape/bash_scripts/get_mouse_chromHMM.sh

## Output
#/bar/epehrsson/TE_landscape/raw_data/mouse/chromHMM/ENCFF#.bed.gz [12 files]

# WGBS (mm10)
bash TE_landscape/bash_scripts/get_mouse_meth.sh

## Output
#/bar/epehrsson/TE_landscape/raw_data/mouse/WGBS/ENCFF#.bed [9 files]
```

Divided the hg19 genome into 200bp windows (n=15,478,399; no contigs or chrM). Intersected with chromHMM to get state assignments across the genome and reformatted to bed files. 

All of the intersections are 200bp. The total width of the 200bp windows (3,095,677,412 bp) is slightly larger than what is covered by chromHMM (3,095,675,000 bp). The missing 2,412 bp is at the end of the chromosomes and does not overlap any TEs (both windows and chromHMM start at base 0 for all chromosomes). Only 24 windows do not have chromHMM annotations, one per chromosome.

```{bash 200bp windows, eval=FALSE}
# Divided up the genome into 200bp windows
bedtools makewindows -g features/hg19_standard.genome -w 200 > features/hg19_standard.windows

## Output
#/bar/epehrsson/TE_landscape/features/hg19_standard.windows

# Intersected with chromHMM
# One intersection file per sample listed in sample_lists/mnemonics.txt.
# -a is the chromHMM annotation and -b the windows, so each output row is
# the annotation columns, then the window columns, then the overlap length.
while read line; do bedtools intersect -wo -b features/hg19_standard.windows -a raw_data/chromHMM/$line\_15_coreMarks_mnemonics.bed > chromHMM/genome/windows/windows_$line\.bed; done < sample_lists/mnemonics.txt

# Reformatted to bedfiles and sorted
# Print columns 5, 6, 7 and 4 of each intersection row, sorted by
# chromosome and start.
# The sorted result is written to a temporary extension-less file and then
# moved over the .bed file, so the final name matches the documented
# output (previously the result was left under the extension-less name).
# NOTE(review): this replaces the raw intersection file in place; do not
# re-run this step on an already reformatted file.
for file in chromHMM/genome/windows/windows_E*.bed; do awk -v OFS='\t' '{print $5, $6, $7, $4}' $file | sort -k1,1 -k2,2n - > chromHMM/genome/windows/$( basename $file .bed ); mv chromHMM/genome/windows/$( basename $file .bed ) $file; done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/genome/windows/windows_E#.bed [127 files]

# QC
## Total width of windows
awk '{sum+=$3-$2}END{print sum}' features/hg19_standard.windows

## Total width of chromHMM
awk '{if($1 != "chrM") sum+=$3-$2}END{print sum}' raw_data/chromHMM/E001_15_coreMarks_mnemonics.bed
```

### Intersection of TEs with epigenetic states

Assigned individual TEs to epigenetic states based on overlap with epigenetic annotations, using 5 techniques.

Assignment of individual TEs to chromHMM states. First, intersected TEs with 200bp windows and found TEs that overlap the center of a window. Then, identified those that do not. 

```{bash split TEs summit, eval=FALSE}
# Working directory for all TE coordinate files in this chunk
te_dir=features/TEs

# Intersect TEs with 200bp windows
bedtools intersect -wo \
  -a $te_dir/rmsk_TEother.txt \
  -b features/hg19_standard.windows \
  > $te_dir/rmsk_TEother_windows.txt

## Output
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_TEother_windows.txt

# Identify TEs that span the center of a 200bp chromHMM annotation ("summit")
# A row is kept when column 9 + 99.5 lies in [column 2, column 3).
awk '($9 + 99.5 >= $2) && ($9 + 99.5 < $3)' $te_dir/rmsk_TEother_windows.txt > $te_dir/rmsk_TEother_summit.txt

# Reduce to the unique values of columns 1-7, sorted by chromosome and
# start. Written via a temporary file because input and output are the
# same file.
cut -f1-7 $te_dir/rmsk_TEother_summit.txt \
  | sort -k1,1 -k2,2n \
  | uniq \
  > $te_dir/rmsk_TEother_summit
mv $te_dir/rmsk_TEother_summit $te_dir/rmsk_TEother_summit.txt

## Output
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_TEother_summit.txt

# Identify TEs that do not ("majority")
# comm -23 keeps the lines found only in the full TE list.
comm -23 <(sort $te_dir/rmsk_TEother.txt) <(sort $te_dir/rmsk_TEother_summit.txt) > $te_dir/rmsk_TEother_majority.txt

# Re-sort by chromosome and start; sort -o may safely name its own input.
sort -k1,1 -k2,2n -o $te_dir/rmsk_TEother_majority.txt $te_dir/rmsk_TEother_majority.txt

## Output
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_TEother_majority.txt
```

Intersected individual TEs with chromHMM bedfiles and found the total length of overlap between each TE and each chromHMM state per sample. From the intersection files, identified intersections where the TE overlaps the center of a 200bp window. Summed the total overlap of the TE with the state, including only intersections with 200bp window centers. For those that do not overlap a 200bp window center, found the state covering the majority of the TE (ties were randomly assigned). Combined all into one file and sorted.

```{bash TE chromHMM matrix, eval=FALSE}
# Intersect with individual TEs
# For every chromHMM bedfile, write the -wo intersection with the TE list
# to [bedfile]_TE and with the "other" list to [bedfile]_other.
# NOTE(review): the relative paths in this chunk change from step to step
# (chromHMM_bedfiles/, chromHMM/TEs/intersect/, bare file names); each
# command was presumably run from a different directory - confirm before
# re-running.
for file in chromHMM_bedfiles/*.bed ; do bedtools intersect -wo -a rmsk_TE.txt -b $file > $file\_TE; done
for file in chromHMM_bedfiles/*.bed; do bedtools intersect -wo -a rmsk_other.txt -b $file > $file\_other; done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/TEs/intersect/E#_15_coreMarks_mnemonics.bed_other [127 files]
#/bar/epehrsson/TE_landscape/chromHMM/TEs/intersect/E#_15_coreMarks_mnemonics.bed_TE [127 files]

# Original matrix creation
# Per intersection file: the sample ID is the part of the file name before
# the first "_". Column 12 is summed for each unique combination of
# columns 1-7 and column 11, and the sample ID is appended as a last
# column. The output of the whole loop is appended (>>) to the matrix
# file, so re-running without deleting it first duplicates rows.
for file in chromHMM_bedfiles/*.bed_TE; do suffix=$(basename $file | cut -d '_' -f1); awk -v OFS='\t' '{a[$1, $2, $3, $4, $5, $6, $7, $11]+=$12;}END{for(i in a) {split (i, sep, SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sep[8], a[i];}}' $file | awk -v x=$suffix 'BEGIN{OFS="\t";}{print $0, x}' - ; done >> all_chromHMM_TE.txt

# Sort by chromosome (version order), start, end, column 6 and sample
sort -k1,1V -k2,2n -k3,3n -k6,6 -k10,10 all_chromHMM_TE.txt > all_chromHMM_TE_sorted.txt

# Same aggregation for the "other" intersection files
for file in chromHMM_bedfiles/*.bed_other; do suffix=$(basename $file | cut -d '_' -f1); awk -v OFS='\t' '{a[$1, $2, $3, $4, $5, $6, $7, $11]+=$12;}END{for(i in a) {split (i, sep, SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sep[8], a[i];}}' $file | awk -v x=$suffix 'BEGIN{OFS="\t";}{print $0, x}' - ; done >> all_chromHMM_other.txt

sort -k1,1V -k2,2n -k3,3n -k6,6 -k10,10 all_chromHMM_other.txt > all_chromHMM_other_sorted.txt

# Combined into one file
# NOTE(review): no command is recorded for this step.

# Filtering by overlap with 200bp window centers ("summit")
# Per sample: run filter_summit.py on the TE and the "other" intersection
# files, concatenate the two results into rmsk_TEother_[sample]_chromHMM.bed
# and delete the two intermediates.
# NOTE(review): the meaning of the positional arguments "1 8 ... chromHMM"
# is defined in filter_summit.py, which is not shown here.
while read line; do python ~/bin/TE_landscape/filter_summit.py chromHMM/TEs/intersect/$line\_15_coreMarks_mnemonics.bed_TE 1 8 chromHMM/summit/TEs/rmsk_TE_$line\_chromHMM.bed chromHMM; python ~/bin/TE_landscape/filter_summit.py chromHMM/TEs/intersect/$line\_15_coreMarks_mnemonics.bed_other 1 8 chromHMM/summit/TEs/rmsk_other_$line\_chromHMM.bed chromHMM; cat chromHMM/summit/TEs/rmsk_TE_$line\_chromHMM.bed chromHMM/summit/TEs/rmsk_other_$line\_chromHMM.bed > chromHMM/summit/TEs/rmsk_TEother_$line\_chromHMM.bed; rm chromHMM/summit/TEs/rmsk_TE_$line\_chromHMM.bed; rm chromHMM/summit/TEs/rmsk_other_$line\_chromHMM.bed; done < sample_lists/mnemonics.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/summit/TEs/rmsk_TEother_E#_chromHMM.bed [127 files]

# Including those that do not overlap window center ("majority")
## From original TE x sample x state file, split by sample
# NOTE(review): the arguments (sample, summit file, output file, 7, 8) are
# interpreted by identify_no_summit.py, which is not shown here.
while read line; do python identify_no_summit.py $line rmsk_TEother_$line\_chromHMM.bed rmsk_TEother_$line\_chromHMM_noSummit.txt 7 8; done < mnemonics.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/summit/TEs/rmsk_TEother_E#_chromHMM_noSummit.txt [127 files]

# Matrix of TE x sample x state, including summit and majority TEs
# Per sample, two aggregations are appended to the same output file:
#   - summit file: column 12 summed per unique columns 1-7 + column 11
#   - noSummit file: column 9 summed per unique columns 1-8
# Each output row is: columns 1-7, sample, summed value, the 8th key
# column, and the label "summit" or "majority".
# Rows are appended (>>); delete rmsk_TEother_chromHMM.txt before re-running.
while read line; do awk -v OFS='\t' -v sample=$line '{a[$1, $2, $3, $4, $5, $6, $7, $11]+=$12}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sample, a[i], sep[8], "summit"}}' rmsk_TEother_$line\_chromHMM.bed >> rmsk_TEother_chromHMM.txt; awk -v OFS='\t' -v sample=$line '{a[$1, $2, $3, $4, $5, $6, $7, $8]+=$9}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sample, a[i], sep[8], "majority"}}' rmsk_TEother_$line\_chromHMM_noSummit.txt >> rmsk_TEother_chromHMM.txt; done < mnemonics.txt

# Sort by chromosome (version order), start, end, column 4 and sample
sort -k1,1V -k2,2n -k3,3n -k4,4 -k8,8 rmsk_TEother_chromHMM.txt > ~/TE_landscape/chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt
```

Using the chromHMM states assigned above, found the number of samples in which each TE is annotated with each state, for all samples and excluding cancer cell lines/IMR90. 

```{bash TE chromHMM potential, eval=FALSE}
# Both runs pass the same state matrix, TE list, state list and trailing
# numeric arguments to potential.py; only the sample list and the output
# file differ.
# NOTE(review): the meaning of "0 7 9 8" is defined in potential.py, which
# is not shown here.

# All
python ~/bin/TE_landscape/potential.py \
  chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt \
  features/TEs/rmsk_TEother.txt \
  chromHMM/chromHMM_states.txt \
  sample_lists/mnemonics.txt \
  chromHMM/potential/rmsk_TEother_chromHMM_summit_potential.txt \
  0 7 9 8

## Output
#/bar/epehrsson/TE_landscape/chromHMM/potential/rmsk_TEother_chromHMM_summit_potential.txt

# No cancer
python ~/bin/TE_landscape/potential.py \
  chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt \
  features/TEs/rmsk_TEother.txt \
  chromHMM/chromHMM_states.txt \
  sample_lists/mnemonics_noCancer.txt \
  chromHMM/potential/rmsk_TEother_chromHMM_summit_potential_noCancer.txt \
  0 7 9 8

## Output
#/bar/epehrsson/TE_landscape/chromHMM/potential/rmsk_TEother_chromHMM_summit_potential_noCancer.txt
```

Found the maximum number of states with which each TE can be annotated in a single sample, for TEs overlapping 200bp window centers only. 

```{bash TE chromHMM max intra, eval=FALSE}
# Maximum states per TE in any sample
# Arguments: input state matrix, output file, and a numeric argument whose
# meaning is defined in state_sharing_intra_max.py (not shown here).
python ~/bin/TE_landscape/state_sharing_intra_max.py \
  chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt \
  chromHMM/rmsk_TEother_chromHMM_summit_max.txt \
  7

## Output
#/bar/epehrsson/TE_landscape/chromHMM/rmsk_TEother_chromHMM_summit_max.txt
```

Assignment of individual TEs to methylation states. First, combined the CpG methylation level across all chromosomes and reformatted as a sorted bed file. 

```{bash process WGBS, eval=FALSE}
# Bedfile of CpG methylation level across all chromosomes, both strands
# For every per-chromosome .fm file, prefix each row with the chromosome
# name (taken from the file name), column 1 and column 1 + 1, followed by
# the original row.
# The loop runs in the foreground and is redirected once with ">":
#   - previously it was backgrounded with "&", so the cut/sort below could
#     start before the bedGraph was complete and read a partial file;
#   - ">" instead of ">>" avoids duplicated rows when the chunk is re-run.
for file in /bar/mchoudhary/2chromTE/Meth/FractionalMethylation_Removed_E027_E064_Fixed_E012/chr*.fm; do
  awk -v chr="$(basename "$file" .fm)" -v OFS='\t' '{print chr, $1, $1+1, $0}' "$file"
done > all_CpG_Meth.bedGraph

# Drop column 4 and sort by chromosome (version order), start and end.
# NOTE(review): assumes the .fm files are tab-delimited, so that column 4
# is the position repeated from the original row - confirm.
cut -f1-3,5- all_CpG_Meth.bedGraph | sort -k1,1V -k2,2n -k3,3n - > all_CpG_Meth.bed

## Output
#/bar/epehrsson/TE_landscape/WGBS/all_CpG_Meth.bedGraph
#/bar/epehrsson/TE_landscape/WGBS/all_CpG_Meth.bed
```

Intersected the CpG methylation level bedfile with individual TEs and found the average methylation across all CpGs overlapping each TE. 

```{bash TE WGBS matrix, eval=FALSE}
# Intersect with individual TEs
# The CpG bedfile is split into chunks of 1,000,000 lines that are
# intersected with the TE list one at a time; results are appended to a
# single file (delete TE_CpG_Meth_new.bed before re-running).
# The chunks get an explicit prefix and the loop globs on that prefix.
# With split's default names (xaa, xab, ...) the previous "xa*" glob only
# matched the first 26 chunks (xaa..xaz), silently skipping xba onwards
# for inputs longer than 26 million lines.
split -l 1000000 ~/TE_landscape/all_CpG_Meth.bed CpG_chunk_
for file in CpG_chunk_*; do echo $file; bedtools intersect -wo -a ~/TE_landscape/rmsk_TEother.txt -b $file >>  TE_CpG_Meth_new.bed ; done

## Output
#/bar/epehrsson/TE_landscape/WGBS/TE_CpG_Meth_new.bed

# Average methylation level of each TE in each sample
# Arguments: intersection file, TE list, WGBS sample list, output file.
python ~/bin/TE_landscape/average_methylation.py TE_CpG_Meth_new.bed ~/TE_landscape/rmsk_TEother.txt ~/TE_landscape/WGBS_samples.txt TE_CpG_Meth_new_average.txt

## Output
#/bar/epehrsson/TE_landscape/WGBS/TE_CpG_Meth_new_average.txt
```

Counted the number of CpGs overlapping each TE. 

```{bash TE CpG count, eval=FALSE}
# Number of CpGs per TE
# Count the intersection rows for each unique combination of columns 1-7
# and print those seven columns followed by the count, tab-separated.
awk -v OFS='\t' '
  { cpg_n[$1, $2, $3, $4, $5, $6, $7]++ }
  END {
    for (te in cpg_n) {
      split(te, col, SUBSEP)
      print col[1], col[2], col[3], col[4], col[5], col[6], col[7], cpg_n[te]
    }
  }
' TE_CpG_Meth_new.bed > TE_CpG_count.txt

## Output
#/bar/epehrsson/TE_landscape/WGBS/TE_CpG_count.txt
```

Overlap of individual TEs with DHS peaks. Intersected individual TEs with DHS peaks by sample, then filtered to only intersections where the TE overlaps the summit of the DHS peak (see https://genome.ucsc.edu/FAQ/FAQformat.html#format12). Counted the number of intersections per TE and combined into one file. 

```{bash TE DHS matrix, eval=FALSE}
# Intersect with individual TEs
bash TE_landscape/DNase/intersect_sample_list_DNase_peak.sh

## Output
#/bar/epehrsson/TE_landscape/DNase/intersect/TEs/rmsk_TEother_E#-DNase.macs2.narrowPeak [53 files]

# Filtering to TEs overlapping peak summit
# The summit is column 9 + column 17; a row is kept when the summit lies
# in [column 2, column 3).
# NOTE(review): assumes 7 TE columns followed by the 10 narrowPeak
# columns, i.e. column 9 = peak start and column 17 = summit offset.
for file in DNase/intersect/TEs/rmsk_TEother_E*-DNase.macs2.narrowPeak; do
  prefix=$( basename $file -DNase.macs2.narrowPeak)
  awk '{summit = $9 + $17} (summit >= $2) && (summit < $3)' $file > DNase/true_summit/${prefix}_DNase_summit.txt
done

## Output
#/bar/epehrsson/TE_landscape/DNase/true_summit/rmsk_TEother_E#_DNase_summit.txt [53 files]

# Matrix of TE x sample x state
# Per sample: count the rows for each unique combination of columns 1-7
# and append "columns 1-7, sample, count" to the combined file.
while read sample; do
  awk -v OFS='\t' -v sample=$sample '
    { n_peaks[$1, $2, $3, $4, $5, $6, $7]++ }
    END {
      for (te in n_peaks) {
        split(te, col, SUBSEP)
        print col[1], col[2], col[3], col[4], col[5], col[6], col[7], sample, n_peaks[te]
      }
    }
  ' DNase/true_summit/rmsk_TEother_${sample}_DNase_summit.txt >> DNase/true_summit/rmsk_TEother_DNase_summit.txt
done < sample_lists/DNase_samples.txt

## Output
#/bar/epehrsson/TE_landscape/DNase/true_summit/rmsk_TEother_DNase_summit.txt
```

Overlap of individual TEs with H3K27ac peaks. Intersected individual TEs with H3K27ac peaks by sample, then filtered to only intersections where the TE overlaps the summit of the H3K27ac peak. Counted the number of intersections per TE and combined into one file. 

```{bash TE H3K27ac matrix, eval=FALSE}
# Intersect with individual TEs
# One -wo intersection per sample listed in H3K27ac_samples.txt.
while read sample; do
  bedtools intersect -wo \
    -a ../rmsk_TEother.txt \
    -b H3K27ac_narrow_peaks/${sample}-H3K27ac.narrowPeak \
    > H3K27ac_TEs/rmsk_TEother_${sample}-H3K27ac.narrowPeak
done < H3K27ac_samples.txt

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/intersect/TEs/rmsk_TEother_E#-H3K27ac.narrowPeak [98 files]

# Filtering to TEs overlapping peak summit
# The summit is column 9 + column 17; a row is kept when the summit lies
# in [column 2, column 3).
for file in H3K27ac/intersect/TEs/rmsk_TEother_E*-H3K27ac.narrowPeak; do
  prefix=$( basename $file -H3K27ac.narrowPeak)
  awk '{summit = $9 + $17} (summit >= $2) && (summit < $3)' $file > H3K27ac/true_summit/${prefix}_H3K27ac_summit.txt
done

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/true_summit/rmsk_TEother_E#_H3K27ac_summit.txt [98 files]

# Matrix of TE x sample x state
# Per sample: count the rows for each unique combination of columns 1-7
# and append "columns 1-7, sample, count" to the combined file.
while read sample; do
  awk -v OFS='\t' -v sample=$sample '
    { n_peaks[$1, $2, $3, $4, $5, $6, $7]++ }
    END {
      for (te in n_peaks) {
        split(te, col, SUBSEP)
        print col[1], col[2], col[3], col[4], col[5], col[6], col[7], sample, n_peaks[te]
      }
    }
  ' H3K27ac/true_summit/rmsk_TEother_${sample}_H3K27ac_summit.txt >> H3K27ac/true_summit/rmsk_TEother_H3K27ac_summit.txt
done < sample_lists/H3K27ac_samples.txt

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/true_summit/rmsk_TEother_H3K27ac_summit.txt
```

Average RNA-seq read coverage for individual TEs. Intersected individual TEs with unnormalized, strand-agnostic RNA-seq read coverage, then calculated the average read coverage per TE. Combined values for all samples into a single matrix with a row for each TE and a column per sample. 

```{bash TE RNA matrix, eval=FALSE}
# Average RNA-seq expression for individual TEs
intersect_RNA_agnostic.sh

# Average expression per TE, raw, combined samples
# Start the matrix with columns 1-7 of the E003 file, then add column 10
# of each sample's file as a new rightmost column, in the order of
# RNA_samples_agnostic.txt. Each step writes to a temporary file that
# replaces the matrix.
cut -f1-7 rmsk_TEother.txt.sorted_E003_average.txt > rmsk_TEother_average.txt
while read sample; do
  paste rmsk_TEother_average.txt <(cut -f10 rmsk_TEother.txt.sorted_${sample}_average.txt) > rmsk_TEother_average
  mv rmsk_TEother_average rmsk_TEother_average.txt
done < RNA_samples_agnostic.txt

## Output
#/bar/epehrsson/TE_landscape/RNAseq/rmsk_TEother_average.txt
```

Created matrices of all TE x sample instances in each state, split by subfamily x state.

```{bash by subfamily state, eval=FALSE}
# Each command below writes every input row to a file named after column 4
# (and, for chromHMM and WGBS, column 10) of that row.

# TEs by subfamily and state, chromHMM
# The state "8_ZNF/Rpts" is written as "8_ZNF.Rpts" in the file name.
awk '{
  state = ($10 == "8_ZNF/Rpts") ? "8_ZNF.Rpts" : $10
  out = "chromHMM/subfamily/by_state/" $4 "_" state ".txt"
  print > out
}' chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt

# TEs by subfamily and state, WGBS
awk '{
  out = "WGBS/subfamily/by_state/" $4 "_" $10 ".txt"
  print > out
}' WGBS/TE_WGBS_state_sorted.txt

# TEs by subfamily and state, DHS
awk '{
  out = "DNase/subfamily/true_summit/" $4 "_DNase.txt"
  print $0 > out
}' DNase/true_summit/rmsk_TEother_DNase_summit.txt

# TEs by subfamily and state, H3K27ac
awk '{
  out = "H3K27ac/subfamily/true_summit/" $4 "_H3K27ac.txt"
  print $0 > out
}' H3K27ac/true_summit/rmsk_TEother_H3K27ac_summit.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/subfamily/by_state/[subfam]_[state].txt [13950 files]
#/bar/epehrsson/TE_landscape/WGBS/subfamily/by_state/[subfam]_[state].txt [3846 files]
#/bar/epehrsson/TE_landscape/DNase/subfamily/true_summit/[subfam]_DNase.txt [965 files]
#/bar/epehrsson/TE_landscape/H3K27ac/subfamily/true_summit/[subfam]_H3K27ac.txt [963 files]
```

### RefSeq exons and intersection with RNA-seq read coverage

Average RNA-seq read coverage for individual RefSeq exons. Filtered the RefSeq exons to standard chromosomes (no contigs) and found unique exons. 

```{bash individual RefSeq exons, eval=FALSE}
# Unique Refseq exon coordinates, filtered to standard chromosomes
# Keep rows whose chromosome name has no "_", reduce them to columns
# 1, 2, 3 and 6, and drop duplicate rows.
awk -v OFS='\t' '$1 !~ /_/ {print $1, $2, $3, $6}' ~/genic_features/RefSeq/refseq_exons.txt \
  | sort \
  | uniq \
  > ~/genic_features/RefSeq/refseq_exons_unique.txt

## Output
#/bar/epehrsson/genic_features/RefSeq/refseq_exons_unique.txt

# Sorted
# By chromosome, then numerically by start.
sort -k1,1 -k2,2n ~/genic_features/RefSeq/refseq_exons_unique.txt \
  > RNAseq/refseq_exons_unique.txt.sorted

## Output
#/bar/epehrsson/TE_landscape/RNAseq/refseq_exons_unique.txt.sorted
```

Intersected individual exons with unnormalized, strand-agnostic RNA-seq read coverage, then calculated the average read coverage per exon. Combined values for all samples into a single matrix with a row for each exon and a column per sample. 

```{bash exon RNA matrix, eval=FALSE}
# Average RNA-seq expression for individual exons
intersect_RNA_agnostic.sh

# Average expression per exon, raw, combined samples
# Start the matrix with columns 1-4 of the E003 file, then add column 7 of
# each sample's file as a new rightmost column, in the order of
# RNA_samples_agnostic.txt. Each step writes to a temporary file that
# replaces the matrix.
cut -f1-4 refseq_exons_unique.txt.sorted_E003_average.txt > refseq_exons_average.txt
while read sample; do
  paste refseq_exons_average.txt <(cut -f7 refseq_exons_unique.txt.sorted_${sample}_average.txt) > refseq_exons_average
  mv refseq_exons_average refseq_exons_average.txt
done < RNA_samples_agnostic.txt

## Output
#/bar/epehrsson/TE_landscape/RNAseq/refseq_exons_average.txt
```

### Load TE and exon state assignments

Creates dataframes with the epigenetic state assignments for each TE (row) in each sample or state (column). 

For chromHMM, the dataframe includes the number of samples in which each TE is annotated with each state, the number of states the TE is annotated with across all samples (total and unique), the maximum number of states the TE is annotated with in a single sample, and whether the TE overlaps the center of any 200bp window. The number of samples in which each TE is annotated with each state is also loaded into a separate dataframe that excludes cancer cell lines and IMR90. 

For WGBS, the dataframe lists the average methylation per TE for each sample, for only TEs that overlap at least one CpG. It also includes the number of CpGs per TE, the number of samples in which the TE is annotated with each methylation state (for all samples with WGBS data and excluding IMR90), and the total number of states with which the TE is annotated across samples (all and excluding IMR90). 

For DHS and H3K27ac, the dataframes provide the number of peaks each TE overlaps per sample. They also include the total number of samples in which the TE overlaps a peak summit, across all samples and excluding cancer cell lines and IMR90. Each dataframe includes only TEs overlapping a peak in at least one sample.

For expression, the read length and normalization factor provided by the Roadmap project for each epigenome are used to normalize the average read coverage per TE and create a dataframe with the average RPKM per TE in each sample. The dataframe also includes the total number of samples in which the TE is expressed at RPKM > 1 (in all samples or excluding cancer cell lines/IMR90) and the maximum expression across samples. The same information is provided for unique RefSeq exons in a separate dataframe. 

```{r create TE epigenetics, eval=FALSE}
# Run the scripts that build the epigenetic state data frames described
# above, one script per data type, in this order.
# Paths are relative to the working directory.
# NOTE(review): the scripts are not shown in this file; the data type each
# one handles is inferred from its file name only.
source("R_scripts/chromHMM_matrix.R")
source("R_scripts/WGBS_TE_avg_methylation.R")
source("R_scripts/DNase_peaks.R")
source("R_scripts/H3K27ac_peaks.R")
source("R_scripts/RNAseq.R")
```

### RefSeq promoters and intersection with epigenetic states

Filtered the RefSeq promoters (2000bp upstream and 500bp downstream of each TSS) to standard chromosomes (no contigs) and found unique promoters. 

```{bash individual RefSeq promoters, eval=FALSE}
# Unique Refseq promoter coordinates
# Reduce every promoter to columns 1, 2, 3 and 6 and drop duplicate rows.
awk -v OFS='\t' '{print $1, $2, $3, $6}' ~/genic_features/RefSeq/refseq_promoters.txt \
  | sort \
  | uniq \
  > refseq_promoters_unique.txt

## Output
#/bar/epehrsson/genic_features/RefSeq/refseq_promoters_unique.txt

# Filtered to standard chromosomes
# Rows whose chromosome name contains "_" are dropped; kept rows are
# written unchanged.
awk -v OFS='\t' '$1 !~ /_/' refseq_promoters_unique.txt > refseq_promoters_unique_std.txt

## Output
#/bar/epehrsson/genic_features/RefSeq/refseq_promoters_unique_std.txt
```

Assignment of individual promoters to chromHMM states. Intersected each promoter with the chromHMM bedfiles and filtered to only intersections where the promoter overlaps the center of a 200bp window. Summed the total overlap of the promoter with the state, including only intersections with 200bp window centers. 

```{bash promoter chromHMM matrix, eval=FALSE}
# Intersect with Refseq promoters
# Per chromHMM bedfile: the sample ID is the part of the file name before
# the first "_". Each -wo intersection row gets the sample ID appended as a
# last column, and all samples are appended (>>) to one file - delete
# chromHMM_refseq_promoters_unique.txt before re-running.
# NOTE(review): the output is written to the current directory but read
# back from chromHMM/Refseq_promoters/ below; the two commands were
# presumably run from different directories - confirm.
for file in ../../raw_data/chromHMM/E*.bed; do suffix=$(basename $file | cut -d '_' -f1); bedtools intersect -wo -a refseq_promoters_unique.txt -b $file | awk -v OFS='\t' -v sample=$suffix '{print $0, sample}' - >> chromHMM_refseq_promoters_unique.txt; done

# Filtering by overlap with 200bp window centers ("summit")
# NOTE(review): the meaning of the positional arguments "1 5 ... chromHMM"
# is defined in filter_summit.py, which is not shown here.
python ~/bin/TE_landscape/filter_summit.py chromHMM/Refseq_promoters/chromHMM_refseq_promoters_unique.txt 1 5 chromHMM/summit/promoters/chromHMM_refseq_promoters_unique_summit.txt chromHMM

## Output
#/bar/epehrsson/TE_landscape/chromHMM/summit/promoters/chromHMM_refseq_promoters_unique_summit.txt

# Sum column 9 for each unique combination of columns 1-4, 8 and 10.
# Output columns: 1-4, column 10, the sum, column 8; sorted by chromosome
# (version order), start, end and the fifth output column.
# Presumably column 8 is the chromHMM state, column 9 the overlap length
# and column 10 the sample appended above - verify against the output of
# filter_summit.py.
awk -v OFS='\t' '{a[$1, $2, $3, $4, $8, $10]+=$9}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[6], a[i], sep[5]}}' chromHMM/summit/promoters/chromHMM_refseq_promoters_unique_summit.txt | sort -k1,1V -k2,2n -k3,3n -k5,5 - > chromHMM/refseq_promoters_unique_chromHMM_summit_sorted.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/refseq_promoters_unique_chromHMM_summit_sorted.txt
```

Found the number of samples in which each promoter is annotated with each chromHMM state. Promoters on contigs are removed at this stage. 

```{bash promoter chromHMM potential, eval=FALSE}
# Refseq promoters
# Arguments, in order: sorted promoter x state x sample table, promoter list restricted to standard chromosomes,
# list of chromHMM states, list of samples, output path, then four integers.
# NOTE(review): "0 4 6 5" are presumably column indices used by potential.py; confirm against that script.
python ~/bin/TE_landscape/potential.py chromHMM/refseq_promoters_unique_chromHMM_summit_sorted.txt ~/genic_features/RefSeq/refseq_promoters_unique_std.txt chromHMM/chromHMM_states.txt sample_lists/mnemonics.txt chromHMM/potential/refseq_promoters_chromHMM_summit_potential.txt 0 4 6 5

## Output
#/bar/epehrsson/TE_landscape/chromHMM/potential/refseq_promoters_chromHMM_summit_potential.txt
```

Assignment of individual promoters to methylation states. Intersected the unique promoters with the CpG methylation level bedfile, then found the average methylation level across all CpGs overlapping each promoter. Promoters on contigs are removed at this stage. 

```{bash promoter WGBS matrix, eval=FALSE}
# Intersect with Refseq promoters
# Uses the full unique promoter list (contigs included); -wo reports each promoter/CpG pair with the overlap length.
bedtools intersect -wo -a ~/genic_features/RefSeq/refseq_promoters_unique.txt -b ../all_CpG_Meth.bed > refseq_promoter_unique_CpG_Meth.bed

## Output
#/bar/epehrsson/TE_landscape/WGBS/Refseq_promoters/refseq_promoter_unique_CpG_Meth.bed

# Average methylation of unique Refseq promoters
# The standard-chromosome promoter list is passed as the second argument, which is where contig promoters are dropped.
# Arguments, in order: intersection file, promoter list, WGBS sample list, output path.
python ~/bin/TE_landscape/average_methylation_promoter.py refseq_promoter_unique_CpG_Meth.bed ~/genic_features/RefSeq/refseq_promoters_unique_std.txt ../../sample_lists/WGBS_samples.txt refseq_promoter_unique_CpG_Meth_average.txt

## Output
#/bar/epehrsson/TE_landscape/WGBS/Refseq_promoters/refseq_promoter_unique_CpG_Meth_average.txt
```

Overlap of individual promoters with DHS peaks. Intersected individual promoters with DHS peaks by sample, then filtered to only intersections where the promoter overlaps the summit of the DHS peak. Counted the number of intersections per promoter and combined into one file. 

```{bash promoter DHS matrix, eval=FALSE}
# Intersect with Refseq promoters
# One intersection file per sample listed in DNase_samples.txt.
while read line; do bedtools intersect -wo -a ~/genic_features/RefSeq/refseq_promoters_unique.txt -b ../../raw_data/DNase/DNase_narrow_peaks/$line\-DNase.macs2.narrowPeak > refseq_promoter_unique_$line\-DNase.macs2.narrowPeak; done < ../../sample_lists/DNase_samples.txt

## Output
#/bar/epehrsson/TE_landscape/DNase/Refseq_promoters/refseq_promoter_unique_E#-DNase.macs2.narrowPeak [53 files]

# Filter to promoters overlapping peak summit
# summit = column 6 + column 14; a row is kept when promoter start ($2) <= summit < promoter end ($3).
# Assumes the 4 promoter columns are followed by the 10 narrowPeak columns, so $6 is the peak start and $14 the summit offset.
for file in DNase/Refseq_promoters/refseq_promoter_unique_E*-DNase.macs2.narrowPeak; do awk '{summit=$6+$14; if((summit >= $2) && (summit < $3)) print $0}' $file > DNase/true_summit/promoters/$( basename $file -DNase.macs2.narrowPeak)_DNase_summit.txt; done

## Output
#/bar/epehrsson/TE_landscape/DNase/true_summit/promoters/refseq_promoter_unique_E#_DNase_summit.txt [53 files]

# Number of peak summits overlapping each promoter x sample
# Counts rows per promoter (columns 1-4) and writes: promoter columns, sample, count.
# NOTE: results are appended (>>) to one combined file; delete it before re-running.
while read line; do awk -v OFS='\t' -v sample=$line '{a[$1, $2, $3, $4]+=1}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sample, a[i]}}' DNase/true_summit/promoters/refseq_promoter_unique_$line\_DNase_summit.txt >> DNase/true_summit/promoters/refseq_promoter_unique_DNase_summit.txt; done < sample_lists/DNase_samples.txt

## Output
#/bar/epehrsson/TE_landscape/DNase/true_summit/promoters/refseq_promoter_unique_DNase_summit.txt
```

Overlap of individual promoters with H3K27ac peaks. Intersected individual promoters with H3K27ac peaks by sample, then filtered to only intersections where the promoter overlaps the summit of the H3K27ac peak. Counted the number of intersections per promoter and combined into one file. 

```{bash promoter H3K27ac matrix, eval=FALSE}
# Intersect with Refseq promoters
# One intersection file per sample listed in H3K27ac_samples.txt.
while read line; do bedtools intersect -wo -a ~/genic_features/RefSeq/refseq_promoters_unique.txt -b ../../raw_data/H3K27ac/H3K27ac_narrow_peaks/$line\-H3K27ac.narrowPeak > refseq_promoter_unique_$line\-H3K27ac.narrowPeak; done < ../../sample_lists/H3K27ac_samples.txt

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/Refseq_promoters/refseq_promoter_unique_E#-H3K27ac.narrowPeak [98 files]

# Filter to promoters overlapping peak summit
# summit = column 6 + column 14; a row is kept when promoter start ($2) <= summit < promoter end ($3).
# Assumes the 4 promoter columns are followed by the 10 narrowPeak columns, so $6 is the peak start and $14 the summit offset.
for file in H3K27ac/Refseq_promoters/refseq_promoter_unique_E*-H3K27ac.narrowPeak; do awk '{summit=$6+$14; if((summit >= $2) && (summit < $3)) print $0}' $file > H3K27ac/true_summit/promoters/$( basename $file -H3K27ac.narrowPeak)_H3K27ac_summit.txt; done

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/true_summit/promoters/refseq_promoter_unique_E#_H3K27ac_summit.txt [98 files]

# Number of peak summits overlapping each promoter x sample
# Counts rows per promoter (columns 1-4) and writes: promoter columns, sample, count.
# NOTE: results are appended (>>) to one combined file; delete it before re-running.
while read line; do awk -v OFS='\t' -v sample=$line '{a[$1, $2, $3, $4]+=1}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sample, a[i]}}' H3K27ac/true_summit/promoters/refseq_promoter_unique_$line\_H3K27ac_summit.txt >> H3K27ac/true_summit/promoters/refseq_promoter_unique_H3K27ac_summit.txt; done < sample_lists/H3K27ac_samples.txt

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/true_summit/promoters/refseq_promoter_unique_H3K27ac_summit.txt
```

Creates dataframes of individual promoter assignments to epigenetic states. 

For chromHMM states, the dataframe includes the number of samples in which each promoter is assigned to each state. 

For methylation states, the dataframe includes the average methylation level for each promoter in each sample, as well as the number of samples in which the promoter is annotated with each methylation state. 

For DHS and H3K27ac peak overlap, the dataframe includes the number of peaks overlapping the promoter in each sample. Promoters on contigs are removed at this stage, and all promoters, regardless of whether they overlap a peak in any sample, are included in the dataframe. The total number of samples in which each promoter overlaps a peak summit is also provided.

```{r create promoter epigenetics, eval=FALSE}
# Runs the external script that builds the promoter dataframes described above (script contents are not shown in this file)
source("R_scripts/promoter_matrices.R")
```

### Shuffled TEs and intersection with epigenetic states

Creates 10 iterations of shuffled TE locations on standard chromosomes (no chrM or contigs), excluding genome gaps. Each shuffled file has 4,430,788 TEs, with the same number of TEs per class and subfamily as the true TEs, but not the same number on each chromosome. The files were sorted. 

```{bash shuffled TEs, eval=FALSE}
# hg19 gaps, sorted
# Skips the first line of gap.txt, keeps columns 2-4, and sorts by chromosome (version sort), start, end.
tail -n+2 /bar/genomes/hg19/gap/gap.txt | awk -v OFS='\t' '{print $2, $3, $4}' - | sort -k1,1V -k2,2n -k3,3n - > gap_sorted.txt

## Output
#/bar/epehrsson/TE_landscape/features/shuffled_TEs/gap_sorted.txt

# Shuffled TE positions (no gaps)
# NOTE: no -seed is given to bedtools shuffle, so re-running this produces different shuffled positions.
for i in {1..10}; do bedtools shuffle -i ~/TE_landscape/features/TEs/rmsk_TEother.txt -g ~/TE_landscape/features/hg19_standard.genome -excl gap_sorted.txt > rmsk_TE_shuffle_$i\.txt; done

# Sorted shuffled TEs (2/20/19)
# Sorts by chromosome then start into a temporary *_sorted.txt, then renames it over the unsorted file.
for i in {1..10}; do sort -k1,1 -k2,2n features/shuffled_TEs/rmsk_TE_shuffle_$i\.txt > features/shuffled_TEs/rmsk_TE_shuffle_$i\_sorted.txt; done
for i in {1..10}; do mv features/shuffled_TEs/rmsk_TE_shuffle_$i\_sorted.txt features/shuffled_TEs/rmsk_TE_shuffle_$i\.txt; done

## Output
#/bar/epehrsson/TE_landscape/features/shuffled_TEs/rmsk_TE_shuffle_#.txt [10 files]
```

Intersected the shuffled TEs with RefSeq genic features (all, protein-coding, non-coding), intergenic regions, and CpG islands and found the length of overlap between each shuffled TE and the features. 

```{bash shuffled feature overlap, eval=FALSE}
# Intersection between shuffled TEs, Refseq features
# For each shuffle iteration, three kinds of intersections are run with the same awk summary:
#   1. every ~/genic_features/RefSeq/*_merge.txt feature file,
#   2. refseq_intergenic.txt,
#   3. cpgIslandExtUnmasked_merge.txt.
# The awk step sums column 11 per shuffled TE (columns 1-7) and prints the 7 TE columns plus the sum.
# Assumes each feature file has 3 columns, so that column 11 of the -wo output is the overlap length — TODO confirm.
for j in {1..10}; do for file in ~/genic_features/RefSeq/*_merge.txt; do bedtools intersect -wo -a features/shuffled_TEs/rmsk_TE_shuffle_$j\.txt -b $file | awk -v OFS='\t' '{a[$1, $2, $3, $4, $5, $6, $7]+=$11}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], a[i];}}' - > features/shuffled_TEs/intersect_features/rmsk_TE_shuffle_$j\_$(basename "$file" .txt)\.txt; done; bedtools intersect -wo -a features/shuffled_TEs/rmsk_TE_shuffle_$j\.txt -b ~/genic_features/RefSeq/refseq_intergenic.txt | awk -v OFS='\t' '{a[$1, $2, $3, $4, $5, $6, $7]+=$11}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], a[i];}}' - > features/shuffled_TEs/intersect_features/rmsk_TE_shuffle_$j\_refseq_intergenic.txt; bedtools intersect -wo -a features/shuffled_TEs/rmsk_TE_shuffle_$j\.txt -b ~/genic_features/cpgIslandExtUnmasked_merge.txt | awk -v OFS='\t' '{a[$1, $2, $3, $4, $5, $6, $7]+=$11}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], a[i];}}' - > features/shuffled_TEs/intersect_features/rmsk_TE_shuffle_$j\_cpgIsland.txt; done

## Output
#/bar/epehrsson/TE_landscape/features/shuffled_TEs/intersect_features/rmsk_TE_shuffle_#_refseq_[feature]_merge.txt [60 files]
#/bar/epehrsson/TE_landscape/features/shuffled_TEs/intersect_features/rmsk_TE_shuffle_#_refseq_[feature]_nc_merge.txt [50 files]
#/bar/epehrsson/TE_landscape/features/shuffled_TEs/intersect_features/rmsk_TE_shuffle_#_refseq_[feature]_pc_merge.txt [60 files]
#/bar/epehrsson/TE_landscape/features/shuffled_TEs/intersect_features/rmsk_TE_shuffle_#_refseq_intergenic.txt [10 files]
#/bar/epehrsson/TE_landscape/features/shuffled_TEs/intersect_features/rmsk_TE_shuffle_#_cpgIsland.txt [10 files]
```

Assignment of shuffled TEs to chromHMM states. First, shuffled TEs were intersected with 200bp windows, and those overlapping the center of a window were identified. That set of TEs was then intersected with the 200bp windows that had been assigned a chromHMM state earlier. The intersections were filtered to those where the TE overlaps the center of the 200bp window, and the total overlap with the intersections per state was summed. Note that the length of chromHMM state overlap is not the same for shuffled TEs as for real TEs.

Shuffled TEs that are between 200bp window centers were also identified and intersected with the chromHMM bedfiles. Each TE can overlap at most two states, and the TE was assigned the state overlapping the majority of its length. 

The two sets of TEs were combined for each iteration and sorted within each chromosome. 

```{bash shuffled chromHMM matrix, eval=FALSE}
# TEs overlapping the center of 200bp windows ("summit")
## Intersect TEs with 200bp windows
for i in {1..10}; do bedtools intersect -wo -a features/shuffled_TEs/rmsk_TE_shuffle_$i\.txt -b features/hg19_standard.windows > features/shuffled_TEs/rmsk_TE_shuffle_$i\_windows.txt; done

## Output
#/bar/epehrsson/TE_landscape/features/shuffled_TEs/rmsk_TE_shuffle_#_windows.txt [10 files]

## Unique shuffled TEs that overlap any chromHMM 200bp window center ("summit")
## The window center is taken as column 9 + 99.5 (window start + 99.5); a row is kept when TE start ($2) <= center < TE end ($3).
for i in {1..10}; do awk '{if(($9+99.5 >= $2) && ($9+99.5 < $3)) print $0}' features/shuffled_TEs/rmsk_TE_shuffle_$i\_windows.txt > features/shuffled_TEs/rmsk_TE_shuffle_$i\_summit.txt; done
## Reduces each file to the 7 TE columns, sorts, de-duplicates, and overwrites the _summit.txt file in place
## (via a temporary file with the same name minus the .txt extension).
for file in features/shuffled_TEs/rmsk_TE_shuffle_*_summit.txt; do cut -f1-7 $file | sort -k1,1 -k2,2n | uniq > features/shuffled_TEs/$(basename $file .txt); mv features/shuffled_TEs/$(basename $file .txt) $file; done

## Output
#/bar/epehrsson/TE_landscape/features/shuffled_TEs/rmsk_TE_shuffle_#_summit.txt [10 files]

## Intersect with 200bp chromHMM windows
## One bedtools -wo intersection per shuffle iteration x sample in sample_lists/mnemonics.txt.
## -sorted requires both inputs to be coordinate-sorted.
for i in {1..10}; do while read line; do bedtools intersect -wo -sorted -a features/shuffled_TEs/rmsk_TE_shuffle_$i\_summit.txt -b chromHMM/genome/windows/windows_$line\.bed > /scratch/ecp/shuffled/rmsk_TE_shuffle_$i\_$line\_summit.txt; done < sample_lists/mnemonics.txt; done

## Filter to bin center overlap and sum state within TE
## Run from the directory holding the per-sample intersections written above.
## Keeps rows where the window center (column 9 + 99.5) lies within the TE ($2 <= center < $3),
## then sums column 12 per TE (columns 1-7) and column 11, printing the 8 key columns plus the sum.
## Covers all 10 iterations and runs in the foreground, so the rename below starts only after every filtered file is complete.
for i in {1..10}; do for file in rmsk_TE_shuffle_$i\_E*_summit.txt; do awk -v OFS='\t' '{if(($9+99.5 >= $2) && ($9+99.5 < $3)) a[$1, $2, $3, $4, $5, $6, $7, $11]+=$12}END{for(i in a) {split(i, sep, SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sep[8], a[i]}}' $file > $(basename $file .txt); done; done
## Replace each unfiltered intersection with its filtered counterpart (written above without the .txt extension)
for i in {1..10}; do for file in rmsk_TE_shuffle_$i\_E*_summit; do mv $file $file\.txt; done; done

# Majority
## TEs that are not center overlapping ("majority"), sorted
## comm -23 keeps the lines found only in the full shuffled TE file, i.e. TEs absent from the summit set.
for i in {1..10}; do comm -23 <(sort features/shuffled_TEs/rmsk_TE_shuffle_$i\.txt) <(cut -f1-7 features/shuffled_TEs/rmsk_TE_shuffle_$i\_summit.txt | sort) > features/shuffled_TEs/rmsk_TE_shuffle_$i\_majority.txt; done
## Re-sorts by chromosome and start, overwriting each _majority.txt in place via a temporary file without the .txt extension.
for file in features/shuffled_TEs/rmsk_TE_shuffle_*_majority.txt; do sort -k1,1 -k2,2n $file > features/shuffled_TEs/$(basename $file .txt); mv features/shuffled_TEs/$(basename $file .txt) $file; done

## Output
#/bar/epehrsson/TE_landscape/features/shuffled_TEs/rmsk_TE_shuffle_#_majority.txt [10 files]

## Intersection with chromHMM states
for i in {1..10}; do while read line; do bedtools intersect -wo -a features/shuffled_TEs/rmsk_TE_shuffle_$i\_majority.txt -b raw_data/chromHMM/$line\_15_coreMarks_mnemonics.bed > /scratch/ecp/shuffled/rmsk_TE_shuffle_$i\_$line\_majority.txt; done < sample_lists/mnemonics.txt; done

## Pick the majority state
## Run from the directory holding the intersections written above. pick_majority.py writes to the input name without .txt;
## the second loop then renames that output over the original intersection file.
## NOTE(review): "10 11" are presumably the state and overlap-length columns; confirm against pick_majority.py.
for i in {1..10}; do for file in rmsk_TE_shuffle_$i\_E*_majority.txt; do python ~/bin/TE_landscape/pick_majority.py $file $(basename $file .txt) 10 11; done; done
for file in rmsk_TE_shuffle_*_E*_majority; do mv $file $file\.txt; done

# Combined summit and majority TEs
# For each iteration and sample, appends the summit rows and then the majority rows to one file,
# adding two trailing columns: the sample ID and the label "summit" or "majority".
# NOTE: results are appended (>>); delete rmsk_TE_shuffle_#_chromHMM.txt before re-running.
for i in {1..10}; do while read line; do awk -v OFS='\t' -v sample=$line '{print $0, sample, "summit"}' rmsk_TE_shuffle_$i\_$line\_summit.txt >> rmsk_TE_shuffle_$i\_chromHMM.txt; awk -v OFS='\t' -v sample=$line '{print $0, sample, "majority"}' rmsk_TE_shuffle_$i\_$line\_majority.txt >> rmsk_TE_shuffle_$i\_chromHMM.txt; done < ~/TE_landscape/sample_lists/mnemonics.txt; done

# Sorted
# Where chromosomes.txt is chr 1-22 and sex chromosomes
# awk splits the combined file into one file per chromosome, named by column 1, in the current directory.
# Each chromosome file is sorted, appended to the _sorted.txt output in the order of chromosomes.txt, then deleted.
# Rows on a chromosome not listed in chromosomes.txt are not written to the sorted output, and their split file is left behind.
# NOTE: the sorted output is appended (>>); delete it before re-running.
for i in {1..10}; do awk '{print>$1}' rmsk_TE_shuffle_$i\_chromHMM.txt; while read line; do sort -k1,1V -k2,2n -k3,3n -k4,4 -k10,10 $line >> rmsk_TE_shuffle_$i\_chromHMM_sorted.txt; rm $line; done < chromosomes.txt; done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/shuffled_TEs/rmsk_TE_shuffle_#_chromHMM_sorted.txt [10 files]
```

For each iteration of shuffled TEs, found the number of samples in which each TE is annotated with each chromHMM state. 

```{bash shuffled chromHMM potential, eval=FALSE}
# Arguments, in order: sorted TE x state x sample table, shuffled TE list, list of chromHMM states, list of samples, output path, then four integers.
# NOTE(review): "0 9 7 8" are presumably column indices used by potential.py; confirm against that script.
for i in {1..10}; do python ~/bin/TE_landscape/potential.py rmsk_TE_shuffle_$i\_chromHMM_sorted.txt ~/TE_landscape/features/shuffled_TEs/rmsk_TE_shuffle_$i\.txt chromHMM_states.txt mnemonics.txt rmsk_TE_shuffle_$i\_chromHMM_potential.txt 0 9 7 8; done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/shuffled_TEs/rmsk_TE_shuffle_#_chromHMM_potential.txt [10 files]
```

Assignment of shuffled TEs to methylation states. Intersected each iteration of shuffled TEs with the CpG methylation level bedfile, then found the average methylation level over all CpGs overlapping each TE for each sample. 

```{bash shuffled WGBS matrix, eval=FALSE}
# Intersect with shuffled TEs
# The methylation bedfile is split into 1,000,000-line pieces in the current directory (split's default output names start with "x"),
# and each piece is intersected in turn.
# NOTE: results are appended (>>) per iteration; delete rmsk_TE_shuffle_#_Meth.bed before re-running. The x* pieces are not removed here.
split -l 1000000 ~/TE_landscape/WGBS/all_CpG_Meth.bed
for i in {1..10}; do for file in x*; do bedtools intersect -wo -a ../rmsk_TE_shuffle_$i\.txt -b $file >> rmsk_TE_shuffle_$i\_Meth.bed; done; done

# Average methylation per shuffled TE
# Arguments, in order: intersection file, shuffled TE list, WGBS sample list, output path.
for i in {1..10}; do python ~/bin/TE_landscape/average_methylation.py rmsk_TE_shuffle_$i\_Meth.bed ../rmsk_TE_shuffle_$i\.txt ~/TE_landscape/sample_lists/WGBS_samples.txt rmsk_TE_shuffle_$i\_Meth_average.txt; done

## Output
#/bar/epehrsson/TE_landscape/WGBS/shuffled/rmsk_TE_shuffle_#_Meth_average.txt  [10 files]
```

Counted the number of CpGs overlapping each shuffled TE. 

```{bash shuffled CpG count, eval=FALSE}
# Counts the rows of each TE x CpG intersection file per shuffled TE (columns 1-7),
# i.e. the number of methylation bedfile rows overlapping that TE; prints the 7 TE columns plus the count.
for j in {1..10}; do awk -v OFS='\t' '{a[$1, $2, $3, $4, $5, $6, $7]+=1}END{for(k in a){split(k,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], a[k];}}' rmsk_TE_shuffle_$j\_Meth.bed > TE_CpG_count_$j.txt; done

## Output
#/bar/epehrsson/TE_landscape/WGBS/shuffled/TE_CpG_count_#.txt [10 files]
```

Overlap of shuffled TEs with DHS peaks. Intersected individual shuffled TEs with DHS peaks by sample, then filtered to only intersections where the TE overlaps the summit of the DHS peak. Counted the number of intersections per TE and combined into one file per iteration. 

```{bash shuffled DHS, eval=FALSE}
# Intersect each shuffled TE file with each sample's DHS narrowPeak file (one output per iteration x sample)
for i in {1..10}; do while read line; do bedtools intersect -wo -a features/shuffled_TEs/rmsk_TE_shuffle_$i\.txt -b raw_data/DNase/DNase_narrow_peaks/$line\-DNase.macs2.narrowPeak > DNase/shuffled/rmsk_TE_$line\_$i\-DNase.narrowPeak; done < sample_lists/DNase_samples.txt; done

## Output
#/bar/epehrsson/TE_landscape/DNase/shuffled/rmsk_TE_E#_#-DNase.narrowPeak [530 files]

# Filter to TEs overlapping the peak summit
# summit = column 9 + column 17; a row is kept when TE start ($2) <= summit < TE end ($3).
# Assumes the 7 TE columns are followed by the 10 narrowPeak columns, so $9 is the peak start and $17 the summit offset.
for file in DNase/shuffled/rmsk_TE_E*-DNase.narrowPeak; do awk '{summit=$9+$17; if((summit >= $2) && (summit < $3)) print $0}' $file > DNase/shuffled/true_summit/$( basename $file -DNase.narrowPeak)_DNase_summit.txt; done

## Output
#/bar/epehrsson/TE_landscape/DNase/shuffled/true_summit/rmsk_TE_E#_#_DNase_summit.txt [530 files]

# Number of peak summits overlapping each TE x sample, combined into one file per iteration
# Counts rows per TE (columns 1-7) and writes: TE columns, sample, count.
# NOTE: results are appended (>>); delete rmsk_TE_#_DNase_summit.txt before re-running.
for j in {1..10}; do while read line; do awk -v OFS='\t' -v sample=$line '{a[$1, $2, $3, $4, $5, $6, $7]+=1}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sample, a[i]}}' DNase/shuffled/true_summit/rmsk_TE_$line\_$j\_DNase_summit.txt >> DNase/shuffled/true_summit/rmsk_TE_$j\_DNase_summit.txt; done < sample_lists/DNase_samples.txt; done

## Output
#/bar/epehrsson/TE_landscape/DNase/shuffled/true_summit/rmsk_TE_#_DNase_summit.txt [10 files]
```

Overlap of shuffled TEs with H3K27ac peaks. Intersected individual shuffled TEs with H3K27ac peaks by sample, then filtered to only intersections where the TE overlaps the summit of the H3K27ac peak. Counted the number of intersections per TE and combined into one file per iteration. 

```{bash shuffled H3K27ac, eval=FALSE}
# Intersect each shuffled TE file with each sample's H3K27ac narrowPeak file (one output per iteration x sample)
for i in {1..10}; do while read line; do bedtools intersect -wo -a features/shuffled_TEs/rmsk_TE_shuffle_$i\.txt -b raw_data/H3K27ac/H3K27ac_narrow_peaks/$line\-H3K27ac.narrowPeak > H3K27ac/shuffled/rmsk_TE_$line\_$i\-H3K27ac.narrowPeak; done < sample_lists/H3K27ac_samples.txt; done

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/shuffled/rmsk_TE_E#_#-H3K27ac.narrowPeak [980 files]

# Filter to TEs overlapping the peak summit
# summit = column 9 + column 17; a row is kept when TE start ($2) <= summit < TE end ($3).
# Assumes the 7 TE columns are followed by the 10 narrowPeak columns, so $9 is the peak start and $17 the summit offset.
for file in H3K27ac/shuffled/rmsk_TE_E*-H3K27ac.narrowPeak; do awk '{summit=$9+$17; if((summit >= $2) && (summit < $3)) print $0}' $file > H3K27ac/shuffled/true_summit/$( basename $file -H3K27ac.narrowPeak)_H3K27ac_summit.txt; done

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/shuffled/true_summit/rmsk_TE_E#_#_H3K27ac_summit.txt [980 files]

# Number of peak summits overlapping each TE x sample, combined into one file per iteration
# Counts rows per TE (columns 1-7) and writes: TE columns, sample, count.
# NOTE: results are appended (>>); delete rmsk_TE_#_H3K27ac_summit.txt before re-running.
for j in {1..10}; do while read line; do awk -v OFS='\t' -v sample=$line '{a[$1, $2, $3, $4, $5, $6, $7]+=1}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sample, a[i]}}' H3K27ac/shuffled/true_summit/rmsk_TE_$line\_$j\_H3K27ac_summit.txt >> H3K27ac/shuffled/true_summit/rmsk_TE_$j\_H3K27ac_summit.txt; done < sample_lists/H3K27ac_samples.txt; done

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/shuffled/true_summit/rmsk_TE_#_H3K27ac_summit.txt [10 files]
```

Creates dataframes of shuffled TEs and the epigenetic state assignments for each shuffled TE, across 4 techniques. 

The first script creates a dataframe of individual shuffled TEs for each iteration, including TE coordinates and the length of overlap with RefSeq genic features, intergenic regions, and CpG islands. 

For chromHMM, the dataframes include the number of samples in which each shuffled TE is annotated with each state and the number of unique states the TE is annotated with across all samples. 

For WGBS, the dataframes list the average methylation per shuffled TE for each sample, for only TEs that overlap at least one CpG. It also includes the number of CpGs per TE, the number of samples in which the TE is annotated with each methylation state, and the total number of methylation states with which the TE is annotated across samples. 

For DHS and H3K27ac, the dataframes provide the number of peaks each TE overlaps per sample and the total number of samples in which each TE overlaps a peak summit. Each dataframe includes only TEs overlapping a peak in at least one sample.

```{r create shuffled matrices, eval=FALSE}
# Run the shuffled-TE matrix scripts in order: the general TE dataframe first,
# then one script per epigenetic technique.
shuffled_matrix_scripts = c("R_scripts/shuffled_matrix.R",
                            "R_scripts/shuffled_matrix_chromHMM.R",
                            "R_scripts/shuffled_matrix_WGBS.R",
                            "R_scripts/shuffled_matrix_DNase.R",
                            "R_scripts/shuffled_matrix_H3K27ac.R")
for (script_path in shuffled_matrix_scripts){
  source(script_path)
}
```

## Load essential matrices

Loads the dataframes of TEs and TE epigenetic state assignments created above (as well as the exon expression dataframe). 

```{r load essential matrices}
# Load each saved dataset into the current environment, in this order
essential_datasets = c("R_datasets/rmsk_TE.RData",
                       "R_datasets/chromHMM_TE_state.RData",
                       "R_datasets/TE_meth_average.RData",
                       "R_datasets/TE_DNase_peaks.RData",
                       "R_datasets/TE_H3K27ac_peaks.RData",
                       "R_datasets/rna.RData")
for (dataset_path in essential_datasets){
  load(dataset_path)
}
```

## Load feature sizes

Creates a file of unique regions overlapping TEs by merging individual TEs. 

```{bash merged TEs, eval=FALSE}
# Merged all TE RepeatMasker file
# Concatenates the two RepeatMasker TE files, sorts by chromosome and start (required by bedtools merge),
# and collapses overlapping intervals into unique regions.
cat rmsk_TE.txt rmsk_other.txt | sort -k1,1 -k2,2n - | bedtools merge -i - > rmsk_TEother_merge.txt

## Output
#/bar/epehrsson/TE_landscape/features/TEs/merge/rmsk_TEother_merge.txt
```

Finds the total length of the regions covered by TEs, for the entire genome (no contigs) and without chrY. 

```{bash merged TE width, eval=FALSE}
# Total merged TE length: sum of (end - start) over all merged regions
# NOTE(review): presumably the source of the hard-coded MERGED_TE_WIDTH constant in the "load feature sizes" chunk — confirm the values match.
awk '{sum+=$3-$2}END{print sum}' features/TEs/merge/rmsk_TEother_merge.txt
# Same total, excluding regions on chrY (presumably the source of MERGED_TE_WIDTH_noY)
awk '{if($1 != "chrY") sum+=$3-$2}END{print sum}' features/TEs/merge/rmsk_TEother_merge.txt
```

Counts the number of CpGs in the genome and overlapping TEs. 

```{bash count CpGs, eval=FALSE}
# Number of rows in the genome-wide CpG methylation bedfile
wc -l WGBS/all_CpG_Meth.bed

# Rows of the CpG bedfile that overlap merged TE regions
# NOTE: this command uses paths relative to a different working directory than the wc commands around it.
bedtools intersect -a all_CpG_Meth.bed -b rmsk_TEother_merge.txt > CpG_TE_Meth.bed

## Output
#/bar/epehrsson/TE_landscape/WGBS/CpG_TE_Meth.bed

# Number of CpG bedfile rows overlapping TEs
wc -l WGBS/CpG_TE_Meth.bed
```

For the RefSeq genic features and intergenic regions, finds the total length of the regions covered by the feature (no contigs; with and without chrY) and the length of overlap with TEs (either strand or same strand only). 

```{bash get feature length, eval=FALSE}
# Runs the external script that computes feature lengths and TE overlap (script contents are not shown in this file).
# Its output is read five lines per feature by the "load feature sizes" chunk.
bash feature_overlap_length.sh

## Output
#/bar/epehrsson/TE_landscape/features/intersect_features/feature_overlap.txt
```

Loads constants and feature sizes, including a matrix of RefSeq genic feature lengths. 

```{r load feature sizes}
# Genome
## Chromosome length
hg19_genome = read.table("raw_data/hg19.genome",sep='\t',header=TRUE)

## Length of the genome, with and without chrY, no contigs
## (standard_chromosomes is defined outside this chunk)
GENOME_WIDTH = sum(as.numeric(hg19_genome[which(hg19_genome$chrom %in% standard_chromosomes),]$size))
GENOME_WIDTH_noY = sum(as.numeric(hg19_genome[which(hg19_genome$chrom %in% standard_chromosomes & hg19_genome$chrom != "chrY"),]$size))

# Total width of a region summed over all samples with data for one technique
## Input: technique - column name of sample_counts (e.g. "chromHMM")
##        width - region width including chrY, used for samples counted in the "chrY" row
##        width_noY - region width excluding chrY, used for the remaining samples
## Output: width summed across all samples for that technique
width_across_samples = function(technique,width,width_noY){
  samples_chrY = sample_counts["chrY",technique]
  samples_all = sample_counts["All",technique]
  return(samples_chrY*width + (samples_all-samples_chrY)*width_noY)
}

## Total length of the genome across all samples with data for that technique
CHROMHMM_TOTAL_WIDTH = width_across_samples("chromHMM",GENOME_WIDTH,GENOME_WIDTH_noY)
DNASE_TOTAL_WIDTH = width_across_samples("DNase",GENOME_WIDTH,GENOME_WIDTH_noY)
H3K27AC_TOTAL_WIDTH = width_across_samples("H3K27ac",GENOME_WIDTH,GENOME_WIDTH_noY)

# TEs 
## Number of individual TEs, with and without chrY
NUM_TE = nrow(rmsk_TE)
## Counted directly rather than by copying the filtered dataframe; NA chromosomes are excluded, as with which()
NUM_TE_noY = sum(rmsk_TE$chromosome != "chrY",na.rm=TRUE)

## Number of individual TEs overlapping any CpG
NUM_TE_WGBS = nrow(TE_meth_average)

## Length of regions overlapping TEs (unique), with and without chrY
## Hard-coded; presumably copied from the output of the "merged TE width" chunk - TODO confirm
MERGED_TE_WIDTH = 1389947349
MERGED_TE_WIDTH_noY = 1375898142

## Total length of regions overlapping TEs across all samples with data for that technique
CHROMHMM_TE_WIDTH = width_across_samples("chromHMM",MERGED_TE_WIDTH,MERGED_TE_WIDTH_noY)
DNASE_TE_WIDTH = width_across_samples("DNase",MERGED_TE_WIDTH,MERGED_TE_WIDTH_noY)
H3K27AC_TE_WIDTH = width_across_samples("H3K27ac",MERGED_TE_WIDTH,MERGED_TE_WIDTH_noY)

# Number of CpGs in the genome and overlapping TEs
## Hard-coded counts halved; presumably the line counts from the "count CpGs" chunk, with each CpG listed twice - TODO confirm
ALL_CPGS = 56434896/2
TE_CPGS = 28373958/2

# Length of RefSeq features and intergenic regions, with and without chrY, and the length of overlap with TEs (either strand or same strand)
## The file is read as consecutive records of 5 lines, one line per column
feature_overlap_lines = readLines("features/intersect_features/feature_overlap.txt")
## matrix() only warns and recycles values when the input is not a whole number of records, which would silently misalign the columns
stopifnot(length(feature_overlap_lines) %% 5 == 0)
feature_overlap = as.data.frame(matrix(feature_overlap_lines,ncol=5,byrow=TRUE))
colnames(feature_overlap) = c("Cohort","Genome","Genome_noY","TEs","TEs_stranded")
feature_overlap$Cohort = gsub("refseq_","",feature_overlap$Cohort)
## split_coding is defined outside this chunk; the reordering below relies on it adding columns 6 and 7
feature_overlap = split_coding(feature_overlap)
## Convert per column; as.character() first so that factor columns yield their values, not their level codes
feature_overlap[,2:5] = lapply(feature_overlap[,2:5],function(x) as.numeric(as.character(x)))
feature_overlap = feature_overlap[,c(1,6:7,2:5)]
```

## Calculate TE class and subfamily statistics

Lists of TE classes, including DNA, LINE, LTR, SINE, SVA (Other in the RepeatMasker file), and Other (Unknown, RC, and five ? classes in the RepeatMasker file).

```{bash classes, eval=FALSE}
# Four large TE classes (DNA, LINE, LTR, SINE)
#/bar/epehrsson/TE_landscape/sample_lists/TE_class.txt

# 8 other TE classes from the rmsk file
#/bar/epehrsson/TE_landscape/sample_lists/other_class.txt
```

Creates lists of unique regions overlapping each TE class by merging individual TEs. Includes the original Other class without RC, as well as the Other class with RC that is used in these analyses. All merged class files were combined into a single file. 

```{bash merged class, eval=FALSE}
# Merged file for each class
# For each class name, keeps the rows whose column 5 equals that class and merges overlapping intervals.
# NOTE: both loops are started in the background (&); later steps must not run until they have finished.
# NOTE: $line is unquoted; class names containing "?" are subject to shell filename expansion.
while read line; do awk -v OFS='\t' -v class=$line '{if($5 == class)print $0}' rmsk_TE.txt | bedtools merge -i - > TE_classes/rmsk_$line\.txt; done < TE_class.txt &
while read line; do awk -v OFS='\t' -v class=$line '{if($5 == class)print $0}' rmsk_other.txt | bedtools merge -i - > TE_classes/rmsk_$line\.txt; done < other_class.txt &

## Original Unconfident class (without RC)
## Rows whose class is LTR?, DNA?, LINE?, SINE?, Unknown? or Unknown, merged into unique regions
awk -v OFS='\t' '{if(($5=="LTR?")||($5=="DNA?")||($5=="LINE?")||($5=="SINE?")||($5=="Unknown?")||($5=="Unknown")) print $0}' ../rmsk_other.txt | bedtools merge -i - > rmsk_Unconfident.txt

## Adding RC to Unconfident class
## Same six classes plus RC, read from the combined TE file
awk -v OFS='\t' '{if(($5=="LTR?")||($5=="DNA?")||($5=="LINE?")||($5=="SINE?")||($5=="Unknown?")||($5=="Unknown")||($5=="RC")) print $0}' ../rmsk_TEother.txt | bedtools merge -i - > rmsk_Unconfident_RC.txt

## Output
#/bar/epehrsson/TE_landscape/features/TEs/class/rmsk_[class].txt [12 files]
#/bar/epehrsson/TE_landscape/features/TEs/class/rmsk_Unconfident.txt
#/bar/epehrsson/TE_landscape/features/TEs/class/rmsk_Unconfident_RC.txt

# Merged TE bases by class
# Appends each class's merged regions with the class name as an extra last column.
# NOTE: results are appended (>>); delete TEother_class_merge.txt before re-running.
# NOTE: reads rmsk_[class].txt from the current directory, whereas the first two loops above write to TE_classes/.
while read line; do awk -v OFS='\t' -v class=$line '{print $0, class}' rmsk_$line\.txt >> TEother_class_merge.txt ; done < TEother_class.txt

## Adding merged Unconfident-RC bases
awk -v OFS='\t' '{print $0, "Unconfident_RC"}' rmsk_Unconfident_RC.txt >> TEother_class_merge.txt

## Input (DNA, LINE, LTR, SINE, Other, RC, and Unconfident)
#/bar/epehrsson/TE_landscape/features/TEs/class/TEother_class.txt

## Output
#/bar/epehrsson/TE_landscape/features/TEs/class/TEother_class_merge.txt
```

Number of CpGs per class, generated by counting the number of unique CpGs overlapping each class in the file of intersections between individual TEs and CpGs. 

```{bash class CpG count, eval=FALSE}
# Per class: unique (class, columns 8-10) combinations, i.e. unique CpG records per class (column 5); writes class and count
awk -v OFS='\t' '{print $5, $8, $9, $10}' TE_CpG_Meth_new.bed | sort | uniq | awk -v OFS='\t' '{a[$1]+=1}END{for(i in a){print i, a[i];}}' - > TE_CpG_class.txt 
# Unique CpG records across the six unconfident classes (without RC); appends the bare count, with no class name
awk -v OFS='\t' '{if($5 == "LTR?" || $5 == "DNA?" || $5 == "SINE?" || $5 == "LINE?" || $5 == "Unknown?" || $5 == "Unknown") print $8, $9, $10}' TE_CpG_Meth_new.bed | sort | uniq | wc -l >> TE_CpG_class.txt
# Same, including RC. The class names for these two appended rows were added to the file by hand.
# NOTE(review): this line appends to class/TE_CpG_class.txt while the two above use TE_CpG_class.txt — confirm they refer to the same file.
awk -v OFS='\t' '{if($5 == "LTR?" || $5 == "DNA?" || $5 == "SINE?" || $5 == "LINE?" || $5 == "Unknown?" || $5 == "Unknown" || $5 == "RC") print $8, $9, $10}' TE_CpG_Meth_new.bed | sort | uniq | wc -l >> class/TE_CpG_class.txt #Added name manually

## Output
#/bar/epehrsson/TE_landscape/WGBS/class/TE_CpG_class.txt
```

The total length of the region overlapping each TE class (unique), with and without chrY. 

```{bash class length, eval=FALSE}
# Class lengths with/without chrY
# Sums (end - start) of the merged regions per class (column 4); output order follows awk's array iteration and is not sorted.
awk -v OFS='\t' '{a[$4]+=$3-$2}END{for(i in a){print i, a[i]}}' TEother_class_merge.txt > class_lengths.txt
# Same, skipping regions on chrY; a class found only on chrY would be absent from this file.
awk -v OFS='\t' '{if($1 != "chrY") a[$4]+=$3-$2}END{for(i in a){print i, a[i]}}' TEother_class_merge.txt > class_lengths_noY.txt

## Output
#/bar/epehrsson/TE_landscape/features/TEs/class/class_lengths.txt
#/bar/epehrsson/TE_landscape/features/TEs/class/class_lengths_noY.txt
```

Lists of TE subfamilies, including those in the SVA and Other classes. 

```{bash subfamilies, eval=FALSE}
# 903 subfamilies in the DNA/LINE/SINE/LTR classes OR all 968 subfamilies
#/bar/epehrsson/TE_landscape/sample_lists/subfamilies.txt

# 65 subfamilies belonging to Other and SVA classes
#/bar/epehrsson/TE_landscape/sample_lists/other_subfamilies.txt
```

Creates lists of unique regions overlapping each TE subfamily by merging individual TEs. All merged subfamily files were combined into a single file. 

```{bash merged subfamilies, eval=FALSE}
# Merged file for each subfamily
# For each subfamily name, keeps the rows whose column 4 equals that subfamily and merges overlapping intervals.
while read line; do awk -v OFS='\t' -v subfam=$line '{if($4 == subfam)print $0}' ../rmsk_TE.txt | bedtools merge -i - > rmsk_$line\.txt; done < subfamilies.txt
while read line; do awk -v OFS='\t' -v subfam=$line '{if($4 == subfam)print $0}' ../rmsk_other.txt | bedtools merge -i - > rmsk_$line\.txt; done < other_subfamilies.txt

# Combines the per-subfamily files into one, adding the subfamily name as an extra last column, then deletes the per-subfamily files.
# NOTE: results are appended (>>); delete TEother_subfamily_merge.txt before re-running.
while read line; do awk -v OFS='\t' -v subfam=$line '{print $0, subfam}' rmsk_$line\.txt >> TEother_subfamily_merge.txt; rm rmsk_$line\.txt; done < subfamilies.txt
while read line; do awk -v OFS='\t' -v subfam=$line '{print $0, subfam}' rmsk_$line\.txt >> TEother_subfamily_merge.txt; done < other_subfamilies.txt
while read line; do rm rmsk_$line\.txt; done < other_subfamilies.txt

## Output
#/bar/epehrsson/TE_landscape/features/TEs/subfamily/TEother_subfamily_merge.txt
```

The total length of the region overlapping each TE subfamily (unique), with and without chrY. 

```{bash subfamily length, eval=FALSE}
# Subfamily lengths with/without chrY
# Sums (end - start) of the merged regions per subfamily (column 4); output order follows awk's array iteration and is not sorted.
awk -v OFS='\t' '{a[$4]+=$3-$2}END{for(i in a){print i, a[i]}}' TEother_subfamily_merge.txt > subfamily_lengths.txt
# Same, skipping regions on chrY; a subfamily found only on chrY would be absent from this file.
awk -v OFS='\t' '{if($1 != "chrY") a[$4]+=$3-$2}END{for(i in a){print i, a[i]}}' TEother_subfamily_merge.txt > subfamily_lengths_noY.txt

## Output
#/bar/epehrsson/TE_landscape/features/TEs/subfamily/subfamily_lengths.txt
#/bar/epehrsson/TE_landscape/features/TEs/subfamily/subfamily_lengths_noY.txt
```

Number of CpGs per subfamily, generated by counting the number of unique CpGs overlapping each subfamily in the file of intersections between individual TEs and CpGs. 

```{bash subfamily CpG count, eval=FALSE}
# Per subfamily: unique (subfamily, columns 8-10) combinations, i.e. unique CpG records per subfamily (column 4); writes subfamily and count
awk -v OFS='\t' '{print $4, $8, $9, $10}' TE_CpG_Meth_new.bed | sort | uniq | awk -v OFS='\t' '{a[$1]+=1}END{for(i in a){print i, a[i];}}' - > TE_CpG_subfamily.txt

## Output
#/bar/epehrsson/TE_landscape/WGBS/subfamily/TE_CpG_subfamily.txt
```

Generates dataframes with metadata for each TE class and subfamily. 

The by-class dataframe includes: the number of elements (with and without chrY) per class; number of families and subfamilies; median and SD length of the individual TEs; median mappability; median Jukes-Cantor distance; the proportion of members overlapping each RefSeq genic feature, intergenic regions, CpG islands, VISTA enhancers, and blacklist regions; number of unique CpGs overlapping the class; proportion of all CpGs and CpGs overlapping TEs that overlap the class; number and proportion of individual TEs with CpGs; the total length of the class (unique; with and without chrY); and the total length of the class across all samples with data for each epigenetic technique.

The by-subfamily dataframe includes: the number of elements (with and without chrY) per subfamily; median length of the individual TEs; median mappability; median Jukes-Cantor distance; the total length of the subfamily (unique; with and without chrY); number of unique CpGs overlapping the subfamily; mean CpGs per TE and per kbp; number of individual TEs with CpGs; the proportion of members overlapping each RefSeq genic feature, intergenic regions, CpG islands, VISTA enhancers, and blacklist regions; and the proportion of members on each chromosome.

```{r calculate TE stats, cache=TRUE, cache.lazy=FALSE}
# Run the two external scripts that, per the description above this chunk,
# generate the by-class and by-subfamily TE metadata dataframes.
# The scripts themselves are not shown in this document; the chunk is cached
# (cache=TRUE), so they are not re-run on every knit.
source("R_scripts/TE_class_stats.R")
source("R_scripts/TE_subfamily_stats.R")
```

# Introduction

## Basic sample stats

Prints the matrix of the number of samples with data for each epigenetic technique, including those with chrY and excluding IMR90/cancer cell lines. 

```{r basic sample stats}
# Auto-print sample_counts (defined outside this chunk): the number of samples
# with data for each epigenetic technique
sample_counts
```

## Basic TE statistics

Prints statistics for TEs. 

```{r basic TE stats}
# Total number of TEs
NUM_TE

# Fraction of the genome covered by TEs (merged TE width over genome width)
MERGED_TE_WIDTH / GENOME_WIDTH

# Fraction of CpGs that overlap a TE
TE_CPGS / ALL_CPGS

# How many distinct TE families are annotated
length(unique(rmsk_TE$family))

# How many distinct TE subfamilies are annotated
length(unique(rmsk_TE$subfamily))

# Shortest, median, and longest individual TE
ddply(rmsk_TE, .(), summarise,
      Min = min(Length),
      Median = median(Length),
      Max = max(Length))

# TEs overlapping at least one CpG (count, then fraction of all TEs),
# followed by the count of TEs overlapping no CpG
NUM_TE_WGBS
NUM_TE_WGBS / NUM_TE
NUM_TE - NUM_TE_WGBS

# Fewest, median, and most CpGs per TE
ddply(TE_meth_average, .(), summarise,
      Min = min(CpGs),
      Median = median(CpGs),
      Max = max(CpGs))
```

# Results

## Contribution of TEs to the regulatory epigenome

The number of bases total and in each chromHMM state across the entire genome, by sample. Combined into a single file with a column for each sample and a row for each chromHMM state. 

```{bash genome chromHMM, eval=FALSE}
# Whole genome

# Width annotated per sample: the first line of each <bed>_state file is the
# total annotated width, followed by one line per entry of chromHMM_states.txt
# giving the summed width of the rows that match that entry.
for bed in chromHMM_bedfiles/E*_15_coreMarks_mnemonics.bed
do
  awk 'BEGIN{SUM=0}{SUM+=$3-$2}END{print SUM}' $bed > ${bed}_state
  while read state
  do
    grep $state $bed | awk 'BEGIN{SUM=0}{SUM+=$3-$2}END{print SUM}' - >> ${bed}_state
  done < chromHMM_states.txt
done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/genome/sample_summaries/E#_15_coreMarks_mnemonics.bed_state [127 files]

# Combine the per-sample summaries into a single matrix (a column per sample, a row per state)
bash combine_states.sh chromHMM_states.txt mnemonics.txt chromHMM_bedfiles/*state

## Output 
#/bar/epehrsson/TE_landscape/chromHMM/genome/mnemonics_state.txt
```

The number of bases total and in each chromHMM state within TEs, by sample. Intersected the file of unique TE bases with the chromHMM annotations for each sample, summed the number of bases in each chromHMM state, then combined into a single file with a column for each sample. 

```{bash TE chromHMM, eval=FALSE}
# Merged TEs

# Intersect each sample's chromHMM annotation with the merged (unique) TE bases.
# -wo appends the width of each overlap as the last column of the output.
for file in chromHMM_bedfiles/E*_15_coreMarks_mnemonics.bed; do
  output=$(basename $file)
  bedtools intersect -wo -a rmsk_TEother_merge.txt -b $file > chromHMM_other/$output\_TEother_merge
done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/TEs/intersect/E#_15_coreMarks_mnemonics.bed_TEother_merge [127 files]

# Sum number of bases in state
# First line of each *_state file: total overlap width (column 8) across all states;
# then one line per entry of chromHMM_states.txt with the overlap width of the matching rows.
# SUM is initialised to 0 so that a state with no overlap in TEs is reported as 0
# (as in the whole-genome summary) rather than as an empty line.
for file in chromHMM_other/*; do
  awk 'BEGIN{SUM=0}{SUM+=$8}END{print SUM}' $file > $file\_state
  while read line; do
    grep $line $file | awk 'BEGIN{SUM=0}{SUM+=$8}END{print SUM}' - >> $file\_state
  done < chromHMM_states.txt
done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/TEs/sample_summaries/TEother_merge/E#_15_coreMarks_mnemonics.bed_TEother_merge_state [127 files]

# Table of number of bases in each state in merged TEs in each sample
bash combine_states.sh chromHMM_states.txt mnemonics.txt chromHMM_other/*state

## Output
#/bar/epehrsson/TE_landscape/chromHMM/mnemonics_TEother_merge_states.txt
```

The number of bases in each chromHMM state within each RefSeq genic feature and intergenic regions, by sample. Intersected each file of unique feature regions with the chromHMM annotations for each sample, then summed the number of bases in each chromHMM state. 

```{bash feature chromHMM, eval=FALSE}
# Intersect every merged RefSeq feature file with every sample's chromHMM annotation,
# prefixing each intersection row with the sample and feature names
for file in ~/genic_features/RefSeq/*_merge.txt
do
  feature=$(basename "$file" _merge.txt)
  while read line
  do
    bedtools intersect -wo -a $file -b raw_data/chromHMM/${line}_15_coreMarks_mnemonics.bed \
      | awk -v OFS='\t' -v sample=$line -v feature=$feature '{print sample, feature, $0}' - \
      >> chromHMM/Refseq_features/chromHMM_features.txt
  done < sample_lists/mnemonics.txt
done

# Same for intergenic regions, labelled "intergenic"
while read line
do
  bedtools intersect -wo -a ~/genic_features/RefSeq/refseq_intergenic.txt -b raw_data/chromHMM/${line}_15_coreMarks_mnemonics.bed \
    | awk -v OFS='\t' -v sample=$line '{print sample, "intergenic", $0}' - \
    >> chromHMM/Refseq_features/chromHMM_features.txt
done < sample_lists/mnemonics.txt

# Sum overlap
# Total of column 10 for every (column 1, column 2, column 9) combination,
# i.e. per sample x feature x chromHMM state
awk -v OFS='\t' '
  {width[$1, $2, $9] += $10}
  END {
    for (key in width) {
      split(key, part, SUBSEP)
      print part[1], part[2], part[3], width[key]
    }
  }
' chromHMM/Refseq_features/chromHMM_features.txt > chromHMM/chromHMM_refseq_features.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/chromHMM_refseq_features.txt
```

The chromHMM annotations are 2,583bp shorter than the width of all chromosomes (as specified by UCSC, no contigs), 2,417bp when chrY is excluded. However, all RefSeq feature bases are annotated, indicating all chromHMM-unannotated bases are intergenic. 

```{bash chromHMM QC, eval=FALSE}
# Difference between hg19 chromosome lengths and chromHMM annotation
# Prints the total annotated width per chromosome for sample E001, for comparison
# against the chromosome lengths
awk -v OFS='\t' '{chr[$1]+=$3-$2}END{for(i in chr){print i, chr[i]}}' raw_data/chromHMM/E001_15_coreMarks_mnemonics.bed

# Difference between feature lengths and chromHMM annotation
# For each feature intersection file, sums column 8 per value of column 9 and prints
# the feature name with each sum; sort | uniq collapses identical totals.
# NOTE(review): column 9 is presumably the sample tag, so one output line per feature
# means every sample annotates the same number of feature bases - confirm.
# The loop body never reads stdin, so no input redirection is attached to the loop.
for file in chromHMM/Refseq_features/intersect/chromHMM_*.bed; do
  awk -v OFS='\t' -v feature=$(basename $file .bed) '{a[$9]+=$8}END{for(i in a){print feature, a[i]}}' $file | sort | uniq
done
```

The number of CpGs in each methylation state across the entire genome, by sample. CpGs are assigned a methylation state based on methylation level. 

```{bash genome WGBS, eval=FALSE}
# Number of CpGs in each state, all CpGs, each sample
# Every column from 4 onwards is binned by its methylation level:
#   -1 -> missing, < 0.3 -> hypomethylated, > 0.7 -> hypermethylated, 0.3-0.7 -> intermediate
# One output line per column index that has hypermethylated CpGs:
#   index hypo intermediate hyper missing (space-separated)
awk '
  {
    for (col = 4; col <= NF; col++) {
      if ($col == -1) miss[col] += 1
      else if ($col < 0.3) hypo[col] += 1
      else if ($col > 0.7) hyper[col] += 1
      else if (($col <= 0.7) && ($col >= 0.3)) inter[col] += 1
    }
  }
  END {
    for (col in hyper) print col" "hypo[col]" "inter[col]" "hyper[col]" "miss[col]
  }
' all_CpG_Meth.bed > all_CpG_Meth_states.txt

## Output
#/bar/epehrsson/TE_landscape/WGBS/all_CpG_Meth_states.txt
```

The number of CpGs in each methylation state within TEs, by sample. 

```{bash TE WGBS, eval=FALSE}
# Number of CpGs in each state, TE CpGs, each sample
# Every column from 4 onwards is binned by its methylation level:
#   -1 -> missing, < 0.3 -> hypomethylated, > 0.7 -> hypermethylated, 0.3-0.7 -> intermediate
# One output line per column index that has hypermethylated CpGs:
#   index hypo intermediate hyper missing (space-separated)
awk '
  {
    for (col = 4; col <= NF; col++) {
      if ($col == -1) miss[col] += 1
      else if ($col < 0.3) hypo[col] += 1
      else if ($col > 0.7) hyper[col] += 1
      else if (($col <= 0.7) && ($col >= 0.3)) inter[col] += 1
    }
  }
  END {
    for (col in hyper) print col" "hypo[col]" "inter[col]" "hyper[col]" "miss[col]
  }
' CpG_TE_Meth.bed > CpG_TE_Meth_states.txt

## Output
#/bar/epehrsson/TE_landscape/WGBS/CpG_TE_Meth_states.txt
```

The number of CpGs in each methylation state within each RefSeq feature, by sample. 

```{bash feature WGBS, eval=FALSE}
# Merged features
# Keep the CpGs overlapping each merged RefSeq feature, prefixing every row with the feature name
for file in ~/genic_features/RefSeq/*_merge.txt
do
  echo $file
  feature=$(basename "$file" _merge.txt)
  bedtools intersect -a WGBS/all_CpG_Meth.bed -b $file \
    | awk -v OFS='\t' -v feature=$feature '{print feature, $0}' \
    >> WGBS/feature_CpG_Meth.bed
done

# Same for intergenic regions, labelled "intergenic"
bedtools intersect -a WGBS/all_CpG_Meth.bed -b ~/genic_features/RefSeq/refseq_intergenic.txt \
  | awk -v OFS='\t' '{print "intergenic", $0}' \
  >> WGBS/feature_CpG_Meth.bed

## Output
#/bar/epehrsson/TE_landscape/WGBS/feature_CpG_Meth.bed

# Number of CpGs in each state, CpGs overlapping Refseq features, each sample
# Column 1 is the feature label added above, so the methylation columns start at 5.
# Output rows: state name, feature, column index, number of CpGs
awk -v OFS='\t' '
  {
    for (col = 5; col <= NF; col++) {
      if ($col == -1) miss[$1, col] += 1
      else if ($col < 0.3) hypo[$1, col] += 1
      else if ($col > 0.7) hyper[$1, col] += 1
      else if (($col <= 0.7) && ($col >= 0.3)) inter[$1, col] += 1
    }
  }
  END {
    for (key in hypo)  {split(key, part, SUBSEP); print "Hypomethylated", part[1], part[2], hypo[key]}
    for (key in hyper) {split(key, part, SUBSEP); print "Hypermethylated", part[1], part[2], hyper[key]}
    for (key in inter) {split(key, part, SUBSEP); print "Intermediate", part[1], part[2], inter[key]}
    for (key in miss)  {split(key, part, SUBSEP); print "Missing", part[1], part[2], miss[key]}
  }
' WGBS/feature_CpG_Meth.bed > WGBS/feature_CpG_Meth_states.txt

## Output
#/bar/epehrsson/TE_landscape/WGBS/feature_CpG_Meth_states.txt
```

The number of and total width of DHS peaks per sample, plus the number of peaks overlapping TEs (regardless of whether the summit overlaps the TE). 

```{bash genome DHS, eval=FALSE}
# Number and width of DNase peaks overall
# For every peak file listed in DNase_peaks.txt: download and unpack it, then append
# three lines to DNase_stats.txt (file name, `wc -l` output, summed peak width)
while read peaks
do
  wget http://egg2.wustl.edu/roadmap/data/byFileType/peaks/consolidated/narrowPeak/${peaks}.gz
  gunzip ${peaks}.gz
  {
    echo $peaks
    wc -l $peaks
    awk '{sum+=$3-$2}END{print sum}' $peaks
  } >> DNase_stats.txt
done < DNase_peaks.txt

# Peaks overlapping TEs per sample (no summit rule)
# Counts the distinct values of columns 8-10 in each sample's TE x peak intersection
while read sample
do
  awk '{print $8, $9, $10}' rmsk_TEother_${sample}-DNase.macs2.narrowPeak | sort -u | wc -l >> test
done < ../DNase_samples.txt

## Output
#/bar/epehrsson/TE_landscape/DNase/DNase_stats.txt
```

The total width of DHS peak overlap with TEs and the number of DHS peaks whose summit overlaps a TE, by sample. Intersected DHS peaks with merged TE regions and summed the length of overlap. Using the intersection of DHS peaks with individual TEs that was filtered to only those that overlap the summit of a peak, counted the number of unique DHS peaks whose summit overlaps a TE. 

```{bash TE DHS, eval=FALSE}
# Intersect with merged TEs
# Each intersection row gets the sample name appended as a final column
while read sample
do
  bedtools intersect -wo -a DNase_narrow_peaks/${sample}-DNase.macs2.narrowPeak -b ../rmsk_TEother_merge.txt \
    | awk -v OFS='\t' -v sample=$sample '{print $0, sample}' - \
    >> rmsk_TEother_merge_DNase.txt
done < DNase_samples.txt

## Output
#/bar/epehrsson/TE_landscape/DNase/rmsk_TEother_merge_DNase.txt

# Total of column 14 (overlap width) for each value of column 15 (sample), sorted by sample
awk -v OFS='\t' '
  {width[$15] += $14}
  END {for (sample in width) print sample, width[sample]}
' rmsk_TEother_merge_DNase.txt | sort > rmsk_TEother_merge_DNase_contribution.txt

## Output
#/bar/epehrsson/TE_landscape/DNase/rmsk_TEother_merge_DNase_contribution.txt

# Peaks with summits overlapping TEs per sample
# Counts the distinct values of columns 8-10 in each sample's summit-filtered intersection
while read sample
do
  awk -v OFS='\t' '{print $8, $9, $10}' DNase/true_summit/rmsk_TEother_${sample}_DNase_summit.txt \
    | sort -u \
    | wc -l >> DNase/true_summit/rmsk_TEother_DNase_summit_stats.txt
done < sample_lists/DNase_samples.txt

## Output
#/bar/epehrsson/TE_landscape/DNase/true_summit/rmsk_TEother_DNase_summit_stats.txt
```

The total width of DHS peaks overlapping RefSeq genic features and intergenic regions, by sample. Intersected DHS peaks with merged feature regions, then summed the total length of overlap. 

```{bash feature DHS, eval=FALSE}
# Intersect the DHS peaks of every sample with every merged RefSeq feature file,
# prefixing each intersection row with the sample and feature names
for file in ~/genic_features/RefSeq/*_merge.txt
do
  feature=$(basename "$file" _merge.txt)
  echo $feature
  while read line
  do
    echo $line
    bedtools intersect -wo -a raw_data/DNase/DNase_narrow_peaks/${line}-DNase.macs2.narrowPeak -b $file \
      | awk -v OFS='\t' -v sample=$line -v feature=$feature '{print sample, feature, $0}' - \
      >> DNase/refseq_features_DNase_intersect.txt
  done < sample_lists/DNase_samples.txt
done

# Same for intergenic regions, labelled "intergenic"
while read line
do
  echo $line
  bedtools intersect -wo -a raw_data/DNase/DNase_narrow_peaks/${line}-DNase.macs2.narrowPeak -b ~/genic_features/RefSeq/refseq_intergenic.txt \
    | awk -v OFS='\t' -v sample=$line '{print sample, "intergenic", $0}' - \
    >> DNase/refseq_features_DNase_intersect.txt
done < sample_lists/DNase_samples.txt

# Merged features
# Total of column 16 (overlap width) per sample (column 1) x feature (column 2)
awk -v OFS='\t' '
  {width[$1, $2] += $16}
  END {
    for (key in width) {
      split(key, part, SUBSEP)
      print part[1], part[2], width[key]
    }
  }
' DNase/refseq_features_DNase_intersect.txt > DNase/refseq_features_DNase.txt

## Output
#/bar/epehrsson/TE_landscape/DNase/refseq_features_DNase.txt
```

The number of and total width of H3K27ac peaks per sample, plus the number of peaks overlapping TEs (regardless of whether the summit overlaps the TE). 

```{bash genome H3K27ac, eval=FALSE}
# Number and width of H3K27ac peaks overall
# For every sample, append three lines to H3K27ac_stats.txt:
# sample name, `wc -l` output for its peak file, summed peak width
while read sample
do
  {
    echo $sample
    wc -l H3K27ac_narrow_peaks/${sample}-H3K27ac.narrowPeak
    awk '{sum+=$3-$2}END{print sum}' H3K27ac_narrow_peaks/${sample}-H3K27ac.narrowPeak
  } >> H3K27ac_stats.txt
done < H3K27ac_samples.txt

# Peaks overlapping TEs per sample (no summit rule)
# Counts the distinct values of columns 8-10 in each sample's TE x peak intersection
while read sample
do
  awk '{print $8, $9, $10}' H3K27ac_TEs/rmsk_TEother_${sample}-H3K27ac.narrowPeak | sort -u | wc -l >> test
done < ../H3K27ac_samples.txt #Combine in Excel

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/H3K27ac_stats.txt
```

The total width of H3K27ac peak overlap with TEs and the number of H3K27ac peaks whose summit overlaps a TE, by sample. Intersected H3K27ac peaks with merged TE regions and summed the length of overlap. Using the intersection of H3K27ac peaks with individual TEs that was filtered to only those that overlap the summit of a peak, counted the number of unique H3K27ac peaks whose summit overlaps a TE. 

```{bash TE H3K27ac, eval=FALSE}
# Intersect with merged TEs
# Each intersection row gets the sample name appended as a final column
while read sample
do
  bedtools intersect -wo -a H3K27ac_narrow_peaks/${sample}-H3K27ac.narrowPeak -b ../rmsk_TEother_merge.txt \
    | awk -v OFS='\t' -v sample=$sample '{print $0, sample}' - \
    >> rmsk_TEother_merge_H3K27ac.txt
done < H3K27ac_samples.txt

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/rmsk_TEother_merge_H3K27ac.txt

# Total of column 14 (overlap width) for each value of column 15 (sample), sorted by sample
awk -v OFS='\t' '
  {width[$15] += $14}
  END {for (sample in width) print sample, width[sample]}
' rmsk_TEother_merge_H3K27ac.txt | sort > rmsk_TEother_merge_H3K27ac_contribution.txt

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/rmsk_TEother_merge_H3K27ac_contribution.txt

# Peaks with summits overlapping TEs per sample
# Counts the distinct values of columns 8-10 in each sample's summit-filtered intersection
while read sample
do
  awk -v OFS='\t' '{print $8, $9, $10}' H3K27ac/true_summit/rmsk_TEother_${sample}_H3K27ac_summit.txt \
    | sort -u \
    | wc -l >> H3K27ac/true_summit/rmsk_TEother_H3K27ac_summit_stats.txt
done < sample_lists/H3K27ac_samples.txt

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/true_summit/rmsk_TEother_H3K27ac_summit_stats.txt
```

The total width of H3K27ac peaks overlapping RefSeq genic features and intergenic regions, by sample. Intersected H3K27ac peaks with merged feature regions, then summed the total length of overlap. 

```{bash feature H3K27ac, eval=FALSE}
# Intersect the H3K27ac peaks of every sample with every merged RefSeq feature file,
# prefixing each intersection row with the sample and feature names
for file in ~/genic_features/RefSeq/*_merge.txt
do
  feature=$(basename "$file" _merge.txt)
  echo $feature
  while read line
  do
    echo $line
    bedtools intersect -wo -a raw_data/H3K27ac/H3K27ac_narrow_peaks/${line}-H3K27ac.narrowPeak -b $file \
      | awk -v OFS='\t' -v sample=$line -v feature=$feature '{print sample, feature, $0}' - \
      >> H3K27ac/refseq_features_H3K27ac_intersect.txt
  done < sample_lists/H3K27ac_samples.txt
done

# Same for intergenic regions, labelled "intergenic"
while read line
do
  echo $line
  bedtools intersect -wo -a raw_data/H3K27ac/H3K27ac_narrow_peaks/${line}-H3K27ac.narrowPeak -b ~/genic_features/RefSeq/refseq_intergenic.txt \
    | awk -v OFS='\t' -v sample=$line '{print sample, "intergenic", $0}' - \
    >> H3K27ac/refseq_features_H3K27ac_intersect.txt
done < sample_lists/H3K27ac_samples.txt

# Merged features
# Total of column 16 (overlap width) per sample (column 1) x feature (column 2)
awk -v OFS='\t' '
  {width[$1, $2] += $16}
  END {
    for (key in width) {
      split(key, part, SUBSEP)
      print part[1], part[2], width[key]
    }
  }
' H3K27ac/refseq_features_H3K27ac_intersect.txt > H3K27ac/refseq_features_H3K27ac.txt

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/refseq_features_H3K27ac.txt
```

The number of bases in each TE class annotated with each chromHMM state, by sample. Includes both old and current Other class definitions. Intersected the merged class file with chromHMM annotations, then summed across each state.

```{bash class chromHMM, eval=FALSE}
# TE merged classes
# Intersect every merged class file with every sample's chromHMM annotation; the sample ID
# (file name up to the first underscore) is appended to each row as a final column
for class in TE_classes/rmsk_*.txt
do
  for file in chromHMM_bedfiles/E*.bed
  do
    suffix=$(basename $file | cut -d '_' -f1)
    bedtools intersect -wo -a $class -b $file \
      | awk -v OFS='\t' -v tag=$suffix '{print $0, tag}' - \
      >> ${class}_chromHMM.bed
  done
done

# Unconfident class
for file in ../chromHMM_bedfiles/E*.bed
do
  suffix=$(basename $file | cut -d '_' -f1)
  bedtools intersect -wo -a rmsk_Unconfident.txt -b $file \
    | awk -v OFS='\t' -v tag=$suffix '{print $0, tag}' - \
    >> rmsk_Unconfident.txt_chromHMM.bed
done

for file in ../../raw_data/chromHMM/E*.bed
do
  suffix=$(basename $file | cut -d '_' -f1)
  bedtools intersect -wo -a ../../features/TEs/class/rmsk_Unconfident_RC.txt -b $file \
    | awk -v OFS='\t' -v tag=$suffix '{print $0, tag}' - \
    >> rmsk_Unconfident_RC.txt_chromHMM.bed
done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/TEs/intersect/class/rmsk_[class].txt_chromHMM.bed [12 files]
#/bar/epehrsson/TE_landscape/chromHMM/TEs/intersect/class/rmsk_Unconfident.txt_chromHMM.bed
#/bar/epehrsson/TE_landscape/chromHMM/TEs/intersect/class/rmsk_Unconfident_RC.txt_chromHMM.bed

# Merged TE classes
# Number of bases in each state in merged TE classes in each sample
# Total of column 8 (overlap width) per state (column 7) x sample tag (column 9), labelled with the class
while read line
do
  awk -v OFS='\t' -v class=$line '
    {width[$7, $9] += $8}
    END {
      for (key in width) {
        split(key, part, SUBSEP)
        print class, part[1], part[2], width[key]
      }
    }
  ' chromHMM/TEs/intersect/class/rmsk_${line}.txt_chromHMM.bed >> chromHMM/class/class_state_sample.txt
done < chromHMM/class/classes.txt

## Input
#/bar/epehrsson/TE_landscape/chromHMM/class/classes.txt (DNA, LINE, LTR, SINE, Other (SVA), and Unconfident_RC)

## Output
#/bar/epehrsson/TE_landscape/chromHMM/class/class_state_sample.txt
```

The number of CpGs overlapping each TE class in each methylation state, by sample. From the intersection of CpGs with individual TEs, found the unique CpGs overlapping each TE class, then counted the number in each methylation state (based on methylation level). Includes both old and current Other class definitions. 

```{bash class WGBS, eval=FALSE}
# Number of CpGs in each state, TE CpGs, by sample x class
# All three commands share the same steps:
#  1. keep the TE x CpG intersection rows whose class (column 5) belongs to the group
#  2. drop the TE columns (cut -f8-) and de-duplicate, leaving the unique CpGs of the group
#  3. bin every column from 4 onwards by methylation level
#     (-1 missing, < 0.3 hypo, > 0.7 hyper, 0.3-0.7 intermediate)
#  4. print: column index, hypo, intermediate, hyper, missing, class label

# One pass per class listed in TEother_class.txt
while read line; do
  awk -v OFS='\t' -v class=$line '{if($5 == class) print $0}' TE_CpG_Meth_new.bed \
    | cut -f8- - | sort | uniq \
    | awk -v OFS='\t' -v class=$line '{for (i=4;i<=NF;i++){if($i == -1) miss[i]+=1; else if ($i < 0.3) hypo[i]+=1; else if ($i > 0.7) hyper[i]+=1; else if (($i <= 0.7) && ($i >= 0.3)) inter[i]+=1;}}; END{for (i in hyper) print i, hypo[i], inter[i], hyper[i], miss[i], class;}' -
done < ../features/TEs/class/TEother_class.txt >> TE_class_hypo.txt

# Old "Unconfident" definition (uncertain and unknown classes, without RC).
# The file-name note is kept as a trailing comment: written as bare parentheses after
# the redirection it is a shell syntax error.
# NOTE(review): the note suggests TE_class_hypo.txt was later renamed to
# class_CpG_Meth_states.txt - confirm.
awk -v OFS='\t' '{if(($5 == "LINE?") || ($5 == "SINE?") || ($5 == "DNA?") || ($5 == "LTR?") || ($5 == "Unknown?") || ($5 == "Unknown")) print $0}' TE_CpG_Meth_new.bed \
  | cut -f8- - | sort | uniq \
  | awk -v OFS='\t' '{for (i=4;i<=NF;i++){if($i == -1) miss[i]+=1; else if ($i < 0.3) hypo[i]+=1; else if ($i > 0.7) hyper[i]+=1; else if (($i <= 0.7) && ($i >= 0.3)) inter[i]+=1;}}; END{for (i in hyper) print i, hypo[i], inter[i], hyper[i], miss[i], "Unconfident";}' - \
  >> TE_class_hypo.txt # (class_CpG_Meth_states.txt)

# Current definition, which also includes RC. This END block walks the column indices
# 4..NF instead of the keys of hyper[].
awk -v OFS='\t' '{if(($5 == "LINE?") || ($5 == "SINE?") || ($5 == "DNA?") || ($5 == "LTR?") || ($5 == "Unknown?") || ($5 == "Unknown") || ($5 == "RC")) print $0}' TE_CpG_Meth_new.bed \
  | cut -f8- - | sort | uniq \
  | awk -v OFS='\t' '{for (i=4;i<=NF;i++){if($i == -1) miss[i]+=1; else if ($i < 0.3) hypo[i]+=1; else if ($i > 0.7) hyper[i]+=1; else if (($i <= 0.7) && ($i >= 0.3)) inter[i]+=1;}}; END{for (i=4;i<=NF;i++) {print i, hypo[i], inter[i], hyper[i], miss[i], "Unconfident_RC";}}' - \
  >> class_CpG_Meth_states.txt

## Output
#/bar/epehrsson/TE_landscape/WGBS/class_CpG_Meth_states.txt
```

The total width of overlap of TE classes with DHS peaks, by sample. Intersected the merged class file with DHS peaks, then summed the total width of overlap with DHS peaks within the class. 

```{bash class DHS, eval=FALSE}
# Merged TE classes
# Intersect the merged class file with each sample's DHS peaks; the sample name is
# appended to each row as a final column
while read sample
do
  bedtools intersect -wo -a ../TE_classes/TEother_class_merge.txt -b DNase_narrow_peaks/${sample}-DNase.macs2.narrowPeak \
    | awk -v OFS='\t' -v sample=$sample '{print $0, sample}' - \
    >> rmsk_TEother_class_Dnase.txt
done < DNase_samples.txt

# Same for the Unconfident_RC rows of the merged class file only
while read sample
do
  grep "Unconfident_RC" ../features/TEs/class/TEother_class_merge.txt \
    | bedtools intersect -wo -a - -b ../raw_data/DNase/DNase_narrow_peaks/${sample}-DNase.macs2.narrowPeak \
    | awk -v OFS='\t' -v sample=$sample '{print $0, sample}' - \
    >> rmsk_TEother_class_Dnase.txt
done < ../sample_lists/DNase_samples.txt

## Output
#/bar/epehrsson/TE_landscape/DNase/rmsk_TEother_class_Dnase.txt

# Merged TE classes
# Total of column 15 (overlap width) per class (column 4) x sample (column 16)
awk -v OFS='\t' '
  {width[$4, $16] += $15}
  END {
    for (key in width) {
      split(key, part, SUBSEP)
      print part[1], part[2], width[key]
    }
  }
' rmsk_TEother_class_Dnase.txt > class_DNase_sample.txt

## Output
#/bar/epehrsson/TE_landscape/DNase/class_DNase_sample.txt
```

The total width of overlap of TE classes with H3K27ac peaks, by sample. Intersected the merged class file with H3K27ac peaks, then summed the total width of overlap with H3K27ac peaks within the class. 

```{bash class H3K27ac, eval=FALSE}
# Merged TE classes
# Intersect the merged class file with each sample's H3K27ac peaks; the sample name is
# appended to each row as a final column
while read sample
do
  bedtools intersect -wo -a ../features/TEs/class/TEother_class_merge.txt -b ../raw_data/H3K27ac/H3K27ac_narrow_peaks/${sample}-H3K27ac.narrowPeak \
    | awk -v OFS='\t' -v sample=$sample '{print $0, sample}' - \
    >> rmsk_TEother_class_H3K27ac.txt
done < ../sample_lists/H3K27ac_samples.txt

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/rmsk_TEother_class_H3K27ac.txt

# Merged TE classes
# Total of column 15 (overlap width) per class (column 4) x sample (column 16)
awk -v OFS='\t' '
  {width[$4, $16] += $15}
  END {
    for (key in width) {
      split(key, part, SUBSEP)
      print part[1], part[2], width[key]
    }
  }
' rmsk_TEother_class_H3K27ac.txt > class_H3K27ac_sample.txt

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/class_H3K27ac_sample.txt
```

### Figure 1

a. Proportion of the genome, TEs, and genic features annotated with each epigenetic state (protein-coding and non-coding), across all samples. b. Proportion of each TE class annotated with each epigenetic state, across all samples. c. Proportion of each epigenetic state within TEs, across all samples. d. Proportion of each state within TEs that is within each TE class (including duplicated bases from overlapping classes). 

```{r Figure 1, echo=FALSE}
# Run the external scripts that prepare the data plotted below (combined_proportion,
# combine_class_proportion, contribution, contribution_class, ... are not defined in this chunk)
source("R_scripts/proportion.R")
source("R_scripts/proportion_class.R") 

# Legends are suppressed with guide="none"; guide=FALSE is deprecated in current ggplot2.

# a. Proportion of each feature in each epigenetic state (coding statuses pooled), one facet per mark
a = ggplot(combined_proportion[which(combined_proportion$Coding == "All"),],aes(x=Feature,y=Proportion,fill=State)) +
  geom_bar(stat="identity") +
  xlab("Feature") +
  ylab("Proportion in state") +
  scale_fill_manual(values=all_state_colors,guide="none") +
  theme(axis.text.x = element_text(angle=90,hjust=1,vjust=0.5)) +
  scale_x_discrete(labels=c("TE","Genome",genic_labels)) +
  facet_wrap(~Mark,nrow=1,labeller=labeller(Mark = mark_labels))

# b. Proportion of each TE class in each epigenetic state; carries the state legend for panels a-c
b = ggplot(combine_class_proportion,aes(x=class,y=Proportion,fill=State)) +
  geom_bar(stat="identity") +
  xlab("TE class") +
  ylab("Proportion in state") +
  scale_fill_manual(values=all_state_colors,labels=all_state_labels) +
  theme(axis.text.x = element_text(angle=90,hjust=1,vjust=0.5),legend.position="bottom",legend.box.margin = margin(t=-12),legend.key.size = unit(3,'mm')) +
  facet_wrap(~Cohort,nrow=1,labeller=labeller(Cohort = mark_labels)) +
  guides(fill = guide_legend(ncol = 8,title.position = "top"))

# c. Proportion of each state that lies within TEs; horizontal bars on a reversed axis so that
# they mirror panel d. The vertical lines separate groups of states along the flipped axis.
c = ggplot(contribution,aes(x=State,y=Proportion,fill=State)) +
  geom_bar(stat="identity") +
  coord_flip() +
  scale_y_reverse(limits=c(1,0),expand=c(0.01,0.01)) +
  xlab("State") +
  ylab("Proportion of state overlapping TEs") +
  scale_fill_manual(values=all_state_colors,guide="none") +
  scale_x_discrete(labels=all_state_labels,limits=c(rev(states[16:19]),"CpGs",rev(states[c(1:15,20:21)]),"Bases")) +
  geom_text(aes(label=round(Proportion,2)),hjust=1.01,size=3) +
  theme(axis.title.y = element_text(margin = margin(r = -10))) +
  geom_vline(xintercept=5.5) +
  geom_vline(xintercept=4.5,linetype="dashed",color="black") +
  geom_vline(xintercept=6.5,linetype="dashed",color="grey") +
  geom_vline(xintercept=7.5,linetype="dashed",color="grey") +
  geom_vline(xintercept=22.5,linetype="dashed",color="black")

# d. Proportion of each state within TEs that falls in each TE class; same state order as panel c,
# with the y-axis labels dropped because panel c sits directly to its left
d = ggplot(contribution_class,aes(x=State,y=Proportion,fill=forcats::fct_rev(class))) +
  geom_bar(stat="identity") +
  coord_flip() +
  ylab("Proportion in each TE class") +
  theme(axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y = element_blank()) +
  scale_x_discrete(limits=c(rev(states[16:19]),"CpGs",rev(states[c(1:15,20:21)]),"Bases")) +
  scale_fill_manual(values=class_colors,guide="none") +
  geom_vline(xintercept=5.5) +
  geom_vline(xintercept=4.5,linetype="dashed",color="black") +
  geom_vline(xintercept=6.5,linetype="dashed",color="grey") +
  geom_vline(xintercept=7.5,linetype="dashed",color="grey") +
  geom_vline(xintercept=22.5,linetype="dashed",color="black") +
  scale_y_continuous(limits=c(0,1),expand=c(0.01,0.01))

# Class legend for panel d, extracted from a throwaway plot so it can be placed on its own row
legend_class = get_legend(
  ggplot(contribution_class,aes(x=State,y=Proportion,fill=class)) +
    geom_bar(stat="identity") +
    theme(axis.title.y=element_blank(),axis.ticks.y=element_blank(),axis.ticks.x=element_blank(),axis.text.y=element_blank(),legend.position="bottom",legend.direction = "horizontal",legend.key.size = unit(3,'mm')) +
    scale_fill_manual(values=class_colors,name="Class") +
    guides(fill = guide_legend(nrow = 2))
)

# Layout: a and b span the full width, c and d share the third row, class legend under d
grid.arrange(a,b,c,d,legend_class, nrow = 4,layout_matrix = rbind(c(1),c(2),c(3,4),c(NA,5)),heights=c(0.25,0.3,0.4,0.05),widths=c(0.55,0.45))
```

```{r Figure 1 source data}
# Export the data behind each Figure 1 panel as tab-separated text (no row names, no quoting)

# Panel a: coding statuses pooled ("All")
write.table(
  combined_proportion[which(combined_proportion$Coding == "All"), c("Feature", "Proportion", "State", "Mark")],
  file = "source_data/Figure_1a.txt", sep = '\t', row.names = FALSE, quote = FALSE
)

# Panel b
write.table(
  combine_class_proportion[, c("class", "Proportion", "State", "Cohort")],
  file = "source_data/Figure_1b.txt", sep = '\t', row.names = FALSE, quote = FALSE
)

# Panel c
write.table(
  contribution[, c("State", "Proportion")],
  file = "source_data/Figure_1c.txt", sep = '\t', row.names = FALSE, quote = FALSE
)

# Panel d
write.table(
  contribution_class[, c("State", "Proportion", "class")],
  file = "source_data/Figure_1d.txt", sep = '\t', row.names = FALSE, quote = FALSE
)
```

### Proportion/contribution analysis

```{r analysis Fig 1}
# Per-feature proportion in each state, pooled over all samples (coding statuses pooled)
combined_proportion[which(combined_proportion$Coding == "All"), ]

# TEs: summed proportion in the active regulatory states (chromHMM states 1-3 and 6-7)
sum(combined_proportion[which(
  combined_proportion$Coding == "All" &
    combined_proportion$Feature == "TE" &
    combined_proportion$State %in% chromHMM_states[c(1:3, 6:7)]
), ]$Proportion)

# TEs: summed proportion in the transcribed states (chromHMM states 4-5)
sum(combined_proportion[which(
  combined_proportion$Coding == "All" &
    combined_proportion$Feature == "TE" &
    combined_proportion$State %in% chromHMM_states[4:5]
), ]$Proportion)

# Promoters: summed proportion in the active regulatory states
sum(combined_proportion[which(
  combined_proportion$Coding == "All" &
    combined_proportion$Feature == "promoters" &
    combined_proportion$State %in% chromHMM_states[c(1:3, 6:7)]
), ]$Proportion)

# Exons: summed proportion in the transcribed states
sum(combined_proportion[which(
  combined_proportion$Coding == "All" &
    combined_proportion$Feature == "exons" &
    combined_proportion$State %in% chromHMM_states[4:5]
), ]$Proportion)

# Within each state, the features whose proportion exceeds that of TEs
ddply(
  combined_proportion[which(combined_proportion$Coding == "All"), ],
  .(State),
  function(x) {
    te_proportion = x[which(x$Feature == "TE"), ]$Proportion
    x[which(x$Proportion > te_proportion), c("State", "Cohort", "Feature", "Coding", "Proportion")]
  }
)

# Class x state table of the proportion of each class in each epigenetic state
dcast(combine_class_proportion, class ~ State, value.var = "Proportion")

# For each state, the class with the largest proportion, listed by class
test = ddply(combine_class_proportion, .(State), function(x) x[which.max(x$Proportion), ])
test[order(test$class), ]

# Fraction of all chromHMM-annotated bases that lie in TEs
CHROMHMM_TE_WIDTH / CHROMHMM_TOTAL_WIDTH

# CpG counts per TE class, as a share of TE CpGs and of all CpGs
rmsk_TE_class[, c("class_update", "CpGs", "Percent_TE_CpGs", "Percent_all_CpGs")]

# Share of each epigenetic state that lies within TEs, across all samples
contribution

## Same, for the composite chromHMM states
contribution_composite

# Class x state table of the share of each state within TEs that falls in each class
dcast(contribution_class, class ~ State, value.var = "Proportion")
```

### Supplementary Figure 1

Creates a matrix of length of the state in each sample, overall and within TEs, then calculates the Spearman correlation (rho and p-value) between the two lengths for each state across all samples. 

Plots the length of the state in each sample, overall vs. within TEs, colored by sample group. Plots include the correlation and a linear model of the relationship between the two values. 

```{r Figure S1, echo=FALSE}
# Plot bases in genome/TEs in each state by sample, increasing order
# One row per state x sample, with the number of bases (or CpGs) genome-wide and within TEs side by side
all_state_proportion_matrix = dcast(all_state_proportion[which(all_state_proportion$Cohort %in% c("Genome","TE")),],State+Sample+Group~Cohort,value.var="Bases")

# Spearman correlation between the genome-wide and within-TE values, per state.
# cor.test is run once per state and rho / p-value are read directly from the result,
# so both stay numeric at full precision (no round trip through character strings).
all_state_proportion_corr = ddply(all_state_proportion_matrix,.(State),function(x) {
  spearman = cor.test(x$Genome,x$TE,method="spearman")
  data.frame(Cor=unname(spearman$estimate),Pvalue=spearman$p.value)
})

# Scatter plot per state (log10 scale), coloured by sample group, with a linear fit and
# the rounded rho printed in the top-left corner of each facet
ggplot(all_state_proportion_matrix,aes(x=log10(Genome),y=log10(TE))) +
  geom_point(aes(color=Group)) +
  scale_color_manual(values=group_colors) +
  facet_wrap(~State,scales="free",ncol=6,labeller=labeller(State=all_state_labels)) +
  xlab("Bases or CpGs in genome (log10)") +
  ylab("Bases or CpGs in TEs (log10)") +
  geom_smooth(method="lm",se=FALSE,color="black") +
  geom_text(data=all_state_proportion_corr,aes(x = -Inf, y = Inf,label=round(Cor,2)),hjust=-0.1,vjust=1.5,size=3) +
  scale_y_continuous(breaks=pretty_breaks(n=3)) +
  scale_x_continuous(breaks=pretty_breaks(n=3)) +
  theme(legend.position="bottom",legend.margin=margin(0,0,0,0),legend.key.size = unit(1,'mm'))
```

```{r Figure S1 source data}
# Export the data behind Figure S1 as tab-separated text (no row names, no quoting)

# Points: genome-wide and within-TE values per sample and state
write.table(
  all_state_proportion_matrix[, c("Sample", "Genome", "TE", "Group", "State")],
  file = "source_data/Figure_S1_dot.txt", sep = '\t', row.names = FALSE, quote = FALSE
)

# Per-state correlation table
write.table(
  all_state_proportion_corr,
  file = "source_data/Figure_S1_cor.txt", sep = '\t', row.names = FALSE, quote = FALSE
)
```

### Compare TE bases to non-TE bases by sample

```{r proportion investigation}
# Summary statistics for the number of bases in each state, overall and within TEs
ddply(all_state_proportion[which(all_state_proportion$Cohort %in% c("Genome","TE")),],.(State,Cohort),summarise,Median=median(Bases),Mean=mean(Bases),SD=sd(Bases),Min=min(Bases),Max=max(Bases),CV=sd(Bases)/mean(Bases))

# Median proportion of the genome in each epigenetic state per sample
ddply(all_state_proportion[which(all_state_proportion$Cohort == "Genome"),],.(State,Cohort),summarise,Median=median(Proportion))

# Proportion of each sample's CpGs in each methylation state
dcast(all_state_proportion[which(all_state_proportion$Mark == "WGBS" & all_state_proportion$Cohort == "Genome"),c("Sample","State","Proportion")],Sample~State,value.var="Proportion")

# Median number of bases in state by sample group
test = ddply(all_state_proportion_matrix,.(State,Group),summarise,Genome_median=median(Genome))
test[order(test$State,test$Genome_median),]

# Spearman correlation by state between 1) the number of bases within non-TE regions and within TEs and 2) the number of bases in the state and the proportion within TEs
# Not MHC corrected (0.05/21 ~ 0.0024)
# cor.test is run once per state; rho and the p-value are taken directly from the result
# so that both columns are numeric (unlist() on the whole test object yields character strings)

# 1) non-TE bases (Genome - TE) vs. TE bases
ddply(all_state_proportion_matrix,.(State),function(x) {
  spearman = cor.test(x$Genome-x$TE,x$TE,method="spearman")
  data.frame(Genome_corr=unname(spearman$estimate),Genome_pvalue=spearman$p.value)
})

# 2) bases in the state vs. the proportion of the state within TEs (TE/Genome)
ddply(all_state_proportion_matrix,.(State),function(x) {
  spearman = cor.test(x$Genome,x$TE/x$Genome,method="spearman")
  data.frame(Genome_corr=unname(spearman$estimate),Genome_pvalue=spearman$p.value)
})
```

### Supplementary Figure 2

a. The proportion of the genome overlapping each feature, the proportion of TEs overlapping each feature, and the proportion of each feature within TEs, split by coding status. b. Mean proportion of each feature in each epigenetic state, across samples. c. For genic RefSeq features, the proportion in each epigenetic state across all samples, split by coding status.

```{r Figure S2, echo=FALSE}
# Run the external script that prepares the feature overlap data (feature_overlap_long is not defined in this chunk)
source("R_scripts/feature_overlap.R")

# Feature overlap
# a. One facet per overlap measure, bars dodged by coding status
a = ggplot(feature_overlap_long,aes(x=Feature,y=Percent,fill=Coding)) +
  geom_bar(position="dodge",stat="identity") +
  theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5),legend.key.size = unit(2,'mm')) +
  scale_x_discrete(labels=genic_labels) +
  scale_fill_discrete(labels=coding_labels,name="Feature") +
  facet_wrap(~Measure,nrow=1,labeller=labeller(Measure=overlap_labels)) +
  xlab("Feature") +
  ylab("Proportion")

# Average proportion by sample
# b. Mean across samples of the per-sample proportion of each feature in each state.
# The legend is suppressed with guide="none"; guide=FALSE is deprecated in current ggplot2.
b = ggplot(ddply(all_state_proportion[which(all_state_proportion$Coding == "All"),],.(Mark,State,Feature),summarise,Proportion=mean(Proportion)),aes(x=Feature,y=Proportion,fill=State)) +
  geom_bar(stat="identity") +
  xlab("Feature") +
  ylab("Proportion in state") +
  scale_fill_manual(values=all_state_colors,guide="none") +
  theme(axis.text.x = element_text(angle=90,hjust=1,vjust=0.5)) +
  scale_x_discrete(labels=c("TE","Genome",genic_labels)) +
  facet_wrap(~Mark,nrow=1,labeller=labeller(Mark = mark_labels))

# Total proportion split by protein-coding/non-coding
# c. Genic features only, one row of facets per coding status; carries the state legend
c = ggplot(combined_proportion[which(combined_proportion$Feature %in% c("promoters","5UTR","3UTR","exons","introns")),],aes(x=Feature,y=Proportion,fill=State)) +
  geom_bar(stat="identity") +
  xlab("Feature") +
  ylab("Proportion in state") +
  scale_fill_manual(values=all_state_colors,labels=all_state_labels) +
  theme(axis.text.x = element_text(angle=90,hjust=1,vjust=0.5),legend.position="bottom",legend.box.margin = margin(t=-12),legend.key.size = unit(3,'mm')) +
  scale_x_discrete(labels=c("TE","Genome",genic_labels)) +
  facet_grid(Coding~Mark,labeller=labeller(Coding=coding_labels,Mark=mark_labels)) +
  guides(fill = guide_legend(ncol = 8,title.position = "top"))

# Stack the three panels vertically
grid.arrange(a,b,c,nrow=3,heights=c(0.25,0.22,0.53))
```

```{r Figure S2 source data}
# Panel a: the columns plotted in the feature-overlap bar chart
write.table(
  feature_overlap_long[,c("Feature","Percent","Coding","Measure")],
  file="source_data/Figure_S2a.txt",
  sep='\t',
  row.names=FALSE,
  quote=FALSE
)

# Panel b: mean Proportion per Mark x State x Feature, for rows with Coding == "All"
write.table(
  ddply(
    all_state_proportion[which(all_state_proportion$Coding == "All"),],
    .(Mark,State,Feature),
    summarise,
    Proportion=mean(Proportion)
  )[,c("Feature","Proportion","State","Mark")],
  file="source_data/Figure_S2b.txt",
  sep='\t',
  row.names=FALSE,
  quote=FALSE
)

# Panel c: state proportions for the five genic features, split by coding status
write.table(
  combined_proportion[
    which(combined_proportion$Feature %in% c("promoters","5UTR","3UTR","exons","introns")),
    c("Feature","Proportion","State","Coding","Mark")
  ],
  file="source_data/Figure_S2c.txt",
  sep='\t',
  row.names=FALSE,
  quote=FALSE
)
```

### RefSeq feature analysis

```{r analysis Fig S2}
# Both objects are printed in full so their values appear in the knitted output

# Length and proportion of overlap between genic features and TEs
feature_overlap_long

# Proportion of each feature in each epigenetic state, across all samples
combined_proportion
```

### Supplementary Figure 3

a-c. Total length, number of TEs, and number of subfamilies for each TE class. d-h. Distribution of TE length (kbp), number of CpGs, CpGs/kbp, mappability, and age by class. 

```{r Figure S3, echo=FALSE}
# Panels a-c: pie charts (a single filled bar drawn in polar coordinates) of per-class totals.
# Each slice is labelled with its value and its percentage of the column total.
# guide="none" suppresses the per-panel legends (guide=FALSE is deprecated in current ggplot2);
# the shared class legend is added once in grid.arrange below.
a = ggplot(rmsk_TE_class,aes(x=factor(1),y=Total_length,fill=class_update,label=paste0(round(Total_length/1000000,1)," (",round(Total_length/sum(Total_length)*100,0),"%) "))) +
  geom_bar(stat="identity",position="fill",width=0.8) +
  coord_polar(theta="y") +
  scale_fill_manual(name="Class",values=class_colors,guide="none") +
  theme(axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.x = element_blank(),axis.text.y = element_blank(),axis.ticks.y = element_blank(),plot.title = element_text(size=8)) +
  geom_text(position = position_fill(vjust = 0.5),size=2) +
  ggtitle("Total length (Mb)") +
  scale_y_continuous()

b = ggplot(rmsk_TE_class,aes(x=factor(1),y=Count,fill=class_update,label=paste0(format(Count, big.mark=",", scientific=FALSE)," (",round(Count/sum(Count)*100,0),"%) "))) +
  geom_bar(stat="identity",position="fill",width=0.8) +
  coord_polar(theta="y") +
  scale_fill_manual(name="Class",values=class_colors,guide="none") +
  theme(axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.x = element_blank(),axis.text.y = element_blank(),axis.ticks.y = element_blank(),plot.title = element_text(size=8)) +
  geom_text(position = position_fill(vjust = 0.5),size=2) +
  ggtitle("TEs")

c = ggplot(rmsk_TE_class,aes(x=factor(1),y=Subfamilies,fill=class_update,label=paste0(format(Subfamilies, big.mark=",", scientific=FALSE)," (",round(Subfamilies/sum(Subfamilies)*100,0),"%) "))) +
  geom_bar(stat="identity",position="fill",width=0.8) +
  coord_polar(theta="y") +
  scale_fill_manual(name="Class",values=class_colors,guide="none") +
  theme(axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.x = element_blank(),axis.text.y = element_blank(),axis.ticks.y = element_blank(),plot.title = element_text(size=8)) +
  geom_text(position = position_fill(vjust = 0.5),size=2) +
  ggtitle("Subfamilies")

# Panels d-h: per-class density of each TE metric, one facet per class.
# y=..scaled.. rescales every density to a maximum of 1 so the facets are comparable.
d = ggplot(rmsk_TE,aes(x=Length/1000,y=..scaled..,fill=class_update)) +
  geom_density() +
  scale_fill_manual(values=class_colors,guide="none") +
  xlab("Length (kbp)") +
  ylab("Density") +
  facet_wrap(~class_update,nrow=1) +
  scale_y_continuous(breaks=c(0,0.5,1))

e = ggplot(rmsk_TE,aes(x=CpGs,y=..scaled..,fill=class_update)) +
  geom_density() +
  scale_fill_manual(values=class_colors,guide="none") +
  xlab("CpGs") +
  ylab("Density") +
  facet_wrap(~class_update,nrow=1) +
  scale_x_continuous(breaks=pretty_breaks(n=4)) +
  scale_y_continuous(breaks=c(0,0.5,1))

f = ggplot(rmsk_TE,aes(x=CpGs_per_length*1000,y=..scaled..,fill=class_update)) +
  geom_density() +
  scale_fill_manual(values=class_colors,guide="none") +
  xlab("CpGs per kbp") +
  ylab("Density") +
  facet_wrap(~class_update,nrow=1) +
  scale_x_continuous(breaks=pretty_breaks(n=4)) +
  scale_y_continuous(breaks=c(0,0.5,1))

g = ggplot(rmsk_TE,aes(x=mappability,y=..scaled..,fill=class_update)) +
  geom_density() +
  scale_fill_manual(values=class_colors,guide="none") +
  xlab("Mappability") +
  ylab("Density") +
  facet_wrap(~class_update,nrow=1) +
  scale_y_continuous(breaks=c(0,0.5,1)) +
  scale_x_continuous(breaks=c(0,0.5,1))

h = ggplot(rmsk_TE,aes(x=JC_distance,y=..scaled..,fill=class_update)) +
  geom_density() +
  scale_fill_manual(values=class_colors,guide="none") +
  xlab("Jukes-Cantor evolutionary distance") +
  ylab("Density") +
  facet_wrap(~class_update,nrow=1) +
  scale_x_continuous(breaks=c(0,0.5,1)) +
  scale_y_continuous(breaks=c(0,0.5,1),limits=c(0,1))

# Row 1 holds the three pies; every later row is a single grob spanning all three columns
# (rbind recycles the length-1 rows). legend_class is built outside this chunk.
grid.arrange(a,b,c,legend_class,d,e,f,g,h,nrow=7,layout_matrix=rbind(c(1,2,3),c(4),c(5),c(6),c(7),c(8),c(9)),heights=c(0.2,0.05,0.15,0.15,0.15,0.15,0.15))
```

```{r Figure S3 source data}
# Panels a-c: per-class totals shown in the pie charts
write.table(
  rmsk_TE_class[,c("class_update","Total_length","Count","Subfamilies")],
  file="source_data/Figure_S3abc.txt",
  sep='\t',
  row.names=FALSE,
  quote=FALSE
)

# Panels d-h: per-TE metrics underlying the density plots
write.table(
  rmsk_TE[,c("class_update","Length","CpGs","CpGs_per_length","mappability","JC_distance")],
  file="source_data/Figure_S3d_to_h.txt",
  sep='\t',
  row.names=FALSE,
  quote=FALSE
)
```

### TE metric stats

```{r metrics analysis, cache=TRUE, cache.lazy=FALSE}
# Kruskal-Wallis test for TE characteristics across classes, with Bonferroni correction
# The p-value is taken directly from the test object ($p.value) so it stays numeric;
# unlist() on the whole test result would coerce it to character first
p.adjust(apply(rmsk_TE[,measure_metrics],2,function(x) kruskal.test(x~rmsk_TE$class_update)$p.value),method="bonferroni")

# Summary statistics for TE characteristics, overall and by class
# NAs are dropped in the overall medians (na.rm=TRUE), matching the per-class medians below,
# which already strip NAs; otherwise any metric with a missing value reports NA overall
apply(rmsk_TE[,measure_metrics],2,median,na.rm=TRUE)
# na.action=na.pass keeps rows with a missing metric so each metric is summarised on all of its own non-missing values
aggregate(data=rmsk_TE[,c("class_update",measure_metrics)],.~class_update,function(x) median(na.omit(x)),na.action=na.pass)

# Identify classes most different from the average
# For each metric, IQR of the per-class medians divided by the median of the per-class medians, sorted ascending
# Columns 2:6 are the metric columns of the aggregate output - assumes measure_metrics has five entries; TODO confirm
sort(apply(aggregate(data=rmsk_TE[,c("class_update",measure_metrics)],.~class_update,function(x) median(na.omit(x)),na.action=na.pass)[,2:6],2,function(x) IQR(x)/median(x)))

# Max TE length by class
aggregate(data=rmsk_TE,Length~class_update,max)
```

## Potential for individual TEs to be epigenetically active

Counts the number of TEs annotated with each chromHMM state by sample. 

```{bash TE chromHMM counts, eval=FALSE}
# Number of TEs in state in sample
# Tallies input rows by the (column 8, column 10) pair and prints: column 8, column 10, row count (tab-separated)
# Presumably column 8 is the chromHMM state and column 10 the sample - confirm against the input file layout
# Output row order is unspecified (awk for-in iteration order)
awk -v OFS='\t' '{a[$8,$10]+=1}END{for(i in a){split (i, sep, SUBSEP); print sep[1], sep[2], a[i];}}' chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt > chromHMM/state_sample_counts_summit.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/state_sample_counts_summit.txt
```

Counts the number of TEs annotated with each chromHMM state in each class by sample. First, splits the matrix of TE chromHMM state assignments by sample into by-class files, then counts the number of TEs in each state and sample by file. 

```{bash class chromHMM counts, eval=FALSE}
# TEs by class
# One file per class name listed in TE_class.txt, containing the rows whose column 5 equals that name
while read line ; do awk -v OFS='\t' -v class=$line '{if($5 == class) print $0}' chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt > chromHMM/chromHMM_summit_$line\.txt; done < features/TEs/class/TE_class.txt
# Rows whose column 5 is "Other" are written to the SVA file
awk -v OFS='\t' '{if($5 == "Other") print $0}' chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt > chromHMM/chromHMM_summit_SVA.txt
# Rows whose column 5 is none of Other/DNA/LINE/LTR/SINE are written to the Other file
# NOTE(review): this overwrites any chromHMM_summit_Other.txt produced by the loop above - order matters; confirm whether TE_class.txt lists "Other"
awk -v OFS='\t' '{if(($5 != "Other") && ($5 != "DNA") && ($5 != "LINE") && ($5 != "LTR") && ($5 != "SINE")) print $0}' chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt > chromHMM/chromHMM_summit_Other.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/chromHMM_summit_[class].txt [6 files]

# By class
# For each per-class file, tallies rows by (column 8, column 10) and prints: label, column 8, column 10, count
# The label is the file basename without ".txt" (e.g. chromHMM_summit_DNA), not the bare class name
# Results are appended (>>) to a single file, so remove it before re-running to avoid duplicated rows
for file in chromHMM/chromHMM_summit_*.txt; do awk -v OFS='\t' -v class=$(basename $file .txt) '{a[$8, $10]+=1}END{for(i in a) {split (i, sep, SUBSEP); print class, sep[1], sep[2], a[i];}}' $file >> chromHMM/class_state_sample_summit.txt; done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/class_state_sample_summit.txt
```

### Figure 2

Creates dataframes of the distribution of TEs in each epigenetic state. Includes the number of TEs in each state per sample; the number/proportion of TEs in each state in each number/proportion of samples and in at least that number/proportion of samples; and tables with the proportion of TEs ever in each state and the mean proportion of samples in state. 

Dataframes are created for all samples and excluding cancer cell lines/IMR90, as well as for all TEs and by class. 

```{r Figure 2 scripts, echo=FALSE, cache=TRUE, cache.lazy=FALSE}
# Run the helper scripts for Figure 2; per the accompanying text these create the
# TE state-distribution dataframes for all TEs (potential.R) and by class (potential_class.R)
# NOTE(review): the chunk is cached, so edits to the sourced scripts may not trigger a re-run - confirm
source("R_scripts/potential.R")
source("R_scripts/potential_class.R")
```

a. Proportion of TEs in each state by sample (boxplot) and number of TEs ever in each state (dot). b. Proportion of TEs ever in each epigenetic state that belong to each TE class. c. Distribution of the number of TEs in each state at each number of samples, for those ever in the state. d. Number of TEs in each state in 100% of samples. e. in 90% of samples. Bars have a pseudocount of 1 added, but the text does not. f. Proportion of TEs in each state in 90% of samples belonging to each TE class. 

```{r Figure 2, echo=FALSE}
# All six panels share the same state axis (coord_flip + reversed state order) so rows line up across panels.
# guide="none" suppresses per-panel legends (guide=FALSE is deprecated in current ggplot2).

# Panel a: per-sample proportion of TEs in each state (boxplot) with the proportion ever in state overlaid as red points
a = ggplot(combine_boxplot,aes(x=State,y=Proportion,fill=State)) +
  geom_boxplot() +
  scale_fill_manual(values=all_state_colors,guide="none") +
  theme(axis.title.y = element_text(margin = margin(r = -10))) +
  labs(y="Proportion of TEs\nin state") +
  coord_flip() +
  scale_x_discrete(limits = rev(states),labels=rev(all_state_labels)) +
  geom_point(data=combine_stats,aes(x=State,y=Proportion_ever),color="red",size=2)

# Panel b: class composition of TEs ever in each state (class order reversed for stacking)
b = ggplot(combine_class_ever,aes(x=State,y=Proportion,fill=forcats::fct_rev(Class))) +
  geom_bar(stat="identity") +
  scale_fill_manual(values=class_colors,guide="none") +
  theme(axis.title.y=element_blank(),axis.ticks.y=element_blank(),axis.text.y = element_blank()) +
  labs(y="Proportion of TEs\never in state by class") +
  coord_flip() +
  scale_x_discrete(limits = rev(states),labels=rev(all_state_labels)) +
  scale_y_continuous(expand=c(0.01,0.01))

# Panel c: distribution of the proportion of samples in state, weighted by the number of TEs, excluding TEs never in the state
c = ggplot(combine_potential[which(combine_potential$Samples != 0),],aes(x=State,y=Sample.Proportion,weight=Count,fill=State)) +
  geom_violin(scale="width") +
  coord_flip() +
  scale_fill_manual(values=all_state_colors,guide="none") +
  theme(axis.title.y=element_blank(),axis.ticks.y=element_blank(),axis.text.y = element_blank()) +
  ylab("Proportion of samples\nin state") +
  scale_x_discrete(limits = rev(states),labels=rev(all_state_labels)) +
  geom_boxplot(width=0.2,outlier.size = 0.1) +
  scale_y_continuous(expand=c(0.01,0.01))

# Panel d: TEs in state in 100% of samples; bar height is log10(Count+1), text label is the raw Count; axis reversed so bars grow leftwards
d = ggplot(combine_potential[which(combine_potential$Sample.Proportion == 1),],aes(x=State,y=log10(Count+1),fill=State)) +
  geom_bar(stat="identity") +
  geom_text(aes(label=format(Count, big.mark=",", scientific=FALSE)),hjust=1.01,size=2.5) +
  scale_y_reverse(limits=c(8,0)) +
  coord_flip() +
  scale_x_discrete(limits = rev(states),labels=rev(all_state_labels)) +
  scale_fill_manual(values=all_state_colors,guide="none") +
  theme(axis.title.y = element_text(margin = margin(r = -10))) +
  ylab("Number of TEs in state in\n100% of samples (log10)")

# Panel e: TEs in state in at least 90% of samples (counts summed over all qualifying sample proportions)
e = ggplot(ddply(combine_potential[which(combine_potential$Sample.Proportion >= 0.9),],.(State),summarise,Count=sum(Count)),aes(x=State,y=log10(Count+1),fill=State)) +
  geom_bar(stat="identity") +
  geom_text(aes(label=format(Count, big.mark=",", scientific=FALSE)),hjust=0,size=2.5) +
  coord_flip() +
  scale_x_discrete(limits = rev(states),labels=rev(all_state_labels)) +
  scale_fill_manual(values=all_state_colors,guide="none") +
  theme(axis.title.y=element_blank(),axis.ticks.y=element_blank(),axis.text.y = element_blank()) +
  ylab("Number of TEs in state in\n90% of samples (log10)") +
  scale_y_continuous(limits=c(0,8))

# Panel f: class composition of TEs in state in at least 90% of samples
f = ggplot(ddply(combine_potential_class[which(combine_potential_class$Sample.Proportion >= 0.9),],.(State,Class),summarise,TEs.Proportion=sum(Count)/sum(Total)),aes(x=State,y=TEs.Proportion,fill=forcats::fct_rev(Class))) +
  geom_bar(stat="identity") +
  scale_fill_manual(values=class_colors,guide="none") +
  theme(axis.title.y=element_blank(),axis.ticks.y=element_blank(),axis.text.y = element_blank()) +
  labs(y="Proportion of TEs in 90%\nof samples by class") +
  coord_flip() +
  scale_x_discrete(limits = rev(states),labels=rev(all_state_labels)) +
  scale_y_continuous(expand=c(0.01,0.01))

# 2 x 3 grid of panels with the shared class legend (built outside this chunk) in the bottom-right cell
grid.arrange(a,b,c,d,e,f,legend_class, nrow = 3, layout_matrix=rbind(c(1,2,3),c(4,5,6),c(NA,NA,7)),widths=c(0.42,0.29,0.29),heights=c(0.475,0.475,0.05))
```

```{r Figure 2 source data}
# Panel a, boxplots: proportion of TEs in each state per sample
write.table(
  combine_boxplot[,c("Sample","State","Proportion")],
  file="source_data/Figure_2a_boxplot.txt",
  sep='\t',row.names=FALSE,quote=FALSE
)

# Panel a, dots: proportion of TEs ever in each state
write.table(
  combine_stats[,c("State","Proportion_ever")],
  file="source_data/Figure_2a_dots.txt",
  sep='\t',row.names=FALSE,quote=FALSE
)

# Panel b: class composition of TEs ever in each state
write.table(
  combine_class_ever[,c("State","Proportion","Class")],
  file="source_data/Figure_2b.txt",
  sep='\t',row.names=FALSE,quote=FALSE
)

# Panel c: TE counts at each non-zero proportion of samples
write.table(
  combine_potential[which(combine_potential$Samples != 0),c("State","Sample.Proportion","Count")],
  file="source_data/Figure_2c.txt",
  sep='\t',row.names=FALSE,quote=FALSE
)

# Panel d: TE counts in state in 100% of samples
write.table(
  combine_potential[which(combine_potential$Sample.Proportion == 1),c("State","Count")],
  file="source_data/Figure_2d.txt",
  sep='\t',row.names=FALSE,quote=FALSE
)

# Panel e: TE counts in state in at least 90% of samples, summed per state
write.table(
  ddply(
    combine_potential[which(combine_potential$Sample.Proportion >= 0.9),],
    .(State),
    summarise,
    Count=sum(Count)
  )[,c("State","Count")],
  file="source_data/Figure_2e.txt",
  sep='\t',row.names=FALSE,quote=FALSE
)

# Panel f: per-class proportion of TEs in state in at least 90% of samples
write.table(
  ddply(
    combine_potential_class[which(combine_potential_class$Sample.Proportion >= 0.9),],
    .(State,Class),
    summarise,
    TEs.Proportion=sum(Count)/sum(Total)
  )[,c("State","TEs.Proportion","Class")],
  file="source_data/Figure_2f.txt",
  sep='\t',row.names=FALSE,quote=FALSE
)
```

### Potential analysis

```{r analysis Fig 2a, cache=TRUE, cache.lazy=FALSE}
# Average proportion of TEs in the state per sample
ddply(combine_boxplot,.(State),summarise,
      Median=median(Proportion),
      Mean=mean(Proportion))

# Proportion of TEs ever in composite state categories
# Helper: proportion of all TEs with a non-zero value in at least one of the given columns of chromHMM_TE_state
## Input: state_columns - column indices of chromHMM_TE_state (in the calls below, state k is column k+7)
## Output: number of TEs with any value > 0 across those columns, divided by NUM_TE
prop_TEs_ever = function(state_columns){
  ever_in_any = rowSums(chromHMM_TE_state[state_columns] > 0) > 0
  return(sum(ever_in_any)/NUM_TE)
}
## Roadmap Active (states 1-8)
prop_TEs_ever(8:15)
## Roadmap Inactive (states 9-15)
prop_TEs_ever(16:22)
## Active regulatory states (states 1-3, 6-7)
prop_TEs_ever(c(8:10,13:14))
## Transcribed states (states 4-5)
prop_TEs_ever(11:12)
## Poised regulatory states (states 10-12)
prop_TEs_ever(17:19)
## Repressed states (states 9, 13-14)
prop_TEs_ever(c(16,20,21))

# Number of TEs ever in each state
ddply(combine_potential,.(State),summarise,
      Count=sum(Count[which(Samples != 0)]))

# Proportion of TEs ever in each state and mean proportion of samples in state
combine_stats

# By class
## Proportion of all TEs in each class
ddply(rmsk_TE_class,.(class_update,Count),summarise,
      TE_prop=Count/NUM_TE)

## Proportion of TEs ever in each state that belong to each class
dcast(combine_class_ever,Class~State,value.var="Proportion")

## Number/proportion of TEs in each state in 100% of samples that belong to each class
combine_potential_class[which(combine_potential_class$Sample.Proportion == 1),]

# 1st/3rd quartile and median proportion of samples in state for TEs ever in each state
# Each Sample.Proportion value is repeated Count times so the statistics are per TE
ddply(combine_potential[which(combine_potential$Samples != 0),],.(State),summarise,
      first=quantile(rep(Sample.Proportion,times=Count),0.25),
      Median=median(rep(Sample.Proportion,times=Count)),
      third=quantile(rep(Sample.Proportion,times=Count),0.75))

# Median proportion of samples in state for TEs ever in each state, by class
dcast(
  ddply(combine_potential_class[which(combine_potential_class$Samples != 0),],.(State,Class),summarise,
        Median=median(rep(Sample.Proportion,times=Count))),
  Class~State,
  value.var="Median"
)
```

### TEs in each state in x% of samples

Calculates the number of TEs in each state in at least that proportion of samples (100%, 90%, 75%, 50%, 25%).

```{r potential percent}
# Per state, total TE count at exactly 100% of samples and at or above each lower threshold
ddply(
  combine_potential,
  .(State),
  summarise,
  In_100=sum(Count[which(Sample.Proportion == 1)]),
  In_90=sum(Count[which(Sample.Proportion >= 0.9)]),
  In_75=sum(Count[which(Sample.Proportion >= 0.75)]),
  In_50=sum(Count[which(Sample.Proportion >= 0.5)]),
  In_25=sum(Count[which(Sample.Proportion >= 0.25)])
)
```

### TEs always in single state

Analysis for TEs that are in each epigenetic state in 100% (or 90%) of samples.

```{r always TEs}
# Dataframe of TEs in each epigenetic state in 100% of samples with data for that technique (TE coordinates and state only)
# Built as a named list (one element per state) and then stacked with the list name as the State column
always_TEs = c(
  setNames(
    lapply(chromHMM_states,function(state){
      in_all = which(chromHMM_TE_state[[state]] == sample_counts["All","chromHMM"])
      chromHMM_TE_state[in_all,TE_coordinates]
    }),
    chromHMM_states),
  setNames(
    lapply(meth_states,function(state){
      in_all = which(TE_meth_average[[state]] == sample_counts["All","WGBS"])
      TE_meth_average[in_all,TE_coordinates]
    }),
    meth_states)
)
always_TEs[["DNase"]] = TE_DNase_peaks[which(TE_DNase_peaks$Samples == sample_counts["All","DNase"]),TE_coordinates]
always_TEs[["H3K27ac"]] = TE_H3K27ac_peaks[which(TE_H3K27ac_peaks$Samples == sample_counts["All","H3K27ac"]),TE_coordinates]
always_TEs[["RNA"]] = RNA_TE[which(RNA_TE$Expressed_samples == sample_counts["All","RNA"]),TE_coordinates]
always_TEs = ldply(always_TEs,.id="State")

# Dataframe of chrY TEs in each epigenetic state in 100% of samples with data for that technique, excluding samples without chrY
# Same construction, restricted to chrY rows and compared against the "chrY" sample counts; methylation states are not included
always_TEs_noY = setNames(
  lapply(chromHMM_states,function(state){
    in_all_chrY = which(chromHMM_TE_state$chromosome == "chrY" & chromHMM_TE_state[[state]] == sample_counts["chrY","chromHMM"])
    chromHMM_TE_state[in_all_chrY,TE_coordinates]
  }),
  chromHMM_states)
always_TEs_noY[["DNase"]] = TE_DNase_peaks[which(TE_DNase_peaks$chromosome == "chrY" & TE_DNase_peaks$Samples == sample_counts["chrY","DNase"]),TE_coordinates]
always_TEs_noY[["H3K27ac"]] = TE_H3K27ac_peaks[which(TE_H3K27ac_peaks$chromosome == "chrY" & TE_H3K27ac_peaks$Samples == sample_counts["chrY","H3K27ac"]),TE_coordinates]
always_TEs_noY[["RNA"]] = RNA_TE[which(RNA_TE$chromosome == "chrY" & RNA_TE$Expressed_samples == sample_counts["chrY","RNA"]),TE_coordinates]
always_TEs_noY = ldply(always_TEs_noY,.id="State")

# Number/proportion of TEs in each state in all samples, excluding TEs that do not overlap CpGs for the methylation states
ddply(always_TEs,.(State),summarise,
      Count=length(chromosome),
      Proportion=length(chromosome)/NUM_TE,
      Proportion_CpG=length(chromosome)/NUM_TE_WGBS)
ddply(always_TEs_noY,.(State),summarise,
      Count=length(chromosome),
      Proportion=length(chromosome)/NUM_TE,
      Proportion_CpG=length(chromosome)/NUM_TE_WGBS)

# Number of TEs in a chromHMM state in all samples
# Columns 2:8 follow the State column added by ldply; duplicates across states are collapsed by unique()
dim(unique(always_TEs[which(always_TEs$State %in% chromHMM_states),2:8]))

# Length and genomic location of TEs in an epigenetic state in all samples (overlap with genic features)
always_TEs_location = merge(
  always_TEs,
  rmsk_TE[,c(TE_coordinates,"class_update","Length",cohorts)],
  by=TE_coordinates,
  all.x=TRUE
)

## Number of TEs in each epigenetic state in all samples overlapping each genic feature
## (counted as non-NA entries in each cohort column)
ddply(always_TEs_location,.(State),function(state_TEs){
  apply(state_TEs[,cohorts],2,function(feature) sum(!is.na(feature)))
})

## Number of TEs RPKM > 1 in all samples that do not overlap RefSeq genes or promoters
dim(always_TEs_location[which(
  always_TEs_location$State == "RNA" &
    is.na(always_TEs_location$exons) &
    is.na(always_TEs_location$introns) &
    is.na(always_TEs_location$promoters)
),])

# Number of TEs in each epigenetic state in all samples that belong to each TE class
ddply(always_TEs_location,.(State,class_update),summarise,Count=length(chromosome))

# Write out TEs in each epigenetic state in 100% of samples
write.table(
  always_TEs[,c(TE_coordinates,"State")],
  file="always_TEs_true.bed",
  sep='\t',row.names=FALSE,quote=FALSE,col.names = FALSE
)

# Dataframe of TEs in each epigenetic state in at least 90% of samples with data for that technique (TE coordinates and state only)
always_TEs_90 = c(
  setNames(
    lapply(chromHMM_states,function(state){
      in_90 = which(chromHMM_TE_state[[state]] >= sample_counts["All","chromHMM"]*.9)
      chromHMM_TE_state[in_90,TE_coordinates]
    }),
    chromHMM_states),
  setNames(
    lapply(meth_states,function(state){
      in_90 = which(TE_meth_average[[state]] >= sample_counts["All","WGBS"]*.9)
      TE_meth_average[in_90,TE_coordinates]
    }),
    meth_states)
)
always_TEs_90[["DNase"]] = TE_DNase_peaks[which(TE_DNase_peaks$Samples >= sample_counts["All","DNase"]*.9),TE_coordinates]
always_TEs_90[["H3K27ac"]] = TE_H3K27ac_peaks[which(TE_H3K27ac_peaks$Samples >= sample_counts["All","H3K27ac"]*.9),TE_coordinates]
always_TEs_90[["RNA"]] = RNA_TE[which(RNA_TE$Expressed_samples >= sample_counts["All","RNA"]*.9),TE_coordinates]
always_TEs_90 = ldply(always_TEs_90,.id="State")

# Write out TEs in each epigenetic state in at least 90% of samples
write.table(
  always_TEs_90[,c(TE_coordinates,"State")],
  file="always_TEs_90_true.bed",
  sep='\t',row.names=FALSE,quote=FALSE,col.names = FALSE
)

# Proportion of TEs in each epigenetic state in at least 90% of samples that belong to each TE class
dcast(
  ddply(combine_potential_class[which(combine_potential_class$Sample.Proportion >= 0.9),],.(State,Class),summarise,
        TEs.Proportion=sum(Count)/sum(Total)),
  State~Class,
  value.var="TEs.Proportion"
)
```

For TEs in each state in 100% of samples, identifies the closest RefSeq gene. Reports all ties. Also counts TEs overlapping a DHS peak summit in all samples that are >50kb from the nearest RefSeq gene. 

```{bash investigate always TEs, eval=FALSE}
# TEs always in a state

# Find the nearest gene
# Both inputs are sorted by chromosome and start, as bedtools closest requires
# -D b appends a signed distance column relative to the gene's orientation; -t all reports every tied closest gene
sort -k1,1 -k2,2n always_TEs_true.bed > always_TEs_true_sorted.bed
sort -k1,1 -k2,2n ~/genic_features/RefSeq/refseq_genes.txt | bedtools closest -a always_TEs_true_sorted.bed -b - -D b -t all > always_TEs_true_genes.txt

## Input
#/bar/epehrsson/TE_landscape/always_TEs_true.bed

## Output
#/bar/epehrsson/TE_landscape/always_TEs_true_sorted.bed
#/bar/epehrsson/TE_landscape/always_TEs_true_genes.txt

# TEs always overlapping a DHS peak summit that are >50kb from the nearest RefSeq gene
# Column 8 is matched against the state label and column 21 is used as the signed distance
# (presumably the last column added by bedtools closest - confirm); cut -f1-7 keeps the leading seven columns before de-duplicating ties
## And do not overlap a GENCODE v19 comprehensive gene
awk -v FS='\t' '{if(($8 == "DNase") && (($21 > 50000)||($21 < -50000))) print $0}' always_TEs_true_genes.txt | bedtools intersect -v -a - -b ~/genic_features/Gencode/v19/GENCODE_v19_comp_genes.bed | cut -f1-7 | sort | uniq

## And do overlap a GENCODE v19 comprehensive gene
awk -v FS='\t' '{if(($8 == "DNase") && (($21 > 50000)||($21 < -50000))) print $0}' always_TEs_true_genes.txt | bedtools intersect -wo -a - -b ~/genic_features/Gencode/v19/GENCODE_v19_comp_genes.bed | cut -f1-7 | sort | uniq

# Number of TEs always overlapping a DHS peak that are intergenic
# -v keeps only entries with no overlap in the RefSeq gene file
# NOTE(review): grep matches "DNase" anywhere on the line, not only in the state column
grep "DNase" always_TEs_true.bed | bedtools intersect -v -a - -b ~/genic_features/RefSeq/refseq_genes.txt  | wc -l
```

### Background proportion of the genome in an active regulatory state in any Roadmap sample

From the 200bp window-chromHMM intersections, pulled out windows in the five active regulatory states. Then, found the number of unique windows that are ever in one of those states.

```{bash window potential, eval=FALSE}
# 200bp windows in an active regulatory state
# For each sample mnemonic, keeps rows whose column 4 is one of the five active regulatory states
# and prints columns 5-7, the state, and the sample name
# (columns 5-7 are presumably the window coordinates - confirm against the windows_*.bed layout)
# Output is appended (>>), so remove windows_active_reg.bed before re-running to avoid duplicated rows
while read line; do awk -v OFS='\t' -v sample=$line '{if(($4 == "1_TssA") || ($4 == "2_TssAFlnk") || ($4 == "3_TxFlnk") || ($4 == "6_EnhG") || ($4 == "7_Enh")) print $5, $6, $7, $4, sample}' chromHMM/genome/windows/windows_$line\.bed >> chromHMM/genome/windows/windows_active_reg.bed; done < sample_lists/mnemonics.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/genome/windows/windows_active_reg.bed

# Number of unique windows that are ever in an active regulatory state
# De-duplicates on the first three columns only, so a window is counted once across states and samples
cut -f1-3 chromHMM/genome/windows/windows_active_reg.bed | sort | uniq | wc -l
```

From the files listing the number of samples in which each shuffled TE is annotated with each chromHMM state (10 iterations), found the number of shuffled TEs that are in an active regulatory state in at least one sample.  

```{bash shuffled potential, eval=FALSE}
# Number of shuffled TEs per iteration that are ever in an active regulatory state
# tail -n +2 skips the first (header) line; a row is counted when any of columns 8-10, 13 or 14 is non-zero
# (presumably the per-state sample counts for states 1-3 and 6-7, matching the column indices used in the R analysis - confirm)
# Prints one count per iteration to stdout
for i in {1..10}; do tail -n +2 chromHMM/shuffled_TEs/rmsk_TE_shuffle_$i\_chromHMM_potential.txt | awk '{if(($8 > 0) || ($9 > 0) || ($10 > 0) || ($13 > 0) || ($14 > 0)) print $0}' | wc -l; done
```

### Supplementary Figure 4 

**Block/window**

Identified TEs that overlap the center of chromHMM annotation blocks. From the intersection files between TEs and chromHMM states, filtered to only TEs that overlap the center of the chromHMM annotation. Then, summed the total length of overlap between the TE and the state in that sample. 

Counted the number of TEs in each state per sample and the number of samples each TE is in each chromHMM state using these annotation rules. 

```{bash block control, eval=FALSE}
# All files for this analysis live under chromHMM/block/ (the directory given in the Output notes
# and read by the downstream QC commands), so each step reads exactly what the previous step wrote.

# Filtered TE x chromHMM state intersections to only those where the TE overlaps the center of the block
# mid is the midpoint of the interval in columns 9-10; a row is kept when mid falls within columns 2-3,
# and the sample name is appended as a final column
# Output is appended (>>) by both loops, so remove rmsk_TE_block_summit.txt before re-running
while read line; do awk -v OFS='\t' -v sample=$line '{mid=$9+(($10-$9)/2)-0.5;if((mid >= $2) && (mid < $3)) print $0, sample}' chromHMM/TEs/intersect/$line\_15_coreMarks_mnemonics.bed_TE >> chromHMM/block/rmsk_TE_block_summit.txt ; done < sample_lists/mnemonics.txt
while read line; do awk -v OFS='\t' -v sample=$line '{mid=$9+(($10-$9)/2)-0.5;if((mid >= $2) && (mid < $3)) print $0, sample}' chromHMM/TEs/intersect/$line\_15_coreMarks_mnemonics.bed_other >> chromHMM/block/rmsk_TE_block_summit.txt ; done < sample_lists/mnemonics.txt

# Unique
# Sums column 12 per unique (columns 1-7, column 11, column 13) combination, then sorts;
# the output has columns 1-7, column 11, column 13, and the summed value
awk -v OFS='\t' '{a[$1, $2, $3, $4, $5, $6, $7, $11, $13]+=$12}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sep[8], sep[9], a[i]}}' chromHMM/block/rmsk_TE_block_summit.txt | sort -k1,1V -k2,2n -k3,3n -k4,4 -k9,9 - > chromHMM/block/rmsk_TE_block_summit_sorted.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/block/rmsk_TE_block_summit_sorted.txt

# Number of TEs in each state per sample
# Tallies rows of the sorted file by (column 8, column 9)
awk -v OFS='\t' '{a[$8, $9]+=1}END{for(i in a) {split(i, sep, SUBSEP); print sep[1], sep[2], a[i]}}' chromHMM/block/rmsk_TE_block_summit_sorted.txt > chromHMM/block/rmsk_TE_block_summit_counts.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/block/rmsk_TE_block_summit_counts.txt

# Number of samples each TE is in each state
# Tallies rows of the sorted file by (columns 1-7, column 8)
awk -v OFS='\t' '{a[$1, $2, $3, $4, $5, $6, $7, $8]+=1}END{for(i in a) {split(i, sep, SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sep[8], a[i]}}' chromHMM/block/rmsk_TE_block_summit_sorted.txt > chromHMM/block/rmsk_TE_block_potential.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/block/rmsk_TE_block_potential.txt
```

As QC, found the number of TEs annotated with each state that are identified using block center overlap, but not the standard analysis, and compared it to the total number identified using block center overlap. First, separated the TEs that overlap the center of chromHMM annotation blocks into files by subfamily and state, then found those that were not also identified using the standard annotation rules. No subfamily x state combination had no entry using the alternative annotation rules. Then, counted the number of TEs identified using the block center overlap rules and the number that are exclusively identified using those rules, by state. The proportion is small, usually <1%. 

```{bash block QC, eval=FALSE}
# TEs overlapping a block summit that are not counted in the original analysis
# Splits the sorted block file into one file per (column 4, column 8) pair, reordering the columns to 1-7, 9, 10, 8
# The state "8_ZNF/Rpts" contains a slash, so its files are named with "8_ZNF.Rpts" instead
# NOTE(review): input and output paths here are relative to the current directory, while the commands
# below read chromHMM/block/ - presumably this line was run from inside chromHMM/block/; confirm
awk -v OFS='\t' '{if($8 != "8_ZNF/Rpts") print $1, $2, $3, $4, $5, $6, $7, $9, $10, $8 > $4"_"$8".txt"; else print $1, $2, $3, $4, $5, $6, $7, $9, $10, $8 > $4"_8_ZNF.Rpts.txt"}' rmsk_TE_block_summit_sorted.txt

# comm -23 keeps lines found only in the block-overlap file (absent from the first 10 columns of the standard by-state file)
# The echo goes to stdout as a progress message; only the comm output is appended to block_exclusive.txt
while read state; do while read subfam; do echo $subfam $state; comm -23 <(sort chromHMM/block/$subfam\_$state\.txt) <(cut -f1-10 chromHMM/subfamily/by_state/$subfam\_$state\.txt | sort) >> chromHMM/block/block_exclusive.txt; done < features/TEs/subfamily/subfamilies.txt; done < chromHMM/chromHMM_states.txt
while read subfam; do echo $subfam; comm -23 <(sort chromHMM/block/$subfam\_8_ZNF.Rpts.txt) <(cut -f1-10 chromHMM/subfamily/by_state/$subfam\_8_ZNF.Rpts.txt | sort) >> chromHMM/block/block_exclusive.txt; done < features/TEs/subfamily/subfamilies.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/block/block_exclusive.txt

# Subfamily x state combinations present using block overlap but not normally
# Reports a combination when the standard by-state file is missing but the block file exists
# The first loop appends its results to not_all.txt; the 8_ZNF.Rpts loop prints to stdout only
while read state; do while read subfam; do if [ ! -f chromHMM/subfamily/by_state/$subfam\_$state\.txt ]; then if [ -f chromHMM/block/$subfam\_$state\.txt ]; then echo $subfam $state; fi; fi; done < features/TEs/subfamily/subfamilies.txt; done < chromHMM/chromHMM_states.txt >> chromHMM/block/not_all.txt
while read subfam; do if [ ! -f chromHMM/subfamily/by_state/$subfam\_8_ZNF.Rpts.txt ]; then if [ -f chromHMM/block/$subfam\_8_ZNF.Rpts.txt ]; then echo $subfam; fi; fi; done < features/TEs/subfamily/subfamilies.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/block/[subfam]_[state].txt [13539 files]

# Number of TEs per state identified using block overlap but not normally vs. total
# The state is column 10 in block_exclusive.txt (after the column reordering above) and column 8 in the sorted block file
awk -v OFS='\t' '{a[$10]+=1}END{for(i in a){print i, a[i]}}' chromHMM/block/block_exclusive.txt
awk -v OFS='\t' '{a[$8]+=1}END{for(i in a){print i, a[i]}}' chromHMM/block/rmsk_TE_block_summit_sorted.txt
```

As another alternative annotation rule, counted the number of TEs that overlap 200bp window centers annotated with each state in each sample.

```{bash summit control, eval=FALSE}
# TEs per sample, restricted to those overlapping 200bp window centers
# Only rows whose column 11 equals "summit" are tallied, by the (column 8, column 10) pair;
# prints column 8, column 10, and the row count
awk -v OFS='\t' '{if($11 == "summit") a[$8,$10]+=1}END{for(i in a){split (i, sep, SUBSEP); print sep[1], sep[2], a[i];}}' chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt > chromHMM/state_sample_counts_summit_only.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/state_sample_counts_summit_only.txt
```

Compares the number of TEs in each state per sample and the number of TEs ever in each epigenetic state using the original annotation rules (center of a 200bp bin or majority of the TE) to the results using the two alternative annotation rules. 

```{r block analysis}
# The sourced script presumably creates state_sample_count_rules, potential_ever,
# block_potential and summit_potential used below - confirm in R_scripts/chromHMM_block_potential.R
source("R_scripts/chromHMM_block_potential.R")

# The median ratio of the number of TEs in each state by sample using the alternative vs. original annotation rules
# Missing ratios are ignored; rows are shown in increasing order of the median
test2 = ddply(state_sample_count_rules,.(State,Category),summarise,
              Median=median(Ratio,na.rm=TRUE))
test2[order(test2$Median),]

# The ratio of the number of TEs ever in each state using the alternative vs. original annotation rules
potential_ever[order(potential_ever$Ratio),]

# Number of TEs in an active regulatory state in any sample
## Requiring overlap with block centers
## Unique TE coordinates among rows in states 1-3 and 6-7, as a proportion of all TEs
nrow(unique(block_potential[which(block_potential$State %in% chromHMM_states[c(1:3,6:7)]),TE_coordinates]))/NUM_TE

## Requiring overlapping with 200bp bin centers
## TEs with a non-zero value in any of columns 8-10, 13-14, as a proportion of all TEs
sum(rowSums(summit_potential[c(8:10,13:14)] > 0) > 0)/NUM_TE
```

a. Ratio of the number of TEs in each state per sample using two alternative annotation rules to the results with the original annotation rules. X-axis is in increasing order of median block size. Dashed line is proportion of all TEs overlapping 200bp bin centers. b. Ratio of the number of TEs ever in each state using the alternative vs. original annotation rules. 

```{r Figure S4, echo=FALSE, fig.height=5}
# State order shared by both panels and the legend plot (per the figure legend, increasing median block size);
# defined once so the three plots cannot drift apart
S4_state_order = chromHMM_states[c(2,10:12,3,6:8,1,13,9,4:5,14:15)]

# Panel a: per-sample ratio of TEs in state, alternative vs. original rules, one box per rule
# guide="none" suppresses the legend (guide=FALSE is deprecated in current ggplot2)
a = ggplot(state_sample_count_rules,aes(x=State,y=Ratio,fill=Category)) + geom_boxplot() + ylim(0,1) + 
  ylab("TEs in state per sample\n(proportion of Fig. 2a TEs)") + scale_x_discrete(limits=rev(S4_state_order)) + 
  geom_hline(yintercept = 0.82,linetype="dashed") + scale_fill_discrete(guide="none") + coord_flip()

# Panel b: ratio of TEs ever in state, alternative vs. original rules; shares panel a's state axis so its axis text is hidden
b = ggplot(potential_ever,aes(x=State,y=Ratio,color=Category)) + geom_point(size=4) + ylim(0,1) +
  theme(axis.text.y = element_blank(),axis.title.y = element_blank(),axis.ticks.y = element_blank()) + 
  ylab("TEs ever in state\n(proportion of Fig. 2a TEs)") + scale_x_discrete(limits=rev(S4_state_order)) + 
  geom_hline(yintercept = 0.82,linetype="dashed") + scale_color_discrete(labels=setNames(c("Center of chromHMM block","Center of 200bp bin"),c("Block","Summit")),guide="none") + coord_flip()

# Legend only: the same plot as panel b with its legend enabled at the bottom, from which the legend grob is extracted
category_legend = get_legend(ggplot(potential_ever,aes(x=State,y=Ratio,color=Category)) + geom_point(size=4) + ylim(0,1) +
  theme(axis.text.x = element_text(angle=90,hjust=1,vjust=0.5),legend.position = "bottom") + 
  ylab("TEs ever in state\n(proportion of Fig. 2a TEs)") + scale_x_discrete(limits=S4_state_order) + 
  geom_hline(yintercept = 0.82,linetype="dashed") + scale_color_discrete(labels=setNames(c("Center of chromHMM block","Center of 200bp bin"),c("Block","Summit")),name="Overlap") + coord_flip())

# Panels side by side, with the legend spanning both columns in the second row
grid.arrange(a,b,category_legend,nrow=2,layout_matrix=rbind(c(1,2),c(3)),heights=c(0.9,0.1),widths=c(0.55,0.45))
```

```{r Figure S4 source data}
# Source data for Supplementary Figure 4, one tab-delimited file per panel
write.table(
  state_sample_count_rules[, c("Sample", "State", "Ratio", "Category")],
  file = "source_data/Figure_S4a.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
write.table(
  potential_ever[, c("State", "Ratio", "Category")],
  file = "source_data/Figure_S4b.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
```

### Supplementary Figure 5

**18-state model**

The Roadmap project annotated 98 samples with the [18-state model](https://egg2.wustl.edu/roadmap/web_portal/chr_state_learning.html#exp_18state), which includes H3K27ac. I assigned TEs to chromHMM states using this model with the same annotation rules used for the 15-state model. 

First, intersected the 200bp windows with the chromHMM annotations and converted to bed format. 15,478,375 windows have annotations for most samples, 15,181,508 for the samples without chrY, and the annotations do not go to the end of the chromosomes (the same as for the 15-state model). Then, intersected TEs that overlap the center of 200bp windows with the window bedfiles, filtered to intersections that overlap the center of 200bp windows, and summed the overlap of the TE with the state in that sample. For TEs that do not overlap the center of 200bp windows, intersected with the chromHMM annotation bedfiles and found the state that covers the majority of the TE. Combined both sets of TEs and sorted by chromosome.

```{bash 18 state, eval=FALSE}
# Intersect 200bp windows with chromHMM, 18-state model
# One output bed file per sample: columns 5-7 and 4 of the intersection
# (presumably the window coordinates followed by the state), sorted by chromosome and start
while read line; do bedtools intersect -wo -a raw_data/chromHMM_18state/$line\_18_core_K27ac_mnemonics.bed -b features/hg19_standard.windows | awk -v OFS='\t' '{print $5, $6, $7, $4}' - | sort -k1,1 -k2,2n - > chromHMM/genome/windows/windows_18state_$line\.bed; done < sample_lists/H3K27ac_samples.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/genome/windows/windows_18state_E#.bed [98 files]

# TEs overlapping the center of 200bp windows ("summit")
while read line; do bedtools intersect -wo -sorted -a features/TEs/rmsk_TEother_summit.txt -b chromHMM/genome/windows/windows_18state_$line\.bed > /scratch/ecp/TE_landscape/state18/rmsk_TEother_summit_$line\.txt; done < sample_lists/H3K27ac_samples.txt
# NOTE(review): the remaining commands use relative file names, so they appear to be run
# from /scratch/ecp/TE_landscape/state18 - confirm
# Keep only intersections where the window center (window start + 99.5) falls within the TE,
# then sum the overlap (column 12) per TE (columns 1-7) and state (column 11).
# The result is written to a file without the .txt extension and renamed on the next line.
for file in rmsk_TEother_summit_E*.txt; do awk -v OFS='\t' '{if(($9+99.5 >= $2) && ($9+99.5 < $3)) a[$1, $2, $3, $4, $5, $6, $7, $11]+=$12}END{for(i in a) {split(i, sep, SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sep[8], a[i]}}' $file > $(basename $file .txt); done
for file in rmsk_TEother_summit_E[0-9][0-9][0-9] ; do mv $file $file\.txt; done

# TEs that do not ("majority")
while read line; do bedtools intersect -wo -a features/TEs/rmsk_TEother_majority.txt -b raw_data/chromHMM_18state/$line\_18_core_K27ac_mnemonics.bed > /scratch/ecp/TE_landscape/state18/rmsk_TEother_majority_$line\.txt; done < sample_lists/H3K27ac_samples.txt
# NOTE(review): pick_majority.py presumably keeps the state covering most of each TE;
# the meaning of the trailing arguments (10 11) should be confirmed against the script
for file in rmsk_TEother_majority_*.txt; do python ~/bin/TE_landscape/pick_majority.py $file $(basename $file .txt) 10 11; done
for file in rmsk_TEother_majority_E[0-9][0-9][0-9]; do mv $file $file\.txt; done

# Combine and sort
# Append the sample ID and the rule ("summit"/"majority") to every row and pool all samples.
# Runs in the foreground: the split below must not start before the combined file is complete.
while read line; do awk -v OFS='\t' -v sample=$line '{print $0, sample, "summit"}' rmsk_TEother_summit_$line\.txt >> rmsk_TEother_18state.txt; awk -v OFS='\t' -v sample=$line '{print $0, sample, "majority"}' rmsk_TEother_majority_$line\.txt >> rmsk_TEother_18state.txt; done < ~/TE_landscape/sample_lists/H3K27ac_samples.txt
# Split the pooled file into one file per value of column 1 (chromosome), then sort each
# one and concatenate them in the order listed in chromosomes.txt
awk '{print>$1}' rmsk_TEother_18state.txt
while read line; do sort -k1,1V -k2,2n -k3,3n -k4,4 -k10,10 $line >> rmsk_TEother_18state_sorted.txt; rm $line; done < chromosomes.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/rmsk_TEother_18state_sorted.txt
```

For TEs assigned to 18-state model chromHMM states, found the number of samples in which each TE is annotated with each state and the number of TEs annotated with the state per sample. 

```{bash 18 state potential, eval=FALSE}
# List of states in 18-state model
#/bar/epehrsson/TE_landscape/sample_lists/chromHMM_18_states.txt

# Potential
# Inputs, in order: sorted TE x state x sample file, TE list, state list, sample list, output file.
# NOTE(review): the trailing arguments (0 9 7 8) are presumably column indices; confirm against potential.py
python ~/bin/TE_landscape/potential.py chromHMM/rmsk_TEother_18state_sorted.txt features/TEs/rmsk_TEother.txt sample_lists/chromHMM_18_states.txt sample_lists/H3K27ac_samples.txt chromHMM/potential/rmsk_TEother_18state_potential.txt 0 9 7 8

## Output
#/bar/epehrsson/TE_landscape/chromHMM/potential/rmsk_TEother_18state_potential.txt

# TEs in state per sample
# Counts the rows for every combination of column 8 (state) and column 10 (sample).
# Output columns: state, sample, count; row order is unspecified (awk for-in loop).
awk -v OFS='\t' '{a[$8,$10]+=1}END{for(i in a){split (i, sep, SUBSEP); print sep[1], sep[2], a[i];}}' chromHMM/rmsk_TEother_18state_sorted.txt > chromHMM/state_sample_counts_18state.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/state_sample_counts_18state.txt
```

Creates dataframes of the number of TEs in each 18-state model chromHMM state in each number of samples, as well as the number of TEs in each state in at least one sample. 

```{r 18 state potential load}
# Per-TE table of the number of samples in which the TE carries each 18-state model state;
# columns 8-25 are renamed to the state names
chromHMM_18state = read.table("chromHMM/potential/rmsk_TEother_18state_potential.txt",
                              sep='\t', header=TRUE, quote="")
colnames(chromHMM_18state)[8:25] = names(chromHMM_states_18)

# Number/proportion of TEs in each chromHMM state for each number of samples (0-98),
# in wide form (one column per state) and in long form with the sample count rescaled
# by (number of rows per state - 1)
chromHMM_18state_dist = sample_distribution(chromHMM_18state, 8:25, sample_counts["All","H3K27ac"])
chromHMM_18state_potential = ddply(
  melt(chromHMM_18state_dist, id.vars="Samples", variable.name="State", value.name="Count"),
  .(State), transform,
  Sample.Proportion = Samples/(length(Samples)-1)
)

# Proportion of TEs ever in each state and mean/SE proportion of samples in state,
# for all TEs and those ever in the state
chromHMM_18state_stats = potential_stats(chromHMM_18state_dist, 18, sample_counts["All","H3K27ac"])
chromHMM_18state_stats$State = factor(rownames(chromHMM_18state_stats),
                                      levels=names(chromHMM_states_18))

# Number/proportion of TEs in each chromHMM state by sample;
# the denominator depends on whether the sample is marked as having chrY in the metadata
state_sample_18state = read.table("chromHMM/state_sample_counts_18state.txt",
                                  sep='\t', quote="", col.names=c("State","Sample","Count"))
state_sample_18state$Count = as.numeric(state_sample_18state$Count)
state_sample_18state$Proportion = state_sample_18state$Count /
  ifelse(metadata$chrY[match(state_sample_18state$Sample, metadata$Sample)] == "Yes",
         NUM_TE, NUM_TE_noY)
state_sample_18state$State = factor(state_sample_18state$State,
                                    levels=names(chromHMM_states_18))
```

```{r 18 state potential analysis, cache=TRUE, cache.lazy=FALSE}
# Per state, the median over samples of the proportion of TEs in that state
ddply(state_sample_18state, .(State), summarise, Median = median(Proportion))

# Proportion of TEs ever in composite state categories
# (a TE counts when any of the category's state columns is above zero)
## Active regulatory states (states 1-4, 7-11)
sum(rowSums(chromHMM_18state[c(8:11, 14:18)] > 0) > 0) / NUM_TE
## Transcribed states (states 5-6)
sum(rowSums(chromHMM_18state[12:13] > 0) > 0) / NUM_TE
## Poised regulatory states (states 14-15)
sum(rowSums(chromHMM_18state[21:22] > 0) > 0) / NUM_TE
## Repressed states (states 13, 16-17)
sum(rowSums(chromHMM_18state[c(20, 23:24)] > 0) > 0) / NUM_TE

# Number of TEs ever in each state (total over all non-zero sample counts)
ddply(chromHMM_18state_potential, .(State), summarise,
      Count = sum(Count[which(Samples != 0)]))

# Proportion of TEs ever in each state and mean proportion of samples in state
chromHMM_18state_stats
```

**50-state models**

The Roadmap Epigenomics Project also generated independent 50-state chromHMM models for 7 reference epigenomes that were broadly profiled: E003 through E008 and E017. They are H1 and H9 cells (ESCs), H1 derived cells (ES-derived), and IMR90. Each sample had an average of 24 epigenetic marks profiled, and all available marks were included in the model for each state.

Each of the files has chromosomes 1-22, X, Y, and M, and the windows do not go to the edge of the chromosome, as in the 15-state and 18-state models. I assigned TEs to chromHMM states using this model with the same annotation rules used for the 15-state model.

```{bash 50-state chromHMM, eval=FALSE}
# Intersect 200bp windows with 50-state chromHMM annotations
# One output bed file per sample: columns 5-7 and 4 of the intersection
# (presumably the window coordinates followed by the state), sorted by chromosome and start
for file in raw_data/chromHMM_50state/E0*_50_segments.bed; do bedtools intersect -wo -a $file -b features/hg19_standard.windows | awk -v OFS='\t' '{print $5, $6, $7, $4}' - | sort -k1,1 -k2,2n - >chromHMM/genome/windows/windows_50state_$(basename $file _50_segments.bed ).bed; done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/genome/windows/windows_50state_E#.bed [7 files]

# TEs overlapping the center of 200bp windows ("summit")
for file in chromHMM/genome/windows/windows_50state_E0*.bed; do bedtools intersect -wo -sorted -a features/TEs/rmsk_TEother_summit.txt -b $file > /scratch/ecp/TE_landscape/state50/rmsk_TEother_summit_$(basename $file .bed).txt; done
# NOTE(review): the remaining commands use relative file names, so they appear to be run
# from /scratch/ecp/TE_landscape/state50 - confirm
# Keep only intersections where the window center (window start + 99.5) falls within the TE,
# then sum the overlap (column 12) per TE (columns 1-7) and state (column 11).
# The result is written to a file without the .txt extension and renamed on the next line.
for file in rmsk_TEother_summit_windows_50state_E*.txt; do  awk -v OFS='\t' '{if(($9+99.5 >= $2) && ($9+99.5 < $3)) a[$1, $2, $3, $4, $5, $6, $7, $11]+=$12}END{for(i in a) {split(i, sep, SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sep[8], a[i]}}' $file > $(basename $file .txt); done
for file in rmsk_TEother_summit_windows_50state_E0[0-1][3-8] ; do mv $file $file\.txt; done

# TEs that do not ("majority")
# Runs in the foreground: pick_majority.py below reads the files written here
for file in raw_data/chromHMM_50state/*bed; do bedtools intersect -wo -a features/TEs/rmsk_TEother_majority.txt -b $file > /scratch/ecp/TE_landscape/state50/rmsk_TEother_majority_$(basename $file .bed).txt; done
# NOTE(review): pick_majority.py presumably keeps the state covering most of each TE;
# the meaning of the trailing arguments (10 11) should be confirmed against the script
for file in rmsk_TEother_majority_*.txt; do python ~/bin/TE_landscape/pick_majority.py $file $(basename $file .txt) 10 11; done
for file in rmsk_TEother_majority_E[0-9][0-9][0-9]_50_segments; do mv $file $file\.txt; done

# Combine and sort
# Append the sample ID and the rule ("summit"/"majority") to every row, one combined file per sample.
# Runs in the foreground: the sort below must not start before the combined files are complete.
for sample in E003 E004 E005 E006 E007 E008 E017; do awk -v OFS='\t' -v sample=$sample '{print $0, sample, "summit"}' rmsk_TEother_summit_windows_50state_$sample\.txt >> rmsk_TEother_50state_$sample\.txt; awk -v OFS='\t' -v sample=$sample '{print $0, sample, "majority"}' rmsk_TEother_majority_$sample\_50_segments.txt >> rmsk_TEother_50state_$sample\.txt; done
for file in rmsk_TEother_50state_*.txt; do sort -k1,1V -k2,2n -k3,3n -k4,4 -k10,10 $file > $(basename $file .txt)_sorted.txt; done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/state50/rmsk_TEother_50state_E#_sorted.txt [7 files]
```

Then, I found the number of TEs that are annotated with each chromHMM state per sample. 

```{bash 50-state potential, eval=FALSE}
# List of states in 50-state model
#/bar/epehrsson/TE_landscape/sample_lists/chromHMM_50state.txt

# Number of TEs in state per sample
# For each per-sample sorted file, counts the rows for every value of column 8 (state).
# Output columns: state, count; row order is unspecified (awk for-in loop).
for file in rmsk_TEother_50state_E0*_sorted.txt; do awk -v OFS='\t' '{a[$8]+=1}END{for(i in a){print i, a[i]}}' $file > state_sample_counts_$(basename $file _sorted.txt).txt; done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/state50/state_sample_counts_rmsk_TEother_50state_E#.txt [7 files]
```

Creates a dataframe of the number/proportion of TEs in each 50-state model chromHMM state in each sample. Using Roadmap 2015 Supplementary Fig. 4, for all 7 samples, I determined which of the 18-state model states is most enriched for each of the 50 states.

```{r 50 state potential}
# Number of TEs in each chromHMM state by sample
# One count file per sample; the sample ID is the part of the file name between the
# fixed prefix and the .txt extension
state50_files = list.files(path="chromHMM/state50",pattern="state_sample_counts_rmsk_TEother_50state_",full.names = TRUE)
stopifnot(length(state50_files) > 0)
state_sample_50state = lapply(state50_files,function(x) read.table(x,sep='\t',quote=""))
# Anchored patterns on the base name: the previous gsub(".txt", ...) treated the dot as
# "any character" and could match anywhere in the path
names(state_sample_50state) = sub("\\.txt$","",sub("^state_sample_counts_rmsk_TEother_50state_","",basename(state50_files)))
# ldply stacks the per-sample tables and adds the list names as the first column
state_sample_50state = ldply(state_sample_50state)
colnames(state_sample_50state) = c("Sample","State50","Count")

# Corresponding 18-state model state for each 50-state model state, by sample
# State numbers are prefixed with "E" to match the state names in the count files
state50_state18 = read.table("sample_lists/chromHMM_50state.txt",sep='\t',quote="",header=TRUE)
state50_state18$State50 = paste0("E",state50_state18$State50)

# Combine and calculate proportion of TEs in each state by sample
state_sample_50state = merge(state_sample_50state,state50_state18,by=c("Sample","State50"))
state_sample_50state$Proportion = state_sample_50state$Count/NUM_TE
state_sample_50state$State50 = factor(state_sample_50state$State50,levels=paste0("E",seq(1,50,1)))
state_sample_50state$State18 = factor(state_sample_50state$State18,levels=names(chromHMM_states_18))
```

The number of TEs in an active regulatory state in each sample, based on the corresponding 18-state model state (states 1-4 and 7-11). 

```{bash 50-state active reg, eval=FALSE}
# Count the TEs in an active regulatory state in one sample, 50-state model
## Input: $1 - sample ID (e.g. E003)
##        $2 - comma-separated, inclusive ranges of 50-state model state numbers that are
##             treated as active regulatory in that sample (e.g. "1-8,14-31")
## Output: number of distinct TEs (columns 1-7 of the sample's sorted file) with
##         column 8 equal to one of those states, printed to stdout
count_active_50state () {
  awk -v ranges="$2" 'BEGIN{n=split(ranges,r,","); for(i=1;i<=n;i++){split(r[i],b,"-"); for(s=b[1]+0;s<=b[2]+0;s++) active["E" s]=1}} ($8 in active){print $0}' "rmsk_TEother_50state_${1}_sorted.txt" | cut -f1-7 | sort | uniq | wc -l
}

# The state ranges differ by sample because each sample has its own 50-state model.
# The output file is written once with > (not appended with >>), so re-running the chunk
# does not add a second set of counts; one line per sample, in the order listed here.
{
  count_active_50state E003 "1-8,14-31"
  count_active_50state E004 "1-12,20-37"
  count_active_50state E005 "1-11,18-39"
  count_active_50state E006 "1-8,18-35"
  count_active_50state E007 "1-10,14-28"
  count_active_50state E008 "1-10,14-35"
  count_active_50state E017 "1-8,17-37"
} > count_active.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/state50/count_active.txt
```

The number of TEs in an active regulatory state per sample, using the 15-state model. Input is the by-sample matrices produced by the Figure S10 analysis (combine marks).

```{bash active reg 15, eval=FALSE}
# For every by-sample file (E*), count the distinct TEs (columns 1-7) whose column 10 is one
# of the five active regulatory states of the 15-state model; one count per line, in glob order.
# Rows are sorted before uniq so that duplicates are removed even when they are not adjacent,
# as in the 50-state counts. The output is written once with > so that re-running the chunk
# does not append a second set of counts.
for file in E*; do awk '{if(($10 == "1_TssA") || ($10 == "2_TssAFlnk") || ($10 == "3_TxFlnk") || ($10 == "6_EnhG") || ($10 == "7_Enh")) print $0}' $file | cut -f1-7 | sort | uniq | wc -l; done > count_active.txt

## Output
#/bar/epehrsson/TE_landscape/compare_marks/combined/count_active.txt
```

Comparison of the number of TEs in an active regulatory state per sample using the 15-state and 50-state models.

```{r active reg by sample}
# Number/proportion of TEs in an active regulatory state per sample, 15-state model
# The count file has no sample column: its lines are bound to the rows of metadata by position.
# NOTE(review): assumes the lines are in the same order as the rows of metadata - confirm
counts_15 = read.table("compare_marks/combined/count_active.txt")
# Guard against a count file with the wrong number of lines, which cbind could otherwise recycle silently
stopifnot(nrow(counts_15) == nrow(metadata))
count_active_15 = cbind(metadata[,c("Sample","chrY")],counts_15)
colnames(count_active_15)[3] = "Count"
count_active_15$Proportion = count_active_15$Count/ifelse(count_active_15$chrY == "Yes",NUM_TE,NUM_TE_noY)

# Samples annotated with a 50-state model, in the order their counts were written
samples_50state = c("E003","E004","E005","E006","E007","E008","E017")

## Summary statistics for proportion of TEs in an active regulatory state per sample, 
## for all samples and only those that were also annotated with a 50-state model
summary(count_active_15$Proportion)
summary(count_active_15[which(count_active_15$Sample %in% samples_50state),]$Proportion)

# Number/proportion of TEs in an active regulatory state per sample, 50-state model
counts_50 = read.table("chromHMM/state50/count_active.txt")
# Exactly one count per 50-state sample is expected
stopifnot(nrow(counts_50) == length(samples_50state))
count_active_50 = cbind(samples_50state,counts_50)
colnames(count_active_50) = c("Sample","Count")
count_active_50$Proportion = count_active_50$Count/NUM_TE

## Summary statistics for proportion of TEs in an active regulatory state per sample
summary(count_active_50$Proportion)
```

Potential analysis (Fig 2A/C) using 18-state and 50-state chromHMM models. a. Proportion of TEs in each 18-state model state by sample (boxplot) and number of TEs ever in each state (dot). b. Distribution of the number of TEs in each state at each number of samples, for those ever in the state. c. Proportion of TEs in each 50-state model chromHMM state, by sample, colored by the corresponding 18-state model state.

```{r Figure S5, echo=FALSE}
# Supplementary Figure 5: potential analysis with the 18-state and 50-state models.
# guide="none" replaces the deprecated guide=FALSE throughout.

# Panel a: per-sample proportion of TEs in each 18-state model state (boxplots) with the
# proportion of TEs ever in the state overlaid as red points.
# The y limits are set once, in scale_y_continuous; a separate ylim(0,1) would be overridden by it.
a = ggplot(state_sample_18state,aes(x=State,y=Proportion,fill=State)) +
  geom_boxplot() +
  scale_fill_manual(values=chromHMM_states_18,guide="none") +
  scale_x_discrete(limits=rev(names(chromHMM_states_18))) +
  theme(axis.title.y = element_text(margin = margin(r = -5))) +
  labs(y="Proportion of TEs in state") +
  coord_flip() +
  geom_point(data=chromHMM_18state_stats,aes(x=State,y=Proportion_ever),color="red",size=2) +
  scale_y_continuous(limits=c(0,1),expand=c(0.01,0.01))

# Panel b: distribution of the proportion of samples in state, weighted by the number of TEs,
# for TEs in the state in at least one sample. State axis text is dropped because the panel
# sits directly to the right of panel a.
b = ggplot(chromHMM_18state_potential[which(chromHMM_18state_potential$Samples != 0),],aes(x=State,y=Sample.Proportion,weight=Count,fill=State)) +
  geom_violin(scale="width") +
  coord_flip() +
  scale_fill_manual(values=chromHMM_states_18,guide="none") +
  theme(axis.title.y=element_blank(),axis.ticks.y=element_blank(),axis.text.y = element_blank()) +
  ylab("Proportion of samples in state") +
  scale_x_discrete(limits=rev(names(chromHMM_states_18))) +
  geom_boxplot(width=0.2,outlier.size = 0.1) +
  scale_y_continuous(expand=c(0.01,0.01))

# Panel c: proportion of TEs in each 50-state model state, one facet per sample,
# filled by the corresponding 18-state model state
c = ggplot(state_sample_50state,aes(x=State50,y=Proportion,fill=State18)) +
  geom_bar(stat="identity",color="black",size=0.1) +
  scale_fill_manual(values=chromHMM_states_18,name="State\n(18-state model)") +
  coord_flip() +
  facet_wrap(~Sample) +
  labs(x="State (50-state model)",y="Proportion of TEs in state") +
  scale_x_discrete(limits=rev(levels(state_sample_50state$State50))) +
  theme(axis.text.y=element_blank(),axis.ticks.y = element_blank(),legend.key.size = unit(2,'mm'))

# Top row: panels a and b; bottom row: panel c spanning both columns
grid.arrange(a,b,c,nrow=2,layout_matrix=rbind(c(1,2),c(3)),widths=c(0.55,0.45),heights=c(0.4,0.6))
```

```{r Figure S5 source data}
# Source data for Supplementary Figure 5, one tab-delimited file per panel layer
write.table(
  state_sample_18state[, c("Sample", "State", "Proportion")],
  file = "source_data/Figure_S5a_boxplot.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
write.table(
  chromHMM_18state_stats[, c("State", "Proportion_ever")],
  file = "source_data/Figure_S5a_dots.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
# Panel b only shows TEs in the state in at least one sample
write.table(
  chromHMM_18state_potential[which(chromHMM_18state_potential$Samples != 0),
                             c("State", "Sample.Proportion", "Count")],
  file = "source_data/Figure_S5b.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
write.table(
  state_sample_50state[, c("Sample", "State50", "Proportion", "State18")],
  file = "source_data/Figure_S5c.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
```

### Potential without chrY

Tests the results for the average number of samples a TE is in each chromHMM state and the proportion of TEs ever in each chromHMM state without chrY. 

```{r potential noY}
# Recompute the potential statistics (proportion of TEs ever in each chromHMM state and
# mean proportion of samples in state) after dropping every TE on chrY
test = subset(chromHMM_TE_state, chromosome != "chrY")
test.a = sample_distribution(test, 8:22, sample_counts["All","chromHMM"])
test.c = potential_stats(test.a, 15, sample_counts["All","chromHMM"])
test.c$State = factor(chromHMM_states, levels = chromHMM_states)

## Difference in results with and without chrY
chromHMM_TE_state_dist_stats - test.c

# For chrY TEs only: mean number of samples in each chromHMM state, in increasing order
sort(colMeans(subset(chromHMM_TE_state, chromosome == "chrY")[, 8:22]))
```

### Supplementary Figure 6

By-class potential. a. Proportion of TEs in each class in each state by sample (boxplots) and number of TEs ever in each state by class (dots). b. Proportion of members of each class overlapping genic features, split by coding status. c. Distribution of the number of TEs in each state at each number of samples, for those ever in the state, by class.

```{r Figure S6, echo=FALSE}
# Supplementary Figure 6: potential analysis by TE class.
# guide="none" replaces the deprecated guide=FALSE throughout.

# Panel a: per-sample proportion of TEs in each state by class (boxplots) with the proportion
# of TEs ever in the state overlaid as dodged points; only the point (color) legend is shown
a = ggplot(combine_boxplot_class,aes(x=State,y=Proportion,fill=Class)) +
  geom_boxplot() +
  scale_fill_manual(values=class_colors,guide="none") +
  coord_flip() +
  ylab("Proportion of TEs in state") +
  scale_x_discrete(labels=rev(all_state_labels),limits=rev(levels(combine_boxplot_class$State))) +
  geom_point(data=combine_stats_class,aes(x=State,y=Proportion_ever,color=Class),position=position_dodge(width=0.5)) +
  scale_color_manual(values=class_colors) +
  theme(legend.position="bottom",axis.title.y = element_text(margin = margin(r = -5)),legend.key.size = unit(1,'mm'),legend.box.margin = margin(t=-10)) +
  guides(color = guide_legend(nrow = 2))

# Proportion of members of each class overlapping each genic feature, split by coding status
# Long format: one row per class and cohort column; split_coding() then derives the
# Feature and Coding columns used below
rmsk_TE_class_feature = melt(rmsk_TE_class[,c("class_update",cohorts[1:19])],id.vars="class_update")
colnames(rmsk_TE_class_feature) = c("Class","Cohort","Proportion")
rmsk_TE_class_feature = split_coding(rmsk_TE_class_feature)

# Panel b: stacked bars of feature overlap per class, one facet per coding status
b = ggplot(rmsk_TE_class_feature,aes(x=Class,y=Proportion,fill=Feature)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_text(angle=90,hjust=1,vjust=0.5),legend.position="bottom",legend.box.margin = margin(t=-10),legend.key.size = unit(3,'mm')) +
  xlab("TE class") +
  ylab("Proportion of TEs overlapping feature") +
  scale_fill_manual(values=c(brewer.pal(8,"Spectral")),labels=genic_labels[c(8,1:7)],name="Feature") +
  facet_wrap(~Coding,ncol=1,labeller=labeller(Coding=coding_labels),scales="free_y") +
  guides(fill=guide_legend(title.position = "top"))

# Panel c: distribution of the proportion of samples in state, weighted by the number of TEs,
# for TEs in the state in at least one sample; one facet per class
c = ggplot(combine_potential_class[which(combine_potential_class$Samples != 0),],aes(x=State,y=Sample.Proportion,weight=Count,fill=State)) +
  geom_violin(scale="width") +
  coord_flip() +
  scale_fill_manual(values=all_state_colors,guide="none") +
  xlab("State") +
  ylab("Proportion of samples in state") +
  scale_x_discrete(labels=rev(all_state_labels),limits = rev(states)) +
  facet_wrap(~Class,nrow=1) +
  geom_boxplot(width=0.2,outlier.size = 0.1) +
  theme(axis.title.y = element_text(margin = margin(r = -10))) +
  scale_y_continuous(breaks=pretty_breaks(n=3),expand=c(0.01,0.01))

# Top row: panels a and b; bottom row: panel c spanning both columns
grid.arrange(a,b,c,nrow=2,layout_matrix=rbind(c(1,2),c(3)),heights=c(0.6,0.4),widths=c(0.58,0.42))
```

```{r Figure S6 source data}
# Source data for Supplementary Figure 6, one tab-delimited file per panel layer
write.table(
  combine_boxplot_class[, c("Sample", "State", "Proportion", "Class")],
  file = "source_data/Figure_S6a_boxplots.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
write.table(
  combine_stats_class[, c("State", "Proportion_ever", "Class")],
  file = "source_data/Figure_S6a_dots.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
write.table(
  rmsk_TE_class_feature[, c("Class", "Proportion", "Feature", "Coding")],
  file = "source_data/Figure_S6b.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
# Panel c only shows TEs in the state in at least one sample
write.table(
  combine_potential_class[which(combine_potential_class$Samples != 0),
                          c("State", "Sample.Proportion", "Count", "Class")],
  file = "source_data/Figure_S6c.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
```

### Class potential analysis

```{r class potential analysis}
# Per class: proportion of TEs ever in each state and mean/SE proportion of samples in state
combine_stats_class

# Per state, spread across classes of the mean samples in state for all TEs:
# coefficient of variation (sd/mean) and range (max - min)
ddply(combine_stats_class, "State", summarise,
      CV = sd(Samples_avg_all) / mean(Samples_avg_all),
      Range = diff(range(Samples_avg_all)))
```

### Feature overlap and CpGs per class

```{r class feature analysis}
# How many RefSeq genic features does each TE overlap? Tabulates the per-TE number of
# non-missing feature columns, first for the features themselves, then split by coding status
table(rowSums(!is.na(rmsk_TE[, c(features[2:3], "coding_exon", features[5:8])])))
table(rowSums(!is.na(rmsk_TE[, cohorts[2:19]])))

# Number/proportion of TEs in each feature, overall and by class
# (a TE is in a feature when its value in that column is not missing)
feature_proportion = vapply(rmsk_TE[, cohorts[1:19]],
                            function(x) sum(!is.na(x)),
                            integer(1))
feature_proportion / NUM_TE
rmsk_TE_class[, c("class_update", cohorts[1:20])]

# Number/proportion of TEs overlapping CpGs per class
rmsk_TE_class[, c("class_update", "Count", "TEs_wCpG", "TEs_wCpG_per")]

# Median CpGs per TE, overall and by class, for all TEs and those overlapping at least one CpG
median(rmsk_TE$CpGs)
ddply(rmsk_TE, .(class_update), summarise,
      Median_CpGs = median(CpGs),
      Median_CpGs_wCpGs = median(CpGs[which(CpGs > 0)]))
```

### Supplementary Figure 7

Counted the number of promoters annotated with each chromHMM state in each sample (excluding promoters on contigs). 

```{bash promoter chromHMM counts, eval=FALSE}
# Number of promoters in state in sample
# Skips rows whose column 1 contains an underscore (promoters on contigs), then counts the
# remaining rows for every combination of columns 5 and 7.
# NOTE(review): columns 5 and 7 are presumably state and sample - confirm against the input file
# Output columns: column 5 value, column 7 value, count; row order is unspecified (awk for-in loop).
awk -v OFS='\t' '{if($1 !~ /_/) a[$5, $7]+=1}END{for(i in a){split (i, sep, SUBSEP); print sep[1], sep[2], a[i];}}' chromHMM/refseq_promoters_unique_chromHMM_summit_sorted.txt > chromHMM/promoters_state_sample_counts_summit.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/promoters_state_sample_counts_summit.txt
```

Creates dataframes of 1) the number of promoters in each epigenetic state in each number of samples, as well as the number of promoters in each state in at least one sample; and 2) the number of exons expressed RPKM > 1 in each number of samples, as well as the number of exons expressed RPKM > 1 in at least one sample. 

```{r Figure S7 scripts, echo=FALSE, cache=TRUE, cache.lazy=FALSE}
# Run the promoter and exon potential scripts, in this order
invisible(lapply(
  c("R_scripts/potential_promoter.R",
    "R_scripts/potential_exon.R"),
  source
))
```

a. Proportion of promoters in each state by sample (boxplot) and number of promoters ever in each state (dot). b. Distribution of the number of promoters in each state at each number of samples, for those ever in the state. c. Proportion of exons expressed RPKM > 1 by sample (boxplot) and number of exons ever expressed RPKM > 1 (dot). d. Distribution of the number of exons expressed RPKM > 1 at each number of samples, for those ever expressed RPKM > 1. e. Proportion of TEs in each state by sample (boxplot) and number of TEs ever in each state (dot), excluding cancer cell lines/IMR90. f. Distribution of the number of TEs in each state at each number of samples, for those ever in the state, excluding cancer cell lines/IMR90. 

```{r Figure S7, echo=FALSE}
# Supplementary Figure 7: potential analysis for promoters (a, b), exons (c, d) and
# TEs excluding cancer cell lines/IMR90 (e, f). Left column: per-sample proportion (boxplots)
# with the proportion ever in state as red points. Right column: distribution of the proportion
# of samples in state, weighted by count, for members in the state in at least one sample.
# guide="none" replaces the deprecated guide=FALSE throughout.

# Promoters
# Labels are reversed together with the limits, as in panel e, so that each label stays
# with its state whether or not all_state_labels is a named vector
a = ggplot(combine_boxplot_prom,aes(x=State,y=Proportion,fill=State)) +
  geom_boxplot() +
  scale_fill_manual(values=all_state_colors[1:21],guide="none") +
  theme(axis.title.y = element_text(margin = margin(r = -10))) +
  labs(y="Proportion of promoters in state") +
  coord_flip() +
  scale_x_discrete(limits=rev(states[1:21]),labels=rev(all_state_labels[1:21])) +
  geom_point(data=combine_stats_prom,aes(x=State,y=Proportion_ever),color="red",size=2) +
  scale_y_continuous(limits=c(0,1),expand=c(0.05,0.05))

b = ggplot(combine_potential_prom[which(combine_potential_prom$Samples != 0),],aes(x=State,y=Sample.Proportion,weight=Count,fill=State)) +
  geom_violin(scale="width") +
  coord_flip() +
  scale_fill_manual(values=all_state_colors[1:21],guide="none") +
  theme(axis.title.y=element_blank(),axis.ticks.y=element_blank(),axis.text.y = element_blank()) +
  ylab("Proportion of samples in state") +
  scale_x_discrete(limits = rev(states[1:21]),labels=rev(all_state_labels[1:21])) +
  geom_boxplot(width=0.2,outlier.size = 0.1) +
  scale_y_continuous(expand=c(0.01,0.01))

# Exons
# The leading spaces in the axis label are part of the label text (kept as in the original figure)
c = ggplot(RNA_exon_sample,aes(x=State,y=Proportion,fill=State)) +
  geom_boxplot() +
  scale_fill_manual(values="black",guide="none") +
  labs(y="Proportion of exons in state") +
  coord_flip() +
  scale_x_discrete(labels="       RPKM > 1") +
  geom_point(data=RNA_potential_exon_stats,aes(x=State,y=Proportion_ever),color="red",size=3) +
  scale_y_continuous(limits=c(0,1),expand=c(0.05,0.05))

d = ggplot(RNA_potential_exon[which(RNA_potential_exon$Samples != 0),],aes(x=State,y=Sample.Proportion,weight=Count,fill=State)) +
  geom_violin(scale="width") +
  coord_flip() +
  scale_fill_manual(values="black",guide="none") +
  theme(axis.title.y=element_blank(),axis.ticks.y=element_blank(),axis.text.y = element_blank()) +
  ylab("Proportion of samples RPKM > 1") +
  scale_x_discrete(labels=all_state_labels[22]) +
  geom_boxplot(width=0.2,outlier.size = 0.1) +
  scale_y_continuous(expand=c(0.01,0.01))

# No cancer/IMR90
e = ggplot(combine_boxplot_noCancer_IMR90,aes(x=State,y=Proportion,fill=State)) +
  geom_boxplot() +
  scale_fill_manual(values=all_state_colors,guide="none") +
  theme(axis.title.y = element_text(margin = margin(r = -10))) +
  labs(y="Proportion of TEs in state") +
  coord_flip() +
  scale_x_discrete(limits = rev(levels(combine_boxplot_noCancer_IMR90$State)),labels=rev(all_state_labels)) +
  geom_point(data=combine_stats_noCancer_IMR90,aes(x=State,y=Proportion_ever),color="red",size=2) +
  scale_y_continuous(limits=c(0,1),expand=c(0.05,0.05))

# NOTE(review): panel e uses the *_noCancer_IMR90 objects but this panel uses
# combine_potential_noCancer - confirm that this object also excludes IMR90
f = ggplot(combine_potential_noCancer[which(combine_potential_noCancer$Samples != 0),],aes(x=State,y=Sample.Proportion,weight=Count,fill=State)) +
  geom_violin(scale="width") +
  coord_flip() +
  scale_fill_manual(values=all_state_colors,guide="none") +
  theme(axis.title.y=element_blank(),axis.ticks.y=element_blank(),axis.text.y = element_blank()) +
  ylab("Proportion of samples in state") +
  scale_x_discrete(limits = rev(states),labels=rev(all_state_labels)) +
  geom_boxplot(width=0.2,outlier.size = 0.1) +
  scale_y_continuous(expand=c(0.01,0.01))

# Three rows (promoters, exons, TEs) by two columns; the exon row is kept short
grid.arrange(a,b,c,d,e,f,nrow = 3,layout_matrix=rbind(c(1,2),c(3,4),c(5,6)),heights=c(0.45,0.1,0.45),widths=c(0.6,0.4))
```

```{r Figure S7 source data}
# Source data for Figure S7, one tab-delimited file per panel component.
# All files are written without row names and without quoting.

# Panel a: promoter proportions per sample (boxplots) and proportion ever in state (dots)
write.table(
  combine_boxplot_prom[, c("Sample", "State", "Proportion")],
  file = "source_data/Figure_S7a_boxplots.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
write.table(
  combine_stats_prom[, c("State", "Proportion_ever")],
  file = "source_data/Figure_S7a_dots.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)

# Panel b: only rows with a non-zero number of samples, as in the plot
write.table(
  combine_potential_prom[which(combine_potential_prom$Samples != 0),
                         c("State", "Sample.Proportion", "Count")],
  file = "source_data/Figure_S7b.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)

# Panel c: exon proportions per sample (boxplots) and proportion ever expressed (dots)
write.table(
  RNA_exon_sample[, c("Sample", "State", "Proportion")],
  file = "source_data/Figure_S7c_boxplots.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
write.table(
  RNA_potential_exon_stats[, c("State", "Proportion_ever")],
  file = "source_data/Figure_S7c_dots.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)

# Panel d: only rows with a non-zero number of samples
write.table(
  RNA_potential_exon[which(RNA_potential_exon$Samples != 0),
                     c("State", "Sample.Proportion", "Count")],
  file = "source_data/Figure_S7d.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)

# Panel e: TE proportions per sample excluding cancer cell lines/IMR90 (boxplots and dots)
write.table(
  combine_boxplot_noCancer_IMR90[, c("Sample", "State", "Proportion")],
  file = "source_data/Figure_S7e_boxplots.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
write.table(
  combine_stats_noCancer_IMR90[, c("State", "Proportion_ever")],
  file = "source_data/Figure_S7e_dots.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)

# Panel f: only rows with a non-zero number of samples
write.table(
  combine_potential_noCancer[which(combine_potential_noCancer$Samples != 0),
                             c("State", "Sample.Proportion", "Count")],
  file = "source_data/Figure_S7f.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
```

### Promoter statistics

From the intersection of individual RefSeq promoters with CpG methylation level, found the number of CpGs overlapping each promoter. 

```{bash promoter CpG count, eval=FALSE}
# Refseq promoters
# Counts the lines of the promoter x CpG intersection file for each unique combination of
# columns 1-4, then prints those four columns followed by the line count (tab-delimited).
# Output rows come from an awk "for (i in a)" loop, so their order is unspecified.
awk -v OFS='\t' '{a[$1, $2, $3, $4]+=1}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], a[i];}}' refseq_promoter_unique_CpG_Meth.bed > refseq_promoter_unique_CpG_count.txt

## Output
#/bar/epehrsson/TE_landscape/WGBS/Refseq_promoters/refseq_promoter_unique_CpG_count.txt
```

```{r promoter analysis}
# Number of unique promoters
NUM_PROMOTER

# Proportion of promoters ever in each state and mean proportion of samples in state
combine_stats_prom

# Mean/median proportion of promoters in each state by sample
ddply(combine_boxplot_prom, .(State), summarise,
      Mean = mean(Proportion),
      Median = median(Proportion))

# Creates a dataframe of the number of CpGs per promoter
promoter_CpG_count = read.table(
  "WGBS/Refseq_promoters/refseq_promoter_unique_CpG_count.txt",
  sep = "\t"
)
# The raw count in column 5 is halved
# (presumably each CpG contributes two lines to the intersection file - TODO confirm)
promoter_CpG_count$V5 = promoter_CpG_count$V5 / 2

## Number/proportion of promoters with CpGs
nrow(promoter_CpG_count)
nrow(promoter_CpG_count) / NUM_PROMOTER

## Summary statistics for the number of CpGs per promoter
ddply(promoter_CpG_count, .(), summarise,
      Min = min(V5),
      Median = median(V5),
      Mean = mean(V5),
      Max = max(V5))

# Proportion of promoters in each epigenetic state in at least 90% of samples
ddply(combine_potential_prom, .(State), summarise,
      Total = sum(Count[which(Sample.Proportion >= 0.9)]) / NUM_PROMOTER)
```

### Exon statistics (expression potential control)

Intersected the CpG methylation level bed file with unique individual RefSeq exons, then counted the number of CpGs per exon. 

```{bash exon CpG count, eval=FALSE}
# Refseq exons
# Splits the CpG methylation bed file into 1,000,000-line pieces (default "x" prefix, written
# to the current directory), then intersects the unique exons with each piece in turn.
# -wo reports each overlapping exon/CpG pair together with the number of overlapping bases.
# Results are appended (>>), so the output file must not already exist when this is rerun;
# the x* pieces are not removed afterwards.
split -l 1000000 ~/TE_landscape/WGBS/all_CpG_Meth.bed
for file in x*; do echo $file; bedtools intersect -wo -a ~/genic_features/RefSeq/refseq_exons_unique.txt -b $file >> ~/TE_landscape/WGBS/refseq_exons_unique_CpG_Meth.bed ; done

## Output
#/bar/epehrsson/TE_landscape/WGBS/refseq_exons_unique_CpG_Meth.bed

# Exons
# Counts the lines of the exon x CpG intersection file for each unique combination of
# columns 1-4, then prints those four columns followed by the line count (tab-delimited).
# Output row order is unspecified (awk "for (i in a)" loop).
awk -v OFS='\t' '{a[$1, $2, $3, $4]+=1}END{for(i in a){split(i,sep,SUBSEP); print sep[1], sep[2], sep[3], sep[4], a[i];}}' WGBS/refseq_exons_unique_CpG_Meth.bed > WGBS/refseq_exons_unique_CpG_count.txt

## Output
#/bar/epehrsson/TE_landscape/WGBS/refseq_exons_unique_CpG_count.txt
```

```{r exon control analysis, cache=TRUE,cache.lazy=FALSE}
# Number of unique exons
NUM_EXON

# Proportion of exons ever expressed RPKM > 1 and mean proportion of samples in state
RNA_potential_exon_stats

# Mean/median proportion of exons expressed RPKM > 1 per sample
mean(RNA_exon_sample$Proportion)
median(RNA_exon_sample$Proportion)

# Creates a dataframe of the number of CpGs per exon
exon_CpG_count = read.table("WGBS/refseq_exons_unique_CpG_count.txt",sep='\t')
# The raw count in column 5 is halved
# (presumably each CpG contributes two lines to the intersection file - TODO confirm)
exon_CpG_count$V5 = exon_CpG_count$V5/2

## Number/proportion of exons with CpGs
dim(exon_CpG_count)[1] 
dim(exon_CpG_count)[1]/NUM_EXON

## Summary statistics for the number of CpGs per exon
## First: only exons present in the count file (those with at least one CpG)
## Then: mean/median over all exons, padding exons absent from the count file with 0
ddply(exon_CpG_count,.(),summarise,Min=min(V5),Median=median(V5),Mean=mean(V5),Max=max(V5))
mean(c(exon_CpG_count$V5,rep(0,(NUM_EXON-dim(exon_CpG_count)[1]))))
median(c(exon_CpG_count$V5,rep(0,(NUM_EXON-dim(exon_CpG_count)[1]))))

# Table of the number of samples expressed RPKM > 1 per TE (count/proportion)
table(RNA_TE$Expressed_samples)
table(RNA_TE$Expressed_samples)/dim(RNA_TE)[1]

# Table of the number of samples expressed RPKM > 1 per exon (count/proportion)
table(RNA_refseq_exon$Expressed_samples)
table(RNA_refseq_exon$Expressed_samples)/NUM_EXON

# Proportion of exons expressed RPKM > 1 in at least 90% of samples
ddply(RNA_potential_exon,.(State),summarise,Total=sum(Count[which(Sample.Proportion >= 0.9)])/NUM_EXON)

# Maximum expression
## Median maximum expression level for each TE
median(RNA_TE$Max_expression)

## Top 10 TEs with the highest maximum expression in any sample (ascending order)
## NOTE(review): this comment previously said "Top 20" while the code uses n=10 - confirm the intended n
tail(RNA_TE[order(RNA_TE$Max_expression),],n=10)

## Median maximum expression level for each exon
median(RNA_refseq_exon$Max_expression)

## Top 10 exons with the highest maximum expression in any sample (ascending order)
## NOTE(review): this comment previously said "Top 20" while the code uses n=10 - confirm the intended n
tail(RNA_refseq_exon[order(RNA_refseq_exon$Max_expression),],n=10)

# Spearman correlation between the maximum expression of a TE and the number of samples in which it is expressed RPKM > 1
cor.test(RNA_TE$Max_expression,RNA_TE$Expressed_samples,method="spearman")

# Median maximum expression per TE by class, for all TEs and those expressed RPKM > 1 in at least one sample
merge(ddply(RNA_TE,.(class_update),summarise,Max_median=median(Max_expression)),
      ddply(RNA_TE[which(RNA_TE$Expressed_samples > 0),],.(class_update),summarise,Max_median_expressed=median(Max_expression)),by="class_update")

# Median level of expression for TEs and exons, pooled over all feature x sample values,
# and over only those values that are > 1
# (columns 8:63 and 5:60 are presumably the per-sample expression columns - TODO confirm)
TE_express = unlist(RNA_TE[,8:63])
median(TE_express)
median(TE_express[which(TE_express > 1)])
rm(TE_express)

exon_express = unlist(RNA_refseq_exon[,5:60])
median(exon_express)
median(exon_express[which(exon_express > 1)])
rm(exon_express)
```

### Cancer/IMR90 outlier analysis

```{r no cancer analysis}
# Proportion of TEs ever in each state and mean proportion of samples in state,
# excluding cancer cell lines/IMR90
combine_stats_noCancer_IMR90

# Per-state summary statistics for the proportion of TEs in each state per sample.
# All samples (Outlier = mean + 2 SD):
ddply(combine_boxplot, .(State), summarise,
      Mean = mean(Proportion),
      SD = sd(Proportion),
      Outlier = mean(Proportion) + 2 * sd(Proportion),
      Median = median(Proportion),
      IQR = IQR(Proportion))

# Same statistics after dropping sample E017 (IMR90):
ddply(combine_boxplot[which(combine_boxplot$Sample != "E017"), ], .(State), summarise,
      Mean = mean(Proportion),
      SD = sd(Proportion),
      Median = median(Proportion),
      IQR = IQR(Proportion))

# Proportion of TEs in each state for IMR90 (sample E017)
combine_boxplot[which(combine_boxplot$Sample == "E017"), ]
```

### Response Figure 1 

Created files listing all TEs annotated with each active regulatory chromHMM state in a sample, by state. 

```{bash active reg matrices, eval=FALSE}
# Lists of individual TEs in each active regulatory state by sample
# Concatenates every file in chromHMM/subfamily/by_state/ whose name contains the state label
# into a single file per state, written to the current directory.
# NOTE(review): the globs are substring matches; confirm that no file for another state or
# sample is picked up unintentionally, and that the outputs were moved to the paths listed below.
cat chromHMM/subfamily/by_state/*1_TssA* > chromHMM_1_TssA.txt
cat chromHMM/subfamily/by_state/*2_TssAFlnk* > chromHMM_2_TssAFlnk.txt
cat chromHMM/subfamily/by_state/*3_TxFlnk* > chromHMM_3_TxFlnk.txt
cat chromHMM/subfamily/by_state/*6_EnhG* > chromHMM_6_EnhG.txt
cat chromHMM/subfamily/by_state/*7_Enh* > chromHMM_7_Enh.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/chromHMM_1_TssA.txt
#/bar/epehrsson/TE_landscape/chromHMM/chromHMM_2_TssAFlnk.txt
#/bar/epehrsson/TE_landscape/chromHMM/chromHMM_3_TxFlnk.txt
#/bar/epehrsson/TE_landscape/chromHMM/chromHMM_6_EnhG.txt
#/bar/epehrsson/TE_landscape/chromHMM/chromHMM_7_Enh.txt
```

Loads data frames of TEs and 200bp windows in an active regulatory state in a sample.

```{r load replicate matrices, cache=TRUE, cache.lazy=FALSE}
# Runs the external loading script
# (presumably defines chromHMM_active_matrix and windows_active, which the chunks below use - TODO confirm)
source("R_scripts/replicate_matrices.R")
```

Identified six similar pairs of samples ("replicates") using Table S1 from the Roadmap publication. Then, created 24 pairs of samples from different tissues by pairing the first member of each replicate pair with four samples used by Roadmap for inter-tissue analyses. 

```{r replicate table}
# Builds the table of sample pairs used in the replicate analysis:
# six replicate pairs followed by 24 inter-tissue pairs.
# Columns: "Sample 1", "Sample 2", Set ("Replicate" or "Tissue"), Pair ("Sample 1/Sample 2").

# Replicate pairs, one pair per row
replicates = as.data.frame(matrix(
  c("E019", "E020",
    "E008", "E014",
    "E003", "E016",
    "E055", "E056",
    "E059", "E061",
    "E101", "E102"),
  ncol = 2, byrow = TRUE
))
names(replicates) = c("Sample 1", "Sample 2")
replicates[["Set"]] = rep("Replicate", nrow(replicates))

# Inter-tissue pairs: the first member of every replicate pair against each of four samples
test = as.data.frame(cbind(
  rep(as.vector(replicates[["Sample 1"]]), each = 4),
  rep(c("E032", "E049", "E066", "E071"), times = nrow(replicates)),
  rep("Tissue", 4 * nrow(replicates))
))
names(test) = c("Sample 1", "Sample 2", "Set")

# Stack both sets and label each pair
replicates = rbind(replicates, test)
replicates[["Pair"]] = with(replicates, paste(`Sample 1`, `Sample 2`, sep = "/"))
rm(test)
```

For each sample pair, counted the number of samples in the pair (one or two) in which each TE and each window is annotated with each active regulatory state. Added set information for each sample pair. 

```{r run replicates, cache=TRUE, cache.lazy=FALSE}
# For every sample pair (row of replicates), count per TE and state the number of
# samples in the pair (1 or 2) in which the TE is annotated with that state
chromHMM_replicates = apply(replicates, 1, function(pair) {
  # pair[1:2] holds the two sample identifiers of the current row
  in_pair = which(chromHMM_active_matrix$Sample %in% as.vector(pair[1:2]))
  ddply(chromHMM_active_matrix[in_pair, ],
        .(State, chromosome, start, stop, subfamily, family, class, strand),
        summarise,
        Samples = length(Sample))
})

# Label each result with its pair, stack them, and name the identifier column
names(chromHMM_replicates) = replicates$Pair
chromHMM_replicates = ldply(chromHMM_replicates)
colnames(chromHMM_replicates)[1] = "Pair"

# Attach the pair category (Set)
chromHMM_replicates = merge(chromHMM_replicates,
                            replicates[, c("Pair", "Set")],
                            by = "Pair")
```

```{r run replicates window, cache=TRUE, cache.lazy=FALSE}
# For every sample pair (row of replicates), count per window and state the number of
# samples in the pair (1 or 2) in which the window is annotated with that state
window_replicates = apply(replicates, 1, function(pair) {
  # pair[1:2] holds the two sample identifiers of the current row
  in_pair = which(windows_active$Sample %in% as.vector(pair[1:2]))
  ddply(windows_active[in_pair, ],
        .(State, chr, start, stop),
        summarise,
        Samples = length(Sample))
})

# Label each result with its pair, stack them, and name the identifier column
names(window_replicates) = replicates$Pair
window_replicates = ldply(window_replicates)
colnames(window_replicates)[1] = "Pair"

# Attach the pair category (Set)
window_replicates = merge(window_replicates,
                          replicates[, c("Pair", "Set")],
                          by = "Pair")
```

For each TE/window in each pair, found the maximum number of samples in which the TE/window is annotated with the same active regulatory state, then removed state information. Calculated the fraction of TEs/windows annotated with an active regulatory state in either member of the pair that are annotated with the same state in both members. 

```{r filter replicates, cache=TRUE, cache.lazy=FALSE}
# Combine all active regulatory states:
# per pair and TE, keep the largest per-state sample count
chromHMM_replicates_active = ddply(
  chromHMM_replicates,
  .(Pair, Set, chromosome, start, stop, subfamily, family, class, strand),
  summarise,
  Samples = max(Samples)
)

# Fraction of TEs in either sample found in both
# Count TEs per pair by number of samples, then spread the counts into one column per value
chromHMM_replicates_count = dcast(
  ddply(chromHMM_replicates_active, .(Pair, Set, Samples), summarise,
        Count = length(Samples)),
  Pair + Set ~ Samples,
  value.var = "Count"
)
# Missing combinations mean no TEs were observed
chromHMM_replicates_count[is.na(chromHMM_replicates_count)] = 0
chromHMM_replicates_count$Fraction = with(chromHMM_replicates_count, `2` / (`1` + `2`))
```

```{r filter replicates window, cache=TRUE, cache.lazy=FALSE}
# Combine all active regulatory states:
# per pair and window, keep the largest per-state sample count
window_replicates_active = ddply(
  window_replicates,
  .(Pair, Set, chr, start, stop),
  summarise,
  Samples = max(Samples)
)

# Fraction of windows in either sample found in both
# Count windows per pair by number of samples, then spread the counts into one column per value
window_replicates_count = dcast(
  ddply(window_replicates_active, .(Pair, Set, Samples), summarise,
        Count = length(Samples)),
  Pair + Set ~ Samples,
  value.var = "Count"
)
# Missing combinations mean no windows were observed
window_replicates_count[is.na(window_replicates_count)] = 0
window_replicates_count$Fraction = with(window_replicates_count, `2` / (`1` + `2`))
```

Combined TE and window matrices. Calculated the median fraction of TEs/windows annotated in both members of a pair for both replicates and inter-tissue pairs, as well as the ratio between the sample pair sets. Tested each feature for a significant difference in fraction between the sample pair sets.

```{r replicate analysis}
# Combine TEs and windows, tagging each table with its feature type
chromHMM_replicates_count$Feature = rep("TE", nrow(chromHMM_replicates_count))
window_replicates_count$Feature = rep("Bin", nrow(window_replicates_count))
replicates_count = rbind(window_replicates_count, chromHMM_replicates_count)

# Median fraction annotated in both members of the pair, by feature (rows) and pair set (columns),
# plus the replicate/inter-tissue ratio
replicates_median = dcast(
  ddply(replicates_count, .(Feature, Set), summarise, Median = median(Fraction)),
  Feature ~ Set,
  value.var = "Median"
)
replicates_median$Ratio = with(replicates_median, Replicate / Tissue)
replicates_median

# Wilcoxon test between replicates and inter-tissue pairs, by feature
by(replicates_count,
   replicates_count$Feature,
   function(x) wilcox.test(Fraction ~ Set, data = x))
```

Also repeated the analyses to include only TEs/windows that are not in an active regulatory state outside the sample pair. 

Generates a dataframe of TEs that are in an active state in each sample pair, but not outside the pair. 

```{r replicates restricted, cache=TRUE, cache.lazy=FALSE}
# TEs in any active regulatory state in each sample (one row per TE x sample)
chromHMM_active_unique = unique(chromHMM_active_matrix[, c(TE_coordinates, "Sample")])

# Total number of samples (out of 127) each TE is in an active regulatory state
replicates_total = ddply(
  chromHMM_active_unique,
  .(chromosome, start, stop, subfamily, family, class, strand),
  summarise,
  Total = length(Sample)
)

# Filter TEs to those in an active regulatory state in no more than 2 samples
replicates_exclusive = merge(
  chromHMM_active_unique,
  replicates_total[which(replicates_total$Total <= 2), ],
  by = TE_coordinates
)

# Calculate the number of samples the TE is in an active regulatory state, for each sample pair
replicates_restricted = apply(replicates, 1, function(pair) {
  # pair[1:2] holds the two sample identifiers of the current row
  in_pair = which(replicates_exclusive$Sample %in% as.vector(pair[1:2]))
  ddply(replicates_exclusive[in_pair, ],
        .(chromosome, start, stop, subfamily, family, class, strand, Total),
        summarise,
        Samples = length(Sample))
})
names(replicates_restricted) = replicates$Pair
replicates_restricted = ldply(replicates_restricted, .id = "Pair")

# Restrict the dataframe to TEs exclusive to each sample pair:
# every sample in which the TE is active belongs to the pair
replicates_restricted = replicates_restricted[
  which(replicates_restricted$Samples == replicates_restricted$Total),
]
```

Generates a dataframe of windows that are in an active state in each sample pair, but not outside the pair. 

```{r replicates restricted window, cache=TRUE, cache.lazy=FALSE}
# Windows in any active regulatory state in each sample (one row per window x sample)
windows_active_unique = unique(windows_active[, c("chr", "start", "stop", "Sample")])

# Total number of samples (out of 127) the window is in an active regulatory state
window_replicates_total = ddply(
  windows_active_unique,
  .(chr, start, stop),
  summarise,
  Total = length(Sample)
)

# Filter windows to those in an active regulatory state in no more than 2 samples
window_replicates_exclusive = merge(
  windows_active_unique,
  window_replicates_total[which(window_replicates_total$Total <= 2), ],
  by = c("chr", "start", "stop")
)

# Calculate the number of samples the window is in an active regulatory state, for each sample pair
window_replicates_restricted = apply(replicates, 1, function(pair) {
  # pair[1:2] holds the two sample identifiers of the current row
  in_pair = which(window_replicates_exclusive$Sample %in% as.vector(pair[1:2]))
  ddply(window_replicates_exclusive[in_pair, ],
        .(chr, start, stop, Total),
        summarise,
        Samples = length(Sample))
})
names(window_replicates_restricted) = replicates$Pair
window_replicates_restricted = ldply(window_replicates_restricted, .id = "Pair")

# Restrict the dataframe to windows exclusive to each sample pair:
# every sample in which the window is active belongs to the pair
window_replicates_restricted = window_replicates_restricted[
  which(window_replicates_restricted$Samples == window_replicates_restricted$Total),
]
```

Filters the dataframes of the number of samples each TE/window is in an active regulatory state in each sample pair to TEs/windows that are not in any active regulatory state outside the sample pair. Calculates the fraction of TEs/windows that are annotated with an active regulatory state in one sample that are annotated with the same state in the other member of the pair. 

```{r replicates restricted count}
# Restrict TEs to those not found in an active regulatory state outside the pair
# (inner join on pair and TE coordinates)
chromHMM_replicates_active_restricted = merge(
  chromHMM_replicates_active,
  replicates_restricted[, c("Pair", TE_coordinates)],
  by = c("Pair", TE_coordinates)
)

# Fraction of TEs in either sample found in both
# Count TEs per pair by number of samples, then spread the counts into one column per value
chromHMM_replicates_restricted_count = dcast(
  ddply(chromHMM_replicates_active_restricted, .(Pair, Set, Samples), summarise,
        Count = length(Samples)),
  Pair + Set ~ Samples,
  value.var = "Count"
)
# Missing combinations mean no TEs were observed
chromHMM_replicates_restricted_count[is.na(chromHMM_replicates_restricted_count)] = 0
chromHMM_replicates_restricted_count$Fraction = with(
  chromHMM_replicates_restricted_count,
  `2` / (`1` + `2`)
)
```

```{r replicates restricted window count}
# Restrict windows to those not found in an active regulatory state outside the pair
# (inner join on pair and window coordinates)
window_replicates_active_restricted = merge(
  window_replicates_active,
  window_replicates_restricted[, c("Pair", "chr", "start", "stop")],
  by = c("Pair", "chr", "start", "stop")
)

# Fraction of windows in either sample found in both
# Count windows per pair by number of samples, then spread the counts into one column per value
window_replicates_restricted_count = dcast(
  ddply(window_replicates_active_restricted, .(Pair, Set, Samples), summarise,
        Count = length(Samples)),
  Pair + Set ~ Samples,
  value.var = "Count"
)
# Missing combinations mean no windows were observed
window_replicates_restricted_count[is.na(window_replicates_restricted_count)] = 0
window_replicates_restricted_count$Fraction = with(
  window_replicates_restricted_count,
  `2` / (`1` + `2`)
)
```

Combined TE and window matrices. Calculated the median fraction of TEs/windows annotated in both members of a pair for both replicates and inter-tissue pairs, as well as the ratio between the sample pair sets. Tested each feature for a significant difference in fraction between the sample pair sets. Also calculated the total number of TEs/windows in an active regulatory state in one sample or both, by sample pair category, with and without restriction that features only be active in the sample pair. 

```{r replicates restricted analysis}
# Combine TEs and windows, tagging each table with its feature type
chromHMM_replicates_restricted_count$Feature = rep("TE", nrow(chromHMM_replicates_restricted_count))
window_replicates_restricted_count$Feature = rep("Bin", nrow(window_replicates_restricted_count))
replicates_restricted_count = rbind(window_replicates_restricted_count,
                                    chromHMM_replicates_restricted_count)

# Median fraction annotated in both members of the pair, by feature (rows) and pair set (columns),
# plus the replicate/inter-tissue ratio
replicates_restricted_median = dcast(
  ddply(replicates_restricted_count, .(Feature, Set), summarise, Median = median(Fraction)),
  Feature ~ Set,
  value.var = "Median"
)
replicates_restricted_median$Ratio = with(replicates_restricted_median, Replicate / Tissue)
replicates_restricted_median

# Wilcoxon test between replicates and inter-tissue pairs, by feature
by(replicates_restricted_count,
   replicates_restricted_count$Feature,
   function(x) wilcox.test(Fraction ~ Set, data = x))

# Number of TEs/windows under consideration (in one sample or both, summed over pairs),
# with and without restriction to the sample pair
ddply(replicates_count, .(Feature, Set), function(x) sum(x[, c("1", "2")]))
ddply(replicates_restricted_count, .(Feature, Set), function(x) sum(x[, c("1", "2")]))
```

Downloaded epigenetic correlation between samples from the Roadmap Project. 

```{bash download correlation matrix, eval=FALSE}
# Downloads five Roadmap sample-by-sample correlation matrices (one .RData file per histone mark).
# Files are saved to the current directory under their remote names.
wget https://egg2.wustl.edu/roadmap/data/byDataType/celltype_clustering/correlation_matrices/cor_H3K4me1_7.RData
wget https://egg2.wustl.edu/roadmap/data/byDataType/celltype_clustering/correlation_matrices/cor_H3K4me3_1.RData 
wget https://egg2.wustl.edu/roadmap/data/byDataType/celltype_clustering/correlation_matrices/cor_H3K9me3_9.RData 
wget https://egg2.wustl.edu/roadmap/data/byDataType/celltype_clustering/correlation_matrices/cor_H3K27me3_13.RData 
wget https://egg2.wustl.edu/roadmap/data/byDataType/celltype_clustering/correlation_matrices/cor_H3K36me3_4.RData

## Output
#/bar/epehrsson/TE_landscape/raw_data/correlation_matrices/cor_H3K27me3_13.RData
#/bar/epehrsson/TE_landscape/raw_data/correlation_matrices/cor_H3K36me3_4.RData
#/bar/epehrsson/TE_landscape/raw_data/correlation_matrices/cor_H3K4me1_7.RData
#/bar/epehrsson/TE_landscape/raw_data/correlation_matrices/cor_H3K4me3_1.RData
#/bar/epehrsson/TE_landscape/raw_data/correlation_matrices/cor_H3K9me3_9.RData
```

Loads the sample correlation matrices as a list of dataframes, then reformats as a single dataframe. Identifies correlations where Sample 1 is the first member of one of the replicate pairs, then assigns a rank to each correlation between that sample and another sample. Assigns a category to each sample pair based on the pairs used above (replicate, inter-tissue, or other). 

```{r replicate matrix correlation}
# Load correlation matrices from Roadmap
# list.files() takes a regular expression, so the dot is escaped and the pattern anchored
# to match only files whose names end in ".RData"
cor_matrices_files = list.files(path="raw_data/correlation_matrices",pattern="\\.RData$", full.names = TRUE)
# Each .RData file is loaded and its objects collected into a named list
cor_marks = lapply(cor_matrices_files,function(x) mget(load(x)))
# Name each element by mark: strip the literal directory/"cor_" prefix and the ".RData" extension
names(cor_marks) = sub("\\.RData$","",sub("raw_data/correlation_matrices/cor_","",cor_matrices_files,fixed=TRUE))

# Reformat: long format (one row per sample pair) for the markcor matrix of each file, stacked by mark
cor_marks = lapply(cor_marks,function(x) {y = melt(as.matrix(x$markcor)); return(y)})
cor_marks = ldply(cor_marks,.id="Mark")
colnames(cor_marks)[2:4] = c("Sample1","Sample2","Correlation")

# Remove self correlations
cor_marks = cor_marks[which(cor_marks$Sample1 != cor_marks$Sample2),]

# Correlations where Sample 1 is in a replicate pair
cor_marks_replicates = cor_marks[which(cor_marks$Sample1 %in% as.vector(unique(replicates$`Sample 1`))),]

# Assign rank to each correlation, within each Sample 1 x mark combination
cor_marks_replicates = ddply(cor_marks_replicates,.(Sample1,Mark),transform,Rank=rank(Correlation))

# Identify replicate/different tissue pairs
# Left join: sample pairs absent from the replicates table get Set = NA
cor_marks_replicates = merge(cor_marks_replicates,replicates[,c("Sample 1","Sample 2","Set")],by.x=c("Sample1","Sample2"),by.y=c("Sample 1","Sample 2"),all.x=TRUE)
# Label unmatched pairs "Other" in the Set column only, so that a missing value in any other
# column (e.g. Correlation) is left as NA rather than overwritten with a string.
# Set is converted to character first so the assignment also works if it is a factor.
cor_marks_replicates$Set = as.character(cor_marks_replicates$Set)
cor_marks_replicates$Set[is.na(cor_marks_replicates$Set)] = "Other"
cor_marks_replicates$Set = factor(cor_marks_replicates$Set,levels=c("Replicate","Tissue","Other"))
```

Response figure. a. The fraction of TEs/windows in an active regulatory state in one member of the sample pair that are in the same state in the other member, for pseudo-replicates and inter-tissue sample pairs. b. With the restriction that TEs not be active outside the sample pair. c. The Pearson correlation between Sample 1 of each sample pair and all other samples, colored by pair category. Results are presented for H3K4me1 values within 7_Enh annotations and H3K4me3 values within 1_TssA annotations. 

```{r Figure R1, echo=FALSE}
# Panel a: fraction of features shared by both members of a pair, by feature type and pair set
a = ggplot(replicates_count, aes(x = Feature, y = Fraction, fill = Set)) +
  geom_boxplot() +
  theme(axis.title.x = element_blank()) +
  scale_x_discrete(labels = c("200bp windows", "TEs")) +
  scale_fill_manual(values = c("maroon4", "lightseagreen"), guide = FALSE) +
  ylab("Fraction of TEs in both samples / either sample") +
  ylim(0, 0.5)

# Panel b: same plot for features restricted to those active only within the pair
b = ggplot(replicates_restricted_count, aes(x = Feature, y = Fraction, fill = Set)) +
  geom_boxplot() +
  theme(axis.title.x = element_blank()) +
  scale_x_discrete(labels = c("200bp windows", "TEs")) +
  scale_fill_manual(values = c("maroon4", "lightseagreen"), guide = FALSE) +
  ylab("Fraction of TEs in both samples / either sample") +
  ylim(0, 0.5)

# Shared legend for panels a and b, extracted from a throwaway plot that keeps its fill guide
replicate_legend = get_legend(
  ggplot(replicates_count, aes(x = Feature, y = Fraction, fill = Set)) +
    geom_boxplot() +
    scale_fill_manual(
      labels = setNames(c("Replicates", "Different tissue"), c("Replicate", "Tissue")),
      values = c("maroon4", "lightseagreen"),
      name = "Pair type"
    )
)

# Panel c: correlation between Sample 1 and every other sample for two marks, colored by pair type
c = ggplot(
  cor_marks_replicates[which(cor_marks_replicates$Mark %in% c("H3K4me1_7", "H3K4me3_1")), ],
  aes(x = Sample1, y = Correlation, color = Set)
) +
  geom_jitter(width = 0.2) +
  facet_wrap(~Mark) +
  scale_color_manual(
    labels = setNames(c("Replicates", "Different tissue", "Other"), c("Replicate", "Tissue", "Other")),
    values = c("maroon4", "lightseagreen", "grey"),
    name = "Pair type"
  ) +
  xlab("Sample 1") +
  ylab("Pearson correlation")

# Top row: a, b, legend; bottom row: c spanning the full width
grid.arrange(
  a, b, replicate_legend, c,
  layout_matrix = rbind(c(1, 2, 3), c(4)),
  widths = c(0.4, 0.4, 0.2)
)
```

## Epigenetic state dynamics of TEs

For instances of a TE annotated with a chromHMM state in a sample, produces a matrix with the number of instances where the TE is also annotated with another chromHMM state in the same sample. 

```{bash chromHMM intra, eval=FALSE}
# State switching intra, chromHMM
# Arguments as passed: input TE x sample x state file, list of states, output matrix path,
# followed by two integers (presumably column indices read by the script - TODO confirm against state_sharing_intra2.py)
python ~/bin/TE_landscape/state_sharing_intra2.py chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt chromHMM/chromHMM_states.txt chromHMM/state_switching/rmsk_TEother_chromHMM_intra.txt 7 9

## Output
#/bar/epehrsson/TE_landscape/chromHMM/state_switching/rmsk_TEother_chromHMM_intra.txt
```

Produces a matrix with the total number of samples in which TEs are annotated with each chromHMM state (column), for TEs annotated with each chromHMM state in any sample (row).

```{bash chromHMM inter, eval=FALSE}
# State switching inter, chromHMM
# Arguments as passed: input TE x sample x state file, list of states, output matrix path,
# followed by two integers (presumably column indices read by the script - TODO confirm against state_sharing_inter.py)
python ~/bin/TE_landscape/state_sharing_inter.py chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt chromHMM/chromHMM_states.txt chromHMM/state_switching/rmsk_TEother_chromHMM_inter.txt 7 9

## Output
#/bar/epehrsson/TE_landscape/chromHMM/state_switching/rmsk_TEother_chromHMM_inter.txt
```

The above analysis, split by TE class.

```{bash chromHMM inter class, eval=FALSE}
# State switching inter, chromHMM, by class
# Runs the inter-sample script once per class name listed in features/TEs/class/TE_class.txt,
# reading chromHMM/chromHMM_summit_[class].txt and writing one matrix per class
while read line; do python ~/bin/TE_landscape/state_sharing_inter.py chromHMM/chromHMM_summit_$line\.txt chromHMM/chromHMM_states.txt chromHMM/state_switching/class/rmsk_TEother_chromHMM_inter_$line\.txt 7 9; done < features/TEs/class/TE_class.txt

# SVA and Other are run separately
# (presumably because they are not listed in TE_class.txt - TODO confirm)
python ~/bin/TE_landscape/state_sharing_inter.py chromHMM/chromHMM_summit_SVA.txt chromHMM/chromHMM_states.txt chromHMM/state_switching/class/rmsk_TEother_chromHMM_inter_SVA.txt 7 9

python ~/bin/TE_landscape/state_sharing_inter.py chromHMM/chromHMM_summit_Other.txt chromHMM/chromHMM_states.txt chromHMM/state_switching/class/rmsk_TEother_chromHMM_inter_Other.txt 7 9

## Output
#/bar/epehrsson/TE_landscape/chromHMM/state_switching/class/rmsk_TEother_chromHMM_inter_[class].txt [6 files]
```

Using the matrix of TE methylation level per sample, sorted and added methylation states. Then, produces a matrix with the total number of samples in which TEs are annotated with each methylation state (column), for TEs annotated with each methylation state in any sample (row). 

```{bash WGBS inter, eval=FALSE}
# Methylation level of each TE x sample (produced by WGBS_TE_avg_methylation.R; removed header)
#/bar/epehrsson/TE_landscape/WGBS/TE_WGBS_state.txt

# Methylation level of each TE x sample, sorted
# Sort keys: column 1 (version sort), columns 2 and 3 (numeric), then columns 4 and 8
sort -k1,1V -k2,2n -k3,3n -k4,4 -k8,8 WGBS/TE_WGBS_state.txt > WGBS/TE_WGBS_state_sorted.txt

# Adding methylation states
# Appends a state label based on column 9: "NA" -> Missing; < 0.3 -> Hypomethylated;
# > 0.7 -> Hypermethylated; 0.3 to 0.7 inclusive -> Intermediate
# NOTE(review): this command reads TE_WGBS_state_sorted.txt from the current directory and writes
# TE_WGBS_state_sorted (no extension), while the sort above and the output below use
# WGBS/TE_WGBS_state_sorted.txt - presumably it was run inside WGBS/ and the result renamed; confirm
awk -v OFS='\t' '{if($9 == "NA") print $0, "Missing"; else if ($9 < 0.3) print $0, "Hypomethylated"; else if ($9 > 0.7) print $0, "Hypermethylated"; else if (($9 <= 0.7) && ($9 >= 0.3)) print $0, "Intermediate";}' TE_WGBS_state_sorted.txt > TE_WGBS_state_sorted

## Output
#/bar/epehrsson/TE_landscape/WGBS/TE_WGBS_state_sorted.txt

# WGBS inter switching
# Same script and trailing arguments as the chromHMM inter-sample analysis, with the methylation states list
python ~/bin/TE_landscape/state_sharing_inter.py WGBS/TE_WGBS_state_sorted.txt WGBS/methylation_states.txt WGBS/TE_WGBS_state_inter.txt 7 9

## Output
#/bar/epehrsson/TE_landscape/WGBS/TE_WGBS_state_inter.txt
```

Split the matrix of TE methylation state per sample by class, then repeated the above analysis by class. 

```{bash WGBS inter class, eval=FALSE}
# Methylation level/state of each TE x sample, sorted, by class
# For each class name in TE_class.txt, keeps the rows whose column 5 equals that name
# (paths are relative to the directory the commands were run from)
while read line ; do awk -v OFS='\t' -v class=$line '{if($5 == class) print $0}' TE_WGBS_state_sorted.txt > $line\_WGBS_state_sorted.txt; done < ../features/TEs/class/TE_class.txt

# Rows whose column 5 is "Other" are written to the SVA file
# NOTE(review): presumably SVA elements carry the label "Other" in this column - confirm
awk -v OFS='\t' '{if($5 == "Other") print $0}' TE_WGBS_state_sorted.txt > SVA_WGBS_state_sorted.txt

# Rows whose column 5 is none of Other/LINE/SINE/DNA/LTR are written to the Other file
awk -v OFS='\t' '{if(($5 != "Other") && ($5 != "LINE") && ($5 != "SINE") && ($5 != "DNA") && ($5 != "LTR")) print $0}' TE_WGBS_state_sorted.txt > Other_WGBS_state_sorted.txt

## Output
#/bar/epehrsson/TE_landscape/WGBS/class/[class]_WGBS_state_sorted.txt [6 files]

# WGBS inter switching, by class
# Runs the inter-sample script on every per-class file; the output name is the input name plus "_inter.txt".
# The trailing & runs the whole loop in the background.
for file in WGBS/class/*WGBS_state_sorted.txt; do python ~/bin/TE_landscape/state_sharing_inter.py $file WGBS/methylation_states.txt $file\_inter.txt 7 9; done &

## Output
#/bar/epehrsson/TE_landscape/WGBS/class/[class]_WGBS_state_sorted.txt_inter.txt [6 files]
```

### Figure 3

a. Histogram of the number of unique chromHMM states each TE is annotated with across all samples, by class. b. Histogram of the number of unique methylation states each TE is annotated with across all samples, by class. c. Average proportion of samples a TE is annotated with each chromHMM state (column), for those ever annotated with each state (row). d. Average proportion of samples a TE is annotated with each methylation state (column), for those ever annotated with each state (row).

```{r Figure 3, echo=FALSE, fig.width=7, fig.height=6}
# Runs the external state-switching script before plotting
source("R_scripts/state_switching.R")

# chromHMM states
# Histogram (density scale) of the number of chromHMM states per TE, one facet per class
a = ggplot(chromHMM_TE_state, aes(x = States, y = ..density.., fill = class_update)) +
  geom_histogram(binwidth = 1, color = "grey") +
  xlab("Number of chromHMM states") +
  ylab("Proportion of TEs") +
  scale_fill_manual(values = class_colors, guide = FALSE) +
  facet_wrap(~class_update)

# WGBS states
# Histogram (density scale) of the number of methylation states per TE, one facet per class
b = ggplot(TE_meth_average, aes(x = States, y = ..density.., fill = class_update)) +
  geom_histogram(binwidth = 1, color = "grey") +
  xlab("Number of methylation states") +
  ylab("Proportion of TEs") +
  scale_fill_manual(values = class_colors, guide = FALSE) +
  facet_wrap(~class_update)

# State switching inter, chromHMM
# Heatmap of the matrix values divided by the chromHMM sample count
c = ggplot(
  melt(as.matrix(state_switching_inter)),
  aes(x = Var2, y = Var1, fill = value / sample_counts["All", "chromHMM"])
) +
  geom_tile() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  scale_x_discrete(labels = chromHMM_states) +
  scale_y_discrete(limits = rev(chromHMM_states)) +
  scale_fill_gradient(low = "white", high = "darkmagenta", limits = c(0, 1), guide = FALSE) +
  coord_equal() +
  xlab("State 2") +
  ylab("State 1")

# State switching inter, WGBS
# Heatmap of the matrix values divided by the WGBS sample count
d = ggplot(
  melt(as.matrix(ss_inter_meth)),
  aes(x = Var2, y = Var1, fill = value / sample_counts["All", "WGBS"])
) +
  geom_tile() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  scale_y_discrete(limits = rev(meth_states)) +
  scale_fill_gradient(low = "white", high = "darkmagenta", limits = c(0, 1), guide = FALSE) +
  coord_equal() +
  xlab("State 2") +
  ylab("State 1")

# Shared color scale for panels c and d, extracted from a throwaway plot that keeps its guide
scale_switching = get_legend(
  ggplot(
    melt(as.matrix(ss_inter_meth)),
    aes(x = Var2, y = Var1, fill = value / sample_counts["All", "WGBS"])
  ) +
    geom_tile() +
    theme(legend.position = "bottom", legend.direction = "horizontal") +
    scale_fill_gradient(
      low = "white", high = "darkmagenta", limits = c(0, 1),
      name = "Average proportion\nof samples in state"
    )
)

# Rows: histograms (a, b), heatmaps (c, d), color scale spanning both columns
grid.arrange(
  a, b, c, d, scale_switching,
  nrow = 3,
  layout_matrix = rbind(c(1, 2), c(3, 4), c(5, 5)),
  heights = c(0.3, 0.6, 0.1)
)
```

```{r Figure 3 source data}
# Source data for Figure 3, one tab-delimited, unquoted file per panel

# Panels a and b: number of states and class per TE, written without row names
write.table(
  chromHMM_TE_state[, c("States", "class_update")],
  file = "source_data/Figure_3a.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
write.table(
  TE_meth_average[, c("States", "class_update")],
  file = "source_data/Figure_3b.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)

# Panels c and d: state-by-state matrices, written with their row names
write.table(
  state_switching_inter,
  file = "source_data/Figure_3c.txt",
  sep = "\t", quote = FALSE
)
write.table(
  ss_inter_meth,
  file = "source_data/Figure_3d.txt",
  sep = "\t", quote = FALSE
)
```

### Total number of states per TE

For TEs ever in each chromHMM/methylation state, the total number of states they are annotated with across all samples. 

```{r state dynamics}
# chromHMM: for each state, the median of the States column among the TEs
# whose value in that state's column is greater than zero
chromHMM_ever_total <- ldply(chromHMM_states, function(state) {
  ever_in_state <- which(chromHMM_TE_state[[state]] > 0)
  median(chromHMM_TE_state$States[ever_in_state])
})
chromHMM_ever_total$State <- chromHMM_states
chromHMM_ever_total

# chromHMM: long table with one row per TE x state, keeping States as the id
# column and retaining only the rows whose per-state value is greater than zero
chromHMM_dynamics <- melt(chromHMM_TE_state[, c(chromHMM_states, "States")], id.vars = "States")
chromHMM_dynamics <- chromHMM_dynamics[which(chromHMM_dynamics$value > 0), ]

# Methylation: same per-state median, computed from TE_meth_average
meth_ever_total <- ldply(meth_states, function(state) {
  ever_in_state <- which(TE_meth_average[[state]] > 0)
  median(TE_meth_average$States[ever_in_state])
})
meth_ever_total$State <- meth_states
meth_ever_total

# Methylation: same long table, built from TE_meth_average
meth_dynamics <- melt(TE_meth_average[, c(meth_states, "States")], id.vars = "States")
meth_dynamics <- meth_dynamics[which(meth_dynamics$value > 0), ]
```

### Supplementary Figure 8

a. Density plots of the number of chromHMM states each TE is annotated with across all samples, for those ever annotated with each state. b. Density plots of the number of methylation states each TE is annotated with across all samples, for those ever annotated with each state. c. Histogram of the maximum number of chromHMM states annotating each TE in a single sample, for those overlapping the center of 200bp bins only, by class. d. For all instances of a TE x sample annotated with a chromHMM state (row), the proportion of instances in which it is also annotated with another state (column). e. Average proportion of samples a TE is annotated with each methylation state (column), for those ever annotated with each state (row), by class.

```{r Figure S8, echo=FALSE}
# State dynamics
# Panel a: scaled density of the States column of chromHMM_dynamics, one curve per state
a <- ggplot(chromHMM_dynamics, aes(x = States, y = ..scaled.., fill = forcats::fct_rev(variable))) +
  geom_density(adjust = 10, alpha = 0.5) +
  scale_fill_manual(values = chromHMM_colors, name = "State", limits = chromHMM_states) +
  xlim(0, 15) +
  ylab("Density") +
  theme(legend.position = "bottom",
        legend.margin = margin(0, 0, 0, 0),
        legend.key.size = unit(3, 'mm')) +
  guides(fill = guide_legend(nrow = 4, title.position = "top")) +
  xlab("Number of chromHMM states")

# Panel b: the same density plot for meth_dynamics
b <- ggplot(meth_dynamics, aes(x = States, y = ..scaled.., fill = forcats::fct_rev(variable))) +
  geom_density(adjust = 10, alpha = 0.8) +
  scale_fill_manual(values = meth_colors, name = "State", limits = meth_states, labels = meth_labels) +
  ylab("Density") +
  theme(legend.position = "bottom",
        legend.margin = margin(0, 0, 0, 0),
        legend.key.size = unit(3, 'mm')) +
  guides(fill = guide_legend(nrow = 4, title.position = "top")) +
  xlab("Number of methylation states")

# chromHMM states intra
# Panel c: histogram of Max_states_intra for rows with Category "summit", by class
c <- ggplot(chromHMM_TE_state[which(chromHMM_TE_state$Category == "summit"), ],
            aes(x = Max_states_intra, y = ..density.., fill = class_update)) +
  geom_histogram(binwidth = 1, color = "grey") +
  ylab("Proportion of TEs") +
  xlab("Number of chromHMM states") +
  facet_wrap(~class_update) +
  # guide = "none" hides the fill legend; it replaces the deprecated guide = FALSE
  scale_fill_manual(values = class_colors, guide = "none") +
  theme(legend.position = "bottom")

# State switching intra, chromHMM
# Panel d: heatmap of state_switching_intra; the y axis is drawn in reversed level order
d <- ggplot(melt(as.matrix(state_switching_intra)), aes(x = Var2, y = Var1, fill = value)) +
  geom_tile() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        axis.title.y = element_text(margin = margin(t = -10))) +
  scale_x_discrete(labels = chromHMM_states) +
  scale_y_discrete(labels = rev(chromHMM_states),
                   limits = rev(levels(melt(as.matrix(state_switching_intra))$Var1))) +
  scale_fill_gradient(low = "white", high = "darkblue", limits = c(0, 1),
                      name = "Proportion of\nTE instances") +
  coord_equal() +
  xlab("State 2") +
  ylab("State 1")

# State switching inter, WGBS, by class
# Panel e: heatmap of ss_inter_meth_class divided by the "All" WGBS sample count, one facet per class
e <- ggplot(melt(ss_inter_meth_class, id.vars = c("Class", "State")),
            aes(x = variable, y = State, fill = value / sample_counts["All", "WGBS"])) +
  geom_tile() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  scale_y_discrete(limits = rev(meth_states)) +
  scale_fill_gradient(low = "white", high = "darkmagenta", limits = c(0, 1), guide = "none") +
  facet_wrap(~Class, nrow = 1) +
  coord_equal() +
  xlab("State 2") +
  ylab("State 1")

# scale_switching is not created in this chunk; it must already exist in the session.
# rbind() recycles c(5) and c(6) so panels e and the legend each span both columns
grid.arrange(a, b, c, d, e, scale_switching,
             nrow = 4,
             layout_matrix = rbind(c(1, 2), c(3, 4), c(5), c(6)),
             heights = c(0.35, 0.3, 0.3, 0.05),
             widths = c(0.55, 0.45))
```

```{r Figure S8 source data}
# Source data for Supplementary Figure 8: one tab-delimited, unquoted file per panel.
# Panels a and b: States and variable columns of the long per-state tables
write.table(
  chromHMM_dynamics[, c("States", "variable")],
  file = "source_data/Figure_S8a.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
write.table(
  meth_dynamics[, c("States", "variable")],
  file = "source_data/Figure_S8b.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
# Panel c: rows with Category "summit" only
write.table(
  chromHMM_TE_state[which(chromHMM_TE_state$Category == "summit"), c("Max_states_intra", "class_update")],
  file = "source_data/Figure_S8c.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
# Panel d: written with its row names
write.table(state_switching_intra, file = "source_data/Figure_S8d.txt", sep = '\t', quote = FALSE)
# Panel e: melted by Class and State, columns reordered for output
write.table(
  melt(ss_inter_meth_class, id.vars = c("Class", "State"))[, c("variable", "State", "value", "Class")],
  file = "source_data/Figure_S8e.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
```

### State switching analysis

```{r state switching analysis}
# Average proportion of samples a TE is annotated with each chromHMM state, for those ever
# annotated with each state: state_switching_inter divided by the "All" chromHMM sample count
state_switching_inter / sample_counts["All", "chromHMM"]

# TEs ever in the poised promoter/promoter flanking states: dimensions (rows, then columns)
# of the subset with a value greater than zero in either the 10_TssBiv or 11_BivFlnk column
dim(chromHMM_TE_state[with(chromHMM_TE_state, which(`10_TssBiv` > 0 | `11_BivFlnk` > 0)), ])
```

### Number of states per TE, overall and by class, chromHMM and WGBS

```{r states overall}
# chromHMM: total number of states per TE across all samples.
# Frequency table, then summary statistics overall and by TE class
table(chromHMM_TE_state$States)
ddply(chromHMM_TE_state, .(), summarise,
      Mean = mean(States), SD = sd(States), Median = median(States), IQR = IQR(States))
ddply(chromHMM_TE_state, .(class_update), summarise,
      Mean = mean(States), SD = sd(States), Median = median(States), IQR = IQR(States))

# TEs whose States value is 15, i.e. annotated with all 15 chromHMM states across samples
states_outlier <- chromHMM_TE_state[which(chromHMM_TE_state$States == 15), ]

## For those TEs: median length (stop - start), median of Tissues divided by 127,
## and median of the maximum number of states in a single sample
## NOTE(review): 127 is presumably the number of chromHMM samples - confirm
ddply(states_outlier, .(), summarise,
      Length = median(stop - start),
      States_intra = median(Tissues) / 127,
      Max_states_intra = median(Max_states_intra))

## Overlap with genic features: left join of the cohorts columns of rmsk_TE on the TE coordinates
states_outlier <- merge(states_outlier,
                        rmsk_TE[, c(TE_coordinates, cohorts)],
                        by = TE_coordinates, all.x = TRUE)
states_outlier

## Write out, including overlap with genic features
write.table(
  states_outlier[, c(TE_coordinates, "class_update", cohorts)],
  file = "States_15_TEs.bed",
  sep = '\t', row.names = FALSE, quote = FALSE
)

# Methylation: total number of states per TE across all samples.
# Frequency table, then summary statistics overall and by TE class
table(TE_meth_average$States)
ddply(TE_meth_average, .(), summarise,
      Mean = mean(States), SD = sd(States), Median = median(States), IQR = IQR(States))
ddply(TE_meth_average, .(class_update), summarise,
      Mean = mean(States), SD = sd(States), Median = median(States), IQR = IQR(States))

# Spearman correlation of TE length with total number of chromHMM/methylation states across samples
cor.test((chromHMM_TE_state$stop - chromHMM_TE_state$start),
         chromHMM_TE_state$States,
         method = "spearman")
cor.test((TE_meth_average$stop - TE_meth_average$start),
         TE_meth_average$States,
         method = "spearman")

# Spearman correlation of the total number of chromHMM states across samples with the
# maximum number of chromHMM states in a single sample, computed separately per Category
## Only relevant for those overlapping the center of a 200bp bin ("Summit")
by(chromHMM_TE_state,
   chromHMM_TE_state$Category,
   function(x) cor.test(x$Max_states_intra, x$States, method = "spearman"))

# Median of both quantities for each chromosome x Category combination
## Only relevant for those overlapping the center of a 200bp bin ("Summit")
ddply(chromHMM_TE_state, .(chromosome, Category), summarise,
      Max_states_intra = median(Max_states_intra),
      States = median(States))
```

### Number of states per TE x sample

Found the number of chromHMM states with which each instance of a TE in a sample is annotated, for TEs overlapping the center of a 200bp window only. Produces a count matrix.

```{bash chromHMM states per TE, eval=FALSE}
# Count the number of chromHMM states per TE x sample instance
# (first path: sorted summit matrix read by the script; second path: count table it writes)
# NOTE(review): the meaning of the trailing "7 9" arguments is defined by the script - confirm
python ~/bin/TE_landscape/state_sharing_intra_lite.py \
  chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt \
  chromHMM/rmsk_TEother_chromHMM_summit_state_counts.txt \
  7 9

## Output
#/bar/epehrsson/TE_landscape/chromHMM/rmsk_TEother_chromHMM_summit_state_counts.txt
```

For each TE, counted the number of CpGs overlapping the TE in each methylation state per sample. Then, counted the number of CpG methylation states overlapping each instance of a TE in a sample. 

```{bash WGBS states per TE, eval=FALSE}
# Step 1: per sample, count the CpGs in each methylation state overlapping every TE
python ~/bin/TE_landscape/count_CpG_state.py \
  TE_CpG_Meth_new.bed \
  TE_CpG_count.txt \
  ../sample_lists/WGBS_samples.txt \
  TE_CpG_Meth_state.txt

## Output
#/bar/epehrsson/TE_landscape/WGBS/TE_CpG_Meth_state.txt

# Step 2: from the step 1 output, count TE x sample instances with CpGs in more than one state
python ~/bin/TE_landscape/shared_meth_CpG.py \
  WGBS/TE_CpG_Meth_state.txt \
  WGBS/TE_CpG_state_counts.txt

## Output
#/bar/epehrsson/TE_landscape/WGBS/TE_CpG_state_counts.txt
```

```{r states per TE-sample}
# Frequency of TE annotation with multiple chromHMM states in a single sample
state_switching_intra

# Load count table of the number of chromHMM states with which each TE x sample instance is annotated
## For those overlapping the center of 200bp windows only
test <- read.table("chromHMM/rmsk_TEother_chromHMM_summit_state_counts.txt",
                   sep = '\t', col.names = c("States", "Total"))

## Remove erroneous 15_Quies entry generated by python script (hardcode):
## the count in the first row is reduced by one
test[1, 2] <- test[1, 2] - 1

## Proportion of TE x sample instances at each number of chromHMM states
test$Proportion <- test$Total / sum(test$Total)
test

## Proportion of TE x sample instances with >1 state annotated with only 2 states
## The denominator sums every row after the first instead of a fixed 2:15 range,
## so it is not NA when the table has fewer than 15 rows.
## Still positional: assumes row 1 holds the 1-state count and row 2 the 2-state count
test$Total[2] / sum(test$Total[-1])

# Table of the maximum number of chromHMM states each TE is annotated with in a single sample
## For those overlapping the center of 200bp windows only
table(chromHMM_TE_state[, c("Max_states_intra", "Category")])

# TEs annotated with >7 chromHMM states in a single sample
max_intra_outlier <- chromHMM_TE_state[which(chromHMM_TE_state$Max_states_intra > 7), ]

## For those TEs, median length (stop - start) and median of Tissues divided by 127
## NOTE(review): 127 is presumably the number of chromHMM samples - confirm
ddply(max_intra_outlier, .(), summarise,
      Length = median(stop - start),
      Tissues = median(Tissues) / 127)

## Overlap with genic features: left join of the cohorts columns of rmsk_TE on the TE coordinates
max_intra_outlier <- merge(max_intra_outlier,
                           rmsk_TE[, c(TE_coordinates, cohorts)],
                           by = TE_coordinates, all.x = TRUE)
max_intra_outlier

## Write out outliers, including overlap with genic features
write.table(max_intra_outlier[, c(TE_coordinates, "class_update", cohorts)],
            file = "States_intra_8_TEs.bed",
            sep = '\t', row.names = FALSE, quote = FALSE)

# Proportion of TEs overlapping 200bp bin centers
table(chromHMM_TE_state$Category) / NUM_TE

# Number of TEs overlapping 200bp bin centers, by class
table(chromHMM_TE_state[which(chromHMM_TE_state$Category == "summit"), ]$class_update)

# Statistics on the maximum number of chromHMM states each TE is annotated with in a single sample,
# overall and by class. The IQR column is named explicitly, matching the other summary tables
ddply(chromHMM_TE_state, .(Category), summarise,
      Mean = mean(Max_states_intra),
      SD = sd(Max_states_intra),
      Median = median(Max_states_intra),
      IQR = IQR(Max_states_intra))
ddply(chromHMM_TE_state, .(class_update, Category), summarise,
      Count = length(Max_states_intra),
      Mean = mean(Max_states_intra),
      SD = sd(Max_states_intra),
      Median = median(Max_states_intra),
      IQR = IQR(Max_states_intra))

# Number/proportion of TEs with a maximum number of chromHMM states in a single sample >1
# (among rows with Category "summit")
nrow(chromHMM_TE_state[which(chromHMM_TE_state$Category == "summit" & chromHMM_TE_state$Max_states_intra > 1), ])
nrow(chromHMM_TE_state[which(chromHMM_TE_state$Category == "summit" & chromHMM_TE_state$Max_states_intra > 1), ]) /
  nrow(chromHMM_TE_state[which(chromHMM_TE_state$Category == "summit"), ])

# Spearman correlation between TE length and maximum number of chromHMM states in a single sample,
# computed separately per Category
by(data = chromHMM_TE_state,
   chromHMM_TE_state$Category,
   function(x) cor.test(x$stop - x$start, x$Max_states_intra, method = "spearman"))

# Count table of the number of CpG methylation states overlapping each TE x sample instance
## For TEs overlapping CpGs only
test <- read.table("WGBS/TE_CpG_state_counts.txt", sep = '\t')

## Proportion of TE x sample instances at each number of CpG methylation states
test$Proportion <- test$V2 / sum(test$V2)
test

## Proportion of TE x sample instances with >1 state
## NOTE(review): positional - assumes the table has exactly four rows with the 1-state count first; confirm
sum(test$Proportion[2:4])
```

### DNase/H3K27ac peak analysis

TEs are considered "in" the DHS or H3K27ac states if they overlap the summit of at least one peak. The below code provides analysis of the number of peaks per TE. 

```{r peak analysis, cache=TRUE, cache.lazy=FALSE}
# DHS: frequency table of the per-TE peak counts pooled over columns 8 to 60 of TE_DNase_peaks
d <- table(unlist(TE_DNase_peaks[, 8:60]))
d

# Proportion of TE x sample instances overlapping a peak that overlap only 1 peak:
# second table entry divided by the sum of every entry but the first
# NOTE(review): positional - presumably the first entry is the zero-peak count; confirm
d[2] / sum(d[-1])

# H3K27ac: the same table over columns 8 to 105 of TE_H3K27ac_peaks
e <- table(unlist(TE_H3K27ac_peaks[, 8:105]))
e

# Proportion of TE x sample instances overlapping a peak that overlap only 1 peak
e[2] / sum(e[-1])
```

### Supplementary Figure 9 

For each iteration of shuffled TEs, produces a matrix with the total number of samples in which TEs are annotated with each chromHMM state (column), for TEs annotated with each chromHMM state in any sample (row). Also produces a matrix with the total number of samples in which TEs are annotated with each methylation state (column), for TEs annotated with each methylation state in any sample (row). The methylation matrix also contains a column listing the number of TEs ever in each methylation state. 

```{bash shuffled dynamics, eval=FALSE}
# Inter state switching for shuffled TEs, one script run per shuffle iteration (1 to 10)
## chromHMM (the whole loop is sent to the background)
for i in {1..10}
do
  python ~/bin/TE_landscape/state_sharing_inter.py \
    rmsk_TE_shuffle_${i}_chromHMM_sorted.txt \
    chromHMM_states.txt \
    rmsk_TE_shuffle_${i}_chromHMM_inter.txt \
    9 7
done &

## Output
#/bar/epehrsson/TE_landscape/chromHMM/shuffled_TEs/rmsk_TE_shuffle_#_chromHMM_inter.txt [10 files]

## WGBS (runs in the foreground)
for i in {1..10}
do
  python ~/bin/TE_landscape/state_sharing_inter_meth.py \
    WGBS/shuffled/rmsk_TE_shuffle_${i}_Meth_average.txt \
    WGBS/shuffled/TE_CpG_count_${i}.txt \
    WGBS/shuffled/TE_shuffle_${i}_inter.txt
done

## Output
#/bar/epehrsson/TE_landscape/WGBS/shuffled/TE_shuffle_#_inter.txt [10 files]
```

For 10 iterations of shuffled TEs, chromHMM and methylation states, creates matrices with: 1) the number of unique states each TE is annotated with across samples, 2) the average number of samples a TE is annotated with each state, for those ever annotated with each state, and 3) the median number of states each TE is annotated with across samples, for TEs ever annotated with each state.

```{r shuffled dynamics script, cache=TRUE, cache.lazy=FALSE}
# Run the shuffled-TE state-switching script in this session
# NOTE(review): presumably this creates the shuffled_* objects used by the following chunks - confirm
source(file = "R_scripts/shuffled_state_switching.R")
```

a. For each iteration of shuffled TEs, the average proportion of samples a TE is annotated with each chromHMM state (column), for those ever annotated with each state (row). b. For each iteration of shuffled TEs, the average proportion of samples a TE is annotated with each methylation state (column), for those ever annotated with each state (row).

```{r Figure S9, echo=FALSE}
# Facet labels for each iteration ("Iteration x"), named by iteration number ("1" to "10")
iteration_labels <- setNames(paste0("Iteration ", 1:10), 1:10)

# Panel a: per-iteration heatmaps of Mean_samples divided by the "All" chromHMM sample count
a <- ggplot(shuffled_chromHMM_inter,
            aes(x = State2, y = State1, fill = Mean_samples / sample_counts["All", "chromHMM"])) +
  geom_tile() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        legend.title = element_blank()) +
  scale_y_discrete(limits = rev(chromHMM_states)) +
  scale_x_discrete(labels = chromHMM_states) +
  # guide = "none" hides the colour bar; it replaces the deprecated guide = FALSE
  scale_fill_gradient(low = "white", high = "darkmagenta", limits = c(0, 1), guide = "none") +
  coord_equal() +
  facet_wrap(~Iteration, nrow = 2, labeller = labeller(Iteration = iteration_labels)) +
  xlab("State 2") +
  ylab("State 1")

# Panel b: the same heatmaps for methylation states; this panel keeps the colour bar (at the bottom)
b <- ggplot(shuffled_meth_inter,
            aes(x = State2, y = State1, fill = Mean_samples / sample_counts["All", "WGBS"])) +
  geom_tile() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        legend.position = "bottom") +
  scale_y_discrete(limits = rev(meth_states)) +
  scale_fill_gradient(name = "Average proportion\nof samples in state",
                      low = "white", high = "darkmagenta", limits = c(0, 1)) +
  coord_equal() +
  facet_wrap(~Iteration, nrow = 2, labeller = labeller(Iteration = iteration_labels)) +
  xlab("State 2") +
  ylab("State 1")

grid.arrange(a, b)
```

```{r Figure S9 source data}
# Source data for Supplementary Figure 9: tab-delimited, unquoted, without row names
# Panel a (chromHMM)
write.table(
  shuffled_chromHMM_inter[, c("State1", "State2", "Mean_samples", "Iteration", "Count.y")],
  file = "source_data/Figure_S9a.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
# Panel b (methylation)
write.table(
  shuffled_meth_inter[, c("State1", "State2", "Mean_samples", "Iteration", "Total")],
  file = "source_data/Figure_S9b.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
```

```{r shuffled dynamics analysis}
# chromHMM, shuffled TEs: total number of states per TE across all samples.
# Cross-table of States by Iteration, then median/IQR per iteration and per iteration x class
table(shuffled_chromHMM_states$States, shuffled_chromHMM_states$Iteration)
ddply(shuffled_chromHMM_states, .(Iteration), summarise,
      Median = median(States), IQR = IQR(States))
ddply(shuffled_chromHMM_states, .(Iteration, class), summarise,
      Median = median(States), IQR = IQR(States))

# Methylation, shuffled TEs: the same three summaries
table(shuffled_WGBS_states$States, shuffled_WGBS_states$Iteration)
ddply(shuffled_WGBS_states, .(Iteration), summarise,
      Median = median(States), IQR = IQR(States))
ddply(shuffled_WGBS_states, .(Iteration, class), summarise,
      Median = median(States), IQR = IQR(States))

# Average number of samples a TE is annotated with each chromHMM state,
# for those ever annotated with each state, each iteration
shuffled_chromHMM_inter

## Mean and SD across iterations of Mean_samples as a proportion of the "All"
## chromHMM sample count, for each state combination
ddply(shuffled_chromHMM_inter, .(State1, State2), summarise,
      Proportion = mean(Mean_samples / sample_counts["All", "chromHMM"]),
      SD = sd(Mean_samples / sample_counts["All", "chromHMM"]))

# Average number of samples a TE is annotated with each methylation state,
# for those ever annotated with each state, each iteration
shuffled_meth_inter

# Proportion of TEs ever in the poised promoter states, each iteration:
# rows with X10_TssBiv > 0 or X11_BivFlnk > 0, counted and divided by NUM_TE
ldply(shuffled_chromHMM_potential, function(iteration_TEs) {
  ever_poised <- which(iteration_TEs$X10_TssBiv > 0 | iteration_TEs$X11_BivFlnk > 0)
  length(ever_poised) / NUM_TE
})

# Median number of states each TE is annotated with across all samples,
# for TEs ever in each state, chromHMM, each iteration
shuffled_chromHMM_dynamics

# Same, methylation, each iteration
shuffled_meth_dynamics

# Drop the per-iteration tables from the session once the proportions are printed
rm(shuffled_chromHMM_potential)
```

### Supplementary Figure 10

Created a combined matrix of multiple epigenetic marks (chromHMM, WGBS, DNase, H3K27ac, and RNA states) for each TE in each sample. First, split each matrix of TE epigenetic states by sample. Then, merged the per-mark files into a single combined matrix per sample. From those matrices, counted the number of TE instances with each state combination. 

```{bash compare marks, eval=FALSE}
# Link to matrices of TE x sample x state
ln -s /bar/epehrsson/TE_landscape/chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt .
ln -s /bar/epehrsson/TE_landscape/WGBS/TE_WGBS_state_sorted.txt .
ln -s /bar/epehrsson/TE_landscape/DNase/true_summit/rmsk_TEother_DNase_summit.txt .
ln -s /bar/epehrsson/TE_landscape/H3K27ac/true_summit/rmsk_TEother_H3K27ac_summit.txt .
ln -s /bar/epehrsson/TE_landscape/RNAseq/rmsk_TE_rpkm.txt . # Produced by RNAseq.R

# Split each matrix by sample: every row is appended to <mark>/<value of column 8>
# awk does not create missing directories, so make sure the per-mark folders exist
mkdir -p chromHMM WGBS H3K27ac DNase RNA
# The redirection target is parenthesized: an unparenthesized concatenation
# after ">" is ambiguous and not portable across awk implementations
awk '{print > ("chromHMM/" $8)}' rmsk_TEother_chromHMM_summit_sorted.txt &
awk '{print > ("WGBS/" $8)}' TE_WGBS_state_sorted.txt &
awk '{print > ("H3K27ac/" $8)}' rmsk_TEother_H3K27ac_summit.txt &
awk '{print > ("DNase/" $8)}' rmsk_TEother_DNase_summit.txt &
awk '{print > ("RNA/" $8)}' rmsk_TE_rpkm.txt &
# The next step reads the split files, so wait for all background splits to finish
wait

# Input: TE states by sample, split into a folder for each metric
python ~/bin/TE_landscape/combine_marks_pandas.py ~/TE_landscape/sample_lists/mnemonics.txt &
# NOTE(review): the aggregation below presumably reads the combined per-sample
# files written by this step - confirm; wait so it does not start on partial output
wait

## Output (Single combined file per sample)
#/bar/epehrsson/TE_landscape/compare_marks/combined/E# [127 files]

# Count number of TEs x sample in each state combination (all, unique)
python ~/bin/TE_landscape/aggregate_marks_pandas.py ~/TE_landscape/sample_lists/mnemonics.txt &

## Output (hardcoded in script)
#/bar/epehrsson/TE_landscape/compare_marks/true_summit/combine_marks_counts.txt
#/bar/epehrsson/TE_landscape/compare_marks/true_summit/combine_marks_counts_unique.txt
```

Generates tables of the number of TE x sample instances with each combination of states, with and without chromHMM.

a. For each chromHMM state, the proportion of TE x sample instances in the state that are also annotated with each methylation state, overlap with DHS or H3K27ac peak summits, or RPKM > 1. b. By methylation state. c. By DHS peak summit overlap. d. By H3K27ac peak summit overlap. e. By RPKM > 1. 

```{r Figure S10, echo=FALSE}
# Run the compare-marks script in this session before plotting
# NOTE(review): presumably this creates the by_* tables and mark_labels used below - confirm
source("R_scripts/compare_marks.R")

# Fill palette shared by every panel and by the legend:
# chromHMM colours, methylation colours, and gold/darkgrey for "True"/"False"
figS10_fill_values <- c(chromHMM_colors, meth_colors,
                        setNames(c("gold", "darkgrey"), c("True", "False")))

# Panel a: stacked proportions with text labels, faceted by mark (rows) and chromHMM state (columns)
a <- ggplot(by_chromHMM, aes(x = 1, y = Proportion, fill = State)) +
  geom_bar(stat = "identity", position = "stack") +
  geom_text(aes(label = round(Proportion, 2)), position = position_stack(vjust = 0.5), size = 2.5) +
  facet_grid(Mark ~ chromHMM, labeller = labeller(Mark = mark_labels, chromHMM = label_wrap_gen())) +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        strip.text.x = element_text(size = 7),
        plot.title = element_text(size = 8)) +
  ylab("Proportion of TE by sample") +
  # guide = "none" hides the fill legend; it replaces the deprecated guide = FALSE
  scale_fill_manual(values = figS10_fill_values, guide = "none") +
  ggtitle("chromHMM state")

# Panel b: faceted by methylation state, with short column labels
b <- ggplot(by_WGBS, aes(x = 1, y = Proportion, fill = State)) +
  geom_bar(stat = "identity", position = "stack") +
  geom_text(aes(label = round(Proportion, 2)), position = position_stack(vjust = 0.5), size = 2.5) +
  facet_grid(Mark ~ WGBS,
             labeller = labeller(WGBS = setNames(c("Hypo", "Inter", "Hyper", "Missing"), meth_states),
                                 Mark = mark_labels)) +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        plot.title = element_text(size = 8),
        strip.text.x = element_text(size = 7)) +
  ylab("Proportion of TE by sample") +
  scale_fill_manual(values = figS10_fill_values, guide = "none") +
  ggtitle("WGBS state")

# Panels c-e share panel b's row, so their y-axis titles are blanked
# Panel c: faceted by DHS peak overlap
c <- ggplot(by_DNase, aes(x = 1, y = Proportion, fill = State)) +
  geom_bar(stat = "identity", position = "stack") +
  geom_text(aes(label = round(Proportion, 2)), position = position_stack(vjust = 0.5), size = 2.5) +
  facet_grid(Mark ~ DNase, labeller = labeller(Mark = mark_labels)) +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.y = element_blank(),
        plot.title = element_text(size = 8)) +
  scale_fill_manual(values = figS10_fill_values, guide = "none") +
  ggtitle("DHS peak overlap")

# Panel d: faceted by H3K27ac peak overlap
d <- ggplot(by_H3K27ac, aes(x = 1, y = Proportion, fill = State)) +
  geom_bar(stat = "identity", position = "stack") +
  geom_text(aes(label = round(Proportion, 2)), position = position_stack(vjust = 0.5), size = 2.5) +
  facet_grid(Mark ~ H3K27ac, labeller = labeller(Mark = mark_labels)) +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.y = element_blank(),
        plot.title = element_text(size = 8)) +
  scale_fill_manual(values = figS10_fill_values, guide = "none") +
  ggtitle("H3K27ac peak overlap")

# Panel e: faceted by the RNA column
e <- ggplot(by_RNA, aes(x = 1, y = Proportion, fill = State)) +
  geom_bar(stat = "identity", position = "stack") +
  geom_text(aes(label = round(Proportion, 2)), position = position_stack(vjust = 0.5), size = 2.5) +
  facet_grid(Mark ~ RNA, labeller = labeller(Mark = mark_labels)) +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.y = element_blank(),
        plot.title = element_text(size = 8)) +
  scale_fill_manual(values = figS10_fill_values, guide = "none") +
  ggtitle("RPKM > 1")

# Shared legend, taken from a throwaway plot of by_DNase that keeps its fill guide
legend_states <- get_legend(
  ggplot(by_DNase, aes(x = 1, y = Proportion, fill = State)) +
    geom_bar(stat = "identity", position = "stack") +
    theme(legend.position = "bottom", legend.key.size = unit(3, 'mm')) +
    scale_fill_manual(values = figS10_fill_values, labels = c(all_state_labels, "True", "False")) +
    guides(fill = guide_legend(ncol = 7, title.position = "top"))
)

# rbind() recycles c(1) and c(6), so panel a and the legend each span all four columns
grid.arrange(a, b, c, d, e, legend_states,
             nrow = 3,
             layout_matrix = rbind(c(1), c(2, 3, 4, 5), c(6)),
             widths = c(0.36, 0.21, 0.21, 0.22),
             heights = c(0.45, 0.45, 0.1))
```

```{r Figure S10 source data}
# Source data for Supplementary Figure 10: one tab-delimited, unquoted file per panel,
# without row names. Each file keeps the plotted columns plus TE_sample
write.table(
  by_chromHMM[, c("Proportion", "State", "Mark", "chromHMM", "TE_sample")],
  file = "source_data/Figure_S10a.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
write.table(
  by_WGBS[, c("Proportion", "State", "Mark", "WGBS", "TE_sample")],
  file = "source_data/Figure_S10b.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
write.table(
  by_DNase[, c("Proportion", "State", "Mark", "DNase", "TE_sample")],
  file = "source_data/Figure_S10c.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
write.table(
  by_H3K27ac[, c("Proportion", "State", "Mark", "H3K27ac", "TE_sample")],
  file = "source_data/Figure_S10d.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
write.table(
  by_RNA[, c("Proportion", "State", "Mark", "RNA", "TE_sample")],
  file = "source_data/Figure_S10e.txt",
  sep = '\t', row.names = FALSE, quote = FALSE
)
```

### Compare mark analysis 

```{r compare mark analysis}
# Number of samples (metadata rows) with non-missing entries for each pair of epigenetic marks
sum(!is.na(metadata$WGBS) & !is.na(metadata$DNase))
sum(!is.na(metadata$WGBS) & !is.na(metadata$H3K27ac))
sum(!is.na(metadata$WGBS) & !is.na(metadata$RNA))
sum(!is.na(metadata$DNase) & !is.na(metadata$H3K27ac))
sum(!is.na(metadata$DNase) & !is.na(metadata$RNA))
sum(!is.na(metadata$H3K27ac) & !is.na(metadata$RNA))
## Number of samples with non-missing WGBS, DNase, H3K27ac and RNA entries
## NOTE(review): chromHMM is not tested here - presumably available for every sample; confirm
sum(!is.na(metadata$WGBS) & !is.na(metadata$DNase) & !is.na(metadata$H3K27ac) & !is.na(metadata$RNA))

# Total TE x sample instances (562,520,596)
sum(compare_marks_unique$TE_sample)

# Total TE x sample instances x chromHMM annotations (570,405,028)
sum(compare_marks_all$TE_sample)

# Total TE x sample instances for each epigenetic mark
## From sample_counts: third row x NUM_TE plus (first row - third row) x NUM_TE_noY, per column
apply(sample_counts, 2, function(counts) {
  (counts[3] * NUM_TE) + ((counts[1] - counts[3]) * NUM_TE_noY)
})
## From compare_marks_unique: per column, TE_sample summed over the rows where the column is not NA
apply(compare_marks_unique, 2, function(column) {
  sum(compare_marks_unique[which(!is.na(column)), ]$TE_sample)
})

# Number/proportion of TE x sample instances annotated with each epigenetic state, by mark
ddply(WGBS_norm, .(), transform, Proportion = TE_sample / sum(TE_sample))
ddply(DNase_norm, .(), transform, Proportion = TE_sample / sum(TE_sample))
ddply(H3K27ac_norm, .(), transform, Proportion = TE_sample / sum(TE_sample))
ddply(RNA_norm, .(), transform, Proportion = TE_sample / sum(TE_sample))
## Number and proportion of TE x sample instances annotated with each chromHMM state
## (proportion is relative to the total of compare_marks_unique)
ddply(compare_marks_all, .(chromHMM), summarise,
      TE_sample = sum(TE_sample),
      Proportion = sum(TE_sample) / sum(compare_marks_unique$TE_sample))

# Tables by epigenetic mark, listing the number/proportion of TEs annotated with each state
# that are also annotated with another state generated from a different epigenetic mark
by_chromHMM
by_WGBS
by_DNase
by_H3K27ac
by_RNA

# Chi-squared test for each pair of epigenetic marks, Bonferroni-corrected p-values and residuals
# chromHMM comparisons can include multiple chromHMM states per TE x sample,
# so they use compare_marks_all; all other pairs use compare_marks_unique
table_list <- list(
  xtabs(TE_sample ~ chromHMM + WGBS, compare_marks_all),
  xtabs(TE_sample ~ chromHMM + DNase, compare_marks_all),
  xtabs(TE_sample ~ chromHMM + H3K27ac, compare_marks_all),
  xtabs(TE_sample ~ chromHMM + RNA, compare_marks_all),
  xtabs(TE_sample ~ WGBS + DNase, compare_marks_unique),
  xtabs(TE_sample ~ WGBS + H3K27ac, compare_marks_unique),
  xtabs(TE_sample ~ WGBS + RNA, compare_marks_unique),
  xtabs(TE_sample ~ DNase + H3K27ac, compare_marks_unique),
  xtabs(TE_sample ~ DNase + RNA, compare_marks_unique),
  xtabs(TE_sample ~ H3K27ac + RNA, compare_marks_unique)
)
chisq_list <- lapply(table_list, function(x) chisq.test(x))
## The ten p-values are adjusted together
p.adjust(vapply(chisq_list, function(res) res$p.value, numeric(1)), method = "bonf")
lapply(chisq_list, function(res) res$residuals)

# Proportion of TE x sample instances overlapping a DHS peak, an H3K27ac peak, or both
# For samples with data for both epigenetic marks
test1 <- ddply(
  compare_marks_unique[which(!is.na(compare_marks_unique$DNase) & !is.na(compare_marks_unique$H3K27ac)), ],
  .(DNase, H3K27ac), summarise,
  TE_sample = sum(TE_sample)
)
ddply(test1, .(), summarise,
      DHS = sum(TE_sample[which(DNase == "True")]) / sum(TE_sample),
      H3K = sum(TE_sample[which(H3K27ac == "True")]) / sum(TE_sample),
      Both = sum(TE_sample[which(DNase == "True" & H3K27ac == "True")]) / sum(TE_sample))

# Proportion of TE x sample instances in each chromHMM state also overlapping a DHS peak, H3K27ac peak, or both
# For samples with data for all three epigenetic marks
test1 <- ddply(
  compare_marks_all[which(!is.na(compare_marks_all$DNase) & !is.na(compare_marks_all$H3K27ac)), ],
  .(chromHMM, DNase, H3K27ac), summarise,
  TE_sample = sum(TE_sample)
)
test2 <- ddply(test1, .(chromHMM), summarise,
               DHS = sum(TE_sample[which(DNase == "True")]) / sum(TE_sample),
               H3K = sum(TE_sample[which(H3K27ac == "True")]) / sum(TE_sample),
               Both = sum(TE_sample[which(DNase == "True" & H3K27ac == "True")]) / sum(TE_sample))
## Sorted by the proportion overlapping both, ascending
test2[order(test2$Both), ]

# Proportion of TE x sample instances in each methylation state also overlapping a DHS peak, H3K27ac peak, or both
# For samples with data for all three epigenetic marks
test1 <- ddply(
  compare_marks_unique[which(!is.na(compare_marks_unique$DNase) & !is.na(compare_marks_unique$H3K27ac) & !is.na(compare_marks_unique$WGBS)), ],
  .(WGBS, DNase, H3K27ac), summarise,
  TE_sample = sum(TE_sample)
)
test2 <- ddply(test1, .(WGBS), summarise,
               DHS = sum(TE_sample[which(DNase == "True")]) / sum(TE_sample),
               H3K = sum(TE_sample[which(H3K27ac == "True")]) / sum(TE_sample),
               Both = sum(TE_sample[which(DNase == "True" & H3K27ac == "True")]) / sum(TE_sample))
## Sorted by the proportion overlapping both, ascending
test2[order(test2$Both), ]
```

## Differences in TE activity by tissue classification

### By-sample analysis

Variation in the contribution of TEs to each epigenetic state by sample classification.

```{r analysis Fig 4a, cache=TRUE, cache.lazy=FALSE}
# Statistics on the proportion of each state within TEs by sample
# One row per State: range (max - min), median, mean and coefficient of
# variation (sd / mean) of the Proportion column of by_sample_all
by_sample_stats = ddply(by_sample_all,~State,summarise, Range = max(Proportion)-min(Proportion), Median = median(Proportion), Mean = mean(Proportion), CV = sd(Proportion)/mean(Proportion))
## States ordered by range of proportion in TEs
by_sample_stats[order(by_sample_stats$Range),]
## Minimum and maximum proportion of each state in TEs
## (the full by_sample_all row holding the smallest / largest Proportion per State)
ddply(by_sample_all,~State,function(x) x[which.min(x$Proportion),])
ddply(by_sample_all,~State,function(x) x[which.max(x$Proportion),])

# Kruskal-Wallis test for proportion of each state in TEs, by sample classification
# Bonferroni-corrected within each state and category, not across categories
# Filtered to state x category combinations with corrected p-value < 0.05
# The p-value is pulled out of the unlist()-ed test result by its "p.value" name.
# NOTE(review): each p.adjust() call below receives the single p-value of one
# State x category test, and p.adjust() defaults to n = length(p), so the "bonf"
# adjustment multiplies by 1 and returns the p-value unchanged. Confirm whether a
# correction across states (or categories) was intended.
by_sample_KW = ddply(by_sample_all,~State,summarise,Group = p.adjust(unlist(kruskal.test(Proportion,Group))["p.value"],method="bonf"),
      Anatomy = p.adjust(unlist(kruskal.test(Proportion,Anatomy))["p.value"],method="bonf"),
      Type = p.adjust(unlist(kruskal.test(Proportion,Type))["p.value"],method="bonf"),
      Age = p.adjust(unlist(kruskal.test(Proportion,Age))["p.value"],method="bonf"),
      Germline = p.adjust(unlist(kruskal.test(Proportion,Germline))["p.value"],method="bonf"))
# Cancer is tested only on states not listed in meth_states; the left join
# (all.x=TRUE) leaves the Cancer p-value NA for the excluded states
by_sample_KW = merge(by_sample_KW,ddply(droplevels(by_sample_all[which(!(by_sample_all$State %in% meth_states)),]),~State,summarise,Cancer = p.adjust(unlist(kruskal.test(Proportion,Cancer))["p.value"],method="bonf")),by="State",all.x=TRUE)
# Long format: one row per State x Category with its p-value
by_sample_KW = melt(by_sample_KW,id.var="State",variable.name="Category",value.name="Pvalue")
# which() drops the NA p-values along with those >= 0.05
by_sample_KW[which(by_sample_KW$Pvalue < 0.05),]

# Permutation tests, one state at a time, for sample classifications over-represented among
# samples whose proportion of the state in TEs exceeds the total proportion across all samples
# (Contribution). The test itself is performed by permute_by_sample(), called here with "+".
permute_up <- ddply(
  by_sample_all[, c("State", "Sample", "Proportion", "Contribution")],
  .(State),
  function(y) permute_by_sample(y, "Proportion", "+", unique(y$Contribution), "Proportion", -1)
)
# Keep only groupings with at least one sample above the total proportion, then FDR-adjust
# across the rows that remain
permute_up <- permute_up[which(permute_up$Samples.x > 0), ]
permute_up$Padjust <- p.adjust(permute_up$Pvalue, method = "fdr")
## Print results with adjusted p-value < 0.1, Category "Group" only, selected (active) states only
permute_up[which(permute_up$Padjust < 0.1 &
                   permute_up$Category == "Group" &
                   permute_up$State %in% states[c(1:7, 16:17, 20:21)]), ]
## Save the full (unfiltered) table
write.table(permute_up, file = "sample_permute_up.txt", sep = '\t', row.names = FALSE, quote = FALSE)

# Same analysis in the opposite direction ("-"): classifications over-represented among samples
# whose proportion of the state in TEs is below the total proportion across all samples
permute_down <- ddply(
  by_sample_all[, c("State", "Sample", "Proportion", "Contribution")],
  .(State),
  function(y) permute_by_sample(y, "Proportion", "-", unique(y$Contribution), "Proportion", -1)
)
# Keep only groupings where Samples.y - Samples.x is positive (at least one sample below the
# total proportion, per the original notes), then FDR-adjust across the rows that remain
permute_down <- permute_down[which(permute_down$Samples.y - permute_down$Samples.x > 0), ]
permute_down$Padjust <- p.adjust(permute_down$Pvalue, method = "fdr")
## Print results with adjusted p-value < 0.1, Category "Group" only, selected (active) states only
permute_down[which(permute_down$Padjust < 0.1 &
                     permute_down$Category == "Group" &
                     permute_down$State %in% states[c(1:7, 16:17, 20:21)]), ]
## Save the full (unfiltered) table
write.table(permute_down, file = "sample_permute_down.txt", sep = '\t', row.names = FALSE, quote = FALSE)

# Average the proportion of each state in TEs within every State x Group combination
test <- aggregate(Proportion ~ State + Group, data = by_sample_all, FUN = mean)
## Print sorted by state, then by increasing mean proportion
test[order(test$State, test$Proportion), ]
```

### Figure 4

Finds the total RNA-seq read coverage and the length of the covered region for the entire genome, all TEs, and each TE class and subfamily. Each quantity is computed twice: across all chromosomes and over chrY only.

```{bash RNA coverage, eval=FALSE}
# Sort bed files
# Each merged TE file is sorted by field 1, then numerically by field 2 (chromosome and start
# in BED layout); the sorted copy is written under the same name without the .txt extension.
sort -k1,1 -k2,2n rmsk_TEother_merge.txt > rmsk_TEother_merge
sort -k1,1 -k2,2n TEother_subfamily_merge.txt > TEother_subfamily_merge
sort -k1,1 -k2,2n TEother_class_merge.txt > TEother_class_merge

# Project scripts, not shown in this document. Presumably they read the sorted files above and
# produce the coverage tables listed under "Output" (genome-wide and chrY-only) -- TODO confirm
intersect_RNA_features.sh
intersect_RNA_chrY.sh

## Output
#/bar/epehrsson/TE_landscape/RNAseq/features/Genome_average.txt
#/bar/epehrsson/TE_landscape/RNAseq/features/Genome_average_chrY.txt
#/bar/epehrsson/TE_landscape/RNAseq/features/TE_average.txt
#/bar/epehrsson/TE_landscape/RNAseq/features/TE_average_chrY.txt
#/bar/epehrsson/TE_landscape/RNAseq/features/TEother_class_merge_average.txt
#/bar/epehrsson/TE_landscape/RNAseq/features/TEother_class_merge_average_chrY.txt
#/bar/epehrsson/TE_landscape/RNAseq/features/TEother_subfamily_merge_average.txt
#/bar/epehrsson/TE_landscape/RNAseq/features/TEother_subfamily_merge_average_chrY.txt
```

Calculates the total and average RNA-seq read coverage over the entire genome, TEs, and each TE class and subfamily, by sample. In addition, calculates the ratio of the average read coverage over TEs compared to the entire genome and over each class compared to the entire genome, by sample.

a. Proportion of each state in TEs, by sample, colored by Group. Total proportion of state in TEs across all samples is indicated by the black bar. The proportion of the genome (bases/CpGs) in TEs is indicated by dashed lines. States where the Bonferroni-adjusted Kruskal-Wallis p-value is < 0.05 are indicated by red stars. b. Proportion of each state in each TE class, by sample, colored by Group. Total proportion of each state in each class across all samples is indicated by the black bar. The proportion of the genome (bases/CpGs) in each class is indicated by dashed lines. c. Ratio of the average RNA-seq read coverage across TEs vs. the entire genome, by sample, colored by Group. The median across all samples is indicated by the black bar. d. Ratio of the average RNA-seq read coverage over each TE class vs. the entire genome, by sample, colored by group. The median across all samples is indicated by the black bar.

```{r Figure 4, echo=FALSE}
# Run the RNA coverage script; RNA_proportion and class_RNA used in panels c and d are
# presumably created there -- TODO confirm
source("R_scripts/RNA_coverage.R")

# Star label for every State x Category row whose Kruskal-Wallis p-value is < 0.05
by_sample_KW$Stars <- ifelse(by_sample_KW$Pvalue < 0.05, "*", "")

# Legends are suppressed on all panels with guide = "none"; the logical form guide = FALSE
# is deprecated in current ggplot2 releases and draws a warning.

# Panel a: proportion of each state in TEs, one point per sample, colored by Group
a <- ggplot(by_sample_all, aes(x = State, y = Proportion, color = Group)) +
  geom_point(position = position_jitterdodge(dodge.width = 0.8), size = 1) +
  theme(panel.grid = element_blank(),
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        strip.background = element_rect(fill = NA, color = NA)) +
  ylab("Proportion of state in TEs") +
  scale_color_manual(values = group_colors, guide = "none") +
  # Zero-height error bar = horizontal black bar at the total proportion (Contribution)
  geom_errorbar(aes(ymax = Contribution, ymin = Contribution), position = position_dodge(), color = "black") +
  # Reference lines: share of genome bases in TEs (dashed) and share of CpGs in TEs (dotdash)
  geom_hline(yintercept = MERGED_TE_WIDTH / GENOME_WIDTH, linetype = "dashed") +
  geom_hline(yintercept = TE_CPGS / ALL_CPGS, linetype = "dotdash") +
  scale_y_continuous(limits = c(0, 0.85)) +
  scale_x_discrete(limits = states[c(1:15, 20:21, 16:19)], labels = all_state_labels) +
  # Grey separators after the 15th, 16th and 17th x positions
  geom_vline(xintercept = 15.5, linetype = "dashed", color = "grey") +
  geom_vline(xintercept = 16.5, linetype = "dashed", color = "grey") +
  geom_vline(xintercept = 17.5, linetype = "dashed", color = "grey") +
  # Red stars for the Group classification only
  geom_text(data = by_sample_KW[which(by_sample_KW$Category == "Group"), ],
            aes(x = State, y = 0.8, label = Stars), color = "red", size = 7) +
  # Single facet with an empty strip label
  facet_grid(rows = vars(""))

# Panel b: proportion of each state in each TE class, one facet row per class
b <- ggplot(by_sample_class, aes(x = State, y = Proportion_state, color = Group)) +
  geom_point(position = position_jitterdodge(dodge.width = 0.8), size = 1) +
  theme(panel.grid = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        axis.title.x = element_text(margin = margin(t = -10))) +
  ylab("Proportion of state in class") +
  scale_color_manual(values = group_colors, guide = "none") +
  scale_x_discrete(limits = states[c(1:15, 20:21, 16:19)], labels = all_state_labels) +
  geom_errorbar(aes(ymax = Contribution, ymin = Contribution), position = position_dodge(), color = "black") +
  facet_grid(rows = vars(class), scales = "free") +
  # Per-class reference lines; a single State x Sample slice is used so that each class
  # contributes exactly one Bases_class value per line
  geom_hline(data = by_sample_class[which(by_sample_class$State == "1_TssA" & by_sample_class$Sample == "E001"), ],
             aes(yintercept = Bases_class / GENOME_WIDTH), linetype = "dashed") +
  geom_hline(data = by_sample_class[which(by_sample_class$State == "Hypomethylated" & by_sample_class$Sample == "E003"), ],
             aes(yintercept = Bases_class / ALL_CPGS), linetype = "dotdash") +
  geom_vline(xintercept = 15.5, linetype = "dashed", color = "grey") +
  geom_vline(xintercept = 16.5, linetype = "dashed", color = "grey") +
  geom_vline(xintercept = 17.5, linetype = "dashed", color = "grey") +
  scale_y_continuous(limits = c(0, 0.6))

# Panel c: per-sample Ratio from RNA_proportion, with a black crossbar at the median.
# NOTE(review): fun.y / fun.ymin / fun.ymax are the pre-3.3.0 ggplot2 argument names (now
# fun / fun.min / fun.max); they are kept so the chunk still runs on older installs.
c <- ggplot(RNA_proportion, aes(x = 1, y = Ratio, color = Group)) +
  geom_point(position = position_jitterdodge(dodge.width = 0.5), size = 1) +
  theme(panel.grid = element_blank(),
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        strip.background = element_rect(fill = NA, color = NA)) +
  scale_color_manual(values = group_colors, guide = "none") +
  ylab("TE vs. Genome\nexpression") +
  scale_y_continuous(limits = c(0, 1)) +
  stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median, geom = "crossbar", color = "black") +
  facet_grid(~"")

# Panel d: per-sample Ratio from class_RNA, one facet per class_update value
d <- ggplot(class_RNA, aes(x = 1, y = Ratio, color = Group)) +
  geom_point(position = position_jitterdodge(dodge.width = 0.5), size = 1) +
  theme(panel.grid = element_blank(),
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  scale_color_manual(values = group_colors, guide = "none") +
  ylab("Class vs. Genome\nexpression") +
  ylim(0, 1) +
  facet_wrap(~class_update, nrow = 1) +
  stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median, geom = "crossbar", color = "black")

# Shared Group legend, extracted from a throw-away plot that keeps its color guide
legend <- get_legend(
  ggplot(by_sample_all, aes(x = State, y = Proportion, color = Group)) +
    geom_point(position = position_jitter(width = 0.3), size = 2) +
    scale_color_manual(values = group_colors) +
    theme(legend.margin = margin(0, 0, 0, 0), legend.key.size = unit(5, 'mm'))
)

# Layout: a and b each span the first two columns (rows 1 and 2), c and d share row 3,
# and the legend fills the third column of every row
grid.arrange(a, b, c, d, legend,
             nrow = 3, ncol = 3,
             layout_matrix = rbind(c(1, 1, 5), c(2, 2, 5), c(3, 4, 5)),
             heights = c(0.2, 0.55, 0.15),
             widths = c(0.2, 0.65, 0.15))
```

```{r Figure 4 source data}
# Export the data behind each Figure 4 panel as tab-separated text (no row names, no quoting)
## Panel a: per-sample points and total proportion per state
write.table(
  by_sample_all[, c("State", "Sample", "Proportion", "Group", "Contribution")],
  file = "source_data/Figure_4a_dots.txt", sep = '\t', row.names = FALSE, quote = FALSE
)
## Panel a: star labels for the Group classification
write.table(
  by_sample_KW[which(by_sample_KW$Category == "Group"), c("State", "Stars")],
  file = "source_data/Figure_4a_stars.txt", sep = '\t', row.names = FALSE, quote = FALSE
)
## Panel b: per-sample, per-class points
write.table(
  by_sample_class[, c("State", "Sample", "Proportion_state", "Group", "Contribution", "class", "Bases_class")],
  file = "source_data/Figure_4b.txt", sep = '\t', row.names = FALSE, quote = FALSE
)
## Panel c: TE vs. genome ratio per sample
write.table(
  RNA_proportion[, c("Ratio", "Group", "Sample")],
  file = "source_data/Figure_4c.txt", sep = '\t', row.names = FALSE, quote = FALSE
)
## Panel d: class vs. genome ratio per sample
write.table(
  class_RNA[, c("Ratio", "Group", "Sample", "class_update")],
  file = "source_data/Figure_4d.txt", sep = '\t', row.names = FALSE, quote = FALSE
)
```

### By sample and class analysis

Variation in the contribution of each TE class to each epigenetic state by sample classification.

```{r analysis Fig S11a, cache=TRUE, cache.lazy=FALSE}
# Per State x class summary of the by-sample proportion of the state in the TE class
# (rows containing any NA are dropped first): range, median, mean and CV (sd / mean)
by_sample_stats_class <- ddply(
  na.omit(by_sample_class), .(State, class), summarise,
  Range = max(Proportion_state) - min(Proportion_state),
  Median = median(Proportion_state),
  Mean = mean(Proportion_state),
  CV = sd(Proportion_state) / mean(Proportion_state)
)
## Print the State x class combinations sorted by increasing range, then again by increasing CV
by_sample_stats_class[order(by_sample_stats_class$Range), ]
by_sample_stats_class[order(by_sample_stats_class$CV), ]

# Kruskal-Wallis test of the per-sample proportion of each state in each TE class against
# sample Group, run separately for every State x class combination.
# NOTE(review): p.adjust() is handed a single p-value per combination and returns its input
# unchanged when there is only one comparison, so the "bonf" step looks like a no-op and the
# stored values appear to be unadjusted. Confirm the intended family of tests.
by_sample_KW_class <- ddply(
  by_sample_class, .(State, class), summarise,
  Group = p.adjust(unlist(kruskal.test(Proportion_state, Group))["p.value"], method = "bonf")
)
## Print the State x class combinations with p-value < 0.05
by_sample_KW_class[which(by_sample_KW_class$Group < 0.05), ]

# Permutation tests, one State x class combination at a time, for sample classifications
# over-represented among samples whose proportion of the state in the TE class exceeds the
# total proportion across all samples (Contribution). The test itself is performed by
# permute_by_sample(), called here with "+".
permute_up <- ddply(
  by_sample_class[, c("class", "State", "Sample", "Proportion_state", "Contribution")],
  .(State, class),
  function(y) permute_by_sample(y, "Proportion_state", "+", unique(y$Contribution), "Proportion_state", -1)
)
# Keep only groupings with at least one sample above the total proportion, then FDR-adjust
# across the rows that remain
permute_up <- permute_up[which(permute_up$Samples.x > 0), ]
permute_up$Padjust <- p.adjust(permute_up$Pvalue, method = "fdr")
## Print results with adjusted p-value < 0.1, Category "Group" only, selected (active) states only
permute_up[which(permute_up$Padjust < 0.1 &
                   permute_up$Category == "Group" &
                   permute_up$State %in% states[c(1:7, 16:17, 20:21)]), ]

# Same analysis in the opposite direction ("-"): classifications over-represented among samples
# whose proportion of the state in the TE class is below the total proportion across all samples
permute_down <- ddply(
  by_sample_class[, c("class", "State", "Sample", "Proportion_state", "Contribution")],
  .(State, class),
  function(y) permute_by_sample(y, "Proportion_state", "-", unique(y$Contribution), "Proportion_state", -1)
)
# Keep only groupings where Samples.y - Samples.x is positive (at least one sample below the
# total proportion, per the original notes), then FDR-adjust across the rows that remain
permute_down <- permute_down[which(permute_down$Samples.y - permute_down$Samples.x > 0), ]
permute_down$Padjust <- p.adjust(permute_down$Pvalue, method = "fdr")
## Print results with adjusted p-value < 0.1, Category "Group" only, selected (active) states only
permute_down[which(permute_down$Padjust < 0.1 &
                     permute_down$Category == "Group" &
                     permute_down$State %in% states[c(1:7, 16:17, 20:21)]), ]

# Mean proportion of each state in each TE class by Group (Proportion_state), alongside the
# total proportion of the state in the class across all samples (Contribution)
test <- ddply(
  by_sample_class, .(State, class, Group), summarise,
  Proportion_state = mean(Proportion_state),
  Contribution = unique(Contribution)
)
## Ordered by state, class, then mean proportion in the class.
## The sort column is spelled out in full: the summary has no column named `Proportion`, so the
## shorter name only resolved through `$` partial matching on the data frame.
test[order(test$State, test$class, test$Proportion_state), ]

# Count, for every State x class pair, the rows (samples) with a positive Enrichment value
# (log odds ratio), i.e. where the class is enriched over expectation for the state
table(by_sample_class[which(by_sample_class$Enrichment > 0), c("State", "class")])
## List the positive enrichments of the four major classes (DNA, LINE, SINE, LTR) in the
## selected (active) states, together with the sample classification columns
by_sample_class[
  which(by_sample_class$class %in% c("DNA", "LINE", "SINE", "LTR") &
          by_sample_class$State %in% states[c(1:3, 6:7, 16, 20:21)] &
          by_sample_class$Enrichment > 0),
  c("Sample", "class", "State", "Enrichment", sample_categories)
]
```

### DHS/H3K27ac peak length vs. summit analysis

DHS and H3K27ac contribution results using overlap with peak summits instead of total length of overlap with peaks. 

```{r peak analysis pt2}
# DHS: fraction of all peaks, pooled over samples, whose summit overlaps a TE
sum(DNase_stats$Summit_in_TE) / sum(DNase_stats$Peaks)

# DHS: summary across samples of the per-sample fraction of peak summits overlapping a TE
ddply(
  DNase_stats, .(), summarise,
  Median = median(Summit_in_TE / Peaks),
  Mean = mean(Summit_in_TE / Peaks),
  SD = sd(Summit_in_TE / Peaks),
  Min = min(Summit_in_TE / Peaks),
  Max = max(Summit_in_TE / Peaks)
)
## DHS: sample with the lowest fraction, then the sample with the highest
DNase_stats[which.min(DNase_stats$Summit_in_TE / DNase_stats$Peaks), ]
DNase_stats[which.max(DNase_stats$Summit_in_TE / DNase_stats$Peaks), ]

# H3K27ac: fraction of all peaks, pooled over samples, whose summit overlaps a TE
sum(H3K27ac_stats$Summit_in_TE) / sum(H3K27ac_stats$Peaks)

# H3K27ac: summary across samples of the per-sample fraction of peak summits overlapping a TE
ddply(
  H3K27ac_stats, .(), summarise,
  Median = median(Summit_in_TE / Peaks),
  Mean = mean(Summit_in_TE / Peaks),
  SD = sd(Summit_in_TE / Peaks),
  Min = min(Summit_in_TE / Peaks),
  Max = max(Summit_in_TE / Peaks)
)
## H3K27ac: sample with the lowest fraction, then the sample with the highest
H3K27ac_stats[which.min(H3K27ac_stats$Summit_in_TE / H3K27ac_stats$Peaks), ]
H3K27ac_stats[which.max(H3K27ac_stats$Summit_in_TE / H3K27ac_stats$Peaks), ]
```

### Expression analysis

```{r ratio expression}
# Per-sample ratio of average RNA-seq read coverage over TEs vs. the entire genome:
# overall range and median
range(RNA_proportion$Ratio)
median(RNA_proportion$Ratio)
## Median of the ratio within each sample Group
ddply(RNA_proportion, .(Group), summarise, Median = median(Ratio))

# Per-sample ratio of average read coverage over each TE class vs. the entire genome:
# median per class
ddply(class_RNA, .(class_update), summarise, Median = median(Ratio))
## Median per Group x class, shown as a table with one row per class and one column per Group
dcast(
  ddply(class_RNA, .(Group, class_update), summarise, Median = median(Ratio)),
  class_update ~ Group,
  value.var = "Median"
)
```

### Supplementary Figure 11

Proportion of each state in TEs, by sample, colored by alternative sample classifications. Total proportion of state in TEs across all samples is indicated by the black bar. The proportion of the genome (bases/CpGs) in TEs is indicated by dashed lines. States where the Bonferroni-adjusted Kruskal-Wallis p-value is < 0.05 are indicated by red stars. Legends for each of the sample classifications are included.

```{r Figure S11, echo=FALSE}
# Proportion of each state in TEs by sample, in long format: one row per sample x state x
# alternative classification (sample_categories[2:6]), with the classification name in
# Category and the sample's level in Group
by_sample_all_long <- melt(
  by_sample_all[, c("State", "Sample", "Proportion", "Contribution", sample_categories[2:6])],
  id.vars = c("State", "Sample", "Proportion", "Contribution"),
  variable.name = "Category", value.name = "Group"
)
# Re-asserts the two column names already requested from melt()
colnames(by_sample_all_long)[5:6] <- c("Category", "Group")

# Main panel: one facet row per classification. The color legend is suppressed with
# guide = "none" (the logical form guide = FALSE is deprecated in current ggplot2 releases
# and draws a warning); separate legends are built below.
a <- ggplot(by_sample_all_long, aes(x = State, y = Proportion, color = Group)) +
  geom_point(position = position_jitterdodge(dodge.width = 0.6), size = 1) +
  theme(panel.grid = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        axis.title.x = element_text(margin = margin(t = -10))) +
  ylab("Proportion of state in TEs") +
  scale_color_manual(values = c(anatomy_colors, type_colors, age_colors, cancer_colors, germline_colors),
                     guide = "none") +
  # Zero-height error bar = horizontal black bar at the total proportion (Contribution)
  geom_errorbar(aes(ymax = Contribution, ymin = Contribution), position = position_dodge(), color = "black") +
  # Reference lines: share of genome bases in TEs (dashed) and share of CpGs in TEs (dotdash)
  geom_hline(yintercept = MERGED_TE_WIDTH / GENOME_WIDTH, linetype = "dashed") +
  geom_hline(yintercept = TE_CPGS / ALL_CPGS, linetype = "dotdash") +
  scale_y_continuous(limits = c(0, 0.85)) +
  scale_x_discrete(limits = states[c(1:15, 20:21, 16:19)], labels = all_state_labels) +
  geom_vline(xintercept = 15.5, linetype = "dashed", color = "grey") +
  geom_vline(xintercept = 16.5, linetype = "dashed", color = "grey") +
  geom_vline(xintercept = 17.5, linetype = "dashed", color = "grey") +
  facet_wrap(~Category, nrow = 5, labeller = labeller(Category = category_labels)) +
  # Red stars for every classification other than Group
  geom_text(data = by_sample_KW[which(by_sample_KW$Category != "Group"), ],
            aes(x = State, y = 0.8, label = Stars), color = "red", size = 7)

# One legend per classification, each extracted from a throw-away plot of that
# classification's rows with its own color scale
legend_anatomy <- get_legend(
  ggplot(by_sample_all_long[which(by_sample_all_long$Category == "Anatomy"), ],
         aes(x = State, y = Proportion, color = Group)) +
    geom_point(position = position_jitterdodge(dodge.width = 0.6)) +
    theme(panel.grid = element_blank(), axis.title.y = element_blank(), legend.position = "bottom",
          legend.margin = margin(0, 0, 0, 0), legend.key.size = unit(1, 'mm')) +
    scale_color_manual(values = c(anatomy_colors), name = "Anatomy")
)

legend_type <- get_legend(
  ggplot(by_sample_all_long[which(by_sample_all_long$Category == "Type"), ],
         aes(x = State, y = Proportion, color = Group)) +
    geom_point(position = position_jitterdodge(dodge.width = 0.6)) +
    theme(panel.grid = element_blank(), axis.title.y = element_blank(), legend.position = "bottom",
          legend.margin = margin(0, 0, 0, 0), legend.key.size = unit(1, 'mm')) +
    scale_color_manual(values = c(type_colors), name = "Type") +
    guides(color = guide_legend(nrow = 2))
)

legend_age <- get_legend(
  ggplot(by_sample_all_long[which(by_sample_all_long$Category == "Age"), ],
         aes(x = State, y = Proportion, color = Group)) +
    geom_point(position = position_jitterdodge(dodge.width = 0.6)) +
    theme(panel.grid = element_blank(), axis.title.y = element_blank(), legend.position = "bottom",
          legend.margin = margin(0, 0, 0, 0), legend.key.size = unit(1, 'mm')) +
    scale_color_manual(values = c(age_colors), name = "Age", labels = age_labels) +
    guides(color = guide_legend(nrow = 2))
)

legend_cancer <- get_legend(
  ggplot(by_sample_all_long[which(by_sample_all_long$Category == "Cancer"), ],
         aes(x = State, y = Proportion, color = Group)) +
    geom_point(position = position_jitterdodge(dodge.width = 0.6)) +
    theme(panel.grid = element_blank(), axis.title.y = element_blank(), legend.position = "bottom",
          legend.margin = margin(0, 0, 0, 0), legend.key.size = unit(1, 'mm')) +
    scale_color_manual(values = c(cancer_colors), name = "Cancer")
)

legend_germline <- get_legend(
  ggplot(by_sample_all_long[which(by_sample_all_long$Category == "Germline"), ],
         aes(x = State, y = Proportion, color = Group)) +
    geom_point(position = position_jitterdodge(dodge.width = 0.6)) +
    theme(panel.grid = element_blank(), axis.title.y = element_blank(), legend.position = "bottom",
          legend.margin = margin(0, 0, 0, 0), legend.key.size = unit(1, 'mm')) +
    scale_color_manual(values = c(germline_colors), name = "Germlayer")
)

# Layout: main panel on row 1, Anatomy legend across row 2, Type | Age on row 3,
# Germlayer | Cancer on row 4
grid.arrange(a, legend_anatomy, legend_type, legend_age, legend_cancer, legend_germline,
             nrow = 4,
             layout_matrix = rbind(c(1, 1), c(2, 2), c(3, 4), c(6, 5)),
             heights = c(0.8, 0.1, 0.05, 0.05),
             widths = c(0.6, 0.4))
```

```{r Figure S11 source data}
# Export the data behind Supplementary Figure 11 as tab-separated text (no row names, no quoting)
## Per-sample points for every alternative classification
write.table(
  by_sample_all_long[, c("Sample", "State", "Proportion", "Group", "Contribution", "Category")],
  file = "source_data/Figure_S11_dots.txt", sep = '\t', row.names = FALSE, quote = FALSE
)
## Star labels for every classification other than Group
write.table(
  by_sample_KW[which(by_sample_KW$Category != "Group"), c("State", "Stars", "Category")],
  file = "source_data/Figure_S11_stars.txt", sep = '\t', row.names = FALSE, quote = FALSE
)
```

## Tissue-specific subfamily enrichment in epigenetic states

### Set thresholds 

Thresholds for considering TE subfamilies for enrichment analyses.

```{r set enrichment thresholds}
# Cut-offs applied when selecting TE subfamilies for the enrichment analyses

# Threshold on the number of subfamily members in the state
THRESHOLD_IJK_MEMBER <- 10

# Threshold on the total number of subfamily members
THRESHOLD_IK_MEMBER <- 30

# Threshold on the log odds ratio (LOR) enrichment
THRESHOLD_LOR <- 1.5

# Threshold on the proportion of the state found in the TE subfamily
THRESHOLD_PC <- 0.01
```

### Figure 5

Overlap of TE subfamilies with chromHMM states. Intersects the file of merged TE subfamily locations with the chromHMM annotations, then sums the number of bp in the state per subfamily and sample. 

```{bash subfamily chromHMM, eval=FALSE}
# Intersect with merged TE subfamilies
# For every per-sample chromHMM bed file: the sample ID is the part of the file name before the
# first "_"; bedtools intersect -wo reports each overlap with its length in bp as the last
# column; awk appends the sample ID as one more column; everything is appended to one file.
# NOTE(review): the combined file is opened with >>, so re-running the loop without deleting
# subfamily_state_sample.bed first would duplicate its records.
for file in ../chromHMM_bedfiles/E*.bed; do suffix=$(basename $file | cut -d '_' -f1); bedtools intersect -wo -a TEother_subfamily_merge.txt -b $file | awk -v OFS='\t' -v tag=$suffix '{print $0, tag}' - >> subfamily_state_sample.bed; done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/subfamily/subfamily_state_sample.bed

# Sum column 9 over every unique ($4, $8, $10) combination and print the three key fields
# followed by the total.
# presumably $4 = subfamily, $8 = chromHMM state, $9 = overlap in bp, $10 = sample; this assumes
# both intersected files have 4 columns -- TODO confirm
awk -v OFS='\t' '{a[$4, $8, $10]+=$9;}END{for(i in a) {split (i, sep, SUBSEP); print sep[1], sep[2], sep[3], a[i];}}' subfamily_state_sample.bed > subfamily_state_sample.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/subfamily/subfamily_state_sample.txt
```

The number of subfamily members in each chromHMM state by sample. 

```{bash subfamily chromHMM members, eval=FALSE}
# By subfamily
# Count the input rows for every unique ($4, $8, $10) combination and print the three key
# fields followed by the count.
# presumably $4 = subfamily, $8 = chromHMM state and $10 = sample, with one input row per
# TE x sample -- TODO confirm against the layout of the input file
awk -v OFS='\t' '{a[$4, $8, $10]+=1}END{for(i in a){split (i, sep, SUBSEP); print sep[1], sep[2], sep[3], a[i];}}' chromHMM/rmsk_TEother_chromHMM_summit_sorted.txt > chromHMM/subfamily/subfamily_state_sample_summit.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/subfamily/subfamily_state_sample_summit.txt
```

Overlap of TE subfamilies with DHS peaks. From the intersection of individual TEs with DHS peaks, filtered to intersections where the TE overlaps the summit of the DHS peak, counts the number of unique peaks overlapping each subfamily. 

```{bash subfamily DHS, eval=FALSE}
# Merged TE subfamilies (unique peaks overlapping subfamily)
# Outer loop: one sample ID per line of DNase_samples.txt; inner loop: one subfamily per line of
# subfamilies.txt. For each pair, awk reads that sample's summit file, keeps rows whose $4 equals
# the subfamily, and prints sample, subfamily and the number of distinct ($8, $9, $10) keys
# (presumably the peak coordinates -- TODO confirm), so a subfamily with no rows is reported as 0.
# NOTE(review): the whole summit file is re-read once per subfamily, and results are appended
# with >>, so a re-run without removing the output first would duplicate rows.
while read line; do while read line2; do awk -v sample=$line -v subfam=$line2 -v OFS='\t' '{if($4 == subfam) a[$8, $9, $10]+=1}END{print sample, subfam, length(a)}' DNase/true_summit/rmsk_TEother_$line\_DNase_summit.txt >> DNase/true_summit/subfamily_DNase_sample_summit.txt; done < sample_lists/subfamilies.txt; done < sample_lists/DNase_samples.txt

## Output
#/bar/epehrsson/TE_landscape/DNase/true_summit/subfamily_DNase_sample_summit.txt
```

Overlap of TE subfamilies with H3K27ac peaks. From the intersection of individual TEs with H3K27ac peaks, filtered to intersections where the TE overlaps the summit of the H3K27ac peak, counts the number of unique peaks overlapping each subfamily. 

```{bash subfamily H3K27ac, eval=FALSE}
# Merged TE subfamilies (unique peaks overlapping subfamily)
# Outer loop: one sample ID per line of H3K27ac_samples.txt; inner loop: one subfamily per line
# of subfamilies.txt. For each pair, awk reads that sample's summit file, keeps rows whose $4
# equals the subfamily, and prints sample, subfamily and the number of distinct ($8, $9, $10)
# keys (presumably the peak coordinates -- TODO confirm), so a subfamily with no rows is
# reported as 0.
# NOTE(review): the whole summit file is re-read once per subfamily, and results are appended
# with >>, so a re-run without removing the output first would duplicate rows.
while read line; do while read line2; do awk -v sample=$line -v subfam=$line2 -v OFS='\t' '{if($4 == subfam) a[$8, $9, $10]+=1}END{print sample, subfam, length(a)}' H3K27ac/true_summit/rmsk_TEother_$line\_H3K27ac_summit.txt >> H3K27ac/true_summit/subfamily_H3K27ac_sample_summit.txt; done < sample_lists/subfamilies.txt; done < sample_lists/H3K27ac_samples.txt

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/true_summit/subfamily_H3K27ac_sample_summit.txt
```

Overlap of TE subfamilies with CpGs in each methylation state. From the intersection of individual TEs with CpG methylation levels, finds unique CpGs overlapping each subfamily and counts the number in each methylation state per sample. 

```{bash subfamily WGBS, eval=FALSE}
# Number of CpGs in each state, TE CpGs, by sample x subfamily
# For each subfamily listed in subfamilies.txt:
#   1. keep the rows of TE_CpG_Meth_new.bed whose $4 equals the subfamily;
#   2. drop the first 7 columns (cut -f8-) and de-duplicate the remaining lines, so a CpG
#      overlapping several TEs of the subfamily is counted once;
#   3. classify fields 4-40 of every remaining line (presumably one field per WGBS sample --
#      TODO confirm): -1 = missing, < 0.3 = hypo, > 0.7 = hyper, 0.3-0.7 inclusive = intermediate;
#   4. print, per field index: index, hypo, inter, hyper, miss, subfamily.
# NOTE(review): a counter that was never incremented prints as an empty field rather than 0.
while read line; do awk -v OFS='\t' -v subfam=$line '{if($4 == subfam) print $0}' TE_CpG_Meth_new.bed | cut -f8- - | sort | uniq | awk -v OFS='\t' -v subfam=$line '{for (i=4;i<=40;i++){if($i == -1) miss[i]+=1; else if ($i < 0.3) hypo[i]+=1; else if ($i > 0.7) hyper[i]+=1; else if (($i <= 0.7) && ($i >= 0.3)) inter[i]+=1;}}; END{for (i=4;i<=40;i++){print i, hypo[i], inter[i], hyper[i], miss[i], subfam;}}' -; done < ../features/TEs/subfamily/subfamilies.txt >> subfamily_CpG_Meth_states.txt

## Output
#/bar/epehrsson/TE_landscape/WGBS/subfamily_CpG_Meth_states.txt
```

Overlap of shuffled TE subfamilies with epigenetic states. First, for each iteration of shuffled TEs, created files with unique regions overlapping each shuffled subfamily by merging individual TEs. 

For chromHMM, intersected the merged shuffled subfamily files with chromHMM annotations, then summed the number of bp overlapping the subfamily in each state per sample. Also found the number of subfamily members in the state per sample. 

For WGBS, intersected the merged shuffled subfamily files with CpG methylation levels, then counted the number of unique CpGs in each methylation state overlapping each subfamily per sample. Also found the total number of CpGs overlapping each shuffled subfamily, because it differs from the number for the true TE subfamilies. 

For DHS and H3K27ac, intersected the merged shuffled subfamily files with all peaks, then filtered to intersections where the peak summit overlaps the subfamily. Counted the number of peaks overlapping the subfamily per sample. 

```{bash subfamily shuffle, eval=FALSE}
# Merged each subfamily and sorted
# For each of the 10 shuffled TE sets: awk splits the file into one temporary file per value of
# column 4 (written to the working directory and named after that value); then, for every
# subfamily in subfamilies.txt, the temporary file is merged with bedtools, the subfamily name
# is appended as a 4th column, the result is appended to subfamily_merge_<i>.txt, and the
# temporary file is removed.
# NOTE(review): bedtools merge expects position-sorted input; this presumably relies on the
# shuffled files already being sorted -- TODO confirm
for i in {1..10}; do awk '{print>$4}' features/shuffled_TEs/rmsk_TE_shuffle_$i\.txt; while read line; do bedtools merge -i $line | awk -v OFS='\t' -v subfam=$line '{print $0, subfam}' - >> features/shuffled_TEs/subfamily/subfamily_merge_$i\.txt; rm $line; done < features/TEs/subfamily/subfamilies.txt; done
# Sort each merged file by field 1, then numerically by field 2, via a temporary *_sorted.txt
# file that replaces the original
for i in {1..10}; do sort -k1,1 -k2,2n features/shuffled_TEs/subfamily/subfamily_merge_$i\.txt > features/shuffled_TEs/subfamily/subfamily_merge_$i\_sorted.txt; mv features/shuffled_TEs/subfamily/subfamily_merge_$i\_sorted.txt features/shuffled_TEs/subfamily/subfamily_merge_$i\.txt; done

## Output
#/bar/epehrsson/TE_landscape/features/shuffled_TEs/subfamily/subfamily_merge_#.txt [10 files]

# Length of shuffled subfamilies
# Per merged file, sums $3-$2 (interval length) for each subfamily ($4): once over all rows and
# once excluding rows where $1 is "chrY". Prints subfamily, total length, length without chrY.
# NOTE(review): the glob subfamily_merge_*.txt also matches the *_length.txt files written here,
# so a re-run would process those outputs as inputs unless they are removed first.
for file in features/shuffled_TEs/subfamily/subfamily_merge_*.txt; do awk -v OFS='\t' '{a[$4]+=$3-$2; if($1 != "chrY") b[$4]+=$3-$2}END{for(i in a) print i, a[i], b[i]}' $file > features/shuffled_TEs/subfamily/$(basename $file .txt)_length.txt; done

## Output
#/bar/epehrsson/TE_landscape/features/shuffled_TEs/subfamily/subfamily_merge_#_length.txt [10 files]

## chromHMM
# Intersect with epigenetic data
# For each shuffled set and each sample in mnemonics.txt: overlap the merged shuffled
# subfamilies with that sample's chromHMM annotation (-wo appends the overlap length in bp),
# append the sample ID as the last column, and append to one .bed file per shuffled set.
# NOTE(review): output is appended with >>; remove existing files before re-running.
for i in {1..10}; do while read line; do bedtools intersect -wo -a features/shuffled_TEs/subfamily/subfamily_merge_$i\.txt -b raw_data/chromHMM/$line\_15_coreMarks_mnemonics.bed | awk -v OFS='\t' -v sample=$line '{print $0, sample}' - >> chromHMM/shuffled_TEs/subfamily/subfamily_state_sample_$i\.bed; done < sample_lists/mnemonics.txt; done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/shuffled_TEs/subfamily/subfamily_state_sample_#.bed  [10 files]

# Number of bp per subfamily x sample
# Sums column 9 over every unique ($4, $8, $10) combination.
# presumably $4 = subfamily, $8 = chromHMM state, $9 = overlap in bp, $10 = sample; this assumes
# the chromHMM files have 4 columns -- TODO confirm
for j in {1..10}; do awk -v OFS='\t' '{a[$4, $8, $10]+=$9}END{for(i in a) {split (i, sep, SUBSEP); print sep[1], sep[2], sep[3], a[i]}}' chromHMM/shuffled_TEs/subfamily/subfamily_state_sample_$j\.bed > chromHMM/shuffled_TEs/subfamily/subfamily_state_sample_$j\.txt; done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/shuffled_TEs/subfamily/subfamily_state_sample_#.txt  [10 files]

# Number of subfamily members in state per sample
# Counts input rows for every unique ($4, $8, $10) combination; the input here is the
# per-TE shuffled chromHMM file rather than the merged subfamily intersection above.
for j in {1..10}; do awk -v OFS='\t' '{a[$4, $8, $10]+=1}END{for(i in a){split (i, sep, SUBSEP); print sep[1], sep[2], sep[3], a[i]}}' chromHMM/shuffled_TEs/rmsk_TE_shuffle_$j\_chromHMM_sorted.txt > chromHMM/shuffled_TEs/subfamily/subfamily_state_sample_summit_$j\.txt; done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/shuffled_TEs/subfamily/subfamily_state_sample_summit_#.txt [10 files]

## WGBS
# Intersect with epigenetic data
# The CpG methylation table is split into chunks of 1,000,000 lines (split's default output
# names start with "x", written to the working directory); each shuffled subfamily file is then
# intersected with every chunk and the results are appended to one .bed file per shuffled set.
split -l 1000000 ~/TE_landscape/WGBS/all_CpG_Meth.bed
for i in {1..10}; do for file in x*; do echo $file; bedtools intersect -wo -a ~/TE_landscape/features/shuffled_TEs/subfamily/subfamily_merge_$i\.txt -b $file >>  subfam_CpG_Meth_$i\.bed ; done; done

# Number of CpGs in state per subfamily x sample
# Per shuffled set: split the intersection into one file per subfamily under indv/, then for
# each subfamily classify fields 8-44 of every row (presumably one field per WGBS sample --
# TODO confirm): -1 = missing, < 0.3 = hypo, > 0.7 = hyper, 0.3-0.7 inclusive = intermediate.
# Prints, per field index: index, hypo, inter, hyper, miss, subfamily. A counter that was never
# incremented prints as an empty field rather than 0.
# The trailing & runs the whole loop in the background.
for j in {1..10}; do awk '{print>"indv/"$4}' subfam_CpG_Meth_$j\.bed; while read line; do awk -v OFS='\t' -v subfam=$line '{for (i=8;i<=44;i++){if($i == -1) miss[i]+=1; else if ($i < 0.3) hypo[i]+=1; else if ($i > 0.7) hyper[i]+=1; else if (($i <= 0.7) && ($i >= 0.3)) inter[i]+=1;}}END{for (i=8;i<=44;i++){print i, hypo[i], inter[i], hyper[i], miss[i], subfam}}' indv/$line; rm indv/$line; done < ~/TE_landscape/sample_lists/subfamilies.txt >> subfamily_CpG_Meth_states_$j\.txt; done &

## Output
#/bar/epehrsson/TE_landscape/WGBS/shuffled/subfamily/subfamily_CpG_Meth_states_#.txt [10 files]

# Number of CpGs per subfamily (Length_ik)
# Counts the rows of each intersection file per subfamily ($4)
for j in {1..10}; do awk -v OFS='\t' '{a[$4]+=1}END{for(i in a){print i, a[i]}}' /scratch/ecp/TE_landscape/WGBS/subfam_CpG_Meth_$j\.bed > ~/TE_landscape/WGBS/shuffled/subfamily/subfamily_CpG_count_$j\.txt; done

## Output
#/bar/epehrsson/TE_landscape/WGBS/shuffled/subfamily/subfamily_CpG_count_#.txt  [10 files]

## DHS
# Intersect with epigenetic data
# For each shuffled set and each sample in DNase_samples.txt: overlap the merged shuffled
# subfamilies with that sample's narrowPeak file (-wo appends the overlap length), append the
# sample ID as the last column, and append to one file per shuffled set.
# NOTE(review): output is appended with >>; remove existing files before re-running.
for i in {1..10}; do while read line; do bedtools intersect -wo -a features/shuffled_TEs/subfamily/subfamily_merge_$i\.txt -b raw_data/DNase/DNase_narrow_peaks/$line\-DNase.macs2.narrowPeak | awk -v OFS='\t' -v sample=$line '{print $0, sample}' - >> DNase/shuffled/subfamily/rmsk_TEother_subfamily_DNase_$i\.txt ; done < sample_lists/DNase_samples.txt; done

## Output
#/bar/epehrsson/TE_landscape/DNase/shuffled/subfamily/rmsk_TEother_subfamily_DNase_#.txt [10 files]

# Filtered to only peaks where the summit overlaps the subfamily
# summit = $6 + $14 (presumably peak start plus the narrowPeak summit offset, given 4 subfamily
# columns ahead of the 10 narrowPeak columns -- TODO confirm); a row is kept when the summit
# lies in the half-open interval [$2, $3) of the subfamily region.
for i in {1..10}; do awk '{summit=$6+$14; if((summit >= $2) && (summit < $3)) print $0}' DNase/shuffled/subfamily/rmsk_TEother_subfamily_DNase_$i\.txt > DNase/shuffled/subfamily/true_summit/rmsk_TEother_subfamily_DNase_$i\_summit.txt; done

## Output
#/bar/epehrsson/TE_landscape/DNase/shuffled/subfamily/true_summit/rmsk_TEother_subfamily_DNase_#_summit.txt [10 files]

# Number of peaks per subfamily x sample
# Counts the filtered rows for every unique ($4, $16) combination (subfamily, appended sample ID)
for j in {1..10}; do awk -v OFS='\t' '{a[$4, $16]+=1}END{for(i in a) {split(i, sep, SUBSEP); print sep[1], sep[2], a[i]}}' DNase/shuffled/subfamily/true_summit/rmsk_TEother_subfamily_DNase_$j\_summit.txt > DNase/shuffled/subfamily/true_summit/subfamily_DNase_sample_summit_$j\.txt; done

## Output
#/bar/epehrsson/TE_landscape/DNase/shuffled/subfamily/true_summit/subfamily_DNase_sample_summit_#.txt [10 files]

## H3K27ac
# Intersect with epigenetic data
# For each shuffled set and each sample in H3K27ac_samples.txt: overlap the merged shuffled
# subfamilies with that sample's narrowPeak file (-wo appends the overlap length), append the
# sample ID as the last column, and append to one file per shuffled set.
# NOTE(review): output is appended with >>; remove existing files before re-running.
for i in {1..10}; do while read line; do bedtools intersect -wo -a features/shuffled_TEs/subfamily/subfamily_merge_$i\.txt -b raw_data/H3K27ac/H3K27ac_narrow_peaks/$line\-H3K27ac.narrowPeak | awk -v OFS='\t' -v sample=$line '{print $0, sample}' - >> H3K27ac/shuffled/subfamily/rmsk_TEother_subfamily_H3K27ac_$i\.txt ; done < sample_lists/H3K27ac_samples.txt; done

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/shuffled/subfamily/rmsk_TEother_subfamily_H3K27ac_#.txt [10 files]

# Filtered to only peaks where the summit overlaps the subfamily
# summit = $6 + $14 (presumably peak start plus the narrowPeak summit offset, given 4 subfamily
# columns ahead of the 10 narrowPeak columns -- TODO confirm); a row is kept when the summit
# lies in the half-open interval [$2, $3) of the subfamily region.
for i in {1..10}; do awk '{summit=$6+$14; if((summit >= $2) && (summit < $3)) print $0}' H3K27ac/shuffled/subfamily/rmsk_TEother_subfamily_H3K27ac_$i\.txt > H3K27ac/shuffled/subfamily/true_summit/rmsk_TEother_subfamily_H3K27ac_$i\_summit.txt; done

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/shuffled/subfamily/true_summit/rmsk_TEother_subfamily_H3K27ac_#_summit.txt [10 files]

# Number of peaks per subfamily x sample
# Counts the filtered rows for every unique ($4, $16) combination (subfamily, appended sample ID)
for j in {1..10}; do awk -v OFS='\t' '{a[$4, $16]+=1}END{for(i in a) {split(i, sep, SUBSEP); print sep[1], sep[2], a[i]}}' H3K27ac/shuffled/subfamily/true_summit/rmsk_TEother_subfamily_H3K27ac_$j\_summit.txt > H3K27ac/shuffled/subfamily/true_summit/subfamily_H3K27ac_sample_summit_$j\.txt; done

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/shuffled/subfamily/true_summit/subfamily_H3K27ac_sample_summit_#.txt [10 files]
```

The file `enrichment_list.txt`, generated from the analysis in this section, lists subfamilies of interest that are enriched in a state. For each listed subfamily, HOMER was run to identify transcription factor binding motifs (known motifs only) encoded by members that are annotated with the state in samples where the subfamily is enriched, using members of the subfamily that are never in the state as background. 

```{bash HOMER, eval=FALSE}
# Run HOMER (known motifs only) for every subfamily x state pair listed in enrichment_list.txt.
# For each pair, the "enriched" bed file is the foreground and the "never" bed file is the background.
bash run_homer.sh enrichment/enrichment_list.txt

## Input
#/bar/epehrsson/TE_landscape/enrichment/bedfiles/[subfam]_[state]_enriched.bed [115 files]
#/bar/epehrsson/TE_landscape/enrichment/bedfiles/[subfam]_never_[state].bed [115 files]

## Output
# NOTE(review): 115 input pairs but 107 output folders - presumably some runs produced no results; confirm
#/bar/epehrsson/TE_landscape/enrichment/homer/HOMER_[subfam]_[state]_enriched/knownResults.txt [107 folders]
#/bar/epehrsson/TE_landscape/enrichment/homer/HOMER_output.txt
#/bar/epehrsson/TE_landscape/enrichment/homer/HOMER_enriched_knownResults.txt
```

```{r Figure 5 scripts, echo=FALSE, cache=TRUE, cache.lazy=FALSE}
# Scripts that prepare the data frames for Figure 5, sourced in this order:
# - subfamily_enrichment.R: LOR enrichment of each TE subfamily x epigenetic state x sample
#   and the proportion of each state in each subfamily x sample,
#   with and without subfamily member thresholds
# - subfamily_potential.R: number of samples in which each TE subfamily is annotated with each
#   epigenetic state, plus the number of unique states each TE subfamily is annotated with
#   across all samples
for (script_path in c("R_scripts/subfamily_enrichment.R", "R_scripts/subfamily_potential.R")) {
  source(script_path)
}
```

```{r Figure 5 scripts part 2, cache=TRUE, cache.lazy=FALSE}
# Principal component analysis of LOR enrichment across sample x subfamily x state combinations,
# computed both by sample and by subfamily
source(file = "R_scripts/subfamily_PCA.R")
```

```{r Figure 5 scripts part 3, cache=TRUE, cache.lazy=FALSE}
# Same LOR enrichment calculation (TE subfamily x epigenetic state x sample) applied to the
# 10 iterations of shuffled TEs, with and without subfamily member thresholds
source(file = "R_scripts/shuffled_subfamily_enrichment.R")
```

a. Histogram of the number of unique states each TE subfamily is in across all samples, divided by epigenetic mark. b. PCA plot (PC1/PC2) of Roadmap samples, clustered by LOR enrichment in each chromHMM state x subfamily. Colored by sample group, shape based on sample donor age. c. PCA plot (PC1/PC2) of TE subfamilies, clustered by LOR enrichment in each sample x chromHMM state. Colored by class, shape indicates Alu and L1 families. d. Proportion of samples in which each TE subfamily is enriched LOR > 1.5 in each epigenetic state. Includes only subfamilies with >30 members and >10 members in the state. Colored by class. Axis lists number of subfamilies enriched LOR > 1.5 at least once in each state. e. The top 5 transcription factor binding motifs enriched in LTR22A elements in the 7_Enh state in samples where the subfamily is enriched in that state, compared to LTR22A members never in the 7_Enh state, based on Benjamini-corrected p-values produced by HOMER. Bar chart presents the percent of members in each group that have the transcription factor binding motif, ordered by q-value.

```{r Figure 5, echo=FALSE}
# Panel a: subfamily potential
## Histogram of the number of unique states per TE subfamily, one facet per epigenetic mark.
## The y axis is fixed at 0-968 (968 is the subfamily total used as a denominator elsewhere in this file).
a <- ggplot(subfam_states, aes(x = States)) +
  geom_histogram(binwidth = 1, fill = "black") +
  labs(x = "Number of states", y = "Subfamilies") +
  facet_wrap(~Metric, scales = "free_x", nrow = 1, labeller = labeller(Metric = mark_labels)) +
  scale_y_continuous(limits = c(0, 968)) +
  expand_limits(x = 0)

# Panel b: PCA of samples (chromHMM)
## PC1 vs PC2, colored by sample group (legend suppressed here), shape by donor age
b <- ggplot(
  sample_chromHMM_pca$eigenvectors,
  aes(x = PC1, y = PC2, color = Group, shape = Age)
) +
  geom_point() +
  scale_color_manual(values = group_colors, guide = FALSE) +
  scale_shape_manual(
    values = c(Non_fetal = 16, Fetal = 17, Cell_line = 18, Unknown = 16),
    labels = age_labels
  ) +
  labs(
    x = paste0("PC1 (", round(sample_chromHMM_pca$eigenvalues[1], 1), "%)"),
    y = paste0("PC2 (", round(sample_chromHMM_pca$eigenvalues[2], 1), "%)")
  ) +
  theme(
    aspect.ratio = 1,
    legend.margin = margin(0, 0, 0, 0),
    legend.key.size = unit(1, "mm")
  )

# Point shapes per TE family: Alu = 17 (triangle), L1 = 18 (diamond), every other family = 16 (circle).
# Shapes are assigned by family name instead of by position in the sorted family list:
# sort() order for character vectors depends on the locale, and the number of families depends on
# the annotation, so positional assignment can silently give the Alu/L1 shapes to other families.
te_family_names <- sort(as.vector(unique(rmsk_TE_subfamily$family)))
family_shapes <- setNames(rep(16, length(te_family_names)), te_family_names)
family_shapes[c("Alu", "L1")] <- c(17, 18)
# The collapsed level "Other" (see family_shape below) needs a shape even when no family is
# literally named "Other"; without it those points get an NA shape and are dropped from the plot
if (!("Other" %in% names(family_shapes))) {
  family_shapes["Other"] <- 16
}

# Collapse family to three levels for the shape aesthetic: "L1", "Alu", or "Other"
subfamily_chromHMM_pca$eigenvectors$family_shape <- ifelse(
  subfamily_chromHMM_pca$eigenvectors$family == "L1",
  "L1",
  ifelse(subfamily_chromHMM_pca$eigenvectors$family == "Alu", "Alu", "Other")
)

# Panel c: PCA of TE subfamilies (chromHMM), PC1 vs PC2,
# colored by class (legend suppressed here), shape marks Alu and L1 families
c <- ggplot(
  subfamily_chromHMM_pca$eigenvectors,
  aes(x = PC1, y = PC2, color = class_update, shape = family_shape)
) +
  geom_point() +
  scale_color_manual(values = class_colors, guide = FALSE) +
  scale_shape_manual(values = family_shapes, name = "Family") +
  labs(
    x = paste0("PC1 (", round(subfamily_chromHMM_pca$eigenvalues[1], 1), "%)"),
    y = paste0("PC2 (", round(subfamily_chromHMM_pca$eigenvalues[2], 1), "%)")
  ) +
  theme(
    aspect.ratio = 1,
    legend.margin = margin(0, 0, 0, 0),
    legend.key.size = unit(1, "mm")
  )

# Panel d: LOR enrichments
## Proportion of samples in which each subfamily is enriched, per state (states on the flipped x axis),
## colored by class. Grey lines separate neighbouring states; axis labels append the per-state count (n=...)
## taken from column V1 of subfamily_state_sample_counts_combine.
d <- ggplot(
  subfamily_state_sample_counts,
  aes(x = State, y = Sample.Proportion, color = class_update)
) +
  geom_point(position = position_jitterdodge(jitter.width = 0.5)) +
  coord_flip() +
  ylab("Proportion of samples enriched LOR > 1.5") +
  scale_color_manual(values = class_colors, guide = FALSE) +
  geom_vline(xintercept = seq(1.5, 20.5, by = 1), colour = "grey") +
  scale_x_discrete(
    limits = rev(states[c(1:15, 20:21, 16:19)]),
    labels = with(
      subfamily_state_sample_counts_combine,
      setNames(paste0(State, "\n(n=", V1, ")"), State)
    )
  ) +
  theme(axis.title.y = element_text(margin = margin(r = -10)))

# Panel e: HOMER known-motif results for LTR22A in the 7_Enh state
## comment.char is set to "@" - presumably so that "#" inside HOMER's column headers is not
## read as a comment marker (TODO confirm against the file)
homer <- read.table(
  "enrichment/homer/HOMER_LTR22A_7_Enh_enriched/knownResults.txt",
  sep = "\t",
  header = TRUE,
  comment.char = "@"
)

## Five motifs with the smallest Benjamini q-value
homer_top5 <- homer[head(order(homer$q.value..Benjamini.), n = 5), ]

## Long format: one row per motif x percentage column (columns 7 and 9 of the HOMER table)
homer_top5 <- melt(
  homer_top5[, c(1, 5, 7, 9)],
  id.vars = c("Motif.Name", "q.value..Benjamini.")
)

## Strip the percent sign so the values can be plotted as numbers
homer_top5$PC <- as.numeric(gsub("%", "", homer_top5$value))

## Dodged bars, motifs ordered by q-value; x labels are hard-coded short names for the five motifs
e <- ggplot(
  homer_top5,
  aes(x = reorder(Motif.Name, `q.value..Benjamini.`, median), y = PC, fill = variable)
) +
  geom_bar(stat = "identity", position = "dodge") +
  ylab("% TEs with motif") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    legend.position = c(0.1, 0.9),
    legend.margin = margin(0, 0, 0, 0),
    legend.key.size = unit(1, "mm"),
    legend.title = element_blank()
  ) +
  scale_x_discrete(labels = c("Nkx2.1", "Nkx2.2", "Tcf3", "Fli1", "HNF4a")) +
  xlab("Transcription factor") +
  scale_fill_discrete(labels = c("Active TEs", "Inactive TEs"))

# Stand-alone legend for sample group colors, extracted from a throwaway copy of the sample PCA plot
legend_group <- get_legend(
  ggplot(sample_chromHMM_pca$eigenvectors, aes(x = PC1, y = PC2, color = Group)) +
    geom_point() +
    scale_color_manual(values = group_colors) +
    theme(
      aspect.ratio = 1,
      legend.position = "bottom",
      legend.margin = margin(0, 0, 0, 0),
      legend.key.size = unit(1, "mm")
    ) +
    guides(color = guide_legend(ncol = 8))
)

# Assemble the figure. Layout ids follow the argument order:
# 1 = a, 2 = b, 3 = c, 4 = d, 5 = e, 6 = legend_group, 7 = legend_class
grid.arrange(
  a, b, c, d, e, legend_group, legend_class,
  nrow = 6,
  ncol = 2,
  layout_matrix = rbind(
    c(1, 1),
    c(6, 6),
    c(2, 4),
    c(3, 4),
    c(5, 4),
    c(5, 7)
  ),
  heights = c(0.2, 0.05, 0.25, 0.25, 0.2, 0.05),
  widths = c(0.4, 0.6)
)
```

```{r Figure 5 source data}
# Source data for Figure 5: one tab-delimited file per panel, without row names or quoting
## Panel a
write.table(
  subfam_states[, c("subfamily", "States", "Metric")],
  file = "source_data/Figure_5a.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
## Panel b (points and eigenvalues used in the axis titles)
write.table(
  sample_chromHMM_pca$eigenvectors[, c("PC1", "PC2", "Group", "Age")],
  file = "source_data/Figure_5b.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
write.table(
  sample_chromHMM_pca$eigenvalues,
  file = "source_data/Figure_5b_pc.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
## Panel c (points and eigenvalues used in the axis titles)
write.table(
  subfamily_chromHMM_pca$eigenvectors[, c("PC1", "PC2", "class_update", "family_shape")],
  file = "source_data/Figure_5c.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
write.table(
  subfamily_chromHMM_pca$eigenvalues,
  file = "source_data/Figure_5c_pc.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
## Panel d (points and per-state counts shown in the axis labels)
write.table(
  subfamily_state_sample_counts[, c("subfamily", "State", "Sample.Proportion", "class_update")],
  file = "source_data/Figure_5d.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
write.table(
  subfamily_state_sample_counts_combine[, c("State", "V1")],
  file = "source_data/Figure_5d_n.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
## Panel e
write.table(
  homer_top5[, c("Motif.Name", "q.value..Benjamini.", "PC", "variable")],
  file = "source_data/Figure_5e.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
```

### Number of enrichments and enriched subfamilies by state, overall and by class

```{r enrichment table}
# Per-state table, including only enrichments that pass member thresholds:
# number of LOR enrichments > 1.5 ("Enrichments") and number of subfamilies with at least one
# enrichment > 1.5 ("Subfamilies"), overall and by class
enrichment_table <- merge(
  as.data.frame(table(
    subfamily_state_sample_filter$State[which(subfamily_state_sample_filter$Enrichment > THRESHOLD_LOR)]
  )),
  subfamily_state_sample_counts_combine,
  by.x = "Var1",
  by.y = "State"
)
names(enrichment_table)[1:3] <- c("State", "Enrichments", "Subfamilies")
## Put rows in the order of the first 21 entries of states
enrichment_table <- enrichment_table[match(states[1:21], enrichment_table$State), ]

## Total number of enrichments across all states
sum(enrichment_table$Enrichments)

## Proportion of enrichments represented by each state
enrichment_table$Enrichments_pc <- enrichment_table$Enrichments / sum(enrichment_table$Enrichments)

## Print table
enrichment_table

# Number of subfamilies enriched LOR > 1.5 in at least one sample
## Overall
length(unique(
  subfamily_state_sample_filter$subfamily[which(subfamily_state_sample_filter$Enrichment > THRESHOLD_LOR)]
))
## Restricted to active state
length(unique(
  subfamily_state_sample_filter$subfamily[which(
    subfamily_state_sample_filter$Enrichment > THRESHOLD_LOR &
      subfamily_state_sample_filter$State %in% c(chromHMM_states[1:7], meth_states[1:2], "DNase", "H3K27ac")
  )]
))

# Enrichment of TE classes among enriched subfamilies
## Proportion of TE subfamilies in each class
table(rmsk_TE_subfamily$class_update) / 968

## Proportion of subfamilies enriched at least once in each state in each TE class
## (columns 4-9 of enrichment_table hold the per-class counts)
ddply(enrichment_table, .(State), function(x) {
  class_counts <- as.numeric(x[4:9])
  class_counts / sum(class_counts)
})

## Log2 ratio of the proportion of subfamilies enriched at least once in each state in each TE class
## versus the proportion of subfamilies in each class
test <- dcast(
  aggregate(
    V1 ~ State + class_update,
    data = subfamily_state_sample_counts,
    FUN = function(x) sum(x > 0)
  ),
  class_update ~ State,
  value.var = "V1"
)
apply(
  test[, 2:22],
  2,
  function(x) log2((x / sum(x)) / (table(rmsk_TE_subfamily$class_update) / 968))
)

# Enrichment of TE families among enriched subfamilies
## Proportion of TE subfamilies in each family
table(rmsk_TE_subfamily$family) / 968

## Log2 ratio of the proportion of subfamilies enriched at least once in each state in each TE family
## versus the proportion of subfamilies in each family
## Plus the number of subfamilies in each family
test <- dcast(
  aggregate(
    V1 ~ State + family,
    data = subfamily_state_sample_counts,
    FUN = function(x) sum(x > 0)
  ),
  family ~ State,
  value.var = "V1"
)
test_log <- as.data.frame(apply(
  test[, 2:22],
  2,
  function(x) log2((x / sum(x)) / (table(rmsk_TE_subfamily$family) / 968))
))
test_log$Subfamilies <- as.numeric(table(rmsk_TE_subfamily$family)[rownames(test_log)])
## By state, TE families with at least one subfamily enriched in the state (positive log2 ratio)
lapply(
  colnames(test_log)[1:21],
  function(x) test_log[which(test_log[, x] > 0 & test_log$Subfamilies > 0), c(x, "Subfamilies")]
)
```

### Supplementary Figure 12

a. Histogram of the number of unique states each TE subfamily is annotated with across all samples, divided by epigenetic mark, for each iteration of shuffled TEs. b. Density plots of the proportion of samples in which each TE subfamily is annotated with each epigenetic state, for both real TEs (red) and 10 iterations of shuffled TEs (grey). c. Total number of enrichments LOR > 1.5 per state, for each iteration of shuffled TEs (boxplot) and real TEs (red dot).  d. Number of subfamilies with at least one enrichment > 1.5 per state, for each iteration of shuffled TEs (boxplot) and real TEs (red dot).

```{r Figure S12, echo=FALSE}
# Panel a: subfamily potential for shuffled TEs
## Number of unique states each TE subfamily is annotated with across all samples,
## by epigenetic mark and for each iteration of shuffled TEs
shuffle_state_potential <- ddply(
  subfamily_state_potential_shuffle,
  .(Iteration, Metric, subfamily),
  summarise,
  States = sum(Samples > 0)
)

a <- ggplot(shuffle_state_potential, aes(x = States, fill = Iteration)) +
  geom_histogram(binwidth = 1, position = "dodge") +
  labs(x = "Number of states", y = "Subfamilies") +
  facet_wrap(~Metric, scales = "free_x", nrow = 1, labeller = labeller(Metric = mark_labels)) +
  scale_y_continuous(limits = c(0, 968)) +
  expand_limits(x = 0) +
  scale_fill_grey() +
  theme(legend.margin = margin(0, 0, 0, 0), legend.key.size = unit(1, "mm")) +
  guides(fill = guide_legend(ncol = 2))

# Panel b: real vs. shuffled TEs
## Stack the real-TE table (labelled "Real") on top of the 10 shuffled iterations;
## each row gives the proportion of samples in which a TE subfamily is annotated with a state
test <- subfamily_state_potential
test$Iteration <- rep("Real", nrow(subfamily_state_potential))
test <- rbind(test, subfamily_state_potential_shuffle)
test$Iteration <- factor(test$Iteration, levels = c(1:10, "Real"))

## Scaled density per state: shuffled iterations in grey, real TEs in red
b <- ggplot(test, aes(x = Sample.Proportion, y = ..scaled.., fill = Iteration)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~State, labeller = labeller(State = all_state_labels)) +
  ylab("Density") +
  scale_fill_manual(values = c(rep("grey", 10), "red")) +
  xlab("Proportion of samples in state") +
  theme(
    legend.position = "bottom",
    legend.margin = margin(0, 0, 0, 0),
    legend.key.size = unit(1, "mm")
  ) +
  guides(fill = guide_legend(nrow = 1))

# Panels c and d: per-state counts for shuffled TEs (boxplots) vs. real TEs (red points)
## c: number of enrichments
c <- ggplot(shuffled_enrichment_counts_combine, aes(x = State, y = Enrichments, fill = State)) +
  geom_boxplot() +
  geom_point(data = enrichment_table, aes(x = State, y = Enrichments), color = "red") +
  ylab("Number of enrichments") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.title.x = element_text(margin = margin(t = -10))
  ) +
  scale_fill_manual(values = all_state_colors, guide = FALSE) +
  scale_x_discrete(labels = all_state_labels)

## d: number of enriched subfamilies
d <- ggplot(shuffled_enrichment_counts_combine, aes(x = State, y = Subfamilies, fill = State)) +
  geom_boxplot() +
  geom_point(data = enrichment_table, aes(x = State, y = Subfamilies), color = "red") +
  ylab("Number of\nenriched subfamilies") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.title.x = element_text(margin = margin(t = -10))
  ) +
  scale_fill_manual(values = all_state_colors, guide = FALSE) +
  scale_x_discrete(labels = all_state_labels)

# Layout: a and b each span the full width, c and d share the bottom row
grid.arrange(
  a, b, c, d,
  layout_matrix = rbind(
    c(1, 1),
    c(2, 2),
    c(3, 4)
  ),
  nrow = 3,
  heights = c(0.15, 0.55, 0.3),
  widths = c(0.48, 0.52)
)
```

```{r Figure S12 source data}
# Source data for Supplementary Figure 12: tab-delimited, without row names or quoting
## Panel a
write.table(
  shuffle_state_potential[, c("subfamily", "States", "Iteration", "Metric")],
  file = "source_data/Figure_S12a.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
## Panel b
write.table(
  test[, c("subfamily", "Sample.Proportion", "Iteration", "State")],
  file = "source_data/Figure_S12b.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
## Panels c and d: shuffled iterations (boxplots)
write.table(
  shuffled_enrichment_counts_combine[, c("State", "Enrichments", "Subfamilies")],
  file = "source_data/Figure_S12cd_boxplots.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
## Panels c and d: real TEs (dots)
write.table(
  enrichment_table[, c("State", "Enrichments", "Subfamilies")],
  file = "source_data/Figure_S12cd_dots.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
```

### Subfamily potential

```{r subfam potential}
# Per state: number of subfamilies annotated with the state in at least one sample,
# and the median number of samples in which a TE subfamily is in the state
ddply(
  subfamily_state_potential,
  .(State),
  summarise,
  Subfamilies = sum(Samples > 0),
  Samples = median(Samples)
)

# Per epigenetic mark: distribution (table) and median of the number of unique states
# each subfamily overlaps across all samples
by(
  subfam_states,
  subfam_states$Metric,
  function(x) table(x$States)
)
ddply(
  subfam_states,
  .(Metric),
  summarise,
  Median = median(States)
)

# Outliers: subfamilies in fewer than 11 chromHMM states across all samples
subfam_states[which(subfam_states$Metric == "chromHMM" & subfam_states$States < 11), ]

# Per epigenetic mark: Pearson correlation of the number of unique states across all samples
# with total subfamily length and with number of CpGs
by(
  subfam_states,
  subfam_states$Metric,
  function(x) cor.test(x$States, x$Total_length)
)
by(
  subfam_states,
  subfam_states$Metric,
  function(x) cor.test(x$States, x$CpGs)
)

# Per state: number of subfamilies in the state in at least 75% of samples
# (states with no such subfamily do not appear in the output)
ddply(
  subfamily_state_potential[which(subfamily_state_potential$Sample.Proportion >= 0.75), ],
  .(State),
  summarise,
  Subfamilies = length(Samples)
)
```

### Shuffled subfamily potential

Three shuffled iterations have one subfamily without any CpGs.

```{r subfam potential shuffled}
# For each shuffled iteration and epigenetic mark: how many subfamilies are annotated with
# each possible number of unique states across all samples
subfamily_potential_shuffle_ever <- ddply(
  shuffle_state_potential,
  .(Iteration, Metric, States),
  summarise,
  Subfamilies = length(subfamily)
)
## Mean and SD of that subfamily count across iterations, per epigenetic mark and number of states
ddply(
  subfamily_potential_shuffle_ever,
  .(Metric, States),
  summarise,
  Mean = mean(Subfamilies),
  SD = sd(Subfamilies)
)

# For each shuffled iteration and state: number of subfamilies in the state in at least 75% of samples
ddply(
  subfamily_state_potential_shuffle,
  .(Iteration, State),
  summarise,
  Subfamilies = sum(Sample.Proportion >= 0.75)
)
```

```{r shuffled subfamily enrichment}
# Per state, median across shuffled iterations of
# (1) the number of enrichments LOR > 1.5 and
# (2) the number of subfamilies with at least one enrichment > 1.5
ddply(
  shuffled_enrichment_counts_combine,
  .(State),
  summarise,
  Enrichments = median(Enrichments),
  Subfamilies = median(Subfamilies)
)
```

### Supplementary Figure 13

a. PCA plot (PC1/PC2) of Roadmap samples, clustered by LOR enrichment in each methylation state x subfamily. Colored by sample group, shape based on sample donor age. b. PCA plot (PC1/PC3) of Roadmap samples, clustered by LOR enrichment in each methylation state x subfamily. Colored by sample group, shape based on sample donor age. c. PCA plot (PC1/PC2) of Roadmap samples, clustered by LOR enrichment in DHS peak overlap for each subfamily. Colored by sample group, shape based on sample donor age. d. PCA plot (PC1/PC2) of Roadmap samples, clustered by LOR enrichment in H3K27ac peak overlap for each subfamily. Colored by sample group, shape based on sample donor age. e. PCA plot (PC1/PC2) of Roadmap samples, clustered by LOR enrichment in each chromHMM state x subfamily. Colored by sample anatomy. f. PCA plot (PC1/PC2) of Roadmap samples, clustered by LOR enrichment in each chromHMM state x subfamily. Colored by sample type. g. PCA plot (PC1/PC2) of Roadmap samples, clustered by LOR enrichment in each chromHMM state x subfamily. Colored by sample germlayer. h. PCA plot (PC1/PC2) of Roadmap samples, clustered by LOR enrichment in each chromHMM state x subfamily. Colored by sample cancer status. i. PCA plot (PC1/PC2) of TE subfamilies, clustered by LOR enrichment in each sample x methylation state. Colored by class, shape indicates Alu and L1 families. j. PCA plot (PC1/PC2) of TE subfamilies, clustered by LOR enrichment in each sample x DHS peak overlap. Colored by class, shape indicates Alu and L1 families. k. PCA plot (PC1/PC2) of TE subfamilies, clustered by LOR enrichment in each sample x H3K27ac peak overlap. Colored by class, shape indicates Alu and L1 families.  

```{r Figure S13, echo=FALSE}
# Helpers shared by the panels of this figure
## Axis title of the form "PC<k> (<value>%)", where <value> is the k-th entry of the PCA result's
## eigenvalues rounded to one decimal
s13_axis_title <- function(pca, k) {
  paste0("PC", k, " (", round(pca$eigenvalues[k], 1), "%)")
}
## Point shapes by donor age category
s13_age_shapes <- c(Non_fetal = 16, Fetal = 17, Cell_line = 18, Unknown = 16)
## Theme for the throwaway plots that legends are extracted from
s13_legend_theme <- theme(
  aspect.ratio = 1,
  legend.position = "bottom",
  legend.margin = margin(0, 0, 0, 0),
  legend.key.size = unit(1, "mm")
)

# Panels a-d: PCA for samples, colored by group, shape represents donor age (legends suppressed)
## a: WGBS, PC1 vs PC2
a <- ggplot(sample_WGBS_pca$eigenvectors, aes(x = PC1, y = PC2, color = Group, shape = Age)) +
  geom_point() +
  scale_color_manual(values = group_colors, guide = FALSE) +
  labs(x = s13_axis_title(sample_WGBS_pca, 1), y = s13_axis_title(sample_WGBS_pca, 2)) +
  scale_shape_manual(values = s13_age_shapes, labels = age_labels, guide = FALSE) +
  theme(aspect.ratio = 1)

## b: WGBS, PC1 vs PC3
b <- ggplot(sample_WGBS_pca$eigenvectors, aes(x = PC1, y = PC3, color = Group, shape = Age)) +
  geom_point() +
  scale_color_manual(values = group_colors, guide = FALSE) +
  labs(x = s13_axis_title(sample_WGBS_pca, 1), y = s13_axis_title(sample_WGBS_pca, 3)) +
  scale_shape_manual(values = s13_age_shapes, labels = age_labels, guide = FALSE) +
  theme(aspect.ratio = 1)

## c: DNase, PC1 vs PC2
c <- ggplot(sample_DNase_pca$eigenvectors, aes(x = PC1, y = PC2, color = Group, shape = Age)) +
  geom_point() +
  scale_color_manual(values = group_colors, guide = FALSE) +
  labs(x = s13_axis_title(sample_DNase_pca, 1), y = s13_axis_title(sample_DNase_pca, 2)) +
  scale_shape_manual(values = s13_age_shapes, labels = age_labels, guide = FALSE) +
  theme(aspect.ratio = 1)

## d: H3K27ac, PC1 vs PC2
d <- ggplot(sample_H3K27ac_pca$eigenvectors, aes(x = PC1, y = PC2, color = Group, shape = Age)) +
  geom_point() +
  scale_color_manual(values = group_colors, guide = FALSE) +
  labs(x = s13_axis_title(sample_H3K27ac_pca, 1), y = s13_axis_title(sample_H3K27ac_pca, 2)) +
  scale_shape_manual(values = s13_age_shapes, labels = age_labels, guide = FALSE) +
  theme(aspect.ratio = 1)

# Panels e-h: PCA for samples, chromHMM states, colored by other sample categories
## e: anatomy
e <- ggplot(sample_chromHMM_pca$eigenvectors, aes(x = PC1, y = PC2, color = Anatomy)) +
  geom_point() +
  scale_color_manual(values = anatomy_colors, guide = FALSE) +
  labs(x = s13_axis_title(sample_chromHMM_pca, 1), y = s13_axis_title(sample_chromHMM_pca, 2)) +
  theme(aspect.ratio = 1)

## f: sample type
f <- ggplot(sample_chromHMM_pca$eigenvectors, aes(x = PC1, y = PC2, color = Type)) +
  geom_point() +
  scale_color_manual(values = type_colors, guide = FALSE) +
  labs(x = s13_axis_title(sample_chromHMM_pca, 1), y = s13_axis_title(sample_chromHMM_pca, 2)) +
  theme(aspect.ratio = 1)

## g: Germline column (described as germ layer in the figure legend)
g <- ggplot(sample_chromHMM_pca$eigenvectors, aes(x = PC1, y = PC2, color = Germline)) +
  geom_point() +
  scale_color_manual(values = germline_colors, guide = FALSE) +
  labs(x = s13_axis_title(sample_chromHMM_pca, 1), y = s13_axis_title(sample_chromHMM_pca, 2)) +
  theme(aspect.ratio = 1)

## h: cancer status
h <- ggplot(sample_chromHMM_pca$eigenvectors, aes(x = PC1, y = PC2, color = Cancer)) +
  geom_point() +
  scale_color_manual(values = cancer_colors, guide = FALSE) +
  labs(x = s13_axis_title(sample_chromHMM_pca, 1), y = s13_axis_title(sample_chromHMM_pca, 2)) +
  theme(aspect.ratio = 1)

# Panels i-k: PCA for subfamilies, colored by class, shape represents TE family (legends suppressed)
## i: WGBS
i <- ggplot(subfamily_WGBS_pca$eigenvectors, aes(x = PC1, y = PC2, color = class_update, shape = family)) +
  geom_point() +
  scale_color_manual(values = class_colors, guide = FALSE) +
  scale_shape_manual(values = family_shapes, guide = FALSE) +
  labs(x = s13_axis_title(subfamily_WGBS_pca, 1), y = s13_axis_title(subfamily_WGBS_pca, 2)) +
  theme(aspect.ratio = 1)

## j: DNase
j <- ggplot(subfamily_DNase_pca$eigenvectors, aes(x = PC1, y = PC2, color = class_update, shape = family)) +
  geom_point() +
  scale_color_manual(values = class_colors, guide = FALSE) +
  scale_shape_manual(values = family_shapes, guide = FALSE) +
  labs(x = s13_axis_title(subfamily_DNase_pca, 1), y = s13_axis_title(subfamily_DNase_pca, 2)) +
  theme(aspect.ratio = 1)

## k: H3K27ac
k <- ggplot(subfamily_H3K27ac_pca$eigenvectors, aes(x = PC1, y = PC2, color = class_update, shape = family)) +
  geom_point() +
  scale_color_manual(values = class_colors, guide = FALSE) +
  scale_shape_manual(values = family_shapes, guide = FALSE) +
  labs(x = s13_axis_title(subfamily_H3K27ac_pca, 1), y = s13_axis_title(subfamily_H3K27ac_pca, 2)) +
  theme(aspect.ratio = 1)

# Legends, each extracted from a throwaway plot that shows only the guide of interest
## TE class colors
legend_class_dot <- get_legend(
  ggplot(subfamily_H3K27ac_pca$eigenvectors, aes(x = PC1, y = PC2, color = class_update, shape = family)) +
    geom_point() +
    scale_color_manual(values = class_colors, name = "Class") +
    scale_shape_manual(values = family_shapes, guide = FALSE) +
    s13_legend_theme
)

## Donor age shapes
legend_shape_age <- get_legend(
  ggplot(sample_H3K27ac_pca$eigenvectors, aes(x = PC1, y = PC2, color = Group, shape = Age)) +
    geom_point() +
    scale_color_manual(values = group_colors, guide = FALSE) +
    scale_shape_manual(values = s13_age_shapes, labels = age_labels) +
    s13_legend_theme
)

## TE family shapes (Alu / L1 / Other)
legend_family <- get_legend(
  ggplot(subfamily_chromHMM_pca$eigenvectors, aes(x = PC1, y = PC2, color = class_update, shape = family_shape)) +
    geom_point() +
    scale_color_manual(values = class_colors, guide = FALSE) +
    scale_shape_manual(values = family_shapes, name = "Family") +
    s13_legend_theme
)

# Assemble the figure; layout ids follow the argument order (1 = a ... 19 = legend_family).
# Rows given with fewer than four ids are recycled across the four columns by rbind().
grid.arrange(
  a, b, c, d,
  legend_group, legend_shape_age,
  e, f, g, h,
  legend_anatomy, legend_type, legend_germline, legend_cancer,
  i, j, k,
  legend_class_dot, legend_family,
  nrow = 9,
  layout_matrix = rbind(
    c(1, 2, 3, 4),
    c(5),
    c(6),
    c(7, 8, 9, 10),
    c(11),
    c(12),
    c(13, 14),
    c(15, 16, 17, NA),
    c(18, 19)
  ),
  heights = c(0.23, 0.05, 0.04, 0.23, 0.1, 0.04, 0.04, 0.23, 0.04)
)
```

```{r Figure S13 source data}
# Source data for Supplementary Figure 13: tab-delimited, without row names or quoting.
# Each panel gets a file of plotted points and a "_pc" file with the PCA eigenvalues.
## Panels a-b: samples, WGBS
write.table(
  sample_WGBS_pca$eigenvectors[, c("PC1", "PC2", "PC3", "Group", "Age")],
  file = "source_data/Figure_S13ab.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
write.table(
  sample_WGBS_pca$eigenvalues,
  file = "source_data/Figure_S13ab_pc.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
## Panel c: samples, DNase
write.table(
  sample_DNase_pca$eigenvectors[, c("PC1", "PC2", "Group", "Age")],
  file = "source_data/Figure_S13c.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
write.table(
  sample_DNase_pca$eigenvalues,
  file = "source_data/Figure_S13c_pc.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
## Panel d: samples, H3K27ac
write.table(
  sample_H3K27ac_pca$eigenvectors[, c("PC1", "PC2", "Group", "Age")],
  file = "source_data/Figure_S13d.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
write.table(
  sample_H3K27ac_pca$eigenvalues,
  file = "source_data/Figure_S13d_pc.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)

## Panels e-h: samples, chromHMM, with the four sample categories used for coloring
write.table(
  sample_chromHMM_pca$eigenvectors[, c("PC1", "PC2", "Anatomy", "Type", "Germline", "Cancer")],
  file = "source_data/Figure_S13efgh.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
write.table(
  sample_chromHMM_pca$eigenvalues,
  file = "source_data/Figure_S13efgh_pc.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)

## Panel i: subfamilies, WGBS
write.table(
  subfamily_WGBS_pca$eigenvectors[, c("PC1", "PC2", "class_update", "family")],
  file = "source_data/Figure_S13i.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
write.table(
  subfamily_WGBS_pca$eigenvalues,
  file = "source_data/Figure_S13i_pc.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
## Panel j: subfamilies, DNase
write.table(
  subfamily_DNase_pca$eigenvectors[, c("PC1", "PC2", "class_update", "family")],
  file = "source_data/Figure_S13j.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
write.table(
  subfamily_DNase_pca$eigenvalues,
  file = "source_data/Figure_S13j_pc.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
## Panel k: subfamilies, H3K27ac
write.table(
  subfamily_H3K27ac_pca$eigenvectors[, c("PC1", "PC2", "class_update", "family")],
  file = "source_data/Figure_S13k.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
write.table(
  subfamily_H3K27ac_pca$eigenvalues,
  file = "source_data/Figure_S13k_pc.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)
```

### PCA

Dimensions of the matrices used as input for PCA. They are tables of LOR enrichment of TE subfamilies in each sample x state or Roadmap samples for each subfamily x state, by epigenetic mark, including only subfamilies with > 30 members and excluding variables with no variation in LOR enrichment. 

```{r subfam clustering}
# Print the dimensions of each PCA input matrix; each dim() call is auto-printed in the report

# Sample versus subfamily x state
## chromHMM
dim(sample_chromHMM_3D)
## WGBS
dim(sample_WGBS_3D)
## DHS
dim(sample_DNase_enrichment)
## H3K27ac
dim(sample_H3K27ac_enrichment)

# Subfamily versus sample x state
## chromHMM
dim(subfamily_chromHMM_3D)
## WGBS
dim(subfamily_WGBS_3D)
## DHS
dim(subfamily_DNase_enrichment)
## H3K27ac
dim(subfamily_H3K27ac_enrichment)
```

### Enrichment thresholds

Number of subfamilies excluded from analysis by member thresholds.

```{r excluded subfamilies}
# Counts of subfamilies excluded by the subfamily member threshold (THRESHOLD_IK_MEMBER)
## Subfamilies with 30 or fewer members
length(which(rmsk_TE_subfamily$Count <= THRESHOLD_IK_MEMBER))
## Subfamilies with 30 or fewer members only in samples without chrY
length(which(
  rmsk_TE_subfamily$Count > THRESHOLD_IK_MEMBER &
    rmsk_TE_subfamily$Count_noY <= THRESHOLD_IK_MEMBER
))
## Subfamilies with more than 30 members but 30 or fewer members that overlap a CpG
## (affects methylation states only)
length(which(
  rmsk_TE_subfamily$Count_CpGs <= THRESHOLD_IK_MEMBER &
    rmsk_TE_subfamily$Count > THRESHOLD_IK_MEMBER
))

## Subfamilies included in one of the above lists, plus phylogenetic information,
## number of members (overall, without chrY, overlapping CpGs), total length, and total number of CpGs
rmsk_TE_subfamily[
  which(
    rmsk_TE_subfamily$Count_noY <= THRESHOLD_IK_MEMBER |
      rmsk_TE_subfamily$Count_CpGs <= THRESHOLD_IK_MEMBER
  ),
  c(
    "subfamily", "family", "class_update",
    "Count", "Count_noY", "Count_CpGs",
    "Total_length", "Total_length_noY", "CpGs"
  )
]

# Subfamilies that do not overlap any CpG
rmsk_TE_subfamily[
  which(rmsk_TE_subfamily$CpGs == 0),
  c("subfamily", "Count", "Count_noY", "Total_length", "Total_length_noY", "CpGs")
]

# Number of enrichments LOR > 1.5 excluded from analysis by member threshold
# (more than 30 members overall, but 10 or fewer members in the state), by state and overall
test <- table(
  subfamily_state_sample_combined$State[which(
    subfamily_state_sample_combined$Enrichment > THRESHOLD_LOR &
      subfamily_state_sample_combined$Count > THRESHOLD_IK_MEMBER &
      subfamily_state_sample_combined$Members <= THRESHOLD_IJK_MEMBER
  )]
)
test
sum(test)
```

### Average members in a state for subfamilies in a state

For all subfamilies, subfamilies enriched in a state, and subfamilies representing >1% of a state, the number/proportion of members annotated with the state. 

```{r bp members percent in state}
# All subfamily x sample x state combinations:
# proportion and number of subfamily members in the state (median, min, max by state), ignoring NAs
ddply(
  subfamily_state_sample_combined,
  .(State),
  summarise,
  Percent_median = median(Percent, na.rm = TRUE),
  Members_median = median(Members, na.rm = TRUE),
  Percent_min = min(Percent, na.rm = TRUE),
  Members_min = min(Members, na.rm = TRUE),
  Percent_max = max(Percent, na.rm = TRUE),
  Members_max = max(Members, na.rm = TRUE)
)

# Combinations with LOR > 1.5 that pass member thresholds (reused by the next three summaries)
enriched_combinations <- subfamily_state_sample_filter[
  which(subfamily_state_sample_filter$Enrichment > THRESHOLD_LOR),
]

## Proportion and number of subfamily members in the state (median, min, max by state)
ddply(
  enriched_combinations,
  .(State),
  summarise,
  Percent_median = median(Percent),
  Members_median = median(Members),
  Percent_min = min(Percent),
  Members_min = min(Members),
  Percent_max = max(Percent),
  Members_max = max(Members)
)

## For each state, the combination with the highest proportion of members in the state
ddply(
  enriched_combinations,
  .(State),
  function(x) x[which.max(x$Percent), ]
)
## For each state, the 10 combinations with the highest proportion of members in the state
## (listed in increasing order of proportion)
ddply(
  enriched_combinations,
  .(State),
  function(x) {
    tail(
      x[
        order(x$Percent),
        c("subfamily", "family", "class_update", "State", "Sample", sample_categories, "Members", "Percent")
      ],
      n = 10
    )
  }
)

# Combinations where the subfamily represents >1% of the state:
# proportion and number of subfamily members in the state (median, min, max by state)
ddply(
  subfamily_state_sample_combined[which(subfamily_state_sample_combined$Length_percent_jk > THRESHOLD_PC), ],
  .(State),
  summarise,
  Percent_median = median(Percent),
  Members_median = median(Members),
  Percent_min = min(Percent),
  Members_min = min(Members),
  Percent_max = max(Percent),
  Members_max = max(Members)
)
```

### Subfamilies with all members in a methylation state

Three subfamilies have all members in the intermediate state in at least one sample. They are all excluded by member thresholds. 

```{r WGBS subfamily stats}
# For each subfamily x methylation state, count the samples in which every member
# of the subfamily is annotated with the state (Percent == 1)
TE_meth_subfamily_stats = ddply(
  subfamily_state_sample_combined[which(subfamily_state_sample_combined$State %in% meth_states), ],
  .(subfamily, State),
  summarise,
  All_members = sum(Percent == 1)
)

# Per methylation state, number of subfamilies with all members in the state in at least one sample
table(droplevels(TE_meth_subfamily_stats$State[which(TE_meth_subfamily_stats$All_members > 0)]))

# Subfamilies with all members in the Intermediate state in at least one sample,
# joined with selected count/length columns from rmsk_TE_subfamily
merge(
  TE_meth_subfamily_stats[which(
    TE_meth_subfamily_stats$State == "Intermediate" &
      TE_meth_subfamily_stats$All_members > 0
  ), ],
  rmsk_TE_subfamily[, c("subfamily", "Count", "Count_CpGs", "Total_length", "Total_length_noY", "CpGs")],
  by = "subfamily"
)
```

### Supplementary Figure 14

The script creates a combined dataframe of TE subfamily characteristics and the proportion of samples in which each subfamily is enriched (LOR > 1.5) in each epigenetic state, then computes the Spearman correlation between subfamily characteristics and the number of enrichments in each state. 

a. Spearman correlation between LTR subfamily characteristics (number of members, median length of members, median JC distance, and CpG density) and the proportion of samples enriched LOR > 1.5 in each epigenetic state. b. Spearman correlation between the proportion of members in each LTR subfamily overlapping genic features and the proportion of samples enriched LOR > 1.5 in each epigenetic state. 15_Quies and hypermethylation have no enrichment in any subfamily. c-e. GREAT GO Biological Processes results for three TE subfamilies enriched in a state. Results are presented for all members of the subfamily in the state in samples where the subfamily is enriched. Default GREAT settings. Terms are ordered by the binomial p-value. 

```{r Figure S14, echo=FALSE}
# NOTE(review): this script is expected to define correlate_subfamily (used below) - confirm
source("R_scripts/TE_correlation_subfamily.R") 

# Panel a: Spearman's rho between LTR subfamily characteristics and samples enriched per state
# (legend suppressed here; panel b carries the shared colour scale)
a = ggplot(correlate_subfamily[which(correlate_subfamily$State %in% states & correlate_subfamily$Metric %in% measure_metrics_subfam[c(1,3,5,8)] & correlate_subfamily$class_update == "LTR"),],
           aes(x=State,y=Metric,fill=estimate.rho)) +
  geom_tile() +
  scale_fill_gradient2(limits=c(-1,1),high="darkred",low="darkblue",guide="none") +
  theme(axis.text.x = element_text(angle=90,hjust=1,vjust=0.5),axis.text=element_text(size=7),axis.title.x = element_text(margin = margin(r = -10)),axis.title.y=element_text(size=9)) +
  scale_x_discrete(limits=states[c(1:14,16:17,19:21)],labels=all_state_labels) +
  ylab("Subfamily\ncharacteristic") +
  xlab("Samples enriched in state") +
  scale_y_discrete(limits=rev(measure_metrics_subfam[c(1,3,5,8)]),labels=measure_labels) 
  
# Panel b: Spearman's rho between the proportion of LTR subfamily members overlapping
# each genic feature and samples enriched per state
b = ggplot(correlate_subfamily[which(correlate_subfamily$State %in% states & correlate_subfamily$Metric %in% c(features[1:3],"coding_exon",features[5:8]) & correlate_subfamily$class_update == "LTR"),],
           aes(x=State,y=Metric,fill=estimate.rho)) +
  geom_tile() +
  scale_fill_gradient2(limits=c(-1,1),high="darkred",low="darkblue",name="Spearman's\nrho") +
  theme(axis.text.x = element_text(angle=90,hjust=1,vjust=0.5),axis.text=element_text(size=7),axis.title.x = element_text(margin = margin(r = -10)),axis.title.y=element_text(size=9)) +
  scale_x_discrete(limits=states[c(1:14,16:17,19:21)],labels=all_state_labels) +
  ylab("% members\noverlapping feature") +
  xlab("Samples enriched in state") +
  scale_y_discrete(limits=rev(c(features[1:3],"coding_exon",features[5:8])),labels=genic_labels) 

# Load GREAT GO BP results (downloaded in tsv format from the GREAT website)
# Only *.txt files are read, and the extension is stripped with an anchored, escaped pattern
great_names = sub("\\.txt$","",list.files("enrichment/great/",pattern="\\.txt$"))
# quote="" keeps apostrophes in term names (e.g. "3'-end") from being parsed as quote characters;
# lines starting with '#' are still skipped by the default comment.char
great = lapply(great_names,function(x) read.table(paste0("enrichment/great/",x,".txt"),sep='\t',quote=""))
names(great) = great_names
great = ldply(great)
colnames(great) = c("set","Term Name","Binom Rank","Binom Raw P-Value","Binom FDR Q-Val","Binom Fold Enrichment","Binom Observed Region Hits","Binom Region Set Coverage",
                    "Hyper Rank","Hyper FDR Q-Val","Hyper Fold Enrichment","Hyper Observed Gene Hits","Hyper Total Genes","Hyper Gene Set Coverage")

# Horizontal bar chart of GREAT terms for one result set
## Input: great_results - combined GREAT dataframe with columns "set", "Term Name", "Binom Raw P-Value"
##        set_name - value of the "set" column to plot
##        plot_title - title shown above the panel
## Output: ggplot object; terms ordered by binomial p-value, bar length = -log10(p-value)
plot_great_terms = function(great_results,set_name,plot_title){
  ggplot(great_results[which(great_results$set == set_name),],
         aes(x=reorder(`Term Name`,`Binom Raw P-Value`,median),y=-log10(`Binom Raw P-Value`))) +
    geom_bar(stat="identity") +
    coord_flip() +
    theme(axis.title.y = element_blank(),plot.title = element_text(hjust = 1,size=7),axis.text=element_text(size=6),plot.margin = margin(0,0,0,0)) +
    ylab("-Log10(p-value)") +
    ggtitle(plot_title)
}

# Panels c-e: GREAT GO Biological Process terms for three subfamily x state sets
c = plot_great_terms(great,"AmnSINE1_7_Enh","AmnSINE1, 7_Enh")

d = plot_great_terms(great,"MER121_7_Enh","MER121, 7_Enh")

e = plot_great_terms(great,"MER131_DNase","MER131, DHS")

# Row 1 holds panels a and b side by side; panels c, d, e each span a full row
grid.arrange(a,b,c,d,e,nrow=4,layout_matrix=rbind(c(1,2),c(3),c(4),c(5)),widths=c(0.45,0.55),heights=c(0.35,0.25,0.25,0.15))
```

```{r Figure S14 source data}
# Panel a: rho between LTR subfamily characteristics and samples enriched per state
write.table(
  correlate_subfamily[which(
    correlate_subfamily$class_update == "LTR" &
      correlate_subfamily$State %in% states &
      correlate_subfamily$Metric %in% measure_metrics_subfam[c(1, 3, 5, 8)]
  ), c("State", "Metric", "estimate.rho")],
  file = "source_data/Figure_S14a.txt",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)

# Panel b: rho between genic feature overlap of LTR subfamilies and samples enriched per state
write.table(
  correlate_subfamily[which(
    correlate_subfamily$class_update == "LTR" &
      correlate_subfamily$State %in% states &
      correlate_subfamily$Metric %in% c(features[1:3], "coding_exon", features[5:8])
  ), c("State", "Metric", "estimate.rho")],
  file = "source_data/Figure_S14b.txt",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)

# Panels c-e: GREAT term names and raw binomial p-values, one file per result set
invisible(Map(
  function(set_name, out_file) {
    write.table(
      great[which(great$set == set_name), c("set", "Term Name", "Binom Raw P-Value")],
      file = out_file,
      sep = "\t",
      row.names = FALSE,
      quote = FALSE
    )
  },
  c("AmnSINE1_7_Enh", "MER121_7_Enh", "MER131_DNase"),
  c("source_data/Figure_S14c.txt", "source_data/Figure_S14d.txt", "source_data/Figure_S14e.txt")
))
```

### Subfamily enrichment correlation

Correlations between TE subfamily characteristics and number of enrichments in each state. 

```{r subfamily enrichment correlation}
# Pairwise Spearman correlations among subfamily characteristics
# Every listing below keeps rows with |rho| > 0.3 and uncorrected p-value < 0.05

## Sequence features (e.g., number of members, total length), all subfamilies
correlate_subfam_feature[which(
  correlate_subfam_feature$class_update == "All" &
    correlate_subfam_feature$State %in% measure_metrics_subfam &
    correlate_subfam_feature$Metric %in% measure_metrics_subfam &
    abs(correlate_subfam_feature$estimate.rho) > 0.3 &
    correlate_subfam_feature$p.value < 0.05
), ]

## Proportion of members overlapping genic elements/VISTA enhancers, all subfamilies
## (a feature paired with itself is excluded)
correlate_subfam_feature[which(
  correlate_subfam_feature$class_update == "All" &
    correlate_subfam_feature$State %in% c(features[1:3], "coding_exon", features[5:8], "Vista_enhancers") &
    correlate_subfam_feature$Metric %in% c(features[1:3], "coding_exon", features[5:8], "Vista_enhancers") &
    correlate_subfam_feature$State != correlate_subfam_feature$Metric &
    abs(correlate_subfam_feature$estimate.rho) > 0.3 &
    correlate_subfam_feature$p.value < 0.05
), ]

## Proportion of members on each chromosome, all subfamilies
## (a chromosome paired with itself is excluded)
correlate_subfam_feature[which(
  correlate_subfam_feature$class_update == "All" &
    correlate_subfam_feature$State %in% standard_chromosomes &
    correlate_subfam_feature$Metric %in% standard_chromosomes &
    correlate_subfam_feature$State != correlate_subfam_feature$Metric &
    abs(correlate_subfam_feature$estimate.rho) > 0.3 &
    correlate_subfam_feature$p.value < 0.05
), ]

# Spearman correlation between states in the proportion of samples a subfamily is enriched
## All subfamilies, distinct state pairs only
correlate_subfam_state[which(
  correlate_subfam_state$class_update == "All" &
    correlate_subfam_state$State %in% states &
    correlate_subfam_state$State2 %in% states &
    correlate_subfam_state$State != correlate_subfam_state$State2 &
    abs(correlate_subfam_state$estimate.rho) > 0.3 &
    correlate_subfam_state$p.value < 0.05
), ]

## Median correlation among active regulatory states
## (1_TssA, 2_TssAFlnk, 7_Enh, hypomethylated, intermediate methylation, DHS, H3K27ac)
## First for all subfamilies, then for LTR subfamilies only
median(correlate_subfam_state$estimate.rho[which(
  correlate_subfam_state$class_update == "All" &
    correlate_subfam_state$State %in% states[c(1:2, 7, 16:17, 20:21)] &
    correlate_subfam_state$State2 %in% states[c(1:2, 7, 16:17, 20:21)] &
    correlate_subfam_state$State != correlate_subfam_state$State2
)])
median(correlate_subfam_state$estimate.rho[which(
  correlate_subfam_state$class_update == "LTR" &
    correlate_subfam_state$State %in% states[c(1:2, 7, 16:17, 20:21)] &
    correlate_subfam_state$State2 %in% states[c(1:2, 7, 16:17, 20:21)] &
    correlate_subfam_state$State != correlate_subfam_state$State2
)])

# Spearman correlation between the proportion of samples a subfamily is enriched in each state
# and subfamily characteristics, restricted to active regulatory states
# Each metric group is listed for all subfamilies first, then for the individual classes

## Sequence features (e.g., number of members, total length)
correlate_subfamily[which(
  correlate_subfamily$class_update == "All" &
    correlate_subfamily$State %in% states[c(1:7, 16:17, 20:21)] &
    correlate_subfamily$Metric %in% measure_metrics_subfam &
    abs(correlate_subfamily$estimate.rho) > 0.3 &
    correlate_subfamily$p.value < 0.05
), ]

correlate_subfamily[which(
  correlate_subfamily$class_update != "All" &
    correlate_subfamily$State %in% states[c(1:7, 16:17, 20:21)] &
    correlate_subfamily$Metric %in% measure_metrics_subfam &
    abs(correlate_subfamily$estimate.rho) > 0.3 &
    correlate_subfamily$p.value < 0.05
), ]

## Proportion of members overlapping genic elements/VISTA enhancers
correlate_subfamily[which(
  correlate_subfamily$class_update == "All" &
    correlate_subfamily$State %in% states[c(1:7, 16:17, 20:21)] &
    correlate_subfamily$Metric %in% c(features[1:3], "coding_exon", features[5:8], "Vista_enhancers") &
    abs(correlate_subfamily$estimate.rho) > 0.3 &
    correlate_subfamily$p.value < 0.05
), ]

correlate_subfamily[which(
  correlate_subfamily$class_update != "All" &
    correlate_subfamily$State %in% states[c(1:7, 16:17, 20:21)] &
    correlate_subfamily$Metric %in% c(features[1:3], "coding_exon", features[5:8], "Vista_enhancers") &
    abs(correlate_subfamily$estimate.rho) > 0.3 &
    correlate_subfamily$p.value < 0.05
), ]

## Proportion of members on each chromosome
correlate_subfamily[which(
  correlate_subfamily$class_update == "All" &
    correlate_subfamily$State %in% states[c(1:7, 16:17, 20:21)] &
    correlate_subfamily$Metric %in% standard_chromosomes &
    abs(correlate_subfamily$estimate.rho) > 0.3 &
    correlate_subfamily$p.value < 0.05
), ]

correlate_subfamily[which(
  correlate_subfamily$class_update != "All" &
    correlate_subfamily$State %in% states[c(1:7, 16:17, 20:21)] &
    correlate_subfamily$Metric %in% standard_chromosomes &
    abs(correlate_subfamily$estimate.rho) > 0.3 &
    correlate_subfamily$p.value < 0.05
), ]
```

### Summary of subfamily metrics

```{r subfamily metrics}
# Median subfamily characteristics, overall and by class
# Including total number of members; total length; median length, mappability, and JC distance; 
# total CpGs; number of TEs with CpGs; and number of CpGs per TE and per kbp of subfamily
# Missing values are dropped in both the overall and the per-class medians, so a single NA
# no longer turns the overall median into NA while the per-class medians ignore it
vapply(rmsk_TE_subfamily[measure_metrics_subfam],median,numeric(1),na.rm=TRUE)
aggregate(.~class_update,data=rmsk_TE_subfamily[,c("class_update",measure_metrics_subfam)],FUN=function(x) median(x,na.rm=TRUE),na.action=na.pass)

# Median proportion of subfamily members overlapping each genic feature, overall and by class
vapply(rmsk_TE_subfamily[cohorts],median,numeric(1),na.rm=TRUE)
aggregate(.~class_update,data=rmsk_TE_subfamily[,c("class_update",cohorts)],FUN=function(x) median(x,na.rm=TRUE),na.action=na.pass)

# Median proportion of subfamily members on each chromosome (excluding chrM), overall and by class
vapply(rmsk_TE_subfamily[standard_chromosomes[1:24]],median,numeric(1),na.rm=TRUE)
aggregate(.~class_update,data=rmsk_TE_subfamily[,c("class_update",standard_chromosomes[1:24])],FUN=function(x) median(x,na.rm=TRUE),na.action=na.pass)

# Spearman correlation between the number of members and median age per subfamily, overall and by class
correlate_subfam_feature[which(correlate_subfam_feature$Metric == "Count" & correlate_subfam_feature$State == "Age"),]

# Subfamilies with very low mappability (median < 0.25)
# Number of subfamilies, number per family, and list with mappability and number of members
# which() drops subfamilies whose Mappability is NA
nrow(rmsk_TE_subfamily[which(rmsk_TE_subfamily$Mappability < 0.25),])
table(droplevels(rmsk_TE_subfamily[which(rmsk_TE_subfamily$Mappability < 0.25),]$family))
rmsk_TE_subfamily[which(rmsk_TE_subfamily$Mappability < 0.25),c("subfamily","family","class_update","Count","Mappability")]
```

### SINE subfamilies

```{r SINE subfamilies}
# SINE subfamily x active state rows with at least one enrichment LOR > 1.5
# (counts include only combinations that pass member thresholds)
test = subfamily_state_sample_counts[which(
  subfamily_state_sample_counts$class_update == "SINE" &
    subfamily_state_sample_counts$State %in% states[c(1:7, 16:17, 20:21)] &
    subfamily_state_sample_counts$V1 > 0
), ]
test

## Total enrichments LOR > 1.5 across active states for each SINE subfamily
ddply(test, .(subfamily), summarise, sum(V1))

# From here on, test holds the subfamily characteristics of all SINE subfamilies
test = rmsk_TE_subfamily[which(rmsk_TE_subfamily$class_update == "SINE"), ]

# Sorted by median age (ascending), with number of members
test[order(test$Age), c("subfamily", "family", "class_update", "Count", "Age")]

# Sorted by CpG density in CpGs/kbp (ascending), with number of members and median age
test[order(test$CpGs_per_kbp), c("subfamily", "family", "class_update", "Count", "Age", "CpGs_per_kbp")]

# Median CpG density across Alu subfamilies
median(test$CpGs_per_kbp[which(test$family == "Alu")])

# Sorted by the proportion of members overlapping exons, with number of members and age:
# first all exons, then protein-coding exons, then non-coding exons
test[order(test$exons), c("subfamily", "family", "class_update", "Count", "Age", "exons", "exons_pc", "exons_nc")]
test[order(test$exons_pc), c("subfamily", "family", "class_update", "Count", "Age", "exons", "exons_pc", "exons_nc")]
test[order(test$exons_nc), c("subfamily", "family", "class_update", "Count", "Age", "exons", "exons_pc", "exons_nc")]

# Sorted by the proportion of members overlapping a VISTA enhancer, with number of members and age
test[order(test$Vista_enhancers), c("subfamily", "family", "class_update", "Count", "Age", "Vista_enhancers")]
```

### VISTA enhancers

```{r vista enhancers analysis}
# Dimensions of the table of TEs overlapping a positively validated human VISTA enhancer
# (first value = number of TEs with a non-missing Vista_enhancers entry)
dim(rmsk_TE[which(!is.na(rmsk_TE$Vista_enhancers)), ])

# Number of distinct subfamilies among TEs overlapping a VISTA enhancer
length(table(droplevels(
  rmsk_TE[which(!is.na(rmsk_TE$Vista_enhancers)), c(TE_coordinates, "Vista_enhancers")]$subfamily
)))

# Number of TEs overlapping a VISTA enhancer per subfamily, sorted ascending
sort(table(droplevels(
  rmsk_TE[which(!is.na(rmsk_TE$Vista_enhancers)), c(TE_coordinates, "Vista_enhancers")]$subfamily
)))

# Spearman correlation between the proportion of subfamily members overlapping a VISTA enhancer
# and the number of enrichments LOR > 1.5 in each state,
# keeping rows with |rho| > 0.3 and uncorrected p-value < 0.05
correlate_subfamily[which(
  correlate_subfamily$Metric == "Vista_enhancers" &
    correlate_subfamily$State %in% states &
    abs(correlate_subfamily$estimate.rho) > 0.3 &
    correlate_subfamily$p.value < 0.05
), ]
```

### LTR subfamilies

```{r LTR subfamilies}
# LTR subfamily x active state rows with at least one enrichment LOR > 1.5
# (counts include only combinations that pass member thresholds)
test = subfamily_state_sample_counts[which(
  subfamily_state_sample_counts$class_update == "LTR" &
    subfamily_state_sample_counts$State %in% states[c(1:7, 16:17, 20:21)] &
    subfamily_state_sample_counts$V1 > 0
), ]

# Same rows sorted by the number of enrichments LOR > 1.5 (ascending)
test[order(test$V1), ]

# Total enrichments LOR > 1.5 across active states for each LTR subfamily
ddply(test, .(subfamily), summarise, sum(V1))
```

### LINE subfamilies

```{r LINE subfamilies}
# Total enrichments LOR > 1.5 across active states for each LINE subfamily
# (counts include only combinations that pass member thresholds)
test = ddply(
  subfamily_state_sample_counts[which(
    subfamily_state_sample_counts$State %in% states[c(1:7, 16:17, 20:21)] &
      subfamily_state_sample_counts$V1 > 0 &
      subfamily_state_sample_counts$class_update == "LINE"
  ), ],
  .(subfamily),
  summarise,
  Count = sum(V1)
)
test

# Grand total of enrichments LOR > 1.5 over all LINE subfamilies
sum(test$Count)

# From here on, test holds the subfamily characteristics of all LINE subfamilies
test = rmsk_TE_subfamily[which(rmsk_TE_subfamily$class_update == "LINE"), ]

# Sorted by median age (ascending), with number of members and CpG density
test[order(test$Age), c("subfamily", "family", "class_update", "Count", "Age", "CpGs_per_kbp")]

# Sorted by CpG density in CpGs/kbp (ascending), with number of members and median age
test[order(test$CpGs_per_kbp), c("subfamily", "family", "class_update", "Count", "Age", "CpGs_per_kbp")]
```

### Genic subfamilies

```{r genic subfamilies}
# All counts below include only combinations that pass member thresholds

# 4_Tx: subfamilies with at least one enrichment LOR > 1.5
test = subfamily_state_sample_counts[which(
  subfamily_state_sample_counts$State == "4_Tx" &
    subfamily_state_sample_counts$V1 > 0
), ]
## Sorted by the number of enrichments (ascending)
test[order(test$V1), ]

# 6_EnhG: subfamilies with at least one enrichment LOR > 1.5
test = subfamily_state_sample_counts[which(
  subfamily_state_sample_counts$State == "6_EnhG" &
    subfamily_state_sample_counts$V1 > 0
), ]
## Sorted by the number of enrichments (ascending)
test[order(test$V1), ]

# Transcribed states (3_TxFlnk, 4_Tx, 5_TxWk, 6_EnhG): subfamilies with at least one enrichment LOR > 1.5
test = subfamily_state_sample_counts[which(
  subfamily_state_sample_counts$V1 > 0 &
    subfamily_state_sample_counts$State %in% c("3_TxFlnk", "4_Tx", "5_TxWk", "6_EnhG")
), ]

## Attach the proportion of subfamily members overlapping introns (all, protein-coding, non-coding)
test = merge(
  test,
  rmsk_TE_subfamily[, c("subfamily", "Count", "introns", "introns_pc", "introns_nc")],
  by = "subfamily"
)

## The 40 rows with the lowest proportion of members overlapping introns
head(test[order(test$introns), ], n = 40)
```

### MER57E3

```{r mer57e3}
# Enrichment counts (LOR > 1.5) for MER57E3 in each state
subfamily_state_sample_counts[which(subfamily_state_sample_counts$subfamily == "MER57E3"), ]

# Individual MER57E3 elements
MER57E3 = rmsk_TE[which(rmsk_TE$subfamily == "MER57E3"), ]
## How many elements there are
nrow(MER57E3)
## Elements per chromosome
table(MER57E3$chromosome)
## Elements overlapping each genic feature (non-missing entries per cohort column)
vapply(MER57E3[, cohorts], function(overlap) sum(!is.na(overlap)), integer(1))
```

### Young subfamilies

```{r young subfamilies}
# 5th percentile of the median age across TE subfamilies
quantile(rmsk_TE_subfamily$Age, 0.05)

# Young subfamilies: median age strictly below that 5th percentile
young_subfamilies = as.vector(
  rmsk_TE_subfamily$subfamily[which(rmsk_TE_subfamily$Age < quantile(rmsk_TE_subfamily$Age, 0.05))]
)

## Young subfamilies with their family, class, and median age
rmsk_TE_subfamily[
  which(rmsk_TE_subfamily$subfamily %in% young_subfamilies),
  c("subfamily", "family", "class_update", "Age")
]

# Young subfamily x state matrix of the proportion of samples with enrichment LOR > 1.5
# (non-zero proportions only; counts include only combinations that pass member thresholds)
young_subfamilies_enrichment = dcast(
  subfamily_state_sample_counts[which(
    subfamily_state_sample_counts$Sample.Proportion > 0 &
      subfamily_state_sample_counts$subfamily %in% young_subfamilies
  ), ],
  subfamily ~ State,
  value.var = "Sample.Proportion"
)
rownames(young_subfamilies_enrichment) = young_subfamilies_enrichment$subfamily
# Drop the subfamily column, now carried by the row names
# NOTE(review): the fixed 2:16 range assumes the cast table has at least 15 state columns - confirm
young_subfamilies_enrichment = young_subfamilies_enrichment[, 2:16]
young_subfamilies_enrichment
```

### Subfamily permutation to identify interesting candidates

For each subfamily x active state combination with LOR enrichment > 1.5 in at least one sample, identifies metadata categories that are represented among the enriched samples more often than expected using permutation of sample labels.

```{r subfam permutation, cache=TRUE, cache.lazy=FALSE}
# Subfamily x active state pairs with at least one enrichment LOR > 1.5
subfamily_state_ever = subfamily_state_sample_counts[
  which(
    subfamily_state_sample_counts$State %in% states[c(1:7, 16:17, 20:21)] &
      subfamily_state_sample_counts$V1 > 0
  ),
  c("subfamily", "State")
]

## Per-sample rows for those pairs, restricted to subfamilies whose Count exceeds THRESHOLD_IK_MEMBER
## (>30 members overall)
permutation_input = merge(
  subfamily_state_sample_combined[which(subfamily_state_sample_combined$Count > THRESHOLD_IK_MEMBER), ],
  subfamily_state_ever,
  by = c("subfamily", "State")
)

# For each subfamily x active state with at least one enrichment LOR > 1.5 and >30 members,
# identifies metadata categories that are represented among samples with LOR enrichment > 1.5
# more often than expected, including only samples with > 10 members in the state.
# Runs all six metadata categories at once, but performs FDR MHC only on category groupings
# with at least one enrichment LOR > 1.5.
permute_subfam = ddply(
  permutation_input[, c("subfamily", "State", "Sample", "Enrichment", "Members")],
  .(subfamily, State),
  function(combination) {
    permute_by_sample(
      matrix = combination,
      metric = "Enrichment",
      direction = "+",
      threshold = THRESHOLD_LOR,
      filtering = "Members",
      threshold2 = THRESHOLD_IJK_MEMBER
    )
  }
)
permute_subfam = permute_subfam[which(permute_subfam$Samples.x > 0), ]
permute_subfam$Padjust = p.adjust(permute_subfam$Pvalue, method = "fdr")

# Wide table of significant groupings (adjusted p-value < 0.05): one row per state x subfamily,
# one column per category, groupings comma-separated
permute_subfam_wide = dcast(
  aggregate(
    data = permute_subfam[which(permute_subfam$Padjust < 0.05), ],
    Grouping ~ State + subfamily + Category,
    paste,
    collapse = ','
  ),
  State + subfamily ~ Category,
  value.var = "Grouping"
)
## Append the total number of enrichments LOR > 1.5 per subfamily x state and reorder columns by position
permute_subfam_wide = merge(
  permute_subfam_wide,
  subfamily_state_sample_counts,
  by = c("subfamily", "State")
)[, c(1, 10, 9, 2:8, 11)]
## Save the table
write.table(
  permute_subfam_wide,
  file = "enrichment/state_subfamily_clusters_enriched_true.txt",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)
```

For each subfamily x active state combination where the subfamily represents >1% of the state in at least one sample, identifies metadata categories that are represented among the >1% samples more often than expected using permutation of sample labels.

```{r subfam permutation pc, cache=TRUE, cache.lazy=FALSE}
# Subfamily x active state pairs where the subfamily represents >1% of the state in at least one sample
subfamily_state_ever_pc = subfamily_state_sample_counts_pc[
  which(
    subfamily_state_sample_counts_pc$State %in% states[c(1:7, 16:17, 20:21)] &
      subfamily_state_sample_counts_pc$V1 > 0
  ),
  c("subfamily", "State")
]

## Per-sample rows for those pairs
permutation_input_pc = merge(
  subfamily_state_sample_combined,
  subfamily_state_ever_pc,
  by = c("subfamily", "State")
)

# For each subfamily x active state combination with at least one sample where the subfamily
# represents >1% of the state, identifies metadata categories that are represented among
# >1% samples more often than expected.
# Runs all six metadata categories at once, but performs FDR MHC only on category groupings
# with at least one sample >1%.
permute_subfam_pc = ddply(
  permutation_input_pc[, c("subfamily", "State", "Sample", "Length_percent_jk", "Members")],
  .(subfamily, State),
  # Arguments are passed positionally, as in the original call
  function(combination) permute_by_sample(combination, "Length_percent_jk", "+", THRESHOLD_PC, "Members", 0)
)
permute_subfam_pc = permute_subfam_pc[which(permute_subfam_pc$Samples.x > 0), ]
permute_subfam_pc$Padjust = p.adjust(permute_subfam_pc$Pvalue, method = "fdr")

# Wide table of significant groupings (adjusted p-value < 0.05): one row per state x subfamily,
# one column per category, groupings comma-separated
permute_subfam_pc_wide = dcast(
  aggregate(
    data = permute_subfam_pc[which(permute_subfam_pc$Padjust < 0.05), ],
    Grouping ~ State + subfamily + Category,
    paste,
    collapse = ','
  ),
  State + subfamily ~ Category,
  value.var = "Grouping"
)
## Append the total number of samples where the subfamily represents >1% of the state
## per subfamily x state, and reorder columns by position
permute_subfam_pc_wide = merge(
  permute_subfam_pc_wide,
  subfamily_state_sample_counts_pc,
  by = c("subfamily", "State")
)[, c(1, 10, 9, 2:8, 11)]
## Save the table
write.table(
  permute_subfam_pc_wide,
  file = "enrichment/state_subfamily_clusters_pc_true.txt",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)
```

From the permutation results, identified subfamily x state combinations enriched specifically in interesting sample categories. For each combination, writes out two bedfiles of individual TEs: those in the state in samples where the subfamily is enriched, and those never in the state. 

```{r print bedfiles, eval=FALSE}
# Load dataframe with individual TE characteristics and the proportion of samples
# each TE is annotated with each epigenetic state
# NOTE(review): presumably this defines rmsk_TE_measure used at the end of the chunk - confirm
source("R_scripts/TE_correlation.R")

# Subfamily x state combinations of interest picked from the permutation results
candidates = read.table(
  file = "enrichment/enrichment_candidates.txt",
  sep = "\t",
  col.names = c("State", "subfamily")
)

# Subfamily x active state combinations enriched in >25% of samples
# (counts include only combinations that pass member thresholds)
candidates_many = subfamily_state_sample_counts[which(
  subfamily_state_sample_counts$Sample.Proportion > 0.25 &
    subfamily_state_sample_counts$State %in% states[c(1:7, 16:17, 20:21)]
), ]

# Union of both lists, without duplicates
candidates_all = unique(rbind(candidates, candidates_many[, c("State", "subfamily")]))

# For each subfamily x state combination, identify TEs annotated with the state in samples
# where the subfamily is enriched, and write out a bedfile of the individual TEs
# (each row is c(State, subfamily); get_subfamily_enriched takes the subfamily first)
candidates_individual = apply(
  candidates_all,
  1,
  function(candidate) get_subfamily_enriched(candidate[2], candidate[1])
)
names(candidates_individual) = candidates_all$State
candidates_individual = ldply(candidates_individual, .id = "State")

## Same bedfile output for additional subfamily x state combinations
get_subfamily_enriched("MER131", "DNase")
get_subfamily_enriched("LTR22A", "DNase")
get_subfamily_enriched("LTR22A", "H3K27ac")

# For each subfamily x state combination, write out a bedfile of the TEs never in the state
# (value of 0 in that state's column of rmsk_TE_measure)
apply(
  candidates_all,
  1,
  function(candidate) {
    write.table(
      rmsk_TE_measure[
        which(rmsk_TE_measure$subfamily == candidate[2] & rmsk_TE_measure[[candidate[1]]] == 0),
        c(colnames(rmsk_TE_measure)[1:4], candidate[1], "strand")
      ],
      sep = "\t",
      row.names = FALSE,
      col.names = FALSE,
      quote = FALSE,
      file = paste0("enrichment/bedfiles/", candidate[2], "_never_", candidate[1], ".bed")
    )
  }
)
```

Overlap of individual TEs in the state when the subfamily is enriched with RefSeq features.

```{r explore candidates, eval=FALSE}
# Annotate the TEs that are in the state when their subfamily is enriched with
# RefSeq genic feature and VISTA enhancer overlaps from rmsk_TE
# (all.x = TRUE keeps every candidate TE, matched or not)
candidates_features = merge(
  candidates_individual,
  rmsk_TE[, c("chromosome", "start", "stop", "subfamily", "strand", "family", "class_update",
              features[c(1:3, 5:8)], "coding_exon", "Vista_enhancers")],
  all.x = TRUE,
  by = c("chromosome", "start", "stop", "subfamily", "strand")
)

# Per subfamily x state combination, the fraction of those TEs with a non-missing entry
# (i.e. an overlap) for each feature
candidates_features_summary = ddply(
  candidates_features,
  .(subfamily, class_update, State),
  function(group) {
    vapply(
      group[, c(features[c(1:3, 5:8)], "coding_exon", "Vista_enhancers")],
      function(overlap) sum(!is.na(overlap)) / length(overlap),
      numeric(1)
    )
  }
)

## Save the summary
write.table(
  candidates_features_summary,
  file = "enrichment/candidates_features_summary.txt",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)
```

Number of LTR22A elements overlapping DHS peaks in samples where the subfamily is enriched that do not overlap a RefSeq gene (i.e., are intergenic). 

```{bash ltr22a stats, eval=FALSE}
# Count the entries of LTR22A_DNase_enriched.bed that have no overlap (-v) with any RefSeq gene
bedtools intersect -v \
  -a LTR22A_DNase_enriched.bed \
  -b ~/genic_features/RefSeq/refseq_genes.txt \
  | wc -l
```

### Supplementary Figure 15

Enrichment of TE subfamilies in epigenetic states using the separate 50-state models trained on 7 samples. 

```{bash 50-state enrichment, eval=FALSE}
# Variable expansions and command substitutions are quoted throughout so that paths are
# never subject to word splitting or glob expansion; the glob patterns themselves stay unquoted.

# Intersected chromHMM bedfiles with merged subfamily files
# The sample ID (text before the first '_' in the file name) is appended as the last column
for file in raw_data/chromHMM_50state/*.bed; do
  suffix=$(basename "$file" | cut -d '_' -f1)
  bedtools intersect -wo -a features/TEs/subfamily/TEother_subfamily_merge.txt -b "$file" \
    | awk -v OFS='\t' -v tag="$suffix" '{print $0, tag}' - \
    > "chromHMM/subfamily/50state/subfamily_state_${suffix}.bed"
done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/subfamily/50state/subfamily_state_E#.bed [7 files]

# Length of overlap between chromHMM state and subfamily
# Sums column 9 for each combination of columns 4, 8, and 10
for file in chromHMM/subfamily/50state/subfamily_state_*.bed; do
  awk -v OFS='\t' '{a[$4, $8, $10]+=$9}END{for(i in a) {split (i, sep, SUBSEP); print sep[1], sep[2], sep[3], a[i]}}' "$file" \
    > "chromHMM/subfamily/50state/$(basename "$file" .bed).txt"
done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/subfamily/50state/subfamily_state_E#.txt [7 files]

# Number of subfamily TEs annotated with each state in each sample
# Counts rows for each combination of columns 4, 8, and 10
for file in rmsk_TEother_50state_*_sorted.txt; do
  awk -v OFS='\t' '{a[$4, $8, $10]+=1}END{for(i in a){split (i, sep, SUBSEP); print sep[1], sep[2], sep[3], a[i]}}' "$file" \
    > "subfamily_state_sample_$(basename "$file" _sorted.txt).txt"
done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/state50/subfamily_state_sample_rmsk_TEother_50state_E#.txt [7 files]

# Number of genomic bp in each state in each sample
# Sums interval lengths (column 3 - column 2) per value of column 4
for file in raw_data/chromHMM_50state/*.bed; do
  awk -v OFS='\t' '{a[$4]+=$3-$2}END{for(i in a){print i, a[i]}}' "$file" \
    > "chromHMM/genome/state50/$(basename "$file" .bed)_genome.txt"
done

## Output
#/bar/epehrsson/TE_landscape/chromHMM/genome/state50/E#_50_segments_genome.txt [7 files]
```

Calculated the log odds ratio enrichment of each TE subfamily in each 50-state model chromHMM state in each sample, for 7 samples with individual 50-state chromHMM models. Uses the same thresholds for considering a subfamily x state x sample combination that were used for the 15-state model (>30 members overall, >10 members in the state).

Plot: For each sample annotated with a 50-state chromHMM model, the number of subfamily enrichments LOR > 1.5 in each state. Colored by the corresponding 18-state model state.

```{r Figure S15, echo=FALSE}
# NOTE(review): this script is expected to define subfamily_50state_sample_filter (used below) - confirm
source("R_scripts/subfamily_enrichment_50state.R")

# One panel per sample: number of subfamily enrichments LOR > 1.5 in each 50-state model state,
# with bars filled by the State18 column (corresponding 18-state model state)
ggplot(
  subfamily_50state_sample_filter[which(subfamily_50state_sample_filter$Enrichment > THRESHOLD_LOR), ],
  aes(x = State, fill = State18)
) +
  geom_bar(color = "black") +
  facet_wrap(~Sample) +
  scale_fill_manual(values = chromHMM_states_18, name = "State (18-state model)") +
  ylab("Enrichments LOR > 1.5") +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "bottom",
    legend.key.size = unit(2, 'mm')
  )
```

```{r Figure S15 source data}
# Source data for Figure S15: every subfamily x state x sample enrichment above THRESHOLD_LOR
# in the 50-state models, with the matching State18 value
write.table(
  subfamily_50state_sample_filter[
    which(subfamily_50state_sample_filter$Enrichment > THRESHOLD_LOR),
    c("subfamily", "State", "State18", "Sample")
  ],
  file = "source_data/Figure_S15.txt",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)
```

Identify subfamily x sample combinations enriched LOR > 1.5 in both 15-state and 50-state model states, active chromHMM states only.

```{r compare 50 15 state}
# Pair every 50-state model enrichment with every 15-state model enrichment found for the
# same subfamily in the same sample, keeping only enrichments above THRESHOLD_LOR in active states
## x: 50-state enrichments whose corresponding 18-state model state is among the first 11 names of chromHMM_states_18
## y: 15-state enrichments in the first 7 chromHMM states, with subfamily and sample annotations
## "State" and "Members" exist on both sides, so merge() suffixes them .x (50-state) and .y (15-state)
subfamily_state_sample_compare = merge(
  x = subfamily_50state_sample_filter[
    which(
      subfamily_50state_sample_filter$Enrichment > THRESHOLD_LOR &
        subfamily_50state_sample_filter$State18 %in% names(chromHMM_states_18)[1:11]
    ),
    c("subfamily", "Sample", "State", "Members", "State18")
  ],
  y = subfamily_state_sample_filter[
    which(
      subfamily_state_sample_filter$State %in% chromHMM_states[1:7] &
        subfamily_state_sample_filter$Enrichment > THRESHOLD_LOR
    ),
    c("subfamily", "family", "class_update", "Sample", sample_categories, "State", "Members")
  ],
  by = c("subfamily", "Sample")
)

## Save the paired enrichments as a tab-delimited table
write.table(
  subfamily_state_sample_compare,
  file = "enrichment/compare_models.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
```

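Compare the MER121 x E017 enrichment in 7_Enh (15-state model) with its enrichment in the 50-state model enhancer states. Pulled out the MER121 elements annotated in E017 with 7_Enh (15-state model) and those annotated with a 50-state model enhancer state (chosen based on the corresponding 18-state model state), then combined the two sets. 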

```{bash compare MER121 50 15, eval=FALSE}
# MER121 E017 in enhancer states, 50-state
# Keeps rows whose column 4 is "MER121" and whose column 8 is one of E26, E30, E31, E32, or E37
# (presumably the 50-state model states corresponding to 18-state model enhancer states -- confirm)
awk '{if(($4 == "MER121") && (($8 == "E26") || ($8 == "E30") || ($8 == "E31") || ($8 == "E32") || ($8 == "E37"))) print $0}' rmsk_TEother_50state_E017_sorted.txt > MER121_E017_7_Enh.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/state50/MER121_E017_7_Enh.txt

# Combine with MER121 E017 in 7_Enh, 15-state
# Rows of the 15-state MER121 7_Enh table whose column 8 is "E017" are piped to bedtools as the -b input
# -f 1 -r requires a reciprocal 100% overlap between the -a and -b intervals;
# -wo writes the -a record, the -b record, and the number of overlapping bp
# NOTE(review): -a is read from /scratch/ecp/TE_landscape/state50/, not the output path recorded above -- confirm both are the same file
awk '{if($8 == "E017") print $0}' chromHMM/subfamily/by_state/MER121_7_Enh.txt | bedtools intersect -wo -f 1 -r -a /scratch/ecp/TE_landscape/state50/MER121_E017_7_Enh.txt -b - > MER121_7_Enh_50state.txt
# Displays the selected columns on standard output only (nothing is written to a file)
cut -f1-11,20-21,23 MER121_7_Enh_50state.txt

## Output
#/bar/epehrsson/TE_landscape/MER121_7_Enh_50state.txt

# Nearest gene for each TE
# Both inputs are sorted by chromosome and start, as bedtools closest requires
# -D b reports the distance relative to the strand of the -b (gene) feature; -t all reports every tie
# Prints the unique combinations of columns 2 and 36 (presumably TE start and gene name -- confirm)
bedtools closest -a <(sort -k1,1 -k2,2n MER121_7_Enh_50state.txt) -b <(sort -k1,1 -k2,2n ~/genic_features/RefSeq/refseq_genes.txt) -D b -t all | cut -f 2,36 | sort | uniq
```

### Supplementary Figure 16

For each active state, the fraction of the state (fill color) within each subfamily (row) in each sample (column), for a) chromHMM states, b) DHS peak summits, c) H3K27ac peak summits, and d) methylation states, including all subfamilies that represent > 1% of an active state in at least one sample. 

Subfamilies are ordered by total length (bp). Samples are ordered by group. Lines split sample groups ESC/iPSC/ES-derived; Blood & T-cell/HSC & B-cell/Thymus; Brain/Neurosph; Digestive; Epithelial; Adipose/Mesench/Myosat/Muscle/Sm. Muscle/Heart/Other; and IMR90/ENCODE2012.

```{r Figure S16, echo=FALSE}
# Subfamilies representing >1% of an active state in at least one sample
# (active states here: the first 7 chromHMM states, the first 2 methylation states, DNase, and H3K27ac)
pc_subfamilies = as.vector(unique(subfamily_state_sample_combined[which(subfamily_state_sample_combined$Length_percent_jk > THRESHOLD_PC & subfamily_state_sample_combined$State %in% c(chromHMM_states[1:7],meth_states[1:2],"DNase","H3K27ac")),]$subfamily))

# Rows for the selected subfamilies; every panel restricts these further by State
fig_s16_rows = subfamily_state_sample_combined[which(subfamily_state_sample_combined$subfamily %in% pc_subfamilies),]
fig_s16_chromHMM = fig_s16_rows[which(fig_s16_rows$State %in% chromHMM_states[1:7]),]

# Mapping shared by all panels: samples (x) ordered by the position of their Group in group_order,
# subfamilies (y) ordered by Total_length from rmsk_TE_subfamily, fill = proportion of the state in the subfamily
fig_s16_aes = aes(
  x = reorder(Sample, match(Group, group_order)),
  y = reorder(subfamily, rmsk_TE_subfamily[match(subfamily, rmsk_TE_subfamily$subfamily),]$Total_length),
  fill = Length_percent_jk
)

# In each panel the vertical lines separate sample groups. The positions are hard-coded per
# panel and must be updated by hand if the sample set changes.
# guide = "none" replaces the deprecated guide = FALSE; the shared legend is drawn separately below.

# a) chromHMM active states
a = ggplot(fig_s16_chromHMM, fig_s16_aes) +
  geom_tile(linetype = "blank") +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_text(margin = margin(r = -10))
  ) +
  scale_fill_gradient(low = "white", high = "darkblue", limits = c(0, 0.12), guide = "none") +
  facet_wrap(~State, nrow = 2) +
  xlab("Sample") +
  ylab("Subfamily") +
  geom_vline(xintercept = c(22.5, 47.5, 59.5, 71.5, 79.5, 111.5), color = "black")

# b) DHS peak summits
b = ggplot(fig_s16_rows[which(fig_s16_rows$State == "DNase"),], fig_s16_aes) +
  geom_tile(linetype = "blank") +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_text(margin = margin(r = -10))
  ) +
  scale_fill_gradient(low = "white", high = "darkblue", limits = c(0, 0.12), guide = "none") +
  facet_wrap(~State, ncol = 3, labeller = labeller(State = all_state_labels)) +
  xlab("Sample") +
  ylab("Subfamily") +
  geom_vline(xintercept = c(8.5, 16.5, 18.5, 23.5, 28.5, 38.5), color = "black")

# c) H3K27ac peak summits (y-axis text dropped; the panel sits to the right of panel b)
c = ggplot(fig_s16_rows[which(fig_s16_rows$State == "H3K27ac"),], fig_s16_aes) +
  geom_tile(linetype = "blank") +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank()
  ) +
  scale_fill_gradient(low = "white", high = "darkblue", limits = c(0, 0.12), guide = "none") +
  facet_wrap(~State, ncol = 3) +
  xlab("Sample") +
  ylab("Subfamily") +
  geom_vline(xintercept = c(16.5, 35.5, 42.5, 52.5, 57.5, 81.5), color = "black")

# d) Methylation states (first 2 entries of meth_states)
d = ggplot(fig_s16_rows[which(fig_s16_rows$State %in% meth_states[1:2]),], fig_s16_aes) +
  geom_tile(linetype = "blank") +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank()
  ) +
  scale_fill_gradient(low = "white", high = "darkblue", limits = c(0, 0.12), guide = "none") +
  facet_wrap(~State, ncol = 3) +
  xlab("Sample") +
  ylab("Subfamily") +
  geom_vline(xintercept = c(13.5, 15.5, 19.5, 25.5, 35.5, 36.5), color = "black")

# Shared color bar, extracted from a throwaway plot that uses the same fill scale as the panels
scale_length_jk = get_legend(
  ggplot(fig_s16_chromHMM, fig_s16_aes) +
    geom_tile(linetype = "blank") +
    scale_fill_gradient(name = "Proportion of state\nin subfamily", low = "white", high = "darkblue", limits = c(0, 0.12)) +
    theme(legend.position = "bottom")
)

# Layout: panel a across the top row, panels b-d in the middle row, legend across the bottom row
grid.arrange(a,b,c,d,scale_length_jk,layout_matrix=rbind(c(1,1,1),c(2,3,4),c(5)),heights=c(0.6,0.35,0.05),widths=c(0.33,0.37,0.3))
```

```{r Figure S16 source data}
# Source data for Figure S16, one file per panel. Each file holds the rows for the
# subfamilies in pc_subfamilies, restricted to the states drawn in that panel.

## Panel a: chromHMM active states
write.table(
  subfamily_state_sample_combined[
    which(
      subfamily_state_sample_combined$subfamily %in% pc_subfamilies &
        subfamily_state_sample_combined$State %in% chromHMM_states[1:7]
    ),
    c("Sample", "Group", "subfamily", "Length_percent_jk", "State")
  ],
  file = "source_data/Figure_S16a.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

## Panel b: DHS
write.table(
  subfamily_state_sample_combined[
    which(
      subfamily_state_sample_combined$subfamily %in% pc_subfamilies &
        subfamily_state_sample_combined$State == "DNase"
    ),
    c("Sample", "Group", "subfamily", "Length_percent_jk", "State")
  ],
  file = "source_data/Figure_S16b.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

## Panel c: H3K27ac
write.table(
  subfamily_state_sample_combined[
    which(
      subfamily_state_sample_combined$subfamily %in% pc_subfamilies &
        subfamily_state_sample_combined$State == "H3K27ac"
    ),
    c("Sample", "Group", "subfamily", "Length_percent_jk", "State")
  ],
  file = "source_data/Figure_S16c.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

## Panel d: methylation states
write.table(
  subfamily_state_sample_combined[
    which(
      subfamily_state_sample_combined$subfamily %in% pc_subfamilies &
        subfamily_state_sample_combined$State %in% meth_states[1:2]
    ),
    c("Sample", "Group", "subfamily", "Length_percent_jk", "State")
  ],
  file = "source_data/Figure_S16d.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
```

### Subfamilies representing >1% of a state

```{r pc table}
# For each state, the subfamily x sample combination with the highest proportion of the state within a subfamily
ddply(subfamily_state_sample_combined, ~State, function(x) x[which.max(x$Length_percent_jk),])

## Excluding combinations with LOR enrichment > 1.5
ddply(
  subfamily_state_sample_combined[which(subfamily_state_sample_combined$Enrichment <= THRESHOLD_LOR),],
  ~State,
  function(x) x[which.max(x$Length_percent_jk),]
)

# Table with the number of instances where a subfamily represents >1% of each state
# and the number of subfamilies that represent >1% of each state at least once, overall and by class
pc_table = merge(
  as.data.frame(table(subfamily_state_sample_combined[which(subfamily_state_sample_combined$Length_percent_jk > THRESHOLD_PC),]$State)),
  subfamily_state_sample_counts_pc_combine,
  by.x = "Var1",
  by.y = "State"
)
colnames(pc_table)[1:3] = c("State","Instances","Subfamilies")

# Add the number of instances where a subfamily represents >1% of each state and the subfamily is also enriched LOR > 1.5 in the state
# all=TRUE keeps states that have no such instance; their count is NA unless table() reports them as 0
pc_table = merge(
  pc_table,
  as.data.frame(table(subfamily_state_sample_combined[which(subfamily_state_sample_combined$Length_percent_jk > THRESHOLD_PC & subfamily_state_sample_combined$Enrichment > THRESHOLD_LOR),]$State)),
  by.x = "State",
  by.y = "Var1",
  all = TRUE
)
# NOTE(review): assumes the count column added by the merge is the 10th column, which depends on the
# number of columns in subfamily_state_sample_counts_pc_combine -- confirm if that table changes
colnames(pc_table)[10] = "Enriched_Instances"

## Proportion of instances where a subfamily represents >1% of a state that it is also enriched LOR > 1.5 in the state
pc_table$Percent = pc_table$Enriched_Instances/pc_table$Instances

# Total number of subfamily x state x sample instances where the subfamily represents >1% of the state
sum(pc_table$Instances)
pc_table

# Number of subfamilies that ever represent >1% of a state
length(unique(subfamily_state_sample_counts_pc[which(subfamily_state_sample_counts_pc$V1 > 0),]$subfamily))
## Active states only
length(unique(subfamily_state_sample_counts_pc[which(subfamily_state_sample_counts_pc$V1 > 0 & subfamily_state_sample_counts_pc$State %in% states[c(1:7,16:17,20:21)]),]$subfamily))

# Subfamilies that represent >1% of bases in the genome
rmsk_TE_subfamily[which(rmsk_TE_subfamily$Total_length > GENOME_WIDTH*THRESHOLD_PC),c("subfamily","Count","Total_length","Total_length_noY","CpGs")]
## 1% of genome width
GENOME_WIDTH*THRESHOLD_PC

# Subfamilies that represent >1% of CpGs in the genome
rmsk_TE_subfamily[which(rmsk_TE_subfamily$CpGs > ALL_CPGS*THRESHOLD_PC),c("subfamily","Count","Total_length","Total_length_noY","CpGs")]
## 1% of CpGs in genome
ALL_CPGS*THRESHOLD_PC

# Dataframe with the number of samples each subfamily represents >1% of each state,
# plus subfamily number of members, total length, total CpGs, and number of members with CpGs
test = merge(
  dcast(subfamily_state_sample_counts_pc, subfamily+family+class_update~State, value.var = "V1"),
  rmsk_TE_subfamily[,c("subfamily","Count","Total_length","CpGs","Count_CpGs")],
  by = "subfamily",
  all = TRUE
)

# Helper: subfamilies that represent >1% of `state` in at least one sample
## Input: state - name of a state column in `ranked`
##        ranked - the per-subfamily table built above, already sorted and carrying its rank in "Order"
## Output: data frame of the subfamilies with a positive count for `state`. When pc_table records at
##         least one enriched instance for the state, the number of >1% samples with
##         Enrichment > THRESHOLD_LOR is joined on as an "Enrichment" column.
pc_state_subfamilies = function(state, ranked){
  in_state = ranked[which(ranked[,state] > 0),]
  enriched_instances = pc_table$Enriched_Instances[which(pc_table$State == state)]
  # any(..., na.rm=TRUE) treats a missing (NA) or absent count as "no enrichment" instead of
  # stopping with "missing value where TRUE/FALSE needed" / "argument is of length zero"
  if(any(enriched_instances > 0, na.rm = TRUE)){
    join(
      in_state[,c("Order","subfamily","family","class_update","Count","Total_length","CpGs","Count_CpGs",state)],
      aggregate(
        data = subfamily_state_sample_combined[which(subfamily_state_sample_combined$State == state & subfamily_state_sample_combined$Length_percent_jk > THRESHOLD_PC),],
        Enrichment~subfamily,
        function(y) sum(y > THRESHOLD_LOR)
      ),
      by = "subfamily",
      type = "left"
    )
  } else {
    in_state[,c("Order","subfamily","family","class_update","Count","Total_length","CpGs",state)]
  }
}

# Subfamilies ordered by decreasing total length (rank listed in "Order")
test = test[order(test$Total_length,decreasing=TRUE),]
rownames(test) <- NULL
test$Order = rownames(test)

# For each state, print all subfamilies that represent >1% of the state in at least one sample, along with their rank by total subfamily length,
# number of samples representing >1% of the state, and number of samples enriched LOR > 1.5 (if applicable)
lapply(c(chromHMM_states,"DNase","H3K27ac"), function(x) pc_state_subfamilies(x, test))

# Subfamilies ordered by decreasing total CpGs (rank listed in "Order")
test = test[order(test$CpGs,decreasing=TRUE),]
rownames(test) <- NULL
test$Order = rownames(test)

# For each methylation state, print all subfamilies that represent >1% of the state in at least one sample, along with their rank by total number of CpGs,
# number of samples representing >1% of the state, and number of samples enriched LOR > 1.5 (if applicable)
lapply(meth_states, function(x) pc_state_subfamilies(x, test))
```

## TE epigenetic profiles as a function of age, chromosome, and feature overlap

### Figure 6

```{r Figure 6 scripts, echo=FALSE, cache=TRUE, cache.lazy=FALSE, dependson=c("Figure 2 scripts","Figure 5 scripts")}
# Creates a single data frame with individual TE characteristics and the proportion of samples each TE is annotated with each epigenetic state
# (presumably rmsk_TE_measure, which the Figure 6 chunks read -- confirm in the script)
source("R_scripts/TE_correlation.R")

# Creates dataframes with the Spearman correlation between individual TE characteristics
# and the proportion of samples the TE is annotated with each epigenetic state, for all TEs and by class
# (presumably correlate_TE_state and correlate_TE_feature, which later chunks read -- confirm in the script)
source("R_scripts/state_metric_correlation.R")
```

a. Scaled density plot of the JC distance from subfamily consensus for all SINE elements. b. Relationship between individual SINE element age (JC distance) and the proportion of samples the TE is annotated with each methylation state (line color). c. Relationship between individual SINE element age (JC distance) and TE length, mappability, and CpG density (CpGs/kbp) (line type) and the likelihood a TE overlaps a CpG island (pink line). d. Scaled density plot of the JC distance from subfamily consensus for all Alu elements. e. Relationship between individual Alu element age (JC distance) and the proportion of samples the TE is annotated with each methylation state (line color). f. Relationship between individual Alu element age (JC distance) and TE length, mappability, and CpG density (CpGs/kbp) (line type) and the likelihood a TE overlaps a CpG island (pink line). 

```{r Figure 6, echo=FALSE,fig.height=6}
# Binary (0/1) variable for whether a TE overlaps a CpG island
# (added to the global rmsk_TE_measure; the Figure 6 analysis chunk also reads it)
rmsk_TE_measure$cpgIsland_binary = as.numeric(ifelse(rmsk_TE_measure$cpgIsland == 0,0,1))

# id.vars for the two long-format views used by the panels; melt() treats every other column as a measured variable
## State view: holds the TE characteristics fixed, so the remaining columns are melted
## (presumably the methylation state columns -- depends on the contents of measure_metrics)
fig6_state_id_vars = c(TE_coordinates,"class_update","Length","mappability","JC_distance","CpGs","CpGs_per_length","cpgIsland_binary")
## Characteristic view: holds the methylation states fixed, so the remaining columns are melted
## (presumably Length, mappability, and CpGs_per_length, matching the three linetypes below)
fig6_metric_id_vars = c(TE_coordinates,"class_update","JC_distance","CpGs",meth_states,"cpgIsland_binary")

# Dataframe of individual SINE elements with TE characteristics and the proportion of samples each TE is annotated with each methylation state
# (rows with any NA in the selected columns are dropped)
rmsk_TE_measure_SINE = na.omit(rmsk_TE_measure[which(rmsk_TE_measure$class_update == "SINE"),c(TE_coordinates,"class_update",measure_metrics,meth_states,"cpgIsland_binary")])
## Convert TE length to kbp
rmsk_TE_measure_SINE$Length = rmsk_TE_measure_SINE$Length/1000

# a) Scaled density of JC distance, SINE elements
a = ggplot(rmsk_TE_measure_SINE, aes(x = JC_distance, y = ..scaled..)) +
  geom_density(fill = class_colors["SINE"]) +
  xlab("Jukes-Cantor Evolutionary Distance") +
  ylab("Density")

# b) Smoothed proportion of samples in each methylation state versus JC distance, SINE elements
# guide = "none" replaces the deprecated guide = FALSE; the legend is drawn separately (legend1)
b = ggplot(melt(rmsk_TE_measure_SINE, id.vars = fig6_state_id_vars), aes(x = JC_distance, y = value, color = variable)) +
  geom_smooth() +
  scale_color_manual(values = meth_colors, guide = "none") +
  xlab("Jukes-Cantor Evolutionary Distance") +
  ylab("Proportion of samples in state") +
  ylim(-0.05, 1)

# c) Smoothed TE characteristics (linetype) and logistic fit of CpG island overlap (pink) versus JC distance, SINE elements
# Assigning to `c` does not break later c(...) calls: R skips non-function objects when looking up a function name
c = ggplot(melt(rmsk_TE_measure_SINE, id.vars = fig6_metric_id_vars)) +
  geom_smooth(aes(x = JC_distance, y = value, linetype = variable), color = "black") +
  scale_linetype_manual(values = c(2, 3, 6), guide = "none") +
  xlab("Jukes-Cantor Evolutionary Distance") +
  ylab("TE characteristic") +
  geom_smooth(data = rmsk_TE_measure_SINE, aes(x = JC_distance, y = cpgIsland_binary), color = "pink", method = "glm", method.args = list(family = "binomial")) +
  scale_y_continuous(breaks = seq(0, 1.25, 0.25))

# Dataframe of individual Alu elements with TE characteristics and the proportion of samples each TE is annotated with each methylation state
rmsk_TE_measure_Alu = na.omit(rmsk_TE_measure[which(rmsk_TE_measure$family == "Alu"),c(TE_coordinates,"class_update",measure_metrics,meth_states,"cpgIsland_binary")])
## Convert TE length to kbp
rmsk_TE_measure_Alu$Length = rmsk_TE_measure_Alu$Length/1000

# d) Scaled density of JC distance, Alu elements
d = ggplot(rmsk_TE_measure_Alu, aes(x = JC_distance, y = ..scaled..)) +
  geom_density(fill = class_colors["SINE"]) +
  xlab("Jukes-Cantor Evolutionary Distance") +
  ylab("Density")

# e) As panel b, Alu elements
e = ggplot(melt(rmsk_TE_measure_Alu, id.vars = fig6_state_id_vars), aes(x = JC_distance, y = value, color = variable)) +
  geom_smooth() +
  scale_color_manual(values = meth_colors, guide = "none") +
  xlab("Jukes-Cantor Evolutionary Distance") +
  ylab("Proportion of samples in state") +
  ylim(-0.05, 1)

# f) As panel c, Alu elements (unlike panel c, the y-axis limits are fixed to 0-1.25)
f = ggplot(melt(rmsk_TE_measure_Alu, id.vars = fig6_metric_id_vars)) +
  geom_smooth(aes(x = JC_distance, y = value, linetype = variable), color = "black") +
  scale_linetype_manual(values = c(2, 3, 6), guide = "none") +
  xlab("Jukes-Cantor Evolutionary Distance") +
  ylab("TE characteristic") +
  geom_smooth(data = rmsk_TE_measure_Alu, aes(x = JC_distance, y = cpgIsland_binary), color = "pink", method = "glm", method.args = list(family = "binomial")) +
  scale_y_continuous(breaks = seq(0, 1.25, 0.25), limits = c(0, 1.25))

# Legend for the methylation state colors (panels b and e), taken from a throwaway copy of panel e
legend1 = get_legend(
  ggplot(melt(rmsk_TE_measure_Alu, id.vars = fig6_state_id_vars), aes(x = JC_distance, y = value, color = variable)) +
    geom_smooth() +
    scale_color_manual(values = meth_colors, name = "State", labels = all_state_labels) +
    theme(legend.direction = "horizontal", legend.key.size = unit(3, 'mm'))
)

# Legend for the TE characteristic linetypes (panels c and f), taken from a throwaway copy of panel f
legend2 = get_legend(
  ggplot(melt(rmsk_TE_measure_Alu, id.vars = fig6_metric_id_vars)) +
    geom_smooth(aes(x = JC_distance, y = value, linetype = variable), color = "black") +
    scale_linetype_manual(values = c(2, 3, 6), labels = c("Length (kbp)","Mappability","CpGs per bp"), name = "Characteristic") +
    ylab("Property") +
    theme(legend.direction = "horizontal", legend.key.size = unit(3, 'mm')) +
    geom_smooth(data = rmsk_TE_measure_Alu, aes(x = JC_distance, y = cpgIsland_binary), color = "pink", method = "glm", method.args = list(family = "binomial"))
)

# Layout: SINE panels in the left column, Alu panels in the right column; each legend spans both columns
grid.arrange(a,b,c,d,e,f,legend1,legend2,nrow=5,layout_matrix=rbind(c(1,4),c(2,5),c(7,7),c(3,6),c(8,8)),heights=c(0.2,0.35,0.05,0.35,0.05))
```

```{r Figure 6 source data}
# Source data for Figure 6: per-TE characteristics, proportion of samples in each
# methylation state, and the CpG island overlap indicator

## Panels a-c: SINE elements
write.table(
  rmsk_TE_measure_SINE[
    ,
    c(TE_coordinates, "class_update", "Length", "mappability", "JC_distance", "CpGs", "CpGs_per_length", meth_states, "cpgIsland_binary")
  ],
  file = "source_data/Figure_6abc.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

## Panels d-f: Alu elements
write.table(
  rmsk_TE_measure_Alu[
    ,
    c(TE_coordinates, "class_update", "Length", "mappability", "JC_distance", "CpGs", "CpGs_per_length", meth_states, "cpgIsland_binary")
  ],
  file = "source_data/Figure_6def.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
```

### Figure 6 analysis

```{r analysis Fig 6}
# Spearman correlation between the proportion of samples a TE is annotated with each state and TE characteristics
# Absolute Spearman correlation > 0.3, overall and by class
correlate_TE_state[which(abs(correlate_TE_state$estimate.rho) > 0.3 & correlate_TE_state$State %in% states),]

# Proportion of SINE elements in the Alu family
nrow(rmsk_TE[which(rmsk_TE$family == "Alu"),])/nrow(rmsk_TE[which(rmsk_TE$class_update == "SINE"),])

# Median/IQR JC distance for all SINE elements, non-Alu SINE elements, and Alu elements
# SINE elements are selected with class_update, as in the rest of this chunk and in Figure 6
median(rmsk_TE[which(rmsk_TE$class_update == "SINE"),]$JC_distance)
IQR(rmsk_TE[which(rmsk_TE$class_update == "SINE"),]$JC_distance)
median(rmsk_TE[which(rmsk_TE$class_update == "SINE" & rmsk_TE$family != "Alu"),]$JC_distance)
IQR(rmsk_TE[which(rmsk_TE$class_update == "SINE" & rmsk_TE$family != "Alu"),]$JC_distance)
median(rmsk_TE[which(rmsk_TE$family == "Alu"),]$JC_distance)
IQR(rmsk_TE[which(rmsk_TE$family == "Alu"),]$JC_distance)

# Number/proportion of non-Alu SINE elements and Alu elements that overlap a CpG island
# (counted as rows with a non-NA cpgIsland value in rmsk_TE)
nrow(rmsk_TE[which(!is.na(rmsk_TE$cpgIsland) & rmsk_TE$class_update == "SINE" & rmsk_TE$family != "Alu"),])
nrow(rmsk_TE[which(!is.na(rmsk_TE$cpgIsland) & rmsk_TE$class_update == "SINE" & rmsk_TE$family != "Alu"),])/nrow(rmsk_TE[which(rmsk_TE$class_update == "SINE" & rmsk_TE$family != "Alu"),])
nrow(rmsk_TE[which(!is.na(rmsk_TE$cpgIsland) & rmsk_TE$family == "Alu"),])
nrow(rmsk_TE[which(!is.na(rmsk_TE$cpgIsland) & rmsk_TE$family == "Alu"),])/nrow(rmsk_TE[which(rmsk_TE$family == "Alu"),])

# Generalized additive model for TE JC distance versus proportion of samples in each methylation state, length, mappability, and CpG density
# For SINE elements and Alu elements
# NOTE: the formulas contain no s() smooth term, so gam() fits JC distance as a purely linear term
test = lapply(c(meth_states,"Length","mappability","CpGs_per_length"), function(x) gam(rmsk_TE_measure_SINE[[x]]~rmsk_TE_measure_SINE$JC_distance))
lapply(test, summary)
test = lapply(c(meth_states,"Length","mappability","CpGs_per_length"), function(x) gam(rmsk_TE_measure_Alu[[x]]~rmsk_TE_measure_Alu$JC_distance))
lapply(test, summary)

# Logistic regression model for TE JC distance versus overlap with CpG islands, for all TEs, SINE elements, and Alu elements
test = glm(as.factor(cpgIsland_binary)~JC_distance, data = rmsk_TE_measure, family = "binomial")
summary(test)
test = glm(as.factor(cpgIsland_binary)~JC_distance, data = rmsk_TE_measure_SINE, family = "binomial")
summary(test)
test = glm(as.factor(cpgIsland_binary)~JC_distance, data = rmsk_TE_measure_Alu, family = "binomial")
summary(test)

# Median age of TEs by overlap with CpG islands, for all TEs, SINE elements, and Alu elements
ddply(rmsk_TE_measure, .(cpgIsland_binary), summarise, Age = median(JC_distance))
ddply(rmsk_TE_measure[which(rmsk_TE_measure$class_update == "SINE"),], .(cpgIsland_binary), summarise, Age = median(JC_distance))
ddply(rmsk_TE_measure[which(rmsk_TE_measure$family == "Alu"),], .(cpgIsland_binary), summarise, Age = median(JC_distance))
```

### Supplementary Figure 17

Epigenetic state annotations by chromosome. For chromHMM, sums the total length of the chromosome annotated with the state. For WGBS, counts the number of CpGs in each methylation state per chromosome. For DHS and H3K27ac, counts the number of peaks per chromosome. 

```{bash chromosome epigenetics, eval=FALSE}
# chromHMM
# For every sample listed in sample_lists/mnemonics.txt, sums interval lengths (column 3 - column 2)
# per chromosome (column 1) and state (column 4) of the sample's 15-state mnemonics file
# Output columns (tab-delimited): sample, chromosome, state, total bp
while read line; do awk -v OFS='\t' -v sample=$line '{a[$1,$4]+=$3-$2}END{for(i in a){split(i,sep,SUBSEP); print sample, sep[1], sep[2], a[i];}}' raw_data/chromHMM/$line\_15_coreMarks_mnemonics.bed; done < sample_lists/mnemonics.txt > chromHMM/chromosome_states.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/chromosome_states.txt

# WGBS
# Split the CpG methylation table into one file per chromosome, named after column 1 and written to the current directory
awk '{print $0 > $1}' ~/TE_landscape/WGBS/all_CpG_Meth.bed
# For each per-chromosome file and each column from 4 onward (presumably one column per sample -- confirm),
# count CpGs that are missing (-1), hypomethylated (< 0.3), hypermethylated (> 0.7), or intermediate (0.3-0.7)
# Output columns (tab-delimited): chromosome, column index, hypo, intermediate, hyper, missing
# NOTE: only column indices with at least one hypermethylated CpG are printed, and a category with no CpGs prints as empty
# NOTE: results are appended (>>), so remove chr_CpG_meth_states.txt before re-running; the loop runs in the background (&)
for file in chr*; do awk -v chr=$file -v OFS='\t' '{for (i=4;i<=NF;i++){if($i == -1) miss[i]+=1; else if ($i < 0.3) hypo[i]+=1; else if ($i > 0.7) hyper[i]+=1; else if (($i <= 0.7) && ($i >= 0.3)) inter[i]+=1;}}; END{for (i in hyper) print chr, i, hypo[i], inter[i], hyper[i], miss[i];}' $file >> chr_CpG_meth_states.txt; done &

## Output
#/bar/epehrsson/TE_landscape/WGBS/chr_CpG_meth_states.txt

# DHS, number of peaks by chromosome
# Counts the lines per chromosome (column 1) in each sample's narrowPeak file
# Output columns: sample, chromosome, peaks (space-delimited, since OFS is not set); appended (>>) to the output file
while read line; do awk -v sample=$line '{a[$1]+=1}END{for(i in a){print sample, i, a[i]}}' raw_data/DNase/DNase_narrow_peaks/$line\-DNase.macs2.narrowPeak >> DNase/chr_DNase.txt ; done < sample_lists/DNase_samples.txt

## Output
#/bar/epehrsson/TE_landscape/DNase/chr_DNase.txt

# H3K27ac, number of peaks by chromosome
# Same counting as for DHS: sample, chromosome, peaks (space-delimited); appended (>>) to the output file
while read line; do awk -v sample=$line '{a[$1]+=1}END{for(i in a){print sample, i, a[i]}}' raw_data/H3K27ac/H3K27ac_narrow_peaks/$line\-H3K27ac.narrowPeak >> H3K27ac/chr_H3K27ac.txt ; done < sample_lists/H3K27ac_samples.txt

## Output
#/bar/epehrsson/TE_landscape/H3K27ac/chr_H3K27ac.txt
```

Generates dataframes with the length of each chromosome annotated with each epigenetic state across samples, the proportion of the genome and individual TEs represented by each chromosome, and the proportion of TEs on each chromosome that overlap each genic feature.

a. Proportion of bases on each chromosome annotated with each chromHMM state, across all samples. b. Proportion of CpGs on each chromosome annotated with each methylation state, across all samples. c. Number of DHS peaks on each chromosome versus total length, across all samples. Dashed line indicates linear correlation between peaks and total length. d. Number of H3K27ac peaks on each chromosome versus total length, across all samples. Dashed line indicates linear correlation between peaks and total length. e. For each epigenetic state, the Z-score for each chromosome based on the mean number of samples each TE on the chromosome is annotated with the state. 

```{r Figure S17, echo=FALSE}
# Builds the per-chromosome tables plotted below
# (chrom_state_total, chr_state_WGBS_total, chr_state_DNase_total, chr_state_H3K27ac_total,
# and chromosome_potential are presumably created by this script -- confirm)
source("R_scripts/chromosome_potential.R")

# a) Share of each chromosome's annotated bases in each chromHMM state (bars normalized with position="fill")
a = ggplot(chrom_state_total, aes(x = chromosome, y = Bases, fill = State)) +
  geom_bar(stat = "identity", position = "fill") +
  scale_fill_manual(values = chromHMM_colors) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    legend.position = "bottom",
    legend.box.margin = margin(t = -10),
    legend.margin = margin(0, 0, 0, 0),
    legend.key.size = unit(3, 'mm')
  ) +
  ylab("Proportion in state") +
  xlab("Chromosome") +
  guides(fill = guide_legend(nrow = 4, title.position = "top"))

# b) Share of each chromosome's CpGs in each methylation state
b = ggplot(chr_state_WGBS_total, aes(x = chromosome, y = CpGs, fill = State)) +
  geom_bar(position = "fill", stat = "identity") +
  scale_fill_manual(values = meth_colors, labels = meth_labels) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    legend.position = "bottom",
    legend.box.margin = margin(t = -10),
    legend.margin = margin(0, 0, 0, 0),
    legend.key.size = unit(3, 'mm')
  ) +
  ylab("Proportion in state") +
  xlab("Chromosome") +
  guides(fill = guide_legend(nrow = 4, title.position = "top"))

# c) DHS peaks (millions) versus chromosome size (Mbp), one text label per chromosome, with a dashed linear fit
c = ggplot(chr_state_DNase_total, aes(x = size/1000000, y = Peaks/1000000, label = chromosome)) +
  geom_text(size = 3) +
  geom_smooth(method = 'lm', se = FALSE, color = "black", linetype = "dashed") +
  xlab("Chromosome size (Mbp)") +
  ylab("Peaks (millions)") +
  scale_y_continuous(limits = c(0, 1.5))

# d) H3K27ac peaks (millions) versus chromosome size (Mbp), same layout as panel c
d = ggplot(chr_state_H3K27ac_total, aes(x = size/1000000, y = Peaks/1000000, label = chromosome)) +
  geom_text(size = 3) +
  geom_smooth(method = 'lm', se = FALSE, color = "black", linetype = "dashed") +
  xlab("Chromosome size (Mbp)") +
  ylab("Peaks (millions)") +
  scale_y_continuous(limits = c(0, 1.5))

# e) Z-score per chromosome for each state; states run down the flipped axis in the order of `states`
# Grey reference lines mark every integer Z-score from -5 to 5 except 0
e = ggplot(chromosome_potential, aes(x = State, y = Zscore, label = chromosome)) +
  geom_point() +
  geom_text(angle = 45, hjust = 0, vjust = 0, size = 3) +
  scale_x_discrete(limits = rev(states), labels = all_state_labels) +
  scale_y_continuous(limits = c(-5, 5)) +
  theme(
    panel.grid.major.y = element_line(color = "grey"),
    axis.title.y = element_text(margin = margin(r = -10))
  ) +
  ylab("Z-score, % of samples in state") +
  geom_hline(yintercept = c(-5, -4, -3, -2, -1, 1, 2, 3, 4, 5), color = "grey") +
  coord_flip()

# Layout: a and b on the top row, c and d on the middle row, e across the bottom row
grid.arrange(a, b, c, d, e, layout_matrix = rbind(c(1,2), c(3,4), c(5)), heights = c(0.35, 0.25, 0.4))
```

```{r Figure S17 source data}
# Source data for Figure S17, one tab-delimited file per panel

## Panel a: bases per chromosome and chromHMM state
write.table(
  chrom_state_total[, c("chromosome", "Bases", "State")],
  file = "source_data/Figure_S17a.txt",
  sep = "\t", quote = FALSE, row.names = FALSE
)

## Panel b: CpGs per chromosome and methylation state
write.table(
  chr_state_WGBS_total[, c("chromosome", "CpGs", "State")],
  file = "source_data/Figure_S17b.txt",
  sep = "\t", quote = FALSE, row.names = FALSE
)

## Panel c: DHS peaks and size per chromosome
write.table(
  chr_state_DNase_total[, c("chromosome", "size", "Peaks")],
  file = "source_data/Figure_S17c.txt",
  sep = "\t", quote = FALSE, row.names = FALSE
)

## Panel d: H3K27ac peaks and size per chromosome
write.table(
  chr_state_H3K27ac_total[, c("chromosome", "size", "Peaks")],
  file = "source_data/Figure_S17d.txt",
  sep = "\t", quote = FALSE, row.names = FALSE
)

## Panel e: Z-score per state and chromosome
write.table(
  chromosome_potential[, c("State", "Zscore", "chromosome")],
  file = "source_data/Figure_S17e.txt",
  sep = "\t", quote = FALSE, row.names = FALSE
)
```

### Supplementary Figure 18 (intergenic potential)

```{r Figure S18 scripts, cache=TRUE, cache.lazy=FALSE, dependson=c("Figure 6 scripts")}
# Calculates the proportional increase in the mean proportion of samples each TE is annotated with each epigenetic state,
# for TEs overlapping each genic feature versus those that do not
# (presumably creates feature_state_mean_class, plotted in Figure S18a -- confirm in the script)
source("R_scripts/feature_overlap_state.R")

# Calculates the proportion of TEs in each TE class annotated with each epigenetic state that overlap genic features,
# scaled by the frequency with which the TE is annotated with the state
# (presumably creates state_genic and rmsk_TE_genic, plotted in Figure S18b -- confirm in the script)
source("R_scripts/intergenic_potential.R")
```

a. Percent increase in the mean proportion of samples each TE is annotated with each epigenetic state, for TEs overlapping each genic feature versus those that do not, by class. Values > 5000 are colored dark blue. b. For each state (1_TssA, 4_Tx, 5_TxWk, 7_Enh, DHS, and H3K27ac) and TE class (DNA, LINE, LTR, SINE only), the proportion of TEs that overlap genes (including promoters) at each frequency of annotation (x-axis). The dashed lines give the proportion of all members of the class that overlap genes. Analysis is split between all genes and only protein-coding genes.

```{r Figure S18, echo=FALSE}
# a) Heatmap of the percent increase (Enrichment*100) per state (x) and genic feature (y), one panel per TE class
# Only rows with Coding == "All" and no NA values are drawn; values outside the fill limits
# (-100 to 5000) become NA on the scale and are therefore drawn in the na.value color (dark blue)
a = ggplot(
  na.omit(feature_state_mean_class[which(feature_state_mean_class$Coding == "All"),]),
  aes(x = State, y = forcats::fct_rev(Feature), fill = Enrichment*100)
) +
  geom_tile() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    panel.background = element_rect(fill = "lightgrey"),
    axis.title.x = element_text(margin = margin(t = -10))
  ) +
  scale_fill_gradient2(
    name = "% increase\nin sample\nproportion",
    low = "darkblue",
    high = "darkred",
    mid = "white",
    midpoint = 0,
    limits = c(-100, 5000),
    na.value = "darkblue"
  ) +
  scale_y_discrete(labels = genic_labels) +
  scale_x_discrete(labels = all_state_labels) +
  facet_wrap(~Class) +
  ylab("Feature")

# b) Proportion of TEs overlapping genes at each annotation frequency (Bin), per class (color),
# for states[c(1,4:5,7,20:21)], excluding the SVA and Other classes
# Horizontal lines show the class-wide proportion from rmsk_TE_genic
# NOTE(review): these lines are drawn dotted, while the figure legend calls them dashed -- confirm which is intended
b = ggplot(
  state_genic[which(!(state_genic$class_update %in% c("SVA","Other")) & state_genic$State %in% states[c(1,4:5,7,20:21)]),],
  aes(x = Bin, y = Ratio, color = class_update)
) +
  geom_line() +
  ylim(0, 1) +
  scale_color_manual(values = class_colors, name = "Class") +
  xlab("Proportion of samples in state") +
  ylab("Proportion of TEs overlapping genes") +
  geom_hline(
    data = rmsk_TE_genic[which(!(rmsk_TE_genic$class_update %in% c("SVA","Other"))),],
    aes(yintercept = Proportion, color = class_update),
    linetype = "dotted"
  ) +
  facet_grid(
    State~Genes,
    labeller = labeller(
      Genes = setNames(c("All genes","Protein-coding genes"), c("All","PC")),
      State = all_state_labels
    )
  )

# Stack panel a above panel b
grid.arrange(a, b, nrow = 2)
```

```{r Figure S18 source data}
# Source data for Figure S18

## Panel a: complete rows with Coding == "All"
write.table(
  na.omit(feature_state_mean_class[which(feature_state_mean_class$Coding == "All"),])[
    ,
    c("State", "Feature", "Enrichment", "Class")
  ],
  file = "source_data/Figure_S18a.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

## Panel b, curves: plotted states, excluding the SVA and Other classes
write.table(
  state_genic[
    which(!(state_genic$class_update %in% c("SVA","Other")) & state_genic$State %in% states[c(1,4:5,7,20:21)]),
    c("Bin", "Ratio", "class_update", "State", "Genes")
  ],
  file = "source_data/Figure_S18b.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

## Panel b, horizontal reference lines: class-wide proportions, excluding the SVA and Other classes
write.table(
  rmsk_TE_genic[
    which(!(rmsk_TE_genic$class_update %in% c("SVA","Other"))),
    c("Proportion", "class_update", "Genes")
  ],
  file = "source_data/Figure_S18b_dashed.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
```

### CpG density vs class repression mechanism

To answer the question of whether CpG density or class has a stronger impact on TE epigenetic repression mechanism, created generalized linear models with a quasi-Poisson distribution for the response, comparing the number of samples in the hypermethylated or 9_Het state to just class or class and CpG density.

```{r repression modelling}
# Runs the modelling script; the model objects compared below (model_het1, model_het2,
# model_hyper1, model_hyper2) are not defined in this chunk and presumably come from it
source("R_scripts/CpG_density_class_represssion.R")

# Per class: mean, then median, of the 9_Het and Hypermethylated columns of rmsk_TE_measure
# (missing Hypermethylated values are ignored)
ddply(
  rmsk_TE_measure, .(class_update), summarise,
  Het=mean(`9_Het`),
  Hyper=mean(Hypermethylated, na.rm=TRUE)
)
ddply(
  rmsk_TE_measure, .(class_update), summarise,
  Het=median(`9_Het`),
  Hyper=median(Hypermethylated, na.rm=TRUE)
)

# Per class: median and IQR of CpGs_per_length, scaled by 1000 (CpGs per kbp)
ddply(
  rmsk_TE, .(class_update), summarise,
  CpG_dens=median(CpGs_per_length)*1000,
  dens=IQR(CpGs_per_length)*1000
)

# F tests comparing the first and second model of each pair
# (does adding CpG density significantly improve the model?)
anova(model_het1, model_het2, test="F")
anova(model_hyper1, model_hyper2, test="F")

# Coefficient summaries for the second model of each pair
summary(model_het2)
summary(model_hyper2)
```

### Supplementary Figure 19

a. Spearman correlation between TE mappability, JC distance, number of CpGs, and CpG density, for all TEs. b. Spearman correlation between TE mappability, JC distance, number of CpGs, and CpG density, by class. c. Spearman correlation between TE characteristics and proportion of samples annotated with each methylation state, for all TEs. d. Spearman correlation between TE characteristics and proportion of samples annotated with each methylation state, by class. 

```{r Figure S19, echo=FALSE,fig.height=6}
# All four panels are heatmaps of Spearman's rho (estimate.rho) on a fixed -1..1 diverging scale.
# The per-panel legends are suppressed with guide="none" (guide=FALSE is deprecated in current
# ggplot2); a single shared legend is extracted separately and placed under the panels.

# Panel a: correlations between TE characteristics, all TEs (class_update == "All")
# NOTE: the axis labels are assigned by position and so rely on the level order of Metric/State
a = ggplot(
  correlate_TE_feature[which(correlate_TE_feature$class_update == "All"),],
  aes(x=Metric, y=State, fill=estimate.rho)
) +
  geom_tile() +
  scale_fill_gradient2(limits=c(-1,1), high="darkred", low="darkblue", mid="white", guide="none") +
  theme(
    axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
    panel.background=element_rect(fill="lightgrey"),
    aspect.ratio = 1
  ) +
  scale_y_discrete(labels=c("Mappability","Age","CpGs","CpGs per bp")) +
  scale_x_discrete(labels=c("Length","Mappability","Age","CpGs")) +
  xlab("TE characteristic") +
  ylab("TE characteristic")

# Panel b: as panel a, but for the per-class rows, one facet per class
b = ggplot(
  correlate_TE_feature[which(correlate_TE_feature$class_update != "All"),],
  aes(x=Metric, y=State, fill=estimate.rho)
) +
  geom_tile() +
  scale_fill_gradient2(limits=c(-1,1), high="darkred", low="darkblue", mid="white", guide="none") +
  theme(
    axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
    panel.background=element_rect(fill="lightgrey"),
    aspect.ratio = 1
  ) +
  scale_y_discrete(labels=c("Mappability","Age","CpGs","CpGs per bp")) +
  scale_x_discrete(labels=c("Length","Mappability","Age","CpGs")) +
  xlab("TE characteristic") +
  ylab("TE characteristic") +
  facet_wrap(~class_update)

# Panel c: correlations between TE characteristics and the methylation states, all TEs
c = ggplot(
  droplevels(correlate_TE_state[which(correlate_TE_state$class_update == "All" & correlate_TE_state$State %in% meth_states),]),
  aes(x=Metric, y=State, fill=estimate.rho)
) +
  geom_tile() +
  scale_fill_gradient2(limits=c(-1,1), high="darkred", low="darkblue", mid="white", guide="none") +
  theme(
    axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
    panel.background=element_rect(fill="lightgrey"),
    aspect.ratio = 1
  ) +
  scale_y_discrete(limits=rev(meth_states)) +
  scale_x_discrete(limits=measure_metrics[c(1,4:5,2:3)], labels=c("Length","CpGs","CpGs per bp","Mappability","Age")) +
  xlab("TE characteristic") +
  ylab("Samples in state")

# Panel d: as panel c, but for the per-class rows, one facet per class
d = ggplot(
  droplevels(correlate_TE_state[which(correlate_TE_state$class_update != "All" & correlate_TE_state$State %in% meth_states),]),
  aes(x=Metric, y=State, fill=estimate.rho)
) +
  geom_tile() +
  scale_fill_gradient2(limits=c(-1,1), high="darkred", low="darkblue", mid="white", guide="none") +
  theme(
    axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
    panel.background=element_rect(fill="lightgrey"),
    aspect.ratio = 1
  ) +
  scale_y_discrete(limits=rev(meth_states)) +
  scale_x_discrete(limits=measure_metrics[c(1,4:5,2:3)], labels=c("Length","CpGs","CpGs per bp","Mappability","Age")) +
  xlab("TE characteristic") +
  ylab("Samples in state") +
  facet_wrap(~class_update)

# Shared colour-scale legend, taken from a throwaway plot that uses the same fill scale
scale_legend = get_legend(
  ggplot(
    correlate_TE_feature[which(correlate_TE_feature$class_update == "All"),],
    aes(x=Metric, y=State, fill=estimate.rho)
  ) +
    geom_tile() +
    scale_fill_gradient2(limits=c(-1,1), high="darkred", low="darkblue", mid="white", name="Spearman's rho") +
    theme(panel.background=element_rect(fill="lightgrey"), aspect.ratio = 1, legend.position="bottom")
)

# 3 x 2 layout: panels a-d fill the first two rows, the legend sits in the third
grid.arrange(a, b, c, d, scale_legend, nrow=3, heights=c(0.45,0.45,0.1), widths=c(0.45,0.55))
```

```{r Figure S19 source data}
# Panel a: TE characteristic correlations, all TEs
write.table(
  correlate_TE_feature[which(correlate_TE_feature$class_update == "All"),c("Metric","State","estimate.rho")],
  file="source_data/Figure_S19a.txt",
  sep='\t', row.names=FALSE, quote=FALSE
)

# Panel b: TE characteristic correlations, by class
write.table(
  correlate_TE_feature[which(correlate_TE_feature$class_update != "All"),c("Metric","State","estimate.rho","class_update")],
  file="source_data/Figure_S19b.txt",
  sep='\t', row.names=FALSE, quote=FALSE
)

# Panel c: characteristic vs methylation state correlations, all TEs
write.table(
  droplevels(correlate_TE_state[which(correlate_TE_state$class_update == "All" & correlate_TE_state$State %in% meth_states),])[,c("Metric","State","estimate.rho")],
  file="source_data/Figure_S19c.txt",
  sep='\t', row.names=FALSE, quote=FALSE
)

# Panel d: characteristic vs methylation state correlations, by class
write.table(
  droplevels(correlate_TE_state[which(correlate_TE_state$class_update != "All" & correlate_TE_state$State %in% meth_states),])[,c("Metric","State","estimate.rho","class_update")],
  file="source_data/Figure_S19d.txt",
  sep='\t', row.names=FALSE, quote=FALSE
)
```

### TE feature stats

```{r TE feature stats, cache=TRUE, cache.lazy=FALSE, dependson=c("Figure 6 scripts")}
# Proportion of increase in the mean proportion of samples each TE is annotated with each epigenetic state,
# For TEs overlapping each genic feature versus those that do not
## Overall (features as rows, states as columns)
dcast(feature_state_mean[which(feature_state_mean$Coding == "All"),],Feature~State,value.var = "Enrichment")
## By class
feature_state_mean_class[which(feature_state_mean_class$Coding == "All"),]

# Proportion of TEs in each epigenetic state in at least one sample that overlap each genic feature
# Result: one column per state, one row per column of `cohorts`.
# The rows are counted from the index vectors directly instead of subsetting the whole
# rmsk_TE_measure data frame for every state x cohort combination; which() drops NA
# comparisons, so the counts are the same as the row counts of the subsetted data frame.
apply(rmsk_TE_measure[,states],2,function(x) {
  n_in_state = length(which(x > 0))
  apply(rmsk_TE_measure[,cohorts],2,function(y) length(which(x > 0 & y > 0))/n_in_state)
})
```

### Chromosome stats

```{r chromosome stats}
# Chromosomes ordered by the proportion of TEs on the chromosome versus the proportion of the genome represented by the chromosome
hg19_TEs[order(hg19_TEs$Prop_TEs/hg19_TEs$Prop),]

# Z-scores for each state and chromosome based on the mean number of samples each TE is annotated with the state, ordered by Z-score
chromosome_potential[order(chromosome_potential$Zscore),]

# Proportion of TEs in each epigenetic state in at least one sample on each chromosome
# The row index of TEs in the state is computed once per state and applied to the chromosome
# column only, rather than subsetting the whole data frame twice per state; the tabulated
# values are unchanged. Rows are then restricted to the first 24 standard chromosomes.
apply(rmsk_TE_measure[,states],2,function(x) {
  in_state = which(x > 0)
  table(rmsk_TE_measure$chromosome[in_state])/length(in_state)
})[standard_chromosomes[1:24],]
```

## Evolutionary conservation of TE regulatory signatures in mouse

Identified corresponding human and mouse sample pairs based on anatomy. 

```{bash sample pairs, eval=FALSE}
# Twelve Roadmap samples with corresponding mouse WGBS samples (7) and mouse chromHMM samples (12)
#/bar/epehrsson/TE_landscape/sample_lists/human_mouse_samples.txt
```

Filtered the mm10 RepeatMasker file to remove contigs and include only TE classes (four large classes, plus Other, RC, Unknown, and five ? classes). 

```{bash mm10 TEs, eval=FALSE}
# RepeatMasker file for mm10
cp /bar/genomes/mm10/rmsk/rmsk.txt.gz rmsk_mm10.txt.gz
# Decompress so that the awk command below can read rmsk_mm10.txt
gunzip rmsk_mm10.txt.gz

## Output
#/bar/epehrsson/TE_landscape/features/mouse/rmsk_mm10.txt

# mm10 mouse TEs, chromosomes 1-19, X, Y, M only
# Keeps rows whose class (column 12) is one of the listed TE classes and whose chromosome name
# (column 6) contains no underscore, i.e. drops contigs.
# Output columns: input columns 6, 7, 8, 11, 12, 13, 10
# (presumably chromosome, start, end, subfamily, class, family, strand - confirm against the rmsk schema).
# NOTE: the "Other" test previously read `12 == "Other"` (missing `$`), which is never true, so
# "Other"-class elements were silently dropped. Files generated with the earlier command will
# lack that class and need regenerating.
awk -v OFS='\t' '{if(($12 == "LTR" || $12 == "DNA" || $12 == "SINE" || $12 == "LINE" || $12 == "Other" || $12 == "RC" || $12 == "Unknown" || $12 == "DNA?" || $12 == "LINE?" || $12 == "LTR?" || $12 == "SINE?" || $12 == "RC?") && ($6 !~ /_/)) print $6, $7, $8, $11, $12, $13, $10}' rmsk_mm10.txt > mm10_rmsk_TE.txt

## Output
#/bar/epehrsson/TE_landscape/features/mouse/mm10_rmsk_TE.txt
```

Identified orthologous human and mouse TEs. First, gave each of the human TEs a number. Then, lifted the locations from hg19 to mm10, using the minMatch value recommended for inter-species conversions. Pulled out the hg19 TEs that lifted over to mm10 by number, then combined the hg19 and mm10 coordinates for each TE. Then, intersected the mm10 coordinates of the hg19 TEs with the mouse mm10 TEs and identified those with the same subfamily name. 

Confirmed that no orthologs are missed by first filtering the mouse mm10 TEs to TE classes. 

```{bash human-mouse liftover, eval=FALSE}
# Provenance record of the hg19 -> mm10 ortholog pipeline. The relative paths differ between
# commands (e.g. ../../genomes, ../bin, features/mouse, Mouse/), so the commands were presumably
# run from different working directories - check paths before re-running as a script.

#To load liftOver
ml kentUCSC

# All TE RepeatMasker bedfile with unique number for each
# Column 4 is NR, the line number across the two concatenated inputs, used as the TE number;
# column 5 is a "." placeholder; column 6 is input column 7 (presumably strand - confirm)
cat rmsk_TE.txt rmsk_other.txt | awk -v OFS="\t" '{print $1,$2,$3,NR,".",$7}' - > rmsk_TEother_numbered.txt

## Output
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_TEother_numbered.txt

# Liftover between human TEs and mouse mm10
# Arguments: input BED, chain file, lifted-over output, unmapped records;
# minMatch=0.1 is the value described above as recommended for inter-species conversions
liftOver rmsk_TEother_numbered.txt ../../genomes/hg19/chainFiles/hg19ToMm10.over.chain.gz rmsk_TE_hg19tomm10.bed out_mm10.txt -minMatch=0.1

## Output
#/bar/epehrsson/TE_landscape/Mouse/liftover/rmsk_TE_hg19tomm10.bed

## Human TEs that do not lift over to mouse	
#/bar/epehrsson/TE_landscape/Mouse/liftover/out_mm10.txt

# Numbers of human TEs that lifted over to mouse
# (column 4 of the lifted-over BED carries the TE number assigned above)
awk '{print $4}' rmsk_TE_hg19tomm10.bed > numbers.txt

## Output
#/bar/epehrsson/TE_landscape/Mouse/liftover/numbers.txt

# Pull out human TEs lifted over to mouse by number
# NOTE(review): the input is rmsk_TEother.txt, which is not created in this chunk (the numbered
# file is rmsk_TEother_numbered.txt); presumably the un-numbered concatenation of rmsk_TE.txt and
# rmsk_other.txt, selected by line number - confirm against subset_file.py
python ../bin/TE_landscape/subset_file.py rmsk_TEother.txt numbers.txt rmsk_TE_in_mm10.txt

## Output
#/bar/epehrsson/TE_landscape/Mouse/liftover/rmsk_TE_in_mm10.txt

# Combine human and lifted over mouse coordinates
# Columns 1-3 and 6 of the lifted-over BED (mm10 position) are pasted line-by-line next to the
# hg19 record; assumes both files list the TEs in the same order - verify
cut -f1-3,6 rmsk_TE_hg19tomm10.bed | paste - rmsk_TE_in_mm10.txt > rmsk_TE_mm10.bed

## Output
#/bar/epehrsson/TE_landscape/Mouse/rmsk_TE_mm10.bed

# Lifted-over human coordinates intersected with mouse TE coordinates
# -wo reports each overlapping pair followed by the number of overlapping bases
bedtools intersect -wo -a rmsk_TE_mm10.bed -b mm10_rmsk_TE.txt > hg19_mm10_TE_intersect.bed

## Output
#/bar/epehrsson/TE_landscape/Mouse/liftover/hg19_mm10_TE_intersect.bed

# Human-mouse orthologs TEs with same subfamily name (hg19-mm10)
# Keeps rows where column 8 equals column 15 (presumably the hg19 and mm10 subfamily names,
# given a 4 + 7 + 7 column layout - confirm)
awk -v OFS="\t" '{if($8==$15)print $0}' hg19_mm10_TE_intersect.bed > hg19_mm10_TE_intersect_same.bed

## Output
#/bar/epehrsson/TE_landscape/Mouse/liftover/hg19_mm10_TE_intersect_same.bed

# Test to confirm I am not missing orthologs by filtering repeats
# Repeats the intersection and subfamily match against every mm10 RepeatMasker record
# (same seven columns, no class or contig filter)
awk -v OFS='\t' '{print $6, $7, $8, $11, $12, $13, $10}' features/mouse/rmsk_mm10.txt | bedtools intersect -wo -a Mouse/rmsk_TE_mm10.bed -b - > Mouse/hg19_mm10_repeat_intersect.bed
awk -v OFS="\t" '{if($8==$15)print $0}' Mouse/hg19_mm10_repeat_intersect.bed > Mouse/hg19_mm10_repeat_intersect_same.bed
```

Methylation level of mm10 TEs. Intersected the mm10 TEs with the CpG methylation level in 9 mouse samples, then combined into one file. Calculated the average methylation level per TE over all CpGs overlapping the TE, for each sample. 

```{bash mm10 WGBS, eval=FALSE}
# WGBS (mm10)
# NOTE(review): $file is not set in this chunk; this is presumably the body of a loop over the
# per-sample WGBS files. The step that combines the per-sample intersections into
# mm10_rmsk_TE_WGBS.bed is also not shown here.
# Columns 1, 2, 3, 10 and 11 of each WGBS file are intersected with the mm10 TEs; -wo reports
# each overlapping pair followed by the number of overlapping bases.
awk -v OFS='\t' '{print $1, $2, $3, $10, $11}' $file | bedtools intersect -wo -a mm10_rmsk_TE.txt -b - > mm10_rmsk_TE_$file;

## Output
#/bar/epehrsson/TE_landscape/Mouse/WGBS/intersect/TEs/mm10_rmsk_TE_ENCFF#.bed [9 files]
#/bar/epehrsson/TE_landscape/Mouse/WGBS/mm10_rmsk_TE_WGBS.bed

# mm10 TE average methylation 
# Arguments: combined intersection file, TE file, sample list, output file
# (per the text above, averages methylation over all CpGs overlapping each TE, per sample)
python ~/bin/TE_landscape/average_methylation_stranded.py mm10_rmsk_TE_WGBS.bed mm10_rmsk_TE.txt samples.txt  mm10_rmsk_TE_WGBS_avg.txt

## Output
#/bar/epehrsson/TE_landscape/Mouse/WGBS/mm10_rmsk_TE_WGBS_avg.txt
```

Counted the number of CpGs per mm10 TE. 

```{bash mm10 CpG count, eval=FALSE}
# Mouse TEs
# Counts the input lines for each distinct combination of the first seven columns (the TE record)
# and prints those seven columns followed by the count, tab-separated
awk -v OFS='\t' '
  { n[$1, $2, $3, $4, $5, $6, $7] += 1 }
  END {
    for (key in n) {
      split(key, f, SUBSEP)
      print f[1], f[2], f[3], f[4], f[5], f[6], f[7], n[key]
    }
  }' Mouse/WGBS/intersect/TEs/mm10_rmsk_TE_WGBS.bed > Mouse/WGBS/intersect/TEs/mm10_rmsk_TE_CpG_count.txt

## Output
#/bar/epehrsson/TE_landscape/Mouse/WGBS/mm10_rmsk_TE_CpG_count.txt
```

For the hg19 TEs with an ortholog in mouse, found the chromHMM state of each TE in the 12 human samples used for comparison to mouse. To ensure that only those TEs were pulled out, required complete reciprocal overlap. Confirmed that only TEs with matching subfamily were pulled out, and that the number of unique TEs is 269,096 for each sample. 

```{bash hg19 ortholog chromHMM, eval=FALSE}
# Orthologous hg19 TEs
# Columns 5-11 of the same-subfamily intersection are the hg19 TE record; adjacent duplicate
# records are collapsed and the result is sorted by chromosome and start for `bedtools -sorted`
# NOTE(review): this writes to (and the loop below reads from) Mouse/hg19_orthologs.txt, while
# the Output note lists Mouse/liftover/ - the file was presumably moved afterwards
cut -f5-11 Mouse/liftover/hg19_mm10_TE_intersect_same.bed | uniq | sort -k1,1 -k2,2n > Mouse/hg19_orthologs.txt

## Output
#/bar/epehrsson/TE_landscape/Mouse/liftover/hg19_orthologs.txt

# Uses the chromHMM state of TEs, split by sample, used for combine marks analysis
# For each human sample: skip the header line, sort, and keep only records with complete
# reciprocal overlap (-f 1 -r) with an orthologous hg19 TE.
# The per-sample files are written to Mouse/chromHMM/, the directory the combining loop below
# reads from and the Output note lists (previously they were written to Mouse/, so the
# combining loop matched no files).
for file in E066 E071 E081 E083 E084 E086 E088 E092 E094 E096 E105 E106; do tail -n +2 /scratch/ecp/pandas/$file | sort -k1,1 -k2,2n - | bedtools intersect -sorted -wo -a Mouse/hg19_orthologs.txt -b - -f 1 -r > Mouse/chromHMM/hg19_ortholog_chromHMM_$file\.txt; done

# Combine columns 8-18 of all per-sample files into one file
# NOTE: this appends (>>); remove any existing hg19_chromHMM_TE.txt before re-running
for file in Mouse/chromHMM/hg19_ortholog_chromHMM_*.txt; do cut -f8-18 $file >> Mouse/chromHMM/hg19_chromHMM_TE.txt; done

## Output
#/bar/epehrsson/TE_landscape/Mouse/chromHMM/hg19_ortholog_chromHMM_E#.txt [12 files]
#/bar/epehrsson/TE_landscape/Mouse/chromHMM/hg19_chromHMM_TE.txt
```

For the mm10 TEs with an ortholog in human, found the chromHMM state of each TE in the 12 mouse samples used for comparison to human. First, intersected the mm10 TEs with the mm10 chromHMM annotations, then found the total length of overlap with the state in that sample for each TE. Unlike hg19 TEs, any overlap with a chromHMM state was sufficient for the mm10 TE to be assigned to the state. Because some mm10 TEs are orthologs for multiple hg19 TEs, duplicate intersections were removed first.

```{bash mm10 chromHMM, eval=FALSE}
# Intersected with TEs with human orthologs
# For each sample (third field of the sample list): decompress the mouse chromHMM BED, intersect
# the mm10 ortholog records (columns 12-18 of the same-subfamily intersection) with it, then
# recompress. An mm10 TE that is the ortholog of several hg19 TEs appears several times in the
# -a input, so its intersections are repeated in the output.
while read a b c; do gunzip raw_data/mouse/chromHMM/$c\.bed.gz; cut -f12-18 Mouse/liftover/hg19_mm10_TE_intersect_same.bed | bedtools intersect -wo -a - -b raw_data/mouse/chromHMM/$c\.bed > Mouse/chromHMM/$c\_TE.txt; gzip raw_data/mouse/chromHMM/$c\.bed; done < Mouse/human_mouse_samples.txt

## Output
#/bar/epehrsson/TE_landscape/Mouse/chromHMM/ENCFF#_TE.txt [12 files]

# Total overlap per mm10 TE (columns 1-7) and state (column 11), summing column 17, per sample
# Duplicate intersections are removed with `sort -u` rather than `uniq`: uniq only collapses
# ADJACENT identical lines, and a repeated TE that overlaps several chromHMM segments produces
# interleaved duplicates (seg1, seg2, seg1, seg2) that uniq leaves in place, double-counting the
# overlap length. The awk aggregation and final sort do not depend on input order.
# NOTE: this appends (>>); remove any existing mm10_chromHMM_TE_sorted.txt before re-running
while read a b c; do sort -u Mouse/chromHMM/$c\_TE.txt | awk -v OFS='\t' -v sample=$c '{a[$1, $2, $3, $4, $5, $6, $7, $11]+=$17}END{for(i in a) {split (i, sep, SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sep[8], sample, a[i]}}' - | sort -k1,1V -k2,2V -k3,3n -k9,9 - >> Mouse/chromHMM/mm10_chromHMM_TE_sorted.txt; done < Mouse/human_mouse_samples.txt

## Output
#/bar/epehrsson/TE_landscape/Mouse/chromHMM/mm10_chromHMM_TE_sorted.txt
```

Found the number of members in each mm10 subfamily annotated with each chromHMM state. First, intersected all mm10 TEs, including those without an hg19 ortholog, with the mm10 chromHMM annotations, then found the total overlap with each state for each TE by sample. Then, counted the number of TEs in each subfamily in each state by sample. 

```{bash mm10 subfamily, eval=FALSE}
# Intersected with all mm10 TEs
# For each sample (third field of the sample list): decompress the mouse chromHMM BED, intersect
# all mm10 TEs with it, then recompress. The loop runs in the background.
while read a b c; do gunzip raw_data/mouse/chromHMM/$c\.bed.gz; bedtools intersect -wo -a features/mouse/mm10_rmsk_TE.txt -b raw_data/mouse/chromHMM/$c\.bed > Mouse/chromHMM/$c\_TE_all.txt; gzip raw_data/mouse/chromHMM/$c\.bed; done < Mouse/human_mouse_samples.txt &
# Block until the background loop has finished: the next step reads the files it writes
wait

## Output
#/bar/epehrsson/TE_landscape/Mouse/chromHMM/ENCFF#_TE_all.txt [12 files]

# chromHMM, any overlap, for all mm10 TEs
# Total overlap per mm10 TE (columns 1-7) and state (column 11), summing column 17, per sample
# NOTE: this appends (>>); remove any existing mm10_chromHMM_TE_sorted_all.txt before re-running
while read a b c; do awk -v OFS='\t' -v sample=$c '{a[$1, $2, $3, $4, $5, $6, $7, $11]+=$17}END{for(i in a) {split (i, sep, SUBSEP); print sep[1], sep[2], sep[3], sep[4], sep[5], sep[6], sep[7], sep[8], sample, a[i]}}' Mouse/chromHMM/$c\_TE_all.txt | sort -k1,1V -k2,2V -k3,3n -k9,9 - >> Mouse/chromHMM/mm10_chromHMM_TE_sorted_all.txt; done < Mouse/human_mouse_samples.txt

## Output
#/bar/epehrsson/TE_landscape/Mouse/chromHMM/mm10_chromHMM_TE_sorted_all.txt

## mm10 TEs in state x subfamily x sample
# Counts lines per combination of column 4 (subfamily), column 8 (state) and column 9 (sample)
awk -v OFS='\t' '{a[$4,$8,$9]+=1}END{for(i in a) {split (i, sep, SUBSEP); print sep[1], sep[2], sep[3], a[i]}}' Mouse/chromHMM/mm10_chromHMM_TE_sorted_all.txt > Mouse/chromHMM/mm10_chromHMM_subfamily.txt

## Output
#/bar/epehrsson/TE_landscape/Mouse/chromHMM/mm10_chromHMM_subfamily.txt
```

Found the proportion of each hg19 subfamily in an active regulatory chromHMM state. For the 12 samples used for comparison to mouse, pulled out all TEs in any active regulatory state in the sample, then counted the number per subfamily. 

```{bash hg19 subfamily active, eval=FALSE}
# Uses chromHMM state of TEs, split by sample, used for combine marks analysis
# Per sample file: keep records whose column 10 is one of the five active regulatory states,
# print columns 1-8, collapse adjacent duplicate lines, then count lines per combination of
# columns 4 and 8. Results for all samples are appended to a single file
# (the counting step sets no OFS, so its output is space-separated).
for file in E066 E071 E081 E083 E084 E086 E088 E092 E094 E096 E105 E106; do
  awk -v OFS='\t' '
    ($10 == "1_TssA") || ($10 == "2_TssAFlnk") || ($10 == "3_TxFlnk") || ($10 == "6_EnhG") || ($10 == "7_Enh") {
      print $1, $2, $3, $4, $5, $6, $7, $8
    }' /scratch/ecp/pandas/$file \
    | uniq \
    | awk '
      { n[$4,$8] += 1 }
      END {
        for (key in n) {
          split(key, f, SUBSEP)
          print f[1], f[2], n[key]
        }
      }' - \
    >> Mouse/chromHMM/hg19_chromHMM_subfamily_active.txt
done

## Output
#/bar/epehrsson/TE_landscape/Mouse/chromHMM/hg19_chromHMM_subfamily_active.txt
```

Found the proportion of each mm10 subfamily in an active regulatory chromHMM state. For the 12 samples used for comparison to human, pulled out all TEs in any active regulatory state in the sample, then counted the number per subfamily. 

```{bash mm10 subfamily active, eval=FALSE}
# Proportion in any active regulatory state
# Keep records whose state (column 8) is one of the six active regulatory states, print
# columns 1-7 and 9 (dropping the state), collapse adjacent duplicate lines, then count lines
# per combination of columns 4 and 8 of the reduced records; output is tab-separated
awk -v OFS='\t' '
  ($8 == "TssA") || ($8 == "TssAFlnk1") || ($8 == "TssAFlnk2") || ($8 == "Enh") || ($8 == "EnhLo1") || ($8 == "EnhLo2") {
    print $1, $2, $3, $4, $5, $6, $7, $9
  }' Mouse/chromHMM/mm10_chromHMM_TE_sorted_all.txt \
  | uniq \
  | awk -v OFS='\t' '
    { n[$4,$8] += 1 }
    END {
      for (key in n) {
        split(key, f, SUBSEP)
        print f[1], f[2], n[key]
      }
    }' - \
  > Mouse/chromHMM/mm10_chromHMM_subfamily_active.txt

## Output
#/bar/epehrsson/TE_landscape/Mouse/chromHMM/mm10_chromHMM_subfamily_active.txt
```

### Figure 7

```{r mouse scripts, cache=TRUE, cache.lazy=FALSE}
# Run the four human/mouse analysis scripts in order; each is sourced exactly as a direct
# top-level source() call would be, and the return values are discarded
invisible(lapply(
  c(
    # Creates dataframes of human and corresponding mouse samples, mm10 TEs, and hg19/mm10 orthologous TEs
    "R_scripts/human_mouse_orthologs.R",

    # Generates dataframes of the average methylation level and methylation state for orthologous TEs,
    # including mm10 TEs and hg19/mm10 ortholog pairs
    "R_scripts/human_mouse_ortholog_WGBS.R",

    # Generates dataframes of chromHMM state for orthologous TEs, including hg19 TEs, mm10 TEs and
    # hg19/mm10 ortholog pairs, as well as tables of the number of samples each orthologous TE pair
    # is in a promoter or active regulatory state by tissue
    "R_scripts/human_mouse_ortholog_chromHMM.R",

    # Generates dataframes with the proportion of each shared subfamily hypomethylated, in each
    # chromHMM state, or in an active regulatory state in each species in each human/mouse sample
    # pair, restricted to those with >30 members in both species
    "R_scripts/human_mouse_subfamily.R"
  ),
  source
))
```

a. 2D density plot of orthologous TE average methylation in paired human (x-axis) and mouse (y-axis) samples. Excludes TEs missing methylation in either sample of a pair. b. Orthologous TE methylation state in paired human (x-axis) and mouse (y-axis) samples. X-axis includes proportion of orthologous hg19 TEs in each methylation state in human samples. c. Orthologous TE chromHMM state in paired human (x-axis) and mouse (y-axis) samples. X-axis includes proportion of orthologous hg19 TEs in each chromHMM state in human samples. d. Proportion of orthologous TEs with tissue-specific epigenetic activity in humans with each level of specificity for the same tissue in mouse, for 5 tissues and all active regulatory chromHMM states or only promoter states. 

```{r Figure 7, echo=FALSE, fig.height=5}
# Panel a: 2D binned density (50 bins) of human vs mouse methylation level for paired orthologous TEs
a = ggplot(
  human_mouse_orthologs_WGBS_paired,
  aes(x=Human_methylation, y=Mouse_methylation)
) +
  stat_bin2d(bins=50) +
  scale_fill_gradient(low="white", high="darkblue", name="Number\nof TEs") +
  xlab("Methylation level (human)") +
  ylab("Methylation level\n(mouse)") +
  coord_equal() +
  theme(
    legend.position = "bottom",
    legend.direction="horizontal",
    legend.box.margin = margin(t=-10)
  )

# Fraction of rows of the paired WGBS table in each human methylation state
# (used to annotate the x-axis labels of panel b)
state_freq = ddply(
  human_mouse_orthologs_WGBS_paired, .(Human_state_WGBS), summarise,
  Entries=length(Human_state_WGBS)/dim(human_mouse_orthologs_WGBS_paired)[1]
)

# Panel b: for each human methylation state, the proportion of orthologs in each mouse state.
# X-axis labels are "<state>\n(<percent>%)", rounded to whole percent, and are assigned by position.
b = ggplot(
  human_mouse_orthologs_WGBS_paired,
  aes(x=Human_state_WGBS, fill=Mouse_state_WGBS)
) +
  geom_bar(position="fill") +
  scale_fill_manual(values=meth_colors, name="State (mouse)", labels=meth_labels) +
  xlab("State (human)") +
  ylab("Proportion in state\nin mouse") +
  scale_x_discrete(
    labels=as.vector(apply(state_freq, 1, function(x) paste0(x[1], "\n(", round(as.numeric(x[2])*100,0), "%)")))
  ) +
  theme(
    axis.text.x=element_text(angle=90, hjust=1, vjust=0.5),
    legend.direction="horizontal",
    legend.position="bottom",
    legend.box.margin = margin(t=-10),
    legend.key.size = unit(3,'mm')
  ) +
  guides(fill=guide_legend(nrow=2, title.position="top"))

# Fraction of rows of the paired chromHMM table in each human chromHMM state
# (used to annotate the x-axis labels of panel c)
state_freq_chromHMM = ddply(
  human_mouse_orthologs_chromHMM_paired, .(Human_state_chromHMM), summarise,
  Entries=length(Human_state_chromHMM)/dim(human_mouse_orthologs_chromHMM_paired)[1]
)

# Panel c: as panel b, for chromHMM states; percentages rounded to one decimal place
c = ggplot(
  human_mouse_orthologs_chromHMM_paired,
  aes(x=Human_state_chromHMM, fill=Mouse_state_chromHMM)
) +
  geom_bar(position="fill") +
  xlab("State (human)") +
  ylab("Proportion in state\nin mouse") +
  theme(
    axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
    legend.direction="horizontal",
    legend.position="bottom",
    legend.box.margin = margin(t=-10),
    legend.key.size = unit(3,'mm')
  ) +
  scale_x_discrete(
    labels=as.vector(apply(state_freq_chromHMM, 1, function(x) paste0(x[1], "\n(", round(as.numeric(x[2])*100,1), "%)")))
  ) +
  guides(fill = guide_legend(nrow=3, title.position = "top")) +
  scale_fill_manual(values=mouse_chromHMM_colors, name="State (mouse)")

# Panel d: pie charts (filled bars in polar coordinates) of Count by Subset, one per
# State (rows) x Tissue (columns), each slice labelled with its rounded percentage
d = ggplot(
  hg19_orthologs_specific,
  aes(x=factor(1), y=Count, fill=Subset, label=paste0(round(100*Proportion,0), "%"))
) +
  geom_bar(stat="identity", position="fill", width=0.8) +
  coord_polar(theta="y") +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    strip.text.x = element_text(size = 8),
    legend.key.size = unit(3,'mm'),
    legend.position="bottom"
  ) +
  facet_grid(
    State~Tissue,
    labeller=labeller(
      Tissue = setNames(c("Brain","Intestine","Stomach","Heart","Lung"), unique(hg19_orthologs_specific$Tissue))
    )
  ) +
  scale_fill_discrete(labels=c("<2 samples","Tissue-specific","2-8 samples","> 8 samples"), name="Category") +
  guides(fill = guide_legend(nrow = 2)) +
  geom_text(position = position_fill(vjust = 0.5), size=2)

# Layout: a and d (double width) on the top row, b and c (double width) on the bottom row
grid.arrange(
  a, b, c, d,
  nrow=2,
  layout_matrix=rbind(c(1,4,4), c(2,3,3)),
  heights=c(0.45,0.55)
)
```

```{r Figure 7 source data}
# Panels a and b: paired methylation level and methylation state per ortholog and sample pair
write.table(
  human_mouse_orthologs_WGBS_paired[,c("Mouse_sample_WGBS","Human_sample","Human_methylation","Mouse_methylation","Human_state_WGBS","Mouse_state_WGBS")],
  file="source_data/Figure_7ab.txt",
  sep='\t', row.names=FALSE, quote=FALSE
)

# Panel b: x-axis state frequencies
write.table(
  state_freq,
  file="source_data/Figure_7b_freq.txt",
  sep='\t', row.names=FALSE, quote=FALSE
)

# Panel c: paired chromHMM state per ortholog and sample pair
write.table(
  human_mouse_orthologs_chromHMM_paired[,c("Mouse_sample_chromHMM","Human_sample","Human_state_chromHMM","Mouse_state_chromHMM")],
  file="source_data/Figure_7c.txt",
  sep='\t', row.names=FALSE, quote=FALSE
)

# Panel c: x-axis state frequencies
write.table(
  state_freq_chromHMM,
  file="source_data/Figure_7c_freq.txt",
  sep='\t', row.names=FALSE, quote=FALSE
)

# Panel d: counts and proportions per subset, state and tissue
write.table(
  hg19_orthologs_specific[,c("Count","Subset","Proportion","State","Tissue")],
  file="source_data/Figure_7d.txt",
  sep='\t', row.names=FALSE, quote=FALSE
)
```

### Supplementary Figure 20

a. Pie chart with the percent of all TEs and TEs with an mm10 ortholog in each class. b. Pie chart with the percent of all subfamilies and subfamilies shared with mm10 in each class. c-e. Proportion of members in each shared subfamily that are c) hypomethylated, d) in the promoter state (1_TssA/TssA), or e) in any active regulatory chromHMM state in human (x-axis) and mouse (y-axis) in paired samples. The linear correlation for each sample pair is presented as a line. Only subfamilies with >30 members in each species are considered. 

```{r Figure S20, echo=FALSE,fig.height=6}
# Add the number of hg19 TEs with an mm10 ortholog per class to the dataframe of class statistics
# (unique hg19 TE/class rows are tabulated by class, then looked up by class name)
rmsk_TE_class$Mouse_orthologs = table(unique(human_mouse_orthologs_mm10[,c(hg19_coordinates,"class_update")])$class_update)[as.vector(rmsk_TE_class$class_update)]

# Add the number of hg19 subfamilies also present in mm10 per class to the dataframe of class statistics
# all.x=TRUE keeps classes without any shared subfamily; their count is NA after the merge
rmsk_TE_class = merge(rmsk_TE_class,ddply(rmsk_TE_subfamily[which(rmsk_TE_subfamily$subfamily %in% mm10_rmsk_TE$subfamily),],~class_update,summarize,
                                          Mouse_ortholog_subfamily = length(subfamily)),by="class_update",all.x=TRUE)
# Replace the NA counts with 0. Indexing the column vector (rather than assigning into a row
# subset of the data frame) is also safe when no class is NA; the row-subset form fails with
# "replacement has 1 row, data has 0" in that case.
rmsk_TE_class$Mouse_ortholog_subfamily[is.na(rmsk_TE_class$Mouse_ortholog_subfamily)] = 0

# Table with the number/percent of TEs per class, overall and with an ortholog in mouse (mm10)
ortholog_stats = melt(rmsk_TE_class[,c("class_update","Count","Mouse_orthologs")],id.var="class_update",variable.name="Category",value.name="TEs")
ortholog_stats = ddply(ortholog_stats,.(Category),transform,Percent=100*TEs/sum(TEs))

# Panel a: one ring per category (all TEs / TEs with an mm10 ortholog), split by class and
# labelled with count and percent. Legends are suppressed with guide="none" throughout this
# chunk (guide=FALSE is deprecated in current ggplot2).
a = ggplot(
  ortholog_stats,
  aes(x=Category, y=TEs, fill=class_update,
      label=paste(format(TEs, big.mark=",", scientific=FALSE)," (",round(Percent,0),"%) ",sep=""))
) +
  geom_bar(stat="identity", position="fill", width=0.8) +
  coord_polar(theta="y") +
  scale_fill_manual(values=class_colors, guide="none") +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.title = element_text(size=8)
  ) +
  ggtitle("TEs") +
  geom_text(position = position_fill(vjust = 0.5), size=2)

# Table with the number/percent of subfamilies per class, overall and shared with mouse (mm10)
ortholog_stats_subfam = melt(rmsk_TE_class[,c("class_update","Subfamilies","Mouse_ortholog_subfamily")],
                             id.var="class_update",variable.name="Category",value.name="Count")
ortholog_stats_subfam = ddply(ortholog_stats_subfam,.(Category),transform,Percent=100*Count/sum(Count))

# Panel b: as panel a, for subfamilies
b = ggplot(
  ortholog_stats_subfam,
  aes(x=Category, y=Count, fill=class_update,
      label=paste(format(Count, big.mark=",", scientific=FALSE)," (",round(Percent,0),"%) ",sep=""))
) +
  geom_bar(stat="identity", position="fill", width=0.8) +
  coord_polar(theta="y") +
  scale_fill_manual(values=class_colors, guide="none") +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.title = element_text(size=8)
  ) +
  ggtitle("Subfamilies") +
  geom_text(position = position_fill(vjust = 0.5), size=2)

# Panels c-e: human (x) vs mouse (y) subfamily proportion per sample pair, coloured by anatomy
# and shaped by age, with a dotted identity line and a linear fit per colour/age group.
# Panel c: proportion hypomethylated
c = ggplot(
  hg19_mm10_TE_WGBS_subfamily_hypo_paired,
  aes(x=Human_hypo, y=Mouse_hypo, color=Anatomy, shape=Age)
) +
  geom_point() +
  scale_color_manual(values=anatomy_colors, guide="none") +
  xlab("Subfamily proportion (human)") +
  ylab("Subfamily proportion (mouse)") +
  geom_abline(slope=1, linetype="dotted") +
  geom_smooth(aes(linetype=Age), method='lm', fullrange=TRUE, alpha=0.1) +
  xlim(0,0.4) +
  ylim(0,0.4) +
  guides(color="none", shape="none", linetype="none") +
  coord_equal() +
  scale_linetype_manual(values=c(1,2), guide="none")

# Panel d: proportion in the promoter state (human 1_TssA paired with mouse TssA)
d = ggplot(
  hg19_mm10_chromHMM_subfamily[which(hg19_mm10_chromHMM_subfamily$Human_state_chromHMM == "1_TssA" & hg19_mm10_chromHMM_subfamily$Mouse_state_chromHMM == "TssA"),],
  aes(x=Percent.x, y=Percent.y, color=Anatomy, shape=Age)
) +
  geom_point() +
  scale_color_manual(values=anatomy_colors, guide="none") +
  xlab("Subfamily proportion (human)") +
  ylab("Subfamily proportion (mouse)") +
  geom_abline(slope=1, linetype="dotted") +
  geom_smooth(aes(linetype=Age), method='lm', fullrange=TRUE, alpha=0.1) +
  xlim(0,0.05) +
  ylim(0,0.05) +
  guides(color="none", shape="none", linetype="none") +
  coord_equal() +
  scale_linetype_manual(values=c(1,2), guide="none")

# Panel e: proportion in any active regulatory state
e = ggplot(
  hg19_mm10_chromHMM_subfamily_active,
  aes(x=Percent.x, y=Percent.y, color=Anatomy, shape=Age)
) +
  geom_point() +
  scale_color_manual(values=anatomy_colors, guide="none") +
  xlab("Subfamily proportion (human)") +
  ylab("Subfamily proportion (mouse)") +
  geom_abline(slope=1, linetype="dotted") +
  geom_smooth(aes(linetype=Age), method='lm', fullrange=TRUE, alpha=0.1) +
  xlim(0,0.4) +
  ylim(0,0.4) +
  guides(color="none", shape="none", linetype="none") +
  coord_equal() +
  scale_linetype_manual(values=c(1,2), guide="none")

# Shared anatomy/age legend for panels c-e, taken from a throwaway plot with legends enabled
legend = get_legend(
  ggplot(
    hg19_mm10_chromHMM_subfamily_active,
    aes(x=Percent.x, y=Percent.y, color=Anatomy, shape=Age)
  ) +
    geom_point() +
    scale_color_manual(values=anatomy_colors) +
    scale_shape_discrete(labels=age_labels) +
    geom_smooth(aes(linetype=Age), method='lm', fullrange=TRUE, alpha=0.1) +
    scale_linetype_manual(labels=age_labels, values=c(1,2)) +
    xlab("Human") +
    ylab("Mouse") +
    theme(
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.margin=margin(0,0,0,0),
      legend.key.size = unit(1,'mm')
    )
)

# Layout: pies (a, blank, b); class legend (legend_class, defined outside this chunk) across the
# full width; scatter plots c-e; anatomy/age legend across the full width.
# The single-value rows c(3) and c(7) are recycled by rbind to span all three columns.
grid.arrange(a,b,legend_class,c,d,e,legend,nrow=4,layout_matrix=rbind(c(1,NA,2),c(3),c(4,5,6),c(7)),heights=c(0.45,0.05,0.45,0.05))
```

```{r Figure S20 source data}
# Panel a: TE counts and percentages per class and category
write.table(
  ortholog_stats[,c("Category","TEs","class_update","Percent")],
  file="source_data/Figure_S20a.txt",
  sep='\t', row.names=FALSE, quote=FALSE
)

# Panel b: subfamily counts and percentages per class and category
write.table(
  ortholog_stats_subfam[,c("Category","Count","class_update","Percent")],
  file="source_data/Figure_S20b.txt",
  sep='\t', row.names=FALSE, quote=FALSE
)

# Panel c: proportion of each shared subfamily hypomethylated, human vs mouse
write.table(
  hg19_mm10_TE_WGBS_subfamily_hypo_paired[,c("subfamily","Human_hypo","Mouse_hypo","Anatomy","Age")],
  file="source_data/Figure_S20c.txt",
  sep='\t', row.names=FALSE, quote=FALSE
)

# Panel d: proportion of each shared subfamily in the promoter state (1_TssA / TssA)
write.table(
  hg19_mm10_chromHMM_subfamily[which(hg19_mm10_chromHMM_subfamily$Human_state_chromHMM == "1_TssA" & hg19_mm10_chromHMM_subfamily$Mouse_state_chromHMM == "TssA"),c("subfamily","Percent.x","Percent.y","Anatomy","Age")],
  file="source_data/Figure_S20d.txt",
  sep='\t', row.names=FALSE, quote=FALSE
)

# Panel e: proportion of each shared subfamily in any active regulatory state
write.table(
  hg19_mm10_chromHMM_subfamily_active[,c("subfamily","Percent.x","Percent.y","Anatomy","Age")],
  file="source_data/Figure_S20e.txt",
  sep='\t', row.names=FALSE, quote=FALSE
)
```

### Human-mouse ortholog analysis

Number of hg19 TEs that liftover to mm10.

```{bash count liftover, eval=FALSE}
# Line count of the lifted-over BED file, i.e. the number of hg19 TE records that liftOver
# mapped to mm10 (presumably the source of the hard-coded count used in the R analysis - confirm)
wc -l Mouse/liftover/rmsk_TE_hg19tomm10.bed
```

```{r mouse ortholog analysis}
# Summary statistics for the hg19 -> mm10 liftover and the resulting human/mouse TE ortholog table.
# Sets the globals MOUSE_TE, HG19_TE and MM10_TE and prints every statistic.

# Number of mouse TEs (mm10)
MOUSE_TE = dim(mm10_rmsk_TE)[1]
MOUSE_TE

# Number/proportion of hg19 TEs that lift over to mm10
# NOTE(review): hard-coded count, presumably the `wc -l` result for
# Mouse/liftover/rmsk_TE_hg19tomm10.bed - update if the liftover is regenerated
1265775
1265775/NUM_TE

# Unique coordinate sets and unique pairs, computed once and reused below
ortholog_lifted_positions = unique(human_mouse_orthologs_mm10[,c("human_chr_mm10","human_start_mm10","human_stop_mm10","human_strand_mm10")])
ortholog_hg19_TEs = unique(human_mouse_orthologs_mm10[,hg19_coordinates])
ortholog_mm10_TEs = unique(human_mouse_orthologs_mm10[,mm10_coordinates])
ortholog_pairs_lift_hg19 = unique(human_mouse_orthologs_mm10[,c("human_TE_mm10","human_TE_hg19")])
ortholog_pairs_lift_mm10 = unique(human_mouse_orthologs_mm10[,c("human_TE_mm10","mouse_TE_mm10")])
ortholog_pairs_hg19_mm10 = unique(human_mouse_orthologs_mm10[,c("human_TE_hg19","mouse_TE_mm10")])

# Number of distinct values in each of the three TE columns (denominators below)
ortholog_n_lift = length(unique(human_mouse_orthologs_mm10$human_TE_mm10))
ortholog_n_hg19 = length(unique(human_mouse_orthologs_mm10$human_TE_hg19))
ortholog_n_mm10 = length(unique(human_mouse_orthologs_mm10$mouse_TE_mm10))

# Number of values in `ids` that occur 2+ times, divided by `total`
## Input: ids - vector with one entry per unique pair; total - denominator
## Output: proportion
## Counts every value with a tally > 1 rather than summing fixed positions of table(table()),
## so the result stays correct when the maximum tally changes (and ignores unused factor levels)
ortholog_prop_multiple = function(ids,total){
  sum(table(ids) > 1)/total
}

# Number of lifted-over hg19 positions in mm10 (hg19->mm10)
dim(ortholog_lifted_positions)[1]

# Number/proportion of human TEs with orthologs in mm10
HG19_TE = dim(ortholog_hg19_TEs)[1]
HG19_TE
HG19_TE/NUM_TE

# Number of orthologous mouse TEs
MM10_TE = dim(ortholog_mm10_TEs)[1]
MM10_TE

# Correspondence between hg19 TEs and lifted over hg19 TE positions in mm10 (hg19->mm10)
## Number of hg19->mm10 TEs per hg19 TE
table(table(ortholog_pairs_lift_hg19$human_TE_hg19))
## Number of hg19 TEs per hg19->mm10 TE
table(table(ortholog_pairs_lift_hg19$human_TE_mm10))
## Proportion of hg19->mm10 TEs that correspond to 2+ hg19 TEs
ortholog_prop_multiple(ortholog_pairs_lift_hg19$human_TE_mm10,ortholog_n_lift)

# Correspondence between lifted over hg19 TE positions in mm10 (hg19->mm10) and mm10 orthologs
## Number of mm10 TEs per hg19->mm10 TE
table(table(ortholog_pairs_lift_mm10$human_TE_mm10))
## Proportion of hg19->mm10 TEs that correspond to 2+ mm10 TEs
ortholog_prop_multiple(ortholog_pairs_lift_mm10$human_TE_mm10,ortholog_n_lift)
## Number of hg19->mm10 TEs per mm10 TE
table(table(ortholog_pairs_lift_mm10$mouse_TE_mm10))
## Proportion of mm10 TEs that correspond to 2+ hg19->mm10 TEs
ortholog_prop_multiple(ortholog_pairs_lift_mm10$mouse_TE_mm10,ortholog_n_mm10)

# Correspondence between hg19 TEs and mm10 orthologs
## Number of mm10 TEs per hg19 TE
table(table(ortholog_pairs_hg19_mm10$human_TE_hg19))
## Proportion of hg19 TEs that correspond to 2+ mm10 TEs
ortholog_prop_multiple(ortholog_pairs_hg19_mm10$human_TE_hg19,ortholog_n_hg19)
## Number of hg19 TEs per mm10 TE
table(table(ortholog_pairs_hg19_mm10$mouse_TE_mm10))
## Proportion of mm10 TEs that correspond to 2+ hg19 TEs
ortholog_prop_multiple(ortholog_pairs_hg19_mm10$mouse_TE_mm10,ortholog_n_mm10)

# Median length, printed in this order: lifted over hg19->mm10 positions,
# hg19 TEs with an mm10 ortholog, and mm10 orthologs
median(ortholog_lifted_positions$human_stop_mm10-ortholog_lifted_positions$human_start_mm10)
median(ortholog_hg19_TEs$human_stop_hg19-ortholog_hg19_TEs$human_start_hg19)
median(ortholog_mm10_TEs$mouse_stop_mm10-ortholog_mm10_TEs$mouse_start_mm10)

# Proportion of all TEs, TEs with an mm10 ortholog, all subfamilies, and subfamilies shared with mm10 in each class
test = rmsk_TE_class[,c("class_update","Count","Mouse_orthologs","Subfamilies","Mouse_ortholog_subfamily")]
## Each count column is divided by its column total
test[,2:5] = apply(test[,2:5],2,function(x) x/sum(x))
test 

# Proportion of hg19 TEs in each class with an ortholog in mm10
rmsk_TE_class$Mouse_orthologs/rmsk_TE_class$Count

# Proportion of subfamilies in each class shared with mouse (mm10)
rmsk_TE_class$Mouse_ortholog_subfamily/rmsk_TE_class$Subfamilies

# Median age (JC distance) of all TEs and those with an ortholog in mouse (mm10)
median(rmsk_TE$JC_distance)
## hg19_coordinates is reordered (elements 5 and 6 swapped) to line up with TE_coordinates
median(merge(rmsk_TE,ortholog_hg19_TEs,by.x=TE_coordinates,by.y=hg19_coordinates[c(1:4,6,5,7)])$JC_distance)

# Remove chunk-local temporaries
rm(ortholog_lifted_positions,ortholog_hg19_TEs,ortholog_mm10_TEs,ortholog_pairs_lift_hg19,ortholog_pairs_lift_mm10,ortholog_pairs_hg19_mm10,ortholog_n_lift,ortholog_n_hg19,ortholog_n_mm10,ortholog_prop_multiple)
```

### Human-mouse ortholog WGBS analysis

Effect size for Chi-squared tests from [rcompanion](http://rcompanion.org/handbook/H_10.html). 

```{r mouse WGBS analysis}
# Methylation (WGBS) statistics for human/mouse orthologous TE pairs.
# Prints every statistic and creates the global human_mouse_orthologs_WGBS_hypo.

# Number/proportion of hg19 TEs with an mm10 ortholog that overlap CpGs
x = merge(TE_meth_average[,c(TE_coordinates,"CpGs")],unique(human_mouse_orthologs_mm10[,hg19_coordinates]),by.x=TE_coordinates[c(1:4,6,5,7)],by.y=hg19_coordinates)
dim(x)
dim(x)[1]/HG19_TE

## Median number of CpGs per hg19 TE with an mm10 ortholog that overlaps CpGs
median(x$CpGs)
rm(x)

# Number/proportion of mm10 orthologs that overlap CpGs
y = merge(mm10_rmsk_TE_WGBS,unique(human_mouse_orthologs_mm10[,mm10_coordinates]),by.x=TE_coordinates[c(1:4,6,5,7)],by.y=mm10_coordinates)
dim(y)
dim(y)[1]/MM10_TE

## Median number of CpGs per mm10 ortholog that overlaps CpGs
median(y$CpGs)
rm(y)

# Number of hg19-mm10 orthologous TE pairs that overlap CpGs in both mm10 and hg19
dim(human_mouse_orthologs_WGBS)[1]
## Unique hg19 TEs
dim(unique(human_mouse_orthologs_WGBS[,hg19_coordinates]))[1]
## Unique mm10 TEs
dim(unique(human_mouse_orthologs_WGBS[,mm10_coordinates]))[1]

# Orthologous TE pairs with methylation data in at least one human and one mouse sample
# among those that overlap CpGs in both mouse and human
## NOTE(review): columns 17:23 and 24:30 are presumably the 7 human and 7 mouse per-sample
## methylation columns - confirm against the column order of human_mouse_orthologs_WGBS
## A row is kept when fewer than 7 of each set are NA; rowSums(is.na()) replaces a row-wise apply()
WGBS_has_human = rowSums(is.na(human_mouse_orthologs_WGBS[,17:23])) < 7
WGBS_has_mouse = rowSums(is.na(human_mouse_orthologs_WGBS[,24:30])) < 7
test = human_mouse_orthologs_WGBS[which(WGBS_has_human & WGBS_has_mouse),]
rm(WGBS_has_human,WGBS_has_mouse)

## Number of unique hg19 TEs missing methylation data in all samples
dim(unique(human_mouse_orthologs_WGBS[,hg19_coordinates]))[1]-dim(unique(test[,hg19_coordinates]))[1]

## Number of unique mm10 orthologs missing methylation data in all samples
dim(unique(human_mouse_orthologs_WGBS[,mm10_coordinates]))[1]-dim(unique(test[,mm10_coordinates]))[1]

# Number of orthologous TE pairs with methylation data in both members of each human/mouse sample pair, by sample pair
ddply(human_mouse_orthologs_WGBS_paired,.(Human_sample,Mouse_sample_WGBS),summarise,Both=sum(!is.na(Human_methylation) & !is.na(Mouse_methylation)))

# Number of orthologous TE pairs in each human/mouse methylation state combination, across all human/mouse sample pairs
table(human_mouse_orthologs_WGBS_paired[,c("Mouse_state_WGBS","Human_state_WGBS")])

# Chi-squared test and effect size on human/mouse methylation state combinations, for all orthologous TE pairs
# (p-values are taken from the test object's p.value element so they stay numeric)
## Across all human/mouse sample pairs
ddply(human_mouse_orthologs_WGBS_paired,.(),summarise,Pvalue=chisq.test(x=Human_state_WGBS,y=Mouse_state_WGBS)$p.value)
cramerV(x=as.vector(human_mouse_orthologs_WGBS_paired$Human_state_WGBS),y=as.vector(human_mouse_orthologs_WGBS_paired$Mouse_state_WGBS))

## By sample pair
ddply(human_mouse_orthologs_WGBS_paired,.(Human_sample),summarise,Pvalue=chisq.test(x=Human_state_WGBS,y=Mouse_state_WGBS)$p.value)
by(human_mouse_orthologs_WGBS_paired,human_mouse_orthologs_WGBS_paired$Human_sample,function(x) cramerV(x=as.vector(x$Human_state_WGBS),y=as.vector(x$Mouse_state_WGBS)))

# Chi-squared test and effect size on human/mouse methylation state combinations, for all orthologous TE pairs,
# comparing only the hypomethylated state to all other states
## Convert non-hypomethylated states to "Missing"
## NOTE(review): assumes meth_states[2:3] are the two non-hypomethylated, non-"Missing" states - confirm
test = human_mouse_orthologs_WGBS_paired[,c("Human_sample","Human_state_WGBS","Mouse_state_WGBS")]
test$Human_state_WGBS = mapvalues(test$Human_state_WGBS,meth_states[2:3],rep("Missing",2))
test$Mouse_state_WGBS = mapvalues(test$Mouse_state_WGBS,meth_states[2:3],rep("Missing",2))

## Chi-squared test and effect size, across all human/mouse sample pairs
ddply(test,.(),summarise,Pvalue=chisq.test(x=Human_state_WGBS,y=Mouse_state_WGBS)$p.value)
cramerV(x=as.vector(test$Human_state_WGBS),y=as.vector(test$Mouse_state_WGBS))

## Chi-squared test and effect size, by sample pair
ddply(test,.(Human_sample),summarise,Pvalue=chisq.test(x=Human_state_WGBS,y=Mouse_state_WGBS)$p.value)
by(test,test$Human_sample,function(x) cramerV(x=as.vector(x$Human_state_WGBS),y=as.vector(x$Mouse_state_WGBS)))

# Orthologous TE pairs hypomethylated in both human and mouse samples in paired samples
human_mouse_orthologs_WGBS_hypo = human_mouse_orthologs_WGBS_paired[which(human_mouse_orthologs_WGBS_paired$Mouse_state_WGBS == "Hypomethylated" & human_mouse_orthologs_WGBS_paired$Human_state_WGBS == "Hypomethylated"),]

## Number of pairs, across all samples
dim(human_mouse_orthologs_WGBS_hypo)
## Number of unique hg19 TEs, across all samples
dim(unique(human_mouse_orthologs_WGBS_hypo[,hg19_coordinates]))
## Number of unique mm10 TEs, across all samples
dim(unique(human_mouse_orthologs_WGBS_hypo[,mm10_coordinates]))
## Number of pairs, by sample
ddply(human_mouse_orthologs_WGBS_hypo,.(Human_sample),function(d) nrow(d))
```

### Human-mouse ortholog chromHMM analysis

```{r mouse chromHMM analysis}
# chromHMM state statistics for human/mouse orthologous TE pairs.
# Prints every statistic and writes the tissue-specific and constitutive ortholog tables under Mouse/.

# QC on orthologous TE x chromHMM state x sample dataframe
## Confirming that all orthologous TE pairs have a chromHMM state (NA count per column)
count_na(human_mouse_orthologs_chromHMM_paired)
## Number of hg19 TEs
dim(unique(hg19_orthologs_chromHMM[,hg19_coordinates]))[1]
## Number of hg19 TEs x chromHMM state x sample
dim(hg19_orthologs_chromHMM)
dim(unique(human_mouse_orthologs_chromHMM_paired[,c(hg19_coordinates,"Human_sample","Human_state_chromHMM")]))[1]
## Number of mm10 TEs
dim(unique(mm10_orthologs_chromHMM[,mm10_coordinates]))[1]
## Number of mm10 TEs x chromHMM state x sample
dim(mm10_orthologs_chromHMM)
dim(unique(human_mouse_orthologs_chromHMM_paired[,c(mm10_coordinates,"Mouse_sample_chromHMM","Mouse_state_chromHMM")]))[1]

# Number of orthologous TE pairs in each human/mouse chromHMM state combination, across all human/mouse sample pairs
human_mouse_chromHMM_table = table(human_mouse_orthologs_chromHMM_paired$Human_state_chromHMM,human_mouse_orthologs_chromHMM_paired$Mouse_state_chromHMM)
human_mouse_chromHMM_table

# Chi-squared test and effect size on human/mouse chromHMM state combinations, for all orthologous TE pairs
# (p-values are taken from the test object's p.value element so they stay numeric)
## Across all human/mouse sample pairs
ddply(human_mouse_orthologs_chromHMM_paired,.(),summarise,Pvalue=chisq.test(x=Human_state_chromHMM,y=Mouse_state_chromHMM)$p.value)
cramerV(x=as.vector(human_mouse_orthologs_chromHMM_paired$Human_state_chromHMM),y=as.vector(human_mouse_orthologs_chromHMM_paired$Mouse_state_chromHMM))

## By sample pair
ddply(human_mouse_orthologs_chromHMM_paired,.(Human_sample),summarise,Pvalue=chisq.test(x=Human_state_chromHMM,y=Mouse_state_chromHMM)$p.value)
by(human_mouse_orthologs_chromHMM_paired,human_mouse_orthologs_chromHMM_paired$Human_sample,function(x) cramerV(x=as.vector(x$Human_state_chromHMM),y=as.vector(x$Mouse_state_chromHMM)))

# Orthologous TE pairs in the promoter state in both human and mouse samples in paired samples
x = human_mouse_orthologs_chromHMM_paired[which(human_mouse_orthologs_chromHMM_paired$Human_state_chromHMM == "1_TssA" & human_mouse_orthologs_chromHMM_paired$Mouse_state_chromHMM == "TssA"),]
## Number of orthologous TE pairs x samples
dim(x)[1]
## Unique hg19 TEs
dim(unique(x[,hg19_coordinates]))[1]
## Unique hg19 TEs, by class
table(unique(x[,hg19_coordinates])$human_class)
## Unique mm10 TEs
dim(unique(x[,mm10_coordinates]))[1]
## Number of pairs by sample
ddply(x,.(Human_sample),function(d) nrow(d))

# Orthologous TE pairs in an active regulatory state in both human and mouse samples in paired samples
## NOTE(review): the active states are selected by position (human 1:3,6:7; mouse 1:3,6:8) - 
## confirm against the order of chromHMM_states and mouse_chromHMM_states
y = unique(human_mouse_orthologs_chromHMM_paired[which(human_mouse_orthologs_chromHMM_paired$Human_state_chromHMM %in% chromHMM_states[c(1:3,6:7)] & human_mouse_orthologs_chromHMM_paired$Mouse_state_chromHMM %in% mouse_chromHMM_states[c(1:3,6:8)]),c(hg19_coordinates,mm10_coordinates,"Human_sample","Mouse_sample_chromHMM")])
## Number of orthologous TE pairs x samples
dim(y)[1]
## Unique hg19 TEs
dim(unique(y[,hg19_coordinates]))[1]
## Unique hg19 TEs, by class
table(unique(y[,hg19_coordinates])$human_class)
## Unique mm10 TEs
dim(unique(y[,mm10_coordinates]))[1]
## Number of pairs by sample
ddply(y,.(Human_sample),function(z) nrow(z))

# Number of orthologous TEs with tissue-specific epigenetic activity for 5 tissues, 
# including only TEs in the state in <5 human tissues and in both samples of the specified tissue (All),
# divided by the number of tissues they are in the state in mouse:
# Specific, both samples of the specified tissue, <5 samples overall; On, >8 samples; Off, <2 samples
ortholog_tissues = c("BRAIN","GI_INTESTINE","GI_STOMACH","HEART","LUNG")

# Rows of a per-ortholog sample-count table with conserved tissue-specific state
## Input: df - table with Human_samples, Mouse_samples and "<tissue>.x"/"<tissue>.y" columns; tissue - tissue name
## Output: rows with <5 samples in each species and a value of 2 in both the .x and .y column of the tissue
## NOTE(review): .x/.y are presumably the human/mouse sample counts for the tissue - confirm
ortholog_tissue_specific = function(df,tissue){
  df[which(df$Human_samples < 5 & df[[paste0(tissue,".x")]] == 2 & df$Mouse_samples < 5 & df[[paste0(tissue,".y")]] == 2),]
}

# Constitutive subsets of a per-ortholog sample-count table
## Input: df - table with Human_samples and Mouse_samples columns
## Output: list with both (>8 samples in both species), human (>8 human, <2 mouse), mouse (<2 human, >8 mouse)
ortholog_constitutive = function(df){
  list(both=df[which(df$Human_samples > 8 & df$Mouse_samples > 8),],
       human=df[which(df$Human_samples > 8 & df$Mouse_samples < 2),],
       mouse=df[which(df$Human_samples < 2 & df$Mouse_samples > 8),])
}

## Promoter state
as.data.frame(t(sapply(ortholog_tissues,function(x) tissue_matrix(x,hg19_mm10_chromHMM_promoter))))

### Write out orthologous TEs with conserved tissue-specific promoter state (all tissues in one file)
write.table(ldply(ortholog_tissues,function(x) ortholog_tissue_specific(hg19_mm10_chromHMM_promoter,x)),file="Mouse/tissue_specific_promoter.txt",sep='\t',quote=FALSE,row.names=FALSE)

## Active regulatory state
as.data.frame(t(sapply(ortholog_tissues,function(x) tissue_matrix(x,hg19_mm10_chromHMM_active))))

### Write out orthologous TEs with conserved tissue-specific active regulatory state (one file per tissue)
### A for loop is used so that no list of NULLs is printed
for (ortholog_tissue in ortholog_tissues){
  write.table(ortholog_tissue_specific(hg19_mm10_chromHMM_active,ortholog_tissue),file=paste0("Mouse/tissue_specific_active_",ortholog_tissue,".txt"),sep='\t',quote=FALSE,row.names=FALSE)
}

# Orthologous TEs constitutively in the state (>8 samples) in both human and mouse 
# or constitutively in the state in one species but in <2 samples in the other

## Promoter state
ortholog_constitutive_promoter = ortholog_constitutive(hg19_mm10_chromHMM_promoter)
dim(ortholog_constitutive_promoter$both)
dim(ortholog_constitutive_promoter$human)
dim(ortholog_constitutive_promoter$mouse)

### Write out orthologous TEs with constitutive promoter state annotation in one or both species
for (ortholog_set in names(ortholog_constitutive_promoter)){
  write.table(ortholog_constitutive_promoter[[ortholog_set]],file=paste0("Mouse/constitutive_",ortholog_set,"_promoter.txt"),sep='\t',quote=FALSE,row.names=FALSE)
}

## Active regulatory states
ortholog_constitutive_active = ortholog_constitutive(hg19_mm10_chromHMM_active)
dim(ortholog_constitutive_active$both)
dim(ortholog_constitutive_active$human)
dim(ortholog_constitutive_active$mouse)

### Write out orthologous TEs with constitutive active regulatory state annotation in one or both species
for (ortholog_set in names(ortholog_constitutive_active)){
  write.table(ortholog_constitutive_active[[ortholog_set]],file=paste0("Mouse/constitutive_",ortholog_set,"_active.txt"),sep='\t',quote=FALSE,row.names=FALSE)
}

# Remove chunk-local temporaries
rm(ortholog_tissues,ortholog_tissue,ortholog_set,ortholog_tissue_specific,ortholog_constitutive,ortholog_constitutive_promoter,ortholog_constitutive_active)
```

Identifying orthologous TEs with shared epigenetic profiles that are intergenic. For each, used the hg19 coordinates to find the closest RefSeq gene (reporting all ties). In some cases, filtered to a specific distance to find those far from any gene. 

```{bash investigate ortholog, eval=FALSE}
# Distance from orthologous TEs (hg19 coordinates) to the closest RefSeq gene.
# In each command, `cut -f8-31` keeps columns 8-31 of a table written from R and the header is
# removed (by `tail -n +2`, or by the `$1 != "human_chr_hg19"` filter when several files are
# concatenated), so the first kept column is the one named human_chr_hg19. Both inputs are sorted
# by columns 1 and 2 before `bedtools closest`, run with `-D b -t all`.
# NOTE(review): $37 is presumably the signed distance appended by `-D b` (24 columns from -a plus
# presumably 12 from refseq_genes.txt) - confirm against the RefSeq file's column count.

# Orthologs with shared tissue-specific promoter annotation, intergenic
# (rows with a nonzero value in $37; prints columns 1-4 and $37, de-duplicated)
bedtools closest -a <(cut -f8-31 Mouse/tissue_specific_promoter.txt | tail -n +2 -  | sort -k1,1 -k2,2n - ) -b <(sort -k1,1 -k2,2n ~/genic_features/RefSeq/refseq_genes.txt) -D b -t all | awk -v OFS='\t' '{if($37 != 0) print $1, $2, $3, $4, $37}' | sort | uniq

# Orthologs with shared tissue-specific active state annotation, >50kb from the nearest gene
# (count of unique rows with $37 above 50000 or below -50000, over all per-tissue files)
bedtools closest -a <(cat Mouse/tissue_specific_active_*.txt | cut -f8-31 | awk '{if($1 != "human_chr_hg19") print $0}' -  | sort -k1,1 -k2,2n - ) -b <(sort -k1,1 -k2,2n ~/genic_features/RefSeq/refseq_genes.txt) -D b -t all | awk -v OFS='\t' '{if(($37 > 50000) || ($37 < -50000)) print $1, $2, $3, $4, $37}' | sort | uniq | wc -l

# Orthologs with shared tissue-specific active state annotation, >100kb from the nearest gene
# (same as above with a 100000 threshold)
bedtools closest -a <(cat Mouse/tissue_specific_active_*.txt | cut -f8-31 | awk '{if($1 != "human_chr_hg19") print $0}' -  | sort -k1,1 -k2,2n - ) -b <(sort -k1,1 -k2,2n ~/genic_features/RefSeq/refseq_genes.txt) -D b -t all | awk -v OFS='\t' '{if(($37 > 100000) || ($37 < -100000)) print $1, $2, $3, $4, $37}' | sort | uniq | wc -l

# Orthologs with shared constitutive promoter activity, intergenic
# (rows with a nonzero value in $37; prints columns 1-4 and $37, de-duplicated)
bedtools closest -a <(cut -f8-31 Mouse/constitutive_both_promoter.txt | tail -n +2 -  | sort -k1,1 -k2,2n - ) -b <(sort -k1,1 -k2,2n ~/genic_features/RefSeq/refseq_genes.txt) -D b -t all | awk -v OFS='\t' '{if($37 != 0) print $1, $2, $3, $4, $37}' | sort | uniq
```

### Subfamily analysis

```{r ortholog subfamily analysis}
# Statistics on TE subfamilies shared between hg19 and mm10; every value is printed.

# Number of subfamilies in mm10
length(unique(mm10_rmsk_TE$subfamily))

# Number of subfamilies shared between hg19 and mm10
dim(hg19_mm10_subfamily_count)[1]

# Median age of all hg19 subfamilies and those shared with mm10
median(rmsk_TE_subfamily$Age)
median(rmsk_TE_subfamily[which(rmsk_TE_subfamily$subfamily %in% mm10_rmsk_TE$subfamily),]$Age)

# Number of subfamilies with more than THRESHOLD_IK_MEMBER members in both species, overall and TEs overlapping CpGs only
# NOTE(review): the comparison is strict (>); this was previously described as "at least 30 members" -
# confirm the value of THRESHOLD_IK_MEMBER and whether >= was intended
# (.x columns are presumably hg19 and .y mm10, as in the mouse vs. human ratio below)
nrow(hg19_mm10_subfamily_count[which(hg19_mm10_subfamily_count$Count.x > THRESHOLD_IK_MEMBER & hg19_mm10_subfamily_count$Count.y > THRESHOLD_IK_MEMBER),])
nrow(hg19_mm10_subfamily_count[which(hg19_mm10_subfamily_count$Count_CpGs.x > THRESHOLD_IK_MEMBER & hg19_mm10_subfamily_count$Count_CpGs.y > THRESHOLD_IK_MEMBER),])

# Median ratio of subfamily members in mouse vs. human for shared subfamilies
median(hg19_mm10_subfamily_count$Count.y/hg19_mm10_subfamily_count$Count.x)
median(hg19_mm10_subfamily_count$Count_CpGs.y/hg19_mm10_subfamily_count$Count_CpGs.x)
## Outliers (lowest and highest Count.y/Count.x ratios; ordering computed once)
subfamily_by_ratio = hg19_mm10_subfamily_count[order(hg19_mm10_subfamily_count$Count.y/hg19_mm10_subfamily_count$Count.x),]
head(subfamily_by_ratio)
tail(subfamily_by_ratio)

# Linear correlation between the proportion of subfamily members hypomethylated, in the promoter state, or in an active regulatory state
# in human and mouse in paired samples, by sample
by(hg19_mm10_TE_WGBS_subfamily_hypo_paired,hg19_mm10_TE_WGBS_subfamily_hypo_paired$Human_sample,function(x) lm(Mouse_hypo~Human_hypo,x))
## Promoter state: rows with 1_TssA in human and TssA in mouse, subset once;
## droplevels() removes Human_sample levels absent from the subset before grouping
subfamily_promoter = hg19_mm10_chromHMM_subfamily[which(hg19_mm10_chromHMM_subfamily$Human_state_chromHMM == "1_TssA" & hg19_mm10_chromHMM_subfamily$Mouse_state_chromHMM == "TssA"),]
by(subfamily_promoter,droplevels(subfamily_promoter$Human_sample),function(x) lm(Percent.y~Percent.x,x))
by(hg19_mm10_chromHMM_subfamily_active,hg19_mm10_chromHMM_subfamily_active$Human_sample,function(x) lm(Percent.y~Percent.x,x))

# Remove chunk-local temporaries
rm(subfamily_by_ratio,subfamily_promoter)
```

### Browser tracks

Processed the mm10 tracks for display on the Wash U Epigenome Browser. For chromHMM, converted each of the chromHMM states to an integer, sorted, zipped, and indexed. For the WGBS methylation files, separated read depth and methylation level for each CpG, sorted, zipped, and indexed. 

```{bash make browser, eval=FALSE}
# Process tracks for display on the Browser

# mm10 chromHMM categorical tracks
## Fourth column must be positive integer
## For each line of Mouse/human_mouse_samples.txt (third field = file prefix): decompress the
## chromHMM BED in place, map the 15 state names in column 4 to integers 1-15, sort by
## chromosome and start, then bgzip and tabix-index the result.
## Rows whose column 4 matches none of the 15 names produce no output line.
## The trailing & runs the whole loop in the background.
while read a b c; do gunzip raw_data/mouse/chromHMM/$c\.bed.gz; awk -v OFS='\t' '{if($4 == "TssA"){print $1, $2, $3, 1} else if ($4 == "TssAFlnk1"){print $1, $2, $3, 2} else if ($4 == "TssAFlnk2"){print $1, $2, $3, 3} else if ($4 == "Tx1"){print $1, $2, $3, 4} else if ($4 == "Tx2"){print $1, $2, $3, 5} else if ($4 == "Enh"){print $1, $2, $3, 6} else if ($4 == "EnhLo1"){print $1, $2, $3, 7} else if ($4 == "EnhLo2"){print $1, $2, $3, 8} else if ($4 == "HetCons"){print $1, $2, $3, 9} else if ($4 == "HetFac"){print $1, $2, $3, 10} else if ($4 == "TssBiv"){print $1, $2, $3, 11} else if ($4 == "EnhPois1"){print $1, $2, $3, 12} else if ($4 == "EnhPois2"){print $1, $2, $3, 13} else if ($4 == "QuiesG"){print $1, $2, $3, 14} else if ($4 == "Quies"){print $1, $2, $3, 15}}' raw_data/mouse/chromHMM/$c\.bed | sort -k1,1 -k2,2n - > /tavern/epehrsson/mouseENCODE/signal_tracks/$c\.bed; bgzip /tavern/epehrsson/mouseENCODE/signal_tracks/$c\.bed; tabix -p bed /tavern/epehrsson/mouseENCODE/signal_tracks/$c\.bed.gz; done < Mouse/human_mouse_samples.txt &

## Output (linked to /tavern/epehrsson/mouseENCODE/signal_tracks/)
#/bar/epehrsson/public_html/mouseENCODE/ENCFF027XQM.bed.gz [12 files]
#/bar/epehrsson/public_html/mouseENCODE/ENCFF027XQM.bed.gz.tbi [12 files]

# mm10 WGBS tracks
## Columns 1-3 plus column 10 go to *_readDepth.bed and columns 1-3 plus column 11 to *_meth.bed
## (presumably read depth and methylation level per CpG, per the file names - confirm),
## each sorted by chromosome and start, then bgzipped; all *.bed.gz are then tabix-indexed.
## NOTE(review): the sorted file is written to the current directory but bgzip is given a
## ~/public_html/mouseENCODE/ path, so this presumably was run from that directory - confirm.
for file in ~/TE_landscape/raw_data/mouse/WGBS/*.bed; do awk -v OFS='\t' '{print $1, $2, $3, $10}' $file | sort -k1,1 -k2,2n - > $(basename $file .bed)_readDepth.bed; bgzip ~/public_html/mouseENCODE/$(basename $file .bed)_readDepth.bed; done
for file in ~/TE_landscape/raw_data/mouse/WGBS/*.bed; do awk -v OFS='\t' '{print $1, $2, $3, $11}' $file | sort -k1,1 -k2,2n - > $(basename $file .bed)_meth.bed; bgzip ~/public_html/mouseENCODE/$(basename $file .bed)_meth.bed; done
for file in *.bed.gz; do tabix -p bed $file; done

## Output (linked to /tavern/epehrsson/mouseENCODE/signal_tracks/)
#/bar/epehrsson/public_html/mouseENCODE/ENCFF#_meth.bed.gz [9 files]
#/bar/epehrsson/public_html/mouseENCODE/ENCFF#_meth.bed.gz.tbi  [9 files]
#/bar/epehrsson/public_html/mouseENCODE/ENCFF#_readDepth.bed.gz  [9 files]
#/bar/epehrsson/public_html/mouseENCODE/ENCFF#_readDepth.bed.gz.tbi  [9 files]
```

# Methods

## Basic statistics

```{r QC statistics}
# TEs on chrY: count (all TEs minus the no-chrY count), then that count as a share of all TEs
NUM_TE - NUM_TE_noY
(NUM_TE - NUM_TE_noY) / NUM_TE

# Genomic bp covered by TEs
MERGED_TE_WIDTH

# Summed TE length (bases shared by overlapping TEs are counted once per TE)
# compared with the bp covered by TEs: first the excess, then the ratio
qc_total_TE_length = sum(rmsk_TE$Length)
qc_total_TE_length - MERGED_TE_WIDTH
qc_total_TE_length / MERGED_TE_WIDTH
rm(qc_total_TE_length)

# CpG counts: genome-wide, then overlapping TEs
ALL_CPGS
TE_CPGS
```

# Supplement

Found the width of all chromHMM annotation blocks, DHS peaks, and H3K27ac peaks, by sample.  

```{bash peak widths, eval=FALSE}
# Width (column 3 minus column 2) of every chromHMM block / peak, one output row per block or peak.
# All three loops append with >>, so rerunning them adds duplicate rows to the output files.

# ChromHMM, block size by state and sample
# Output columns: width, state (column 4), sample
while read line; do awk -v OFS='\t' -v sample=$line '{print $3-$2, $4, sample}' raw_data/chromHMM/$line\_15_coreMarks_mnemonics.bed >> chromHMM/chromHMM_blocks.txt; done < sample_lists/mnemonics.txt

# DHS peaks
# Output columns: width, sample
# NOTE(review): this and the H3K27ac loop both append to a relative peak_widths.txt; given the
# relative input paths and the separate outputs listed below, they were presumably run from
# different working directories - confirm.
while read line; do awk -v OFS='\t' -v sample=$line '{print $3-$2, sample}' ../raw_data/DNase/DNase_narrow_peaks/$line\-DNase.macs2.narrowPeak >> peak_widths.txt; done < ../sample_lists/DNase_samples.txt

# H3K27ac
# Output columns: width, sample
while read line; do awk -v OFS='\t' -v sample=$line '{print $3-$2, sample}' $line\-H3K27ac.narrowPeak >> peak_widths.txt; done < ../../../sample_lists/H3K27ac_samples.txt

## Output
#/bar/epehrsson/TE_landscape/chromHMM/chromHMM_blocks.txt
#/bar/epehrsson/TE_landscape/DNase/peak_widths.txt
#/bar/epehrsson/TE_landscape/H3K27ac/peak_widths.txt
```

Profiling TEs assigned to each chromHMM state. First, extended each TE from the center to 5kb in each direction, then combined the extended and original coordinates into one bedfile. Split TEs by sample and state; then, for each instance of a TE in a state in a sample, found the average fold enrichment over input for histone modifications and DHS in 50bp bins over the 10kb extended region, as well as the average level of DNA methylation. Combined all into one file. 

```{bash histone profile, eval=FALSE}
# Each TE extended from center to 10kb region, excluding those that would extend past the ends of the chromosomes.
# Output columns: chromosome, midpoint-5000, midpoint+5000, then the original columns 1-7.
# The filter tests the TE's own start (> 4999) and end (+5000 < chromosome size), not the midpoint window.
# The input file is re-read once per line of hg19_standard.genome, and output is appended (>>),
# so rerunning adds duplicate rows.
while read chr size; do awk -v OFS="\t" -v chr=$chr -v size=$size '{if($1 == chr && $2 > 4999 && ($3+5000 < size)) print $1,int($2+(($3-$2)/2))-5000,int($2+(($3-$2)/2))+5000,$1,$2,$3,$4,$5,$6,$7}' rmsk_TE.txt >> rmsk_TE_10kb.txt; done < hg19_standard.genome
while read chr size; do awk -v OFS="\t" -v chr=$chr -v size=$size '{if($1 == chr && $2 > 4999 && ($3+5000 < size)) print $1,int($2+(($3-$2)/2))-5000,int($2+(($3-$2)/2))+5000,$1,$2,$3,$4,$5,$6,$7}' rmsk_other.txt >> rmsk_other_10kb.txt; done < hg19_standard.genome

## Output
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_TE_10kb.txt
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_other_10kb.txt

# 10kb-extended TE bed file, rearranged so original coordinates are first
# (columns 4-10, then the extended coordinates from columns 1-3)
# NOTE(review): the first command reads /bar/epehrsson/TE_landscape/rmsk_TE_10kb.txt while the
# output listed above is under features/TEs/, and the second writes to /scratch/ecp/ while the
# output listed below is under features/TEs/ - presumably files were moved afterwards; confirm.
awk -v OFS="\t" '{print $4,$5,$6,$7,$8,$9,$10,$1,$2,$3}' /bar/epehrsson/TE_landscape/rmsk_TE_10kb.txt > rmsk_TE_10kb.bed
awk -v OFS="\t" '{print $4,$5,$6,$7,$8,$9,$10,$1,$2,$3}' rmsk_other_10kb.txt > /scratch/ecp/rmsk_other_10kb.bed

## Output
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_TE_10kb.bed
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_other_10kb.bed

# Combined into one file
# NOTE(review): the input glob rmsk_*_10kb.bed also matches the output name
# rmsk_TEother_10kb.bed, so a rerun with the output already present would feed it back to cat.
cat features/TEs/rmsk_*_10kb.bed > features/TEs/rmsk_TEother_10kb.bed

## Output
#/bar/epehrsson/TE_landscape/features/TEs/rmsk_TEother_10kb.bed

# TEs in state in sample
# First split rows by column 8 (presumably the sample - confirm), then split each per-sample
# file by column 10 (the chromHMM state). "8_ZNF/Rpts" is written under the name "8_ZNF.Rpts",
# presumably because "/" cannot appear in a file name.
awk -v OFS='\t' '{print $0 > "ever/rmsk_TEother_"$8".txt"}' rmsk_TEother_chromHMM_summit_sorted.txt
while read line; do echo $line; awk -v OFS='\t' '{if($10 != "8_ZNF/Rpts") print $0 > "ever/rmsk_TEother_"$8"_"$10".txt"}' ever/rmsk_TEother_$line\.txt; done < mnemonics.txt
while read line; do echo $line; awk -v OFS='\t' '{if($10 == "8_ZNF/Rpts") print $0 > "ever/rmsk_TEother_"$8"_8_ZNF.Rpts.txt"}' ever/rmsk_TEother_$line\.txt; done < mnemonics.txt
 
# Finds fold change over input for histone modifications and DHS over 10kb region centered on TEs, in 50bp bins
# (external script, not shown here)
intersect_histone.sh

# Finds DNA methylation level over 10kb region centered on TEs, in 50bp bins
# (external script, not shown here)
intersect_meth.sh

# Epigenetic profiles by state and mark
# For every state x mark: concatenate the per-sample average files into a temporary file,
# pass it to calculate_bin_average_all.py (input file, output file), then delete the temporary file.
while read state; do while read mark; do echo $state $mark; cat averages/rmsk_TEother_E*_$state\_$mark\_average.txt > rmsk_TEother_$state\_$mark\_average.tmp; python calculate_bin_average_all.py rmsk_TEother_$state\_$mark\_average.tmp rmsk_TEother_$state\_$mark\_average.txt; rm rmsk_TEother_$state\_$mark\_average.tmp; done < marks.txt; done < chromHMM_states.txt

## Input
# marks.txt is a list of 9 epigenetic marks
#/bar/epehrsson/TE_landscape/sample_lists/marks.txt

## Output
#/bar/epehrsson/TE_landscape/compare_marks/profile_histone/rmsk_TEother_[state]_[mark]_average.txt [108 files]

# Combined histone profiles
# Appends (>>) every state x mark file with two extra columns: mark, state.
# NOTE(review): this writes TEother_average.txt, whereas the output listed below is
# rmsk_TEother_average.txt - presumably renamed afterwards; confirm.
while read state; do while read mark; do awk -v OFS='\t' -v mark=$mark -v state=$state '{print $0, mark, state}' rmsk_TEother_$state\_$mark\_average.txt >> TEother_average.txt; done < marks.txt; done < chromHMM_states.txt

## Output
#/bar/epehrsson/TE_landscape/compare_marks/profile_histone/rmsk_TEother_average.txt
```

### Supplementary Figure 22

```{r Figure S22 scripts, echo=FALSE, cache=TRUE, cache.lazy=FALSE}
# Builds all_lengths_blocks (per state x sample summary) and all_lengths_blocks_corr
# (per-state Spearman correlation), used for Supplementary Figure 22b.

# Dataframe of the width of all chromHMM annotation blocks, DHS peaks, and H3K27ac peaks, by sample and state
# (presumably creates `all_lengths`, which is used below - confirm in the sourced script)
source("R_scripts/state_lengths.R")

# Number of chromHMM blocks and DHS/H3K27ac peaks per state and sample, plus mean, SD, and median length
all_lengths_blocks = ddply(all_lengths,.(State,Sample),summarise,Mean=mean(Length),SD=sd(Length),Median=median(Length),Blocks=length(Length))
## Add sample categories (left join on Sample, so samples without metadata are kept)
all_lengths_blocks = merge(all_lengths_blocks,metadata[,c("Sample",sample_categories)],by="Sample",all.x=TRUE)

# Spearman correlation between the number of blocks/peaks and their median length per sample, by state
## The test is run once per state and rho and the p-value are read directly from the result,
## so both stay numeric at full precision
all_lengths_blocks_corr = ddply(all_lengths_blocks,.(State),function(state_blocks){
  spearman = cor.test(state_blocks$Blocks,state_blocks$Median,method="spearman")
  data.frame(Cor=unname(spearman$estimate),Pvalue=spearman$p.value)
})
```

Loads a dataframe ("histones") with the fold enrichment over input for histone modification ChIP-seq and DHS, plus average DNA methylation level, averaged over all instances of TEs in each chromHMM state, in 50bp bins, extended 5kb from the center of the TEs in each direction. Excludes the 5_TxWk, 14_ReprPCWk, and 15_Quies states.

a. Fold enrichment over input for histone modification ChIP-seq and DHS (left y-axis), plus average DNA methylation level (right y-axis), averaged over all instances of TEs in each chromHMM state, in 50bp bins, extended 5kb from the center of the TEs in each direction. b. For each state, the number of blocks/peaks per sample (x-axis) versus the median length of the blocks/peaks (y-axis), colored by sample Group. Linear correlation between the two is represented by a line. The Spearman correlation is listed for each state. 

```{r Figure S22, echo=FALSE}
# Supplementary Figure 22: panel a (epigenetic profiles around TEs by chromHMM state) stacked
# above panel b (number vs. median length of blocks/peaks per sample, by state).

# Presumably loads the `histones` dataframe plotted in panel a - confirm in the sourced script
source("R_scripts/histone_modification.R")

# Panel a: Level by distance from the TE center (Position/1000), one line per Mark, one facet per State.
# The first 8 Mark levels get hue colors; the 9th is labelled "DNA methylation" and drawn in black.
# The secondary y-axis shows the primary axis values divided by 12.5.
# No y limits are set: the ylim(0,12) formerly placed before scale_y_continuous() was replaced by
# that later scale and never took effect. To enforce limits, pass limits=c(0,12) to scale_y_continuous().
a = ggplot(histones,aes(x=Position/1000,y=Level,color=Mark,group=Mark)) +
  geom_line(size=1.5,alpha=0.7) +
  theme(panel.grid=element_blank(),legend.position = "bottom",legend.margin=margin(0,0,0,0),legend.key.size = unit(1,'mm')) +
  ylab("Fold enrichment over input") +
  scale_color_manual(labels=c(levels(histones$Mark)[1:8],"DNA methylation"),values=c(gg_color_hue(8),"black")) +
  xlab("Distance from TE center (kbp)") +
  facet_wrap(~ State,nrow=3) +
  scale_y_continuous(sec.axis = sec_axis(~./12.5, name = "DNA methylation level"))

# Panel b: log10 number of blocks/peaks vs. log10 median length per sample, colored by Group,
# one free-scaled facet per State, with a linear fit and the rounded Spearman rho in the top-left corner
b = ggplot(all_lengths_blocks,aes(x=log10(Blocks),y=log10(Median))) +
  geom_point(aes(color=Group)) +
  xlab("Number of blocks or peaks (log10)") +
  ylab("Median length of blocks or peaks (log10(bp))") +
  scale_color_manual(values=group_colors) +
  facet_wrap(~State,scales="free",labeller=labeller(State=all_state_labels)) +
  geom_smooth(method="lm",se=FALSE,color="black") +
  geom_text(data=all_lengths_blocks_corr,aes(x = -Inf, y = Inf,label=round(Cor,2)),hjust=-0.1,vjust=1.5,size=3) +
  scale_y_continuous(breaks=pretty_breaks(n=3)) +
  scale_x_continuous(breaks=pretty_breaks(n=3)) +
  theme(legend.position = "bottom",legend.margin=margin(0,0,0,0),legend.key.size = unit(1,'mm'))

grid.arrange(a,b,heights=c(0.45,0.55))
```

```{r Figure S22 source data}
# Source data for Supplementary Figure 22: each table is written tab-separated,
# without row names or quoting, to its file under source_data/
invisible(Map(
  function(source_table,source_path){
    write.table(source_table,file=source_path,sep='\t',row.names=FALSE,quote=FALSE)
  },
  list(
    histones[,c("Position","Level","Mark","State")],
    all_lengths_blocks[,c("Sample","Blocks","Median","Group","State")],
    all_lengths_blocks_corr
  ),
  c(
    "source_data/Figure_S22a.txt",
    "source_data/Figure_S22b.txt",
    "source_data/Figure_S22b_cor.txt"
  )
))
```

### State/peak lengths analysis

```{r state peak lengths analysis, cache=TRUE, cache.lazy=FALSE}
# Length statistics for chromHMM blocks and DHS/H3K27ac peaks; every table is printed.
# Removes all_lengths from the environment at the end.

# Mean, SD, and median length of all chromHMM blocks/peaks by state, across all samples 
ddply(all_lengths,.(State),summarise,Mean=mean(Length),SD=sd(Length),Median=median(Length))

# Kruskal-Wallis test for differences in the length of chromHMM blocks/peaks across samples, by state
## The p-value is read from the test object's p.value element so the column stays numeric
ddply(all_lengths,.(State),summarise,KW_length=kruskal.test(Length~Sample)$p.value)

# Range and CV for the number of blocks/peaks and median length per sample, by state
test = ddply(all_lengths_blocks,.(State),summarise,Range_number=max(Blocks)-min(Blocks),CV_number=sd(Blocks)/mean(Blocks),Range_length=max(Median)-min(Median),CV_length=sd(Median)/mean(Median))
## Ordered by CV (of the number, then of the median length)
test[order(test$CV_number),]
test[order(test$CV_length),]

# Correlation between the number of blocks/peaks and median length across samples by state, ordered by Spearman rho
all_lengths_blocks_corr[order(as.numeric(all_lengths_blocks_corr$Cor)),]

# Drop the per-block length table now that the summaries are done
rm(all_lengths)
```

### Mappability of TEs

```{r mappability}
# TE length vs. mappability (Spearman): all TEs together, then within each class_update group
cor.test(x=rmsk_TE$Length,y=rmsk_TE$mappability,method="spearman")
by(rmsk_TE,rmsk_TE$class_update,function(x) cor.test(x$Length,x$mappability,method="spearman"))

# Rows of correlate_TE_state for the "Missing" state and the metrics mappability, CpGs and
# CpGs_per_length, i.e. the correlations between the number of samples in which a TE is
# missing methylation data and each of those metrics, overall and by class
subset(correlate_TE_state,State == "Missing" & Metric %in% c("mappability","CpGs","CpGs_per_length"))

# Rows of correlate_TE_state for the 15_Quies state and the mappability metric, i.e. the
# correlations between the number of samples a TE is in 15_Quies and its mappability, overall and by class
subset(correlate_TE_state,Metric == "mappability" & State == "15_Quies")
```