6 Load per genomic region

6.1 Introduction

Both functional noncoding and protein-coding regions can be under selective constraint. However, the former include various regulatory elements such as promoters, enhancers and silencers, which play different roles in gene regulation and might therefore be expected to experience different strengths of purifying selection. To investigate whether the fitness effects of deleterious mutations vary by genomic region, we classified each deleterious mutation according to whether it was located within a promoter, transcription start site, intron or exon. Then, we calculate total load based on these subsets and model their effects on LMS.

6.1.1 Genomic regions

We use a combination of GenomicRanges and rtracklayer and other packages to divide up the genome annotation file into the four genomic regions.

6.1.1.1 Subset reference genome

Code

#### Packages #####
pacman::p_load(BiocManager, rtracklayer, GenomicFeatures, BiocGenerics, data.table, dplyr, genomation, GenomicRanges, tibble)

#### Genome data ####
# first change scaffold names
gff_raw <- fread("data/genomic/annotation/PO2979_Lyrurus_tetrix_black_grouse.annotation.gff") 
gff_raw$V1 <- gsub(";", "__", gff_raw$V1)
gff_raw$V1 <- gsub("=", "_", gff_raw$V1)
write.table(gff_raw, file = "data/genomic/PO2979_Lyrurus_tetrix_black_grouse.annotation_editedscafnames.gff", sep = "\t", col.names = FALSE, quote=F, row.names = FALSE)

#read in new file
gff <- makeTxDbFromGFF("data/genomic/PO2979_Lyrurus_tetrix_black_grouse.annotation_editedscafnames.gff", format="gff3", organism="Lyrurus tetrix") 

## divide up between the 4 regions ###

promoters <- promoters(gff, upstream=2000, downstream=200, columns=c("tx_name", "gene_id")) # From NIOO
TSS <- promoters(gff, upstream=300, downstream=50, columns=c("tx_name", "gene_id")) # TSS as in Laine et al., 2016. Nature Communications
exons_gene <- unlist(exonsBy(gff, "gene")) # group exons by genes
introns <- unlist(intronsByTranscript(gff, use.names=TRUE))

### write out files
export(promoters, "data/genomic/annotation/promoters.gff3")
export(TSS, "data/genomic/annotation/TSS.gff3")
introns@ranges@NAMES[is.na(introns@ranges@NAMES)]<-"Unknown"
export(introns, "data/genomic/annotation/introns_transcripts.gff3")
export(exons_gene, "data/genomic/annotation/exons_gene.gff3")

6.1.1.2 Annotate deleterious mutations

Then, we load in all deleterious mutations for GERP and SnpEff separately and annotate the mutations according to the four regions.

Code

###### Annotate high impact and GERP mutations #####
### annotate gene regions per SNP for high impact and gerp mutations ###

### load data ###

load(file = "output/load/snpeff/snpeff_high.RData")
load(file = "output/load/gerp/gerp_over4.RData")

### change scaf names
snpeff$CHROM <- gsub(";", "__", snpeff$CHROM)
snpeff$CHROM <- gsub("=", "_", snpeff$CHROM)

gerp$chr <- gsub(";", "__", gerp$chr)
gerp$chr <- gsub("=", "_", gerp$chr)

### remove genotypes: not necessary here
snpeff <- snpeff[,c(1:9)]
gerp <- gerp[,c(1:9)]

####### Gene regions ###########

### load annotation data
annotation_dir <- "data/genomic/annotation"

promoter=unique(gffToGRanges(paste0(annotation_dir, "/promoters.gff3")))
TSS=unique(gffToGRanges(paste0(annotation_dir, "/TSS.gff3")))
introns=unique(gffToGRanges(paste0(annotation_dir, "/introns_transcripts.gff3")))
exons_gene=unique(gffToGRanges(paste0(annotation_dir, "/exons_gene.gff3")))

#### Annotate SNPeff regions ####
snpeff$end <- snpeff$POS
snpeff$start <- snpeff$POS
snpef_gr <- as(snpeff, "GRanges")

snpef_promoter <- as.data.frame(subsetByOverlaps(snpef_gr, promoter)) %>%
  add_column("region_promoter" = 1) %>% dplyr::rename(chr = seqnames)%>% unique()

snpef_TSS <- as.data.frame(subsetByOverlaps(snpef_gr, TSS))%>%
  add_column("region_tss" = 1) %>% dplyr::rename(chr = seqnames)%>% unique()

snpef_exons <- as.data.frame(subsetByOverlaps(snpef_gr, exons_gene))%>%
  add_column("region_exon" = 1) %>% dplyr::rename(chr = seqnames)%>% unique()

snpef_introns <- as.data.frame(subsetByOverlaps(snpef_gr, introns))%>%
  add_column("region_intron" = 1) %>% dplyr::rename(chr = seqnames)%>% unique()

snpef_all <- left_join(snpeff, snpef_promoter[,c("chr", "start", "region_promoter")], by = c("CHROM" = "chr", "start"))%>%
  left_join(snpef_TSS[,c("chr", "start", "region_tss")], by = c("CHROM" = "chr", "start"))%>%
  left_join(snpef_exons[,c("chr", "start", "region_exon")], by = c("CHROM" = "chr", "start"))%>%
  left_join(snpef_introns[,c("chr", "start", "region_intron")], by = c("CHROM" = "chr", "start"))

# correct promoters
snpef_all <- snpef_all %>% mutate(region_promoter = case_when(
  region_tss == 1 & region_promoter == 1 ~ NA,
  is.na(region_tss) & region_promoter == 1 ~ 1
))

save(snpef_all, file = "output/load/snpeff/snpeff_high_annotated_region.RData")

#### Annotate gerp regions ####
gerp$end <- gerp$pos
gerp$start <- gerp$pos
gerp_gr <- as(gerp, "GRanges")

gerp_promoter <- as.data.frame(subsetByOverlaps(gerp_gr, promoter)) %>%
  add_column("region_promoter" = 1) %>% dplyr::rename(chr = seqnames)%>% unique()

gerp_TSS <- as.data.frame(subsetByOverlaps(gerp_gr, TSS))%>%
  add_column("region_tss" = 1) %>% dplyr::rename(chr = seqnames)%>% unique()

gerp_exons <- as.data.frame(subsetByOverlaps(gerp_gr, exons_gene))%>%
  add_column("region_exon" = 1) %>% dplyr::rename(chr = seqnames)%>% unique()

gerp_introns <- as.data.frame(subsetByOverlaps(gerp_gr, introns))%>%
  add_column("region_intron" = 1) %>% dplyr::rename(chr = seqnames)%>% unique()

gerp_all <- left_join(gerp, gerp_promoter[,c("chr", "start", "region_promoter")], by = c("chr" = "chr", "start"))%>%
   left_join(gerp_TSS[,c("chr", "start", "region_tss")], by = c("chr" = "chr", "start"))%>%
  left_join(gerp_exons[,c("chr", "start", "region_exon")], by = c("chr" = "chr", "start"))%>%
  left_join(gerp_introns[,c("chr", "start", "region_intron")], by = c("chr" = "chr", "start"))
 
# correct promoters
gerp_all <- gerp_all %>% mutate(region_promoter = case_when(
  region_tss == 1 & region_promoter == 1 ~ NA,
  is.na(region_tss) & region_promoter == 1 ~ 1
))

save(gerp_all, file = "output/load/gerp/gerp_annotated_region.RData")

6.1.1.3 Calculate mutation load

These two files (gerp_all and snpeff_all) contain the SNP locations and additional binary columns whether the mutations was located in one of the 4 regions. We can use this file together with the files containing the genotypes to calculate mutation load based on the subsets of mutations only.

Code

#### Calculate load per region #####

# load annotated gerp data
load(file = "output/load/gerp/gerp_annotated_region.RData")
gerp_all$chr <- gsub("__", ";", gerp_all$chr)
gerp_all$chr <- gsub("HRSCAF_", "HRSCAF=", gerp_all$chr)

# load annotation
load(file = "output/load/snpeff/snpeff_high_annotated_region.RData")
snpef_all$CHROM <- gsub("__", ";", snpef_all$CHROM)
snpef_all$CHROM <- gsub("HRSCAF_", "HRSCAF=", snpef_all$CHROM)

## load functions to calculate load
source("scripts/7_calculate_load/function_calculate_load.R")

#### calculate load per region as defined below 

regions <- c("region_promoter", "region_tss" ,"region_exon","region_intron")

#load existing combined load file
load("output/load/all_loads_combined_da_nosex_29scaf.RData") #loads no sex chr only 29 scaf

# load gt again to include genotypes
load(file = "output/load/snpeff/snpeff_high.RData")
load(file = "output/load/gerp/gerp_over4.RData")

#first gerp5
load_per_region <- load
for (region in regions){
  subset_locs <- subset(gerp_all, gerp_all[,region] == 1) #subset based on region name
  
  subset_locs$chr_pos <- paste0(subset_locs$chr, "_", subset_locs$pos) #make a col for the snp position
  gerp$chr_pos <- paste0(gerp$chr, "_", gerp$pos) #make a col for the snp position
  
  sub_genotypes <- subset(gerp, chr_pos %in% subset_locs$chr_pos) #subset genotypes based on subset
  
  sub_genotypes$chr_pos <- NULL #remove chr_pos again
  
  loadtype = paste0("gerp45", gsub("region", "", region))
  
  load_sub <- calculate_load_gerp(sub_genotypes, loadtype = loadtype, output_vcf = F) #calculate loads
  
  load_per_region <- rbind(load_per_region, load_sub[,c("id", "het_load", "hom_load", "total_load", "loadtype")])
}

#snpeff
for (region in regions){
  subset_locs <- subset(snpef_all, snpef_all[,region] == 1) #subset based on region name
  
  subset_locs$chr_pos <- paste0(subset_locs$CHROM, "_", subset_locs$POS) #make a col for the snp position
  snpeff$chr_pos <- paste0(snpeff$CHROM, "_", snpeff$POS) #make a col for the snp position
  
  sub_genotypes <- subset(snpeff, chr_pos %in% subset_locs$chr_pos) #subset genotypes based on subset
  
  sub_genotypes$chr_pos <- NULL #remove chr_pos again
  
  loadtype = paste0("high", gsub("region", "", region))
  load_sub <- calculate_load_snpeff(sub_genotypes, loadtype = loadtype, output_vcf = F) #calculate loads
  
  load_per_region <- rbind(load_per_region, load_sub[,c("id", "het_load", "hom_load", "total_load", "loadtype")])
}

save(load_per_region, file = "output/load/all_loads_combined_da_nosex_29scaf_plus_per_region.RData")

6.1.2 Modelling

We then computed the exact same models as the total load models presented before with the following model structure:

LMS ~ scale(total_load) + core + (1|site)

Where total load is not a measure taken from a subset of deleterious mutations located within a specific genomic region.

6.1.3 Results

Below you find the posterior distributions:

Code

library(readxl); library(dplyr); library(kableExtra)
gerp <- read.csv("../output/models/intervals/regions_gerp45.csv")
gerp %>% kbl()

parameter	outer_width	inner_width	point_est	ll	l	m	h	hh	region
b_scaletotal_load	0.95	0.8	median	-0.26	-0.23	-0.18	-0.13	-0.09	Promoter
b_scaletotal_load	0.95	0.8	median	-0.35	-0.32	-0.27	-0.22	-0.20	TSS
b_scaletotal_load	0.95	0.8	median	-0.37	-0.35	-0.29	-0.23	-0.21	Intron
b_scaletotal_load	0.95	0.8	median	0.15	0.18	0.23	0.29	0.31	Exon

Code

high <- read.csv("../output/models/intervals/regions_high.csv")
high %>% kbl()

parameter	outer_width	inner_width	point_est	ll	l	m	h	hh	region
b_scaletotal_load	0.95	0.8	median	-0.34	-0.31	-0.26	-0.20	-0.18	Promoter
b_scaletotal_load	0.95	0.8	median	-0.12	-0.09	-0.04	0.01	0.04	TSS
b_scaletotal_load	0.95	0.8	median	-0.15	-0.13	-0.08	-0.02	0.01	Intron
b_scaletotal_load	0.95	0.8	median	-0.20	-0.17	-0.11	-0.06	-0.03	Exon

These results indicate that regulatory regions, especially promoter regions (SnpEff) are very important!