7  Load per biological process

7.1 Introduction

Deleterious mutations that disrupt certain biological processes might affect LMS more negatively than in others. To test if there are certain biological processes where deleterious mutations have negative fitness effects, we hypothesized six processes and used gene ontology annotations. We used Amigo to extract deleterious mutations associated with the GO terms: androgen metabolism, cellular respiration, developmental growth, immunity, muslce tissue development and response to oxidative stress. In the scripts below, you’ll see how we calculate total load based on these GO terms.

7.1.0.1 Subset mutations

First, we subset the mutations located in genes associated with each GO term, and calculate total load based on that subset. We also divide it per genomic region (promoter, intron, exon). We work with lists to iterate the process with ease by looping over the list items.

Code
#### load packages ####
pacman::p_load(BiocManager, rtracklayer, GenomicFeatures, BiocGenerics, data.table, tidyverse, genomation, GenomicRanges, tibble)

#### load the list of genes per GO term ####
files <- list.files(path = "data/gene_lists_go", pattern = "*.txt", full.names = T)

list <- list()
for (i in 1:length(files)){
  data <- fread(files[i])
  list[[i]] <- data
}

names(list) <- gsub("data/gene_lists_go/|.txt", "", files)

#### extract only the unique gene names per GO term ####
for (i in 1:length(list)){
  list[[i]] <- list[[i]] %>% dplyr::select(`V1`) %>%
    mutate(go_term = names(list[i])) %>%
    dplyr::rename(gene_id = `V1`) %>% unique() %>%
    mutate(gene_id = toupper(gene_id))
}

#### GERP ####
#### import gerp data ####
load("output/load/gerp/gerp_over4.RData")
gerp$chr <- gsub(";", "__", gerp$chr)
gerp$chr <- gsub("=", "_", gerp$chr)
gerp$chr_pos <- paste0(gerp$chr, "_", gerp$pos)

# extract only the positions for annotation
gerp_snps <- gerp %>% dplyr::select(chr, pos)
gerp_snps$end <- gerp_snps$pos
gerp_snps$start <- gerp_snps$pos
gerp_gr <- as(gerp_snps, "GRanges")

#### import gene regions ####

### load annotation data ####
annotation_dir <- "data/genomic/annotation"

promoter=unique(gffToGRanges(paste0(annotation_dir, "/promoters.gff3")))
exons_gene=unique(gffToGRanges(paste0(annotation_dir, "/exons_gene.gff3")))
introns=unique(gffToGRanges(paste0(annotation_dir, "/introns_transcripts.gff3")))

#### annotate gerp mutations ####

gerp_promoter <- as.data.frame(mergeByOverlaps(gerp_gr, promoter)) %>%
  add_column("region_promoter" = 1) %>% unique() %>% dplyr::select(c(`gerp_gr.seqnames`, pos, region_promoter,ID)) %>% dplyr::rename(chr = gerp_gr.seqnames)

gerp_exons <- as.data.frame(mergeByOverlaps(gerp_gr, exons_gene))%>%
  add_column("region_exon" = 1) %>% unique() %>% dplyr::select(c(gerp_gr.seqnames, pos, region_exon,ID)) %>% dplyr::rename(chr = gerp_gr.seqnames)

gerp_intron <- as.data.frame(mergeByOverlaps(gerp_gr, introns))%>%
  add_column("region_intron" = "1") %>% unique() %>% dplyr::select(c(gerp_gr.seqnames, pos, region_intron,ID)) %>% dplyr::rename(chr = gerp_gr.seqnames)


gerp_list <- list(gerp_promoter, gerp_exons, gerp_intron)

#### merge with 'similar to' column to get gene IDs ####
lookup <- fread("../grouse-annotation/data/lookup_ANN_gene_id.txt")
names(lookup) <- c("ann", "gene_id")

for (i in 1:length(gerp_list)){
  gerp_list[[i]] <- left_join(gerp_list[[i]], lookup, by = c("ID" = "ann"))
  names(gerp_list[[i]])[4] <- c("ann")
  gerp_list[[i]]$gene_id <- toupper(gerp_list[[i]]$gene_id)
}

#### extract snps per GO term ####

go_snps_promo <- list()
for (i in 1:length(list)){
  overlap <- subset(gerp_list[[1]], gene_id %in% list[[i]]$gene_id)
  go_snps_promo[[i]] <- overlap}

go_snps_exon <- list()
for (i in 1:length(list)){
  overlap <- subset(gerp_list[[2]], gene_id %in% list[[i]]$gene_id)
  go_snps_exon[[i]] <- overlap}

go_snps_intron <- list()
for (i in 1:length(list)){
  overlap <- subset(gerp_list[[3]], gene_id %in% list[[i]]$gene_id)
  go_snps_intron[[i]] <- overlap}

# name
names(go_snps_promo) <- names(list)
names(go_snps_exon) <- names(list)
names(go_snps_intron) <- names(list)

summary_snps_per_go <- data.frame()
for (i in 1:length(list)){
  snps_go <- data.frame(go = names(list[i]),
                        snps_promo = nrow(subset(go_snps_promo[[i]], region_promoter == 1)),
                        snps_exon = nrow(subset(go_snps_exon[[i]], region_exon == 1)),
                        snps_intron = nrow(subset(go_snps_intron[[i]], region_intron == 1)),
                        og_genes_total = nrow(list[[i]]))
  summary_snps_per_go <- rbind(summary_snps_per_go, snps_go)
}

# how many genes
genes_n <- data.frame()
for (i in 1:length(list)){
  promo <- subset(go_snps_promo[[i]], region_promoter == 1)
  exon <- subset(go_snps_exon[[i]], region_exon == 1)
  intron <- subset(go_snps_intron[[i]], region_intron == 1)
  genes <- unique(c(promo$gene_id, exon$gene_id, intron$gene_id))
  genes_go <- data.frame(go = names(list[i]),
                        n_genes = length(genes))
  genes_n <- rbind(genes_n, genes_go)
}

write_tsv(genes_n, file = "output/biological_pathways/n_genes_gerp_per_go.tsv")
#note: there are snps for every go term!

#### extract genotypes per GO term ####
#promo
genotypes_go_promo <- list()

for (i in 1:length(go_snps_promo)){
  go_snps_promo[[i]]$chr_pos <- paste0(go_snps_promo[[i]]$chr, "_", go_snps_promo[[i]]$pos)
  geno <- subset(gerp, chr_pos %in% go_snps_promo[[i]]$chr_pos)
  geno$chr_pos <- NULL
  genotypes_go_promo[[i]] <- geno
}
names(genotypes_go_promo) <- names(list)

#exon
genotypes_go_exon <- list()

for (i in 1:length(go_snps_exon)){
  go_snps_exon[[i]]$chr_pos <- paste0(go_snps_exon[[i]]$chr, "_", go_snps_exon[[i]]$pos)
  geno <- subset(gerp, chr_pos %in% go_snps_exon[[i]]$chr_pos)
  geno$chr_pos <- NULL
  genotypes_go_exon[[i]] <- geno
}
names(genotypes_go_exon) <- names(list)

#intron
genotypes_go_intron <- list()

for (i in 1:length(go_snps_intron)){
  go_snps_intron[[i]]$chr_pos <- paste0(go_snps_intron[[i]]$chr, "_", go_snps_intron[[i]]$pos)
  geno <- subset(gerp, chr_pos %in% go_snps_intron[[i]]$chr_pos)
  geno$chr_pos <- NULL
  genotypes_go_intron[[i]] <- geno
}
names(genotypes_go_intron) <- names(list)

#### SnpEff ####
#### extract only the unique gene names per GO term ####

#### import snpeff data ####
load("output/load/snpeff/snpeff_high.RData")
snpeff$CHROM <- gsub(";", "__", snpeff$CHROM)
snpeff$CHROM <- gsub("=", "_", snpeff$CHROM)
snpeff$chr_pos <- paste0(snpeff$CHROM, "_", snpeff$POS)

# extract only the positions for annotation
snpeff_snps <- snpeff %>% dplyr::select(CHROM, POS)
snpeff_snps$end <- snpeff_snps$POS
snpeff_snps$start <- snpeff_snps$POS
snpeff_gr <- as(snpeff_snps, "GRanges")

#### import gene regions ####

#### annotate snpeff mutations ####
snpeff_promoter <- mergeByOverlaps(snpeff_gr, promoter)
snpeff_promoter <- as.data.frame(snpeff_promoter@listData)
snpeff_promoter <- snpeff_promoter %>%
  add_column("region_promoter" = 1) %>% unique() %>% dplyr::select(c(`snpeff_gr.seqnames`, POS, region_promoter,ID)) %>% dplyr::rename(chr = snpeff_gr.seqnames)

snpeff_exons <- mergeByOverlaps(snpeff_gr, exons_gene)
snpeff_exons <- as.data.frame(snpeff_exons@listData)
snpeff_exons <- snpeff_exons%>%
  add_column("region_exon" = 1) %>% unique() %>% dplyr::select(c(snpeff_gr.seqnames, POS, region_exon,ID)) %>% dplyr::rename(chr = snpeff_gr.seqnames)

snpeff_introns <- mergeByOverlaps(snpeff_gr, introns)
snpeff_introns <- as.data.frame(snpeff_introns@listData)
snpeff_introns <- snpeff_introns%>%
  add_column("region_intron" = 1) %>% unique() %>% dplyr::select(c(snpeff_gr.seqnames, POS, region_intron,ID)) %>% dplyr::rename(chr = snpeff_gr.seqnames)

snpeff_list <- list(snpeff_promoter, snpeff_exons, snpeff_introns)

#### merge with 'similar to' column to get gene IDs ####
lookup <- fread("data/genomic/annotation/lookup_ANN_gene_id.txt")
names(lookup) <- c("ann", "gene_id")

for (i in 1:length(snpeff_list)){
  snpeff_list[[i]] <- left_join(snpeff_list[[i]], lookup, by = c("ID" = "ann"))
  names(snpeff_list[[i]])[4] <- c("ann")
  snpeff_list[[i]]$gene_id <- toupper(snpeff_list[[i]]$gene_id)
}

#### extract snps per GO term ####

go_snps_promo_snpeff <- list()
for (i in 1:length(list)){
  overlap <- subset(snpeff_list[[1]], gene_id %in% list[[i]]$gene_id)
  go_snps_promo_snpeff[[i]] <- overlap}

go_snps_exon_snpeff <- list()
for (i in 1:length(list)){
  overlap <- subset(snpeff_list[[2]], gene_id %in% list[[i]]$gene_id)
  go_snps_exon_snpeff[[i]] <- overlap}

go_snps_intron_snpeff <- list()
for (i in 1:length(list)){
  overlap <- subset(snpeff_list[[3]], gene_id %in% list[[i]]$gene_id)
  go_snps_intron_snpeff[[i]] <- overlap}

# name
names(go_snps_promo_snpeff) <- names(list)
names(go_snps_exon_snpeff) <- names(list)
names(go_snps_intron_snpeff) <- names(list)

summary_snps_per_go <- data.frame()
for (i in 1:length(list)){
  snps_go <- data.frame(go = names(list[i]),
                        snps_promo = nrow(subset(go_snps_promo_snpeff[[i]], region_promoter == 1)),
                        snps_exon = nrow(subset(go_snps_exon_snpeff[[i]], region_exon == 1)),
                        snps_intron = nrow(subset(go_snps_intron_snpeff[[i]], region_intron == 1)),
                        og_genes_total = nrow(list[[i]]))
  summary_snps_per_go <- rbind(summary_snps_per_go, snps_go)
}

genes_n_snpeff <- data.frame()
for (i in 1:length(list)){
  promo <- subset(go_snps_promo_snpeff[[i]], region_promoter == 1)
  exon <- subset(go_snps_exon_snpeff[[i]], region_exon == 1)
  intron <- subset(go_snps_intron_snpeff[[i]], region_intron == 1)
  genes <- unique(c(promo$gene_id, exon$gene_id, intron$gene_id))
  genes_go <- data.frame(go = names(list[i]),
                         n_genes = length(genes))
  genes_n_snpeff <- rbind(genes_n_snpeff, genes_go)
}

write_tsv(genes_n_snpeff, file = "output/biological_pathways/n_genes_snpeff_per_go.tsv")

#note: there are snps for every go term!

#### extract genotypes per GO term ####
#only select those with >10 snps
go_snps_promo_snpeff <- go_snps_promo_snpeff[c(2,4:9)]
go_snps_exon_snpeff <- go_snps_exon_snpeff[c(2,4:9)]
go_snps_intron_snpeff <- go_snps_intron_snpeff[c(2,4:9)]

#promo
genotypes_go_promo_snpeff <- list()

for (i in 1:length(go_snps_promo_snpeff)){
  go_snps_promo_snpeff[[i]]$chr_pos <- paste0(go_snps_promo_snpeff[[i]]$chr, "_", go_snps_promo_snpeff[[i]]$POS)
  geno <- subset(snpeff, chr_pos %in% go_snps_promo_snpeff[[i]]$chr_pos)
  geno$chr_pos <- NULL
  genotypes_go_promo_snpeff[[i]] <- geno
}

names(genotypes_go_promo_snpeff) <- names(go_snps_promo_snpeff)

#exon
genotypes_go_exon_snpeff <- list()

for (i in 1:length(go_snps_exon_snpeff)){
  go_snps_exon_snpeff[[i]]$chr_pos <- paste0(go_snps_exon_snpeff[[i]]$chr, "_", go_snps_exon_snpeff[[i]]$POS)
  geno <- subset(snpeff, chr_pos %in% go_snps_exon_snpeff[[i]]$chr_pos)
  geno$chr_pos <- NULL
  genotypes_go_exon_snpeff[[i]] <- geno
}
names(genotypes_go_exon_snpeff) <- names(go_snps_exon_snpeff)

#intron
genotypes_go_intron_snpeff <- list()

for (i in 1:length(go_snps_intron_snpeff)){
  go_snps_intron_snpeff[[i]]$chr_pos <- paste0(go_snps_intron_snpeff[[i]]$chr, "_", go_snps_intron_snpeff[[i]]$POS)
  geno <- subset(snpeff, chr_pos %in% go_snps_intron_snpeff[[i]]$chr_pos)
  geno$chr_pos <- NULL
  genotypes_go_intron_snpeff[[i]] <- geno
}
names(genotypes_go_intron_snpeff) <- names(go_snps_intron_snpeff)

7.1.0.2 Calculate mutation load

Then, we calculate the total load based on these subsets, again for GERP and SnpEff separately.

Code
#### calculate mutation load based on mutations in each GO term ####

### GERP
source("scripts/7_calculate_load/function_calculate_load.R")

loads <- data.frame()
for (i in 1:length(list)){
  #promo
  load_promo <- calculate_load_gerp(unique(genotypes_go_promo[[i]]), loadtype = names(genotypes_go_promo[i]), output_vcf = F)
  load_promo$method <- "gerp"
  load_promo$region = "promo"
  loads <- rbind(loads, load_promo)
  #exon
  load_exon <- calculate_load_gerp(genotypes_go_exon[[i]], loadtype = names(genotypes_go_exon[i]), output_vcf = F)
  load_exon$method <- "gerp"
  load_exon$region = "exon"
  loads <- rbind(loads, load_exon)
  #intron
  load_intron <- calculate_load_gerp(genotypes_go_intron[[i]], loadtype = names(genotypes_go_intron[i]), output_vcf = F)
  load_intron$method <- "gerp"
  load_intron$region = "intron"
  loads <- rbind(loads, load_intron)
}

#### SnpEff
loads_snpeff <- data.frame()
for (i in 1:length(genotypes_go_promo_snpeff)){
  #promo
  load <- calculate_load_snpeff(genotypes_go_promo_snpeff[[i]], loadtype = names(genotypes_go_promo_snpeff[i]), output_vcf = F)
  load$method <- "snpeff"
  load$region = "promo"
  loads_snpeff <- rbind(loads_snpeff, load)
  #exon
  load <- calculate_load_snpeff(genotypes_go_exon_snpeff[[i]], loadtype = names(genotypes_go_exon_snpeff[i]), output_vcf = F)
  load$method <- "snpeff"
  load$region = "exon"
  loads_snpeff <- rbind(loads_snpeff, load)
  #intron
  load <- calculate_load_snpeff(genotypes_go_intron_snpeff[[i]], loadtype = names(genotypes_go_intron_snpeff[i]), output_vcf = F)
  load$method <- "snpeff"
  load$region = "intron"
  loads_snpeff <- rbind(loads_snpeff, load)
}

loads <- rbind(loads, loads_snpeff)
save(loads, file = "output/biological_pathways/loads_per_go_term_per_region.RData")

7.1.1 Modelling

We then computed the exact same models as the total load models presented before with the following model structure:

LMS ~ scale(total_load) + core + (1|site)

Where total load is not a measure taken from a subset of deleterious mutations located within a specific genomic region and associated with a certain subset of genes.

7.1.2 Results

Below you find the posterior distributions:

Results per genomic region per GO term for GERP

Results per genomic region per GO term for SnpEff
Code
library(data.table); library(kableExtra)
gerp <- fread("../output/biological_pathways/intervals_gerp.tsv")
gerp %>% kbl() 
parameter region go median ci_95 ci_80
b_scaletotal_load Exon Androgen metabolism -0.17 -0.25, -0.1 -0.23, -0.12
b_scaletotal_load Exon Cellular respiration 0.01 -0.06, 0.09 -0.03, 0.07
b_scaletotal_load Exon Developmental growth 0.05 -0.04, 0.13 -0.01, 0.1
b_scaletotal_load Exon Immune response 0.14 0.06, 0.23 0.08, 0.2
b_scaletotal_load Exon Muscle tissue development -0.02 -0.1, 0.07 -0.07, 0.04
b_scaletotal_load Exon Response to oxidative stress -0.17 -0.25, -0.09 -0.22, -0.11
b_scaletotal_load Intron Androgen metabolism 0.41 0.32, 0.49 0.35, 0.46
b_scaletotal_load Intron Cellular respiration 0.33 0.23, 0.43 0.27, 0.4
b_scaletotal_load Intron Developmental growth 0.09 0, 0.18 0.03, 0.15
b_scaletotal_load Intron Immune response -0.29 -0.38, -0.21 -0.35, -0.24
b_scaletotal_load Intron Muscle tissue development 0.08 0.01, 0.15 0.03, 0.13
b_scaletotal_load Intron Response to oxidative stress -0.03 -0.12, 0.06 -0.09, 0.03
b_scaletotal_load Promoter Androgen metabolism -0.33 -0.4, -0.25 -0.38, -0.28
b_scaletotal_load Promoter Cellular respiration -0.25 -0.34, -0.16 -0.3, -0.19
b_scaletotal_load Promoter Developmental growth -0.22 -0.3, -0.15 -0.27, -0.17
b_scaletotal_load Promoter Immune response -0.23 -0.3, -0.15 -0.28, -0.18
b_scaletotal_load Promoter Muscle tissue development -0.09 -0.17, -0.01 -0.14, -0.04
b_scaletotal_load Promoter Response to oxidative stress -0.11 -0.19, -0.02 -0.16, -0.05
Code
high <- fread("../output/biological_pathways/intervals_high.tsv")
high %>% kbl()
parameter region go median ci_95 ci_80
b_scaletotal_load Exon Cellular respiration 0.19 0.1, 0.28 0.13, 0.25
b_scaletotal_load Exon Developmental growth -0.02 -0.1, 0.06 -0.07, 0.03
b_scaletotal_load Exon Immune response -0.14 -0.23, -0.05 -0.2, -0.08
b_scaletotal_load Exon Muscle tissue development 0.01 -0.07, 0.08 -0.04, 0.06
b_scaletotal_load Exon Response to oxidative stress -0.07 -0.15, 0.02 -0.12, -0.02
b_scaletotal_load Intron Cellular respiration 0.09 0.02, 0.17 0.04, 0.14
b_scaletotal_load Intron Developmental growth 0.29 0.2, 0.38 0.23, 0.35
b_scaletotal_load Intron Immune response -0.08 -0.15, 0 -0.13, -0.03
b_scaletotal_load Intron Muscle tissue development 0.20 0.11, 0.29 0.14, 0.26
b_scaletotal_load Intron Response to oxidative stress 0.37 0.28, 0.45 0.31, 0.42
b_scaletotal_load Promoter Cellular respiration 0.00 -0.09, 0.09 -0.06, 0.06
b_scaletotal_load Promoter Developmental growth -0.05 -0.14, 0.03 -0.11, 0
b_scaletotal_load Promoter Immune response -0.25 -0.34, -0.17 -0.31, -0.2
b_scaletotal_load Promoter Muscle tissue development 0.05 -0.03, 0.13 0, 0.1
b_scaletotal_load Promoter Response to oxidative stress 0.11 0.03, 0.19 0.06, 0.16

These results indicate the results really depend on the GO term and region in question. But three patterns are clear: - GERP mutations in promoters of many GO terms are deleterious - GERP mutations in two or more genomic regions of the GO terms: androgen metabolism, immune response and response to oxidative stress are deleterious - High impact SnpEff mutations in all genomic regions of immune response are negative