Chi Yen Tseng1, Christine M. Custer2, Thomas W. Custer2, Paul M. Dummer2, Natalie Karouna‐Renier3 and Cole W. Matson1
All
the sites included in this study
Detail location can be downloaded here at GLRI_site_details
library(glmnet)
library(edgeR)
library(readr)
library(tidyverse)
library(foreach)
library(doParallel)
library(reshape2)
library(stringr)
library(ggpubr)
library(ggrepel)
library(ComplexHeatmap)
library(ggplot2)
library(GGally)
GLRI_coldata
data.contaminants.majorSubset.geo.bysite
contaminants.coldata.subset
ID.convert
siteMAP.all
dds.bothSex.adjusted or counts.tswallow
Cont.major.list
counts.tswallow.norm
topgenelistPCBs.rds
, including 91 genes which were
appeared more than 5% in the first round of lasso regression analysis in
the cross-validation. Detail script of first round lasso models were
provided here
lasso.by.site.PCBs.top100.lasso(i,Con,Con_ind, test.genelist, s, lam.m)
i
site) out to build lasso regression models to
predict concentrations of contaminant (Con
and
Con_ind
) using top predictor genes (genelist
),
model parameter lamda (s
: 1se and lam.m
:
minimum log10 lamda value to test model fit)
Run_lasso.by.site.PCBs.top100.lasso(re, Con, Con_ind, s, lam.m)
lasso.by.site.PCBs.top100.lasso
function
re
times
evaluation_funciton.bysite
evaluation_funciton.byindividual
## Con, Con_ind: contaminant type; test.genelist: top genes selected from the first run; s: lamda.sequence (l.1se,l.min,l.median); lam.m: lambda = 10^seq(0,lam.m,length=600)
lasso.by.site.PCBs.top100.lasso <- function(i,Con,Con_ind, test.genelist, s, lam.m) {
## genomic samples with the same nest id
nestid <- contaminants.coldata.subset2 %>% filter(Key == Con_ind) %>% dplyr::pull(MergeID)
match1 <- colnames(counts.tswallow.norm) %in% nestid
counts.con <- counts.tswallow.norm[,match1]
counts.con.batch <- as.factor(as.character(dds.bothSex.adjusted$batch2)[match1])
counts.con.sex <- as.factor(as.character(dds.bothSex.adjusted$sex)[match1])
contamin.raw <- contaminants.coldata.subset2 %>% filter(Key == Con_ind) %>% dplyr::pull(value.adjust) %>% log10()
contamin.matrix <- contaminants.coldata.subset2 %>% dplyr::slice(match(colnames(counts.con), contaminants.coldata.subset2$MergeID)) %>% filter(Key == Con_ind)
counts.con.contamin <- log10(contamin.matrix$value.adjust) ## get log10 value to suppress potential outlier
site.list = unique(as.character(dds.bothSex.adjusted$site[match1]))
## Building matrix
if (length(levels(counts.con.batch)) > 1) {
counts.con.t <- as.data.frame(t(counts.con[test.genelist,]))
counts.con.t <- cbind(counts.con.t, batch = counts.con.batch, sex = counts.con.sex, contaminant = counts.con.contamin)
m <- model.matrix(contaminant ~ batch +., counts.con.t) # m for training
m <- m[,-1] } else {
counts.con.t <- as.data.frame(t(counts.con[test.genelist,]))
counts.con.t <- cbind(counts.con.t, sex = counts.con.sex, contaminant = counts.con.contamin)
m <- model.matrix(contaminant ~ ., counts.con.t)
m <- m[,-1] }
## to include all genomic samples as m2
nestid <- contaminants.coldata.subset2 %>% filter(Key == Con_ind) %>% dplyr::pull(MergeID)
counts.con2 <- counts.tswallow.norm
counts.con.batch2 <- as.factor(as.character(dds.bothSex.adjusted$batch2))
counts.con.sex2 <- as.factor(as.character(dds.bothSex.adjusted$sex))
## Building matrix
if (length(levels(counts.con.batch2)) > 1) {
counts.con.t2 <- as.data.frame(t(counts.con2[test.genelist,]))
counts.con.t2 <- cbind(counts.con.t2, batch = counts.con.batch2, sex = counts.con.sex2)
m2 <- model.matrix(~ batch +., counts.con.t2) # m2 for validation in leave one site out
m2 <- m2[,-1] } else {
counts.con.t2 <- as.data.frame(t(counts.con2[test.genelist,]))
counts.con.t2 <- cbind(counts.con.t2, sex = counts.con.sex2)
m2 <- model.matrix( ~ ., counts.con.t2)
m2 <- m2[,-1] }
site.count.nest.subset = table(str_sub(colnames(counts.con),3,4))
site.count.genomic.subset = table(str_sub(colnames(counts.con2),3,4))
# sites_verifications = c("ER", "HW", "LS", "PB", "SC", "SP", "ST", "TH", "WK")
## CV leave one out by site
test.site = str_sub(row.names(m),3,4) %in% site.list[i]
l.1se <- {} ; l.min <- {}
for (j in 1:20) {
cvfit <- cv.glmnet(x=m[!test.site,], y=counts.con.t$contaminant[!test.site], nfold = 10, alpha = 1, lambda = 10^seq(0,lam.m,length=600), relax =FALSE)
l.1se <- c(l.1se, cvfit$lambda.1se)
l.min <- c(l.min, cvfit$lambda.min)
} # repeat 20 times and pick median lamda to avoid picking extreme in 10 folds cross validation
l.1se <- median(l.1se)
l.min <- median(l.min)
l.median <- exp(mean(c(log(l.min),log(l.1se))))
lamda.sequence <- c(l.1se,l.min,l.median)
ByNestResult.withlamdaP <- {}
training.site = sample(which(!test.site), round(sum(!test.site)*0.9))
fit.lasso <- glmnet(x=m[training.site,], y=counts.con.t$contaminant[training.site], alpha = 1, lambda = 10^seq(0,lam.m,length=600)) # lasso model using training set
## get all the metrics c(features, dev.ratio, lamda)
variables = coef(fit.lasso, s = lamda.sequence[s])[,1]
variables = variables[variables != 0][-1]
dev.ratio = fit.lasso$dev.ratio[which(sort(c(lamda.sequence[s],10^seq(0,lam.m,length=600)),decreasing = TRUE) == lamda.sequence[s])[1] -1]
## get all predictions using all avaialble genomic samples
test.site2 = str_sub(row.names(m2),3,4) == site.list[i]
## lasso
predict.all.genomic.samples.subset <- predict(fit.lasso, newx = m[test.site,], s = lamda.sequence[s]) ## only use overlapped portion
predict.all.genomic.samples <- predict(fit.lasso, newx = m2[test.site2,], s = lamda.sequence[s]) ## only use overlapped portion
predict.all.genomic.samples.subset.lasso = predict.all.genomic.samples.subset[,1]
predict.all.genomic.samples.lasso = predict.all.genomic.samples[,1]
predict.all.genomic.samples.subset.lasso.mean = mean(predict.all.genomic.samples.subset.lasso)
names(predict.all.genomic.samples.subset.lasso.mean) = site.list[i]
predict.all.genomic.samples.lasso.mean = mean(predict.all.genomic.samples.lasso)
names(predict.all.genomic.samples.lasso.mean) = site.list[i]
## combine
result = list(variables =variables, dev.ratio = dev.ratio ,lamda.sequence = lamda.sequence, predict.all.genomic.samples.subset.lasso = predict.all.genomic.samples.subset.lasso, predict.all.genomic.samples.subset.lasso.mean = predict.all.genomic.samples.subset.lasso.mean, predict.all.genomic.samples.lasso = predict.all.genomic.samples.lasso, predict.all.genomic.samples.lasso.mean = predict.all.genomic.samples.lasso.mean)
return(result)
}
Run_lasso.by.site.PCBs.top100.lasso <- function(re, Con, Con_ind, s, lam.m) {
lassoCVbysite <- foreach(i = rep(1:31, each = re), .packages = c("dplyr","glmnet","stringr"), .combine = 'cbind', .export = c("lasso.by.site.PCBs.top100.lasso", "contaminants.coldata.subset2","counts.tswallow.norm","dds.bothSex.adjusted", "test.genelist", "data.contaminants.majorSubset.geo.bysite")) %dopar% {
LassoCVbysite_result <- Vectorize(lasso.by.site.PCBs.top100.lasso(i,Con,Con_ind, test.genelist, s, lam.m))
return(LassoCVbysite_result)
}
return(lassoCVbysite)
}
evaluation_funciton.bysite <- function(Result) {
result.all <- {}
evaluation.result <- {}
predictions = Result
## geoMean: only use overlapped genomic site individulas training samples to calculate geoMean
predictions2 = data.frame(site = names(Result), predictions, geoMean = contaminant.geoMean.bysite.genomic[names(Result)], b33 = quantile(contaminant.geoMean.bysite, probs = 0.33, na.rm = FALSE, names = FALSE), t33 = quantile(contaminant.geoMean.bysite, probs = 0.67, na.rm = FALSE, names = FALSE))
predictions3 = cbind(predictions2, is.top33 = predictions2$geoMean >= predictions2$t33, is.bottom33 = predictions2$geoMean <= predictions2$b33, is.top.predict = predictions2$predictions >= predictions2$t33, is.bottom.predict = predictions2$predictions <= predictions2$b33)
predictions.all = predictions3
predictions.all$delta = (predictions.all$predictions - predictions.all$geoMean)^2
MSE = sum(predictions.all$delta)/nrow(predictions.all)
error.top = sum(predictions.all$is.top.predict & predictions.all$is.bottom33)/ sum(predictions.all$is.top.predict)
error.bottom = sum(predictions.all$is.bottom.predict & predictions.all$is.top33)/ sum(predictions.all$is.bottom.predict)
error.both = sum((predictions.all$is.top.predict & predictions.all$is.bottom33) | (predictions.all$is.bottom.predict & predictions.all$is.top33))/ sum(predictions.all$is.bottom.predict | predictions.all$is.top.predict)
accuracy.top = sum(predictions.all$is.top.predict & predictions.all$is.top33)/ sum(predictions.all$is.top.predict)
accuracy.bottom = sum(predictions.all$is.bottom.predict & predictions.all$is.bottom33)/ sum(predictions.all$is.bottom.predict)
accuracy.both = sum((predictions.all$is.top.predict & predictions.all$is.top33) | (predictions.all$is.bottom.predict & predictions.all$is.bottom33))/ sum(predictions.all$is.bottom.predict | predictions.all$is.top.predict)
testing_ratio = c(top_ratio = sum(predictions.all$is.top33)/nrow(predictions.all), bottom_ratio = sum(predictions.all$is.bottom33)/nrow(predictions.all))
performance.result = c(error.top = error.top,error.bottom = error.bottom, error.both = error.both, accuracy.top = accuracy.top ,accuracy.bottom = accuracy.bottom,accuracy.both = accuracy.both, testing_ratio, MSE = MSE)
evaluation.result = list(predictions.by.site=predictions.all, performance.result = performance.result)
return(evaluation.result)
}
evaluation_funciton.byindividual <- function(Result) {
result.all <- {}
evaluation.result <- {}
predictions = Result
## geoMean: only use overlapped genomic site individulas training samples to calculate geoMean
predictions2 = data.frame(mergeID = names(Result), predictions, actural = contamin.array[names(predictions)] , b33 = quantile(contaminant.geoMean.bysite, probs = 0.33, na.rm = FALSE, names = FALSE), t33 = quantile(contaminant.geoMean.bysite, probs = 0.67, na.rm = FALSE, names = FALSE))
predictions3 = cbind(predictions2, is.top33 = predictions2$actural >= predictions2$t33, is.bottom33 = predictions2$actural <= predictions2$b33, is.top.predict = predictions2$predictions >= predictions2$t33, is.bottom.predict = predictions2$predictions <= predictions2$b33)
predictions3 = predictions3[!is.na(predictions3$is.bottom33),]
predictions.all = predictions3
predictions.all$delta = (predictions.all$predictions - predictions.all$actural)^2
MSE = sum(predictions.all$delta)/nrow(predictions.all)
error.top = sum(predictions.all$is.top.predict & predictions.all$is.bottom33)/ sum(predictions.all$is.top.predict)
error.bottom = sum(predictions.all$is.bottom.predict & predictions.all$is.top33)/ sum(predictions.all$is.bottom.predict)
error.both = sum(predictions.all$is.top.predict & predictions.all$is.bottom33 | predictions.all$is.bottom.predict & predictions.all$is.top33)/ sum(predictions.all$is.top.predict | predictions.all$is.bottom.predict)
accuracy.top = sum(predictions.all$is.top.predict & predictions.all$is.top33)/ sum(predictions.all$is.top.predict)
accuracy.bottom = sum(predictions.all$is.bottom.predict & predictions.all$is.bottom33)/ sum(predictions.all$is.bottom.predict)
accuracy.both = sum((predictions.all$is.top.predict & predictions.all$is.top33) | (predictions.all$is.bottom.predict & predictions.all$is.bottom33))/ sum(predictions.all$is.bottom.predict | predictions.all$is.top.predict)
testing_ratio = c(top_ratio = sum(predictions.all$is.top33)/nrow(predictions.all), bottom_ratio = sum(predictions.all$is.bottom33)/nrow(predictions.all))
performance.result = c(error.top = error.top,error.bottom = error.bottom, error.both = error.both, accuracy.top = accuracy.top ,accuracy.bottom = accuracy.bottom,accuracy.both = accuracy.both, testing_ratio, MSE = MSE)
evaluation.result = list(predictions.all,performance.result)
return(evaluation.result)
}
Run lasso analysis with leave one site out cross validatino with 40 rounds, 40 * 31 sites = 1240 cross-validation and resample 90% of training individuals every trial.
### Total PCBs
Con="PCBs_data";Con_ind="TOTAL PCBs"
## refresh between chemicals
lamda.sequence <- {}
Result.lasso.assessment <- {}
variables <- {}
predict.site.test <- {}
## result measures
rlassoResult <- list()
# prepare data
contaminants.coldata.subset2 <-contaminants.coldata.subset
### Individual chemicals
contaminants.coldata.subset2 <- contaminants.coldata.subset2 %>% filter(Key == Con_ind)
contaminants.coldata.subset2 <- contaminants.coldata.subset2 %>% filter(!is.na(value.adjust)) ## remove those with value == "NR"
## combine all duplicated items using mean value becasue they have the same nest id
contaminants.coldata.subset2 <- contaminants.coldata.subset2 %>% group_by(MergeID,Species,Matrix,AOC,Proper_Site,Key,type,class,MergeSite,proper_site2) %>% summarise(value.adjust2 = mean(value.adjust)) %>% ungroup()
colnames(contaminants.coldata.subset2)[colnames(contaminants.coldata.subset2) == "value.adjust2"] <- "value.adjust" ## change the name back
# print(sum(duplicated(contaminants.coldata.subset2$MergeID))) ## check duplication again
## read topgenelist 91 genes
test.genelist = readRDS("topgenelistPCBs.rds")$total.PCBs
contaminant.geoMean.bysite <- data.contaminants.majorSubset.geo.bysite %>% filter(Key == Con_ind) %>% dplyr::pull(value.geoMean) %>% log10() ## log10 of contaminant value
names(contaminant.geoMean.bysite) <- data.contaminants.majorSubset.geo.bysite %>% filter(Key == Con_ind) %>% dplyr::pull(MergeSite)
nestid <- contaminants.coldata.subset2 %>% filter(Key == Con_ind) %>% dplyr::pull(MergeID)
match1 <- colnames(counts.tswallow.norm) %in% nestid
counts.con <- counts.tswallow.norm[,match1]
contamin.matrix <- contaminants.coldata.subset2 %>% dplyr::slice(match(colnames(counts.con), contaminants.coldata.subset2$MergeID)) %>% filter(Key == Con_ind) %>% mutate(value.adjust.log = log10(value.adjust))
contamin.array <- contamin.matrix$value.adjust.log
names(contamin.array) = contamin.matrix$MergeID
contamin.matrix.mean <- contamin.matrix %>% group_by(MergeSite) %>% summarise(geoMean = mean(value.adjust.log)) %>% ungroup()
contamin.matrix.mean$MergeSite[31] = "NA"
contaminant.geoMean.bysite.genomic <- contamin.matrix.mean$geoMean
names(contaminant.geoMean.bysite.genomic) <- contamin.matrix.mean$MergeSite
# Run lasso analysis with leave one site out cross validatino with 40 rounds, 40 * 31 sites = 1240 cross-validation and resample 90% of training individuals every trial.
print(c("2nd lasso cv",Con_ind))
t1 = Sys.time()
cl <- parallel::makeCluster(12)
doParallel::registerDoParallel(cl)
t1 = Sys.time()
PCBs_CV_bySite_result = Run_lasso.by.site.PCBs.top100.lasso(re = 40, Con = Con, Con_ind = Con_ind, s = 1,lam.m = -2)
t2 = Sys.time()
t2 -t1
parallel::stopCluster(cl)
# Result list : 1. variables, 2. dev.ratio 3. lamda.sequence 4. predict.all.genomic.samples.subset.lasso 5. predict.all.genomic.samples.subset.lasso.mean 6. predict.all.genomic.samples.lasso
# 7. predict.all.genomic.samples.lasso.mean
# saveRDS(PCBs_CV_bySite_result, "PCBs_CV_bySite_result2nd.rds")
PCBs_CV_bySite_result <- readRDS("PCBs_CV_bySite_result2nd.rds")
# PCBs_CV_bySite_result by site
Result = unlist(unname(PCBs_CV_bySite_result[5,])) ## genomic samples overlapped with contaminated samples
PCBs.performance.bysite.overlapped.report = evaluation_funciton.bysite(Result = Result)
Result = unlist(unname(PCBs_CV_bySite_result[7,])) ## all genomic samples
PCBs.performance.bysite.allgenomic.report = evaluation_funciton.bysite(Result = Result)
#
# ## overlapped portion PCBs
# $performance.result
# error.top error.bottom error.both accuracy.top accuracy.bottom accuracy.both top_ratio bottom_ratio MSE
# 0.132075472 0.006072874 0.050065876 0.569811321 0.844129555 0.748353096 0.322580645 0.451612903 0.230140947
# # ## all genomic samples PCBs
# $performance.result
# error.top error.bottom error.both accuracy.top accuracy.bottom accuracy.both top_ratio bottom_ratio MSE
# 0.090551181 0.006507592 0.036363636 0.614173228 0.854663774 0.769230769 0.322580645 0.451612903 0.233187702
# PCBs_CV_bySite_result by individual, only use overlapped portion
Result = unlist(unname(PCBs_CV_bySite_result[4,])) ## genomic samples overlapped with contaminated samples
PCBs.performance.byindividual.overlapped.report = evaluation_funciton.byindividual(Result = Result)
# error.top error.bottom error.both accuracy.top accuracy.bottom accuracy.both top_ratio bottom_ratio MSE
# 0.1002028 0.1817585 0.1452929 0.7346856 0.6286089 0.6760385 0.4234694 0.3724490 0.3985988
## other performance metrics
# determine predictor genes; variablers
variables.length.summary <- summary(sapply(1:length(PCBs_CV_bySite_result[1,]), function(x) length(PCBs_CV_bySite_result[1,][[x]])))
variables.length.p95 <- quantile(sapply(1:length(PCBs_CV_bySite_result[1,]), function(x) length(PCBs_CV_bySite_result[1,][[x]])),0.95)
variables <- names(sort(table(names(unlist(unname(PCBs_CV_bySite_result[1,])))),decreasing = TRUE)[1:variables.length.p95])
variables <- list(id = variables, name = na.omit(ID.convert$GeneName[match(variables,ID.convert$GeneID2)]))
variables.coef = unlist(lapply(1:ncol(PCBs_CV_bySite_result), function(x) na.omit(PCBs_CV_bySite_result[1,][[x]][variables$id])))
geomean_func = function(x) {exp(mean(log(x)))}
variables.coef.aggregate = aggregate(variables.coef, list(Gene = names(variables.coef)),mean)
colnames(variables.coef.aggregate)[2] = "coef"
variables.coef.aggregate = data.frame(name = ID.convert$GeneName[match(variables.coef.aggregate$Gene, ID.convert$GeneID2)], variables.coef.aggregate)
# R square, or deviance ratio
# dev.ratio
dev.ratio.n <- ncol(PCBs_CV_bySite_result)
dev.ratio.mean <- mean(unlist(unname(PCBs_CV_bySite_result[2,])))
dev.ratio.s <- sd(unlist(unname(PCBs_CV_bySite_result[2,])))
margin <- qt(0.975,df=dev.ratio.n-1)*dev.ratio.s/sqrt(dev.ratio.n)
dev.ratio.PCBs = c(mean = dev.ratio.mean, margin = margin)
# print(dev.ratio.PCBs)
# mean margin
# 0.766489759 0.001045139
# ##
predict.result = unlist(unname(PCBs_CV_bySite_result[4,])) # only the overlapping part
predict.result2 = unlist(unname(PCBs_CV_bySite_result[5,])) # only the overlapping part site average
Predictions.by.site = tibble(site= names(predict.result2), predictions= predict.result2, genomic.PCB.mean = contaminant.geoMean.bysite.genomic[names(predict.result2)], proper_site2 = siteMAP.all$propersite[match(names(predict.result2), siteMAP.all$SiteID)])
Predictions.by.individual = tibble(MergeID = names(predict.result), proper_site2 = siteMAP.all$propersite[match(str_sub(names(predict.result),3,4), siteMAP.all$SiteID)], predicted.PCBs = predict.result, measured.PCBs = contamin.array[names(predict.result)])
PCBs.predictions.lasso.aggregate = list(by.site = Predictions.by.site, by.individual = Predictions.by.individual)
# saveRDS(PCBs.predictions.lasso.aggregate, "PCBs.predictions.lasso.aggregate.rds")
# list all predictions, variables, R2 and model performance
PCBs.performance.report <- list(PCBs.performance.bysite.overlapped.report = PCBs.performance.bysite.overlapped.report, PCBs.performance.bysite.allgenomic.report = PCBs.performance.bysite.allgenomic.report, PCBs.performance.byindividual.overlapped.report = PCBs.performance.byindividual.overlapped.report, PCB.variables = list(variables,variables.coef.aggregate), PCBs.dev.ratio = dev.ratio.PCBs)
# saveRDS(PCBs.performance.report, "PCBs.performance.report.rds")
Because not evey nest was selected for measuring chemistry, the following performance metrics are calculated by
Result = unlist(unname(PCBs_CV_bySite_result[5,])) ## genomic samples overlapped with contaminated samples
PCBs.performance.bysite.overlapped.report = evaluation_funciton.bysite(Result = Result)
Result = unlist(unname(PCBs_CV_bySite_result[7,])) ## all genomic samples
PCBs.performance.bysite.allgenomic.report = evaluation_funciton.bysite(Result = Result)
#
# ## overlapped portion PCBs
# $performance.result
# error.top error.bottom accuracy.top accuracy.bottom accuracy.both top_ratio bottom_ratio MSE
# 0.111969112 0.004056795 0.583011583 0.845841785 0.755319149 0.322580645 0.451612903 0.229426453
# ## all genomic samples PCBs
# $performance.result
# error.top error.bottom accuracy.top accuracy.bottom accuracy.both top_ratio bottom_ratio MSE
# 0.065306122 0.004347826 0.632653061 0.847826087 0.773049645 0.322580645 0.451612903 0.232414433
# PCBs_CV_bySite_result by individual, only use overlapped portion
Result = unlist(unname(PCBs_CV_bySite_result[4,])) ## genomic samples overlapped with contaminated samples
PCBs.performance.byindividual.overlapped.report = evaluation_funciton.byindividual(Result = Result)
# error.top error.bottom accuracy.top accuracy.bottom accuracy.both top_ratio bottom_ratio MSE
# 0.1002028 0.1817585 0.7346856 0.6286089 0.6760385 0.4234694 0.3724490 0.3985988
performance.metrics <- round(data.frame(PCBs.performance.bysite.overlapped.report$performance.result, PCBs.performance.bysite.allgenomic.report$performance.result, PCBs.performance.byindividual.overlapped.report[[2]]),3)
colnames(performance.metrics) <- c("by.site (with chemistry only)", "by.site (all genomic samples)", "by.nest (with chemistry only)")
# saveRDS(performance.metrics, "PCB.performance.metrics.rds")
Predicted vs actural PCB concentrations yy plot by site
by.site (with chemistry only) | by.site (all genomic samples) | by.nest (with chemistry only) | |
---|---|---|---|
error.top | 0.132 | 0.091 | 0.100 |
error.bottom | 0.006 | 0.007 | 0.182 |
error.both | 0.050 | 0.036 | 0.145 |
accuracy.top | 0.570 | 0.614 | 0.735 |
accuracy.bottom | 0.844 | 0.855 | 0.629 |
accuracy.both | 0.748 | 0.769 | 0.676 |
top_ratio | 0.323 | 0.323 | 0.423 |
bottom_ratio | 0.452 | 0.452 | 0.372 |
MSE | 0.230 | 0.233 | 0.399 |
name | coef | Gene |
---|---|---|
OPRM1 | 0.0626781 | TacBic00000248 |
HHIPL2 | 0.0629860 | TacBic00000723 |
BROX | -0.0765754 | TacBic00000728 |
ALK | -0.0343974 | TacBic00000778 |
ahrr | 0.2032059 | TacBic00001060 |
DNAH5 | -0.0421367 | TacBic00001152 |
MT-CO2 | 0.0306515 | TacBic00001169 |
EDN1 | 0.2580644 | TacBic00001272 |
MRC1 | -0.0445834 | TacBic00001408 |
NA | -0.0345405 | TacBic00001866 |
PDE1A | -0.0335550 | TacBic00002231 |
TDH | -0.0368326 | TacBic00002440 |
WDR35 | -0.0689035 | TacBic00002543 |
CPSF3 | -0.1160736 | TacBic00002590 |
LMBRD1 | -0.0738019 | TacBic00002694 |
LD | -0.0339494 | TacBic00004095 |
PLEKHH1 | -0.1973106 | TacBic00004124 |
NA | 0.0719234 | TacBic00004433 |
NA | -0.0637095 | TacBic00005036 |
NA | -0.0547057 | TacBic00005289 |
MAPK6 | -0.0716260 | TacBic00006048 |
CCDC78 | 0.0286038 | TacBic00006808 |
DAGLB | -0.0965885 | TacBic00006993 |
HYDIN | 0.0160680 | TacBic00007206 |
FLT4 | -0.0471608 | TacBic00007722 |
DBN1 | -0.1532679 | TacBic00007823 |
NA | -0.1268583 | TacBic00007892 |
SLC17A9 | 0.0313378 | TacBic00008664 |
ZC2HC1A | -0.0751931 | TacBic00009014 |
LOC423119 | -0.1004947 | TacBic00009296 |
MN1 | -0.1422477 | TacBic00009683 |
NOS1 | -0.1495260 | TacBic00009882 |
NA | -0.0715350 | TacBic00010159 |
SLCO1A2 | -0.0209454 | TacBic00010681 |
PIK3C3 | 0.0274661 | TacBic00011042 |
ZNF185L | -0.0567324 | TacBic00011230 |
ARHGAP36 | 0.1152496 | TacBic00011246 |
SPIA9 | 0.0417792 | TacBic00011339 |
ETNPPL | -0.0423135 | TacBic00011604 |
ARHGAP6 | 0.0735161 | TacBic00011724 |
RPL3 | 0.0500954 | TacBic00012455 |
NA | 0.0131619 | TacBic00012685 |
CLDN18 | 0.0408827 | TacBic00012986 |
NA | -0.1276152 | TacBic00013588 |
P2RY10 | 0.0980806 | TacBic00013697 |
HMGCR | 0.0355349 | TacBic00013782 |
ERRFI1 | -0.0187293 | TacBic00014439 |
ABCA9 | 0.0371875 | TacBic00014601 |
NA | -0.0291986 | TacBic00015118 |
CMTM4 | 0.0638626 | TacBic00015406 |
CHSY3 | -0.0525856 | TacBic00015571 |
KCNIP1 | 0.1172414 | TacBic00015722 |
NA | 0.0512586 | TacBic00016990 |
NA | 0.0300411 | TacBic00017215 |
CYP1A5 | 0.0917243 | TacBic00018777 |
CD248 | 0.2015480 | TacBic00019767 |
CYP1A4 | 0.0381010 | TacBic00019894 |
ERAL1 | -0.1732755 | TacBic00020260 |
LOC776507 | 0.0547346 | TacBic00021387 |
NA | -0.3323687 | TacBic00021964 |
NA | 0.0631366 | TacBic00022010 |
SNX27 | -0.1014123 | TacBic00022200 |
Start from first round top genes (110) genes and refine lasso
regression analysis for the second round
GLRI_coldata
data.contaminants.majorSubset.geo.bysite
contaminants.coldata.subset
ID.convert
siteMAP.all
dds.bothSex.adjusted or counts.tswallow
Cont.major.list
counts.tswallow.norm
topgene.list
lasso.by.site.top100(Con,Con_ind,i,Min_L,s)
: Lasso
leave one (site) out cross-validaiton model training, Min_L for the
lowest the lamda cv goes 10^(-2,-3,-4); s: lamda.sequence: 1) 1se 2) min
3) log median
Run_lasso_by_site.top100(n,Con,Con_ind,N_train,Min_L,s)
: Run lasso.by.site.top100
with n runs throughout all 28
sites = 28n
evaluation_funciton.bysite.PAH
: generate
performance metrics
# N: N repeat leave one (site) out validation; Min_L for the lowest the lamda cv goes 10^(-2,-3,-4); s: lamda.sequence: 1) 1se 2) min 3) log median
lasso.by.site.top100 <- function(Con,Con_ind,i,Min_L,s) {
BySiteResult.withlamdaP <- {} ## refresh
variables.all <- list()
predict.site.test.all <- list()
match1 <- str_sub(colnames(counts.tswallow.norm),3,4) %in% contaminants.coldata.subset2$MergeSite
counts.con <- counts.tswallow.norm[,match1]
counts.con.batch <- as.factor(as.character(dds.bothSex.adjusted$batch2)[match1])
counts.con.sex <- as.factor(as.character(dds.bothSex.adjusted$sex)[match1])
counts.con.contamin <- log10(contaminants.coldata.subset2$value.geoMean[match(str_sub(colnames(counts.con),3,4), contaminants.coldata.subset2$MergeSite)])
## Building matrix
if (length(levels(counts.con.batch)) > 1) {
counts.con.t <- as.data.frame(t(counts.con[topgene.list,]))
counts.con.t <- cbind(counts.con.t, batch = counts.con.batch, sex = counts.con.sex, contaminant = counts.con.contamin)
m <- model.matrix(contaminant ~ batch +., counts.con.t)
m <- m[,-1] } else {
counts.con.t <- as.data.frame(t(counts.con[topgene.list,]))
counts.con.t <- cbind(counts.con.t, sex = counts.con.sex, contaminant = counts.con.contamin)
m <- model.matrix(contaminant ~ ., counts.con.t)
m <- m[,-1] }
site.all <- str_sub(colnames(counts.con),3,4)
# print(length(unique(site.all)))
site <- unique(str_sub(colnames(counts.con),3,4))
site.test <- site[i]
match1.test <- site.all == site.test
training = sample(which(!match1.test), size = round(sum(!match1.test)*0.9))
l.1se.all = {}
l.min.all = {}
l.median.all = {}
for (j in 1:20) {
l.1se <- {} ## repeat 10 times to avoid outlier in cross validation
l.min <- {}
cvfit <- cv.glmnet(x=m[training,], y=counts.con.t$contaminant[training], nfolds = 10, alpha = 1, lambda = 10^seq(0,Min_L,length=600))
l.1se <- cvfit$lambda.1se
l.min <- cvfit$lambda.min
l.median <- exp(mean(c(log(l.1se),log(l.min))))
l.1se.all <- c(l.1se.all,l.1se)
l.min.all <- c(l.min.all,l.min)
l.median.all <- c(l.median.all,l.median)
}
l.1se = median(l.1se.all)
l.min = median(l.min.all)
l.median = median(l.median.all)
lamda.sequence <- c(l.1se,l.min,l.median)
names(lamda.sequence) <- c("1se","min","median")
fit <- glmnet(x=m[training,], y=counts.con.t$contaminant[training], alpha = 1, lambda = 10^seq(0,Min_L,length=600))
error.result <- assess.glmnet(fit, newx = m[match1.test,], newy = counts.con.t$contaminant[match1.test], s = lamda.sequence[s])
error.term <- c(error.result$mse, error.result$mae)
names(error.term) <- c("mse","mae")
variables <- coef(fit, s = lamda.sequence[s], gamma = 1)
variables <- variables[,1][!(variables[,1] == 0)]
variables <- variables[-1]
pseudo_R2 <- fit$dev.ratio[which(sort(c(lamda.sequence[s],10^seq(0,Min_L,length=600)),decreasing = TRUE) == lamda.sequence[s])[1] -1]
predict.site <- predict(fit, newx = m[match1.test,], s = lamda.sequence[s])
result = list(lamda.sequence,error.term,variables,pseudo_R2,predict.site)
names(result) <- c("lamda.sequence","mse_mae","variables","pseudo_R2","predict.site")
return(result)
}
Run_lasso_by_site.top100 <- function(n,Con,Con_ind,N_train,Min_L,s) {
TIME1 <- Sys.time()
lassositeResult.run <- foreach(i = rep(1:28,each= n) , .packages = c("stringr", "dplyr","glmnet"), .export = c("lasso.by.site.top100", "contaminants.coldata.subset2","topgene.list","counts.tswallow.norm","dds.bothSex.adjusted", "data.contaminants.majorSubset.geo.bysite")) %dopar% {
lasso_result <- Vectorize(lasso.by.site.top100)(Con,Con_ind,i,Min_L,s)
return(lasso_result)
}
TIME2 <- Sys.time()
print(TIME2 - TIME1)
return(lassositeResult.run)
}
evaluation_funciton.bysite.PAH <- function(Result) {
result.all <- {}
evaluation.result <- {}
predictions = Result
## geoMean: only use overlapped genomic site individulas training samples to calculate geoMean
predictions2 = data.frame(site = names(Result), predictions, geoMean = log10(contaminants.coldata.subset2$value.geoMean[match(names(Result),contaminants.coldata.subset2$MergeSite)]), b33 = log10(quantile(contaminants.coldata.subset2$value.geoMean, probs = 0.33, na.rm = FALSE, names = FALSE)), t33 = log10(quantile(contaminants.coldata.subset2$value.geoMean, probs = 0.67, na.rm = FALSE, names = FALSE)))
predictions3 = cbind(predictions2, is.top33 = predictions2$geoMean >= predictions2$t33, is.bottom33 = predictions2$geoMean <= predictions2$b33, is.top.predict = predictions2$predictions >= predictions2$t33, is.bottom.predict = predictions2$predictions <= predictions2$b33)
predictions.all = predictions3
predictions.all$delta = (predictions.all$predictions - predictions.all$geoMean)^2
MSE = sum(predictions.all$delta)/nrow(predictions.all)
error.top = sum(predictions.all$is.top.predict & predictions.all$is.bottom33)/ sum(predictions.all$is.top.predict)
error.bottom = sum(predictions.all$is.bottom.predict & predictions.all$is.top33)/ sum(predictions.all$is.bottom.predict) # 0 / 0
eorror.both = sum((predictions.all$is.top.predict & predictions.all$is.bottom33) | (predictions.all$is.bottom.predict & predictions.all$is.top33))/ sum(predictions.all$is.top.predict | predictions.all$is.bottom.predict)
accuracy.top = sum(predictions.all$is.top.predict & predictions.all$is.top33)/ sum(predictions.all$is.top.predict)
accuracy.bottom = sum(predictions.all$is.bottom.predict & predictions.all$is.bottom33)/ sum(predictions.all$is.bottom.predict)
accuracy.both = sum((predictions.all$is.top.predict & predictions.all$is.top33) | (predictions.all$is.bottom.predict & predictions.all$is.bottom33))/ sum(predictions.all$is.bottom.predict | predictions.all$is.top.predict)
testing_ratio = c(top_ratio = sum(predictions.all$is.top33)/nrow(predictions.all), bottom_ratio = sum(predictions.all$is.bottom33)/nrow(predictions.all))
performance.result = c(error.top = error.top,error.bottom = error.bottom, error.both = eorror.both, accuracy.top = accuracy.top ,accuracy.bottom = accuracy.bottom,accuracy.both = accuracy.both, testing_ratio, MSE = MSE)
evaluation.result = list(predictions.by.site=predictions.all, performance.result = performance.result)
return(evaluation.result)
}
Run Run_lasso_by_site.top100
with 40 re-sampling,
1120 leave one out cross validation; lamda 1se then evaluate the model
performance using evaluation_funciton.bysite.PAH
Con="PAHs_data";Con_ind="Total PAHs"
### edit subset for overlappign genomic samples with contaminant data
contaminants.coldata.subset2 <-data.contaminants.majorSubset.geo.bysite
### Individual chemicals
contaminants.coldata.subset2 <- contaminants.coldata.subset2 %>% filter(Key == Con_ind)
## combine all duplicated items using mean value becasue they have the same MergeSite ## only RiverRaisin will be affected
contaminants.coldata.subset2 <- contaminants.coldata.subset2 %>% group_by(MergeSite, Key) %>% summarise(value.geoMean2 = mean(value.geoMean)) %>% ungroup()
colnames(contaminants.coldata.subset2)[colnames(contaminants.coldata.subset2) == "value.geoMean2"] <- "value.geoMean" ## change the name back
# print(sum(duplicated(contaminants.coldata.subset2$MergeSite))) ## check duplication again
cl <- parallel::makeCluster(12)
doParallel::registerDoParallel(cl)
PAHs_CV_bySite_result = Run_lasso_by_site.top100(n=40,Con=Con,Con_ind = Con_ind, Min_L = -2, s = 1)
parallel::stopCluster(cl)
# saveRDS(PAHs_CV_bySite_result, "PAHs_CV_bySite_result.rds")
PAHs_CV_bySite_result <- readRDS("PAHs_CV_bySite_result.rds")
Result = sapply(1:1120, function(x) mean(PAHs_CV_bySite_result[[x]][5][[1]]))
names(Result) = sapply(1:1120, function(x) str_sub(row.names(PAHs_CV_bySite_result[[x]][5][[1]])[1],3,4))
PAHs.performance.bysite.report = evaluation_funciton.bysite.PAH(Result = Result)
# print(PAHs.performance.bysite.report$performance.result)
# error.top error.bottom error.both accuracy.top accuracy.bottom accuracy.both top_ratio bottom_ratio MSE
# 0.07827476 0.00000000 0.07790143 0.74281150 1.00000000 0.74403816 0.42857143 0.17857143 0.13560998
# Edit variables
variables.length.summary <- summary(sapply(1:length(PAHs_CV_bySite_result), function(x) length(PAHs_CV_bySite_result[[x]][3][[1]])))
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 37.00 54.00 57.00 56.59 60.00 70.00
variables.length.p95 <- quantile(sapply(1:length(PAHs_CV_bySite_result), function(x) length(PAHs_CV_bySite_result[[x]][3][[1]])),0.95)
variables <- names(sort(table(unlist(sapply(1:length(PAHs_CV_bySite_result), function(x) names(PAHs_CV_bySite_result[[x]][3][[1]])))),decreasing = TRUE)[1:variables.length.p95])
variables <- list(variables)
variables$name <- ID.convert$GeneName[match(variables[[1]],ID.convert$GeneID2)]
variables.coef = unlist(lapply(1:length(PAHs_CV_bySite_result), function(x) na.omit(PAHs_CV_bySite_result[[x]][3][[1]][variables[[1]]])))
# geomean_func = function(x) {exp(mean(log(x)))}
variables.coef.aggregate = aggregate(variables.coef, list(Gene = names(variables.coef)),mean)
colnames(variables.coef.aggregate)[2] = "coef"
variables.coef.aggregate = data.frame(name = ID.convert$GeneName[match(variables.coef.aggregate$Gene, ID.convert$GeneID2)], variables.coef.aggregate)
# remove duplicates TFCP2L1 and CSMD1
variables.coef.aggregate <- variables.coef.aggregate[c(-8,-10),]
# dev.ratio
dev.ratio.n <- length(PAHs_CV_bySite_result)
dev.ratio <- sapply(1:dev.ratio.n, function(x) PAHs_CV_bySite_result[[x]][[4]])
dev.ratio.mean <- mean(dev.ratio)
dev.ratio.s <- sd(dev.ratio)
margin <- qt(0.975,df=dev.ratio.n-1)*dev.ratio.s/sqrt(dev.ratio.n)
dev.ratio.PAHs = c(mean = dev.ratio.mean, margin = margin)
# print(dev.ratio.PAHs)
# mean margin
# 0.734270005 0.001355791
# Construct panther GO analysis
library(xml2)
PAHs_bp.data.raw <- read_xml("PAHs_2ndLasso_funcionts.xml") ## from PANTHER17.0 http://www.pantherdb.org/
id = xml_text(xml_find_all(PAHs_bp.data.raw, "//result/term/id"))
term = xml_text(xml_find_all(PAHs_bp.data.raw, "//result/term/label"))[-319] # remove no class item
number_in_list = as.numeric(xml_text(xml_find_all(PAHs_bp.data.raw, "//result/input_list/number_in_list")))[-319]
number_in_reference = as.numeric(xml_text(xml_find_all(PAHs_bp.data.raw, "//result/number_in_reference")))[-319]
pValue = as.numeric(xml_text(xml_find_all(PAHs_bp.data.raw, "//pValue")))[-319]
fold_enrichment = as.numeric(xml_text(xml_find_all(PAHs_bp.data.raw, "//fold_enrichment")))[-319]
mapped_id = sapply(1:319, function(x) xml_text(xml_children(xml_find_all(PAHs_bp.data.raw, "//result/input_list/mapped_id_list")[x])))[-319]
names(mapped_id) <- id
PAHs_bp.panther = tibble(id = id, term = term, number_in_list = number_in_list, number_in_reference = number_in_reference, pValue = pValue, fold_enrichment = fold_enrichment, mapped_id = mapped_id)
PAHs_bp.panther.mapped_id = PAHs_bp.panther$mapped_id
## check lasso selected PAHs related genes funcions and plot as heatmap
PAHs_bp.panther.trim = PAHs_bp.panther %>% filter(number_in_reference < 5000 & number_in_reference > 200) # select more general go terms
PAHs_bp.panther.mapped_id$all = unique(na.omit(variables.coef.aggregate$name))
PAHs_bp.panther.data = data.frame(GOBP_ID = rep(names(PAHs_bp.panther.mapped_id), sapply(1:319, function(x) length(PAHs_bp.panther.mapped_id[[x]])) ), name = unlist(PAHs_bp.panther.mapped_id, use.names = FALSE))
PAHs_bp.panther.data = dplyr::left_join(PAHs_bp.panther.data, variables.coef.aggregate)
PAHs_bp.panther2 = PAHs_bp.panther
colnames(PAHs_bp.panther2)[1] = "GOBP_ID"
PAHs_bp.panther.data = dplyr::left_join(PAHs_bp.panther.data, PAHs_bp.panther2[,1:6])
## Using Revigo to remove redundnat GO terms : PLoS ONE 2011. doi:10.1371/journal.pone.0021800
# The Gene Ontology database (go.obo) which is dated Thursday, November 3, 2022.
# The UniProt-to-GO mapping database from the EBI GOA project (goa_uniprot_gcrp.gaf.gz) which is dated Friday, September 16, 2022.
Revigo.selected.gobp <- c(
"all",
"GO:0009987",
"GO:0007010",
"GO:0007154",
"GO:0023052",
"GO:0030029",
"GO:0033554",
"GO:0051716",
"GO:0042592",
"GO:0048523",
"GO:0048878",
"GO:0050896",
"GO:0072359"
)
PAHs_bp.panther.data$term[PAHs_bp.panther.data$GOBP_ID == "all"] = "all"
data <- as_tibble(PAHs_bp.panther.data[,c(1,2,4,5)])
data <- data %>% filter(GOBP_ID %in% Revigo.selected.gobp)
data <- data %>% spread(key = "name" , value = "coef")
data.matrix = data[,3:59]
data.matrix = as.matrix(data.matrix)
row.names(data.matrix) = data$term
data.matrix[is.na(data.matrix)] = 0
PAHs_lasso_by_site_50_functional_data = list(panther_result = PAHs_bp.panther, gene_mapped_panther_result = PAHs_bp.panther.data, heatmap_plot_data = data.matrix, top_go_terms = Revigo.selected.gobp)
# saveRDS(PAHs_lasso_by_site_50_functional_data, "PAHs_lasso_by_site_50_functional_data.rds")
PAHs.performance.report <- list(PAHs.performance.bysite.report = PAHs.performance.bysite.report, PAHs.variables = list(variables,variables.coef.aggregate, PAHs_bp.panther.data), PAHs.dev.ratio = dev.ratio.PAHs)
# saveRDS(PAHs.performance.report, "PAHs.performance.reportaggregate.rds")
performance.metrics <- round(data.frame(PAHs.performance.report$PAHs.performance.bysite.report$performance.result),3)
colnames(performance.metrics) <- c("Pooled PAHs concentrations by site")
# saveRDS(performance.metrics, "PAHs.performance.metrics.rds")
Pooled PAHs concentrations by site | |
---|---|
error.top | 0.078 |
error.bottom | 0.000 |
error.both | 0.078 |
accuracy.top | 0.743 |
accuracy.bottom | 1.000 |
accuracy.both | 0.744 |
top_ratio | 0.429 |
bottom_ratio | 0.179 |
MSE | 0.136 |
name | coef | Gene |
---|---|---|
EPHX2 | -0.0237888 | TacBic00000001 |
CTGFL | 0.0116673 | TacBic00000185 |
PHACTR2 | 0.0229792 | TacBic00000280 |
ahrr | 0.0187001 | TacBic00001060 |
MASTL | -0.0720205 | TacBic00001352 |
IGF2BP3 | 0.0840547 | TacBic00001551 |
HEG1 | -0.0364054 | TacBic00001970 |
TFCP2L1 | -0.0422021 | TacBic00002010 |
CSMD1 | 0.0525004 | TacBic00002636 |
PDCD11 | -0.0643387 | TacBic00002956 |
PLCE1 | 0.1743130 | TacBic00003080 |
BMF | 0.0329964 | TacBic00004110 |
PLEK2 | 0.0184932 | TacBic00004117 |
AVPR1A | -0.0714536 | TacBic00004508 |
FAM19A5 | -0.1237966 | TacBic00004633 |
SEPT14 | 0.0202087 | TacBic00005539 |
SLC12A1 | 0.1045253 | TacBic00006063 |
ROBO2 | 0.0174586 | TacBic00006592 |
PRUNE2 | 0.0225887 | TacBic00006667 |
ROGDI | 0.0294154 | TacBic00006760 |
NME5 | 0.0259847 | TacBic00007699 |
STC2 | 0.0249300 | TacBic00007847 |
MAP3K15 | 0.0296867 | TacBic00008341 |
SLC7A7 | 0.0173609 | TacBic00008970 |
FABP4 | 0.0055431 | TacBic00008996 |
GIF | -0.0183763 | TacBic00009241 |
CCDC174 | -0.0532543 | TacBic00009931 |
P4HTM | 0.0340958 | TacBic00009961 |
TENM1 | 0.0271381 | TacBic00011055 |
COL4A2 | 0.0234530 | TacBic00011068 |
FYTTD1L | 0.0339534 | TacBic00011106 |
GPR143 | 0.0838167 | TacBic00011733 |
HEPH | 0.0414136 | TacBic00012310 |
VSIG4 | 0.0190829 | TacBic00012312 |
IL1RAPL1 | 0.0196637 | TacBic00012352 |
MCM5 | 0.0513213 | TacBic00012529 |
ARMC10 | 0.0867078 | TacBic00012663 |
DUSP28 | 0.0496005 | TacBic00013079 |
DCLK2 | 0.0212307 | TacBic00013136 |
RPL29 | 0.0239895 | TacBic00013847 |
EVPL | 0.0161164 | TacBic00014829 |
LRRN4 | 0.0279689 | TacBic00015286 |
gadd45 | 0.0131892 | TacBic00015586 |
MPP7 | 0.0197704 | TacBic00016314 |
TIMELESS | -0.0238267 | TacBic00018586 |
MRPL34 | -0.0133720 | TacBic00018618 |
SRF | -0.0227717 | TacBic00018836 |
MT-ND5 | 0.1604075 | TacBic00019663 |
CYP1A4 | 0.0752220 | TacBic00019894 |
CITED2 | 0.0209557 | TacBic00019942 |
LBR | 0.0313310 | TacBic00020007 |
NR1D1 | -0.0967599 | TacBic00020399 |
LOC107054425 | -0.0069665 | TacBic00020762 |
NELFE | -0.0157811 | TacBic00021126 |
### PAHs top ge | ne predictors | functions and pathways {.tabset} |
sessionInfo()
## R version 4.2.2 Patched (2022-11-10 r83330)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 grid parallel stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] gghighlight_0.4.0 dendsort_0.3.4
## [3] circlize_0.4.15 xml2_1.3.3
## [5] DESeq2_1.36.0 SummarizedExperiment_1.26.1
## [7] Biobase_2.56.0 MatrixGenerics_1.8.1
## [9] matrixStats_0.62.0 GenomicRanges_1.48.0
## [11] GenomeInfoDb_1.32.4 IRanges_2.30.1
## [13] S4Vectors_0.34.0 BiocGenerics_0.42.0
## [15] GGally_2.1.2 ComplexHeatmap_2.12.1
## [17] ggrepel_0.9.2 ggpubr_0.4.0
## [19] reshape2_1.4.4 doParallel_1.0.17
## [21] iterators_1.0.14 foreach_1.5.2
## [23] forcats_0.5.2 stringr_1.5.0
## [25] dplyr_1.0.10 purrr_0.3.5
## [27] tidyr_1.2.1 tibble_3.1.8
## [29] ggplot2_3.4.0 tidyverse_1.3.2
## [31] readr_2.1.3 edgeR_3.38.4
## [33] limma_3.52.4 glmnet_4.1-4
## [35] Matrix_1.5-3
##
## loaded via a namespace (and not attached):
## [1] readxl_1.4.1 backports_1.4.1 plyr_1.8.8
## [4] splines_4.2.2 BiocParallel_1.30.4 digest_0.6.31
## [7] htmltools_0.5.4 fansi_1.0.4 magrittr_2.0.3
## [10] memoise_2.0.1 googlesheets4_1.0.1 cluster_2.1.4
## [13] tzdb_0.3.0 Biostrings_2.64.1 annotate_1.74.0
## [16] modelr_0.1.10 vroom_1.6.0 timechange_0.1.1
## [19] colorspace_2.1-0 blob_1.2.3 rvest_1.0.3
## [22] haven_2.5.1 xfun_0.37 crayon_1.5.2
## [25] RCurl_1.98-1.9 jsonlite_1.8.4 genefilter_1.78.0
## [28] survival_3.4-0 glue_1.6.2 gtable_0.3.1
## [31] gargle_1.2.1 zlibbioc_1.42.0 XVector_0.36.0
## [34] GetoptLong_1.0.5 DelayedArray_0.22.0 car_3.1-1
## [37] shape_1.4.6 abind_1.4-5 scales_1.2.1
## [40] DBI_1.1.3 rstatix_0.7.1 Rcpp_1.0.9
## [43] xtable_1.8-4 clue_0.3-62 bit_4.0.4
## [46] httr_1.4.4 RColorBrewer_1.1-3 ellipsis_0.3.2
## [49] pkgconfig_2.0.3 reshape_0.8.9 XML_3.99-0.12
## [52] farver_2.1.1 sass_0.4.5 dbplyr_2.2.1
## [55] locfit_1.5-9.6 utf8_1.2.3 tidyselect_1.2.0
## [58] labeling_0.4.2 rlang_1.0.6 AnnotationDbi_1.58.0
## [61] munsell_0.5.0 cellranger_1.1.0 tools_4.2.2
## [64] cachem_1.0.6 cli_3.6.0 generics_0.1.3
## [67] RSQLite_2.2.18 broom_1.0.1 evaluate_0.20
## [70] fastmap_1.1.0 yaml_2.3.7 knitr_1.42
## [73] bit64_4.0.5 fs_1.6.0 KEGGREST_1.36.3
## [76] nlme_3.1-160 compiler_4.2.2 rstudioapi_0.14
## [79] png_0.1-7 ggsignif_0.6.4 reprex_2.0.2
## [82] geneplotter_1.74.0 bslib_0.4.2 stringi_1.7.12
## [85] highr_0.10 lattice_0.20-45 vctrs_0.5.2
## [88] pillar_1.8.1 lifecycle_1.0.3 jquerylib_0.1.4
## [91] GlobalOptions_0.1.2 bitops_1.0-7 R6_2.5.1
## [94] codetools_0.2-18 assertthat_0.2.1 rjson_0.2.21
## [97] withr_2.5.0 GenomeInfoDbData_1.2.8 mgcv_1.8-41
## [100] hms_1.1.2 rmarkdown_2.20 carData_3.0-5
## [103] googledrive_2.0.0 lubridate_1.9.0