######################################################################## ######################################################################## ### ### ### ### ######################################################################## ######################################################################## #if(('parallel' %in% names(sessionInfo()$'otherPkgs') & 'snowfall' %in% names(sessionInfo()$'otherPkgs'))){ # message('edgeR already loaded') #}else{ # library(parallel) # Used for detecting number of available cores # library(snowfall) # Used for parallelizing apply functions #} ### need to add two extreme points beyound the range of the ranked.test.labels to make sure the ROC curve is complete calculate_ROC <- function(condition_positives, condition_negatives, ranked_test_labels, parallelization = F){ c = 1 condition_positives = as.character(condition_positives) condition_negatives = as.character(condition_negatives) ranked_test_labels = as.character(ranked_test_labels) FPR = numeric(length(ranked_test_labels)) TPR = numeric(length(ranked_test_labels)) if (parallelization){ sfInit(parallel=TRUE, cpus=detectCores()) sfExport('f_roc_onePoint','condition_positives','condition_negatives','ranked_test_labels') res <- as.data.frame( t( sfSapply( seq_along(ranked_test_labels),f_roc_onePoint, condition_positives, condition_negatives, ranked_test_labels ) ) ) sfStop() }else if (!parallelization){ res <- as.data.frame( t( sapply(seq_along(ranked_test_labels),f_roc_onePoint, condition_positives, condition_negatives, ranked_test_labels ) ) ) } return(res) } ## a function to calculate TPR and FPR at one postion of the ranked_test_labels f_roc_onePoint <- function(ind,condition_positives, condition_negatives, ranked_test_labels){ pos_test = ranked_test_labels[1:ind] # postitives identified by the test neg_test = ranked_test_labels[-(1:ind)] # negatives identified by the test # calculate the number of TP, FP, TN, FN TP = sum(pos_test %in% condition_positives) FP = sum(pos_test %in% condition_negatives) TN = sum(neg_test %in% condition_negatives) FN = sum(neg_test %in% condition_positives) if(sum(TP,FP,TN,FN) != length(ranked_test_labels)) {warning('check contingency table')} if(sum(TP,FP,TN,FN) != length(condition_positives)+length(condition_negatives)) {warning('check contingency table')} if(TP+FP != length(pos_test)) {warning('check contingency table')} if(TN+FN != length(neg_test)) {warning('check contingency table')} if(TP+FN != length(condition_positives)) {warning('check contingency table')} if(FP+TN != length(condition_negatives)) {warning('check contingency table')} if(length(pos_test)+length(neg_test) != length(ranked_test_labels)) {warning('check contingency table')} ## FPR = FP/length(condition_negatives) ## TPR = TP/length(condition_positives) return(c(TP = TP, FP = FP, TN = TN, FN = FN)) } create_gold_standard <- function(simulated_matrix, p=.5){ idx <- grep('DEG',names(simulated_matrix)) n = length(idx) simulated_matrix$gold_standard = rowSums(simulated_matrix[,idx]) >= (n*p) return(simulated_matrix) } get_kmeans_pvals <- function(kmeans_list){ pvals <- sapply(1:length(kmeans_list), function(x) kmeans_list[[x]]$Pathway$pval_adj) pvals <- data.frame(t(pvals)) names(pvals) <- c('positive','negative') pvals <- data.frame(positive = median(pvals$positive), negative = median(pvals$negative)) return(pvals) } get_ttest_enrichment <- function(t_test, cohort,GO_list, alternative='greater', FDR.method='BY' ){ sim_matrix <- cohort$simulated_matrix genesets <- cohort$geneset_name t_test[is.na(t_test)] <- F p = numeric(2) for(i in 1:length(genesets)){ geneset <- genesets[i] idx <- which(row.names(sim_matrix) %in% GO_list[[geneset]]) a <- table(t_test[idx]) b <- table(t_test[-idx]) mat <- matrix(c(rev(a),rev(b)), ncol=2) # print(mat) p[i]= fisher.test(mat, alternative=alternative)$p.value } p = p.adjust(p, method=FDR.method) return(p) } ### the following loop calculates metrics ### for all the simulation results library(parallel) #load('/rsgrps/yves/samir/simulation_study2017/data/GO_list.RData') go.bp <- read.delim2('/rsgrps/yves/Lab-Tools/GeneSets/GO/2015/go_bp_all.txt', stringsAsFactors=F) ## change go.bp into GO_list format unique_pathid <- unique(go.bp$path_id) GO_list <- list() transform_to_list <- function(path_id){ GO_list[[path_id]] <- go.bp$symbol[go.bp$path_id %in% path_id] } GO_list <- mclapply(unique_pathid, function(x) transform_to_list(x), mc.cores=detectCores()) GO_list <- setNames(GO_list, unique_pathid) start <- Sys.time() get_results <- function(sim_number, GO_list, results_folder = results_folder, create_plots=F){ print(sim_number) fname <- paste('sim',sim_number, sep='') fname2 <- paste('/rsgrps/yves/samir/simulation_study2017/results/',results_folder,'/',fname,'.R',sep='') fname2a<- paste('/rsgrps/yves/samir/simulation_study2017/results/',results_folder,'2/',fname,'.R', sep='') fname3 <- paste('/rsgrps/yves/samir/simulation_study2017/code/pbs_shellscripts/iDEG.',sim_number,'.sh',sep='') # print(fname2) # print(fname3) if(results_folder %in% 'full_results'){ load(fname2); full_lista <- full_list; rm(full_list) load(fname2a) full_list <- append(full_list, full_lista); rm(full_lista) } load(fname2) bash_script <- readLines(fname3) bash_script <- bash_script[19] bash_script <- unlist(strsplit(bash_script,' ')) params <- c(size=bash_script[5],N=bash_script[6], deg =bash_script[7],up=bash_script[8]) t_test_pvals <- sapply(full_list, function(x) get_ttest_enrichment(x$T_test, x$Cohort,GO_list)) t_test_pvals <- data.frame(t(t_test_pvals)) names(t_test_pvals) <- c('positive','negative') # t_test_pvals <- 1-t_test_pvals k_mean_pvals <- sapply(full_list, function(x) get_kmeans_pvals(x$KMeans)) k_mean_pvals <- data.frame(t(k_mean_pvals)) p_vec <- sort(unique(unlist(c(t_test_pvals[,1], t_test_pvals[,2], k_mean_pvals[,1], k_mean_pvals[,2]))), decreasing=F) # p_vec = round(p_vec, 2) # p_vec = seq(0, 1 , .003) tt_recall <- tt_precision <- numeric(length(p_vec)) km_recall <- km_precision <- numeric(length(p_vec)) for(i in 1:length(p_vec)){ p = p_vec[i] tt_fp <- sum(t_test_pvals$negative < p) tt_fn <- sum(t_test_pvals$positive > p) tt_tp <- sum(t_test_pvals$positive < p) km_fp <- sum(k_mean_pvals$negative < p) km_fn <- sum(k_mean_pvals$positive > p) km_tp <- sum(k_mean_pvals$positive < p) tt_recall[i] <- tt_tp/(tt_tp + tt_fn) tt_precision[i] <- tt_tp/(tt_tp + tt_fp) ### clean up 0/0 if(tt_tp + tt_fp ==0){ tt_precision[i]=1 } km_recall[i] <- km_tp/(km_tp + km_fn) km_precision[i] <- km_tp/(km_tp + km_fp) if(km_tp + km_fp==0){ km_precision[i]=1 } } km_recall[length(km_recall)+1] <- 1 km_precision[length(km_precision)+1] <- 0 tt_recall[length(tt_recall)+1] <-1 tt_precision[length(tt_precision)+1] <- 0 # param_legend <- c(paste('N=',params['N']), # paste('Pathway Size=',params['size']), # paste('% Responsive=',params['deg']), # paste('% Up Regulated=', params['up'])) # if(create_plots){ # fname4 <- paste('/rsgrps/yves/samir/simulation_study2017/plots/sim_result_plots/',fname,'.pdf',sep='') # pdf(fname4) # boxplot(cbind(t_test_pvals, k_mean_pvals), main='P-values By Method', ylab='p-values', ylim=c(0,1.2)) # legend('topright', param_legend) # dev.off() # } params <- data.frame( N = params['N'], Size=params['size'], Deg = params['deg'], Up = params['up'] ) results <- data.frame(tt_recall = tt_recall, tt_precision=tt_precision, km_recall =km_recall, km_precision=km_precision) results$sim_number <- sim_number #idx1 <- order(tt_recall) #idx2 <- order(km_recall) #par(mfrow=c(3,3)) #plot(tt_recall , tt_precision, type='l', xlab='Recall', ylab='Precision') #points(km_recall , km_precision, type='l', col='blue') #legend('topright', c('kMEn', 'Paired T'), col=c('blue','black'), lty=1) #t_name <- paste('N=',params$N, '; Size=',params$Size,'; % DEG=' , params$Deg, '; %Up Responsive=',params$Up) #title(t_name) #detach(results) #results <- data.frame(cbind(t_test_pvals, k_mean_pvals)) #names(results) <- c('t_test_positive','t_test_negative', 'kMEn_positive','kMEn_negative') return(list(results, params)) } library(parallel) #for(fname in fnames){ # # fname2 = paste('/rsgrps/yves/samir/simulation_study2017/results/output2/', fname,'.R',sep='') # load(fname2) # # results <- mclapply(full_list, function(x) scores_replicates(x), mc.cores=detectCores()) # # results <- data.frame(do.call(rbind, results)) # row.names(results) <- paste('replicate', 1:length(results[,1])) # # write.csv(results, file=paste('/rsgrps/yves/samir/simulation_study2017/results/eval_results/',fname,'.csv',sep='')) # total_means[[fname]] <- colMeans(results) # rm(results) #} #sfInit(parallel=T, cpus=detectCores(), type='SOCK') #sfExport('get_results','GO_list','get_kmeans_pvals', 'get_ttest_enrichment') results_folder='full_results2' aggregate_results <- function(results_folder){ #require(snowfall) #sfInit(parallel=T, cpus = detectCores()-1, type='SOCK') #sfExport('results_folder','GO_list', 'get_results') full_results1 <- mclapply(1:16, function(x) get_results(sim_number=x, GO_list, results_folder=results_folder, create_plots=T)[1], mc.cores=detectCores(), mc.cleanup=T, mc.preschedule=F) #sfStop() full_results2 <- mclapply(17:32, function(x) get_results(sim_number=x, GO_list, results_folder=results_folder, create_plots=T)[1], mc.cores=detectCores(), mc.cleanup=T, mc.preschedule=F) full_results3 <- mclapply(33:48, function(x) get_results(sim_number=x, GO_list, results_folder=results_folder, create_plots=T)[1], mc.cores=detectCores(), mc.cleanup=T, mc.preschedule=F) full_results4 <- mclapply(49:54, function(x) get_results(sim_number=x, GO_list, results_folder=results_folder, create_plots=T)[1], mc.cores=detectCores(), mc.cleanup=T, mc.preschedule=F) # full_results <- c(full_results1, full_results2, full_results3, full_results4) full_results <- do.call(rbind, do.call(rbind, full_results)) #`` #sfStop() params <- mclapply(1:54, function(x) get_results(sim_number=x, GO_list, results_folder='test')[2], mc.cores=detectCores(), mc.cleanup=T, mc.preschedule=F) return(list(full_results =full_results,params= params)) } agg_results <- aggregate_results(results_folder) full_results <- agg_results$full_results params <- agg_results$params library(lattice) library(latticeExtra) p1 <- xyplot(km_precision ~ km_recall | factor(sim_number), data=full_results, type='l', col='blue', as.table=T, xlab='Recall', ylab='Precision', main='Precision - Recall Curves') p2 <- xyplot(tt_precision ~ tt_recall | factor(sim_number), data=full_results, type='l', col='red', as.table=T) #pdf('../results/precision_recall.pdf') #print(p1 + p2) #dev.off() params <- do.call(rbind, do.call(rbind, params)) write.csv(params, file='../results/params.csv', row.names=F) ######################## ## calculate distance to ## origin to computationally ## assess each group ######################## library(data.table) full_results <- data.table(full_results) setkey(full_results, sim_number) dist_to_corner = full_results[, list(max_ttest = max(sqrt(tt_recall^2 + tt_precision^2)), max_kmeans= max(sqrt(km_recall^2 + km_precision^2))), by=sim_number] dist_to_corner = cbind(dist_to_corner, params) idx_ttest <- order(dist_to_corner$max_ttest, decreasing=T) idx_kmean <- order(dist_to_corner$max_kmeans, decreasing=T) dist_to_corner[idx_ttest] dist_to_corner[idx_kmean] write.csv(dist_to_corner, '../results/dist_to_corner.csv', row.names=F) write.csv(full_results, '../results/results.csv', row.names=F) ########################### ## calculate AUC for ## precision-recall curves ########################### library(caTools) results_auc <- full_results[, list(auc_tt = trapz(tt_recall, tt_precision), auc_km = trapz(km_recall, km_precision)), by = sim_number] idx_ttest2 <- order(results_auc$auc_tt, decreasing=T) idx_kmen2 <- order(results_auc$auc_km, decreasing=T) results_auc = cbind(results_auc, params) write.csv(results_auc, '../results/results_auc.csv', row.names=F) #N <- sapply(1:54, function(x) full_results[[x]]$params['N']) #deg <- sapply(1:54, function(x) full_results[[x]]$params['deg']) #size <- sapply(1:54, function(x) full_results[[x]]$params['size']) #up <- sapply(1:54, function(x) full_results[[x]]$params['up']) #full_results <- do.call(rbind, full_results) #full_results <- data.frame(cbind(full_results,N, deg, size, up)) #write.csv(full_results, file='/rsgrps/yves/samir/simulation_study2017/results/full_results.csv') #deg_idx <- unlist(lapply(1:54, function(x) unlist(full_results[[x]]$params)['deg'])) #deg_.05 <- which(deg_idx %in% '.05') #deg_.10 <- which(deg_idx %in% '.10') #deg_.25 <- which(deg_idx %in% '.25') #t_test_counts.05 <- unlist(lapply(deg_.05, function(x) full_results[[x]]$t_test_preds)) #t_test_pvals.05 <- unlist(lapply(deg_.05, function(x) full_results[[x]]$t_test_pvals)) #k_mean_counts.05 <- unlist(lapply(deg_.05, function(x) full_results[[x]]$k_mean_preds)) #k_mean_pvals.05 <- unlist(lapply(deg_.05, function(x) full_results[[x]]$k_mean_pvals)) #t_test_counts.10 <- unlist(lapply(deg_.10, function(x) full_results[[x]]$t_test_preds)) #t_test_pvals.10 <- unlist(lapply(deg_.10, function(x) full_results[[x]]$t_test_pvals)) #k_mean_counts.10 <- unlist(lapply(deg_.10, function(x) full_results[[x]]$k_mean_preds)) #k_mean_pvals.10 <- unlist(lapply(deg_.10, function(x) full_results[[x]]$k_mean_pvals)) #t_test_counts.25 <- unlist(lapply(deg_.25, function(x) full_results[[x]]$t_test_preds)) #t_test_pvals.25 <- unlist(lapply(deg_.25, function(x) full_results[[x]]$t_test_pvals)) #k_mean_counts.25 <- unlist(lapply(deg_.25, function(x) full_results[[x]]$k_mean_preds)) #k_mean_pvals.25 <- unlist(lapply(deg_.25, function(x) full_results[[x]]$k_mean_pvals)) #pdf('/rsgrps/yves/samir/simulation_study2017/plots/DEG_plot.pdf') #par(mfrow=c(2,2)) #boxplot(cbind(t_test_counts.05, t_test_counts.10, t_test_counts.25),ylab='Recall', xlab='% Responsive', names=c('5%','10%','25%'), main='T-test Pathway Predictions') #boxplot(cbind(k_mean_counts.05, k_mean_counts.10, k_mean_counts.25),ylab='Recall', xlab='% Responsive', names=c('5%','10%','25%'),main='KMEns Pathway Prediction') #boxplot(cbind(-log(t_test_pvals.05), t_test_pvals.10, t_test_pvals.25), ylab= '-Log(P-values)', xlab='% Responsive', names=c('5%','10%','25%'), main='T-test P-Values Distribution') #boxplot(cbind(-log(k_mean_pvals.05), k_mean_pvals.10, k_mean_pvals.25), ylab= '-Log(P-values)', xlab='% Responsive', names=c('5%','10%','25%'), main='KMEns P-Values Distribution') #dev.off() #Sys.time() -start #full_k_mean_preds <- unlist(lapply(1:54, function(x) full_results[[x]]$k_mean_preds)) #full_t_test_preds <- unlist(lapply(1:54, function(x) full_results[[x]]$t_test_preds)) #final_results_df <- data.frame(kMeans = full_k_mean_preds, # T_test = full_t_test_preds, # PopulationSize=N, # FractionResponsive=deg, # PathwaySize=size, # FractionUpRegulated=up) #write.csv(final_results_df, file='/rsgrps/yves/samir/simulation_study2017/results/eval_results/final_results_matrix.csv', row.names=F) #results = final_results_df #jpeg('../plots/T_test.png') #par(mfrow=c(2,2)) #boxplot(jitter(results$T_test) ~ results$PopulationSize, xlab='N',ylab='Recall', main='T-test Predictions by Condition') #boxplot(jitter(results$T_test) ~ results$FractionResponsive, xlab='% Responsive',ylab='Recall', main='T-test Predictions by Condition') #boxplot(jitter(results$T_test) ~ results$PathwaySize, xlab='Pathway size',ylab='Recall', main='T-test Predictions by Condition') #boxplot(jitter(results$T_test) ~ results$FractionUpRegulated, xlab='% Up Regulated',ylab='Recall', main='T-test Predictions by Condition') #dev.off() #jpeg('../plots/kMeans.png') #par(mfrow=c(2,2)) #boxplot(jitter(results$kMeans) ~ results$PopulationSize, xlab='N',ylab='Recall', main='kMeans Predictions by Condition') #boxplot(jitter(results$kMeans) ~ results$FractionResponsive, xlab='% Responsive',ylab='Recall', main='kMeans Predictions by Condition') #boxplot(jitter(results$kMeans) ~ results$PathwaySize, xlab='Pathway size',ylab='Recall', main='kMeans Predictions by Condition') #boxplot(jitter(results$kMeans) ~ results$FractionUpRegulated, xlab='% Up Regulated',ylab='', main='kMeans Predictions by Condition') #dev.off()