########################################################################
########################################################################
###
###
###
###
########################################################################
########################################################################


#if(('parallel' %in% names(sessionInfo()$'otherPkgs') & 'snowfall' %in% names(sessionInfo()$'otherPkgs'))){
#    message('edgeR already loaded')
#}else{
#    library(parallel) # Used for detecting number of available cores
#    library(snowfall) # Used for parallelizing apply functions
#}

### need to add two extreme points beyound the range of the ranked.test.labels to make sure the ROC curve is complete
calculate_ROC <- function(condition_positives, condition_negatives, ranked_test_labels, parallelization = F){
    c = 1
    condition_positives = as.character(condition_positives)
    condition_negatives = as.character(condition_negatives)
    ranked_test_labels = as.character(ranked_test_labels)
    FPR = numeric(length(ranked_test_labels))
    TPR = numeric(length(ranked_test_labels))
    if (parallelization){
        sfInit(parallel=TRUE, cpus=detectCores())
        sfExport('f_roc_onePoint','condition_positives','condition_negatives','ranked_test_labels')
        res <- as.data.frame( t( sfSapply( seq_along(ranked_test_labels),f_roc_onePoint, condition_positives, condition_negatives, ranked_test_labels ) ) )
        sfStop()
    }else if (!parallelization){
        res <- as.data.frame( t( sapply(seq_along(ranked_test_labels),f_roc_onePoint, condition_positives, condition_negatives, ranked_test_labels ) ) )
    }
    return(res)
}


## a function to calculate TPR and FPR at one postion of the ranked_test_labels

f_roc_onePoint <- function(ind,condition_positives, condition_negatives, ranked_test_labels){
    pos_test = ranked_test_labels[1:ind] # postitives identified by the test
    neg_test = ranked_test_labels[-(1:ind)] # negatives identified by the test
    # calculate the number of TP, FP, TN, FN
    TP = sum(pos_test %in% condition_positives)
    FP = sum(pos_test %in% condition_negatives) 
    TN = sum(neg_test %in% condition_negatives)
    FN = sum(neg_test %in% condition_positives)
    if(sum(TP,FP,TN,FN) != length(ranked_test_labels)) {warning('check contingency table')}
    if(sum(TP,FP,TN,FN) != length(condition_positives)+length(condition_negatives)) {warning('check contingency table')}
    if(TP+FP != length(pos_test)) {warning('check contingency table')}
    if(TN+FN != length(neg_test)) {warning('check contingency table')}
    if(TP+FN != length(condition_positives)) {warning('check contingency table')}
    if(FP+TN != length(condition_negatives)) {warning('check contingency table')}
    if(length(pos_test)+length(neg_test) != length(ranked_test_labels)) {warning('check contingency table')}
    ## FPR = FP/length(condition_negatives)
    ## TPR = TP/length(condition_positives)
    return(c(TP = TP, FP = FP, TN = TN, FN = FN))
  }

create_gold_standard <- function(simulated_matrix, p=.5){

    idx <- grep('DEG',names(simulated_matrix))
    n = length(idx)
    
    simulated_matrix$gold_standard = rowSums(simulated_matrix[,idx]) >= (n*p)
    return(simulated_matrix)

}

get_kmeans_pvals <- function(kmeans_list){

    pvals <- sapply(1:length(kmeans_list), function(x) kmeans_list[[x]]$Pathway$pval_adj)
    pvals <- data.frame(t(pvals))
    names(pvals) <- c('positive','negative')   
    pvals <- data.frame(positive = median(pvals$positive), 
			negative = median(pvals$negative))
    return(pvals)

}

get_ttest_enrichment <- function(t_test, cohort,GO_list, alternative='greater', FDR.method='BY' ){

    sim_matrix <- cohort$simulated_matrix
    genesets    <- cohort$geneset_name

    t_test[is.na(t_test)] <- F

    p = numeric(2) 

    for(i in 1:length(genesets)){
	
    	    geneset <- genesets[i]
	    idx <- which(row.names(sim_matrix) %in% GO_list[[geneset]])

	    a <- table(t_test[idx])
	    b <- table(t_test[-idx])

	    mat <- matrix(c(rev(a),rev(b)), ncol=2)
	 #   print(mat)
	    p[i]= fisher.test(mat, alternative=alternative)$p.value

    }

    p = p.adjust(p, method=FDR.method)

    return(p)
}

### the following loop calculates metrics
### for all the simulation results
library(parallel)
#load('/rsgrps/yves/samir/simulation_study2017/data/GO_list.RData')
go.bp <- read.delim2('/rsgrps/yves/Lab-Tools/GeneSets/GO/2015/go_bp_all.txt', stringsAsFactors=F)

## change go.bp into GO_list format
unique_pathid <- unique(go.bp$path_id)
GO_list <- list()

transform_to_list <- function(path_id){
	GO_list[[path_id]] <- go.bp$symbol[go.bp$path_id %in% path_id]
}
GO_list <- mclapply(unique_pathid, function(x)  transform_to_list(x), mc.cores=detectCores())
GO_list <- setNames(GO_list, unique_pathid)


start <- Sys.time()
get_results <- function(sim_number, GO_list, results_folder = results_folder, create_plots=F){

    print(sim_number)

    fname  <- paste('sim',sim_number, sep='')
    fname2 <- paste('/rsgrps/yves/samir/simulation_study2017/results/',results_folder,'/',fname,'.R',sep='')
    fname2a<- paste('/rsgrps/yves/samir/simulation_study2017/results/',results_folder,'2/',fname,'.R', sep='')
    fname3 <- paste('/rsgrps/yves/samir/simulation_study2017/code/pbs_shellscripts/iDEG.',sim_number,'.sh',sep='')

#    print(fname2)
#    print(fname3) 

    
    if(results_folder %in% 'full_results'){
    
	load(fname2); full_lista <- full_list; rm(full_list)
       	load(fname2a)
   	full_list <- append(full_list, full_lista); rm(full_lista)
    }

    load(fname2)

    bash_script <- readLines(fname3)
    bash_script <- bash_script[19]
    bash_script <- unlist(strsplit(bash_script,' '))

    params <- c(size=bash_script[5],N=bash_script[6],
		deg =bash_script[7],up=bash_script[8])


    t_test_pvals <- sapply(full_list, function(x) get_ttest_enrichment(x$T_test, x$Cohort,GO_list))
    t_test_pvals <- data.frame(t(t_test_pvals))
    names(t_test_pvals) <- c('positive','negative')

 #   t_test_pvals <- 1-t_test_pvals
    
    k_mean_pvals <- sapply(full_list, function(x) get_kmeans_pvals(x$KMeans))
    k_mean_pvals <- data.frame(t(k_mean_pvals))

    p_vec <- sort(unique(unlist(c(t_test_pvals[,1], 
			     t_test_pvals[,2], 
			     k_mean_pvals[,1], 
			     k_mean_pvals[,2]))), decreasing=F)
#    p_vec = round(p_vec, 2) 
#   p_vec = seq(0, 1 , .003)

   tt_recall <- tt_precision  <- numeric(length(p_vec))
   km_recall <- km_precision  <- numeric(length(p_vec))

    for(i in 1:length(p_vec)){

    p = p_vec[i]

    tt_fp  <- sum(t_test_pvals$negative < p)
    tt_fn  <- sum(t_test_pvals$positive > p)
    tt_tp  <- sum(t_test_pvals$positive < p)

    km_fp <- sum(k_mean_pvals$negative < p)
    km_fn <- sum(k_mean_pvals$positive > p)
    km_tp <- sum(k_mean_pvals$positive < p)


    tt_recall[i] <- tt_tp/(tt_tp + tt_fn)
    tt_precision[i] <- tt_tp/(tt_tp + tt_fp)
    
    ### clean up 0/0
    if(tt_tp + tt_fp ==0){
	tt_precision[i]=1
    	}

    km_recall[i] <- km_tp/(km_tp + km_fn)
    km_precision[i] <- km_tp/(km_tp + km_fp)

    if(km_tp + km_fp==0){
	km_precision[i]=1
    	}
    }

    km_recall[length(km_recall)+1] <- 1
    km_precision[length(km_precision)+1] <- 0

    tt_recall[length(tt_recall)+1] <-1
    tt_precision[length(tt_precision)+1] <- 0
    

#    param_legend <- c(paste('N=',params['N']),
#		      paste('Pathway Size=',params['size']),
#		      paste('% Responsive=',params['deg']),
#		      paste('% Up Regulated=', params['up']))

#    if(create_plots){
#            fname4 <- paste('/rsgrps/yves/samir/simulation_study2017/plots/sim_result_plots/',fname,'.pdf',sep='')
#	    pdf(fname4)
#	    boxplot(cbind(t_test_pvals, k_mean_pvals), main='P-values By Method', ylab='p-values', ylim=c(0,1.2))
#	    legend('topright', param_legend)
#	    dev.off()
#    }

 params <- data.frame( N = params['N'],
		 Size=params['size'],
		 Deg = params['deg'],
		 Up  = params['up'] )
 
    results <- data.frame(tt_recall = tt_recall, tt_precision=tt_precision, km_recall =km_recall, km_precision=km_precision)
    results$sim_number <- sim_number
    #idx1 <- order(tt_recall)
    #idx2 <- order(km_recall)
    
    #par(mfrow=c(3,3))
    #plot(tt_recall , tt_precision, type='l', xlab='Recall', ylab='Precision')
    #points(km_recall , km_precision, type='l', col='blue')

    #legend('topright', c('kMEn', 'Paired T'), col=c('blue','black'), lty=1)
    
    #t_name <- paste('N=',params$N, '; Size=',params$Size,'; % DEG=' ,  params$Deg, ';  %Up Responsive=',params$Up)
    #title(t_name)
    #detach(results)


    #results <- data.frame(cbind(t_test_pvals, k_mean_pvals))
    #names(results) <- c('t_test_positive','t_test_negative', 'kMEn_positive','kMEn_negative')

    return(list(results, params))
}


library(parallel)


#for(fname in fnames){
#	
#	fname2 = paste('/rsgrps/yves/samir/simulation_study2017/results/output2/', fname,'.R',sep='')
#	load(fname2)
#	
#	results <- mclapply(full_list, function(x) scores_replicates(x), mc.cores=detectCores())
#
#	results <- data.frame(do.call(rbind, results))
#	row.names(results) <- paste('replicate', 1:length(results[,1]))
#
#	write.csv(results, file=paste('/rsgrps/yves/samir/simulation_study2017/results/eval_results/',fname,'.csv',sep=''))
#	total_means[[fname]] <- colMeans(results)
#	rm(results)
#}

#sfInit(parallel=T, cpus=detectCores(), type='SOCK')
#sfExport('get_results','GO_list','get_kmeans_pvals', 'get_ttest_enrichment')

results_folder='full_results2'

aggregate_results <- function(results_folder){

	#require(snowfall)
	#sfInit(parallel=T, cpus = detectCores()-1, type='SOCK')
	#sfExport('results_folder','GO_list', 'get_results')
	full_results1 <- mclapply(1:16, function(x) get_results(sim_number=x, GO_list,
			results_folder=results_folder, 
			create_plots=T)[1],
			mc.cores=detectCores(),
			mc.cleanup=T,
			mc.preschedule=F) 
#sfStop()

	full_results2 <- mclapply(17:32, function(x) get_results(sim_number=x, GO_list,
			results_folder=results_folder, 
			create_plots=T)[1],
			mc.cores=detectCores(),
			mc.cleanup=T,
			mc.preschedule=F)


	full_results3 <- mclapply(33:48, function(x) get_results(sim_number=x, GO_list,	
			results_folder=results_folder, 
			create_plots=T)[1], 
			mc.cores=detectCores(),
			mc.cleanup=T,
			mc.preschedule=F)


	full_results4 <- mclapply(49:54, function(x) get_results(sim_number=x, GO_list,	
			results_folder=results_folder, 
			create_plots=T)[1],
			mc.cores=detectCores(),
			mc.cleanup=T,
			mc.preschedule=F)
#
	full_results <- c(full_results1, full_results2, full_results3, full_results4) 
	full_results <- do.call(rbind, do.call(rbind, full_results))
#``
	#sfStop()
	params <- mclapply(1:54, function(x) get_results(sim_number=x, GO_list, 
			results_folder='test')[2],
			mc.cores=detectCores(),
			mc.cleanup=T,
			mc.preschedule=F)


	return(list(full_results =full_results,params= params))
}

agg_results <- aggregate_results(results_folder)
full_results <- agg_results$full_results
params <- agg_results$params


library(lattice)
library(latticeExtra)
p1 <- xyplot(km_precision ~ km_recall | factor(sim_number), 
		data=full_results, type='l', col='blue', as.table=T, xlab='Recall',
		ylab='Precision', main='Precision - Recall Curves')
p2 <- xyplot(tt_precision ~ tt_recall | factor(sim_number), 
		data=full_results, type='l', col='red', as.table=T)

#pdf('../results/precision_recall.pdf')
#print(p1 + p2)
#dev.off()

params <- do.call(rbind, do.call(rbind, params))
write.csv(params, file='../results/params.csv', row.names=F)

########################
## calculate distance to 
## origin to computationally 
## assess each group
########################
library(data.table)


full_results <- data.table(full_results)
setkey(full_results, sim_number)
	
dist_to_corner = full_results[, list(max_ttest = max(sqrt(tt_recall^2 + tt_precision^2)),
			    max_kmeans= max(sqrt(km_recall^2 + km_precision^2))), by=sim_number]

dist_to_corner = cbind(dist_to_corner, params)

idx_ttest <- order(dist_to_corner$max_ttest, decreasing=T)
idx_kmean <- order(dist_to_corner$max_kmeans, decreasing=T)

dist_to_corner[idx_ttest]
dist_to_corner[idx_kmean]

write.csv(dist_to_corner, '../results/dist_to_corner.csv', row.names=F)
write.csv(full_results,   '../results/results.csv', row.names=F)


###########################
## calculate AUC for 
## precision-recall curves
###########################
library(caTools)
results_auc <- full_results[, list(auc_tt = trapz(tt_recall, tt_precision), 
				   auc_km = trapz(km_recall, km_precision)),
				by = sim_number]


idx_ttest2 <- order(results_auc$auc_tt, decreasing=T)
idx_kmen2 <- order(results_auc$auc_km, decreasing=T)

results_auc = cbind(results_auc, params)
write.csv(results_auc, '../results/results_auc.csv', row.names=F)


#N            <- sapply(1:54, function(x) full_results[[x]]$params['N'])
#deg          <- sapply(1:54, function(x) full_results[[x]]$params['deg'])
#size         <- sapply(1:54, function(x) full_results[[x]]$params['size'])
#up           <- sapply(1:54, function(x) full_results[[x]]$params['up'])
#full_results <- do.call(rbind, full_results)
#full_results <- data.frame(cbind(full_results,N, deg, size, up))
#write.csv(full_results, file='/rsgrps/yves/samir/simulation_study2017/results/full_results.csv')

#deg_idx <- unlist(lapply(1:54, function(x) unlist(full_results[[x]]$params)['deg'])) 

#deg_.05 <- which(deg_idx %in% '.05')
#deg_.10  <- which(deg_idx %in% '.10')
#deg_.25 <- which(deg_idx %in% '.25')


#t_test_counts.05 <- unlist(lapply(deg_.05, function(x) full_results[[x]]$t_test_preds))
#t_test_pvals.05 <- unlist(lapply(deg_.05, function(x) full_results[[x]]$t_test_pvals))
#k_mean_counts.05 <- unlist(lapply(deg_.05, function(x) full_results[[x]]$k_mean_preds))
#k_mean_pvals.05 <- unlist(lapply(deg_.05, function(x) full_results[[x]]$k_mean_pvals))


#t_test_counts.10 <- unlist(lapply(deg_.10, function(x) full_results[[x]]$t_test_preds))
#t_test_pvals.10 <- unlist(lapply(deg_.10, function(x) full_results[[x]]$t_test_pvals))
#k_mean_counts.10 <- unlist(lapply(deg_.10, function(x) full_results[[x]]$k_mean_preds))
#k_mean_pvals.10 <- unlist(lapply(deg_.10, function(x) full_results[[x]]$k_mean_pvals))


#t_test_counts.25 <- unlist(lapply(deg_.25, function(x) full_results[[x]]$t_test_preds))
#t_test_pvals.25 <- unlist(lapply(deg_.25, function(x) full_results[[x]]$t_test_pvals))
#k_mean_counts.25 <- unlist(lapply(deg_.25, function(x) full_results[[x]]$k_mean_preds))
#k_mean_pvals.25 <- unlist(lapply(deg_.25, function(x) full_results[[x]]$k_mean_pvals))


#pdf('/rsgrps/yves/samir/simulation_study2017/plots/DEG_plot.pdf')
#par(mfrow=c(2,2))
#boxplot(cbind(t_test_counts.05, t_test_counts.10, t_test_counts.25),ylab='Recall',  xlab='% Responsive', names=c('5%','10%','25%'), main='T-test Pathway Predictions')
#boxplot(cbind(k_mean_counts.05, k_mean_counts.10, k_mean_counts.25),ylab='Recall',  xlab='% Responsive', names=c('5%','10%','25%'),main='KMEns Pathway Prediction')


#boxplot(cbind(-log(t_test_pvals.05), t_test_pvals.10, t_test_pvals.25), ylab= '-Log(P-values)', xlab='% Responsive', names=c('5%','10%','25%'), main='T-test P-Values Distribution')
#boxplot(cbind(-log(k_mean_pvals.05), k_mean_pvals.10, k_mean_pvals.25), ylab= '-Log(P-values)', xlab='% Responsive', names=c('5%','10%','25%'), main='KMEns P-Values Distribution')
#dev.off()

#Sys.time() -start


#full_k_mean_preds <- unlist(lapply(1:54, function(x) full_results[[x]]$k_mean_preds)) 
#full_t_test_preds <- unlist(lapply(1:54, function(x) full_results[[x]]$t_test_preds))
#final_results_df <- data.frame(kMeans = full_k_mean_preds, 
#				T_test = full_t_test_preds, 
#				PopulationSize=N,
#				FractionResponsive=deg,
#				PathwaySize=size,
#				FractionUpRegulated=up)


#write.csv(final_results_df, file='/rsgrps/yves/samir/simulation_study2017/results/eval_results/final_results_matrix.csv', row.names=F)

#results = final_results_df 

#jpeg('../plots/T_test.png')
#par(mfrow=c(2,2))
#boxplot(jitter(results$T_test) ~ results$PopulationSize, xlab='N',ylab='Recall', main='T-test Predictions by Condition')
#boxplot(jitter(results$T_test) ~ results$FractionResponsive, xlab='% Responsive',ylab='Recall', main='T-test Predictions by Condition')
#boxplot(jitter(results$T_test) ~ results$PathwaySize, xlab='Pathway size',ylab='Recall', main='T-test Predictions by Condition')
#boxplot(jitter(results$T_test) ~ results$FractionUpRegulated, xlab='% Up Regulated',ylab='Recall', main='T-test Predictions by Condition')
#dev.off()

#jpeg('../plots/kMeans.png')
#par(mfrow=c(2,2))
#boxplot(jitter(results$kMeans) ~ results$PopulationSize, xlab='N',ylab='Recall', main='kMeans Predictions by Condition')
#boxplot(jitter(results$kMeans) ~ results$FractionResponsive, xlab='% Responsive',ylab='Recall', main='kMeans Predictions by Condition')
#boxplot(jitter(results$kMeans) ~ results$PathwaySize, xlab='Pathway size',ylab='Recall', main='kMeans Predictions by Condition')
#boxplot(jitter(results$kMeans) ~ results$FractionUpRegulated, xlab='% Up Regulated',ylab='', main='kMeans Predictions by Condition')
#dev.off()