# -*- tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
# vi: set ts=2 noet:
#
# (c) Copyright Rosetta Commons Member Institutions.
# (c) This file is part of the Rosetta software suite and is made available under license.
# (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
# (c) For more information, see http://www.rosettacommons.org. Questions about this can be
# (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.

## @file test/scientific/rotamer_recovery_analysis.R

# Load the packages this script depends on.
#
# require() returns FALSE (after a warning) instead of stopping when a package
# is missing, so the individual results are AND-ed together: loaded_ok stays
# TRUE only if every package was attached. The elementwise `&` is used on
# purpose instead of `&&`: it does not short-circuit, so every package is still
# attempted after the first failure and each missing one emits its own warning.
#
# NOTE(review): the Design package appears to have been superseded by rms on
# CRAN -- confirm which one the test environment provides.
loaded_ok <- TRUE
loaded_ok <- loaded_ok & require(plyr)
loaded_ok <- loaded_ok & require(reshape)
loaded_ok <- loaded_ok & require(ggplot2)
loaded_ok <- loaded_ok & require(Design)
if(!loaded_ok){
	# Best-effort hint only: the script keeps going, and will fail later at the
	# first call into whichever package is missing.
	print("A required package was not found. To install R packages, open R and run:")
	print("   install.packages('package_name_to_install')")
}


########## I/O files and Parameters #############
# Path of the rotamer recovery results table (comma separated, with a header
# row); this is the same path the script hands to read.csv.
match_counts_fname <- "outputs/rotamer_recovery_results_with_resolutions.out"

# Text report that the summary / anova / coefficient printout is sunk into.
results_fname <- "results.out"

# z-score threshold for the test.
# NOTE(review): not referenced anywhere in the visible script -- confirm
# whether an external consumer still needs it.
test_threshold_z_score <- 0.2
#################################################

# Standard score of x relative to a distribution with mean mu and standard
# deviation sigma, i.e. (x - mu) / sigma. Built from plain arithmetic, so it
# follows R's usual vector recycling, and NA or a zero sigma propagate as
# NA / Inf / NaN rather than raising an error.
z_score <- function(x, mu, sigma) {
	deviation <- x - mu
	deviation / sigma
}

# Load the per-residue rotamer recovery records from the configured input file.
# stringsAsFactors is spelled out because the rest of the script calls levels()
# on nativeResName: since R 4.0 read.csv no longer converts strings to factors
# by default, so relying on the default silently breaks the prediction grid.
results <- read.csv(match_counts_fname,
	header=TRUE,
	sep=",",
	strip.white=TRUE,
	stringsAsFactors=TRUE)

#filter for low crystallographic B-Factor Residues
# which() keeps only rows where the comparison is TRUE. Indexing with the raw
# logical vector would turn every row whose pdbBFactor is NA into a phantom
# row of all-NA values instead of dropping it.
results <- results[which(results$pdbBFactor <= 30),]

# nDOFs = how many of the four native chi angle columns are non-missing for
# the row (an integer in 0..4).
results <- transform(results, nDOFs = as.integer(!is.na(nativeChi1)) +
	as.integer(!is.na(nativeChi2)) +
	as.integer(!is.na(nativeChi3)) +
	as.integer(!is.na(nativeChi4)))


# Build a model
# Logistic regression of rotamerMatch on the residue type and the number of
# neighbors. x=TRUE, y=TRUE ask lrm to store the design matrix and the response
# in the fit object.
rr.model <- lrm(rotamerMatch ~ nativeResName + nativeNumNeighbors,
	data=results, x=TRUE, y=TRUE )

# An example of an alternative model
#rr.model <- lrm(rotamerMatch ~ nativeResName + nativeNumNeighbors + pdbBFactor*nativeResolution, data=results, x=TRUE, y=TRUE )

# Prediction grid: every residue type crossed with the default quantiles
# (0/25/50/75/100%) of the neighbor count.
#  - na.rm=TRUE: quantile() stops with an error on missing values otherwise.
#  - levels(factor(...)): works whether the column was read as a factor or as
#    plain character strings (levels() alone returns NULL for the latter), and
#    lists only residue types actually present after filtering.
rr.predictions <- expand.grid(
	nativeNumNeighbors=quantile(results$nativeNumNeighbors, na.rm=TRUE),
	nativeResName=levels(factor(results$nativeResName)))

# type="fitted" requests predictions on the probability scale.
rr.predictions$rotamerMatch <- predict(rr.model, rr.predictions, type="fitted")


# Write the text report. Everything printed between sink(results_fname) and
# the closing sink() goes to the report file instead of the console.
# The body is wrapped in tryCatch(..., finally = sink()) so the diversion is
# always removed, even if one of the print calls fails; otherwise a failure
# while the script is source()d would leave all later console output
# redirected into the report file. Errors are not swallowed: they still
# propagate after the sink has been closed.
sink(results_fname)
tryCatch({

	# summary of data
	cat("Observations Counts Summary:\n")
	print(xtabs( ~ results$nativeResName + results$rotamerMatch ))
	cat("\n\n\n")

	cat("Model Analysis of Variance:\n")
	print(anova(rr.model))
	cat("\n\n\n")

	cat("Model Coefficients:\n")
	print(coef(rr.model))
	cat("\n\n\n")

	#cat("Model Cross Validation:\n")
	#print(validate(rr.model))
	#cat("\n\n\n")

	#cat("Fast Backward Variable Selection:")
	#print(fastbw(rr.model))
	#cat("\n\n\n")

	cat("Predicted fraction recovery:\n")
	print(rr.predictions)

}, finally = sink())



# Residuals vs B-Factors
# Scatter plot of the squared model residual of each observation against its
# crystallographic B-factor, colored by residue type. The plot is printed to
# the active graphics device and also saved as a PDF.
# NOTE(review): opts(title=) belongs to older ggplot2 releases; newer ones
# expect ggtitle()/labs(title=) -- confirm the ggplot2 version in use.
d <- data.frame(
	residual = residuals(rr.model)^2,
	results[, c("pdbBFactor", "nativeResName")])

p <- ggplot(d, aes(x=pdbBFactor, y=residual, color=nativeResName)) +
	geom_point() +
	opts(title="Rotamer Recovery Logistic Regression Model\nResiduals by BFactors") +
	labs(x="BFactor", y="Residuals^2")
print(p)
ggsave(filename="residuals_vs_bfactors.pdf", plot=p)


# Residuals vs Resolution
# Scatter plot of the squared model residual of each observation against the
# resolution of its native structure, colored by residue type. The plot is
# printed to the active graphics device and also saved as a PDF.
# NOTE(review): opts(title=) belongs to older ggplot2 releases; newer ones
# expect ggtitle()/labs(title=) -- confirm the ggplot2 version in use.
d <- data.frame(
	residual = residuals(rr.model)^2,
	results[, c("nativeResName", "nativeResolution")])

p <- ggplot(d, aes(x=nativeResolution, y=residual, color=nativeResName)) +
	geom_point() +
	opts(title="Rotamer Recovery Logistic Regression Model\nResiduals by Resolution") +
	labs(x="Resolution", y="Residuals^2")
print(p)
ggsave(filename="residuals_vs_resolution.pdf", plot=p)


# Residuals vs Number of Degrees of Freedom
# Squared model residual of each observation against its number of chi-angle
# degrees of freedom (nDOFs), colored by residue type. The plot is printed to
# the active graphics device and also saved as a PDF.
d <- data.frame(residual = residuals(rr.model)^2,
                results[,c("nDOFs", "nativeResName", "nativeResolution")])

p <- ggplot(d, aes(x=nDOFs, y=residual, color=nativeResName))
# nDOFs takes only a handful of integer values, so points are jittered to
# reduce overplotting. geom_jitter() is itself a point layer; stacking it on
# top of geom_point() drew every observation twice (once at its exact position
# and once displaced), so only the jittered layer is kept.
p <- p + geom_jitter()
# NOTE(review): opts(title=) belongs to older ggplot2 releases; newer ones
# expect ggtitle()/labs(title=) -- confirm the ggplot2 version in use.
p <- p + opts(title="Rotamer Recovery Logistic Regression Model\nResiduals by Number of Degrees of Freedom")
p <- p + labs(x="Number of Degrees of Freedom", y="Residuals^2")
print(p)
ggsave(filename="residuals_vs_nDOFs.pdf",
       plot=p)
