Change to support plotting of indel quality as a function of covariates - for now, just call different R calling script.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5053 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
delangel 2011-01-22 14:09:23 +00:00
parent fa0c476b82
commit a50d7f74fa
2 changed files with 129 additions and 10 deletions

View File

@ -0,0 +1,108 @@
#!/bin/env Rscript
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
covariateName = args[2]
outfile = paste(input, ".indelQual_v_", covariateName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
c <- read.table(input, header=T)
c <- c[sort.list(c[,1]),]
#
# Plot qual as a function of the covariate
#
d.good <- c[c$nBases >= 1000,]
d.1000 <- c[c$nBases < 1000,]
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) )
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
if( length(d.good$nBases) == length(c$nBases) ) {
theTitle = paste("RMSE =", round(rmseAll,digits=3))
}
# Don't let residual error go off the edge of the plot
d.good$residualError = d.good$Qempirical#-d.good$Qreported
#d.good$residualError[which(d.good$residualError > 10)] = 10
#d.good$residualError[which(d.good$residualError < -10)] = -10
d.1000$residualError = d.1000$Qempirical#-d.1000$Qreported
#d.1000$residualError[which(d.1000$residualError > 10)] = 10
#d.1000$residualError[which(d.1000$residualError < -10)] = -10
c$residualError = c$Qempirical
#c$residualError[which(c$residualError > 10)] = 10
#c$residualError[which(c$residualError < -10)] = -10
pointType = "p"
if( length(c$Covariate) <= 20 ) {
pointType = "o"
}
if( is.numeric(c$Covariate) ) {
plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical Indel Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-0, 50), xlim=c(min(c$Covariate),max(c$Covariate)))
points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20)
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical Indel Quality", xlab=covariateName, col="blue", ylim=c(0, 50))
points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue")
}
dev.off()
#
# Plot mean quality versus the covariate
#
outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
pointType = "p"
if( length(c$Covariate) <= 20 ) {
pointType = "o"
}
theTitle = paste("Quality By", covariateName);
if( is.numeric(c$Covariate) ) {
plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate)))
points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20)
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40))
points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue")
}
dev.off()
#
# Plot histogram of the covariate
#
e = d.good
f = d.1000
outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0)
lwdSize=2
if( length(c$Covariate) <= 20 ) {
lwdSize=7
} else if( length(c$Covariate) <= 70 ) {
lwdSize=4
}
if( is.numeric(c$Covariate) ) {
if( length(hst$e.Covariate) == 0 ) {
plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
} else {
plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue")
}
axis(2,axTicks(2), format(axTicks(2), scientific=F))
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n")
if( length(hst$c.Covariate) > 9 ) {
axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
} else {
axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate)
}
axis(2,axTicks(2), format(axTicks(2), scientific=F))
}
dev.off()

View File

@ -70,6 +70,8 @@ class AnalyzeCovariatesCLP extends CommandLineProgram {
private int MAX_QUALITY_SCORE = 50;
@Argument(fullName="max_histogram_value", shortName="maxHist", required = false, doc="If supplied, this value will be the max value of the histogram plots")
private int MAX_HISTOGRAM_VALUE = 0;
@Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, this value will be the max value of the histogram plots")
private boolean DO_INDEL_QUALITY = false;
/////////////////////////////
@ -274,17 +276,26 @@ class AnalyzeCovariatesCLP extends CommandLineProgram {
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
Covariate cov = requestedCovariates.get(iii);
try {
if (DO_INDEL_QUALITY) {
p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_indelQuality.R" + " " +
OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
p.waitFor();
} else {
if( iii == 1 ) {
// Analyze reported quality
p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_QualityScoreCovariate.R" + " " +
OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
IGNORE_QSCORES_LESS_THAN + " " + MAX_QUALITY_SCORE + " " + MAX_HISTOGRAM_VALUE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored
p.waitFor();
} else { // Analyze all other covariates
p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_OtherCovariate.R" + " " +
OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
p.waitFor();
// Analyze reported quality
p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_QualityScoreCovariate.R" + " " +
OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
IGNORE_QSCORES_LESS_THAN + " " + MAX_QUALITY_SCORE + " " + MAX_HISTOGRAM_VALUE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored
p.waitFor();
} else { // Analyze all other covariates
p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_OtherCovariate.R" + " " +
OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
p.waitFor();
}
}
} catch (InterruptedException e) {
e.printStackTrace();