Added an option to AnalyzeCovariates to set the max value of the histograms to make them easier to directly compare.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2753 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-01-31 23:13:57 +00:00
parent f6da57dc79
commit 64fc76e4bf
2 changed files with 14 additions and 3 deletions

View File

@ -5,6 +5,7 @@ args <- commandArgs(TRUE)
input = args[1]
Qcutoff = as.numeric(args[2])
maxQ = as.numeric(args[3])
maxHist = as.numeric(args[4])
t=read.table(input, header=T)
@ -42,7 +43,11 @@ hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0)
percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases))
entropy = -sum(log2(percentBases)*percentBases)
plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,max(hst$e.nBases)), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n")
yMax = max(hst$e.nBases)
if(maxHist != 0) {
yMax = maxHist
}
plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n")
points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()
@ -55,7 +60,11 @@ outfile = paste(input, ".quality_rep_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0)
plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,max(hst$e.nBases)), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n")
yMax = max(hst$e.nBases)
if(maxHist != 0) {
yMax = maxHist
}
plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n")
points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()

View File

@ -65,6 +65,8 @@ class AnalyzeCovariatesCLP extends CommandLineProgram {
private int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default is 40")
private int MAX_QUALITY_SCORE = 40;
@Argument(fullName="max_histogram_value", shortName="maxHist", required = false, doc="If supplied, this value will be the max value of the histogram plots")
private int MAX_HISTOGRAM_VALUE = 0;
/////////////////////////////
@ -274,7 +276,7 @@ class AnalyzeCovariatesCLP extends CommandLineProgram {
// Analyze reported quality
p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_QualityScoreCovariate.R" + " " +
OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
IGNORE_QSCORES_LESS_THAN + " " + MAX_QUALITY_SCORE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored
IGNORE_QSCORES_LESS_THAN + " " + MAX_QUALITY_SCORE + " " + MAX_HISTOGRAM_VALUE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored
if(numReadGroups % 3 == 0) { // Don't want to spawn all the RScript jobs too quickly. So wait for this one to finish
p.waitFor();
}