diff --git a/R/plot_residualError_QualityScoreCovariate.R b/R/plot_residualError_QualityScoreCovariate.R index 06f03b8c8..2678e657d 100644 --- a/R/plot_residualError_QualityScoreCovariate.R +++ b/R/plot_residualError_QualityScoreCovariate.R @@ -4,6 +4,7 @@ args <- commandArgs(TRUE) input = args[1] Qcutoff = as.numeric(args[2]) +maxQ = as.numeric(args[3]) t=read.table(input, header=T) @@ -24,7 +25,7 @@ theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round( if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) { theTitle = paste("RMSE =", round(rmseAll,digits=3)); } -plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score") +plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score") points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16) points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16) points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16) @@ -41,7 +42,7 @@ hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0) hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0) percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases)) entropy = -sum(log2(percentBases)*percentBases) -plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,40), ylim=c(0,max(hst$e.nBases)), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n") +plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,max(hst$e.nBases)), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n") points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1") axis(2,axTicks(2), format(axTicks(2), scientific=F)) dev.off() @@ -54,7 +55,7 @@ outfile = paste(input, ".quality_rep_hist.pdf", sep="") pdf(outfile, height=7, width=7) hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0) hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0) -plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,40), ylim=c(0,max(hst$e.nBases)), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n") +plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,max(hst$e.nBases)), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n") points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1") axis(2,axTicks(2), format(axTicks(2), scientific=F)) dev.off() diff --git a/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java index 468d86075..7f2b94aa3 100755 --- a/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java +++ b/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java @@ -38,6 +38,9 @@ class AnalyzeCovariatesCLP extends CommandLineProgram { private int IGNORE_QSCORES_LESS_THAN = 5; @Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false) private int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups + @Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default is 40") + private int MAX_QUALITY_SCORE = 40; + ///////////////////////////// // Private Member Variables @@ -187,7 +190,7 @@ class AnalyzeCovariatesCLP extends CommandLineProgram { String readGroup = readGroupKey.toString(); RecalDatum readGroupDatum = (RecalDatum) dataManager.getCollapsedTable(0).data.get(readGroupKey); System.out.print("Writing out data tables for read group: " + readGroup + "\twith " + readGroupDatum.getNumObservations() + " observations" ); - System.out.println("\tand aggregate residual error = " + String.format("%.3f", readGroupDatum.empiricalQualDouble(0) - readGroupDatum.getEstimatedQReported())); + System.out.println("\tand aggregate residual error = " + String.format("%.3f", readGroupDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE) - readGroupDatum.getEstimatedQReported())); // for each covariate for( int iii = 1; iii < requestedCovariates.size(); iii++ ) { @@ -207,12 +210,12 @@ class AnalyzeCovariatesCLP extends CommandLineProgram { output.println("Covariate\tQreported\tQempirical\tnMismatches\tnBases"); for( Object covariateKey : ((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).keySet()) { - output.print( covariateKey.toString() + "\t" ); // Covariate + output.print( covariateKey.toString() + "\t" ); // Covariate RecalDatum thisDatum = (RecalDatum)((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).get(covariateKey); - output.print( String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t" ); // Qreported - output.print( String.format("%.3f", thisDatum.empiricalQualDouble(0)) + "\t" ); // Qempirical - output.print( thisDatum.getNumMismatches() + "\t" ); // nMismatches - output.println( thisDatum.getNumObservations() ); // nBases + output.print( String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t" ); // Qreported + output.print( String.format("%.3f", thisDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE)) + "\t" ); // Qempirical + output.print( thisDatum.getNumMismatches() + "\t" ); // nMismatches + output.println( thisDatum.getNumObservations() ); // nBases } // Close the PrintStream @@ -246,7 +249,7 @@ class AnalyzeCovariatesCLP extends CommandLineProgram { // Analyze reported quality p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_QualityScoreCovariate.R" + " " + OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " + - IGNORE_QSCORES_LESS_THAN); // The third argument is the Q scores that should be turned pink in the plot because they were ignored + IGNORE_QSCORES_LESS_THAN + " " + MAX_QUALITY_SCORE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored if(numReadGroups % 3 == 0) { // Don't want to spawn all the RScript jobs too quickly. So wait for this one to finish p.waitFor(); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index 53e8294c6..14565d141 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -133,24 +133,25 @@ public class RecalDataManager { * Loop over all the collapsed tables and turn the recalDatums found there into an empricial quality score * that will be used in the sequential calculation in TableRecalibrationWalker * @param smoothing The smoothing paramter that goes into empirical quality score calculation + * @param maxQual At which value to cap the quality scores */ - public final void generateEmpiricalQualities( final int smoothing ) { + public final void generateEmpiricalQualities( final int smoothing, final int maxQual ) { - recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.data, smoothing); - recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.data, smoothing); + recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.data, smoothing, maxQual); + recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.data, smoothing, maxQual); for( NestedHashMap map : dataCollapsedByCovariate ) { - recursivelyGenerateEmpiricalQualities(map.data, smoothing); + recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual); } } - private void recursivelyGenerateEmpiricalQualities( final Map data, final int smoothing ) { + private void recursivelyGenerateEmpiricalQualities( final Map data, final int smoothing, final int maxQual ) { for( Object comp : data.keySet() ) { final Object val = data.get(comp); if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps - ((RecalDatum)val).calcCombinedEmpiricalQuality(smoothing); + ((RecalDatum)val).calcCombinedEmpiricalQuality(smoothing, maxQual); } else { // Another layer in the nested hash map - recursivelyGenerateEmpiricalQualities( (Map) val, smoothing); + recursivelyGenerateEmpiricalQualities( (Map) val, smoothing, maxQual); } } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java index 7de53aeb6..adc352b1b 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java @@ -1,7 +1,5 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import org.broadinstitute.sting.utils.QualityUtils; - /* * Copyright (c) 2009 The Broad Institute * @@ -77,7 +75,7 @@ public class RecalDatum extends RecalDatumOptimized { final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); this.increment( other.numObservations, other.numMismatches ); this.estimatedQReported = -10 * Math.log10(sumErrors / (double)this.numObservations); - if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; } + //if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; } } //--------------------------------------------------------------------------------------------------------------- @@ -86,7 +84,9 @@ public class RecalDatum extends RecalDatumOptimized { // //--------------------------------------------------------------------------------------------------------------- - public final void calcCombinedEmpiricalQuality(final int smoothing){ this.empiricalQuality = empiricalQualDouble(smoothing); } + public final void calcCombinedEmpiricalQuality( final int smoothing, final int maxQual ) { + this.empiricalQuality = empiricalQualDouble(smoothing, maxQual); // cache the value so we don't call log over and over again + } //--------------------------------------------------------------------------------------------------------------- // diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java index 0807fef8b..b3bb58f61 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java @@ -96,19 +96,19 @@ public class RecalDatumOptimized { // //--------------------------------------------------------------------------------------------------------------- - public final double empiricalQualDouble( final int smoothing ) { + public final double empiricalQualDouble( final int smoothing, final double maxQual ) { final double doubleMismatches = (double) ( numMismatches + smoothing ); final double doubleObservations = (double) ( numObservations + smoothing ); double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); - if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) { empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; } + if (empiricalQual > maxQual) { empiricalQual = maxQual; } return empiricalQual; } - public final double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero + public final double empiricalQualDouble() { return empiricalQualDouble( 0, QualityUtils.MAX_REASONABLE_Q_SCORE ); } // 'default' behavior is to use smoothing value of zero public final byte empiricalQualByte( final int smoothing ) { final double doubleMismatches = (double) ( numMismatches + smoothing ); final double doubleObservations = (double) ( numObservations + smoothing ); - return QualityUtils.probToQual( 1.0 - doubleMismatches / doubleObservations ); + return QualityUtils.probToQual( 1.0 - doubleMismatches / doubleObservations ); // This is capped at Q40 } public final byte empiricalQualByte() { return empiricalQualByte( 0 ); } // 'default' behavior is to use smoothing value of zero diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 02fbc1eb3..7c57f9ded 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -77,6 +77,8 @@ public class TableRecalibrationWalker extends ReadWalker e = new HashMap(); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "c12b4fd5b4905cc632aa1da19be5c66a" ); + + for ( Map.Entry entry : e.entrySet() ) { + String bam = entry.getKey(); + String md5 = entry.getValue(); + String paramsFile = paramsFiles.get(bam); + System.out.printf("PARAMS FOR %s is %s%n", bam, paramsFile); + if ( paramsFile != null ) { + WalkerTestSpec spec = new WalkerTestSpec( + "-R " + oneKGLocation + "reference/human_b36_both.fasta" + + " -T TableRecalibration" + + " -I " + bam + + ( bam.equals( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" ) + ? " -L 1:10,800,000-10,810,000" : " -L 1:10,100,000-10,300,000" ) + + " -outputBam %s" + + " --no_pg_tag" + + " -maxQ 70" + + " --solid_recal_mode SET_Q_ZERO" + + " -recalFile " + paramsFile, + 1, // just one output file + Arrays.asList(md5)); + executeTest("testTableRecalibratorMaxQ70", spec); + } + } + } + @Test public void testCountCovariatesVCF() { HashMap e = new HashMap();