Fixed an issue with recalibrating original quality scores above Q40. There is a new option -maxQ which sets the maximum possible quality score used when a RecalDatum computes its quality score from the mismatch rate. The same option was added to AnalyzeCovariates to help with plotting Q scores above Q40. Added an integration test which makes use of this new -maxQ option.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2534 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-01-07 13:50:30 +00:00
parent 6c739e30e0
commit cea544871d
7 changed files with 63 additions and 27 deletions

View File

@ -4,6 +4,7 @@ args <- commandArgs(TRUE)
input = args[1]
Qcutoff = as.numeric(args[2])
maxQ = as.numeric(args[3])
t=read.table(input, header=T)
@ -24,7 +25,7 @@ theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(
if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) {
theTitle = paste("RMSE =", round(rmseAll,digits=3));
}
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16)
points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
@ -41,7 +42,7 @@ hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0)
percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases))
entropy = -sum(log2(percentBases)*percentBases)
plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,40), ylim=c(0,max(hst$e.nBases)), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n")
plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,max(hst$e.nBases)), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n")
points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()
@ -54,7 +55,7 @@ outfile = paste(input, ".quality_rep_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0)
plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,40), ylim=c(0,max(hst$e.nBases)), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n")
plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,max(hst$e.nBases)), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n")
points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()

View File

@ -38,6 +38,9 @@ class AnalyzeCovariatesCLP extends CommandLineProgram {
private int IGNORE_QSCORES_LESS_THAN = 5;
@Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false)
private int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default is 40")
private int MAX_QUALITY_SCORE = 40;
/////////////////////////////
// Private Member Variables
@ -187,7 +190,7 @@ class AnalyzeCovariatesCLP extends CommandLineProgram {
String readGroup = readGroupKey.toString();
RecalDatum readGroupDatum = (RecalDatum) dataManager.getCollapsedTable(0).data.get(readGroupKey);
System.out.print("Writing out data tables for read group: " + readGroup + "\twith " + readGroupDatum.getNumObservations() + " observations" );
System.out.println("\tand aggregate residual error = " + String.format("%.3f", readGroupDatum.empiricalQualDouble(0) - readGroupDatum.getEstimatedQReported()));
System.out.println("\tand aggregate residual error = " + String.format("%.3f", readGroupDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE) - readGroupDatum.getEstimatedQReported()));
// for each covariate
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
@ -207,12 +210,12 @@ class AnalyzeCovariatesCLP extends CommandLineProgram {
output.println("Covariate\tQreported\tQempirical\tnMismatches\tnBases");
for( Object covariateKey : ((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).keySet()) {
output.print( covariateKey.toString() + "\t" ); // Covariate
output.print( covariateKey.toString() + "\t" ); // Covariate
RecalDatum thisDatum = (RecalDatum)((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).get(covariateKey);
output.print( String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t" ); // Qreported
output.print( String.format("%.3f", thisDatum.empiricalQualDouble(0)) + "\t" ); // Qempirical
output.print( thisDatum.getNumMismatches() + "\t" ); // nMismatches
output.println( thisDatum.getNumObservations() ); // nBases
output.print( String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t" ); // Qreported
output.print( String.format("%.3f", thisDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE)) + "\t" ); // Qempirical
output.print( thisDatum.getNumMismatches() + "\t" ); // nMismatches
output.println( thisDatum.getNumObservations() ); // nBases
}
// Close the PrintStream
@ -246,7 +249,7 @@ class AnalyzeCovariatesCLP extends CommandLineProgram {
// Analyze reported quality
p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_QualityScoreCovariate.R" + " " +
OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
IGNORE_QSCORES_LESS_THAN); // The third argument is the Q scores that should be turned pink in the plot because they were ignored
IGNORE_QSCORES_LESS_THAN + " " + MAX_QUALITY_SCORE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored
if(numReadGroups % 3 == 0) { // Don't want to spawn all the RScript jobs too quickly. So wait for this one to finish
p.waitFor();
}

View File

@ -133,24 +133,25 @@ public class RecalDataManager {
* Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score
* that will be used in the sequential calculation in TableRecalibrationWalker
* @param smoothing The smoothing parameter that goes into empirical quality score calculation
* @param maxQual At which value to cap the quality scores
*/
public final void generateEmpiricalQualities( final int smoothing ) {
public final void generateEmpiricalQualities( final int smoothing, final int maxQual ) {
recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.data, smoothing);
recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.data, smoothing);
recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.data, smoothing, maxQual);
recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.data, smoothing, maxQual);
for( NestedHashMap map : dataCollapsedByCovariate ) {
recursivelyGenerateEmpiricalQualities(map.data, smoothing);
recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual);
}
}
private void recursivelyGenerateEmpiricalQualities( final Map data, final int smoothing ) {
private void recursivelyGenerateEmpiricalQualities( final Map data, final int smoothing, final int maxQual ) {
for( Object comp : data.keySet() ) {
final Object val = data.get(comp);
if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps
((RecalDatum)val).calcCombinedEmpiricalQuality(smoothing);
((RecalDatum)val).calcCombinedEmpiricalQuality(smoothing, maxQual);
} else { // Another layer in the nested hash map
recursivelyGenerateEmpiricalQualities( (Map) val, smoothing);
recursivelyGenerateEmpiricalQualities( (Map) val, smoothing, maxQual);
}
}
}

View File

@ -1,7 +1,5 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.QualityUtils;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -77,7 +75,7 @@ public class RecalDatum extends RecalDatumOptimized {
final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
this.increment( other.numObservations, other.numMismatches );
this.estimatedQReported = -10 * Math.log10(sumErrors / (double)this.numObservations);
if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; }
//if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; }
}
//---------------------------------------------------------------------------------------------------------------
@ -86,7 +84,9 @@ public class RecalDatum extends RecalDatumOptimized {
//
//---------------------------------------------------------------------------------------------------------------
/**
 * Compute the empirical quality score for this datum and cache it, so the log10 calculation
 * isn't repeated for every base that consults this datum later on.
 * @param smoothing Number of imaginary counts added to each bin to smooth out bins with few data points
 * @param maxQual The value at which to cap the quality scores
 */
public final void calcCombinedEmpiricalQuality( final int smoothing, final int maxQual ) {
    this.empiricalQuality = empiricalQualDouble(smoothing, maxQual); // cache the value so we don't call log over and over again
}
//---------------------------------------------------------------------------------------------------------------
//

View File

@ -96,19 +96,19 @@ public class RecalDatumOptimized {
//
//---------------------------------------------------------------------------------------------------------------
public final double empiricalQualDouble( final int smoothing ) {
public final double empiricalQualDouble( final int smoothing, final double maxQual ) {
final double doubleMismatches = (double) ( numMismatches + smoothing );
final double doubleObservations = (double) ( numObservations + smoothing );
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) { empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; }
if (empiricalQual > maxQual) { empiricalQual = maxQual; }
return empiricalQual;
}
// 'default' behavior is to use a smoothing value of zero, capped at the library-wide maximum reasonable Q score
public final double empiricalQualDouble() { return empiricalQualDouble( 0, QualityUtils.MAX_REASONABLE_Q_SCORE ); }
/**
 * Convert the observed mismatch rate into a phred-scaled quality byte.
 * @param smoothing Imaginary counts added to both mismatches and observations
 * @return the empirical quality as a byte; QualityUtils.probToQual caps this at Q40
 */
public final byte empiricalQualByte( final int smoothing ) {
    final double doubleMismatches = (double) ( numMismatches + smoothing );
    final double doubleObservations = (double) ( numObservations + smoothing );
    return QualityUtils.probToQual( 1.0 - doubleMismatches / doubleObservations ); // This is capped at Q40
}
// 'default' behavior is to use a smoothing value of zero
public final byte empiricalQualByte() {
    return empiricalQualByte( 0 );
}

View File

@ -77,6 +77,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private int PRESERVE_QSCORES_LESS_THAN = 5;
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
private int SMOOTHING = 1;
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default is 40")
private int MAX_QUALITY_SCORE = 40;
/////////////////////////////
// Debugging-only Arguments
@ -92,7 +94,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
private static final String versionString = "v2.2.3"; // Major version, minor version, and build number
private static final String versionString = "v2.2.4"; // Major version, minor version, and build number
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
private Random coinFlip; // Random number generator is used to remove reference bias in solid bams
private static final long RANDOM_SEED = 1032861495;
@ -217,7 +219,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// Create the tables of empirical quality scores that will be used in the sequential calculation
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
dataManager.generateEmpiricalQualities( SMOOTHING );
dataManager.generateEmpiricalQualities( SMOOTHING, MAX_QUALITY_SCORE );
logger.info( "...done!" );
// Take the header of the input SAM file and tweak it by adding in a new programRecord with the version number and list of covariates that were used

View File

@ -76,6 +76,35 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
}
}
@Test
public void testTableRecalibratorMaxQ70() {
    // Map each input bam to the md5 expected for its recalibrated output when run with -maxQ 70
    HashMap<String, String> expectations = new HashMap<String, String>();
    expectations.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "c12b4fd5b4905cc632aa1da19be5c66a" );
    for ( Map.Entry<String, String> testCase : expectations.entrySet() ) {
        final String bam = testCase.getKey();
        final String md5 = testCase.getValue();
        final String paramsFile = paramsFiles.get(bam);
        System.out.printf("PARAMS FOR %s is %s%n", bam, paramsFile);
        if ( paramsFile == null ) {
            continue; // no recal table was produced for this bam; nothing to recalibrate
        }
        final String interval = bam.equals( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" )
                ? " -L 1:10,800,000-10,810,000" : " -L 1:10,100,000-10,300,000";
        WalkerTestSpec spec = new WalkerTestSpec(
                "-R " + oneKGLocation + "reference/human_b36_both.fasta" +
                " -T TableRecalibration" +
                " -I " + bam +
                interval +
                " -outputBam %s" +
                " --no_pg_tag" +
                " -maxQ 70" +
                " --solid_recal_mode SET_Q_ZERO" +
                " -recalFile " + paramsFile,
                1, // just one output file
                Arrays.asList(md5));
        executeTest("testTableRecalibratorMaxQ70", spec);
    }
}
@Test
public void testCountCovariatesVCF() {
HashMap<String, String> e = new HashMap<String, String>();