Updating Covariate interface with Mauricio to include an errorModel parameter. On the fly recalibration of base insertion and base deletion quals is live for the HaplotypeCaller

This commit is contained in:
Ryan Poplin 2012-02-06 11:10:24 -05:00
parent b7ffd144e8
commit dc05b71e39
17 changed files with 56 additions and 32 deletions

View File

@ -26,6 +26,7 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import java.util.Arrays;
@ -35,7 +36,7 @@ import java.util.Arrays;
* Date: 9/26/11
*/
public class ContextCovariate implements Covariate {
public class ContextCovariate implements ExperimentalCovariate {
final int CONTEXT_SIZE = 8;
String allN = "";
@ -49,7 +50,7 @@ public class ContextCovariate implements Covariate {
}
@Override
public void getValues(SAMRecord read, Comparable[] comparable) {
public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) {
byte[] bases = read.getReadBases();
for(int i = 0; i < read.getReadLength(); i++) {
comparable[i] = ( i-CONTEXT_SIZE < 0 ? allN : new String(Arrays.copyOfRange(bases,i-CONTEXT_SIZE,i)) );

View File

@ -41,6 +41,7 @@ import org.broadinstitute.sting.utils.collections.NestedHashMap;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.PrintStream;
@ -374,7 +375,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
RecalDataManager.parseColorSpace( gatkRead );
gatkRead.setTemporaryAttribute( COVARS_ATTRIBUTE,
RecalDataManager.computeCovariates( gatkRead, requestedCovariates ));
RecalDataManager.computeCovariates( gatkRead, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION ));
}
// Skip this position if base quality is zero

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
/*
* Copyright (c) 2009 The Broad Institute
@ -32,7 +33,7 @@ import net.sf.samtools.SAMRecord;
* User: rpoplin
* Date: Oct 30, 2009
*
* The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read, offset, and corresponding reference bases
* The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read.
* In general most error checking and adjustments to the data are done before the call to the covariates getValue methods in order to speed up the code.
* This unfortunately muddies the code, but most of these corrections can be done per read while the covariates get called per base, resulting in a big speed up.
*/
@ -40,9 +41,10 @@ import net.sf.samtools.SAMRecord;
public interface Covariate {
public void initialize( RecalibrationArgumentCollection RAC ); // Initialize any member variables using the command-line arguments passed to the walkers
public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker
public void getValues( SAMRecord read, Comparable[] comparable ); //Takes an array of size (at least) read.getReadLength() and fills it with covariate
//values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows
//read-specific calculations to be done just once rather than for each offset.
public void getValues( SAMRecord read, Comparable[] comparable, BaseRecalibration.BaseRecalibrationType modelType );
//Takes an array of size (at least) read.getReadLength() and fills it with covariate
//values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows
//read-specific calculations to be done just once rather than for each offset.
}
interface RequiredCovariate extends Covariate {

View File

@ -4,6 +4,7 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.NGSPlatform;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.EnumSet;
@ -65,7 +66,7 @@ public class CycleCovariate implements StandardCovariate {
// Used to pick out the covariate's value from attributes of the read
@Override
public void getValues(SAMRecord read, Comparable[] comparable) {
public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) {
//-----------------------------
// Illumina, Solid, PacBio, and Complete Genomics

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import java.util.HashMap;
@ -65,7 +66,7 @@ public class DinucCovariate implements StandardCovariate {
* Takes an array of size (at least) read.getReadLength() and fills it with the covariate values for each position in the read.
*/
@Override
public void getValues( SAMRecord read, Comparable[] result ) {
public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) {
final HashMap<Integer, Dinuc> dinucHashMapRef = this.dinucHashMap; //optimize access to dinucHashMap
final int readLength = read.getReadLength();
final boolean negativeStrand = read.getReadNegativeStrandFlag();
@ -78,7 +79,7 @@ public class DinucCovariate implements StandardCovariate {
if(negativeStrand) {
bases = BaseUtils.simpleReverseComplement(bases); //this is NOT in-place
}
result[0] = NO_DINUC; // No dinuc at the beginning of the read
comparable[0] = NO_DINUC; // No dinuc at the beginning of the read
prevBase = bases[0];
offset++;
@ -87,16 +88,16 @@ public class DinucCovariate implements StandardCovariate {
// previous base in the reference. This is done in part to be consistent with unmapped reads.
base = bases[offset];
if( BaseUtils.isRegularBase( prevBase ) ) {
result[offset] = dinucHashMapRef.get( Dinuc.hashBytes( prevBase, base ) );
comparable[offset] = dinucHashMapRef.get( Dinuc.hashBytes( prevBase, base ) );
} else {
result[offset] = NO_DINUC;
comparable[offset] = NO_DINUC;
}
offset++;
prevBase = base;
}
if(negativeStrand) {
reverse( result );
reverse( comparable );
}
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
/*
* Copyright (c) 2010 The Broad Institute
@ -78,7 +79,7 @@ public class GCContentCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(SAMRecord read, Comparable[] comparable) {
public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) {
for(int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
/*
* Copyright (c) 2009 The Broad Institute
@ -92,7 +93,7 @@ public class HomopolymerCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(SAMRecord read, Comparable[] comparable) {
public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) {
for(int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
/*
* Copyright (c) 2009 The Broad Institute
@ -54,7 +55,7 @@ public class MappingQualityCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(SAMRecord read, Comparable[] comparable) {
public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) {
for(int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
/*
* Copyright (c) 2009 The Broad Institute
@ -63,7 +64,7 @@ public class MinimumNQSCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(SAMRecord read, Comparable[] comparable) {
public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) {
for(int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
/*
* Copyright (c) 2009 The Broad Institute
@ -53,7 +54,7 @@ public class PositionCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(SAMRecord read, Comparable[] comparable) {
public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) {
for(int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
/*
* Copyright (c) 2009 The Broad Institute
@ -59,7 +60,7 @@ public class PrimerRoundCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(SAMRecord read, Comparable[] comparable) {
public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) {
for(int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
}

View File

@ -1,6 +1,9 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import java.util.Arrays;
/*
* Copyright (c) 2009 The Broad Institute
@ -43,10 +46,15 @@ public class QualityScoreCovariate implements RequiredCovariate {
}
@Override
public void getValues(SAMRecord read, Comparable[] comparable) {
byte[] baseQualities = read.getBaseQualities();
for(int i = 0; i < read.getReadLength(); i++) {
comparable[i] = (int) baseQualities[i];
public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) {
if( modelType == BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION ) {
byte[] baseQualities = read.getBaseQualities();
for(int i = 0; i < read.getReadLength(); i++) {
comparable[i] = (int) baseQualities[i];
}
} else { // model == BASE_INSERTION || model == BASE_DELETION
Arrays.fill(comparable, 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
}
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
/*
* Copyright (c) 2009 The Broad Institute
@ -35,7 +36,7 @@ import net.sf.samtools.SAMRecord;
* The Read Group covariate.
*/
public class ReadGroupCovariate implements RequiredCovariate{
public class ReadGroupCovariate implements RequiredCovariate {
public static final String defaultReadGroup = "DefaultReadGroup";
@ -45,7 +46,7 @@ public class ReadGroupCovariate implements RequiredCovariate{
}
@Override
public void getValues(SAMRecord read, Comparable[] comparable) {
public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) {
final String readGroupId = read.getReadGroup().getReadGroupId();
for(int i = 0; i < read.getReadLength(); i++) {
comparable[i] = readGroupId;

View File

@ -33,6 +33,7 @@ import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.NestedHashMap;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -571,7 +572,7 @@ public class RecalDataManager {
* value for the ith position in the read and the jth covariate in
* reqeustedCovariates list.
*/
public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List<Covariate> requestedCovariates) {
public static Comparable[][] computeCovariates( final GATKSAMRecord gatkRead, final List<Covariate> requestedCovariates, final BaseRecalibration.BaseRecalibrationType modelType ) {
//compute all covariates for this read
final List<Covariate> requestedCovariatesRef = requestedCovariates;
final int numRequestedCovariates = requestedCovariatesRef.size();
@ -582,7 +583,7 @@ public class RecalDataManager {
// Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
for( int i = 0; i < numRequestedCovariates; i++ ) {
requestedCovariatesRef.get(i).getValues( gatkRead, tempCovariateValuesHolder );
requestedCovariatesRef.get(i).getValues( gatkRead, tempCovariateValuesHolder, modelType );
for(int j = 0; j < readLength; j++) {
//copy values into a 2D array that allows all covar types to be extracted at once for
//an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types.

View File

@ -39,6 +39,7 @@ import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.collections.NestedHashMap;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
import org.broadinstitute.sting.utils.text.XReadLines;
@ -398,7 +399,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
//compute all covariate values for this read
final Comparable[][] covariateValues_offset_x_covar =
RecalDataManager.computeCovariates(read, requestedCovariates);
RecalDataManager.computeCovariates(read, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION);
// For each base in the read
for( int offset = 0; offset < read.getReadLength(); offset++ ) {

View File

@ -177,13 +177,13 @@ public class BaseRecalibration {
dataManager.addToAllTables( key, datum, QualityUtils.MIN_USABLE_Q_SCORE ); //BUGBUG: used to be Q5 now is Q6, probably doesn't matter
}
public byte[] recalibrateRead( final GATKSAMRecord read, final byte[] originalQuals ) {
public byte[] recalibrateRead( final GATKSAMRecord read, final byte[] originalQuals, final BaseRecalibrationType modelType ) {
final byte[] recalQuals = originalQuals.clone();
//compute all covariate values for this read
final Comparable[][] covariateValues_offset_x_covar =
RecalDataManager.computeCovariates(read, requestedCovariates);
RecalDataManager.computeCovariates(read, requestedCovariates, modelType);
// For each base in the read
for( int offset = 0; offset < read.getReadLength(); offset++ ) {

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.NGSPlatform;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import java.util.Arrays;
import java.util.HashMap;
@ -194,7 +195,7 @@ public class GATKSAMRecord extends BAMRecord {
// if the recal data was populated in the engine then recalibrate the quality scores on the fly
// else give default values which are flat Q45
if( GenomeAnalysisEngine.hasBaseRecalibration() ) {
quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, quals ); // the original quals here are the flat base insertion/deletion quals, NOT the original base qualities
quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, quals, BaseRecalibration.BaseRecalibrationType.BASE_INSERTION ); // the original quals here are the flat base insertion/deletion quals, NOT the original base qualities
}
// add the qual array to the read so that we don't have to do the recalibration work again
setAttribute( BQSR_BASE_INSERTION_QUALITIES, quals );
@ -210,7 +211,7 @@ public class GATKSAMRecord extends BAMRecord {
// if the recal data was populated in the engine then recalibrate the quality scores on the fly
// else give default values which are flat Q45
if( GenomeAnalysisEngine.hasBaseRecalibration() ) {
quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, quals ); // the original quals here are the flat base insertion/deletion quals, NOT the original base qualities
quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, quals, BaseRecalibration.BaseRecalibrationType.BASE_DELETION ); // the original quals here are the flat base insertion/deletion quals, NOT the original base qualities
}
// add the qual array to the read so that we don't have to do the recalibration work again
setAttribute( BQSR_BASE_DELETION_QUALITIES, quals );