On the fly base quality score recalibration now happens up front in a SAMIterator on input instead of in a lazy-loading fashion if the BQSR table is provided as an engine argument. On the fly recalibration is now completely hooked up and live.

This commit is contained in:
Ryan Poplin 2012-02-13 12:35:09 -05:00
parent c8c06c7753
commit 41ffd08d53
13 changed files with 183 additions and 64 deletions

View File

@ -185,12 +185,12 @@ public class GenomeAnalysisEngine {
public static void resetRandomGenerator(long seed) { randomGenerator.setSeed(seed); }
/**
* Static base quality score recalibration helper object
* Base Quality Score Recalibration helper object
*/
private static BaseRecalibration baseRecalibration = null;
public static BaseRecalibration getBaseRecalibration() { return baseRecalibration; }
public static boolean hasBaseRecalibration() { return baseRecalibration != null; }
public static void setBaseRecalibration(File recalFile) { baseRecalibration = new BaseRecalibration(recalFile); }
private BaseRecalibration baseRecalibration = null;
public BaseRecalibration getBaseRecalibration() { return baseRecalibration; }
public boolean hasBaseRecalibration() { return baseRecalibration != null; }
public void setBaseRecalibration(File recalFile) { baseRecalibration = new BaseRecalibration(recalFile); }
/**
* Actually run the GATK with the specified walker.
@ -770,6 +770,7 @@ public class GenomeAnalysisEngine {
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
getWalkerBAQQualityMode(),
refReader,
getBaseRecalibration(),
argCollection.defaultBaseQualities);
}

View File

@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import java.util.Collection;
/**
@ -27,23 +28,20 @@ import java.util.Collection;
* information about how they should be downsampled, sorted, and filtered.
*/
public class ReadProperties {
private Collection<SAMReaderID> readers = null;
private SAMFileHeader header = null;
private SAMFileReader.ValidationStringency validationStringency = SAMFileReader.ValidationStringency.STRICT;
private DownsamplingMethod downsamplingMethod = null;
private ValidationExclusion exclusionList = null;
private Collection<ReadFilter> supplementalFilters = null;
private boolean includeReadsWithDeletionAtLoci = false;
private boolean useOriginalBaseQualities = false;
private boolean generateExtendedEvents = false;
private BAQ.CalculationMode cmode = BAQ.CalculationMode.OFF;
private BAQ.QualityMode qmode = BAQ.QualityMode.DONT_MODIFY;
IndexedFastaSequenceFile refReader = null; // read for BAQ, if desired
private byte defaultBaseQualities;
// do we want to generate additional piles of "extended" events (indels)
// immediately after the reference base such event is associated with?
private final Collection<SAMReaderID> readers;
private final SAMFileHeader header;
private final SAMFileReader.ValidationStringency validationStringency;
private final DownsamplingMethod downsamplingMethod;
private final ValidationExclusion exclusionList;
private final Collection<ReadFilter> supplementalFilters;
private final boolean includeReadsWithDeletionAtLoci;
private final boolean useOriginalBaseQualities;
private final boolean generateExtendedEvents;
private final BAQ.CalculationMode cmode;
private final BAQ.QualityMode qmode;
private final IndexedFastaSequenceFile refReader; // read for BAQ, if desired
private final BaseRecalibration bqsrApplier;
private final byte defaultBaseQualities;
/**
* Return true if the walker wants to see reads that contain deletions when looking at locus pileups
@ -126,6 +124,8 @@ public class ReadProperties {
return refReader;
}
public BaseRecalibration getBQSRApplier() { return bqsrApplier; }
/**
* @return Default base quality value to fill reads missing base quality information.
*/
@ -165,8 +165,9 @@ public class ReadProperties {
boolean includeReadsWithDeletionAtLoci,
boolean generateExtendedEvents,
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
BaseRecalibration bqsrApplier,
byte defaultBaseQualities) {
this.readers = samFiles;
this.header = header;
@ -180,6 +181,7 @@ public class ReadProperties {
this.cmode = cmode;
this.qmode = qmode;
this.refReader = refReader;
this.bqsrApplier = bqsrApplier;
this.defaultBaseQualities = defaultBaseQualities;
}
}

View File

@ -46,6 +46,8 @@ import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.baq.BAQSamIterator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.recalibration.BQSRSamIterator;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
import java.io.File;
@ -201,6 +203,7 @@ public class SAMDataSource {
BAQ.CalculationMode.OFF,
BAQ.QualityMode.DONT_MODIFY,
null, // no BAQ
null, // no BQSR
(byte) -1);
}
@ -237,6 +240,7 @@ public class SAMDataSource {
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
BaseRecalibration bqsrApplier,
byte defaultBaseQualities) {
this.readMetrics = new ReadMetrics();
this.genomeLocParser = genomeLocParser;
@ -309,6 +313,7 @@ public class SAMDataSource {
cmode,
qmode,
refReader,
bqsrApplier,
defaultBaseQualities);
// cache the read group id (original) -> read group id (merged)
@ -591,6 +596,7 @@ public class SAMDataSource {
readProperties.getBAQCalculationMode(),
readProperties.getBAQQualityMode(),
readProperties.getRefReader(),
readProperties.getBQSRApplier(),
readProperties.defaultBaseQualities());
}
@ -660,9 +666,10 @@ public class SAMDataSource {
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
BaseRecalibration bqsrApplier,
byte defaultBaseQualities) {
if ( useOriginalBaseQualities || defaultBaseQualities >= 0 )
// only wrap if we are replacing the original qualitiies or using a default base quality
if (useOriginalBaseQualities || defaultBaseQualities >= 0)
// only wrap if we are replacing the original qualities or using a default base quality
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
// NOTE: this (and other filtering) should be done before on-the-fly sorting
@ -675,6 +682,9 @@ public class SAMDataSource {
if (!noValidationOfReadOrder && enableVerification)
wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator);
if (bqsrApplier != null)
wrappedIterator = new BQSRSamIterator(wrappedIterator, bqsrApplier);
if (cmode != BAQ.CalculationMode.OFF)
wrappedIterator = new BAQSamIterator(refReader, wrappedIterator, cmode, qmode);

View File

@ -68,7 +68,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
if(prevLoc != null) {
for(int iii = prevLoc.getStart() + 1; iii < location.getStart(); iii++ ) {
final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
if( initialIntervals.overlaps( fakeLoc ) ) {
if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) {
final double isActiveProb = ( walker.presetActiveRegions == null ? walker.isActive( null, null, null )
: ( walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 ) );
isActiveList.add( isActiveProb );
@ -89,7 +89,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
// Call the walkers isActive function for this locus and add them to the list to be integrated later
if( initialIntervals.overlaps( location ) ) {
if( initialIntervals == null || initialIntervals.overlaps( location ) ) {
final double isActiveProb = ( walker.presetActiveRegions == null ? walker.isActive( tracker, refContext, locus )
: ( walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0 ) );
isActiveList.add( isActiveProb );

View File

@ -79,7 +79,7 @@ public class ContextCovariate implements StandardCovariate {
}
/**
* calculates the context of a base indenpendent of the covariate mode
* calculates the context of a base independent of the covariate mode
*
* @param bases the bases in the read to build the context from
* @param offset the position in the read to calculate the context for

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/**
* The object temporarily held by a read that describes all of it's covariates.
*
@ -15,9 +17,9 @@ public class CovariateKeySet {
private int nextCovariateIndex;
public final static String mismatchesCovariateName = "M";
public final static String insertionsCovariateName = "I";
public final static String deletionsCovariateName = "D";
private static String mismatchesCovariateName = "M";
private static String insertionsCovariateName = "I";
private static String deletionsCovariateName = "D";
public CovariateKeySet(int readLength, int numberOfCovariates) {
numberOfCovariates++; // +1 because we are adding the mismatch covariate (to comply with the molten table format)
@ -36,7 +38,30 @@ public class CovariateKeySet {
transposeCovariateValues(deletionsKeySet, covariate.getDeletions());
nextCovariateIndex++;
}
public static RecalDataManager.BaseRecalibrationType getErrorModelFromString(final String modelString) {
if (modelString.equals(mismatchesCovariateName))
return RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION;
else if (modelString.equals(insertionsCovariateName))
return RecalDataManager.BaseRecalibrationType.BASE_INSERTION;
else if (modelString.equals(deletionsCovariateName))
return RecalDataManager.BaseRecalibrationType.BASE_DELETION;
throw new ReviewedStingException("Unrecognized Base Recalibration model string: " + modelString);
}
public Object[] getKeySet(final int readPosition, final RecalDataManager.BaseRecalibrationType errorModel) {
switch (errorModel) {
case BASE_SUBSTITUTION:
return getMismatchesKeySet(readPosition);
case BASE_INSERTION:
return getInsertionsKeySet(readPosition);
case BASE_DELETION:
return getDeletionsKeySet(readPosition);
default:
throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel );
}
}
public Object[] getMismatchesKeySet(int readPosition) {
return mismatchesKeySet[readPosition];
}

View File

@ -34,7 +34,7 @@ public class BAQSamIterator implements StingSAMIterator {
"cmode != null" ,
"qmode != null"})
public BAQSamIterator(IndexedFastaSequenceFile refReader, StingSAMIterator it, BAQ.CalculationMode cmode, BAQ.QualityMode qmode) {
if ( cmode == BAQ.CalculationMode.OFF) throw new ReviewedStingException("BUG: shouldn't create BAQSamIterator with calculation mode OFF");
if ( cmode == BAQ.CalculationMode.OFF ) throw new ReviewedStingException("BUG: shouldn't create BAQSamIterator with calculation mode OFF");
if ( qmode == BAQ.QualityMode.DONT_MODIFY ) throw new ReviewedStingException("BUG: shouldn't create BAQSamIterator with quailty mode DONT_MODIFY");
this.refReader = refReader;

View File

@ -4,6 +4,7 @@ import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
@ -150,13 +151,23 @@ public class FragmentUtils {
final int numBases = firstReadStop + secondRead.getReadLength();
final byte[] bases = new byte[numBases];
final byte[] quals = new byte[numBases];
// BUGBUG: too verbose, clean this up.
final byte[] insertionQuals = new byte[numBases];
final byte[] deletionQuals = new byte[numBases];
final byte[] firstReadBases = firstRead.getReadBases();
final byte[] firstReadQuals = firstRead.getBaseQualities();
final byte[] firstReadInsertionQuals = firstRead.getBaseInsertionQualities();
final byte[] firstReadDeletionQuals = firstRead.getBaseDeletionQualities();
final byte[] secondReadBases = secondRead.getReadBases();
final byte[] secondReadQuals = secondRead.getBaseQualities();
final byte[] secondReadInsertionQuals = secondRead.getBaseInsertionQualities();
final byte[] secondReadDeletionQuals = secondRead.getBaseDeletionQualities();
for(int iii = 0; iii < firstReadStop; iii++) {
bases[iii] = firstReadBases[iii];
quals[iii] = firstReadQuals[iii];
insertionQuals[iii] = firstReadInsertionQuals[iii];
deletionQuals[iii] = firstReadDeletionQuals[iii];
}
for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) {
if( firstReadQuals[iii] > MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] > MIN_QUAL_BAD_OVERLAP && firstReadBases[iii] != secondReadBases[iii-firstReadStop] ) {
@ -164,16 +175,22 @@ public class FragmentUtils {
}
bases[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadBases[iii] : secondReadBases[iii-firstReadStop] );
quals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadQuals[iii] : secondReadQuals[iii-firstReadStop] );
insertionQuals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadInsertionQuals[iii] : secondReadInsertionQuals[iii-firstReadStop] ); // Purposefully checking the highest base quality score
deletionQuals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadDeletionQuals[iii] : secondReadDeletionQuals[iii-firstReadStop] ); // Purposefully checking the highest base quality score
}
for(int iii = firstRead.getReadLength(); iii < numBases; iii++) {
bases[iii] = secondReadBases[iii-firstReadStop];
quals[iii] = secondReadQuals[iii-firstReadStop];
insertionQuals[iii] = secondReadInsertionQuals[iii-firstReadStop];
deletionQuals[iii] = secondReadDeletionQuals[iii-firstReadStop];
}
final GATKSAMRecord returnRead = new GATKSAMRecord(firstRead.getHeader());
returnRead.setAlignmentStart(firstRead.getUnclippedStart());
returnRead.setReadBases( bases );
returnRead.setBaseQualities( quals );
returnRead.setBaseQualities( quals, RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION );
returnRead.setBaseQualities( insertionQuals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION );
returnRead.setBaseQualities( deletionQuals, RecalDataManager.BaseRecalibrationType.BASE_DELETION );
returnRead.setReadGroup( firstRead.getReadGroup() );
returnRead.setReferenceName( firstRead.getReferenceName() );
final CigarElement c = new CigarElement(bases.length, CigarOperator.M);

View File

@ -0,0 +1,50 @@
package org.broadinstitute.sting.utils.recalibration;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Iterator;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 2/13/12
*/
public class BQSRSamIterator implements StingSAMIterator {
private final StingSAMIterator it;
private final BaseRecalibration bqsr;
/**
* Creates a new BQSRSamIterator and applies BQSR on the fly to incoming reads.
*
* @param it The incoming SamIterator to wrap
* @param bqsr The object which holds the BQSR table information and knows how to apply it
*/
@Requires({
"it != null",
"bqsr != null"})
public BQSRSamIterator(StingSAMIterator it, BaseRecalibration bqsr) {
if ( bqsr == null ) throw new ReviewedStingException("BUG: shouldn't create BQSRSamIterator with null recalibration object");
this.it = it;
this.bqsr = bqsr;
}
@Requires("hasNext()")
@Ensures("result != null")
public SAMRecord next() {
SAMRecord read = it.next();
bqsr.recalibrateRead((GATKSAMRecord) read);
return read;
}
public boolean hasNext() { return this.it.hasNext(); }
public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); }
public void close() { it.close(); }
public Iterator<SAMRecord> iterator() { return this; }
}

View File

@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
@ -165,10 +166,8 @@ public class BaseRecalibration {
key[iii] = cov.getValue( vals[iii] );
}
final String modelString = vals[iii++];
final RecalDataManager.BaseRecalibrationType errorModel = ( modelString.equals(CovariateKeySet.mismatchesCovariateName) ? RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION :
( modelString.equals(CovariateKeySet.insertionsCovariateName) ? RecalDataManager.BaseRecalibrationType.BASE_INSERTION :
( modelString.equals(CovariateKeySet.deletionsCovariateName) ? RecalDataManager.BaseRecalibrationType.BASE_DELETION : null ) ) );
final RecalDataManager.BaseRecalibrationType errorModel = CovariateKeySet.getErrorModelFromString(modelString);
// Create a new datum using the number of observations, number of mismatches, and reported quality score
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
// Add that datum to all the collapsed tables which will be used in the sequential calculation
@ -183,19 +182,16 @@ public class BaseRecalibration {
final CovariateKeySet covariateKeySet = RecalDataManager.getAllCovariateValuesFor( read );
for( final RecalDataManager.BaseRecalibrationType errorModel : RecalDataManager.BaseRecalibrationType.values() ) {
final byte[] originalQuals = ( errorModel == RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION ? read.getBaseQualities() :
( errorModel == RecalDataManager.BaseRecalibrationType.BASE_INSERTION ? read.getBaseDeletionQualities() :
( errorModel == RecalDataManager.BaseRecalibrationType.BASE_DELETION ? read.getBaseDeletionQualities() : null ) ) );
final byte[] originalQuals = read.getBaseQualities( errorModel );
final byte[] recalQuals = originalQuals.clone();
// For each base in the read
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
final Object[] fullCovariateKey =
( errorModel == RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION ? covariateKeySet.getMismatchesKeySet(offset) :
( errorModel == RecalDataManager.BaseRecalibrationType.BASE_INSERTION ? covariateKeySet.getInsertionsKeySet(offset) :
( errorModel == RecalDataManager.BaseRecalibrationType.BASE_DELETION ? covariateKeySet.getDeletionsKeySet(offset) : null ) ) );
final Object[] fullCovariateKeyWithErrorMode = covariateKeySet.getKeySet(offset, errorModel);
final Object[] fullCovariateKey = Arrays.copyOfRange(fullCovariateKeyWithErrorMode, 0, fullCovariateKeyWithErrorMode.length-1); // need to strip off the error mode which was appended to the list of covariates
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
if( qualityScore == null ) {
qualityScore = performSequentialQualityCalculation( errorModel, fullCovariateKey );
@ -206,21 +202,8 @@ public class BaseRecalibration {
}
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
switch (errorModel) {
case BASE_SUBSTITUTION:
read.setBaseQualities( recalQuals );
break;
case BASE_INSERTION:
read.setAttribute( GATKSAMRecord.BQSR_BASE_INSERTION_QUALITIES, recalQuals );
break;
case BASE_DELETION:
read.setAttribute( GATKSAMRecord.BQSR_BASE_DELETION_QUALITIES, recalQuals );
break;
default:
throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel );
}
read.setBaseQualities( recalQuals, errorModel );
}
}
/**

View File

@ -25,9 +25,9 @@
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager;
import org.broadinstitute.sting.utils.NGSPlatform;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
import java.util.HashMap;
@ -163,8 +163,37 @@ public class GATKSAMRecord extends BAMRecord {
}
/**
* Accessors for base insertion and base deletion quality scores
* Setters and Accessors for base insertion and base deletion quality scores
*/
public void setBaseQualities( final byte[] quals, final RecalDataManager.BaseRecalibrationType errorModel ) {
switch( errorModel ) {
case BASE_SUBSTITUTION:
setBaseQualities(quals);
break;
case BASE_INSERTION:
setAttribute( GATKSAMRecord.BQSR_BASE_INSERTION_QUALITIES, quals );
break;
case BASE_DELETION:
setAttribute( GATKSAMRecord.BQSR_BASE_DELETION_QUALITIES, quals );
break;
default:
throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel );
}
}
public byte[] getBaseQualities( final RecalDataManager.BaseRecalibrationType errorModel ) {
switch( errorModel ) {
case BASE_SUBSTITUTION:
return getBaseQualities();
case BASE_INSERTION:
return getBaseInsertionQualities();
case BASE_DELETION:
return getBaseDeletionQualities();
default:
throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel );
}
}
public byte[] getBaseInsertionQualities() {
byte[] quals = getByteArrayAttribute( BQSR_BASE_INSERTION_QUALITIES );
if( quals == null ) {

View File

@ -79,7 +79,8 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark {
false,
BAQ.CalculationMode.OFF,
BAQ.QualityMode.DONT_MODIFY,
null,
null, // no BAQ
null, // no BQSR
(byte)0);
GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary());

View File

@ -308,6 +308,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest {
BAQ.CalculationMode.OFF,
BAQ.QualityMode.DONT_MODIFY,
null, // no BAQ
null, // no BQSR
(byte) -1
);
}