Filtering optimizations are now live for UGv2. Instead of re-computing filtered bases at every locus, they are computed just once per read and stored in the read itself. Eyeballing the results on the ~600-sample set from 1000 Genomes, we cut ~40% off the runtime! QUALs now sometimes differ from UGv1 because I noticed a bug in v1: samples with only spanning deletions were assigned ref calls instead of no-calls, which ever so slightly affects the QUAL. Not a big deal, though.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4494 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-10-14 05:04:28 +00:00
parent 4ac636e288
commit cfb33d8e12
7 changed files with 128 additions and 131 deletions

View File

@ -30,6 +30,8 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broad.tribble.util.variantcontext.Genotype;
@ -133,7 +135,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
}
HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());
attributes.put(VCFConstants.DEPTH_KEY, getUnfilteredDepth(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup()));
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());
@ -151,7 +153,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
myAlleles.add(ref);
HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());
attributes.put(VCFConstants.DEPTH_KEY, getUnfilteredDepth(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup()));
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());
@ -159,7 +161,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
double GQ = GL.getAALikelihoods() - Math.max(GL.getABLikelihoods(), GL.getBBLikelihoods());
calls.put(sample, new Genotype(sample, myAlleles, GQ, null, attributes, false));
}
return calls;
@ -187,6 +188,16 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
return ( refLikelihoodMinusEpsilon > likelihoods[1] && refLikelihoodMinusEpsilon > likelihoods[2]);
}
/**
 * Counts the pileup elements that pass the usable-base check (with bad-base
 * filtering enabled), giving the depth of bases eligible for genotyping.
 *
 * @param pileup the read-backed pileup to inspect
 * @return the number of usable bases in the pileup
 */
private int getUnfilteredDepth(ReadBackedPileup pileup) {
    int usable = 0;
    for ( PileupElement element : pileup )
        usable += DiploidSNPGenotypeLikelihoods.usableBase(element, true) ? 1 : 0;
    return usable;
}
protected class CalculatedAlleleFrequency {
public double log10PNonRef;

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
@ -305,14 +306,9 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
int n = 0;
for ( PileupElement p : pileup ) {
// ignore deletions
if ( p.isDeletion() )
continue;
byte base = p.getBase();
if ( ! ignoreBadBases || ! badBase(base) ) {
if ( usableBase(p, ignoreBadBases) ) {
byte qual = capBaseQualsAtMappingQual ? (byte)Math.min((int)p.getQual(), p.getMappingQual()) : p.getQual();
n += add(base, qual, p.getRead(), p.getOffset());
n += add(p.getBase(), qual, p.getRead(), p.getOffset());
}
}
@ -452,6 +448,22 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
return fwdStrand ? 0 : 1;
}
/**
 * Determines whether the given pileup element carries a usable base.
 * A base is rejected when it is a deletion, when its read (if a GATKSAMRecord)
 * has flagged the offset as filtered via its per-read good-base bit set, or —
 * when ignoreBadBases is set — when the base itself is not a regular base.
 *
 * @param p pileup element
 * @param ignoreBadBases should we ignore bad bases?
 * @return true if the base is a usable base
 */
protected static boolean usableBase(PileupElement p, boolean ignoreBadBases) {
    // deletions never contribute a base
    if ( p.isDeletion() )
        return false;

    // bases pre-filtered on the read itself (computed once per read) are rejected here
    if ( p.getRead() instanceof GATKSAMRecord &&
         !((GATKSAMRecord)p.getRead()).isGoodBase(p.getOffset()) )
        return false;

    // equivalent to (!ignoreBadBases || !badBase(...)) by De Morgan
    return !(ignoreBadBases && badBase(p.getBase()));
}
/**
* Returns true when the observedBase is considered bad and shouldn't be processed by this object. A base
* is considered bad if:
@ -461,7 +473,7 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
* @param observedBase observed base
* @return true if the base is a bad base
*/
protected boolean badBase(byte observedBase) {
protected static boolean badBase(byte observedBase) {
return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1;
}

View File

@ -84,7 +84,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
// create the GenotypeLikelihoods object
DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.baseModel, (DiploidSNPGenotypePriors)priors, UAC.defaultPlatform);
GL.add(pileup, true, UAC.CAP_BASE_QUALITY);
int nGoodBases = GL.add(pileup, true, UAC.CAP_BASE_QUALITY);
if ( nGoodBases == 0 )
continue;
double[] likelihoods = GL.getLikelihoods();
double[] posteriors = GL.getPosteriors();

View File

@ -40,12 +40,18 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.*;
import org.broadinstitute.sting.utils.sam.GATKSAMRecordFilter;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broad.tribble.vcf.VCFConstants;
import org.broad.tribble.dbsnp.DbSNPFeature;
import java.io.PrintStream;
import java.util.*;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.StringUtil;
import net.sf.picard.reference.IndexedFastaSequenceFile;
public class UnifiedGenotyperEngine {
@ -77,13 +83,18 @@ public class UnifiedGenotyperEngine {
private Logger logger = null;
private PrintStream verboseWriter = null;
// fasta reference reader to supplement the edges of the reference sequence for long reads
private IndexedFastaSequenceFile referenceReader;
// number of chromosomes (2 * samples) in input
private int N;
// the standard filter to use for calls below the confidence threshold but above the emit threshold
private static final Set<String> filter = new HashSet<String>(1);
private static final int MISMATCH_WINDOW_SIZE = 20;
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
// get the number of samples
// if we're supposed to assume a single sample, do so
@ -92,14 +103,14 @@ public class UnifiedGenotyperEngine {
numSamples = 1;
else
numSamples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()).size();
initialize(UAC, null, null, null, numSamples);
initialize(toolkit, UAC, null, null, null, numSamples);
}
public UnifiedGenotyperEngine(UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
initialize(UAC, logger, verboseWriter, engine, numSamples);
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
initialize(toolkit, UAC, logger, verboseWriter, engine, numSamples);
}
private void initialize(UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
private void initialize(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
this.UAC = UAC;
this.logger = logger;
this.verboseWriter = verboseWriter;
@ -111,6 +122,8 @@ public class UnifiedGenotyperEngine {
genotypePriors = createGenotypePriors(UAC);
filter.add(LOW_QUAL_FILTER_NAME);
referenceReader = new IndexedFastaSequenceFile(toolkit.getArguments().referenceFile);
}
/**
@ -130,7 +143,7 @@ public class UnifiedGenotyperEngine {
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
}
BadReadPileupFilter badReadPileupFilter = new BadReadPileupFilter(refContext);
BadBaseFilter badReadPileupFilter = new BadBaseFilter(refContext, UAC);
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, badReadPileupFilter);
if ( stratifiedContexts == null )
return null;
@ -280,7 +293,7 @@ public class UnifiedGenotyperEngine {
return ( d >= 0.0 && d <= 1.0 );
}
private Map<String, StratifiedAlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, BadReadPileupFilter badReadPileupFilter) {
private Map<String, StratifiedAlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, BadBaseFilter badBaseFilter) {
Map<String, StratifiedAlignmentContext> stratifiedContexts = null;
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL && rawContext.hasExtendedEventPileup() ) {
@ -290,9 +303,6 @@ public class UnifiedGenotyperEngine {
// filter the context based on min mapping quality
ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE);
// filter the context based on bad mates and mismatch rate
pileup = pileup.getFilteredPileup(badReadPileupFilter);
// don't call when there is no coverage
if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE )
return null;
@ -306,13 +316,13 @@ public class UnifiedGenotyperEngine {
if ( !BaseUtils.isRegularBase(ref) )
return null;
ReadBackedPileup rawPileup = rawContext.getBasePileup();
ReadBackedPileup pileup = rawContext.getBasePileup();
// filter the context based on min base and mapping qualities
ReadBackedPileup pileup = rawPileup.getBaseAndMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE, UAC.MIN_MAPPING_QUALTY_SCORE);
// filter the reads
filterPileup(pileup, badBaseFilter);
// filter the context based on bad mates and mismatch rate
pileup = pileup.getFilteredPileup(badReadPileupFilter);
//pileup = pileup.getFilteredPileup(badReadPileupFilter);
// don't call when there is no coverage
if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE )
@ -395,23 +405,55 @@ public class UnifiedGenotyperEngine {
verboseWriter.println();
}
/**
* Filters low quality reads out of the pileup.
*/
private class BadReadPileupFilter implements PileupElementFilter {
private ReferenceContext refContext;
/**
 * Runs the bad-base filter over every read in the pileup, stamping each
 * GATKSAMRecord with its per-base filtering results so downstream code can
 * skip bad bases without re-computing the filter at every locus.
 *
 * @param pileup        the pileup whose reads should be tagged
 * @param badBaseFilter the filter used to compute the good-base bit set
 */
private void filterPileup(ReadBackedPileup pileup, BadBaseFilter badBaseFilter) {
    for ( PileupElement element : pileup ) {
        SAMRecord rec = element.getRead();
        if ( !(rec instanceof GATKSAMRecord) )
            continue;  // plain SAMRecords cannot carry the good-base bit set
        ((GATKSAMRecord)rec).setGoodBases(badBaseFilter, true);
    }
}
public BadReadPileupFilter(ReferenceContext ref) {
// create the +/-20bp window
GenomeLoc window = GenomeLocParser.createGenomeLoc(ref.getLocus().getContig(), Math.max(ref.getWindow().getStart(), ref.getLocus().getStart()-20), Math.min(ref.getWindow().getStop(), ref.getLocus().getStart()+20));
byte[] bases = new byte[41];
System.arraycopy(ref.getBases(), (int)Math.max(0, window.getStart()-ref.getWindow().getStart()), bases, 0, (int)window.size());
refContext = new ReferenceContext(ref.getLocus(), window, bases);
/**
* Filters low quality bases out of the SAMRecords.
*/
private class BadBaseFilter implements GATKSAMRecordFilter {
private ReferenceContext refContext;
private final UnifiedArgumentCollection UAC;
/**
 * Creates a filter bound to the given reference context and argument collection.
 *
 * @param refContext reference context used for mismatch checking
 * @param UAC        argument collection supplying quality/mismatch thresholds
 */
public BadBaseFilter(ReferenceContext refContext, UnifiedArgumentCollection UAC) {
this.refContext = refContext;
this.UAC = UAC;
}
public boolean allow(PileupElement pileupElement) {
return ((UAC.USE_BADLY_MATED_READS || !BadMateFilter.hasBadMate(pileupElement.getRead())) &&
AlignmentUtils.mismatchesInRefWindow(pileupElement, refContext, true) <= UAC.MAX_MISMATCHES );
/**
 * Computes, once per read, the set of base offsets that pass all filters:
 * mapping quality, mate quality, per-base quality, and reference mismatch rate.
 * Bit i set means the base at read offset i is good.
 *
 * NOTE(review): this method may widen and REPLACE the filter's shared refContext
 * field when a read extends past the current window; the widened context is
 * retained for subsequent reads.
 *
 * @param record the read to evaluate
 * @return a BitSet over read offsets; all-clear if the whole read is rejected
 */
public BitSet getGoodBases(final GATKSAMRecord record) {
// all bits are set to false by default
BitSet bitset = new BitSet(record.getReadLength());
// if the mapping quality is too low or the mate is bad, we can just zero out the whole read and continue
if ( record.getMappingQuality() < UAC.MIN_MAPPING_QUALTY_SCORE ||
(!UAC.USE_BADLY_MATED_READS && BadMateFilter.hasBadMate(record)) ) {
return bitset;
}
// mark each base whose quality meets the minimum threshold
byte[] quals = record.getBaseQualities();
for (int i = 0; i < quals.length; i++) {
if ( quals[i] >= UAC.MIN_BASE_QUALTY_SCORE )
bitset.set(i);
}
// if a read is too long for the reference context, extend the context
if ( record.getAlignmentEnd() > refContext.getWindow().getStop() ) {
GenomeLoc window = GenomeLocParser.createGenomeLoc(refContext.getLocus().getContig(), refContext.getWindow().getStart(), record.getAlignmentEnd());
byte[] bases = referenceReader.getSubsequenceAt(window.getContig(), window.getStart(), window.getStop()).getBases();
// reference bases must be upper-cased to compare against read bases
StringUtil.toUpperCase(bases);
refContext = new ReferenceContext(refContext.getLocus(), window, bases);
}
// intersect with the per-window mismatch mask; a null result presumably means
// the read failed the mismatch threshold entirely -- confirm in AlignmentUtils
BitSet mismatches = AlignmentUtils.mismatchesInRefWindow(record, refContext, UAC.MAX_MISMATCHES, MISMATCH_WINDOW_SIZE);
if ( mismatches != null )
bitset.and(mismatches);
return bitset;
}
}

View File

@ -128,7 +128,7 @@ public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedG
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior");
annotationEngine = new VariantAnnotatorEngine(getToolkit(), Arrays.asList(annotationClassesToUse), annotationsToUse);
UG_engine = new UnifiedGenotyperEngine(UAC, logger, verboseWriter, annotationEngine, samples.size());
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples.size());
// initialize the header
writer.writeHeader(new VCFHeader(getHeaderInfo(), samples)) ;

View File

@ -258,18 +258,16 @@ public class AlignmentUtils {
int readLength = read.getReadLength();
BitSet mismatches = new BitSet(readLength);
// TODO -- we only care about starting from curpos
// it's possible we aren't starting at the beginning of a read,
// and we don't need to look at any of the previous context outside our window
// (although we do need future context)
int readStartPos = Math.max(read.getAlignmentStart(), (int)ref.getLocus().getStart() - windowSize);
int currentReadPos = read.getAlignmentStart();
byte[] refBases = ref.getBases();
int refIndex = read.getAlignmentStart() - (int)ref.getWindow().getStart();
// it's possible we aren't starting at the beginning of a read
int startOffset = 0;
int refIndex = readStartPos - (int)ref.getWindow().getStart();
if ( refIndex < 0 ) {
startOffset = -1 * refIndex;
refIndex = 0;
// TODO -- fix me
return null;
throw new IllegalStateException("When calculating mismatches, we somehow don't have enough previous reference context for read " + read.getReadName() + " at position " + ref.getLocus());
}
byte[] readBases = read.getReadBases();
@ -282,15 +280,21 @@ public class AlignmentUtils {
int cigarElementLength = ce.getLength();
switch ( ce.getOperator() ) {
case M:
for (int j = 0; j < cigarElementLength; j++, readIndex++, refIndex++) {
for (int j = 0; j < cigarElementLength; j++, readIndex++) {
// skip over unwanted bases
if ( currentReadPos++ < readStartPos )
continue;
if ( refIndex >= refBases.length ) {
// TODO -- fix me
return null;
throw new IllegalStateException("When calculating mismatches, we somehow don't have enough trailing reference context for read " + read.getReadName() + " at position " + ref.getLocus());
}
byte refChr = refBases[refIndex];
byte readChr = readBases[readIndex];
if ( readChr != refChr )
mismatches.set(readIndex);
refIndex++;
}
break;
case I:
@ -299,7 +303,9 @@ public class AlignmentUtils {
break;
case D:
case N:
refIndex += cigarElementLength;
if ( currentReadPos >= readStartPos )
refIndex += cigarElementLength;
currentReadPos += cigarElementLength;
break;
case H:
case P:

View File

@ -1,10 +1,8 @@
package org.broadinstitute.sting.utils.sam;
import java.lang.reflect.Method;
import java.util.*;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
/**
@ -18,11 +16,6 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
* if they are ever modified externally then one must also invoke the
* setReadGroup() method here to ensure that the cache is kept up-to-date.
*
* 13 Oct 2010 - mhanna - this class is fundamentally flawed: it uses a decorator
* pattern to wrap a heavyweight object, which can lead
* to heinous side effects if the wrapping is not carefully
* done. Hopefully SAMRecord will become an interface and
* this will eventually be fixed.
*/
public class GATKSAMRecord extends SAMRecord {
@ -83,7 +76,7 @@ public class GATKSAMRecord extends SAMRecord {
}
public boolean isGoodBase(int index) {
return ( mBitSet == null || mBitSet.length() <= index ? true : mBitSet.get(index));
return ( mBitSet == null || mBitSet.size() <= index ? true : mBitSet.get(index));
}
///////////////////////////////////////////////////////////////////////////////
@ -334,76 +327,6 @@ public class GATKSAMRecord extends SAMRecord {
public void setValidationStringency(net.sf.samtools.SAMFileReader.ValidationStringency validationStringency) { mRecord.setValidationStringency(validationStringency); }
public Object getAttribute(final String tag) { return mRecord.getAttribute(tag); }
public Integer getIntegerAttribute(final String tag) { return mRecord.getIntegerAttribute(tag); }
public Short getShortAttribute(final String tag) { return mRecord.getShortAttribute(tag); }
public Byte getByteAttribute(final String tag) { return mRecord.getByteAttribute(tag); }
public String getStringAttribute(final String tag) { return mRecord.getStringAttribute(tag); }
public Character getCharacterAttribute(final String tag) { return mRecord.getCharacterAttribute(tag); }
public Float getFloatAttribute(final String tag) { return mRecord.getFloatAttribute(tag); }
public byte[] getByteArrayAttribute(final String tag) { return mRecord.getByteArrayAttribute(tag); }
/**
 * Delegates to the wrapped SAMRecord's protected getAttribute(short) via
 * reflection, since that overload is not reachable through the public API.
 *
 * @param tag binary tag to look up
 * @return whatever the wrapped record's getAttribute(short) returns
 * @throws ReviewedStingException if the reflective invocation fails
 */
protected Object getAttribute(final short tag) {
Object attribute;
try {
// the protected overload must be unlocked with setAccessible before invoking
Method method = mRecord.getClass().getDeclaredMethod("getAttribute",Short.TYPE);
method.setAccessible(true);
attribute = method.invoke(mRecord,tag);
}
catch(Exception ex) {
// wrap any reflection failure in the project's runtime exception type
throw new ReviewedStingException("Unable to invoke getAttribute method",ex);
}
return attribute;
}
public void setAttribute(final String tag, final Object value) { mRecord.setAttribute(tag,value); }
/**
 * Delegates to the wrapped SAMRecord's protected setAttribute(short, Object)
 * via reflection, since that overload is not reachable through the public API.
 *
 * @param tag   binary tag to set
 * @param value value to store under the tag
 * @throws ReviewedStingException if the reflective invocation fails
 */
protected void setAttribute(final short tag, final Object value) {
try {
// the protected overload must be unlocked with setAccessible before invoking
Method method = mRecord.getClass().getDeclaredMethod("setAttribute",Short.TYPE,Object.class);
method.setAccessible(true);
method.invoke(mRecord,tag,value);
}
catch(Exception ex) {
// wrap any reflection failure in the project's runtime exception type
throw new ReviewedStingException("Unable to invoke setAttribute method",ex);
}
}
public void clearAttributes() { mRecord.clearAttributes(); }
/**
 * Delegates to the wrapped SAMRecord's protected setAttributes(SAMBinaryTagAndValue)
 * via reflection, since that method is not reachable through the public API.
 *
 * @param attributes the binary attribute chain to install on the wrapped record
 * @throws ReviewedStingException if the reflective invocation fails
 */
protected void setAttributes(final SAMBinaryTagAndValue attributes) {
try {
// the protected method must be unlocked with setAccessible before invoking
Method method = mRecord.getClass().getDeclaredMethod("setAttributes",SAMBinaryTagAndValue.class);
method.setAccessible(true);
method.invoke(mRecord,attributes);
}
catch(Exception ex) {
// wrap any reflection failure in the project's runtime exception type
throw new ReviewedStingException("Unable to invoke setAttributes method",ex);
}
}
/**
 * Delegates to the wrapped SAMRecord's protected getBinaryAttributes() via
 * reflection, since that method is not reachable through the public API.
 *
 * @return whatever the wrapped record's getBinaryAttributes() returns
 * @throws ReviewedStingException if the reflective invocation fails
 */
protected SAMBinaryTagAndValue getBinaryAttributes() {
SAMBinaryTagAndValue binaryAttributes;
try {
// the protected method must be unlocked with setAccessible before invoking
Method method = mRecord.getClass().getDeclaredMethod("getBinaryAttributes");
method.setAccessible(true);
binaryAttributes = (SAMBinaryTagAndValue)method.invoke(mRecord);
}
catch(Exception ex) {
// wrap any reflection failure in the project's runtime exception type
throw new ReviewedStingException("Unable to invoke getBinaryAttributes method",ex);
}
return binaryAttributes;
}
public List<SAMTagAndValue> getAttributes() { return mRecord.getAttributes(); }
public SAMFileHeader getHeader() { return mRecord.getHeader(); }
public void setHeader(SAMFileHeader samFileHeader) { mRecord.setHeader(samFileHeader); }