From cfb33d8e12c2b03930d25d739d161b4ad72f2393 Mon Sep 17 00:00:00 2001 From: ebanks Date: Thu, 14 Oct 2010 05:04:28 +0000 Subject: [PATCH] Filtering optimizations are now live for UGv2. Instead of re-computing filtered bases at every locus, they are computed just once per read and stored in the read itself. Eyeballing the results on the ~600 sample set from 1kg, we cut out ~40% of the runtime! QUALs are now sometimes different from UGv1 because I noticed a bug in v1 where samples with spanning deletions only were assigned ref calls instead of no-calls which ever so slightly affects the QUAL. Not a big deal though. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4494 348d0f76-0448-11de-a6fe-93d51630548a --- .../AlleleFrequencyCalculationModel.java | 17 +++- .../DiploidSNPGenotypeLikelihoods.java | 28 ++++-- ...NPGenotypeLikelihoodsCalculationModel.java | 5 +- .../genotyper/UnifiedGenotyperEngine.java | 96 +++++++++++++------ .../walkers/genotyper/UnifiedGenotyperV2.java | 2 +- .../sting/utils/sam/AlignmentUtils.java | 32 ++++--- .../sting/utils/sam/GATKSAMRecord.java | 79 +-------------- 7 files changed, 128 insertions(+), 131 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index cc17e9104..2b84e02a6 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -30,6 +30,8 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.collections.Pair; import org.broad.tribble.util.variantcontext.Genotype; @@ -133,7 +135,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { } HashMap attributes = new HashMap(); - attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size()); + attributes.put(VCFConstants.DEPTH_KEY, getUnfilteredDepth(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup())); GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods()); attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString()); @@ -151,7 +153,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { myAlleles.add(ref); HashMap attributes = new HashMap(); - attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size()); + attributes.put(VCFConstants.DEPTH_KEY, getUnfilteredDepth(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup())); GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods()); attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString()); @@ -159,7 +161,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { double GQ = GL.getAALikelihoods() - Math.max(GL.getABLikelihoods(), GL.getBBLikelihoods()); calls.put(sample, new Genotype(sample, myAlleles, GQ, null, attributes, false)); - } return calls; @@ -187,6 +188,16 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { return ( refLikelihoodMinusEpsilon > likelihoods[1] && refLikelihoodMinusEpsilon > likelihoods[2]); } + private int getUnfilteredDepth(ReadBackedPileup pileup) { + int count = 0; + for ( PileupElement p : pileup ) { + if ( DiploidSNPGenotypeLikelihoods.usableBase(p, true) ) + count++; + } + + return count; + } + protected class CalculatedAlleleFrequency { public double log10PNonRef; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 2500030c8..d98ec0063 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.genotyper; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; @@ -305,14 +306,9 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { int n = 0; for ( PileupElement p : pileup ) { - // ignore deletions - if ( p.isDeletion() ) - continue; - - byte base = p.getBase(); - if ( ! ignoreBadBases || ! badBase(base) ) { + if ( usableBase(p, ignoreBadBases) ) { byte qual = capBaseQualsAtMappingQual ? (byte)Math.min((int)p.getQual(), p.getMappingQual()) : p.getQual(); - n += add(base, qual, p.getRead(), p.getOffset()); + n += add(p.getBase(), qual, p.getRead(), p.getOffset()); } } @@ -452,6 +448,22 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { return fwdStrand ? 0 : 1; } + /** + * Returns true when the observedBase is considered usable. + * @param p pileup element + * @param ignoreBadBases should we ignore bad bases? + * @return true if the base is a usable base + */ + protected static boolean usableBase(PileupElement p, boolean ignoreBadBases) { + // ignore deletions and filtered bases + if ( p.isDeletion() || + (p.getRead() instanceof GATKSAMRecord && + !((GATKSAMRecord)p.getRead()).isGoodBase(p.getOffset())) ) + return false; + + return ( !ignoreBadBases || !badBase(p.getBase()) ); + } + /** * Returns true when the observedBase is considered bad and shouldn't be processed by this object. A base * is considered bad if: @@ -461,7 +473,7 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { * @param observedBase observed base * @return true if the base is a bad base */ - protected boolean badBase(byte observedBase) { + protected static boolean badBase(byte observedBase) { return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 2ba16ec9e..cffa3c465 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -84,7 +84,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC // create the GenotypeLikelihoods object DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.baseModel, (DiploidSNPGenotypePriors)priors, UAC.defaultPlatform); - GL.add(pileup, true, UAC.CAP_BASE_QUALITY); + int nGoodBases = GL.add(pileup, true, UAC.CAP_BASE_QUALITY); + if ( nGoodBases == 0 ) + continue; + double[] likelihoods = GL.getLikelihoods(); double[] posteriors = GL.getPosteriors(); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 70929c268..c86a9c357 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -40,12 +40,18 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.pileup.*; +import org.broadinstitute.sting.utils.sam.GATKSAMRecordFilter; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broad.tribble.vcf.VCFConstants; import org.broad.tribble.dbsnp.DbSNPFeature; import java.io.PrintStream; import java.util.*; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.StringUtil; +import net.sf.picard.reference.IndexedFastaSequenceFile; + public class UnifiedGenotyperEngine { @@ -77,13 +83,18 @@ public class UnifiedGenotyperEngine { private Logger logger = null; private PrintStream verboseWriter = null; + // fasta reference reader to supplement the edges of the reference sequence for long reads + private IndexedFastaSequenceFile referenceReader; + // number of chromosomes (2 * samples) in input private int N; // the standard filter to use for calls below the confidence threshold but above the emit threshold private static final Set filter = new HashSet(1); + private static final int MISMATCH_WINDOW_SIZE = 20; + public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) { // get the number of samples // if we're supposed to assume a single sample, do so @@ -92,14 +103,14 @@ public class UnifiedGenotyperEngine { numSamples = 1; else numSamples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()).size(); - initialize(UAC, null, null, null, numSamples); + initialize(toolkit, UAC, null, null, null, numSamples); } - public UnifiedGenotyperEngine(UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) { - initialize(UAC, logger, verboseWriter, engine, numSamples); + public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) { + initialize(toolkit, UAC, logger, verboseWriter, engine, numSamples); } - private void initialize(UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) { + private void initialize(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) { this.UAC = UAC; this.logger = logger; this.verboseWriter = verboseWriter; @@ -111,6 +122,8 @@ public class UnifiedGenotyperEngine { genotypePriors = createGenotypePriors(UAC); filter.add(LOW_QUAL_FILTER_NAME); + + referenceReader = new IndexedFastaSequenceFile(toolkit.getArguments().referenceFile); } /** @@ -130,7 +143,7 @@ public class UnifiedGenotyperEngine { afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); } - BadReadPileupFilter badReadPileupFilter = new BadReadPileupFilter(refContext); + BadBaseFilter badReadPileupFilter = new BadBaseFilter(refContext, UAC); Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, badReadPileupFilter); if ( stratifiedContexts == null ) return null; @@ -280,7 +293,7 @@ public class UnifiedGenotyperEngine { return ( d >= 0.0 && d <= 1.0 ); } - private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, BadReadPileupFilter badReadPileupFilter) { + private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, BadBaseFilter badBaseFilter) { Map stratifiedContexts = null; if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL && rawContext.hasExtendedEventPileup() ) { @@ -290,9 +303,6 @@ public class UnifiedGenotyperEngine { // filter the context based on min mapping quality ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE); - // filter the context based on bad mates and mismatch rate - pileup = pileup.getFilteredPileup(badReadPileupFilter); - // don't call when there is no coverage if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE ) return null; @@ -306,13 +316,13 @@ public class UnifiedGenotyperEngine { if ( !BaseUtils.isRegularBase(ref) ) return null; - ReadBackedPileup rawPileup = rawContext.getBasePileup(); + ReadBackedPileup pileup = rawContext.getBasePileup(); - // filter the context based on min base and mapping qualities - ReadBackedPileup pileup = rawPileup.getBaseAndMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE, UAC.MIN_MAPPING_QUALTY_SCORE); + // filter the reads + filterPileup(pileup, badBaseFilter); // filter the context based on bad mates and mismatch rate - pileup = pileup.getFilteredPileup(badReadPileupFilter); + //pileup = pileup.getFilteredPileup(badReadPileupFilter); // don't call when there is no coverage if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE ) @@ -395,23 +405,55 @@ public class UnifiedGenotyperEngine { verboseWriter.println(); } - /** - * Filters low quality reads out of the pileup. - */ - private class BadReadPileupFilter implements PileupElementFilter { - private ReferenceContext refContext; + private void filterPileup(ReadBackedPileup pileup, BadBaseFilter badBaseFilter) { + for ( PileupElement p : pileup ) { + final SAMRecord read = p.getRead(); + if ( read instanceof GATKSAMRecord ) + ((GATKSAMRecord)read).setGoodBases(badBaseFilter, true); + } + } - public BadReadPileupFilter(ReferenceContext ref) { - // create the +/-20bp window - GenomeLoc window = GenomeLocParser.createGenomeLoc(ref.getLocus().getContig(), Math.max(ref.getWindow().getStart(), ref.getLocus().getStart()-20), Math.min(ref.getWindow().getStop(), ref.getLocus().getStart()+20)); - byte[] bases = new byte[41]; - System.arraycopy(ref.getBases(), (int)Math.max(0, window.getStart()-ref.getWindow().getStart()), bases, 0, (int)window.size()); - refContext = new ReferenceContext(ref.getLocus(), window, bases); + /** + * Filters low quality bases out of the SAMRecords. + */ + private class BadBaseFilter implements GATKSAMRecordFilter { + private ReferenceContext refContext; + private final UnifiedArgumentCollection UAC; + + public BadBaseFilter(ReferenceContext refContext, UnifiedArgumentCollection UAC) { + this.refContext = refContext; + this.UAC = UAC; } - public boolean allow(PileupElement pileupElement) { - return ((UAC.USE_BADLY_MATED_READS || !BadMateFilter.hasBadMate(pileupElement.getRead())) && - AlignmentUtils.mismatchesInRefWindow(pileupElement, refContext, true) <= UAC.MAX_MISMATCHES ); + public BitSet getGoodBases(final GATKSAMRecord record) { + // all bits are set to false by default + BitSet bitset = new BitSet(record.getReadLength()); + + // if the mapping quality is too low or the mate is bad, we can just zero out the whole read and continue + if ( record.getMappingQuality() < UAC.MIN_MAPPING_QUALTY_SCORE || + (!UAC.USE_BADLY_MATED_READS && BadMateFilter.hasBadMate(record)) ) { + return bitset; + } + + byte[] quals = record.getBaseQualities(); + for (int i = 0; i < quals.length; i++) { + if ( quals[i] >= UAC.MIN_BASE_QUALTY_SCORE ) + bitset.set(i); + } + + // if a read is too long for the reference context, extend the context + if ( record.getAlignmentEnd() > refContext.getWindow().getStop() ) { + GenomeLoc window = GenomeLocParser.createGenomeLoc(refContext.getLocus().getContig(), refContext.getWindow().getStart(), record.getAlignmentEnd()); + byte[] bases = referenceReader.getSubsequenceAt(window.getContig(), window.getStart(), window.getStop()).getBases(); + StringUtil.toUpperCase(bases); + refContext = new ReferenceContext(refContext.getLocus(), window, bases); + } + + BitSet mismatches = AlignmentUtils.mismatchesInRefWindow(record, refContext, UAC.MAX_MISMATCHES, MISMATCH_WINDOW_SIZE); + if ( mismatches != null ) + bitset.and(mismatches); + + return bitset; } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperV2.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperV2.java index 5c2354d75..99e8c4544 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperV2.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperV2.java @@ -128,7 +128,7 @@ public class UnifiedGenotyperV2 extends LocusWalker= refBases.length ) { - // TODO -- fix me - return null; + throw new IllegalStateException("When calculating mismatches, we somehow don't have enough trailing reference context for read " + read.getReadName() + " at position " + ref.getLocus()); } + byte refChr = refBases[refIndex]; byte readChr = readBases[readIndex]; if ( readChr != refChr ) mismatches.set(readIndex); + + refIndex++; } break; case I: @@ -299,7 +303,9 @@ public class AlignmentUtils { break; case D: case N: - refIndex += cigarElementLength; + if ( currentReadPos >= readStartPos ) + refIndex += cigarElementLength; + currentReadPos += cigarElementLength; break; case H: case P: diff --git a/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index c624b09b8..ad557471a 100755 --- a/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -1,10 +1,8 @@ package org.broadinstitute.sting.utils.sam; -import java.lang.reflect.Method; import java.util.*; import net.sf.samtools.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; /** @@ -18,11 +16,6 @@ import org.broadinstitute.sting.utils.exceptions.UserException; * if they are ever modified externally then one must also invoke the * setReadGroup() method here to ensure that the cache is kept up-to-date. * - * 13 Oct 2010 - mhanna - this class is fundamentally flawed: it uses a decorator - * pattern to wrap a heavyweight object, which can lead - * to heinous side effects if the wrapping is not carefully - * done. Hopefully SAMRecord will become an interface and - * this will eventually be fixed. */ public class GATKSAMRecord extends SAMRecord { @@ -83,7 +76,7 @@ public class GATKSAMRecord extends SAMRecord { } public boolean isGoodBase(int index) { - return ( mBitSet == null || mBitSet.length() <= index ? true : mBitSet.get(index)); + return ( mBitSet == null || mBitSet.size() <= index ? true : mBitSet.get(index)); } /////////////////////////////////////////////////////////////////////////////// @@ -334,76 +327,6 @@ public class GATKSAMRecord extends SAMRecord { public void setValidationStringency(net.sf.samtools.SAMFileReader.ValidationStringency validationStringency) { mRecord.setValidationStringency(validationStringency); } - public Object getAttribute(final String tag) { return mRecord.getAttribute(tag); } - - public Integer getIntegerAttribute(final String tag) { return mRecord.getIntegerAttribute(tag); } - - public Short getShortAttribute(final String tag) { return mRecord.getShortAttribute(tag); } - - public Byte getByteAttribute(final String tag) { return mRecord.getByteAttribute(tag); } - - public String getStringAttribute(final String tag) { return mRecord.getStringAttribute(tag); } - - public Character getCharacterAttribute(final String tag) { return mRecord.getCharacterAttribute(tag); } - - public Float getFloatAttribute(final String tag) { return mRecord.getFloatAttribute(tag); } - - public byte[] getByteArrayAttribute(final String tag) { return mRecord.getByteArrayAttribute(tag); } - - protected Object getAttribute(final short tag) { - Object attribute; - try { - Method method = mRecord.getClass().getDeclaredMethod("getAttribute",Short.TYPE); - method.setAccessible(true); - attribute = method.invoke(mRecord,tag); - } - catch(Exception ex) { - throw new ReviewedStingException("Unable to invoke getAttribute method",ex); - } - return attribute; - } - - public void setAttribute(final String tag, final Object value) { mRecord.setAttribute(tag,value); } - - protected void setAttribute(final short tag, final Object value) { - try { - Method method = mRecord.getClass().getDeclaredMethod("setAttribute",Short.TYPE,Object.class); - method.setAccessible(true); - method.invoke(mRecord,tag,value); - } - catch(Exception ex) { - throw new ReviewedStingException("Unable to invoke setAttribute method",ex); - } - } - - public void clearAttributes() { mRecord.clearAttributes(); } - - protected void setAttributes(final SAMBinaryTagAndValue attributes) { - try { - Method method = mRecord.getClass().getDeclaredMethod("setAttributes",SAMBinaryTagAndValue.class); - method.setAccessible(true); - method.invoke(mRecord,attributes); - } - catch(Exception ex) { - throw new ReviewedStingException("Unable to invoke setAttributes method",ex); - } - } - - protected SAMBinaryTagAndValue getBinaryAttributes() { - SAMBinaryTagAndValue binaryAttributes; - try { - Method method = mRecord.getClass().getDeclaredMethod("getBinaryAttributes"); - method.setAccessible(true); - binaryAttributes = (SAMBinaryTagAndValue)method.invoke(mRecord); - } - catch(Exception ex) { - throw new ReviewedStingException("Unable to invoke getBinaryAttributes method",ex); - } - return binaryAttributes; - } - - public List getAttributes() { return mRecord.getAttributes(); } - public SAMFileHeader getHeader() { return mRecord.getHeader(); } public void setHeader(SAMFileHeader samFileHeader) { mRecord.setHeader(samFileHeader); }