diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index a90f8959d..1fb873e81 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -71,6 +71,13 @@ public class LikelihoodCalculationEngine { private final PairHMM pairHMM; private final int minReadLength = 20; + /** + * The expected rate of random sequencing errors for a read originating from its true haplotype. + * + * For example, if this is 0.01, then we'd expect 1 error per 100 bp. + */ + private final double EXPECTED_ERROR_RATE_PER_BASE = 0.02; + public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) { switch (hmmType) { @@ -127,7 +134,14 @@ public class LikelihoodCalculationEngine { for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } // evaluate the likelihood of the reads given those haplotypes - stratifiedReadMap.put(sampleEntry.getKey(), computeReadLikelihoods(haplotypes, sampleEntry.getValue())); + final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); + + final List removedReads = map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); +// logger.info("Removed " + removedReads.size() + " reads because of bad likelihoods from sample " + sampleEntry.getKey()); +// for ( final GATKSAMRecord read : removedReads ) +// logger.info("\tRemoved " + read.getReadName()); + + stratifiedReadMap.put(sampleEntry.getKey(), map); } return stratifiedReadMap; @@ -170,6 +184,7 @@ public class LikelihoodCalculationEngine { perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l); } } + return perReadAlleleLikelihoodMap; } diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java index 84bdfd19b..c50849a54 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java @@ -46,6 +46,8 @@ package org.broadinstitute.sting.utils.genotyper; +import net.sf.samtools.*; +import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.sting.utils.BaseUtils; @@ -54,33 +56,16 @@ import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.sting.utils.Utils; import java.util.Map; import java.util.List; import org.testng.Assert; import org.testng.annotations.Test; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.broadinstitute.variant.vcf.VCFCodec; import java.io.File; import java.io.FileNotFoundException; import java.util.*; @@ -235,7 +220,82 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest { Assert.assertEquals(downsampledStrat.get(base_A).size(),(int) (pileup.depthOfCoverage()/2) - 1); Assert.assertEquals(downsampledStrat.get(base_C).size(),(int) (pileup.depthOfCoverage()/2)); Assert.assertEquals(downsampledStrat.get(base_T).size(),0); + } + + @DataProvider(name = "PoorlyModelledReadData") + public Object[][] makePoorlyModelledReadData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{10, 0.1, false, Arrays.asList(0.0)}); + tests.add(new Object[]{10, 0.1, true, Arrays.asList(-10.0)}); + tests.add(new Object[]{10, 0.1, false, Arrays.asList(0.0, -10.0)}); + tests.add(new Object[]{10, 0.1, true, Arrays.asList(-5.0, -10.0)}); + tests.add(new Object[]{100, 0.1, false, Arrays.asList(-5.0, -10.0)}); + tests.add(new Object[]{100, 0.01, true, Arrays.asList(-5.0, -10.0)}); + tests.add(new Object[]{100, 0.01, false, Arrays.asList(-5.0, -10.0, -3.0)}); + tests.add(new Object[]{100, 0.01, false, Arrays.asList(-5.0, -10.0, -2.0)}); + tests.add(new Object[]{100, 0.01, true, Arrays.asList(-5.0, -10.0, -4.0)}); + tests.add(new Object[]{100, 0.001, true, Arrays.asList(-5.0, -10.0)}); + tests.add(new Object[]{100, 0.001, false, Arrays.asList(-5.0, -10.0, 0.0)}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "PoorlyModelledReadData") + public void testPoorlyModelledRead(final int readLen, final double maxErrorRatePerBase, final boolean expected, final List log10likelihoods) { + final byte[] bases = Utils.dupBytes((byte)'A', readLen); + final byte[] quals = Utils.dupBytes((byte) 30, readLen); + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, readLen + "M"); + + final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap(); + final boolean actual = map.readIsPoorlyModelled(read, log10likelihoods, maxErrorRatePerBase); + Assert.assertEquals(actual, expected); + } + @DataProvider(name = "RemovingPoorlyModelledReadData") + public Object[][] makeRemovingPoorlyModelledReadData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + final int readLen = 10; + for ( int nReads = 0; nReads < 4; nReads++ ) { + for ( int nBad = 0; nBad <= nReads; nBad++ ) { + final int nGood = nReads - nBad; + tests.add(new Object[]{readLen, nReads, nBad, nGood}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "RemovingPoorlyModelledReadData") + public void testRemovingPoorlyModelledReads(final int readLen, final int nReads, final int nBad, final int nGood) { + final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap(); + final Set goodReads = new HashSet(); + final Set badReads = new HashSet(); + for ( int readI = 0; readI < nReads; readI++ ) { + final boolean bad = readI < nBad; + final double likelihood = bad ? -100.0 : 0.0; + + final byte[] bases = Utils.dupBytes((byte)'A', readLen); + final byte[] quals = Utils.dupBytes((byte) 30, readLen); + + final Allele allele = Allele.create(Utils.dupString("A", readI+1)); + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, readLen + "M"); + read.setReadName("readName" + readI); + map.add(read, allele, likelihood); + (bad ? badReads : goodReads).add(read); + } + + final List removedReads = map.filterPoorlyModelledReads(0.01); + Assert.assertEquals(removedReads.size(), nBad, "nBad " + nBad + " nGood " + nGood); + Assert.assertEquals(new HashSet(removedReads), badReads, "nBad " + nBad + " nGood " + nGood); + Assert.assertEquals(map.size(), nGood, "nBad " + nBad + " nGood " + nGood); + Assert.assertTrue(map.getStoredElements().containsAll(goodReads), "nBad " + nBad + " nGood " + nGood); + Assert.assertEquals(map.getStoredElements().size(), nGood, "nBad " + nBad + " nGood " + nGood); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 02618100d..201e3b9b4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -251,4 +251,62 @@ public class PerReadAlleleLikelihoodMap { } return sb.toString(); } + + /** + * Remove reads from this map that are poorly modelled w.r.t. their per allele likelihoods + * + * Goes through each read in this map, and if it is poorly modelled removes it from the map. + * + * @see #readIsPoorlyModelled(org.broadinstitute.sting.utils.sam.GATKSAMRecord, java.util.Collection, double) + * for more information about the poorly modelled test. + * + * @param maxErrorRatePerBase see equivalent parameter in #readIsPoorlyModelled + * @return the list of reads removed from this map because they are poorly modelled + */ + public List filterPoorlyModelledReads(final double maxErrorRatePerBase) { + final List removedReads = new LinkedList(); + final Iterator>> it = likelihoodReadMap.entrySet().iterator(); + while ( it.hasNext() ) { + final Map.Entry> record = it.next(); + if ( readIsPoorlyModelled(record.getKey(), record.getValue().values(), maxErrorRatePerBase) ) { + it.remove(); + removedReads.add(record.getKey()); + } + } + + return removedReads; + } + + /** + * Is this read poorly modelled by any of the alleles in this map? + * + * A read is poorly modeled when it's likelihood is below what would be expected for a read + * originating from one of the alleles given the maxErrorRatePerBase of the reads in general. + * + * This function makes a number of key assumptions. First, that the likelihoods reflect the total likelihood + * of the read. In other words, that the read would be fully explained by one of the alleles. This means + * that the allele should be something like the full haplotype from which the read might originate. + * + * It further assumes that each error in the read occurs with likelihood of -3 (Q30 confidence per base). So + * a read with a 10% error rate with Q30 bases that's 100 bp long we'd expect to see 10 real Q30 errors + * even against the true haplotype. So for this read to be well modelled by at least one allele we'd expect + * a likelihood to be >= 10 * -3. + * + * @param read the read we want to evaluate + * @param log10Likelihoods a list of the log10 likelihoods of the read against a set of haplotypes. + * @param maxErrorRatePerBase the maximum error rate we'd expect for this read per base, in real space. So + * 0.01 means a 1% error rate + * @return true if none of the log10 likelihoods imply that the read truly originated from one of the haplotypes + */ + protected boolean readIsPoorlyModelled(final GATKSAMRecord read, final Collection log10Likelihoods, final double maxErrorRatePerBase) { + final double maxErrorsForRead = Math.ceil(read.getReadLength() * maxErrorRatePerBase); + final double log10QualPerBase = -3.0; + final double log10MaxLikelihoodForTrueAllele = maxErrorsForRead * log10QualPerBase; + + for ( final double log10Likelihood : log10Likelihoods ) + if ( log10Likelihood >= log10MaxLikelihoodForTrueAllele ) + return false; + + return true; + } }