From 15461567d77e9de85e339c5cbcd7ebf6357246a6 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 3 Apr 2013 18:47:59 -0400 Subject: [PATCH] HaplotypeCaller no longer uses reads with poor likelihoods w.r.t. any haplotype -- The previous likelihood calculation proceeds as normal, but after each read has been evaluated against each haplotype we go through the read / allele / likelihoods map and eliminate all reads that have poor fit to any of the haplotypes. This functionality stops us from making a particular type of error in the HC, where we have a haplotype that's very far from the reference allele but not the right true haplotype. All of the reads that are slightly closer to this FP haplotype than the reference previously generated enormous likelihoods in favor of this FP haplotype because they were closer to it than the reference, even if each read had many mismatches w.r.t. the FP haplotype (and so the FP haplotype was a bad model for the true underlying haplotype). --- .../LikelihoodCalculationEngine.java | 17 +++- .../PerReadAlleleLikelihoodMapUnitTest.java | 94 +++++++++++++++---- .../genotyper/PerReadAlleleLikelihoodMap.java | 58 ++++++++++++ 3 files changed, 151 insertions(+), 18 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index a90f8959d..1fb873e81 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -71,6 +71,13 @@ public class LikelihoodCalculationEngine { private final PairHMM pairHMM; private final int minReadLength = 20; + /** + * The expected rate of random sequencing errors for a read originating from its true haplotype. 
+ * + * For example, if this is 0.01, then we'd expect 1 error per 100 bp. + */ + private final double EXPECTED_ERROR_RATE_PER_BASE = 0.02; + public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) { switch (hmmType) { @@ -127,7 +134,14 @@ public class LikelihoodCalculationEngine { for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } // evaluate the likelihood of the reads given those haplotypes - stratifiedReadMap.put(sampleEntry.getKey(), computeReadLikelihoods(haplotypes, sampleEntry.getValue())); + final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); + + final List removedReads = map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); +// logger.info("Removed " + removedReads.size() + " reads because of bad likelihoods from sample " + sampleEntry.getKey()); +// for ( final GATKSAMRecord read : removedReads ) +// logger.info("\tRemoved " + read.getReadName()); + + stratifiedReadMap.put(sampleEntry.getKey(), map); } return stratifiedReadMap; @@ -170,6 +184,7 @@ public class LikelihoodCalculationEngine { perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l); } } + return perReadAlleleLikelihoodMap; } diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java index 84bdfd19b..c50849a54 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java @@ -46,6 +46,8 @@ package org.broadinstitute.sting.utils.genotyper; +import net.sf.samtools.*; +import 
org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.sting.utils.BaseUtils; @@ -54,33 +56,16 @@ import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.sting.utils.Utils; import java.util.Map; import java.util.List; import org.testng.Assert; import org.testng.annotations.Test; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.broadinstitute.variant.vcf.VCFCodec; import java.io.File; import java.io.FileNotFoundException; import java.util.*; @@ -235,7 +220,82 @@ public class 
PerReadAlleleLikelihoodMapUnitTest extends BaseTest { Assert.assertEquals(downsampledStrat.get(base_A).size(),(int) (pileup.depthOfCoverage()/2) - 1); Assert.assertEquals(downsampledStrat.get(base_C).size(),(int) (pileup.depthOfCoverage()/2)); Assert.assertEquals(downsampledStrat.get(base_T).size(),0); + } + + @DataProvider(name = "PoorlyModelledReadData") + public Object[][] makePoorlyModelledReadData() { + List<Object[]> tests = new ArrayList<Object[]>(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{10, 0.1, false, Arrays.asList(0.0)}); + tests.add(new Object[]{10, 0.1, true, Arrays.asList(-10.0)}); + tests.add(new Object[]{10, 0.1, false, Arrays.asList(0.0, -10.0)}); + tests.add(new Object[]{10, 0.1, true, Arrays.asList(-5.0, -10.0)}); + tests.add(new Object[]{100, 0.1, false, Arrays.asList(-5.0, -10.0)}); + tests.add(new Object[]{100, 0.01, true, Arrays.asList(-5.0, -10.0)}); + tests.add(new Object[]{100, 0.01, false, Arrays.asList(-5.0, -10.0, -3.0)}); + tests.add(new Object[]{100, 0.01, false, Arrays.asList(-5.0, -10.0, -2.0)}); + tests.add(new Object[]{100, 0.01, true, Arrays.asList(-5.0, -10.0, -4.0)}); + tests.add(new Object[]{100, 0.001, true, Arrays.asList(-5.0, -10.0)}); + tests.add(new Object[]{100, 0.001, false, Arrays.asList(-5.0, -10.0, 0.0)}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "PoorlyModelledReadData") + public void testPoorlyModelledRead(final int readLen, final double maxErrorRatePerBase, final boolean expected, final List<Double> log10likelihoods) { + final byte[] bases = Utils.dupBytes((byte)'A', readLen); + final byte[] quals = Utils.dupBytes((byte) 30, readLen); + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, readLen + "M"); + + final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap(); + final boolean actual = map.readIsPoorlyModelled(read, log10likelihoods, maxErrorRatePerBase); + 
Assert.assertEquals(actual, expected); + } + + @DataProvider(name = "RemovingPoorlyModelledReadData") + public Object[][] makeRemovingPoorlyModelledReadData() { + List<Object[]> tests = new ArrayList<Object[]>(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + final int readLen = 10; + for ( int nReads = 0; nReads < 4; nReads++ ) { + for ( int nBad = 0; nBad <= nReads; nBad++ ) { + final int nGood = nReads - nBad; + tests.add(new Object[]{readLen, nReads, nBad, nGood}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "RemovingPoorlyModelledReadData") + public void testRemovingPoorlyModelledReads(final int readLen, final int nReads, final int nBad, final int nGood) { + final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap(); + final Set<GATKSAMRecord> goodReads = new HashSet<GATKSAMRecord>(); + final Set<GATKSAMRecord> badReads = new HashSet<GATKSAMRecord>(); + for ( int readI = 0; readI < nReads; readI++ ) { + final boolean bad = readI < nBad; + final double likelihood = bad ? -100.0 : 0.0; + + final byte[] bases = Utils.dupBytes((byte)'A', readLen); + final byte[] quals = Utils.dupBytes((byte) 30, readLen); + + final Allele allele = Allele.create(Utils.dupString("A", readI+1)); + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, readLen + "M"); + read.setReadName("readName" + readI); + map.add(read, allele, likelihood); + (bad ? 
badReads : goodReads).add(read); + } + + final List<GATKSAMRecord> removedReads = map.filterPoorlyModelledReads(0.01); + Assert.assertEquals(removedReads.size(), nBad, "nBad " + nBad + " nGood " + nGood); + Assert.assertEquals(new HashSet<GATKSAMRecord>(removedReads), badReads, "nBad " + nBad + " nGood " + nGood); + Assert.assertEquals(map.size(), nGood, "nBad " + nBad + " nGood " + nGood); + Assert.assertTrue(map.getStoredElements().containsAll(goodReads), "nBad " + nBad + " nGood " + nGood); + Assert.assertEquals(map.getStoredElements().size(), nGood, "nBad " + nBad + " nGood " + nGood); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 02618100d..201e3b9b4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -251,4 +251,62 @@ public class PerReadAlleleLikelihoodMap { } return sb.toString(); } + + /** + * Remove reads from this map that are poorly modelled w.r.t. their per allele likelihoods + * + * Goes through each read in this map, and if it is poorly modelled removes it from the map. + * + * @see #readIsPoorlyModelled(org.broadinstitute.sting.utils.sam.GATKSAMRecord, java.util.Collection, double) + * for more information about the poorly modelled test. 
+ * + * @param maxErrorRatePerBase see equivalent parameter in #readIsPoorlyModelled + * @return the list of reads removed from this map because they are poorly modelled + */ + public List<GATKSAMRecord> filterPoorlyModelledReads(final double maxErrorRatePerBase) { + final List<GATKSAMRecord> removedReads = new LinkedList<GATKSAMRecord>(); + final Iterator<Map.Entry<GATKSAMRecord, Map<Allele, Double>>> it = likelihoodReadMap.entrySet().iterator(); + while ( it.hasNext() ) { + final Map.Entry<GATKSAMRecord, Map<Allele, Double>> record = it.next(); + if ( readIsPoorlyModelled(record.getKey(), record.getValue().values(), maxErrorRatePerBase) ) { + it.remove(); + removedReads.add(record.getKey()); + } + } + + return removedReads; + } + + /** + * Is this read poorly modelled by any of the alleles in this map? + * + * A read is poorly modeled when its likelihood is below what would be expected for a read + * originating from one of the alleles given the maxErrorRatePerBase of the reads in general. + * + * This function makes a number of key assumptions. First, that the likelihoods reflect the total likelihood + * of the read. In other words, that the read would be fully explained by one of the alleles. This means + * that the allele should be something like the full haplotype from which the read might originate. + * + * It further assumes that each error in the read occurs with likelihood of -3 (Q30 confidence per base). So + * a read with a 10% error rate with Q30 bases that's 100 bp long we'd expect to see 10 real Q30 errors + * even against the true haplotype. So for this read to be well modelled by at least one allele we'd expect + * a likelihood to be >= 10 * -3. + * + * @param read the read we want to evaluate + * @param log10Likelihoods a list of the log10 likelihoods of the read against a set of haplotypes. + * @param maxErrorRatePerBase the maximum error rate we'd expect for this read per base, in real space. 
So + * 0.01 means a 1% error rate + * @return true if none of the log10 likelihoods imply that the read truly originated from one of the haplotypes + */ + protected boolean readIsPoorlyModelled(final GATKSAMRecord read, final Collection<Double> log10Likelihoods, final double maxErrorRatePerBase) { + final double maxErrorsForRead = Math.ceil(read.getReadLength() * maxErrorRatePerBase); + final double log10QualPerBase = -3.0; + final double log10MaxLikelihoodForTrueAllele = maxErrorsForRead * log10QualPerBase; + + for ( final double log10Likelihood : log10Likelihoods ) + if ( log10Likelihood >= log10MaxLikelihoodForTrueAllele ) + return false; + + return true; + } }