First pass of implementation of Reduced Reads with HaplotypeCaller. Main changes: a) Active region: scale PL's by representative count to determine whether region is active. b) Scale per-read, per-haplotype likelihoods by read representative counts. A read representative count is (temporarily) defined as the average representative count over all bases in read, TBD whether this is good enough to avoid biases in GL's. c) DeBruijn assembler inserts kmers N times in graph, where N is min representative count of read over kmer span - TBD again whether this is the best approach. d) Bug fixes in FragmentUtils: logic to merge fragments was wrong in cases where there is discrepancy of overlaps between unclipped/soft clipped bases. Didn't affect things before but RR makes prevalence of hard-clipped bases in CIGARs more prevalent so this was exposed. e) Cache read representative counts along with read likelihoods associated with a Haplotype. Code can/should be cleaned up and unified with PairHMMIndelErrorModelCode, as well as refactored to support arbitrary ploidy in HaplotypeCaller

2012-08-03 12:24:23 -04:00 · 2012-08-03 12:24:23 -04:00 · 9e25b209e0
parent 9ac72dbd4d
commit 9e25b209e0
7 changed files with 115 additions and 16 deletions
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@ -356,9 +356,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
                            }
                        }
                    }
-                    genotypeLikelihoods[AA] += QualityUtils.qualToProbLog10(qual);
-                    genotypeLikelihoods[AB] += MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF );
-                    genotypeLikelihoods[BB] += QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD;
+                    genotypeLikelihoods[AA] += p.getRepresentativeCount()* QualityUtils.qualToProbLog10(qual);
+                    genotypeLikelihoods[AB] += p.getRepresentativeCount()* MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF );
+                    genotypeLikelihoods[BB] += p.getRepresentativeCount()* QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD;
                }
            }
            genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
@ -50,7 +50,6 @@ public class LikelihoodCalculationEngine {
    }

    public void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList ) {
-        final int numHaplotypes = haplotypes.size();

        int X_METRIC_LENGTH = 0;
        for( final String sample : perSampleReadList.keySet() ) {
@ -60,8 +59,8 @@ public class LikelihoodCalculationEngine {
            }
        }
        int Y_METRIC_LENGTH = 0;
-        for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
-            final int haplotypeLength = haplotypes.get(jjj).getBases().length;
+        for( Haplotype h: haplotypes ) {
+            final int haplotypeLength = h.getBases().length;
            if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; }
        }

@ -90,8 +89,10 @@ public class LikelihoodCalculationEngine {
        final int numHaplotypes = haplotypes.size();
        final int numReads = reads.size();
        final double[][] readLikelihoods = new double[numHaplotypes][numReads];
+        final int[][] readCounts = new int[numHaplotypes][numReads];
        for( int iii = 0; iii < numReads; iii++ ) {
            final GATKSAMRecord read = reads.get(iii);
+            final int readCount =  getRepresentativeReadCount(read);

            final byte[] overallGCP = new byte[read.getReadLength()];
            Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
@ -114,13 +115,23 @@ public class LikelihoodCalculationEngine {
                readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(),
                        readQuals, readInsQuals, readDelQuals, overallGCP,
                        haplotypeStart, matchMetricArray, XMetricArray, YMetricArray);
+                readCounts[jjj][iii] = readCount;
            }
        }
        for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
-            haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj] );
+            haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj], readCounts[jjj] );
        }
    }

+    private static int getRepresentativeReadCount(GATKSAMRecord read) {
+        if (!read.isReducedRead())
+            return 1;
+
+        // compute mean representative read counts
+        final byte[] counts = read.getReducedReadCounts();
+        return MathUtils.sum(counts)/counts.length;
+    }
+
    private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) {
        for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ){
            if( b1[iii] != b2[iii] ) {
@ -154,17 +165,20 @@ public class LikelihoodCalculationEngine {
        }

        // compute the diploid haplotype likelihoods
+        // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code
        for( int iii = 0; iii < numHaplotypes; iii++ ) {
            for( int jjj = 0; jjj <= iii; jjj++ ) {                
                for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) {
                    final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample);
+                    final int[] readCounts_iii = iii_mapped.getReadCounts(sample);
                    for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) {
                        final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample);
                        double haplotypeLikelihood = 0.0;
                        for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
                            // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
+                            // log10(10^(a*x1) + 10^(b*x2))
                            // First term is approximated by Jacobian log with table lookup.
-                            haplotypeLikelihood += MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF;
+                            haplotypeLikelihood +=readCounts_iii[kkk] *( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF);
                        }
                        haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum?
                    }
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java
@ -4,6 +4,7 @@ import com.google.java.contract.Ensures;
 import org.apache.commons.lang.ArrayUtils;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.Haplotype;
+import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.SWPairwiseAlignment;
 import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -68,7 +69,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
        return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
    }

-    private void createDeBruijnGraphs( final ArrayList<GATKSAMRecord> reads, final Haplotype refHaplotype ) {
+    protected void createDeBruijnGraphs( final List<GATKSAMRecord> reads, final Haplotype refHaplotype ) {
        graphs.clear();

        // create the graph
@ -161,7 +162,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
        }
    }

-    private static boolean createGraphFromSequences( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final ArrayList<GATKSAMRecord> reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
+    private static boolean createGraphFromSequences( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final Collection<GATKSAMRecord> reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
        final byte[] refSequence = refHaplotype.getBases();
        if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) {
            final int kmersInSequence = refSequence.length - KMER_LENGTH + 1;
@ -183,6 +184,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
        for( final GATKSAMRecord read : reads ) {
            final byte[] sequence = read.getReadBases();
            final byte[] qualities = read.getBaseQualities();
+            final byte[] reducedReadCounts = read.getReducedReadCounts();  // will be null if read is not readuced
            if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) {
                final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
                for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {                    
@ -194,6 +196,12 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
                            break;
                        }
                    }
+                    int countNumber = 1;
+                    if (read.isReducedRead()) {
+                        // compute min (?) number of reduced read counts in current kmer span
+                        countNumber = MathUtils.arrayMin(Arrays.copyOfRange(reducedReadCounts,iii,iii+KMER_LENGTH+1));
+                    }
+
                    if( !badKmer ) {
                        // get the kmers
                        final byte[] kmer1 = new byte[KMER_LENGTH];
@ -201,7 +209,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
                        final byte[] kmer2 = new byte[KMER_LENGTH];
                        System.arraycopy(sequence, iii+1, kmer2, 0, KMER_LENGTH);

-                        addKmersToGraph(graph, kmer1, kmer2, false);
+                        for (int k=0; k < countNumber; k++)
+                            addKmersToGraph(graph, kmer1, kmer2, false);
                    }
                }
            }
@ -230,7 +239,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
        return true;
    }

-    private void printGraphs() {
+    protected void printGraphs() {
        int count = 0;
        for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
            GRAPH_WRITER.println("digraph kmer" + count++ +" {");
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java
@ -7,6 +7,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
 */

 import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.walkers.genotyper.ArtificialReadPileupTestProvider;
 import org.broadinstitute.sting.utils.Haplotype;
 import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.variantcontext.Allele;
@ -18,6 +20,7 @@ import org.testng.annotations.Test;

 import java.io.File;
 import java.io.FileNotFoundException;
+import java.io.PrintStream;
 import java.util.*;

 public class SimpleDeBruijnAssemblerUnitTest extends BaseTest {
@ -143,6 +146,43 @@ public class SimpleDeBruijnAssemblerUnitTest extends BaseTest {
        Assert.assertTrue(graphEquals(graph, expectedGraph));
    }

+    @Test(enabled=true)
+    public void testBasicGraphCreation() {
+        final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref");
+        final byte refBase = refPileupTestProvider.getReferenceContext().getBase();
+        final String altBase = (refBase==(byte)'A'?"C":"A");
+        final int matches = 50;
+        final int mismatches = 50;
+        Map<String,AlignmentContext> refContext = refPileupTestProvider.getAlignmentContextFromAlleles(0, altBase, new int[]{matches, mismatches}, false, 30);
+        PrintStream graphWriter = null;
+
+        try{
+            graphWriter = new PrintStream("du.txt");
+        } catch (Exception e) {}
+
+
+        SimpleDeBruijnAssembler assembler = new SimpleDeBruijnAssembler(true,graphWriter);
+        final Haplotype refHaplotype = new Haplotype(refPileupTestProvider.getReferenceContext().getBases());
+        refHaplotype.setIsReference(true);
+        assembler.createDeBruijnGraphs(refContext.get(refPileupTestProvider.getSampleNames().get(0)).getBasePileup().getReads(), refHaplotype);
+
+/*        // clean up the graphs by pruning and merging
+        for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
+            SimpleDeBruijnAssembler.pruneGraph( graph, PRUNE_FACTOR );
+            //eliminateNonRefPaths( graph );
+            SimpleDeBruijnAssembler.mergeNodes( graph );
+        }
+  */
+        if( graphWriter != null ) {
+            assembler.printGraphs();
+        }
+
+        int k=2;
+
+        // find the best paths in the graphs
+    //    return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
+
+    }
    @Test(enabled = true)
    public void testEliminateNonRefPaths() {
        DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
--- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java
+++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java
@ -40,6 +40,7 @@ public class Haplotype {
    protected final double[] quals;
    private GenomeLoc genomeLocation = null;
    private HashMap<String, double[]> readLikelihoodsPerSample = null;
+    private HashMap<String, int[]> readCountsPerSample = null;
    private HashMap<Integer, VariantContext> eventMap = null;
    private boolean isRef = false;
    private Cigar cigar;
@ -83,18 +84,27 @@ public class Haplotype {
        return Arrays.hashCode(bases);
    }

-    public void addReadLikelihoods( final String sample, final double[] readLikelihoods ) {
+    public void addReadLikelihoods( final String sample, final double[] readLikelihoods, final int[] readCounts ) {
        if( readLikelihoodsPerSample == null ) {
            readLikelihoodsPerSample = new HashMap<String, double[]>();
        }
        readLikelihoodsPerSample.put(sample, readLikelihoods);
+        if( readCountsPerSample == null ) {
+            readCountsPerSample = new HashMap<String, int[]>();
+        }
+        readCountsPerSample.put(sample, readCounts);
    }

    @Ensures({"result != null"})
    public double[] getReadLikelihoods( final String sample ) {
        return readLikelihoodsPerSample.get(sample);
    }
-    
+
+    @Ensures({"result != null"})
+    public int[] getReadCounts( final String sample ) {
+        return readCountsPerSample.get(sample);
+    }
+
    public Set<String> getSampleKeySet() {
        return readLikelihoodsPerSample.keySet();
    }
--- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
@ -210,6 +210,13 @@ public class MathUtils {
        return total;
    }

+    public static int sum(byte[] x) {
+        int total = 0;
+        for (byte v : x)
+            total += (int)v;
+        return total;
+    }
+
    /**
     * Calculates the log10 cumulative sum of an array with log10 probabilities
     *
--- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java
@ -134,17 +134,36 @@ public class FragmentUtils {

        GATKSAMRecord firstRead = overlappingPair.get(0);
        GATKSAMRecord secondRead = overlappingPair.get(1);
-        if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) {
+   /*
+        System.out.println("read 0 unclipped start:"+overlappingPair.get(0).getUnclippedStart());
+        System.out.println("read 0 unclipped end:"+overlappingPair.get(0).getUnclippedEnd());
+        System.out.println("read 1 unclipped start:"+overlappingPair.get(1).getUnclippedStart());
+        System.out.println("read 1 unclipped end:"+overlappingPair.get(1).getUnclippedEnd());
+        System.out.println("read 0 start:"+overlappingPair.get(0).getAlignmentStart());
+        System.out.println("read 0 end:"+overlappingPair.get(0).getAlignmentEnd());
+        System.out.println("read 1 start:"+overlappingPair.get(1).getAlignmentStart());
+        System.out.println("read 1 end:"+overlappingPair.get(1).getAlignmentEnd());
+     */
+        if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) {
            firstRead = overlappingPair.get(1); // swap them
            secondRead = overlappingPair.get(0);
        }
-        if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) {
+        if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) {
            return overlappingPair; // can't merge them, yet:  AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A
        }
        if( firstRead.getCigarString().contains("I") || firstRead.getCigarString().contains("D") || secondRead.getCigarString().contains("I") || secondRead.getCigarString().contains("D") ) {
            return overlappingPair; // fragments contain indels so don't merge them
        }

+/*        // check for inconsistent start positions between uncliped/soft alignment starts
+        if (secondRead.getAlignmentStart() >= firstRead.getAlignmentStart() && secondRead.getUnclippedStart() < firstRead.getUnclippedStart())
+            return overlappingPair;
+        if (secondRead.getAlignmentStart() <= firstRead.getAlignmentStart() && secondRead.getUnclippedStart() > firstRead.getUnclippedStart())
+            return overlappingPair;
+
+        if (secondRead.getUnclippedStart() < firstRead.getAlignmentEnd() && secondRead.getAlignmentStart() >= firstRead.getAlignmentEnd())
+            return overlappingPair;
+  */
        final Pair<Integer, Boolean> pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart());

        final int firstReadStop = ( pair.getSecond() ? pair.getFirst() + 1 : pair.getFirst() );