From 9e25b209e0ef10cc63dd17f2abf9e61dfbf1b47e Mon Sep 17 00:00:00 2001
From: Guillermo del Angel <delangel@broadinstitute.org>
Date: Fri, 3 Aug 2012 12:24:23 -0400
Subject: [PATCH] First pass of implementation of Reduced Reads with
 HaplotypeCaller. Main changes: a) Active region: scale PL's by representative
 count to determine whether region is active. b) Scale per-read, per-haplotype
 likelihoods by read representative counts. A read representative count is
 (temporarily) defined as the average representative count over all bases in
 read, TBD whether this is good enough to avoid biases in GL's. c) DeBruijn
 assembler inserts kmers N times in graph, where N is min representative count
 of read over kmer span - TBD again whether this is the best approach. d) Bug
 fixes in FragmentUtils: logic to merge fragments was wrong in cases where
 there is discrepancy of overlaps between unclipped/soft clipped bases. Didn't
 affect things before but RR makes prevalence of hard-clipped bases in CIGARs
 more prevalent so this was exposed. e) Cache read representative counts along
 with read likelihoods associated with a Haplotype. Code can/should be cleaned
 up and unified with PairHMMIndelErrorModelCode, as well as refactored to
 support arbitrary ploidy in HaplotypeCaller

---
 .../haplotypecaller/HaplotypeCaller.java      |  6 +--
 .../LikelihoodCalculationEngine.java          | 24 ++++++++---
 .../SimpleDeBruijnAssembler.java              | 17 ++++++--
 .../SimpleDeBruijnAssemblerUnitTest.java      | 40 +++++++++++++++++++
 .../broadinstitute/sting/utils/Haplotype.java | 14 ++++++-
 .../broadinstitute/sting/utils/MathUtils.java |  7 ++++
 .../sting/utils/fragments/FragmentUtils.java  | 23 ++++++++++-
 7 files changed, 115 insertions(+), 16 deletions(-)
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index 1130feaea..8acab3e3c 100755
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -356,9 +356,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
                             }
                         }
                     }
-                    genotypeLikelihoods[AA] += QualityUtils.qualToProbLog10(qual);
-                    genotypeLikelihoods[AB] += MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF );
-                    genotypeLikelihoods[BB] += QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD;
+                    genotypeLikelihoods[AA] += p.getRepresentativeCount()* QualityUtils.qualToProbLog10(qual);
+                    genotypeLikelihoods[AB] += p.getRepresentativeCount()* MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF );
+                    genotypeLikelihoods[BB] += p.getRepresentativeCount()* QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD;
                 }
             }
             genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
index 365459882..a3179681e 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
@@ -50,7 +50,6 @@ public class LikelihoodCalculationEngine {
     }
 
     public void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList ) {
-        final int numHaplotypes = haplotypes.size();
 
         int X_METRIC_LENGTH = 0;
         for( final String sample : perSampleReadList.keySet() ) {
@@ -60,8 +59,8 @@ public class LikelihoodCalculationEngine {
             }
         }
         int Y_METRIC_LENGTH = 0;
-        for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
-            final int haplotypeLength = haplotypes.get(jjj).getBases().length;
+        for( Haplotype h: haplotypes ) {
+            final int haplotypeLength = h.getBases().length;
             if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; }
         }
 
@@ -90,8 +89,10 @@ public class LikelihoodCalculationEngine {
         final int numHaplotypes = haplotypes.size();
         final int numReads = reads.size();
         final double[][] readLikelihoods = new double[numHaplotypes][numReads];
+        final int[][] readCounts = new int[numHaplotypes][numReads];
         for( int iii = 0; iii < numReads; iii++ ) {
             final GATKSAMRecord read = reads.get(iii);
+            final int readCount =  getRepresentativeReadCount(read);
 
             final byte[] overallGCP = new byte[read.getReadLength()];
             Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
@@ -114,13 +115,23 @@ public class LikelihoodCalculationEngine {
                 readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(),
                         readQuals, readInsQuals, readDelQuals, overallGCP,
                         haplotypeStart, matchMetricArray, XMetricArray, YMetricArray);
+                readCounts[jjj][iii] = readCount;
             }
         }
         for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
-            haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj] );
+            haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj], readCounts[jjj] );
         }
     }
 
+    private static int getRepresentativeReadCount(GATKSAMRecord read) {
+        if (!read.isReducedRead())
+            return 1;
+
+        // compute mean representative read counts
+        final byte[] counts = read.getReducedReadCounts();
+        return MathUtils.sum(counts)/counts.length;
+    }
+
     private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) {
         for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ){
             if( b1[iii] != b2[iii] ) {
@@ -154,17 +165,20 @@ public class LikelihoodCalculationEngine {
         }
 
         // compute the diploid haplotype likelihoods
+        // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code
         for( int iii = 0; iii < numHaplotypes; iii++ ) {
             for( int jjj = 0; jjj <= iii; jjj++ ) {                
                 for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) {
                     final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample);
+                    final int[] readCounts_iii = iii_mapped.getReadCounts(sample);
                     for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) {
                         final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample);
                         double haplotypeLikelihood = 0.0;
                         for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
                             // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
+                            // log10(10^(a*x1) + 10^(b*x2))
                             // First term is approximated by Jacobian log with table lookup.
-                            haplotypeLikelihood += MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF;
+                            haplotypeLikelihood +=readCounts_iii[kkk] *( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF);
                         }
                         haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum?
                     }
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java
index e2bc7a10f..f3dd3babb 100755
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java
@@ -4,6 +4,7 @@ import com.google.java.contract.Ensures;
 import org.apache.commons.lang.ArrayUtils;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.Haplotype;
+import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.SWPairwiseAlignment;
 import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@@ -68,7 +69,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
         return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
     }
 
-    private void createDeBruijnGraphs( final ArrayList<GATKSAMRecord> reads, final Haplotype refHaplotype ) {
+    protected void createDeBruijnGraphs( final List<GATKSAMRecord> reads, final Haplotype refHaplotype ) {
         graphs.clear();
 
         // create the graph
@@ -161,7 +162,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
         }
     }
 
-    private static boolean createGraphFromSequences( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final ArrayList<GATKSAMRecord> reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
+    private static boolean createGraphFromSequences( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final Collection<GATKSAMRecord> reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
         final byte[] refSequence = refHaplotype.getBases();
         if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) {
             final int kmersInSequence = refSequence.length - KMER_LENGTH + 1;
@@ -183,6 +184,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
         for( final GATKSAMRecord read : reads ) {
             final byte[] sequence = read.getReadBases();
             final byte[] qualities = read.getBaseQualities();
+            final byte[] reducedReadCounts = read.getReducedReadCounts();  // will be null if read is not readuced
             if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) {
                 final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
                 for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {                    
@@ -194,6 +196,12 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
                             break;
                         }
                     }
+                    int countNumber = 1;
+                    if (read.isReducedRead()) {
+                        // compute min (?) number of reduced read counts in current kmer span
+                        countNumber = MathUtils.arrayMin(Arrays.copyOfRange(reducedReadCounts,iii,iii+KMER_LENGTH+1));
+                    }
+
                     if( !badKmer ) {
                         // get the kmers
                         final byte[] kmer1 = new byte[KMER_LENGTH];
@@ -201,7 +209,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
                         final byte[] kmer2 = new byte[KMER_LENGTH];
                         System.arraycopy(sequence, iii+1, kmer2, 0, KMER_LENGTH);
 
-                        addKmersToGraph(graph, kmer1, kmer2, false);
+                        for (int k=0; k < countNumber; k++)
+                            addKmersToGraph(graph, kmer1, kmer2, false);
                     }
                 }
             }
@@ -230,7 +239,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
         return true;
     }
 
-    private void printGraphs() {
+    protected void printGraphs() {
         int count = 0;
         for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
             GRAPH_WRITER.println("digraph kmer" + count++ +" {");
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java
index 4f42d5bc8..a83afdbab 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssemblerUnitTest.java
@@ -7,6 +7,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
  */
 
 import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.walkers.genotyper.ArtificialReadPileupTestProvider;
 import org.broadinstitute.sting.utils.Haplotype;
 import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.variantcontext.Allele;
@@ -18,6 +20,7 @@ import org.testng.annotations.Test;
 
 import java.io.File;
 import java.io.FileNotFoundException;
+import java.io.PrintStream;
 import java.util.*;
 
 public class SimpleDeBruijnAssemblerUnitTest extends BaseTest {
@@ -143,6 +146,43 @@ public class SimpleDeBruijnAssemblerUnitTest extends BaseTest {
         Assert.assertTrue(graphEquals(graph, expectedGraph));
     }
 
+    @Test(enabled=true)
+    public void testBasicGraphCreation() {
+        final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref");
+        final byte refBase = refPileupTestProvider.getReferenceContext().getBase();
+        final String altBase = (refBase==(byte)'A'?"C":"A");
+        final int matches = 50;
+        final int mismatches = 50;
+        Map<String,AlignmentContext> refContext = refPileupTestProvider.getAlignmentContextFromAlleles(0, altBase, new int[]{matches, mismatches}, false, 30);
+        PrintStream graphWriter = null;
+
+        try{
+            graphWriter = new PrintStream("du.txt");
+        } catch (Exception e) {}
+
+
+        SimpleDeBruijnAssembler assembler = new SimpleDeBruijnAssembler(true,graphWriter);
+        final Haplotype refHaplotype = new Haplotype(refPileupTestProvider.getReferenceContext().getBases());
+        refHaplotype.setIsReference(true);
+        assembler.createDeBruijnGraphs(refContext.get(refPileupTestProvider.getSampleNames().get(0)).getBasePileup().getReads(), refHaplotype);
+
+/*        // clean up the graphs by pruning and merging
+        for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
+            SimpleDeBruijnAssembler.pruneGraph( graph, PRUNE_FACTOR );
+            //eliminateNonRefPaths( graph );
+            SimpleDeBruijnAssembler.mergeNodes( graph );
+        }
+  */
+        if( graphWriter != null ) {
+            assembler.printGraphs();
+        }
+
+        int k=2;
+
+        // find the best paths in the graphs
+    //    return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
+
+    }
     @Test(enabled = true)
     public void testEliminateNonRefPaths() {
         DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java
index 54442622f..eca2e454e 100755
--- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java
+++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java
@@ -40,6 +40,7 @@ public class Haplotype {
     protected final double[] quals;
     private GenomeLoc genomeLocation = null;
     private HashMap<String, double[]> readLikelihoodsPerSample = null;
+    private HashMap<String, int[]> readCountsPerSample = null;
     private HashMap<Integer, VariantContext> eventMap = null;
     private boolean isRef = false;
     private Cigar cigar;
@@ -83,18 +84,27 @@ public class Haplotype {
         return Arrays.hashCode(bases);
     }
 
-    public void addReadLikelihoods( final String sample, final double[] readLikelihoods ) {
+    public void addReadLikelihoods( final String sample, final double[] readLikelihoods, final int[] readCounts ) {
         if( readLikelihoodsPerSample == null ) {
             readLikelihoodsPerSample = new HashMap<String, double[]>();
         }
         readLikelihoodsPerSample.put(sample, readLikelihoods);
+        if( readCountsPerSample == null ) {
+            readCountsPerSample = new HashMap<String, int[]>();
+        }
+        readCountsPerSample.put(sample, readCounts);
     }
 
     @Ensures({"result != null"})
     public double[] getReadLikelihoods( final String sample ) {
         return readLikelihoodsPerSample.get(sample);
     }
-    
+
+    @Ensures({"result != null"})
+    public int[] getReadCounts( final String sample ) {
+        return readCountsPerSample.get(sample);
+    }
+
     public Set<String> getSampleKeySet() {
         return readLikelihoodsPerSample.keySet();
     }
diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
index 1f5eaefee..96704f0b8 100644
--- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
@@ -210,6 +210,13 @@ public class MathUtils {
         return total;
     }
 
+    public static int sum(byte[] x) {
+        int total = 0;
+        for (byte v : x)
+            total += (int)v;
+        return total;
+    }
+
     /**
      * Calculates the log10 cumulative sum of an array with log10 probabilities
      *
diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java
index 851272673..2f31c154c 100644
--- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java
@@ -134,17 +134,36 @@ public class FragmentUtils {
 
         GATKSAMRecord firstRead = overlappingPair.get(0);
         GATKSAMRecord secondRead = overlappingPair.get(1);
-        if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) {
+   /*
+        System.out.println("read 0 unclipped start:"+overlappingPair.get(0).getUnclippedStart());
+        System.out.println("read 0 unclipped end:"+overlappingPair.get(0).getUnclippedEnd());
+        System.out.println("read 1 unclipped start:"+overlappingPair.get(1).getUnclippedStart());
+        System.out.println("read 1 unclipped end:"+overlappingPair.get(1).getUnclippedEnd());
+        System.out.println("read 0 start:"+overlappingPair.get(0).getAlignmentStart());
+        System.out.println("read 0 end:"+overlappingPair.get(0).getAlignmentEnd());
+        System.out.println("read 1 start:"+overlappingPair.get(1).getAlignmentStart());
+        System.out.println("read 1 end:"+overlappingPair.get(1).getAlignmentEnd());
+     */
+        if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) {
             firstRead = overlappingPair.get(1); // swap them
             secondRead = overlappingPair.get(0);
         }
-        if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) {
+        if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) {
             return overlappingPair; // can't merge them, yet:  AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A
         }
         if( firstRead.getCigarString().contains("I") || firstRead.getCigarString().contains("D") || secondRead.getCigarString().contains("I") || secondRead.getCigarString().contains("D") ) {
             return overlappingPair; // fragments contain indels so don't merge them
         }
 
+/*        // check for inconsistent start positions between uncliped/soft alignment starts
+        if (secondRead.getAlignmentStart() >= firstRead.getAlignmentStart() && secondRead.getUnclippedStart() < firstRead.getUnclippedStart())
+            return overlappingPair;
+        if (secondRead.getAlignmentStart() <= firstRead.getAlignmentStart() && secondRead.getUnclippedStart() > firstRead.getUnclippedStart())
+            return overlappingPair;
+
+        if (secondRead.getUnclippedStart() < firstRead.getAlignmentEnd() && secondRead.getAlignmentStart() >= firstRead.getAlignmentEnd())
+            return overlappingPair;
+  */
         final Pair<Integer, Boolean> pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart());
 
         final int firstReadStop = ( pair.getSecond() ? pair.getFirst() + 1 : pair.getFirst() );