From ffea6dd95f34de0c979273c0783d6da75bbe16f0 Mon Sep 17 00:00:00 2001
From: Mark DePristo <depristo@broadinstitute.org>
Date: Mon, 18 Mar 2013 17:06:32 -0400
Subject: [PATCH] HaplotypeCaller now has the ability to only consider the best
 N haplotypes for genotyping

-- Added a -dontGenotype mode for testing assembly efficiency
-- However, it looks like this has a very negative impact on the quality of the results, so the code should be deleted
---
 .../haplotypecaller/DeBruijnAssembler.java    | 74 +++++++++++++------
 .../haplotypecaller/HaplotypeCaller.java      | 22 +++++-
 .../broadinstitute/sting/utils/Haplotype.java | 32 +++++++-
 3 files changed, 101 insertions(+), 27 deletions(-)

diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java
index 566605a8c..bf08d1526 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java
@@ -52,6 +52,7 @@ import net.sf.samtools.Cigar;
 import net.sf.samtools.CigarElement;
 import net.sf.samtools.CigarOperator;
 import org.apache.commons.lang.ArrayUtils;
+import org.apache.log4j.Logger;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.Haplotype;
 import org.broadinstitute.sting.utils.MathUtils;
@@ -73,6 +74,7 @@ import java.util.*;
  */
 
 public class DeBruijnAssembler extends LocalAssemblyEngine {
+    private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class);
 
     private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
     private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11;
@@ -85,18 +87,20 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
     private static final double SW_GAP = -22.0;       //-1.0-1.0/3.0;
     private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0;
 
-    private final boolean DEBUG;
-    private final PrintStream GRAPH_WRITER;
+    private final boolean debug;
+    private final PrintStream graphWriter;
     private final List<DeBruijnAssemblyGraph> graphs = new ArrayList<DeBruijnAssemblyGraph>();
-    private final int MIN_KMER;
+    private final int minKmer;
+    private final int maxHaplotypesToConsider;
 
     private int PRUNE_FACTOR = 2;
     
-    public DeBruijnAssembler(final boolean debug, final PrintStream graphWriter, final int minKmer) {
+    public DeBruijnAssembler(final boolean debug, final PrintStream graphWriter, final int minKmer, final int maxHaplotypesToConsider) {
         super();
-        DEBUG = debug;
-        GRAPH_WRITER = graphWriter;
-        MIN_KMER = minKmer;
+        this.debug = debug;
+        this.graphWriter = graphWriter;
+        this.minKmer = minKmer;
+        this.maxHaplotypesToConsider = maxHaplotypesToConsider;
     }
 
     /**
@@ -123,7 +127,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
         createDeBruijnGraphs( activeRegion.getReads(), refHaplotype );
 
         // print the graphs if the appropriate debug option has been turned on
-        if( GRAPH_WRITER != null ) {
+        if( graphWriter != null ) {
             printGraphs();
         }
 
@@ -136,11 +140,12 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
         graphs.clear();
 
         final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1;
-        if( maxKmer < MIN_KMER ) { return; } // Reads are too small for assembly so don't try to create any assembly graphs
+        if( maxKmer < minKmer) { return; } // Reads are too small for assembly so don't try to create any assembly graphs
 
         // create the graph for each possible kmer
-        for( int kmer = maxKmer; kmer >= MIN_KMER; kmer -= GRAPH_KMER_STEP ) {
-            final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, DEBUG );
+        for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) {
+            //if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads");
+            final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug);
             if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object
                 // do a series of steps to clean up the raw assembly graph to make it analysis-ready
                 pruneGraph(graph, PRUNE_FACTOR);
@@ -320,22 +325,22 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
     }
 
     protected void printGraphs() {
-        GRAPH_WRITER.println("digraph assemblyGraphs {");
+        graphWriter.println("digraph assemblyGraphs {");
         for( final DeBruijnAssemblyGraph graph : graphs ) {
             for( final DeBruijnEdge edge : graph.edgeSet() ) {
                 if( edge.getMultiplicity() > PRUNE_FACTOR ) {
-                    GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\""+ edge.getMultiplicity() +"\"") + "];");
+                    graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\"" + edge.getMultiplicity() + "\"") + "];");
                 }
                 if( edge.isRef() ) {
-                    GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];");
+                    graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];");
                 }
                 if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); }
             }
             for( final DeBruijnVertex v : graph.vertexSet() ) {
-                GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]");
+                graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]");
             }
         }
-        GRAPH_WRITER.println("}");
+        graphWriter.println("}");
     }
 
     @Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"})
@@ -343,6 +348,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
     private List<Haplotype> findBestPaths( final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) {
 
         // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
+        // TODO -- this use of an array with contains lower may be a performance problem returning in an O(N^2) algorithm
         final List<Haplotype> returnHaplotypes = new ArrayList<Haplotype>();
         refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart());
         final Cigar c = new Cigar();
@@ -383,7 +389,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
                     }
                     if( !returnHaplotypes.contains(h) ) {
                         h.setAlignmentStartHapwrtRef(activeRegionStart);
-                        h.setCigar( leftAlignedCigar );
+                        h.setCigar(leftAlignedCigar);
+                        h.setScore(path.getScore());
                         returnHaplotypes.add(h);
 
                         // for GGA mode, add the desired allele into the haplotype if it isn't already present
@@ -409,18 +416,39 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
             }
         }
 
-        if( DEBUG ) {
-            if( returnHaplotypes.size() > 1 ) {
-                System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes to evaluate every read against.");
+        final List<Haplotype> finalHaplotypes = selectHighestScoringHaplotypes(returnHaplotypes);
+        if ( finalHaplotypes.size() < returnHaplotypes.size() )
+            logger.info("Found " + finalHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc);
+
+        if( debug ) {
+            if( finalHaplotypes.size() > 1 ) {
+                System.out.println("Found " + finalHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against.");
             } else {
                 System.out.println("Found only the reference haplotype in the assembly graph.");
             }
-            for( final Haplotype h : returnHaplotypes ) {
+            for( final Haplotype h : finalHaplotypes ) {
                 System.out.println( h.toString() );
-                System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() );
+                System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() );
             }
         }
-        return returnHaplotypes;
+
+        return finalHaplotypes;
+    }
+
+    /**
+     * Select the best scoring haplotypes among all present, returning no more than maxHaplotypesToConsider
+     *
+     * @param haplotypes a list of haplotypes to consider
+     * @return a sublist of the best haplotypes, with size() <= maxHaplotypesToConsider
+     */
+    private List<Haplotype> selectHighestScoringHaplotypes(final List<Haplotype> haplotypes) {
+        if ( haplotypes.size() <= maxHaplotypesToConsider )
+            return haplotypes;
+        else {
+            final List<Haplotype> sorted = new ArrayList<Haplotype>(haplotypes);
+            Collections.sort(sorted, new Haplotype.ScoreComparator());
+            return sorted.subList(0, maxHaplotypesToConsider);
+        }
     }
 
     /**
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index 4fc075807..cff631802 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -55,6 +55,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
+import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils;
 import org.broadinstitute.sting.gatk.filters.BadMateFilter;
 import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
 import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
@@ -205,6 +206,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
     @Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false)
     protected int minKmer = 11;
 
+    @Advanced
+    @Argument(fullName="maxHaplotypesToConsider", shortName="maxHaplotypesToConsider", doc="Maximum number of haplotypes to consider in the likelihood calculation.  Setting this number too high can have dramatic performance implications", required = false)
+    protected int maxHaplotypesToConsider = 100000;
+
     /**
      * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling
      * when these reads occur in the region being analyzed.  Typically, for paired end analyses, one pair of the
@@ -227,6 +232,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
     @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code.  Useful for benchmarking and scalability testing", required=false)
     protected boolean justDetermineActiveRegions = false;
 
+    @Hidden
+    @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling.  Useful for benchmarking and scalability testing", required=false)
+    protected boolean dontGenotype = false;
+
     /**
      * rsIDs from this file are used to populate the ID column of the output.  Also, the DB INFO flag will be set when appropriate.
      * dbSNP is not used in any way for the calculations themselves.
@@ -296,6 +305,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
     // reference base padding size
     private static final int REFERENCE_PADDING = 500;
 
+    private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
+    private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
+
     // bases with quality less than or equal to this value are trimmed off the tails of the reads
     private static final byte MIN_TAIL_QUALITY = 20;
 
@@ -374,7 +386,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
             throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
         }
 
-        assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer );
+        assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer, maxHaplotypesToConsider );
         likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
         genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS );
 
@@ -514,6 +526,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
         // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
         Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() );
 
+        if (dontGenotype)
+            return 1;
+
         // evaluate each sample's reads against all haplotypes
         final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, splitReadsBySample( activeRegion.getReads() ) );
         final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
@@ -575,7 +590,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
     //
     //---------------------------------------------------------------------------------------------------------------
 
-    private void finalizeActiveRegion( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
+    private void finalizeActiveRegion( final ActiveRegion activeRegion ) {
         if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads:    (with overlap region = " + activeRegion.getExtendedLoc() + ")"); }
         final List<GATKSAMRecord> finalizedReadList = new ArrayList<GATKSAMRecord>();
         final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create( activeRegion.getReads() );
@@ -599,7 +614,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
                 }
             }
         }
-        activeRegion.addAll(ReadUtils.sortReadsByCoordinate(readsToUse));
+
+        activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart));
     }
 
     private List<GATKSAMRecord> filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java
index 415cb73ac..070ae4f5d 100644
--- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java
+++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java
@@ -41,12 +41,12 @@ import java.io.Serializable;
 import java.util.*;
 
 public class Haplotype extends Allele {
-
     private GenomeLoc genomeLocation = null;
     private Map<Integer, VariantContext> eventMap = null;
     private Cigar cigar;
     private int alignmentStartHapwrtRef;
     private Event artificialEvent = null;
+    private double score = 0;
 
     /**
      * Main constructor
@@ -259,4 +259,34 @@ public class Haplotype extends Allele {
             this.pos = pos;
         }
     }
+
+    /**
+     * Get the score (an estimate of the support) of this haplotype
+     * @return a double, where higher values are better
+     */
+    public double getScore() {
+        return this.isReference() ? Double.MAX_VALUE : score;
+    }
+
+    /**
+     * Set the score (an estimate of the support) of this haplotype.
+     *
+     * Note that if this is the reference haplotype it is always given Double.MAX_VALUE score
+     *
+     * @param score a double, where higher values are better
+     */
+    public void setScore(double score) {
+        this.score = this.isReference() ? Double.MAX_VALUE : score;
+    }
+
+    /**
+     * A comparator that sorts haplotypes in decreasing order of score, so that the best supported
+     * haplotypes are at the top
+     */
+    public static class ScoreComparator implements Comparator<Haplotype> {
+        @Override
+        public int compare(Haplotype o1, Haplotype o2) {
+            return -1 * Double.valueOf(o1.getScore()).compareTo(o2.getScore());
+        }
+    }
 }