HaplotypeCaller now has the ability to only consider the best N haplotypes for genotyping

-- Added a -dontGenotype mode for testing assembly efficiency -- However, it looks like this has a very negative impact on the quality of the results, so the code should be deleted
2013-03-18 17:06:32 -04:00 · 2013-03-18 17:06:32 -04:00 · ffea6dd95f
parent a8fb26bf01
commit ffea6dd95f
3 changed files with 101 additions and 27 deletions
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java
@ -52,6 +52,7 @@ import net.sf.samtools.Cigar;
 import net.sf.samtools.CigarElement;
 import net.sf.samtools.CigarOperator;
 import org.apache.commons.lang.ArrayUtils;
+import org.apache.log4j.Logger;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.Haplotype;
 import org.broadinstitute.sting.utils.MathUtils;
@ -73,6 +74,7 @@ import java.util.*;
 */

 public class DeBruijnAssembler extends LocalAssemblyEngine {
+    private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class);

    private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
    private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11;
@ -85,18 +87,20 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
    private static final double SW_GAP = -22.0;       //-1.0-1.0/3.0;
    private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0;

-    private final boolean DEBUG;
-    private final PrintStream GRAPH_WRITER;
+    private final boolean debug;
+    private final PrintStream graphWriter;
    private final List<DeBruijnAssemblyGraph> graphs = new ArrayList<DeBruijnAssemblyGraph>();
-    private final int MIN_KMER;
+    private final int minKmer;
+    private final int maxHaplotypesToConsider;

    private int PRUNE_FACTOR = 2;
    
-    public DeBruijnAssembler(final boolean debug, final PrintStream graphWriter, final int minKmer) {
+    public DeBruijnAssembler(final boolean debug, final PrintStream graphWriter, final int minKmer, final int maxHaplotypesToConsider) {
        super();
-        DEBUG = debug;
-        GRAPH_WRITER = graphWriter;
-        MIN_KMER = minKmer;
+        this.debug = debug;
+        this.graphWriter = graphWriter;
+        this.minKmer = minKmer;
+        this.maxHaplotypesToConsider = maxHaplotypesToConsider;
    }

    /**
@ -123,7 +127,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
        createDeBruijnGraphs( activeRegion.getReads(), refHaplotype );

        // print the graphs if the appropriate debug option has been turned on
-        if( GRAPH_WRITER != null ) {
+        if( graphWriter != null ) {
            printGraphs();
        }

@ -136,11 +140,12 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
        graphs.clear();

        final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1;
-        if( maxKmer < MIN_KMER ) { return; } // Reads are too small for assembly so don't try to create any assembly graphs
+        if( maxKmer < minKmer) { return; } // Reads are too small for assembly so don't try to create any assembly graphs

        // create the graph for each possible kmer
-        for( int kmer = maxKmer; kmer >= MIN_KMER; kmer -= GRAPH_KMER_STEP ) {
-            final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, DEBUG );
+        for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) {
+            //if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads");
+            final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug);
            if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object
                // do a series of steps to clean up the raw assembly graph to make it analysis-ready
                pruneGraph(graph, PRUNE_FACTOR);
@ -320,22 +325,22 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
    }

    protected void printGraphs() {
-        GRAPH_WRITER.println("digraph assemblyGraphs {");
+        graphWriter.println("digraph assemblyGraphs {");
        for( final DeBruijnAssemblyGraph graph : graphs ) {
            for( final DeBruijnEdge edge : graph.edgeSet() ) {
                if( edge.getMultiplicity() > PRUNE_FACTOR ) {
-                    GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\""+ edge.getMultiplicity() +"\"") + "];");
+                    graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\"" + edge.getMultiplicity() + "\"") + "];");
                }
                if( edge.isRef() ) {
-                    GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];");
+                    graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];");
                }
                if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); }
            }
            for( final DeBruijnVertex v : graph.vertexSet() ) {
-                GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]");
+                graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]");
            }
        }
-        GRAPH_WRITER.println("}");
+        graphWriter.println("}");
    }

    @Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"})
@ -343,6 +348,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
    private List<Haplotype> findBestPaths( final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) {

        // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
+        // TODO -- this use of an array with contains lower may be a performance problem returning in an O(N^2) algorithm
        final List<Haplotype> returnHaplotypes = new ArrayList<Haplotype>();
        refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart());
        final Cigar c = new Cigar();
@ -383,7 +389,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
                    }
                    if( !returnHaplotypes.contains(h) ) {
                        h.setAlignmentStartHapwrtRef(activeRegionStart);
-                        h.setCigar( leftAlignedCigar );
+                        h.setCigar(leftAlignedCigar);
+                        h.setScore(path.getScore());
                        returnHaplotypes.add(h);

                        // for GGA mode, add the desired allele into the haplotype if it isn't already present
@ -409,18 +416,39 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
            }
        }

-        if( DEBUG ) {
-            if( returnHaplotypes.size() > 1 ) {
-                System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes to evaluate every read against.");
+        final List<Haplotype> finalHaplotypes = selectHighestScoringHaplotypes(returnHaplotypes);
+        if ( finalHaplotypes.size() < returnHaplotypes.size() )
+            logger.info("Found " + finalHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc);
+
+        if( debug ) {
+            if( finalHaplotypes.size() > 1 ) {
+                System.out.println("Found " + finalHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against.");
            } else {
                System.out.println("Found only the reference haplotype in the assembly graph.");
            }
-            for( final Haplotype h : returnHaplotypes ) {
+            for( final Haplotype h : finalHaplotypes ) {
                System.out.println( h.toString() );
-                System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() );
+                System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() );
            }
        }
-        return returnHaplotypes;
+
+        return finalHaplotypes;
+    }
+
+    /**
+     * Select the best scoring haplotypes among all present, returning no more than maxHaplotypesToConsider
+     *
+     * @param haplotypes a list of haplotypes to consider
+     * @return a sublist of the best haplotypes, with size() <= maxHaplotypesToConsider
+     */
+    private List<Haplotype> selectHighestScoringHaplotypes(final List<Haplotype> haplotypes) {
+        if ( haplotypes.size() <= maxHaplotypesToConsider )
+            return haplotypes;
+        else {
+            final List<Haplotype> sorted = new ArrayList<Haplotype>(haplotypes);
+            Collections.sort(sorted, new Haplotype.ScoreComparator());
+            return sorted.subList(0, maxHaplotypesToConsider);
+        }
    }

    /**
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@ -55,6 +55,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
+import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils;
 import org.broadinstitute.sting.gatk.filters.BadMateFilter;
 import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
 import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
@ -205,6 +206,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
    @Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false)
    protected int minKmer = 11;

+    @Advanced
+    @Argument(fullName="maxHaplotypesToConsider", shortName="maxHaplotypesToConsider", doc="Maximum number of haplotypes to consider in the likelihood calculation.  Setting this number too high can have dramatic performance implications", required = false)
+    protected int maxHaplotypesToConsider = 100000;
+
    /**
     * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling
     * when these reads occur in the region being analyzed.  Typically, for paired end analyses, one pair of the
@ -227,6 +232,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
    @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code.  Useful for benchmarking and scalability testing", required=false)
    protected boolean justDetermineActiveRegions = false;

+    @Hidden
+    @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling.  Useful for benchmarking and scalability testing", required=false)
+    protected boolean dontGenotype = false;
+
    /**
     * rsIDs from this file are used to populate the ID column of the output.  Also, the DB INFO flag will be set when appropriate.
     * dbSNP is not used in any way for the calculations themselves.
@ -296,6 +305,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
    // reference base padding size
    private static final int REFERENCE_PADDING = 500;

+    private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
+    private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
+
    // bases with quality less than or equal to this value are trimmed off the tails of the reads
    private static final byte MIN_TAIL_QUALITY = 20;

@ -374,7 +386,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
            throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
        }

-        assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer );
+        assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer, maxHaplotypesToConsider );
        likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
        genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS );

@ -514,6 +526,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
        // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
        Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() );

+        if (dontGenotype)
+            return 1;
+
        // evaluate each sample's reads against all haplotypes
        final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, splitReadsBySample( activeRegion.getReads() ) );
        final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
@ -575,7 +590,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
    //
    //---------------------------------------------------------------------------------------------------------------

-    private void finalizeActiveRegion( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
+    private void finalizeActiveRegion( final ActiveRegion activeRegion ) {
        if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads:    (with overlap region = " + activeRegion.getExtendedLoc() + ")"); }
        final List<GATKSAMRecord> finalizedReadList = new ArrayList<GATKSAMRecord>();
        final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create( activeRegion.getReads() );
@ -599,7 +614,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
                }
            }
        }
-        activeRegion.addAll(ReadUtils.sortReadsByCoordinate(readsToUse));
+
+        activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart));
    }

    private List<GATKSAMRecord> filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
--- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java
+++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java
@ -41,12 +41,12 @@ import java.io.Serializable;
 import java.util.*;

 public class Haplotype extends Allele {
-
    private GenomeLoc genomeLocation = null;
    private Map<Integer, VariantContext> eventMap = null;
    private Cigar cigar;
    private int alignmentStartHapwrtRef;
    private Event artificialEvent = null;
+    private double score = 0;

    /**
     * Main constructor
@ -259,4 +259,34 @@ public class Haplotype extends Allele {
            this.pos = pos;
        }
    }
+
+    /**
+     * Get the score (an estimate of the support) of this haplotype
+     * @return a double, where higher values are better
+     */
+    public double getScore() {
+        return this.isReference() ? Double.MAX_VALUE : score;
+    }
+
+    /**
+     * Set the score (an estimate of the support) of this haplotype.
+     *
+     * Note that if this is the reference haplotype it is always given Double.MAX_VALUE score
+     *
+     * @param score a double, where higher values are better
+     */
+    public void setScore(double score) {
+        this.score = this.isReference() ? Double.MAX_VALUE : score;
+    }
+
+    /**
+     * A comparator that sorts haplotypes in decreasing order of score, so that the best supported
+     * haplotypes are at the top
+     */
+    public static class ScoreComparator implements Comparator<Haplotype> {
+        @Override
+        public int compare(Haplotype o1, Haplotype o2) {
+            return -1 * Double.valueOf(o1.getScore()).compareTo(o2.getScore());
+        }
+    }
 }