From ffea6dd95f34de0c979273c0783d6da75bbe16f0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 18 Mar 2013 17:06:32 -0400 Subject: [PATCH] HaplotypeCaller now has the ability to only consider the best N haplotypes for genotyping -- Added a -dontGenotype mode for testing assembly efficiency -- However, it looks like this has a very negative impact on the quality of the results, so the code should be deleted --- .../haplotypecaller/DeBruijnAssembler.java | 74 +++++++++++++------ .../haplotypecaller/HaplotypeCaller.java | 22 +++++- .../broadinstitute/sting/utils/Haplotype.java | 32 +++++++- 3 files changed, 101 insertions(+), 27 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 566605a8c..bf08d1526 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -52,6 +52,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; @@ -73,6 +74,7 @@ import java.util.*; */ public class DeBruijnAssembler extends LocalAssemblyEngine { + private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class); private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11; @@ -85,18 +87,20 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private static final double SW_GAP = -22.0; //-1.0-1.0/3.0; private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0; - private final boolean DEBUG; - private final PrintStream GRAPH_WRITER; + private final boolean debug; + private final PrintStream graphWriter; private final List graphs = new ArrayList(); - private final int MIN_KMER; + private final int minKmer; + private final int maxHaplotypesToConsider; private int PRUNE_FACTOR = 2; - public DeBruijnAssembler(final boolean debug, final PrintStream graphWriter, final int minKmer) { + public DeBruijnAssembler(final boolean debug, final PrintStream graphWriter, final int minKmer, final int maxHaplotypesToConsider) { super(); - DEBUG = debug; - GRAPH_WRITER = graphWriter; - MIN_KMER = minKmer; + this.debug = debug; + this.graphWriter = graphWriter; + this.minKmer = minKmer; + this.maxHaplotypesToConsider = maxHaplotypesToConsider; } /** @@ -123,7 +127,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { createDeBruijnGraphs( activeRegion.getReads(), refHaplotype ); // print the graphs if the appropriate debug option has been turned on - if( GRAPH_WRITER != null ) { + if( graphWriter != null ) { printGraphs(); } @@ -136,11 +140,12 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { graphs.clear(); final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1; - if( maxKmer < MIN_KMER ) { return; } // Reads are too small for assembly so don't try to create any assembly graphs + if( maxKmer < minKmer) { return; } // Reads are too small for assembly so don't try to create any assembly graphs // create the graph for each possible kmer - for( int kmer = maxKmer; kmer >= MIN_KMER; kmer -= GRAPH_KMER_STEP ) { - final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, DEBUG ); + for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) { + //if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads"); + final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object // do a series of steps to clean up the raw assembly graph to make it analysis-ready pruneGraph(graph, PRUNE_FACTOR); @@ -320,22 +325,22 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } protected void printGraphs() { - GRAPH_WRITER.println("digraph assemblyGraphs {"); + graphWriter.println("digraph assemblyGraphs {"); for( final DeBruijnAssemblyGraph graph : graphs ) { for( final DeBruijnEdge edge : graph.edgeSet() ) { if( edge.getMultiplicity() > PRUNE_FACTOR ) { - GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\""+ edge.getMultiplicity() +"\"") + "];"); + graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\"" + edge.getMultiplicity() + "\"") + "];"); } if( edge.isRef() ) { - GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];"); + graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];"); } if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); } } for( final DeBruijnVertex v : graph.vertexSet() ) { - GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]"); + graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]"); } } - GRAPH_WRITER.println("}"); + graphWriter.println("}"); } @Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"}) @@ -343,6 +348,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private List findBestPaths( final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) { // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes + // TODO -- this use of an array with contains lower may be a performance problem returning in an O(N^2) algorithm final List returnHaplotypes = new ArrayList(); refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart()); final Cigar c = new Cigar(); @@ -383,7 +389,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } if( !returnHaplotypes.contains(h) ) { h.setAlignmentStartHapwrtRef(activeRegionStart); - h.setCigar( leftAlignedCigar ); + h.setCigar(leftAlignedCigar); + h.setScore(path.getScore()); returnHaplotypes.add(h); // for GGA mode, add the desired allele into the haplotype if it isn't already present @@ -409,18 +416,39 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } - if( DEBUG ) { - if( returnHaplotypes.size() > 1 ) { - System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes to evaluate every read against."); + final List finalHaplotypes = selectHighestScoringHaplotypes(returnHaplotypes); + if ( finalHaplotypes.size() < returnHaplotypes.size() ) + logger.info("Found " + finalHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc); + + if( debug ) { + if( finalHaplotypes.size() > 1 ) { + System.out.println("Found " + finalHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); } else { System.out.println("Found only the reference haplotype in the assembly graph."); } - for( final Haplotype h : returnHaplotypes ) { + for( final Haplotype h : finalHaplotypes ) { System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() ); + System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() ); } } - return returnHaplotypes; + + return finalHaplotypes; + } + + /** + * Select the best scoring haplotypes among all present, returning no more than maxHaplotypesToConsider + * + * @param haplotypes a list of haplotypes to consider + * @return a sublist of the best haplotypes, with size() <= maxHaplotypesToConsider + */ + private List selectHighestScoringHaplotypes(final List haplotypes) { + if ( haplotypes.size() <= maxHaplotypesToConsider ) + return haplotypes; + else { + final List sorted = new ArrayList(haplotypes); + Collections.sort(sorted, new Haplotype.ScoreComparator()); + return sorted.subList(0, maxHaplotypesToConsider); + } } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 4fc075807..cff631802 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -55,6 +55,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils; import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; @@ -205,6 +206,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false) protected int minKmer = 11; + @Advanced + @Argument(fullName="maxHaplotypesToConsider", shortName="maxHaplotypesToConsider", doc="Maximum number of haplotypes to consider in the likelihood calculation. Setting this number too high can have dramatic performance implications", required = false) + protected int maxHaplotypesToConsider = 100000; + /** * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the @@ -227,6 +232,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false) protected boolean justDetermineActiveRegions = false; + @Hidden + @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false) + protected boolean dontGenotype = false; + /** * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. * dbSNP is not used in any way for the calculations themselves. @@ -296,6 +305,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // reference base padding size private static final int REFERENCE_PADDING = 500; + private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument + private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument + // bases with quality less than or equal to this value are trimmed off the tails of the reads private static final byte MIN_TAIL_QUALITY = 20; @@ -374,7 +386,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); } - assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer ); + assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer, maxHaplotypesToConsider ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); @@ -514,6 +526,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() ); + if (dontGenotype) + return 1; + // evaluate each sample's reads against all haplotypes final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, splitReadsBySample( activeRegion.getReads() ) ); final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); @@ -575,7 +590,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // //--------------------------------------------------------------------------------------------------------------- - private void finalizeActiveRegion( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { + private void finalizeActiveRegion( final ActiveRegion activeRegion ) { if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } final List finalizedReadList = new ArrayList(); final FragmentCollection fragmentCollection = FragmentUtils.create( activeRegion.getReads() ); @@ -599,7 +614,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } } } - activeRegion.addAll(ReadUtils.sortReadsByCoordinate(readsToUse)); + + activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart)); } private List filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 415cb73ac..070ae4f5d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -41,12 +41,12 @@ import java.io.Serializable; import java.util.*; public class Haplotype extends Allele { - private GenomeLoc genomeLocation = null; private Map eventMap = null; private Cigar cigar; private int alignmentStartHapwrtRef; private Event artificialEvent = null; + private double score = 0; /** * Main constructor @@ -259,4 +259,34 @@ public class Haplotype extends Allele { this.pos = pos; } } + + /** + * Get the score (an estimate of the support) of this haplotype + * @return a double, where higher values are better + */ + public double getScore() { + return this.isReference() ? Double.MAX_VALUE : score; + } + + /** + * Set the score (an estimate of the support) of this haplotype. + * + * Note that if this is the reference haplotype it is always given Double.MAX_VALUE score + * + * @param score a double, where higher values are better + */ + public void setScore(double score) { + this.score = this.isReference() ? Double.MAX_VALUE : score; + } + + /** + * A comparator that sorts haplotypes in decreasing order of score, so that the best supported + * haplotypes are at the top + */ + public static class ScoreComparator implements Comparator { + @Override + public int compare(Haplotype o1, Haplotype o2) { + return -1 * Double.valueOf(o1.getScore()).compareTo(o2.getScore()); + } + } }