HaplotypeCaller now has the ability to only consider the best N haplotypes for genotyping
-- Added a -dontGenotype mode for testing assembly efficiency -- However, it looks like this has a very negative impact on the quality of the results, so the code should be deleted
This commit is contained in:
parent
a8fb26bf01
commit
ffea6dd95f
|
|
@ -52,6 +52,7 @@ import net.sf.samtools.Cigar;
|
|||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
|
@ -73,6 +74,7 @@ import java.util.*;
|
|||
*/
|
||||
|
||||
public class DeBruijnAssembler extends LocalAssemblyEngine {
|
||||
private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class);
|
||||
|
||||
private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
|
||||
private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11;
|
||||
|
|
@ -85,18 +87,20 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
private static final double SW_GAP = -22.0; //-1.0-1.0/3.0;
|
||||
private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0;
|
||||
|
||||
private final boolean DEBUG;
|
||||
private final PrintStream GRAPH_WRITER;
|
||||
private final boolean debug;
|
||||
private final PrintStream graphWriter;
|
||||
private final List<DeBruijnAssemblyGraph> graphs = new ArrayList<DeBruijnAssemblyGraph>();
|
||||
private final int MIN_KMER;
|
||||
private final int minKmer;
|
||||
private final int maxHaplotypesToConsider;
|
||||
|
||||
private int PRUNE_FACTOR = 2;
|
||||
|
||||
public DeBruijnAssembler(final boolean debug, final PrintStream graphWriter, final int minKmer) {
|
||||
public DeBruijnAssembler(final boolean debug, final PrintStream graphWriter, final int minKmer, final int maxHaplotypesToConsider) {
|
||||
super();
|
||||
DEBUG = debug;
|
||||
GRAPH_WRITER = graphWriter;
|
||||
MIN_KMER = minKmer;
|
||||
this.debug = debug;
|
||||
this.graphWriter = graphWriter;
|
||||
this.minKmer = minKmer;
|
||||
this.maxHaplotypesToConsider = maxHaplotypesToConsider;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -123,7 +127,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
createDeBruijnGraphs( activeRegion.getReads(), refHaplotype );
|
||||
|
||||
// print the graphs if the appropriate debug option has been turned on
|
||||
if( GRAPH_WRITER != null ) {
|
||||
if( graphWriter != null ) {
|
||||
printGraphs();
|
||||
}
|
||||
|
||||
|
|
@ -136,11 +140,12 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
graphs.clear();
|
||||
|
||||
final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1;
|
||||
if( maxKmer < MIN_KMER ) { return; } // Reads are too small for assembly so don't try to create any assembly graphs
|
||||
if( maxKmer < minKmer) { return; } // Reads are too small for assembly so don't try to create any assembly graphs
|
||||
|
||||
// create the graph for each possible kmer
|
||||
for( int kmer = maxKmer; kmer >= MIN_KMER; kmer -= GRAPH_KMER_STEP ) {
|
||||
final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, DEBUG );
|
||||
for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) {
|
||||
//if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads");
|
||||
final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug);
|
||||
if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object
|
||||
// do a series of steps to clean up the raw assembly graph to make it analysis-ready
|
||||
pruneGraph(graph, PRUNE_FACTOR);
|
||||
|
|
@ -320,22 +325,22 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
|
||||
protected void printGraphs() {
|
||||
GRAPH_WRITER.println("digraph assemblyGraphs {");
|
||||
graphWriter.println("digraph assemblyGraphs {");
|
||||
for( final DeBruijnAssemblyGraph graph : graphs ) {
|
||||
for( final DeBruijnEdge edge : graph.edgeSet() ) {
|
||||
if( edge.getMultiplicity() > PRUNE_FACTOR ) {
|
||||
GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\""+ edge.getMultiplicity() +"\"") + "];");
|
||||
graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\"" + edge.getMultiplicity() + "\"") + "];");
|
||||
}
|
||||
if( edge.isRef() ) {
|
||||
GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];");
|
||||
graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];");
|
||||
}
|
||||
if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); }
|
||||
}
|
||||
for( final DeBruijnVertex v : graph.vertexSet() ) {
|
||||
GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]");
|
||||
graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]");
|
||||
}
|
||||
}
|
||||
GRAPH_WRITER.println("}");
|
||||
graphWriter.println("}");
|
||||
}
|
||||
|
||||
@Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"})
|
||||
|
|
@ -343,6 +348,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
private List<Haplotype> findBestPaths( final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) {
|
||||
|
||||
// add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
|
||||
// TODO -- this use of an array with contains lower may be a performance problem returning in an O(N^2) algorithm
|
||||
final List<Haplotype> returnHaplotypes = new ArrayList<Haplotype>();
|
||||
refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart());
|
||||
final Cigar c = new Cigar();
|
||||
|
|
@ -383,7 +389,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
if( !returnHaplotypes.contains(h) ) {
|
||||
h.setAlignmentStartHapwrtRef(activeRegionStart);
|
||||
h.setCigar( leftAlignedCigar );
|
||||
h.setCigar(leftAlignedCigar);
|
||||
h.setScore(path.getScore());
|
||||
returnHaplotypes.add(h);
|
||||
|
||||
// for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
|
|
@ -409,18 +416,39 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
}
|
||||
|
||||
if( DEBUG ) {
|
||||
if( returnHaplotypes.size() > 1 ) {
|
||||
System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes to evaluate every read against.");
|
||||
final List<Haplotype> finalHaplotypes = selectHighestScoringHaplotypes(returnHaplotypes);
|
||||
if ( finalHaplotypes.size() < returnHaplotypes.size() )
|
||||
logger.info("Found " + finalHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc);
|
||||
|
||||
if( debug ) {
|
||||
if( finalHaplotypes.size() > 1 ) {
|
||||
System.out.println("Found " + finalHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against.");
|
||||
} else {
|
||||
System.out.println("Found only the reference haplotype in the assembly graph.");
|
||||
}
|
||||
for( final Haplotype h : returnHaplotypes ) {
|
||||
for( final Haplotype h : finalHaplotypes ) {
|
||||
System.out.println( h.toString() );
|
||||
System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() );
|
||||
System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() );
|
||||
}
|
||||
}
|
||||
return returnHaplotypes;
|
||||
|
||||
return finalHaplotypes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Select the best scoring haplotypes among all present, returning no more than maxHaplotypesToConsider
|
||||
*
|
||||
* @param haplotypes a list of haplotypes to consider
|
||||
* @return a sublist of the best haplotypes, with size() <= maxHaplotypesToConsider
|
||||
*/
|
||||
private List<Haplotype> selectHighestScoringHaplotypes(final List<Haplotype> haplotypes) {
|
||||
if ( haplotypes.size() <= maxHaplotypesToConsider )
|
||||
return haplotypes;
|
||||
else {
|
||||
final List<Haplotype> sorted = new ArrayList<Haplotype>(haplotypes);
|
||||
Collections.sort(sorted, new Haplotype.ScoreComparator());
|
||||
return sorted.subList(0, maxHaplotypesToConsider);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -55,6 +55,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils;
|
||||
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
|
|
@ -205,6 +206,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false)
|
||||
protected int minKmer = 11;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="maxHaplotypesToConsider", shortName="maxHaplotypesToConsider", doc="Maximum number of haplotypes to consider in the likelihood calculation. Setting this number too high can have dramatic performance implications", required = false)
|
||||
protected int maxHaplotypesToConsider = 100000;
|
||||
|
||||
/**
|
||||
* If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling
|
||||
* when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the
|
||||
|
|
@ -227,6 +232,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false)
|
||||
protected boolean justDetermineActiveRegions = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false)
|
||||
protected boolean dontGenotype = false;
|
||||
|
||||
/**
|
||||
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
|
||||
* dbSNP is not used in any way for the calculations themselves.
|
||||
|
|
@ -296,6 +305,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
// reference base padding size
|
||||
private static final int REFERENCE_PADDING = 500;
|
||||
|
||||
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
|
||||
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
|
||||
|
||||
// bases with quality less than or equal to this value are trimmed off the tails of the reads
|
||||
private static final byte MIN_TAIL_QUALITY = 20;
|
||||
|
||||
|
|
@ -374,7 +386,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
|
||||
}
|
||||
|
||||
assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer );
|
||||
assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer, maxHaplotypesToConsider );
|
||||
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
|
||||
genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS );
|
||||
|
||||
|
|
@ -514,6 +526,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
|
||||
Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() );
|
||||
|
||||
if (dontGenotype)
|
||||
return 1;
|
||||
|
||||
// evaluate each sample's reads against all haplotypes
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, splitReadsBySample( activeRegion.getReads() ) );
|
||||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
|
||||
|
|
@ -575,7 +590,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
private void finalizeActiveRegion( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
|
||||
private void finalizeActiveRegion( final ActiveRegion activeRegion ) {
|
||||
if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); }
|
||||
final List<GATKSAMRecord> finalizedReadList = new ArrayList<GATKSAMRecord>();
|
||||
final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create( activeRegion.getReads() );
|
||||
|
|
@ -599,7 +614,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
}
|
||||
}
|
||||
}
|
||||
activeRegion.addAll(ReadUtils.sortReadsByCoordinate(readsToUse));
|
||||
|
||||
activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart));
|
||||
}
|
||||
|
||||
private List<GATKSAMRecord> filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
|
||||
|
|
|
|||
|
|
@ -41,12 +41,12 @@ import java.io.Serializable;
|
|||
import java.util.*;
|
||||
|
||||
public class Haplotype extends Allele {
|
||||
|
||||
private GenomeLoc genomeLocation = null;
|
||||
private Map<Integer, VariantContext> eventMap = null;
|
||||
private Cigar cigar;
|
||||
private int alignmentStartHapwrtRef;
|
||||
private Event artificialEvent = null;
|
||||
private double score = 0;
|
||||
|
||||
/**
|
||||
* Main constructor
|
||||
|
|
@ -259,4 +259,34 @@ public class Haplotype extends Allele {
|
|||
this.pos = pos;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the score (an estimate of the support) of this haplotype
|
||||
* @return a double, where higher values are better
|
||||
*/
|
||||
public double getScore() {
|
||||
return this.isReference() ? Double.MAX_VALUE : score;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the score (an estimate of the support) of this haplotype.
|
||||
*
|
||||
* Note that if this is the reference haplotype it is always given Double.MAX_VALUE score
|
||||
*
|
||||
* @param score a double, where higher values are better
|
||||
*/
|
||||
public void setScore(double score) {
|
||||
this.score = this.isReference() ? Double.MAX_VALUE : score;
|
||||
}
|
||||
|
||||
/**
|
||||
* A comparator that sorts haplotypes in decreasing order of score, so that the best supported
|
||||
* haplotypes are at the top
|
||||
*/
|
||||
public static class ScoreComparator implements Comparator<Haplotype> {
|
||||
@Override
|
||||
public int compare(Haplotype o1, Haplotype o2) {
|
||||
return -1 * Double.valueOf(o1.getScore()).compareTo(o2.getScore());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue