Based on NA12878 knowledge base experiments updating HC to allow for a much smaller minimum kmer length in the assembly graph.

This commit is contained in:
Ryan Poplin 2012-12-18 15:43:56 -05:00
parent 6171419e6c
commit b5d590ba92
4 changed files with 15 additions and 13 deletions

View File

@ -343,12 +343,7 @@ public class GenotypingEngine {
} }
// count up the co-occurrences of the events for the R^2 calculation // count up the co-occurrences of the events for the R^2 calculation
for( final String sample : samples ) { for( final String sample : samples ) {
final HashSet<String> sampleSet = new HashSet<String>(1); final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( Collections.singleton(sample), haplotypeReadMap, Collections.singletonList(Allele.create(h.getBases())) )[0][0];
sampleSet.add(sample);
final List<Allele> alleleList = new ArrayList<Allele>();
alleleList.add(Allele.create(h.getBases()));
final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeReadMap, alleleList )[0][0];
if( thisHapVC == null ) { if( thisHapVC == null ) {
if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); } if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); }
else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); } else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); }

View File

@ -130,12 +130,16 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
protected String keepRG = null; protected String keepRG = null;
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
protected int MIN_PRUNE_FACTOR = 1; protected int MIN_PRUNE_FACTOR = 2;
@Advanced @Advanced
@Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false)
protected int gcpHMM = 10; protected int gcpHMM = 10;
@Advanced
@Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false)
protected int minKmer = 11;
@Argument(fullName="downsampleRegion", shortName="dr", doc="coverage, per-sample, to downsample each active region to", required = false) @Argument(fullName="downsampleRegion", shortName="dr", doc="coverage, per-sample, to downsample each active region to", required = false)
protected int DOWNSAMPLE_PER_SAMPLE_PER_REGION = 1000; protected int DOWNSAMPLE_PER_SAMPLE_PER_REGION = 1000;
@ -282,7 +286,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
} }
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter, minKmer );
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS );
} }

View File

@ -28,7 +28,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11; private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11;
private static final byte MIN_QUALITY = (byte) 17; private static final byte MIN_QUALITY = (byte) 16;
// Smith-Waterman parameters originally copied from IndelRealigner // Smith-Waterman parameters originally copied from IndelRealigner
private static final double SW_MATCH = 5.0; // 1.0; private static final double SW_MATCH = 5.0; // 1.0;
@ -39,13 +39,15 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
private final boolean DEBUG; private final boolean DEBUG;
private final PrintStream GRAPH_WRITER; private final PrintStream GRAPH_WRITER;
private final ArrayList<DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>> graphs = new ArrayList<DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>>(); private final ArrayList<DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>> graphs = new ArrayList<DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>>();
private final int MIN_KMER;
private int PRUNE_FACTOR = 1; private int PRUNE_FACTOR = 2;
public SimpleDeBruijnAssembler( final boolean debug, final PrintStream graphWriter ) { public SimpleDeBruijnAssembler( final boolean debug, final PrintStream graphWriter, final int minKmer ) {
super(); super();
DEBUG = debug; DEBUG = debug;
GRAPH_WRITER = graphWriter; GRAPH_WRITER = graphWriter;
MIN_KMER = minKmer;
} }
public ArrayList<Haplotype> runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int PRUNE_FACTOR, final ArrayList<VariantContext> activeAllelesToGenotype ) { public ArrayList<Haplotype> runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int PRUNE_FACTOR, final ArrayList<VariantContext> activeAllelesToGenotype ) {
@ -72,8 +74,9 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
protected void createDeBruijnGraphs( final List<GATKSAMRecord> reads, final Haplotype refHaplotype ) { protected void createDeBruijnGraphs( final List<GATKSAMRecord> reads, final Haplotype refHaplotype ) {
graphs.clear(); graphs.clear();
final int maxKmer = refHaplotype.getBases().length;
// create the graph // create the graph
for( int kmer = 31; kmer <= 75; kmer += 6 ) { for( int kmer = MIN_KMER; kmer <= maxKmer; kmer += 6 ) {
final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class); final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
if( createGraphFromSequences( graph, reads, kmer, refHaplotype, DEBUG ) ) { if( createGraphFromSequences( graph, reads, kmer, refHaplotype, DEBUG ) ) {
graphs.add(graph); graphs.add(graph);