From b5d590ba92b03716265e062dfa64ebe6118cddf4 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 18 Dec 2012 15:43:56 -0500 Subject: [PATCH] Based on NA12878 knowledge base experiments updating HC to allow for a much smaller minimum kmer length in the assembly graph. --- .../walkers/haplotypecaller/GenotypingEngine.java | 7 +------ .../gatk/walkers/haplotypecaller/HaplotypeCaller.java | 8 ++++++-- .../haplotypecaller/SimpleDeBruijnAssembler.java | 11 +++++++---- .../sting/utils/sam/GATKSAMReadGroupRecord.java | 2 +- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index a786a860c..cc377d6c8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -343,12 +343,7 @@ public class GenotypingEngine { } // count up the co-occurrences of the events for the R^2 calculation for( final String sample : samples ) { - final HashSet sampleSet = new HashSet(1); - sampleSet.add(sample); - - final List alleleList = new ArrayList(); - alleleList.add(Allele.create(h.getBases())); - final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeReadMap, alleleList )[0][0]; + final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( Collections.singleton(sample), haplotypeReadMap, Collections.singletonList(Allele.create(h.getBases())) )[0][0]; if( thisHapVC == null ) { if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); } else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 9744c3dd8..0f42f2165 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -130,12 +130,16 @@ public class HaplotypeCaller extends ActiveRegionWalker implem protected String keepRG = null; @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) - protected int MIN_PRUNE_FACTOR = 1; + protected int MIN_PRUNE_FACTOR = 2; @Advanced @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) protected int gcpHMM = 10; + @Advanced + @Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false) + protected int minKmer = 11; + @Argument(fullName="downsampleRegion", shortName="dr", doc="coverage, per-sample, to downsample each active region to", required = false) protected int DOWNSAMPLE_PER_SAMPLE_PER_REGION = 1000; @@ -282,7 +286,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); } - assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); + assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter, minKmer ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index 3c5a1f79c..0cc9db2cb 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -28,7 +28,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11; - private static final byte MIN_QUALITY = (byte) 17; + private static final byte MIN_QUALITY = (byte) 16; // Smith-Waterman parameters originally copied from IndelRealigner private static final double SW_MATCH = 5.0; // 1.0; @@ -39,13 +39,15 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { private final boolean DEBUG; private final PrintStream GRAPH_WRITER; private final ArrayList> graphs = new ArrayList>(); + private final int MIN_KMER; - private int PRUNE_FACTOR = 1; + private int PRUNE_FACTOR = 2; - public SimpleDeBruijnAssembler( final boolean debug, final PrintStream graphWriter ) { + public SimpleDeBruijnAssembler( final boolean debug, final PrintStream graphWriter, final int minKmer ) { super(); DEBUG = debug; GRAPH_WRITER = graphWriter; + MIN_KMER = minKmer; } public ArrayList runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int PRUNE_FACTOR, final ArrayList activeAllelesToGenotype ) { @@ -72,8 +74,9 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { protected void createDeBruijnGraphs( final List reads, final Haplotype refHaplotype ) { graphs.clear(); + final int maxKmer = refHaplotype.getBases().length; // create the graph - for( int kmer = 31; kmer <= 75; kmer += 6 ) { + for( int kmer = MIN_KMER; kmer <= maxKmer; kmer += 6 ) { final DefaultDirectedGraph graph = new DefaultDirectedGraph(DeBruijnEdge.class); if( createGraphFromSequences( graph, reads, kmer, refHaplotype, DEBUG ) ) { graphs.add(graph); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java index 849a7ddee..bb99156fe 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java @@ -70,7 +70,7 @@ public class GATKSAMReadGroupRecord extends SAMReadGroupRecord { super.setPlatform(s); mPlatform = s; retrievedPlatform = true; - retrievedNGSPlatform = false; // recalculate the NGSPlatform + retrievedNGSPlatform = false; // recalculate the NGSPlatform } public NGSPlatform getNGSPlatform() {