From 238d55cb61935085ddc7462b385c30b8a603689c Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Mon, 6 Aug 2012 20:22:12 -0400 Subject: [PATCH] Fixes for running HaplotypeCaller with reduced reads: a) minor refactoring, pulled out code to compute mean representative count to ReadUtils, b) Don't use min representative count over kmer when constructing de Bruijn graph - this creates many paths with multiplicity=1 and makes us lose a lot of SNP's at edge of capture targets. Use mean instead --- .../haplotypecaller/LikelihoodCalculationEngine.java | 12 ++---------- .../haplotypecaller/SimpleDeBruijnAssembler.java | 6 ++++-- .../broadinstitute/sting/utils/sam/ReadUtils.java | 9 +++++++++ 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 939b3c375..dfe6dcc3a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -30,6 +30,7 @@ import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -92,7 +93,7 @@ public class LikelihoodCalculationEngine { final int[][] readCounts = new int[numHaplotypes][numReads]; for( int iii = 0; iii < numReads; iii++ ) { final GATKSAMRecord read = reads.get(iii); - final int readCount = getRepresentativeReadCount(read); + final int readCount = ReadUtils.getMeanRepresentativeReadCount(read); final byte[] overallGCP = new byte[read.getReadLength()]; Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? @@ -123,15 +124,6 @@ public class LikelihoodCalculationEngine { } } - private static int getRepresentativeReadCount(GATKSAMRecord read) { - if (!read.isReducedRead()) - return 1; - - // compute mean representative read counts - final byte[] counts = read.getReducedReadCounts(); - return MathUtils.sum(counts)/counts.length; - } - private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) { for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ){ if( b1[iii] != b2[iii] ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index f3dd3babb..7319eb54d 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -198,8 +198,10 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { } int countNumber = 1; if (read.isReducedRead()) { - // compute min (?) number of reduced read counts in current kmer span - countNumber = MathUtils.arrayMin(Arrays.copyOfRange(reducedReadCounts,iii,iii+KMER_LENGTH+1)); + // compute mean number of reduced read counts in current kmer span + final byte[] counts = Arrays.copyOfRange(reducedReadCounts,iii,iii+KMER_LENGTH+1); + // precise rounding can make a difference with low consensus counts + countNumber = (int)Math.round((double)MathUtils.sum(counts)/counts.length); } if( !badKmer ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 6b9ba79b4..c16470c48 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -56,6 +56,15 @@ public class ReadUtils { private static int DEFAULT_ADAPTOR_SIZE = 100; public static int CLIPPING_GOAL_NOT_REACHED = -1; + public static int getMeanRepresentativeReadCount(GATKSAMRecord read) { + if (!read.isReducedRead()) + return 1; + + // compute mean representative read counts + final byte[] counts = read.getReducedReadCounts(); + return (int)Math.round((double)MathUtils.sum(counts)/counts.length); + } + /** * A marker to tell which end of the read has been clipped */