Fixes for running HaplotypeCaller with reduced reads: a) minor refactoring, pulled out code to compute mean representative count to ReadUtils, b) Don't use min representative count over kmer when constructing de Bruijn graph - this creates many paths with multiplicity=1 and makes us lose a lot of SNP's at edge of capture targets. Use mean instead

2012-08-06 20:22:12 -04:00 · 2012-08-06 20:22:12 -04:00 · 238d55cb61
parent 44f160f29f
commit 238d55cb61
3 changed files with 15 additions and 12 deletions
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
@ -30,6 +30,7 @@ import com.google.java.contract.Requires;
 import org.broadinstitute.sting.utils.*;
 import org.broadinstitute.sting.utils.collections.Pair;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
 import org.broadinstitute.sting.utils.variantcontext.Allele;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;

@ -92,7 +93,7 @@ public class LikelihoodCalculationEngine {
        final int[][] readCounts = new int[numHaplotypes][numReads];
        for( int iii = 0; iii < numReads; iii++ ) {
            final GATKSAMRecord read = reads.get(iii);
-            final int readCount = getRepresentativeReadCount(read);
+            final int readCount = ReadUtils.getMeanRepresentativeReadCount(read);

            final byte[] overallGCP = new byte[read.getReadLength()];
            Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
@ -123,15 +124,6 @@ public class LikelihoodCalculationEngine {
        }
    }

-    private static int getRepresentativeReadCount(GATKSAMRecord read) {
-        if (!read.isReducedRead())
-            return 1;
-
-        // compute mean representative read counts
-        final byte[] counts = read.getReducedReadCounts();
-        return MathUtils.sum(counts)/counts.length;
-    }
-
    private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) {
        for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ){
            if( b1[iii] != b2[iii] ) {
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java
@ -198,8 +198,10 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
                    }
                    int countNumber = 1;
                    if (read.isReducedRead()) {
-                        // compute min (?) number of reduced read counts in current kmer span
-                        countNumber = MathUtils.arrayMin(Arrays.copyOfRange(reducedReadCounts,iii,iii+KMER_LENGTH+1));
+                        // compute mean number of reduced read counts in current kmer span
+                        final byte[] counts = Arrays.copyOfRange(reducedReadCounts,iii,iii+KMER_LENGTH+1);
+                        // precise rounding can make a difference with low consensus counts
+                        countNumber = (int)Math.round((double)MathUtils.sum(counts)/counts.length);
                    }

                    if( !badKmer ) {
--- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java
@ -56,6 +56,15 @@ public class ReadUtils {
    private static int DEFAULT_ADAPTOR_SIZE = 100;
    public static int CLIPPING_GOAL_NOT_REACHED = -1;

+    public static int getMeanRepresentativeReadCount(GATKSAMRecord read) {
+        if (!read.isReducedRead())
+            return 1;
+
+        // compute mean representative read counts
+        final byte[] counts = read.getReducedReadCounts();
+        return (int)Math.round((double)MathUtils.sum(counts)/counts.length);
+    }
+
    /**
     * A marker to tell which end of the read has been clipped
     */