LikelihoodCalculationEngine will now only use reads longer than the minReadLength, which is currently fixed at 20 bp

2013-04-03 08:59:09 -04:00 · 2013-04-03 08:59:09 -04:00 · 9b5c55a84a
parent af593094a2
commit 9b5c55a84a
1 changed files with 23 additions and 4 deletions
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
@ -69,6 +69,7 @@ public class LikelihoodCalculationEngine {
    private final byte constantGCP;
    private final boolean DEBUG;
    private final PairHMM pairHMM;
+    private final int minReadLength = 20;

    public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) {

@ -90,9 +91,16 @@ public class LikelihoodCalculationEngine {
        DEBUG = debug;
    }

-    public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList ) {
-
-        final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
+    /**
+     * Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate
+     *
+     * After calling this routine the PairHMM will be configured to best evaluate all reads in the samples
+     * against the set of haplotypes
+     *
+     * @param haplotypes a non-null list of haplotypes
+     * @param perSampleReadList a mapping from sample -> reads
+     */
+    private void initializePairHMM(final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList) {
        int X_METRIC_LENGTH = 0;
        for( final Map.Entry<String, List<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
            for( final GATKSAMRecord read : sample.getValue() ) {
@ -108,13 +116,20 @@ public class LikelihoodCalculationEngine {

        // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
        pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
+    }

-        // for each sample's reads
+    public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList ) {
+        // configure the HMM
+        initializePairHMM(haplotypes, perSampleReadList);
+
+        // Add likelihoods for each sample's reads to our stratifiedReadMap
+        final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
        for( final Map.Entry<String, List<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
            //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
            // evaluate the likelihood of the reads given those haplotypes
            stratifiedReadMap.put(sampleEntry.getKey(), computeReadLikelihoods(haplotypes, sampleEntry.getValue()));
        }
+
        return stratifiedReadMap;
    }

@ -128,6 +143,10 @@ public class LikelihoodCalculationEngine {

        final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
        for( final GATKSAMRecord read : reads ) {
+            if ( read.getReadLength() < minReadLength )
+                // don't consider any reads that have a read length < the minimum
+                continue;
+
            final byte[] overallGCP = new byte[read.getReadLength()];
            Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
            // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read