Don't filter out reads without proper read groups. Instead, allow the user (or another walker calling UG) to specify an assumed sample to use (but then we assume single-sample mode).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1883 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-10-20 01:30:53 +00:00
parent a8a2c1a2a1
commit b8ab77c91c
5 changed files with 151 additions and 25 deletions

View File

@ -1,8 +1,8 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.*;
import java.util.*;
@ -10,33 +10,69 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel
protected AllMAFsGenotypeCalculationModel() {}
private double[] alleleFrequencies;
// because the null allele frequencies are constant for a given N,
// we cache the results to avoid having to recompute everything
private HashMap<Integer, double[]> nullAlleleFrequencyCache = new HashMap<Integer, double[]>();
// the allele frequencies
private double[][] alleleFrequencies = new double[3][];
private double[][] oldAlleleFrequencies;
// keep track of whether or not a given MAF is stable
private boolean[] frequencyStabilityArray = new boolean[3];
// the minimum and actual number of points in our allele frequency estimation
private static final int MIN_ESTIMATION_POINTS = 1000;
private int estimationPoints;
// the GenotypeLikelihoods map
private HashMap<String, AlleleSpecificGenotypeLikelihoods> GLs = new HashMap<String, AlleleSpecificGenotypeLikelihoods>();
protected void initializeAlleleFrequencies(int numSamples, char ref) {
// we have 2N possible allele frequencies in pileup
int possibleMAFs = 2 * numSamples;
alleleFrequencies = new double[possibleMAFs];
// first, initialize the stability array to "unstable"
for (int i = 0; i < 3; i++)
frequencyStabilityArray[i] = false;
// calculate sum(1/i) for i from 1 to 2N
double denominator = 0.0;
for (int i = 1; i <= possibleMAFs; i++)
denominator += 1.0 / (double)i;
// calculate the number of estimation points to use:
// it's either MIN_ESTIMATION_POINTS or 2N if that's larger
// (add 1 for allele frequency of zero)
estimationPoints = Math.max(MIN_ESTIMATION_POINTS, 2 * numSamples) + 1;
// set up delta
double delta = 1.0 / denominator;
for (int alt = 0; alt < 3; alt++)
alleleFrequencies[alt] = getNullAlleleFrequencies(estimationPoints);
// calculate the null allele frequencies
for (int i = 1; i <= possibleMAFs; i++)
alleleFrequencies[i-1] = Math.log10(delta / (double)i);
for (int i = 1; i < estimationPoints; i++)
logger.debug("Initial allele frequency for MAF=" + i + ": " + alleleFrequencies[0][i]);
}
for (int i = 0; i < possibleMAFs; i++)
logger.debug("Initial allele frequency for MAF=" + (i+1) + ": " + alleleFrequencies[i]);
private double[] getNullAlleleFrequencies(int N) {
    // Check the cache first so the table is computed only once per value of N
    double[] cached = nullAlleleFrequencyCache.get(N);
    if ( cached == null ) {
        // harmonic sum: sum of 1/i for i in [1, N)
        double harmonicSum = 0.0;
        for (int i = 1; i < N; i++)
            harmonicSum += 1.0 / (double)i;
        // normalizing constant so the non-zero-frequency entries sum to one
        double normalization = 1.0 / harmonicSum;
        // fill the null allele frequencies in log10 space; index 0 (allele
        // frequency of zero) is left at 0.0 — NOTE(review): confirm intended
        cached = new double[N];
        for (int i = 1; i < N; i++)
            cached[i] = Math.log10(normalization / (double)i);
        nullAlleleFrequencyCache.put(N, cached);
    }
    // hand back a copy so callers cannot corrupt the cached table
    return cached.clone();
}
protected void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {
HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
GLs.clear();
for ( String sample : contexts.keySet() ) {
AlignmentContextBySample context = contexts.get(sample);
@ -47,7 +83,7 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel
GL.setVerbose(VERBOSE);
GL.add(pileup, true);
GLs.put(sample, GL);
GLs.put(sample, new AlleleSpecificGenotypeLikelihoods(ref, GL));
}
}
@ -60,10 +96,67 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel
}
protected boolean isStable() {
    // EM is considered converged once every alternate allele's MAF estimate
    // moves by no more than EM_STABILITY_METRIC between iterations.
    for (int alt = 0; alt < 3; alt++) {
        // alleles already flagged as converged need no recomputation
        if ( !frequencyStabilityArray[alt] ) {
            // total absolute change across all estimation points for this allele
            double totalChange = 0.0;
            for (int point = 1; point < estimationPoints; point++)
                totalChange += Math.abs(oldAlleleFrequencies[alt][point] - alleleFrequencies[alt][point]);
            // one unstable allele is enough to keep iterating
            if (totalChange > EM_STABILITY_METRIC)
                return false;
            // converged — remember it so later iterations skip this allele
            frequencyStabilityArray[alt] = true;
        }
    }
    // every alternate allele has converged
    return true;
}
/**
 * Computes the final EM output (P of F) for the given reference base.
 * NOTE(review): currently a stub that always returns null — callers must be
 * prepared for a null result; confirm whether this is intentional.
 */
protected EMOutput computePofF(char ref) {
    return null;
}
/**
 * A class for the allele-specific genotype likelihoods.
 * Decomposes a per-sample GenotypeLikelihoods object into values specific to
 * the reference base and each possible alternate base.
 */
protected class AlleleSpecificGenotypeLikelihoods {
    // per-sample storage: pairs of (likelihoods, posteriors) arrays
    private HashMap<String, Pair<double[], double[]>> GLs = new HashMap<String, Pair<double[], double[]>>();

    /**
     * Builds the allele-specific view for one sample.
     *
     * @param ref the reference base at this locus
     * @param GL  the full diploid genotype likelihoods for the sample
     */
    AlleleSpecificGenotypeLikelihoods(char ref, GenotypeLikelihoods GL) {
        // raw arrays from the underlying model
        // NOTE(review): these two locals are never read below — confirm whether
        // this constructor is complete or still work-in-progress
        double[] likelihoods = GL.getLikelihoods();
        double[] posteriors = GL.getPosteriors();

        // likelihood/posterior of the hom-ref genotype, used as the baseline
        DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref);
        double refLikelihood = GL.getLikelihood(refGenotype);
        double refPosterior = GL.getPosterior(refGenotype);
        String refStr = String.valueOf(ref);

        // walk the possible alternate bases, skipping the reference itself
        for ( char base : BaseUtils.BASES ) {
            if ( base == ref )
                continue;

            // get hom var and het likelihoods
            // NOTE(review): the computed values are currently discarded (never
            // stored into GLs) — confirm the intended use before relying on this
            double homLikelihood = GL.getLikelihood(DiploidGenotype.createHomGenotype(base));
            double hetLikelihood = GL.getLikelihood(DiploidGenotype.valueOf(refStr + String.valueOf(base)));
        }
    }
}
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.*;
@ -130,7 +131,14 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
if ( POOLED_INPUT ) {
sample = "POOL";
} else {
sample = read.getReadGroup().getSample();
SAMReadGroupRecord readGroup = read.getReadGroup();
if ( readGroup == null ) {
if ( assumedSingleSample == null )
throw new StingException("Missing read group for read " + read.getReadName());
sample = assumedSingleSample;
} else {
sample = readGroup.getSample();
}
}
// create a new context object if this is the first time we're seeing a read for this sample

View File

@ -29,6 +29,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
protected int POOL_SIZE;
protected double LOD_THRESHOLD;
protected int maxDeletionsInPileup;
protected String assumedSingleSample;
protected boolean VERBOSE;
/**
@ -57,6 +58,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
POOL_SIZE = UAC.POOLSIZE;
LOD_THRESHOLD = UAC.LOD_THRESHOLD;
maxDeletionsInPileup = UAC.MAX_DELETIONS;
assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE;
VERBOSE = UAC.VERBOSE;
}
@ -76,10 +78,15 @@ public abstract class GenotypeCalculationModel implements Cloneable {
gcm.POOLED_INPUT = POOLED_INPUT;
gcm.LOD_THRESHOLD = LOD_THRESHOLD;
gcm.maxDeletionsInPileup = maxDeletionsInPileup;
gcm.assumedSingleSample = assumedSingleSample;
gcm.VERBOSE = VERBOSE;
return gcm;
}
/**
 * Sets the single sample to assume for reads that lack a read group.
 *
 * @param sample the sample name to associate with such reads
 */
public void setAssumedSingleSample(String sample) {
    assumedSingleSample = sample;
}
/**
* Must be overridden by concrete subclasses
* @param tracker rod data

View File

@ -55,6 +55,11 @@ public class UnifiedArgumentCollection {
public boolean VERBOSE = false;
// control the error modes
@Argument(fullName = "assumeSingleSampleReads", shortName = "singleSample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
public String ASSUME_SINGLE_SAMPLE = null;
// control the various parameters to be used
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)
public double LOD_THRESHOLD = Double.MIN_VALUE;

View File

@ -30,7 +30,6 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.MissingReadGroupFilter;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
@ -52,7 +51,7 @@ import java.util.List;
import java.util.ArrayList;
@ReadFilters({ZeroMappingQualityReadFilter.class, MissingReadGroupFilter.class})
@ReadFilters({ZeroMappingQualityReadFilter.class})
public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, GenotypeMetaData>, Integer> {
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
@ -83,6 +82,15 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, Genot
/** Enable deletions in the pileup **/
public boolean includeReadsWithDeletionAtLoci() { return true; }
/**
 * Sets the single sample to assume when read groups are missing.
 * To be used with walkers that call the UnifiedGenotyper's map function.
 *
 * @param sample the sample name to associate with reads lacking a read group
 **/
public void setAssumedSingleSample(String sample) {
    gcm.setAssumedSingleSample(sample);
}
/**
* Initialize the samples, output, and genotype calculation model
*
@ -95,9 +103,14 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, Genot
// get all of the unique sample names
samples = new HashSet<String>();
List<SAMReadGroupRecord> readGroups = getToolkit().getSAMFileHeader().getReadGroups();
for ( SAMReadGroupRecord readGroup : readGroups )
samples.add(readGroup.getSample());
// if we're supposed to assume a single sample
if ( UAC.ASSUME_SINGLE_SAMPLE != null ) {
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
} else {
List<SAMReadGroupRecord> readGroups = getToolkit().getSAMFileHeader().getReadGroups();
for ( SAMReadGroupRecord readGroup : readGroups )
samples.add(readGroup.getSample());
}
// print them out for debugging (need separate loop to ensure uniqueness)
for ( String sample : samples )