First pass at a unified caller, being checked in now so Mark can give feedback if he chooses and so Matt can debug issues with the ArgumentCollection class.

Some notes: 1. This design should be flexible enough to include pooled calling (for now) after discussions with Chris. 2. Using the unified caller with the SingleSampleCalculationModel emits the exact same output as SSG over all of chr20 for NA12878. Additionally, when we include the "max deletions allowed at a locus" argument (so we don't try to call SNPs at deletion sites), it removed 233 SNP calls in chr20 that were clearly indel artficts. 3. The MultiSampleEMCalculationModel is still a work in progress and will be checked in later this week. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1740 348d0f76-0448-11de-a6fe-93d51630548a
2009-09-29 16:48:15 +00:00 · 2009-09-29 16:48:15 +00:00 · 19bfe43173
parent 8bd345ba00
commit 19bfe43173
7 changed files with 456 additions and 1 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java
@ -0,0 +1,88 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
+
+import java.util.*;
+
+/**
+ * The model representing how we calculate a genotype given the priors and a pile
+ * of bases and quality scores
+ */
+public abstract class GenotypeCalculationModel implements Cloneable {
+
+    public enum Model {
+        SINGLE_SAMPLE,
+        MULTI_SAMPLE_EM,
+        MULTI_SAMPLE_ALL_MAFS
+    }
+
+    protected BaseMismatchModel baseModel;
+    protected Set<String> samples;
+    protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform;
+    protected GenotypeWriter out;
+    protected boolean GENOTYPE_MODE;
+    protected double LOD_THRESHOLD;
+    protected int maxDeletionsInPileup;
+    protected boolean VERBOSE;
+
+    /**
+     * Create a new GenotypeCalculationModel object with given debugging level
+     * Assumes that out is not null
+     * @param baseModel     base model to use
+     * @param samples       samples in input bam
+     * @param platform      default platform
+     * @param out           output writer
+     * @param genotypeMode  genotyping
+     * @param lod           lod threshold
+     * @param maxDeletions  max deletions to tolerate
+     * @param verbose       verbose flag
+     */
+    public GenotypeCalculationModel(BaseMismatchModel baseModel,
+                                    Set<String> samples,
+                                    EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform,
+                                    GenotypeWriter out,
+                                    boolean genotypeMode,
+                                    double lod,
+                                    int maxDeletions,
+                                    boolean verbose) {
+        this.baseModel = baseModel;
+        this.samples = samples;
+        defaultPlatform = platform;
+        this.out = out;
+        GENOTYPE_MODE = genotypeMode;
+        LOD_THRESHOLD = lod;
+        maxDeletionsInPileup = maxDeletions;
+        VERBOSE = verbose;
+    }
+
+    /**
+     * Cloning of the object
+     * @return clone
+     * @throws CloneNotSupportedException
+     */
+    protected Object clone() throws CloneNotSupportedException {
+        GenotypeCalculationModel gcm = (GenotypeCalculationModel)super.clone();
+        gcm.baseModel = baseModel;
+        gcm.samples = new HashSet<String>(samples);
+        gcm.defaultPlatform = defaultPlatform;
+        gcm.out = out;
+        gcm.GENOTYPE_MODE = GENOTYPE_MODE;
+        gcm.LOD_THRESHOLD = LOD_THRESHOLD;
+        gcm.maxDeletionsInPileup = maxDeletionsInPileup;
+        gcm.VERBOSE = VERBOSE;
+        return gcm;
+    }
+
+    /**
+     * Must be overridden by concrete subclasses
+     * @param ref       reference base
+     * @param context   alignment context
+     * @param priors    priors to use for GL
+     *
+     * @return true if valid locus
+     */
+    public abstract boolean calculateGenotype(char ref,
+                                              AlignmentContext context,
+                                              DiploidGenotypePriors priors);
+}
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java
@ -0,0 +1,38 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import static org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model.*;
+import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
+
+import java.util.Set;
+
+
+public class GenotypeCalculationModelFactory {
+    //private GenotypeCalculationModelFactory() {} // cannot be instantiated
+
+    public static GenotypeCalculationModel.Model getGenotypeCalculationModel(final String name) {
+        GenotypeCalculationModel.Model m = valueOf(name);
+        if ( m == null )
+            throw new RuntimeException("Unexpected GenotypeCalculationModel " + name);
+        else
+            return m;
+    }
+
+    /**
+     * General and correct way to create GenotypeCalculationModel objects for arbitrary models
+     *
+     * @param UAC           The unified argument  collection
+     * @param samples       samples in bam
+     * @param out           output
+     * @return model
+     */
+    public static GenotypeCalculationModel makeGenotypeCalculation(UnifiedArgumentCollection UAC,
+                                                                   Set<String> samples,
+                                                                   GenotypeWriter out) {
+        switch ( UAC.genotypeModel ) {
+            case SINGLE_SAMPLE: return new SingleSampleGenotypeCalculationModel(UAC.baseModel, samples, UAC.defaultPlatform, out, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE);
+            //case MULTI_SAMPLE_EM: return new MultiSampleEMGenotypeCalculationModel(UAC.baseModel, samples, UAC.defaultPlatform, out, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE);
+            case MULTI_SAMPLE_ALL_MAFS: return new MultiSampleAllMAFsGenotypeCalculationModel(UAC.baseModel, samples, UAC.defaultPlatform, out, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE);
+            default: throw new RuntimeException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel);
+        }
+    }
+}
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java
@ -78,6 +78,27 @@ public abstract class GenotypeLikelihoods implements Cloneable {
        initialize();
    }

+    public static GenotypeLikelihoods merge(GenotypeLikelihoods gl1, GenotypeLikelihoods gl2) {
+        // GL = GL_i + GL_j
+
+        GenotypeLikelihoods gl;
+        try {
+            gl = (GenotypeLikelihoods)gl1.clone();
+        } catch (CloneNotSupportedException e) {
+            throw new StingException(e.getMessage());
+        }
+        assert(gl1.likelihoods.length == gl1.posteriors.length);
+        assert(gl1.likelihoods.length == gl2.likelihoods.length);
+        assert(gl1.posteriors.length == gl2.posteriors.length);
+
+        for (int i = 0; i < gl1.likelihoods.length; i++) {
+            gl.likelihoods[i] += gl2.likelihoods[i];
+            gl.posteriors[i] += gl2.posteriors[i];
+        }
+
+        return gl;
+    }
+
    /**
     * Cloning of the object
     * @return
@ -218,8 +239,12 @@ public abstract class GenotypeLikelihoods implements Cloneable {
        int n = 0;

        for (int i = 0; i < pileup.getReads().size(); i++) {
-            SAMRecord read = pileup.getReads().get(i);
            int offset = pileup.getOffsets().get(i);
+            // ignore deletions
+            if ( offset == -1 )
+                continue;
+
+            SAMRecord read = pileup.getReads().get(i);
            char base = read.getReadString().charAt(offset);
            byte qual = read.getBaseQualities()[offset];
            if ( ! ignoreBadBases || ! badBase(base) ) {
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiSampleAllMAFsGenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiSampleAllMAFsGenotypeCalculationModel.java
@ -0,0 +1,42 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+
+import java.util.Set;
+
+public class MultiSampleAllMAFsGenotypeCalculationModel extends GenotypeCalculationModel {
+
+    public MultiSampleAllMAFsGenotypeCalculationModel(BaseMismatchModel baseModel,
+                                                      Set<String> samples,
+                                                      EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform,
+                                                      GenotypeWriter out,
+                                                      boolean genotypeMode,
+                                                      double lod,
+                                                      int maxDeletions,
+                                                      boolean verbose) {
+        super(baseModel, samples, platform, out, genotypeMode, lod, maxDeletions, verbose);
+    }
+
+
+    /**
+     * Cloning of the object
+     * @return clone
+     * @throws CloneNotSupportedException
+     */
+    protected Object clone() throws CloneNotSupportedException {
+        return super.clone();
+    }
+
+    public boolean calculateGenotype(char ref, AlignmentContext context, DiploidGenotypePriors priors) {
+
+        // This is just a stub...
+        // TODO --- implement me
+
+        // In this alternative approach we calculate the likelihoods of all possible
+        // MAFs (1-N) and choose the most likely one.  We can probably be smart and
+        // avoid calculating MAFs that we know can't be the most likely...
+
+        return true;
+    }
+}
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotypeCalculationModel.java
@ -0,0 +1,81 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
+
+import java.util.Set;
+
+
+public class SingleSampleGenotypeCalculationModel extends GenotypeCalculationModel {
+
+    private CallResult callsMetrics;
+
+    public SingleSampleGenotypeCalculationModel(BaseMismatchModel baseModel,
+                                                Set<String> samples,
+                                                EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform,
+                                                GenotypeWriter out,
+                                                boolean genotypeMode,
+                                                double lod,
+                                                int maxDeletions,
+                                                boolean verbose) {
+        super(baseModel, samples, platform, out, genotypeMode, lod, maxDeletions, verbose);
+
+        // check that this truly is a single-sample bam
+        if ( samples.size() > 1 )
+            throw new StingException("Single-sample genotype calculation model used on multi-sample bam... aborting");
+
+        callsMetrics = new CallResult();
+    }
+
+    /**
+     * Cloning of the object
+     * @return clone
+     * @throws CloneNotSupportedException
+     */
+    protected Object clone() throws CloneNotSupportedException {
+        return super.clone();
+    }
+
+    public boolean calculateGenotype(char ref, AlignmentContext context, DiploidGenotypePriors priors) {
+
+        GenotypeLikelihoods gl = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
+        gl.setVerbose(VERBOSE);
+        ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
+
+        // check that there aren't too many deletions in the pileup
+        pileup.setIncludeDeletionsInPileupString(true);
+        if ( Utils.countOccurrences(BasicPileup.DELETION_CHAR, pileup.getBases()) > maxDeletionsInPileup )
+            return false;
+
+        gl.add(pileup, true);
+        gl.validate();
+        SSGenotypeCall call = new SSGenotypeCall(context.getLocation(), ref, gl, pileup);
+
+        callsMetrics.nCalledBases++;
+
+        if ( GENOTYPE_MODE || call.isVariant(call.getReference()) ) {
+            double confidence = (GENOTYPE_MODE ? call.getNegLog10PError() : call.toVariation().getNegLog10PError());
+            if ( confidence >= LOD_THRESHOLD ) {
+                callsMetrics.nConfidentCalls++;
+                //System.out.printf("Call %s%n", call);
+                out.addGenotypeCall(call);
+            } else
+                callsMetrics.nNonConfidentCalls++;
+        }
+        return true;
+    }
+
+    public class CallResult {
+        long nConfidentCalls = 0;
+        long nNonConfidentCalls = 0;
+        long nCalledBases = 0;
+
+        CallResult() {}
+
+        public String toString() {
+            return String.format("SSG: %d confident and %d non-confident calls were made at %d bases",
+                    nConfidentCalls, nNonConfidentCalls, nCalledBases);
+        }
+    }
+}
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@ -0,0 +1,52 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.broadinstitute.sting.utils.cmdLine.Argument;
+import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
+
+import java.io.File;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: ebanks
+ * Date: Sep 29, 2009
+ * Time: 12:07:43 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class UnifiedArgumentCollection {
+
+    // control the various models to be used
+    @Argument(fullName = "genotypeModel", shortName = "gm", doc = "Genotype calculation model to employ -- SINGLE_SAMPLE and MULTI_SAMPLE_EM are the recommended choices, but it's possible to select MULTI_SAMPLE_ALL_MAFs", required = true)
+    public GenotypeCalculationModel.Model genotypeModel = null;
+
+    @Argument(fullName = "baseModel", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
+    public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL;
+
+    @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
+    public Double heterozygosity = DiploidGenotypePriors.HUMAN_HETEROZYGOSITY;
+
+
+    // control the output
+    @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
+    public boolean GENOTYPE = false;
+
+    @Argument(fullName = "verbose", shortName = "v", doc = "EXPERIMENTAL", required = false)
+    public boolean VERBOSE = false;
+
+
+    // control the various parameters to be used
+    @Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)
+    public double LOD_THRESHOLD = Double.MIN_VALUE;
+
+    @Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform.  Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
+    public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null;
+
+    @Argument(fullName = "maxDeletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false)
+    public Integer MAX_DELETIONS = 1;
+
+    @Argument(fullName = "maxCoverage", shortName = "maxCoverage", doc = "Maximum reads at this locus for it to be callable; to disable, provide value < 1 [default:10,000]", required = false)
+    public Integer MAX_READS_IN_PILEUP = 10000;
+
+    //@Argument(fullName = "disableCache", doc = "[ADVANCED] If true, we won't use the caching system.  This argument is for testing purposes only", required = false)
+    //public boolean disableCache = false;    
+
+}
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
@ -0,0 +1,129 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.gatk.contexts.*;
+import org.broadinstitute.sting.gatk.filters.*;
+import org.broadinstitute.sting.gatk.refdata.*;
+import org.broadinstitute.sting.gatk.walkers.*;
+import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.utils.cmdLine.*;
+import org.broadinstitute.sting.utils.genotype.*;
+
+import java.io.File;
+import java.util.*;
+
+import net.sf.samtools.SAMReadGroupRecord;
+
+
+@ReadFilters({ZeroMappingQualityReadFilter.class, MissingReadGroupFilter.class})
+public class UnifiedGenotyper extends LocusWalker<Integer, Integer> {
+
+    @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
+
+    // control the output
+    @Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = false)
+    public File VARIANTS_FILE = null;
+
+    @Argument(fullName = "variant_output_format", shortName = "vf", doc = "File format to be used", required = false)
+    public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI;
+
+
+    // the model used for calculating genotypes
+    private GenotypeCalculationModel gcm;
+
+    // output writer
+    private GenotypeWriter writer;
+
+
+     /**
+     * Filter out loci to ignore (at an ambiguous base in the reference or a locus with zero coverage).
+     *
+     * @param tracker the meta data tracker
+     * @param ref     the reference base
+     * @param context contextual information around the locus
+     *
+     * @return true if we should look at this locus, false otherwise
+     */
+    public boolean filter(RefMetaDataTracker tracker, char ref, AlignmentContext context) {
+        return (BaseUtils.simpleBaseToBaseIndex(ref) != -1 && context.getReads().size() != 0);
+    }
+
+    /** Enable deletions in the pileup */
+    public boolean includeReadsWithDeletionAtLoci() { return true; }
+
+    /** Initialize the walker with some sensible defaults */
+    public void initialize() {
+
+        // get all of the unique sample names
+        HashSet<String> samples = new HashSet<String>();
+        List<SAMReadGroupRecord> readGroups = getToolkit().getSAMFileHeader().getReadGroups();
+        for ( SAMReadGroupRecord readGroup : readGroups )
+            samples.add(readGroup.getSample());
+
+        // print them out for debugging (need separate loop to ensure uniqueness)
+        for ( String sample : samples )
+            logger.debug("SAMPLE: " + sample);
+
+        // create the output writer stream
+        if ( VARIANTS_FILE != null )
+            writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), VARIANTS_FILE);
+        else
+            writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out);
+
+        gcm = GenotypeCalculationModelFactory.makeGenotypeCalculation(UAC, samples, writer);
+    }
+
+    /**
+     * Compute at a given locus.
+     *
+     * @param tracker the meta data tracker
+     * @param refContext the reference base
+     * @param context contextual information around the locus
+     */
+    public Integer map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext context) {
+        char ref = Character.toUpperCase(refContext.getBase());
+        if ( !BaseUtils.isRegularBase(ref) )
+            return 0;
+
+        // an optimization to speed things up when overly covered
+        if ( UAC.MAX_READS_IN_PILEUP > 0 && context.getReads().size() > UAC.MAX_READS_IN_PILEUP )
+            return 0;
+
+        DiploidGenotypePriors priors = new DiploidGenotypePriors(ref, UAC.heterozygosity, DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE);
+        return ( gcm.calculateGenotype(ref, context, priors) ?  1 : 0 );
+    }
+
+    /**
+     * Determine whether we're at a Hapmap site
+     *
+     * @param tracker the meta data tracker
+     *
+     * @return true if we're at a Hapmap site, false if otherwise
+     */
+    private static boolean isHapmapSite(RefMetaDataTracker tracker) {
+        return tracker.getTrackData("hapmap", null) != null;
+    }
+
+    /**
+     * Determine whether we're at a dbSNP site
+     *
+     * @param tracker the meta data tracker
+     *
+     * @return true if we're at a dbSNP site, false if otherwise
+     */
+    private static boolean isDbSNPSite(RefMetaDataTracker tracker) {
+        return rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)) != null;
+    }
+
+    public Integer reduceInit() { return 0; }
+
+    public Integer reduce(Integer value, Integer sum) {
+        return sum + value;
+    }
+
+    /** Close the variant file. */
+    public void onTraversalDone(Integer sum) {
+        logger.info("Processed " + sum + " loci that are callable for SNPs");
+        writer.close();
+    }
+}