diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java new file mode 100755 index 000000000..fa163dbd5 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java @@ -0,0 +1,88 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.genotype.GenotypeWriter; + +import java.util.*; + +/** + * The model representing how we calculate a genotype given the priors and a pile + * of bases and quality scores + */ +public abstract class GenotypeCalculationModel implements Cloneable { + + public enum Model { + SINGLE_SAMPLE, + MULTI_SAMPLE_EM, + MULTI_SAMPLE_ALL_MAFS + } + + protected BaseMismatchModel baseModel; + protected Set samples; + protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform; + protected GenotypeWriter out; + protected boolean GENOTYPE_MODE; + protected double LOD_THRESHOLD; + protected int maxDeletionsInPileup; + protected boolean VERBOSE; + + /** + * Create a new GenotypeCalculationModel object with given debugging level + * Assumes that out is not null + * @param baseModel base model to use + * @param samples samples in input bam + * @param platform default platform + * @param out output writer + * @param genotypeMode genotyping + * @param lod lod threshold + * @param maxDeletions max deletions to tolerate + * @param verbose verbose flag + */ + public GenotypeCalculationModel(BaseMismatchModel baseModel, + Set samples, + EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform, + GenotypeWriter out, + boolean genotypeMode, + double lod, + int maxDeletions, + boolean verbose) { + this.baseModel = baseModel; + this.samples = samples; + defaultPlatform = platform; + this.out = out; + GENOTYPE_MODE = genotypeMode; + LOD_THRESHOLD = lod; + maxDeletionsInPileup = maxDeletions; + VERBOSE = verbose; + } + + /** + * Cloning of the object + * @return clone + * @throws CloneNotSupportedException + */ + protected Object clone() throws CloneNotSupportedException { + GenotypeCalculationModel gcm = (GenotypeCalculationModel)super.clone(); + gcm.baseModel = baseModel; + gcm.samples = new HashSet(samples); + gcm.defaultPlatform = defaultPlatform; + gcm.out = out; + gcm.GENOTYPE_MODE = GENOTYPE_MODE; + gcm.LOD_THRESHOLD = LOD_THRESHOLD; + gcm.maxDeletionsInPileup = maxDeletionsInPileup; + gcm.VERBOSE = VERBOSE; + return gcm; + } + + /** + * Must be overridden by concrete subclasses + * @param ref reference base + * @param context alignment context + * @param priors priors to use for GL + * + * @return true if valid locus + */ + public abstract boolean calculateGenotype(char ref, + AlignmentContext context, + DiploidGenotypePriors priors); +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java new file mode 100755 index 000000000..de3e717d9 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java @@ -0,0 +1,38 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import static org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model.*; +import org.broadinstitute.sting.utils.genotype.GenotypeWriter; + +import java.util.Set; + + +public class GenotypeCalculationModelFactory { + //private GenotypeCalculationModelFactory() {} // cannot be instantiated + + public static GenotypeCalculationModel.Model getGenotypeCalculationModel(final String name) { + GenotypeCalculationModel.Model m = valueOf(name); + if ( m == null ) + throw new RuntimeException("Unexpected GenotypeCalculationModel " + name); + else + return m; + } + + /** + * General and correct way to create GenotypeCalculationModel objects for arbitrary models + * + * @param UAC The unified argument collection + * @param samples samples in bam + * @param out output + * @return model + */ + public static GenotypeCalculationModel makeGenotypeCalculation(UnifiedArgumentCollection UAC, + Set samples, + GenotypeWriter out) { + switch ( UAC.genotypeModel ) { + case SINGLE_SAMPLE: return new SingleSampleGenotypeCalculationModel(UAC.baseModel, samples, UAC.defaultPlatform, out, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE); + //case MULTI_SAMPLE_EM: return new MultiSampleEMGenotypeCalculationModel(UAC.baseModel, samples, UAC.defaultPlatform, out, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE); + case MULTI_SAMPLE_ALL_MAFS: return new MultiSampleAllMAFsGenotypeCalculationModel(UAC.baseModel, samples, UAC.defaultPlatform, out, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE); + default: throw new RuntimeException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel); + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java index fdab218c3..7e7de302c 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java @@ -78,6 +78,27 @@ public abstract class GenotypeLikelihoods implements Cloneable { initialize(); } + public static GenotypeLikelihoods merge(GenotypeLikelihoods gl1, GenotypeLikelihoods gl2) { + // GL = GL_i + GL_j + + GenotypeLikelihoods gl; + try { + gl = (GenotypeLikelihoods)gl1.clone(); + } catch (CloneNotSupportedException e) { + throw new StingException(e.getMessage()); + } + assert(gl1.likelihoods.length == gl1.posteriors.length); + assert(gl1.likelihoods.length == gl2.likelihoods.length); + assert(gl1.posteriors.length == gl2.posteriors.length); + + for (int i = 0; i < gl1.likelihoods.length; i++) { + gl.likelihoods[i] += gl2.likelihoods[i]; + gl.posteriors[i] += gl2.posteriors[i]; + } + + return gl; + } + /** * Cloning of the object * @return @@ -218,8 +239,12 @@ public abstract class GenotypeLikelihoods implements Cloneable { int n = 0; for (int i = 0; i < pileup.getReads().size(); i++) { - SAMRecord read = pileup.getReads().get(i); int offset = pileup.getOffsets().get(i); + // ignore deletions + if ( offset == -1 ) + continue; + + SAMRecord read = pileup.getReads().get(i); char base = read.getReadString().charAt(offset); byte qual = read.getBaseQualities()[offset]; if ( ! ignoreBadBases || ! badBase(base) ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiSampleAllMAFsGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiSampleAllMAFsGenotypeCalculationModel.java new file mode 100755 index 000000000..8423a734c --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiSampleAllMAFsGenotypeCalculationModel.java @@ -0,0 +1,42 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.utils.genotype.GenotypeWriter; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; + +import java.util.Set; + +public class MultiSampleAllMAFsGenotypeCalculationModel extends GenotypeCalculationModel { + + public MultiSampleAllMAFsGenotypeCalculationModel(BaseMismatchModel baseModel, + Set samples, + EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform, + GenotypeWriter out, + boolean genotypeMode, + double lod, + int maxDeletions, + boolean verbose) { + super(baseModel, samples, platform, out, genotypeMode, lod, maxDeletions, verbose); + } + + + /** + * Cloning of the object + * @return clone + * @throws CloneNotSupportedException + */ + protected Object clone() throws CloneNotSupportedException { + return super.clone(); + } + + public boolean calculateGenotype(char ref, AlignmentContext context, DiploidGenotypePriors priors) { + + // This is just a stub... + // TODO --- implement me + + // In this alternative approach we calculate the likelihoods of all possible + // MAFs (1-N) and choose the most likely one. We can probably be smart and + // avoid calculating MAFs that we know can't be the most likely... + + return true; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotypeCalculationModel.java new file mode 100755 index 000000000..db8b232ba --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotypeCalculationModel.java @@ -0,0 +1,81 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.GenotypeWriter; + +import java.util.Set; + + +public class SingleSampleGenotypeCalculationModel extends GenotypeCalculationModel { + + private CallResult callsMetrics; + + public SingleSampleGenotypeCalculationModel(BaseMismatchModel baseModel, + Set samples, + EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform, + GenotypeWriter out, + boolean genotypeMode, + double lod, + int maxDeletions, + boolean verbose) { + super(baseModel, samples, platform, out, genotypeMode, lod, maxDeletions, verbose); + + // check that this truly is a single-sample bam + if ( samples.size() > 1 ) + throw new StingException("Single-sample genotype calculation model used on multi-sample bam... aborting"); + + callsMetrics = new CallResult(); + } + + /** + * Cloning of the object + * @return clone + * @throws CloneNotSupportedException + */ + protected Object clone() throws CloneNotSupportedException { + return super.clone(); + } + + public boolean calculateGenotype(char ref, AlignmentContext context, DiploidGenotypePriors priors) { + + GenotypeLikelihoods gl = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform); + gl.setVerbose(VERBOSE); + ReadBackedPileup pileup = new ReadBackedPileup(ref, context); + + // check that there aren't too many deletions in the pileup + pileup.setIncludeDeletionsInPileupString(true); + if ( Utils.countOccurrences(BasicPileup.DELETION_CHAR, pileup.getBases()) > maxDeletionsInPileup ) + return false; + + gl.add(pileup, true); + gl.validate(); + SSGenotypeCall call = new SSGenotypeCall(context.getLocation(), ref, gl, pileup); + + callsMetrics.nCalledBases++; + + if ( GENOTYPE_MODE || call.isVariant(call.getReference()) ) { + double confidence = (GENOTYPE_MODE ? call.getNegLog10PError() : call.toVariation().getNegLog10PError()); + if ( confidence >= LOD_THRESHOLD ) { + callsMetrics.nConfidentCalls++; + //System.out.printf("Call %s%n", call); + out.addGenotypeCall(call); + } else + callsMetrics.nNonConfidentCalls++; + } + return true; + } + + public class CallResult { + long nConfidentCalls = 0; + long nNonConfidentCalls = 0; + long nCalledBases = 0; + + CallResult() {} + + public String toString() { + return String.format("SSG: %d confident and %d non-confident calls were made at %d bases", + nConfidentCalls, nNonConfidentCalls, nCalledBases); + } + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java new file mode 100755 index 000000000..5c7428aba --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -0,0 +1,52 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; + +import java.io.File; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: Sep 29, 2009 + * Time: 12:07:43 PM + * To change this template use File | Settings | File Templates. + */ +public class UnifiedArgumentCollection { + + // control the various models to be used + @Argument(fullName = "genotypeModel", shortName = "gm", doc = "Genotype calculation model to employ -- SINGLE_SAMPLE and MULTI_SAMPLE_EM are the recommended choices, but it's possible to select MULTI_SAMPLE_ALL_MAFs", required = true) + public GenotypeCalculationModel.Model genotypeModel = null; + + @Argument(fullName = "baseModel", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false) + public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL; + + @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) + public Double heterozygosity = DiploidGenotypePriors.HUMAN_HETEROZYGOSITY; + + + // control the output + @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false) + public boolean GENOTYPE = false; + + @Argument(fullName = "verbose", shortName = "v", doc = "EXPERIMENTAL", required = false) + public boolean VERBOSE = false; + + + // control the various parameters to be used + @Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false) + public double LOD_THRESHOLD = Double.MIN_VALUE; + + @Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false) + public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null; + + @Argument(fullName = "maxDeletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false) + public Integer MAX_DELETIONS = 1; + + @Argument(fullName = "maxCoverage", shortName = "maxCoverage", doc = "Maximum reads at this locus for it to be callable; to disable, provide value < 1 [default:10,000]", required = false) + public Integer MAX_READS_IN_PILEUP = 10000; + + //@Argument(fullName = "disableCache", doc = "[ADVANCED] If true, we won't use the caching system. This argument is for testing purposes only", required = false) + //public boolean disableCache = false; + +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java new file mode 100755 index 000000000..99fa0a6f8 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -0,0 +1,129 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.*; +import org.broadinstitute.sting.gatk.filters.*; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.cmdLine.*; +import org.broadinstitute.sting.utils.genotype.*; + +import java.io.File; +import java.util.*; + +import net.sf.samtools.SAMReadGroupRecord; + + +@ReadFilters({ZeroMappingQualityReadFilter.class, MissingReadGroupFilter.class}) +public class UnifiedGenotyper extends LocusWalker { + + @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); + + // control the output + @Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = false) + public File VARIANTS_FILE = null; + + @Argument(fullName = "variant_output_format", shortName = "vf", doc = "File format to be used", required = false) + public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI; + + + // the model used for calculating genotypes + private GenotypeCalculationModel gcm; + + // output writer + private GenotypeWriter writer; + + + /** + * Filter out loci to ignore (at an ambiguous base in the reference or a locus with zero coverage). + * + * @param tracker the meta data tracker + * @param ref the reference base + * @param context contextual information around the locus + * + * @return true if we should look at this locus, false otherwise + */ + public boolean filter(RefMetaDataTracker tracker, char ref, AlignmentContext context) { + return (BaseUtils.simpleBaseToBaseIndex(ref) != -1 && context.getReads().size() != 0); + } + + /** Enable deletions in the pileup */ + public boolean includeReadsWithDeletionAtLoci() { return true; } + + /** Initialize the walker with some sensible defaults */ + public void initialize() { + + // get all of the unique sample names + HashSet samples = new HashSet(); + List readGroups = getToolkit().getSAMFileHeader().getReadGroups(); + for ( SAMReadGroupRecord readGroup : readGroups ) + samples.add(readGroup.getSample()); + + // print them out for debugging (need separate loop to ensure uniqueness) + for ( String sample : samples ) + logger.debug("SAMPLE: " + sample); + + // create the output writer stream + if ( VARIANTS_FILE != null ) + writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), VARIANTS_FILE); + else + writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out); + + gcm = GenotypeCalculationModelFactory.makeGenotypeCalculation(UAC, samples, writer); + } + + /** + * Compute at a given locus. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param context contextual information around the locus + */ + public Integer map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext context) { + char ref = Character.toUpperCase(refContext.getBase()); + if ( !BaseUtils.isRegularBase(ref) ) + return 0; + + // an optimization to speed things up when overly covered + if ( UAC.MAX_READS_IN_PILEUP > 0 && context.getReads().size() > UAC.MAX_READS_IN_PILEUP ) + return 0; + + DiploidGenotypePriors priors = new DiploidGenotypePriors(ref, UAC.heterozygosity, DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE); + return ( gcm.calculateGenotype(ref, context, priors) ? 1 : 0 ); + } + + /** + * Determine whether we're at a Hapmap site + * + * @param tracker the meta data tracker + * + * @return true if we're at a Hapmap site, false if otherwise + */ + private static boolean isHapmapSite(RefMetaDataTracker tracker) { + return tracker.getTrackData("hapmap", null) != null; + } + + /** + * Determine whether we're at a dbSNP site + * + * @param tracker the meta data tracker + * + * @return true if we're at a dbSNP site, false if otherwise + */ + private static boolean isDbSNPSite(RefMetaDataTracker tracker) { + return rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)) != null; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + /** Close the variant file. */ + public void onTraversalDone(Integer sum) { + logger.info("Processed " + sum + " loci that are callable for SNPs"); + writer.close(); + } +}