diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java index 03b844024..9d22fc3e7 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java @@ -16,13 +16,10 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode // We consider the EM stable when the MAF doesn't change more than 1/10N protected static final double EM_STABILITY_METRIC = 0.1; - // keep track of some metrics about our calls - protected CallMetrics callsMetrics = new CallMetrics(); - protected EMGenotypeCalculationModel() {} - public List calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { + public Pair, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { // keep track of the context for each sample, overall and separated by strand int[] baseCounts = new int[4]; @@ -51,23 +48,7 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode // generate the calls GenotypeMetaData metadata = new GenotypeMetaData(lod, strandScore, overall.getMAF()); - List calls = genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts); - if ( calls != null && calls.size() != 0 ) { - - // use multi-sample mode if we have multiple samples or the output type allows it - if ( out.supportsMultiSample() || samples.size() > 1 ) { - - // annoying hack to get around Java generics - ArrayList callList = new ArrayList(); - for ( GenotypeCall call : calls ) - callList.add(call); - - out.addMultiSampleCall(callList, metadata); - } else { - out.addGenotypeCall(calls.get(0)); - } - } - return calls; + return new Pair, GenotypeMetaData>(genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts), metadata); } protected List genotypeCallsFromGenotypeLikelihoods(EMOutput results, char ref, HashMap contexts) { @@ -76,7 +57,7 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode // an optimization double expectedChromosomes = 2.0 * (double)GLs.size() * results.getMAF(); if ( expectedChromosomes < 1.0 ) - return null; + return new ArrayList(); ArrayList calls = new ArrayList(); int variantCalls = 0; @@ -113,8 +94,6 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode // get the initial genotype likelihoods HashMap GLs = initializeGenotypeLikelihoods(ref, contexts, alleleFrequencies, priors, contextType); - callsMetrics.nCalledBases++; - // The EM loop: // we want to continue until the calculation is stable, but we need some max on the number of iterations int iterations = 0; @@ -233,23 +212,6 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode } - /** - * A class to keep track of some basic metrics about our calls - */ - protected class CallMetrics { - long nConfidentCalls = 0; - long nNonConfidentCalls = 0; - long nCalledBases = 0; - - CallMetrics() {} - - public String toString() { - return String.format("UG: %d confident and %d non-confident calls were made at %d bases", - nConfidentCalls, nNonConfidentCalls, nCalledBases); - } - } - - /** * A class to keep track of the EM output */ diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java index db62965b7..875793b57 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java @@ -2,7 +2,8 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.genotype.GenotypeWriter; +import org.broadinstitute.sting.utils.genotype.GenotypeMetaData; +import org.broadinstitute.sting.utils.Pair; import org.apache.log4j.Logger; import java.util.*; @@ -20,7 +21,6 @@ public abstract class GenotypeCalculationModel implements Cloneable { protected BaseMismatchModel baseModel; protected Set samples; - protected GenotypeWriter out; protected Logger logger; protected double heterozygosity; protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform; @@ -40,16 +40,13 @@ public abstract class GenotypeCalculationModel implements Cloneable { * Initialize the GenotypeCalculationModel object * Assumes that out is not null * @param samples samples in input bam - * @param out output writer * @param logger logger * @param UAC unified arg collection */ protected void initialize(Set samples, - GenotypeWriter out, Logger logger, UnifiedArgumentCollection UAC) { this.samples = samples; - this.out = out; this.logger = logger; baseModel = UAC.baseModel; heterozygosity = UAC.heterozygosity; @@ -69,7 +66,6 @@ public abstract class GenotypeCalculationModel implements Cloneable { protected Object clone() throws CloneNotSupportedException { GenotypeCalculationModel gcm = (GenotypeCalculationModel)super.clone(); gcm.samples = new HashSet(samples); - gcm.out = out; gcm.logger = logger; gcm.baseModel = baseModel; gcm.heterozygosity = heterozygosity; @@ -89,10 +85,10 @@ public abstract class GenotypeCalculationModel implements Cloneable { * @param context alignment context * @param priors priors to use for GL * - * @return list of calls + * @return calls */ - public abstract List calculateGenotype(RefMetaDataTracker tracker, - char ref, - AlignmentContext context, - DiploidGenotypePriors priors); + public abstract Pair, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, + char ref, + AlignmentContext context, + DiploidGenotypePriors priors); } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java index 100fd1957..d62b10f6a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import static org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model.*; -import org.broadinstitute.sting.utils.genotype.GenotypeWriter; import org.apache.log4j.Logger; import java.util.Set; @@ -48,12 +47,10 @@ public class GenotypeCalculationModelFactory { * * @param UAC The unified argument collection * @param samples samples in bam - * @param out output writer * @param logger logger * @return model */ public static GenotypeCalculationModel makeGenotypeCalculation(Set samples, - GenotypeWriter out, Logger logger, UnifiedArgumentCollection UAC) { GenotypeCalculationModel gcm; @@ -67,7 +64,7 @@ public class GenotypeCalculationModelFactory { default: throw new RuntimeException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel); } - gcm.initialize(samples, out, logger, UAC); + gcm.initialize(samples, logger, UAC); return gcm; } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java index e77839df5..fe3dee8c2 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java @@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; +import org.broadinstitute.sting.utils.genotype.GenotypeMetaData; import java.util.*; @@ -12,7 +13,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation protected PointEstimateGenotypeCalculationModel() {} // overload this method so we can special-case the single sample - public List calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { + public Pair, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { // we don't actually want to run EM for single samples if ( samples.size() == 1 ) { @@ -30,22 +31,10 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation if ( sampleContext == null ) return null; - callsMetrics.nCalledBases++; - // create the genotype call object Pair discoveryGL = getSingleSampleLikelihoods(ref, sampleContext, priors, StratifiedContext.OVERALL); GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, discoveryGL.second, discoveryGL.first); - - if ( GENOTYPE_MODE || call.isVariant(call.getReference()) ) { - double confidence = (GENOTYPE_MODE ? call.getNegLog10PError() : call.toVariation().getNegLog10PError()); - if ( confidence >= LOD_THRESHOLD ) { - callsMetrics.nConfidentCalls++; - out.addGenotypeCall(call); - } else { - callsMetrics.nNonConfidentCalls++; - } - } - return Arrays.asList(call); + return new Pair, GenotypeMetaData>(Arrays.asList(call), null); } return super.calculateGenotype(tracker, ref, context, priors); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 7a8ecd5d8..a59a943ca 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -36,18 +36,22 @@ import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.ReadFilters; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection; import org.broadinstitute.sting.utils.genotype.GenotypeWriter; import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; +import org.broadinstitute.sting.utils.genotype.Genotype; +import org.broadinstitute.sting.utils.genotype.GenotypeMetaData; import java.io.File; import java.util.HashSet; import java.util.List; +import java.util.ArrayList; @ReadFilters({ZeroMappingQualityReadFilter.class, MissingReadGroupFilter.class}) -public class UnifiedGenotyper extends LocusWalker, Integer> { +public class UnifiedGenotyper extends LocusWalker, GenotypeMetaData>, Integer> { @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); @@ -65,6 +69,11 @@ public class UnifiedGenotyper extends LocusWalker, Integer> { // output writer private GenotypeWriter writer; + // samples in input + private HashSet samples; + + // keep track of some metrics about our calls + private CallMetrics callsMetrics; /** * Filter out loci to ignore (at an ambiguous base in the reference or a locus with zero coverage). @@ -86,7 +95,7 @@ public class UnifiedGenotyper extends LocusWalker, Integer> { public void initialize() { // get all of the unique sample names - HashSet samples = new HashSet(); + samples = new HashSet(); List readGroups = getToolkit().getSAMFileHeader().getReadGroups(); for ( SAMReadGroupRecord readGroup : readGroups ) samples.add(readGroup.getSample()); @@ -104,7 +113,8 @@ public class UnifiedGenotyper extends LocusWalker, Integer> { writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out, "UnifiedGenotyper", this.getToolkit().getArguments().referenceFile.getName()); - gcm = GenotypeCalculationModelFactory.makeGenotypeCalculation(samples, writer, logger, UAC); + gcm = GenotypeCalculationModelFactory.makeGenotypeCalculation(samples, logger, UAC); + callsMetrics = new CallMetrics(); } /** @@ -114,7 +124,7 @@ public class UnifiedGenotyper extends LocusWalker, Integer> { * @param refContext the reference base * @param context contextual information around the locus */ - public List map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext context) { + public Pair, GenotypeMetaData> map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext context) { char ref = Character.toUpperCase(refContext.getBase()); if ( !BaseUtils.isRegularBase(ref) ) return null; @@ -151,7 +161,47 @@ public class UnifiedGenotyper extends LocusWalker, Integer> { public Integer reduceInit() { return 0; } - public Integer reduce(List value, Integer sum) { + public Integer reduce(Pair, GenotypeMetaData> value, Integer sum) { + if ( value == null || value.first == null ) + return sum; + + callsMetrics.nCalledBases++; + + if ( value.first.size() == 0 ) + return sum; + + // special-case for single-sample using PointEstimate model + if ( value.second == null ) { + GenotypeCall call = value.first.get(0); + if ( UAC.GENOTYPE || call.isVariant(call.getReference()) ) { + double confidence = (UAC.GENOTYPE ? call.getNegLog10PError() : call.toVariation().getNegLog10PError()); + if ( confidence >= UAC.LOD_THRESHOLD ) { + callsMetrics.nConfidentCalls++; + writer.addGenotypeCall(call); + } + } else { + callsMetrics.nNonConfidentCalls++; + } + } + + // use multi-sample mode if we have multiple samples or the output type allows it + else if ( writer.supportsMultiSample() || samples.size() > 1 ) { + + // annoying hack to get around Java generics + ArrayList callList = new ArrayList(); + for ( GenotypeCall call : value.first ) + callList.add(call); + + callsMetrics.nConfidentCalls++; + writer.addMultiSampleCall(callList, value.second); + } + + // otherwise, use single sample mode + else { + callsMetrics.nConfidentCalls++; + writer.addGenotypeCall(value.first.get(0)); + } + return sum + 1; } @@ -160,4 +210,20 @@ public class UnifiedGenotyper extends LocusWalker, Integer> { logger.info("Processed " + sum + " loci that are callable for SNPs"); writer.close(); } + + /** + * A class to keep track of some basic metrics about our calls + */ + protected class CallMetrics { + long nConfidentCalls = 0; + long nNonConfidentCalls = 0; + long nCalledBases = 0; + + CallMetrics() {} + + public String toString() { + return String.format("UG: %d confident and %d non-confident calls were made at %d bases", + nConfidentCalls, nNonConfidentCalls, nCalledBases); + } + } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseTransitionTableCalculatorJavaWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseTransitionTableCalculatorJavaWalker.java index a619109c6..cf24922ff 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseTransitionTableCalculatorJavaWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseTransitionTableCalculatorJavaWalker.java @@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.GenotypeMetaData; import java.util.List; import java.util.LinkedList; @@ -80,8 +81,10 @@ public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker calls = ug.map(tracker,ref,context); - return ( ! calls.get(0).isVariant(ref.getBase()) && calls.get(0).getNegLog10PError() >= confidentRefThreshold ); + Pair, GenotypeMetaData> calls = ug.map(tracker,ref,context); + if (calls == null) + return false; + return ( ! calls.first.get(0).isVariant(ref.getBase()) && calls.first.get(0).getNegLog10PError() >= confidentRefThreshold ); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java index ef7b7a8eb..12c843a47 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java @@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.ListUtils; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.genotype.*; @@ -80,9 +81,9 @@ public class CoverageEvalWalker extends LocusWalker, String> { List sub_offsets = ListUtils.sliceListByIndices(subset_indices, offsets); AlignmentContext subContext = new AlignmentContext(context.getLocation(), sub_reads, sub_offsets); - List calls = UG.map(tracker, ref, subContext); - if (calls != null) { - GenotypeCall call = calls.get(0); + Pair, GenotypeMetaData> calls = UG.map(tracker, ref, subContext); + if (calls != null && calls.first != null) { + GenotypeCall call = calls.first.get(0); String callType = (call.isVariant(call.getReference())) ? ((call.isHom()) ? "HomozygousSNP" : "HeterozygousSNP") : "HomozygousReference"; GenotypeCalls.add(coverage+"\t"+coverage_available+"\t"+hc_genotype+"\t"+callType+"\t"+toGeliString(call)); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/DeNovoSNPWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/DeNovoSNPWalker.java index cf919c73b..038380040 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/DeNovoSNPWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/DeNovoSNPWalker.java @@ -10,6 +10,8 @@ import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCall; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper; import org.broadinstitute.sting.utils.genotype.Variation; +import org.broadinstitute.sting.utils.genotype.GenotypeMetaData; +import org.broadinstitute.sting.utils.Pair; import java.util.ArrayList; import java.util.List; @@ -70,14 +72,14 @@ public class DeNovoSNPWalker extends RefWalker{ } AlignmentContext parent1_subContext = new AlignmentContext(context.getLocation(), parent1_reads, parent1_offsets); - List parent1 = UG.map(tracker, ref, parent1_subContext); + Pair, GenotypeMetaData> parent1 = UG.map(tracker, ref, parent1_subContext); AlignmentContext parent2_subContext = new AlignmentContext(context.getLocation(), parent2_reads, parent2_offsets); - List parent2 = UG.map(tracker, ref, parent2_subContext); + Pair, GenotypeMetaData> parent2 = UG.map(tracker, ref, parent2_subContext); - if ( parent1 != null && parent2 != null ) { - GenotypeCall parent1call = parent1.get(0); - GenotypeCall parent2call = parent2.get(0); + if ( parent1 != null && parent1.first != null && parent2 != null && parent2.first != null ) { + GenotypeCall parent1call = parent1.first.get(0); + GenotypeCall parent2call = parent2.first.get(0); if (!parent1call.isVariant(parent1call.getReference()) && parent1call.getNegLog10PError() > 5 && diff --git a/java/src/org/broadinstitute/sting/playground/utils/ArtificialPoolContext.java b/java/src/org/broadinstitute/sting/playground/utils/ArtificialPoolContext.java index 2311ef1bf..057440d60 100755 --- a/java/src/org/broadinstitute/sting/playground/utils/ArtificialPoolContext.java +++ b/java/src/org/broadinstitute/sting/playground/utils/ArtificialPoolContext.java @@ -5,9 +5,9 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper; +import org.broadinstitute.sting.gatk.walkers.genotyper.*; import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.utils.genotype.Genotype; +import org.broadinstitute.sting.utils.genotype.*; import java.io.PrintWriter; import java.util.ArrayList; @@ -257,9 +257,9 @@ public class ArtificialPoolContext { public Genotype getGenotype(int group) { AlignmentContext alicon = this.getAlignmentContext(); Pair[],List[]> byGroupSplitPair = this.splitByGroup(alicon.getReads(),alicon.getOffsets()); - List result = ug.map(this.getRefMetaDataTracker(),this.getReferenceContext(), + Pair, GenotypeMetaData> result = ug.map(this.getRefMetaDataTracker(),this.getReferenceContext(), new AlignmentContext(this.getAlignmentContext().getLocation(), byGroupSplitPair.first[group],byGroupSplitPair.second[group])); - return (result == null ? null : result.get(0)); + return (result.first == null ? null : result.first.get(0)); } public static List[] sampleReads(List[] reads, double[] propEstGlobal) {