Because walkers call UG's map function, we need to move the actual writing out

to UG's reduce function.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1845 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-10-14 20:49:26 +00:00
parent 825e6c7a4d
commit e740e7a7ce
9 changed files with 105 additions and 89 deletions

View File

@ -16,13 +16,10 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
// We consider the EM stable when the MAF doesn't change more than 1/10N
protected static final double EM_STABILITY_METRIC = 0.1;
// keep track of some metrics about our calls
protected CallMetrics callsMetrics = new CallMetrics();
protected EMGenotypeCalculationModel() {}
public List<GenotypeCall> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
public Pair<List<GenotypeCall>, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
// keep track of the context for each sample, overall and separated by strand
int[] baseCounts = new int[4];
@ -51,23 +48,7 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
// generate the calls
GenotypeMetaData metadata = new GenotypeMetaData(lod, strandScore, overall.getMAF());
List<GenotypeCall> calls = genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts);
if ( calls != null && calls.size() != 0 ) {
// use multi-sample mode if we have multiple samples or the output type allows it
if ( out.supportsMultiSample() || samples.size() > 1 ) {
// annoying hack to get around Java generics
ArrayList<Genotype> callList = new ArrayList<Genotype>();
for ( GenotypeCall call : calls )
callList.add(call);
out.addMultiSampleCall(callList, metadata);
} else {
out.addGenotypeCall(calls.get(0));
}
}
return calls;
return new Pair<List<GenotypeCall>, GenotypeMetaData>(genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts), metadata);
}
protected List<GenotypeCall> genotypeCallsFromGenotypeLikelihoods(EMOutput results, char ref, HashMap<String, AlignmentContextBySample> contexts) {
@ -76,7 +57,7 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
// an optimization
double expectedChromosomes = 2.0 * (double)GLs.size() * results.getMAF();
if ( expectedChromosomes < 1.0 )
return null;
return new ArrayList<GenotypeCall>();
ArrayList<GenotypeCall> calls = new ArrayList<GenotypeCall>();
int variantCalls = 0;
@ -113,8 +94,6 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
// get the initial genotype likelihoods
HashMap<String, GenotypeLikelihoods> GLs = initializeGenotypeLikelihoods(ref, contexts, alleleFrequencies, priors, contextType);
callsMetrics.nCalledBases++;
// The EM loop:
// we want to continue until the calculation is stable, but we need some max on the number of iterations
int iterations = 0;
@ -233,23 +212,6 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
}
/**
* A class to keep track of some basic metrics about our calls
*/
protected class CallMetrics {
long nConfidentCalls = 0;
long nNonConfidentCalls = 0;
long nCalledBases = 0;
CallMetrics() {}
public String toString() {
return String.format("UG: %d confident and %d non-confident calls were made at %d bases",
nConfidentCalls, nNonConfidentCalls, nCalledBases);
}
}
/**
* A class to keep track of the EM output
*/

View File

@ -2,7 +2,8 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
import org.broadinstitute.sting.utils.genotype.GenotypeMetaData;
import org.broadinstitute.sting.utils.Pair;
import org.apache.log4j.Logger;
import java.util.*;
@ -20,7 +21,6 @@ public abstract class GenotypeCalculationModel implements Cloneable {
protected BaseMismatchModel baseModel;
protected Set<String> samples;
protected GenotypeWriter out;
protected Logger logger;
protected double heterozygosity;
protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform;
@ -40,16 +40,13 @@ public abstract class GenotypeCalculationModel implements Cloneable {
* Initialize the GenotypeCalculationModel object
* Assumes that out is not null
* @param samples samples in input bam
* @param out output writer
* @param logger logger
* @param UAC unified arg collection
*/
protected void initialize(Set<String> samples,
GenotypeWriter out,
Logger logger,
UnifiedArgumentCollection UAC) {
this.samples = samples;
this.out = out;
this.logger = logger;
baseModel = UAC.baseModel;
heterozygosity = UAC.heterozygosity;
@ -69,7 +66,6 @@ public abstract class GenotypeCalculationModel implements Cloneable {
protected Object clone() throws CloneNotSupportedException {
GenotypeCalculationModel gcm = (GenotypeCalculationModel)super.clone();
gcm.samples = new HashSet<String>(samples);
gcm.out = out;
gcm.logger = logger;
gcm.baseModel = baseModel;
gcm.heterozygosity = heterozygosity;
@ -89,10 +85,10 @@ public abstract class GenotypeCalculationModel implements Cloneable {
* @param context alignment context
* @param priors priors to use for GL
*
* @return list of calls
* @return calls
*/
public abstract List<GenotypeCall> calculateGenotype(RefMetaDataTracker tracker,
char ref,
AlignmentContext context,
DiploidGenotypePriors priors);
public abstract Pair<List<GenotypeCall>, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker,
char ref,
AlignmentContext context,
DiploidGenotypePriors priors);
}

View File

@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import static org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model.*;
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
import org.apache.log4j.Logger;
import java.util.Set;
@ -48,12 +47,10 @@ public class GenotypeCalculationModelFactory {
*
* @param UAC The unified argument collection
* @param samples samples in bam
* @param out output writer
* @param logger logger
* @return model
*/
public static GenotypeCalculationModel makeGenotypeCalculation(Set<String> samples,
GenotypeWriter out,
Logger logger,
UnifiedArgumentCollection UAC) {
GenotypeCalculationModel gcm;
@ -67,7 +64,7 @@ public class GenotypeCalculationModelFactory {
default: throw new RuntimeException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel);
}
gcm.initialize(samples, out, logger, UAC);
gcm.initialize(samples, logger, UAC);
return gcm;
}
}

View File

@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.genotype.GenotypeMetaData;
import java.util.*;
@ -12,7 +13,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
protected PointEstimateGenotypeCalculationModel() {}
// overload this method so we can special-case the single sample
public List<GenotypeCall> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
public Pair<List<GenotypeCall>, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
// we don't actually want to run EM for single samples
if ( samples.size() == 1 ) {
@ -30,22 +31,10 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
if ( sampleContext == null )
return null;
callsMetrics.nCalledBases++;
// create the genotype call object
Pair<ReadBackedPileup, GenotypeLikelihoods> discoveryGL = getSingleSampleLikelihoods(ref, sampleContext, priors, StratifiedContext.OVERALL);
GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, discoveryGL.second, discoveryGL.first);
if ( GENOTYPE_MODE || call.isVariant(call.getReference()) ) {
double confidence = (GENOTYPE_MODE ? call.getNegLog10PError() : call.toVariation().getNegLog10PError());
if ( confidence >= LOD_THRESHOLD ) {
callsMetrics.nConfidentCalls++;
out.addGenotypeCall(call);
} else {
callsMetrics.nNonConfidentCalls++;
}
}
return Arrays.asList(call);
return new Pair<List<GenotypeCall>, GenotypeMetaData>(Arrays.asList(call), null);
}
return super.calculateGenotype(tracker, ref, context, priors);

View File

@ -36,18 +36,22 @@ import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection;
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.GenotypeMetaData;
import java.io.File;
import java.util.HashSet;
import java.util.List;
import java.util.ArrayList;
@ReadFilters({ZeroMappingQualityReadFilter.class, MissingReadGroupFilter.class})
public class UnifiedGenotyper extends LocusWalker<List<GenotypeCall>, Integer> {
public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, GenotypeMetaData>, Integer> {
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
@ -65,6 +69,11 @@ public class UnifiedGenotyper extends LocusWalker<List<GenotypeCall>, Integer> {
// output writer
private GenotypeWriter writer;
// samples in input
private HashSet<String> samples;
// keep track of some metrics about our calls
private CallMetrics callsMetrics;
/**
* Filter out loci to ignore (at an ambiguous base in the reference or a locus with zero coverage).
@ -86,7 +95,7 @@ public class UnifiedGenotyper extends LocusWalker<List<GenotypeCall>, Integer> {
public void initialize() {
// get all of the unique sample names
HashSet<String> samples = new HashSet<String>();
samples = new HashSet<String>();
List<SAMReadGroupRecord> readGroups = getToolkit().getSAMFileHeader().getReadGroups();
for ( SAMReadGroupRecord readGroup : readGroups )
samples.add(readGroup.getSample());
@ -104,7 +113,8 @@ public class UnifiedGenotyper extends LocusWalker<List<GenotypeCall>, Integer> {
writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out, "UnifiedGenotyper",
this.getToolkit().getArguments().referenceFile.getName());
gcm = GenotypeCalculationModelFactory.makeGenotypeCalculation(samples, writer, logger, UAC);
gcm = GenotypeCalculationModelFactory.makeGenotypeCalculation(samples, logger, UAC);
callsMetrics = new CallMetrics();
}
/**
@ -114,7 +124,7 @@ public class UnifiedGenotyper extends LocusWalker<List<GenotypeCall>, Integer> {
* @param refContext the reference base
* @param context contextual information around the locus
*/
public List<GenotypeCall> map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext context) {
public Pair<List<GenotypeCall>, GenotypeMetaData> map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext context) {
char ref = Character.toUpperCase(refContext.getBase());
if ( !BaseUtils.isRegularBase(ref) )
return null;
@ -151,7 +161,47 @@ public class UnifiedGenotyper extends LocusWalker<List<GenotypeCall>, Integer> {
public Integer reduceInit() { return 0; }
public Integer reduce(List<GenotypeCall> value, Integer sum) {
public Integer reduce(Pair<List<GenotypeCall>, GenotypeMetaData> value, Integer sum) {
if ( value == null || value.first == null )
return sum;
callsMetrics.nCalledBases++;
if ( value.first.size() == 0 )
return sum;
// special-case for single-sample using PointEstimate model
if ( value.second == null ) {
GenotypeCall call = value.first.get(0);
if ( UAC.GENOTYPE || call.isVariant(call.getReference()) ) {
double confidence = (UAC.GENOTYPE ? call.getNegLog10PError() : call.toVariation().getNegLog10PError());
if ( confidence >= UAC.LOD_THRESHOLD ) {
callsMetrics.nConfidentCalls++;
writer.addGenotypeCall(call);
}
} else {
callsMetrics.nNonConfidentCalls++;
}
}
// use multi-sample mode if we have multiple samples or the output type allows it
else if ( writer.supportsMultiSample() || samples.size() > 1 ) {
// annoying hack to get around Java generics
ArrayList<Genotype> callList = new ArrayList<Genotype>();
for ( GenotypeCall call : value.first )
callList.add(call);
callsMetrics.nConfidentCalls++;
writer.addMultiSampleCall(callList, value.second);
}
// otherwise, use single sample mode
else {
callsMetrics.nConfidentCalls++;
writer.addGenotypeCall(value.first.get(0));
}
return sum + 1;
}
@ -160,4 +210,20 @@ public class UnifiedGenotyper extends LocusWalker<List<GenotypeCall>, Integer> {
logger.info("Processed " + sum + " loci that are callable for SNPs");
writer.close();
}
/**
* A class to keep track of some basic metrics about our calls
*/
protected class CallMetrics {
long nConfidentCalls = 0;
long nNonConfidentCalls = 0;
long nCalledBases = 0;
CallMetrics() {}
public String toString() {
return String.format("UG: %d confident and %d non-confident calls were made at %d bases",
nConfidentCalls, nNonConfidentCalls, nCalledBases);
}
}
}

View File

@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.GenotypeMetaData;
import java.util.List;
import java.util.LinkedList;
@ -80,8 +81,10 @@ public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker<Referen
public boolean baseIsConfidentRef( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
List<GenotypeCall> calls = ug.map(tracker,ref,context);
return ( ! calls.get(0).isVariant(ref.getBase()) && calls.get(0).getNegLog10PError() >= confidentRefThreshold );
Pair<List<GenotypeCall>, GenotypeMetaData> calls = ug.map(tracker,ref,context);
if (calls == null)
return false;
return ( ! calls.first.get(0).isVariant(ref.getBase()) && calls.first.get(0).getNegLog10PError() >= confidentRefThreshold );
}

View File

@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.ListUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.*;
@ -80,9 +81,9 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
List<Integer> sub_offsets = ListUtils.sliceListByIndices(subset_indices, offsets);
AlignmentContext subContext = new AlignmentContext(context.getLocation(), sub_reads, sub_offsets);
List<GenotypeCall> calls = UG.map(tracker, ref, subContext);
if (calls != null) {
GenotypeCall call = calls.get(0);
Pair<List<GenotypeCall>, GenotypeMetaData> calls = UG.map(tracker, ref, subContext);
if (calls != null && calls.first != null) {
GenotypeCall call = calls.first.get(0);
String callType = (call.isVariant(call.getReference())) ? ((call.isHom()) ? "HomozygousSNP" : "HeterozygousSNP") : "HomozygousReference";
GenotypeCalls.add(coverage+"\t"+coverage_available+"\t"+hc_genotype+"\t"+callType+"\t"+toGeliString(call));
}

View File

@ -10,6 +10,8 @@ import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCall;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.GenotypeMetaData;
import org.broadinstitute.sting.utils.Pair;
import java.util.ArrayList;
import java.util.List;
@ -70,14 +72,14 @@ public class DeNovoSNPWalker extends RefWalker<String, Integer>{
}
AlignmentContext parent1_subContext = new AlignmentContext(context.getLocation(), parent1_reads, parent1_offsets);
List<GenotypeCall> parent1 = UG.map(tracker, ref, parent1_subContext);
Pair<List<GenotypeCall>, GenotypeMetaData> parent1 = UG.map(tracker, ref, parent1_subContext);
AlignmentContext parent2_subContext = new AlignmentContext(context.getLocation(), parent2_reads, parent2_offsets);
List<GenotypeCall> parent2 = UG.map(tracker, ref, parent2_subContext);
Pair<List<GenotypeCall>, GenotypeMetaData> parent2 = UG.map(tracker, ref, parent2_subContext);
if ( parent1 != null && parent2 != null ) {
GenotypeCall parent1call = parent1.get(0);
GenotypeCall parent2call = parent2.get(0);
if ( parent1 != null && parent1.first != null && parent2 != null && parent2.first != null ) {
GenotypeCall parent1call = parent1.first.get(0);
GenotypeCall parent2call = parent2.first.get(0);
if (!parent1call.isVariant(parent1call.getReference()) &&
parent1call.getNegLog10PError() > 5 &&

View File

@ -5,9 +5,9 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper;
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.*;
import java.io.PrintWriter;
import java.util.ArrayList;
@ -257,9 +257,9 @@ public class ArtificialPoolContext {
public Genotype getGenotype(int group) {
AlignmentContext alicon = this.getAlignmentContext();
Pair<List<SAMRecord>[],List<Integer>[]> byGroupSplitPair = this.splitByGroup(alicon.getReads(),alicon.getOffsets());
List<? extends Genotype> result = ug.map(this.getRefMetaDataTracker(),this.getReferenceContext(),
Pair<List<GenotypeCall>, GenotypeMetaData> result = ug.map(this.getRefMetaDataTracker(),this.getReferenceContext(),
new AlignmentContext(this.getAlignmentContext().getLocation(), byGroupSplitPair.first[group],byGroupSplitPair.second[group]));
return (result == null ? null : result.get(0));
return (result.first == null ? null : result.first.get(0));
}
public static List<SAMRecord>[] sampleReads(List<SAMRecord>[] reads, double[] propEstGlobal) {