The UG calculations are now driven by an independent engine.
This completely separates the genotyper walker from other walkers. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2758 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d8e75cf631
commit
506d39f751
|
|
@ -1,51 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
|
||||
* Class for maintaining the arguments that need to be passed to UnifiedGenotyper.runGenotyper()
|
||||
* (so that they only need to be computed one time)
|
||||
*/
|
||||
public class UGCalculationArguments {
|
||||
// should we annotate dbsnp?
|
||||
protected boolean annotateDbsnp = false;
|
||||
// should we annotate hapmap2?
|
||||
protected boolean annotateHapmap2 = false;
|
||||
// should we annotate hapmap3?
|
||||
protected boolean annotateHapmap3 = false;
|
||||
|
||||
// the unified argument collection
|
||||
protected UnifiedArgumentCollection UAC = null;
|
||||
|
||||
// the model used for calculating genotypes
|
||||
protected ThreadLocal<GenotypeCalculationModel> gcm = new ThreadLocal<GenotypeCalculationModel>();
|
||||
|
||||
// samples in input
|
||||
protected Set<String> samples = new HashSet<String>();
|
||||
}
|
||||
|
|
@ -26,19 +26,13 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.contexts.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotator;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.pileup.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.*;
|
||||
import org.broadinstitute.sting.utils.genotype.*;
|
||||
import org.broadinstitute.sting.utils.genotype.geli.GeliGenotypeWriter;
|
||||
import org.broadinstitute.sting.utils.genotype.glf.GLFGenotypeWriter;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -65,7 +59,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
public PrintStream beagleWriter = null;
|
||||
|
||||
// the calculation arguments
|
||||
private UGCalculationArguments UG_args = null;
|
||||
private UnifiedGenotyperEngine UG_engine = null;
|
||||
|
||||
// Enable deletions in the pileup
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
|
@ -88,172 +82,13 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
double percentCalledOfCallable() { return (100.0 * nBasesCalledConfidently) / nBasesCallable; }
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Creates the argument calculation object for the UnifiedGenotyper.
|
||||
*
|
||||
* @param toolkit the GATK Engine
|
||||
* @param UAC the UnifiedArgumentCollection
|
||||
* @return UG calculation arguments object
|
||||
**/
|
||||
public static UGCalculationArguments getUnifiedCalculationArguments(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
|
||||
return getUnifiedCalculationArguments(toolkit, UAC, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates the argument calculation object for the UnifiedGenotyper.
|
||||
*
|
||||
* @param toolkit the GATK Engine
|
||||
* @param UAC the UnifiedArgumentCollection
|
||||
* @param writer the genotype writer
|
||||
* @return UG calculation arguments object
|
||||
*
|
||||
**/
|
||||
private static UGCalculationArguments getUnifiedCalculationArguments(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, GenotypeWriter writer) {
|
||||
UGCalculationArguments UG_args = new UGCalculationArguments();
|
||||
UG_args.UAC = UAC;
|
||||
|
||||
// deal with input errors
|
||||
if ( UAC.POOLSIZE > 0 && UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED ) {
|
||||
throw new IllegalArgumentException("Attempting to use a model other than POOLED with pooled data. Please set the model to POOLED.");
|
||||
}
|
||||
if ( UAC.POOLSIZE < 1 && UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED ) {
|
||||
throw new IllegalArgumentException("Attempting to use the POOLED model with a pool size less than 1. Please set the pool size to an appropriate value.");
|
||||
}
|
||||
if ( toolkit.getArguments().numberOfThreads > 1 && UAC.ASSUME_SINGLE_SAMPLE != null ) {
|
||||
// the ASSUME_SINGLE_SAMPLE argument can't be handled (at least for now) while we are multi-threaded because the IO system doesn't know how to get the sample name
|
||||
throw new IllegalArgumentException("For technical reasons, the ASSUME_SINGLE_SAMPLE argument cannot be used with multiple threads");
|
||||
}
|
||||
|
||||
// get all of the unique sample names - unless we're in POOLED mode, in which case we ignore the sample names
|
||||
if ( UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED ) {
|
||||
// if we're supposed to assume a single sample, do so
|
||||
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
|
||||
UG_args.samples.add(UAC.ASSUME_SINGLE_SAMPLE);
|
||||
else
|
||||
UG_args.samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader());
|
||||
}
|
||||
|
||||
// in pooled mode we need to check that the format is acceptable
|
||||
if ( UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED && writer != null ) {
|
||||
// only multi-sample calls use Variations
|
||||
if ( !writer.supportsMultiSample() )
|
||||
throw new IllegalArgumentException("The POOLED model is not compatible with the specified format; try using VCF instead");
|
||||
|
||||
// when using VCF with multiple threads, we need to turn down the validation stringency so that writing temporary files will work
|
||||
if ( toolkit.getArguments().numberOfThreads > 1 && writer instanceof VCFGenotypeWriter )
|
||||
((VCFGenotypeWriter)writer).setValidationStringency(VCFGenotypeWriterAdapter.VALIDATION_STRINGENCY.SILENT);
|
||||
}
|
||||
|
||||
// check to see whether a dbsnp rod was included
|
||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
ReferenceOrderedData rod = source.getReferenceOrderedData();
|
||||
if ( rod.getType().equals(rodDbSNP.class) ) {
|
||||
UG_args.annotateDbsnp = true;
|
||||
}
|
||||
if ( rod.getName().equals("hapmap2") ) {
|
||||
UG_args.annotateHapmap2 = true;
|
||||
}
|
||||
if ( rod.getName().equals("hapmap3") ) {
|
||||
UG_args.annotateHapmap3 = true;
|
||||
}
|
||||
}
|
||||
|
||||
return UG_args;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute at a given locus.
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @param UG_args the calculation argument collection
|
||||
* @return the VariantCallContext object
|
||||
*/
|
||||
public static VariantCallContext runGenotyper(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, UGCalculationArguments UG_args) {
|
||||
|
||||
char ref = Character.toUpperCase(refContext.getBase());
|
||||
if ( !BaseUtils.isRegularBase(ref) )
|
||||
return null;
|
||||
|
||||
// don't try to call if we couldn't read in all reads at this locus (since it wasn't properly downsampled)
|
||||
if ( rawContext.hasExceededMaxPileup() )
|
||||
return null;
|
||||
|
||||
ReadBackedPileup rawPileup = rawContext.getBasePileup();
|
||||
|
||||
// filter the context based on min base and mapping qualities
|
||||
ReadBackedPileup pileup = rawPileup.getBaseAndMappingFilteredPileup(UG_args.UAC.MIN_BASE_QUALTY_SCORE, UG_args.UAC.MIN_MAPPING_QUALTY_SCORE);
|
||||
|
||||
// filter the context based on mapping quality and mismatch rate
|
||||
pileup = filterPileup(pileup, refContext, UG_args.UAC);
|
||||
|
||||
// don't call when there is no coverage
|
||||
if ( pileup.size() == 0 )
|
||||
return null;
|
||||
|
||||
// are there too many deletions in the pileup?
|
||||
if ( isValidDeletionFraction(UG_args.UAC.MAX_DELETION_FRACTION) &&
|
||||
(double)pileup.getNumberOfDeletions() / (double)pileup.size() > UG_args.UAC.MAX_DELETION_FRACTION )
|
||||
return null;
|
||||
|
||||
// stratify the AlignmentContext and cut by sample
|
||||
// Note that for testing purposes, we may want to throw multi-samples at pooled mode
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UG_args.UAC.ASSUME_SINGLE_SAMPLE, (UG_args.UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED ? PooledCalculationModel.POOL_SAMPLE_NAME : null));
|
||||
if ( stratifiedContexts == null )
|
||||
return null;
|
||||
|
||||
DiploidGenotypePriors priors = new DiploidGenotypePriors(ref, UG_args.UAC.heterozygosity, DiploidGenotypePriors.PROB_OF_REFERENCE_ERROR);
|
||||
VariantCallContext call = UG_args.gcm.get().callLocus(tracker, ref, rawContext.getLocation(), stratifiedContexts, priors);
|
||||
|
||||
// annotate the call, if possible
|
||||
if ( call != null && call.variation != null && call.variation instanceof ArbitraryFieldsBacked ) {
|
||||
// first off, we want to use the *unfiltered* context for the annotations
|
||||
stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(rawContext.getBasePileup());
|
||||
|
||||
Map<String, String> annotations;
|
||||
if ( UG_args.UAC.ALL_ANNOTATIONS )
|
||||
annotations = VariantAnnotator.getAllAnnotations(tracker, refContext, stratifiedContexts, call.variation, UG_args.annotateDbsnp, UG_args.annotateHapmap2, UG_args.annotateHapmap3);
|
||||
else
|
||||
annotations = VariantAnnotator.getAnnotations(tracker, refContext, stratifiedContexts, call.variation, UG_args.annotateDbsnp, UG_args.annotateHapmap2, UG_args.annotateHapmap3);
|
||||
((ArbitraryFieldsBacked)call.variation).setFields(annotations);
|
||||
}
|
||||
|
||||
return call;
|
||||
}
|
||||
|
||||
// filter based on maximum mismatches and bad mates
|
||||
private static ReadBackedPileup filterPileup(ReadBackedPileup pileup, ReferenceContext refContext, UnifiedArgumentCollection UAC) {
|
||||
|
||||
ArrayList<PileupElement> filteredPileup = new ArrayList<PileupElement>();
|
||||
for ( PileupElement p : pileup ) {
|
||||
if ( (UAC.USE_BADLY_MATED_READS || !p.getRead().getReadPairedFlag() || p.getRead().getMateUnmappedFlag() || p.getRead().getMateReferenceIndex() == p.getRead().getReferenceIndex()) &&
|
||||
AlignmentUtils.mismatchesInRefWindow(p, refContext, true) <= UAC.MAX_MISMATCHES )
|
||||
filteredPileup.add(p);
|
||||
}
|
||||
return new ReadBackedPileup(pileup.getLocation(), filteredPileup);
|
||||
}
|
||||
|
||||
private static boolean isValidDeletionFraction(double d) {
|
||||
return ( d >= 0.0 && d <= 1.0 );
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// The following methods are walker-specific; don't use them unless you are a traversal.
|
||||
// If you are a walker, stick to the static methods defined above.
|
||||
//
|
||||
// ------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Initialize the samples, output, and genotype calculation model
|
||||
*
|
||||
**/
|
||||
public void initialize() {
|
||||
|
||||
UG_args = getUnifiedCalculationArguments(getToolkit(), UAC, writer);
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, writer, verboseWriter, beagleWriter);
|
||||
|
||||
// initialize the writers
|
||||
if ( verboseWriter != null ) {
|
||||
|
|
@ -267,16 +102,16 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
}
|
||||
if ( beagleWriter != null ) {
|
||||
beagleWriter.print("marker alleleA alleleB");
|
||||
for ( String sample : UG_args.samples )
|
||||
for ( String sample : UG_engine.samples )
|
||||
beagleWriter.print(String.format(" %s %s %s", sample, sample, sample));
|
||||
beagleWriter.println();
|
||||
}
|
||||
|
||||
// initialize the header
|
||||
GenotypeWriterFactory.writeHeader(writer, GenomeAnalysisEngine.instance.getSAMFileHeader(), UG_args.samples, getHeaderInfo(UG_args));
|
||||
GenotypeWriterFactory.writeHeader(writer, GenomeAnalysisEngine.instance.getSAMFileHeader(), UG_engine.samples, getHeaderInfo());
|
||||
}
|
||||
|
||||
private Set<VCFHeaderLine> getHeaderInfo(UGCalculationArguments UG_args) {
|
||||
private Set<VCFHeaderLine> getHeaderInfo() {
|
||||
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
|
||||
|
||||
// this is only applicable to VCF
|
||||
|
|
@ -295,11 +130,11 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
|
||||
// annotation (INFO) fields from UnifiedGenotyper
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFRecord.ALLELE_FREQUENCY_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Allele Frequency"));
|
||||
if ( UG_args.annotateDbsnp )
|
||||
if ( UG_engine.annotateDbsnp )
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFRecord.DBSNP_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "dbSNP Membership"));
|
||||
if ( UG_args.annotateHapmap2 )
|
||||
if ( UG_engine.annotateHapmap2 )
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFRecord.HAPMAP2_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "HapMap2 Membership"));
|
||||
if ( UG_args.annotateHapmap3 )
|
||||
if ( UG_engine.annotateHapmap3 )
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFRecord.HAPMAP3_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "HapMap3 Membership"));
|
||||
if ( !UAC.NO_SLOD )
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFRecord.STRAND_BIAS_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Strand Bias"));
|
||||
|
|
@ -331,24 +166,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
* @return the VariantCallContext object
|
||||
*/
|
||||
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
|
||||
// initialize the GenotypeCalculationModel for this thread if that hasn't been done yet
|
||||
if ( UG_args.gcm.get() == null ) {
|
||||
GenotypeWriterFactory.GENOTYPE_FORMAT format = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
|
||||
if ( writer != null ) {
|
||||
if ( writer instanceof VCFGenotypeWriter )
|
||||
format = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
|
||||
else if ( writer instanceof GLFGenotypeWriter )
|
||||
format = GenotypeWriterFactory.GENOTYPE_FORMAT.GLF;
|
||||
else if ( writer instanceof GeliGenotypeWriter )
|
||||
format = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI;
|
||||
else
|
||||
throw new StingException("Unsupported genotype format: " + writer.getClass().getName());
|
||||
}
|
||||
UG_args.gcm.set(GenotypeCalculationModelFactory.makeGenotypeCalculation(UG_args.samples, logger, UAC, format, verboseWriter, beagleWriter));
|
||||
}
|
||||
|
||||
return runGenotyper(tracker, refContext, rawContext, UG_args);
|
||||
return UG_engine.runGenotyper(tracker, refContext, rawContext);
|
||||
}
|
||||
|
||||
public UGStatistics reduceInit() { return new UGStatistics(); }
|
||||
|
|
@ -381,7 +199,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
}
|
||||
|
||||
// if we have a single-sample call (single sample from PointEstimate model returns no VariationCall data)
|
||||
if ( value.variation == null || (!writer.supportsMultiSample() && UG_args.samples.size() <= 1) ) {
|
||||
if ( value.variation == null || (!writer.supportsMultiSample() && UG_engine.samples.size() <= 1) ) {
|
||||
writer.addGenotypeCall(value.genotypes.get(0));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,227 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.contexts.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotator;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.pileup.*;
|
||||
import org.broadinstitute.sting.utils.genotype.*;
|
||||
import org.broadinstitute.sting.utils.genotype.geli.GeliGenotypeWriter;
|
||||
import org.broadinstitute.sting.utils.genotype.glf.GLFGenotypeWriter;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.PrintStream;
|
||||
|
||||
|
||||
public class UnifiedGenotyperEngine {
|
||||
|
||||
// should we annotate dbsnp?
|
||||
protected boolean annotateDbsnp = false;
|
||||
// should we annotate hapmap2?
|
||||
protected boolean annotateHapmap2 = false;
|
||||
// should we annotate hapmap3?
|
||||
protected boolean annotateHapmap3 = false;
|
||||
|
||||
// the unified argument collection
|
||||
protected UnifiedArgumentCollection UAC = null;
|
||||
|
||||
// the model used for calculating genotypes
|
||||
protected ThreadLocal<GenotypeCalculationModel> gcm = new ThreadLocal<GenotypeCalculationModel>();
|
||||
|
||||
// the various loggers and writers
|
||||
protected Logger logger = null;
|
||||
protected GenotypeWriter genotypeWriter = null;
|
||||
protected PrintStream verboseWriter = null;
|
||||
protected PrintStream beagleWriter = null;
|
||||
|
||||
// samples in input
|
||||
protected Set<String> samples = new HashSet<String>();
|
||||
|
||||
|
||||
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
|
||||
initialize(toolkit, UAC, null, null, null, null);
|
||||
}
|
||||
|
||||
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, GenotypeWriter genotypeWriter, PrintStream verboseWriter, PrintStream beagleWriter) {
|
||||
initialize(toolkit, UAC, logger, genotypeWriter, verboseWriter, beagleWriter);
|
||||
|
||||
}
|
||||
|
||||
private void initialize(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, GenotypeWriter genotypeWriter, PrintStream verboseWriter, PrintStream beagleWriter) {
|
||||
this.UAC = UAC;
|
||||
this.logger = logger;
|
||||
this.genotypeWriter = genotypeWriter;
|
||||
this.verboseWriter = verboseWriter;
|
||||
this.beagleWriter = beagleWriter;
|
||||
|
||||
// deal with input errors
|
||||
if ( UAC.POOLSIZE > 0 && UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED ) {
|
||||
throw new IllegalArgumentException("Attempting to use a model other than POOLED with pooled data. Please set the model to POOLED.");
|
||||
}
|
||||
if ( UAC.POOLSIZE < 1 && UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED ) {
|
||||
throw new IllegalArgumentException("Attempting to use the POOLED model with a pool size less than 1. Please set the pool size to an appropriate value.");
|
||||
}
|
||||
if ( toolkit.getArguments().numberOfThreads > 1 && UAC.ASSUME_SINGLE_SAMPLE != null ) {
|
||||
// the ASSUME_SINGLE_SAMPLE argument can't be handled (at least for now) while we are multi-threaded because the IO system doesn't know how to get the sample name
|
||||
throw new IllegalArgumentException("For technical reasons, the ASSUME_SINGLE_SAMPLE argument cannot be used with multiple threads");
|
||||
}
|
||||
|
||||
// get all of the unique sample names - unless we're in POOLED mode, in which case we ignore the sample names
|
||||
if ( UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED ) {
|
||||
// if we're supposed to assume a single sample, do so
|
||||
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
|
||||
this.samples.add(UAC.ASSUME_SINGLE_SAMPLE);
|
||||
else
|
||||
this.samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader());
|
||||
}
|
||||
|
||||
// in pooled mode we need to check that the format is acceptable
|
||||
if ( UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED && genotypeWriter != null ) {
|
||||
// only multi-sample calls use Variations
|
||||
if ( !genotypeWriter.supportsMultiSample() )
|
||||
throw new IllegalArgumentException("The POOLED model is not compatible with the specified format; try using VCF instead");
|
||||
|
||||
// when using VCF with multiple threads, we need to turn down the validation stringency so that writing temporary files will work
|
||||
if ( toolkit.getArguments().numberOfThreads > 1 && genotypeWriter instanceof VCFGenotypeWriter )
|
||||
((VCFGenotypeWriter)genotypeWriter).setValidationStringency(VCFGenotypeWriterAdapter.VALIDATION_STRINGENCY.SILENT);
|
||||
}
|
||||
|
||||
// check to see whether a dbsnp rod was included
|
||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
ReferenceOrderedData rod = source.getReferenceOrderedData();
|
||||
if ( rod.getType().equals(rodDbSNP.class) ) {
|
||||
this.annotateDbsnp = true;
|
||||
}
|
||||
if ( rod.getName().equals("hapmap2") ) {
|
||||
this.annotateHapmap2 = true;
|
||||
}
|
||||
if ( rod.getName().equals("hapmap3") ) {
|
||||
this.annotateHapmap3 = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute at a given locus.
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @return the VariantCallContext object
|
||||
*/
|
||||
public VariantCallContext runGenotyper(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
|
||||
// initialize the GenotypeCalculationModel for this thread if that hasn't been done yet
|
||||
if ( gcm.get() == null ) {
|
||||
GenotypeWriterFactory.GENOTYPE_FORMAT format = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
|
||||
if ( genotypeWriter != null ) {
|
||||
if ( genotypeWriter instanceof VCFGenotypeWriter )
|
||||
format = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
|
||||
else if ( genotypeWriter instanceof GLFGenotypeWriter)
|
||||
format = GenotypeWriterFactory.GENOTYPE_FORMAT.GLF;
|
||||
else if ( genotypeWriter instanceof GeliGenotypeWriter)
|
||||
format = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI;
|
||||
else
|
||||
throw new StingException("Unsupported genotype format: " + genotypeWriter.getClass().getName());
|
||||
}
|
||||
gcm.set(GenotypeCalculationModelFactory.makeGenotypeCalculation(samples, logger, UAC, format, verboseWriter, beagleWriter));
|
||||
}
|
||||
|
||||
char ref = Character.toUpperCase(refContext.getBase());
|
||||
if ( !BaseUtils.isRegularBase(ref) )
|
||||
return null;
|
||||
|
||||
// don't try to call if we couldn't read in all reads at this locus (since it wasn't properly downsampled)
|
||||
if ( rawContext.hasExceededMaxPileup() )
|
||||
return null;
|
||||
|
||||
ReadBackedPileup rawPileup = rawContext.getBasePileup();
|
||||
|
||||
// filter the context based on min base and mapping qualities
|
||||
ReadBackedPileup pileup = rawPileup.getBaseAndMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE, UAC.MIN_MAPPING_QUALTY_SCORE);
|
||||
|
||||
// filter the context based on mapping quality and mismatch rate
|
||||
pileup = filterPileup(pileup, refContext);
|
||||
|
||||
// don't call when there is no coverage
|
||||
if ( pileup.size() == 0 )
|
||||
return null;
|
||||
|
||||
// are there too many deletions in the pileup?
|
||||
if ( isValidDeletionFraction(UAC.MAX_DELETION_FRACTION) &&
|
||||
(double)pileup.getNumberOfDeletions() / (double)pileup.size() > UAC.MAX_DELETION_FRACTION )
|
||||
return null;
|
||||
|
||||
// stratify the AlignmentContext and cut by sample
|
||||
// Note that for testing purposes, we may want to throw multi-samples at pooled mode
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UAC.ASSUME_SINGLE_SAMPLE, (UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED ? PooledCalculationModel.POOL_SAMPLE_NAME : null));
|
||||
if ( stratifiedContexts == null )
|
||||
return null;
|
||||
|
||||
DiploidGenotypePriors priors = new DiploidGenotypePriors(ref, UAC.heterozygosity, DiploidGenotypePriors.PROB_OF_REFERENCE_ERROR);
|
||||
VariantCallContext call = gcm.get().callLocus(tracker, ref, rawContext.getLocation(), stratifiedContexts, priors);
|
||||
|
||||
// annotate the call, if possible
|
||||
if ( call != null && call.variation != null && call.variation instanceof ArbitraryFieldsBacked ) {
|
||||
// first off, we want to use the *unfiltered* context for the annotations
|
||||
stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(rawContext.getBasePileup());
|
||||
|
||||
Map<String, String> annotations;
|
||||
if ( UAC.ALL_ANNOTATIONS )
|
||||
annotations = VariantAnnotator.getAllAnnotations(tracker, refContext, stratifiedContexts, call.variation, annotateDbsnp, annotateHapmap2, annotateHapmap3);
|
||||
else
|
||||
annotations = VariantAnnotator.getAnnotations(tracker, refContext, stratifiedContexts, call.variation, annotateDbsnp, annotateHapmap2, annotateHapmap3);
|
||||
((ArbitraryFieldsBacked)call.variation).setFields(annotations);
|
||||
}
|
||||
|
||||
return call;
|
||||
}
|
||||
|
||||
// filter based on maximum mismatches and bad mates
|
||||
private ReadBackedPileup filterPileup(ReadBackedPileup pileup, ReferenceContext refContext) {
|
||||
|
||||
ArrayList<PileupElement> filteredPileup = new ArrayList<PileupElement>();
|
||||
for ( PileupElement p : pileup ) {
|
||||
if ( (UAC.USE_BADLY_MATED_READS || !p.getRead().getReadPairedFlag() || p.getRead().getMateUnmappedFlag() || p.getRead().getMateReferenceIndex() == p.getRead().getReferenceIndex()) &&
|
||||
AlignmentUtils.mismatchesInRefWindow(p, refContext, true) <= UAC.MAX_MISMATCHES )
|
||||
filteredPileup.add(p);
|
||||
}
|
||||
return new ReadBackedPileup(pileup.getLocation(), filteredPileup);
|
||||
}
|
||||
|
||||
private static boolean isValidDeletionFraction(double d) {
|
||||
return ( d >= 0.0 && d <= 1.0 );
|
||||
}
|
||||
}
|
||||
|
|
@ -49,7 +49,7 @@ public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker<Set<Bas
|
|||
@Argument(fullName="forcePreviousReadBasesToMatchRef", doc="Forces previous read bases to match the reference", required = false)
|
||||
boolean readBasesMustMatchRef = false;
|
||||
|
||||
private UGCalculationArguments ug;
|
||||
private UnifiedGenotyperEngine ug;
|
||||
// private ReferenceContextWindow refWindow;
|
||||
// private Set<BaseTransitionTable> conditionalTables;
|
||||
private List<Boolean> usePreviousBases;
|
||||
|
|
@ -62,7 +62,7 @@ public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker<Set<Bas
|
|||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||
uac.baseModel = BaseMismatchModel.THREE_STATE;
|
||||
uac.ALL_BASES = true;
|
||||
ug = UnifiedGenotyper.getUnifiedCalculationArguments(getToolkit(), uac);
|
||||
ug = new UnifiedGenotyperEngine(getToolkit(), uac);
|
||||
// refWindow = new ReferenceContextWindow(nPreviousBases);
|
||||
usePreviousBases = new ArrayList<Boolean>();
|
||||
previousBaseLoci = new ArrayList<GenomeLoc>();
|
||||
|
|
@ -359,7 +359,7 @@ public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker<Set<Bas
|
|||
public boolean baseIsConfidentRef( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
if ( !BaseUtils.isRegularBase(ref.getBase()) )
|
||||
return false;
|
||||
VariantCallContext calls = UnifiedGenotyper.runGenotyper(tracker,ref,context,ug);
|
||||
VariantCallContext calls = ug.runGenotyper(tracker,ref,context);
|
||||
if ( calls == null || calls.genotypes == null)
|
||||
return false;
|
||||
return ( calls.genotypes.size() > 0 && !calls.genotypes.get(0).isVariant(ref.getBase()) && calls.genotypes.get(0).getNegLog10PError() > confidentRefThreshold );
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
|||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.genotype.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Walker to calculate the number of mismatches, their base counts, and their quality sums at confidence ref sites"
|
||||
|
|
@ -39,13 +38,13 @@ public class LocusMismatchWalker extends LocusWalker<String,Integer> implements
|
|||
@Argument(fullName="skip", doc = "Only display every skip eligable sites. Defaults to all sites", required = false)
|
||||
int skip = 1;
|
||||
|
||||
private UGCalculationArguments ug;
|
||||
private UnifiedGenotyperEngine ug;
|
||||
|
||||
public void initialize() {
|
||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||
uac.baseModel = BaseMismatchModel.THREE_STATE;
|
||||
uac.ALL_BASES = true;
|
||||
ug = UnifiedGenotyper.getUnifiedCalculationArguments(getToolkit(), uac);
|
||||
ug = new UnifiedGenotyperEngine(getToolkit(), uac);
|
||||
|
||||
// print the header
|
||||
out.printf("loc ref genotype genotypeQ depth nMM qSumMM A C G T%n");
|
||||
|
|
@ -54,7 +53,7 @@ public class LocusMismatchWalker extends LocusWalker<String,Integer> implements
|
|||
public String map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
String result = null;
|
||||
|
||||
ReadBackedPileup pileup = context.getPileup();
|
||||
ReadBackedPileup pileup = context.getBasePileup();
|
||||
if ( locusIsUsable(tracker, ref, pileup, context) ) {
|
||||
Genotype g = getGenotype(tracker, ref, context);
|
||||
if ( g != null && g.isPointGenotype() )
|
||||
|
|
@ -163,7 +162,7 @@ public class LocusMismatchWalker extends LocusWalker<String,Integer> implements
|
|||
}
|
||||
|
||||
private Genotype getGenotype( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
VariantCallContext calls = UnifiedGenotyper.runGenotyper(tracker,ref,context, ug);
|
||||
VariantCallContext calls = ug.runGenotyper(tracker,ref,context);
|
||||
if ( calls == null || calls.variation == null || calls.genotypes == null )
|
||||
return null;
|
||||
else {
|
||||
|
|
|
|||
|
|
@ -5,10 +5,6 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.*;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.VariationCall;
|
||||
import org.broadinstitute.sting.utils.genotype.GenotypeCall;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
|
@ -16,8 +12,6 @@ import org.broadinstitute.sting.playground.utils.NamedTable;
|
|||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import cern.jet.stat.Probability;
|
||||
|
||||
/**
|
||||
|
|
@ -34,13 +28,13 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
|
|||
@Argument(fullName="limit", shortName="lim", doc="The pValue limit for which a read group will be deemed to be a contaminant", required=false)
|
||||
private Double LIMIT = 1e-9;
|
||||
|
||||
private UGCalculationArguments ug;
|
||||
private UnifiedGenotyperEngine ug;
|
||||
private NamedTable altTable;
|
||||
|
||||
public void initialize() {
|
||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||
uac.CONFIDENCE_THRESHOLD = 50;
|
||||
ug = UnifiedGenotyper.getUnifiedCalculationArguments(getToolkit(), uac);
|
||||
ug = new UnifiedGenotyperEngine(getToolkit(), uac);
|
||||
|
||||
altTable = new NamedTable();
|
||||
}
|
||||
|
|
@ -58,7 +52,7 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
|
|||
int altCount = 0;
|
||||
int totalCount = 0;
|
||||
|
||||
ReadBackedPileup pileup = context.getPileup();
|
||||
ReadBackedPileup pileup = context.getBasePileup();
|
||||
int refIndex = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
|
||||
|
||||
for (byte base : pileup.getBases() ) {
|
||||
|
|
@ -73,7 +67,7 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
|
|||
double altBalance = ((double) altCount)/((double) totalCount);
|
||||
|
||||
if (altBalance > 0.70) {
|
||||
VariantCallContext ugResult = UnifiedGenotyper.runGenotyper(tracker, ref, context, ug);
|
||||
VariantCallContext ugResult = ug.runGenotyper(tracker, ref, context);
|
||||
|
||||
if (ugResult != null && ugResult.genotypes != null && ugResult.genotypes.size() > 0) {
|
||||
return ugResult.genotypes.get(0).isHet();
|
||||
|
|
|
|||
|
|
@ -26,14 +26,14 @@ import java.util.HashMap;
|
|||
public class SecondaryBaseTransitionTableWalker extends LocusWalker<Integer, Integer> {
|
||||
|
||||
HashMap<String,Long> counts = new HashMap<String,Long>();
|
||||
private UGCalculationArguments ug;
|
||||
private UnifiedGenotyperEngine ug;
|
||||
private NamedTable altTable;
|
||||
|
||||
public void initialize() {
|
||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||
uac.CONFIDENCE_THRESHOLD = 50;
|
||||
uac.ALL_BASES = true;
|
||||
ug = UnifiedGenotyper.getUnifiedCalculationArguments(getToolkit(), uac);
|
||||
ug = new UnifiedGenotyperEngine(getToolkit(), uac);
|
||||
|
||||
altTable = new NamedTable();
|
||||
}
|
||||
|
|
@ -46,7 +46,7 @@ public class SecondaryBaseTransitionTableWalker extends LocusWalker<Integer, Int
|
|||
char nextBase = Character.toUpperCase(contextBases[contextBases.length - 1]);
|
||||
|
||||
if (contextBases.length == 3 && refBase != 'N' && pileup.getBases() != null && pileup.getSecondaryBases() != null) {
|
||||
VariantCallContext ugResult = UnifiedGenotyper.runGenotyper(tracker,ref,context,ug);
|
||||
VariantCallContext ugResult = ug.runGenotyper(tracker,ref,context);
|
||||
if (ugResult != null && ugResult.variation != null) {
|
||||
Genotype res = ugResult.genotypes.get(0);
|
||||
String call = res.getBases();
|
||||
|
|
|
|||
Loading…
Reference in New Issue