diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java index 7c69ab014..3f7794b24 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java @@ -145,7 +145,7 @@ public class StandardCallerArgumentCollection implements Cloneable { */ @Hidden @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) - public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel(); + public AFCalcFactory.Calculation requestedAlleleFrequencyCalculationModel; @Hidden @Argument(shortName = "logExactCalls", doc="x", required=false) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/genotyping/InfiniteRandomMatingPopulationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/genotyping/InfiniteRandomMatingPopulationModel.java index 6b0594ee0..f476d940c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/genotyping/InfiniteRandomMatingPopulationModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/genotyping/InfiniteRandomMatingPopulationModel.java @@ -68,6 +68,28 @@ import java.util.List; */ public class InfiniteRandomMatingPopulationModel implements GenotypingModel { + private final int cachePloidyCapacity; + private final int cacheAlleleCountCapacity; + private ThreadLocal likelihoodCalculators; + + /** + * Create a new infinite model instance. 
+ */ + public InfiniteRandomMatingPopulationModel() { + this(10,50); + } + + public InfiniteRandomMatingPopulationModel(final int calculatorCachePloidyCapacity, final int calculatorCacheAlleleCapacity) { + cachePloidyCapacity = calculatorCachePloidyCapacity; + cacheAlleleCountCapacity = calculatorCacheAlleleCapacity; + likelihoodCalculators = new ThreadLocal( ) { + @Override + public GenotypeLikelihoodCalculator[][] initialValue() { + return new GenotypeLikelihoodCalculator[calculatorCachePloidyCapacity][calculatorCacheAlleleCapacity]; + } + }; + } + @Override public GenotypingLikelihoods calculateLikelihoods(final AlleleList genotypingAlleles, final GenotypingData data) { if (genotypingAlleles == null) @@ -99,18 +121,27 @@ public class InfiniteRandomMatingPopulationModel implements GenotypingModel { } private GenotypingLikelihoods singleSampleLikelihoods(final AlleleList genotypingAlleles, - final GenotypingData data, - final AlleleLikelihoodMatrixMapper alleleLikelihoodMatrixMapper) { + final GenotypingData data, + final AlleleLikelihoodMatrixMapper alleleLikelihoodMatrixMapper) { final PloidyModel ploidyModel = data.ploidyModel(); final int samplePloidy = ploidyModel.samplePloidy(0); final int alleleCount = genotypingAlleles.alleleCount(); - final GenotypeLikelihoodCalculator likelihoodsCalculator = - GenotypeLikelihoodCalculator.getInstance(samplePloidy,alleleCount); + final GenotypeLikelihoodCalculator likelihoodsCalculator = getLikelihoodsCalculator(samplePloidy,alleleCount); final ReadLikelihoods.Matrix sampleLikelihoods = alleleLikelihoodMatrixMapper.map(data.readLikelihoods().sampleMatrix(0)); final List genotypeLikelihoods = Collections.singletonList(likelihoodsCalculator.genotypeLikelihoods(sampleLikelihoods)); return new GenotypingLikelihoods<>(genotypingAlleles,ploidyModel,genotypeLikelihoods); } + private GenotypeLikelihoodCalculator getLikelihoodsCalculator(final int samplePloidy, final int alleleCount) { + if (samplePloidy >= 
cachePloidyCapacity) + return GenotypeLikelihoodCalculators.getInstance(samplePloidy, alleleCount); + else if (alleleCount >= cacheAlleleCountCapacity) + return GenotypeLikelihoodCalculators.getInstance(samplePloidy, alleleCount); + final GenotypeLikelihoodCalculator[][] cache = likelihoodCalculators.get(); + final GenotypeLikelihoodCalculator result = cache[samplePloidy][alleleCount]; + return result != null ? result : (cache[samplePloidy][alleleCount] = GenotypeLikelihoodCalculators.getInstance(samplePloidy, alleleCount)); + } + private GenotypingLikelihoods multiSampleHeterogeneousPloidyModelLikelihoods(final AlleleList genotypingAlleles, final GenotypingData data, final AlleleLikelihoodMatrixMapper alleleLikelihoodMatrixMapper, @@ -120,8 +151,7 @@ public class InfiniteRandomMatingPopulationModel implements GenotypingModel { final int alleleCount = genotypingAlleles.alleleCount(); for (int i = 0; i < sampleCount; i++) { final int samplePloidy = ploidyModel.samplePloidy(i); - final GenotypeLikelihoodCalculator likelihoodsCalculator = - GenotypeLikelihoodCalculator.getInstance(samplePloidy,alleleCount); + final GenotypeLikelihoodCalculator likelihoodsCalculator = getLikelihoodsCalculator(samplePloidy,alleleCount); final ReadLikelihoods.Matrix sampleLikelihoods = alleleLikelihoodMatrixMapper.map(data.readLikelihoods().sampleMatrix(i)); genotypeLikelihoods.add(likelihoodsCalculator.genotypeLikelihoods(sampleLikelihoods)); } @@ -136,8 +166,7 @@ public class InfiniteRandomMatingPopulationModel implements GenotypingModel { final int samplePloidy = ploidyModel.samplePloidy(0); final List genotypeLikelihoods = new ArrayList<>(sampleCount); final int alleleCount = genotypingAlleles.alleleCount(); - final GenotypeLikelihoodCalculator likelihoodsCalculator = - GenotypeLikelihoodCalculator.getInstance(samplePloidy,alleleCount); + final GenotypeLikelihoodCalculator likelihoodsCalculator = getLikelihoodsCalculator(samplePloidy,alleleCount); for (int i = 0; i < 
sampleCount; i++) { final ReadLikelihoods.Matrix sampleLikelihoods = alleleLikelihoodMatrixMapper.map(data.readLikelihoods().sampleMatrix(i)); genotypeLikelihoods.add(likelihoodsCalculator.genotypeLikelihoods(sampleLikelihoods)); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java index 0495db541..5abb1cd85 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java @@ -47,16 +47,19 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.samtools.SAMUtils; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.GenotypeLikelihoods; +import htsjdk.variant.vcf.VCFConstants; +import org.broadinstitute.gatk.genotyping.GenotypeAlleleCounts; +import org.broadinstitute.gatk.genotyping.GenotypeLikelihoodCalculator; +import org.broadinstitute.gatk.genotyping.GenotypeLikelihoodCalculators; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.ExactACcounts; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.ExactACset; import org.broadinstitute.gatk.utils.MathUtils; -import htsjdk.variant.vcf.VCFConstants; import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.GenotypeLikelihoods; import java.util.*; @@ -424,18 +427,9 @@ public abstract class GeneralPloidyGenotypeLikelihoods { */ public static int[] 
getAlleleCountFromPLIndex(final int nAlleles, final int numChromosomes, final int PLindex) { - // todo - another brain-dead inefficient implementation, can do much better by computing in closed form - final SumIterator iterator = new SumIterator(nAlleles,numChromosomes); - while (iterator.hasNext()) { - final int[] plVec = iterator.getCurrentVector(); - if (iterator.getLinearIndex() == PLindex) - return plVec; - - iterator.next(); - } - - return null; - + final GenotypeLikelihoodCalculator calculator = GenotypeLikelihoodCalculators.getInstance(numChromosomes, nAlleles); + final GenotypeAlleleCounts alleleCounts = calculator.genotypeAlleleCountsAt(PLindex); + return alleleCounts.alleleCountsByIndex(nAlleles - 1); } /* diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java index 97a98e704..8225ba9a5 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java @@ -48,15 +48,17 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import htsjdk.variant.variantcontext.*; +import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection; import org.broadinstitute.gatk.engine.contexts.AlignmentContext; import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; import org.broadinstitute.gatk.engine.contexts.ReferenceContext; import 
org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.genotyping.SampleList; import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalc; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalcFactory; @@ -69,8 +71,6 @@ import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import htsjdk.variant.variantcontext.*; -import htsjdk.variant.vcf.VCFConstants; import java.util.*; @@ -86,22 +86,20 @@ public abstract class GenotypingEngine sampleNames; + protected final SampleList samples; private final double[] log10AlleleFrequencyPriorsSNPs; private final double[] log10AlleleFrequencyPriorsIndels; - private final GenomeLocParser genomeLocParser; + protected final GenomeLocParser genomeLocParser; // the model used for calculating p(non-ref) protected ThreadLocal afcm = new ThreadLocal() { @@ -111,59 +109,32 @@ public abstract class GenotypingEngine resolveSampleNamesFromToolkit(final GenomeAnalysisEngine toolkit) { - if (toolkit == null) - throw new IllegalArgumentException("the toolkit cannot be null"); - return new LinkedHashSet<>(toolkit.getSampleDB().getSampleNames()); - } /** * Construct a new genotyper engine, on a specific subset of samples. * - * @param toolkit reference to the genome-analysis toolkit. * @param configuration engine configuration object. - * @param sampleNames subset of sample to work on identified by their names. If {@code null}, the full toolkit + * @param samples subset of sample to work on identified by their names. If {@code null}, the full toolkit * sample set will be used instead. 
+ * @param genomeLocParser the genome-loc-parser * - * @throws IllegalArgumentException if either {@code toolkit} or {@code configuration} is {@code null}. + * @throws IllegalArgumentException if any of {@code samples}, {@code configuration} or {@code genomeLocParser} is {@code null}. */ - protected GenotypingEngine(final GenomeAnalysisEngine toolkit, final Config configuration,final Set sampleNames) { - if (toolkit == null) - throw new IllegalArgumentException("the toolkit cannot be null"); + protected GenotypingEngine(final Config configuration,final SampleList samples, final GenomeLocParser genomeLocParser) { + if (configuration == null) throw new IllegalArgumentException("the configuration cannot be null"); this.configuration = configuration; logger = Logger.getLogger(getClass()); - this.toolkit = toolkit; - this.sampleNames = sampleNames != null ? sampleNames : toolkit.getSampleDB().getSampleNames(); - numberOfGenomes = this.sampleNames.size() * configuration.genotypeArgs.samplePloidy; + this.samples = samples; + numberOfGenomes = this.samples.sampleCount() * configuration.genotypeArgs.samplePloidy; MathUtils.Log10Cache.ensureCacheContains(numberOfGenomes * 2); log10AlleleFrequencyPriorsSNPs = computeAlleleFrequencyPriors(numberOfGenomes, configuration.genotypeArgs.snpHeterozygosity,configuration.genotypeArgs.inputPrior); log10AlleleFrequencyPriorsIndels = computeAlleleFrequencyPriors(numberOfGenomes, configuration.genotypeArgs.indelHeterozygosity,configuration.genotypeArgs.inputPrior); - genomeLocParser = toolkit.getGenomeLocParser(); + this.genomeLocParser = genomeLocParser; } /** @@ -257,7 +228,7 @@ public abstract class GenotypingEngine outputAlleles = outputAlternativeAlleles.outputAlleles(vc.getReference()); final VariantContextBuilder builder = new VariantContextBuilder(callSourceString(), loc.getContig(), loc.getStart(), loc.getStop(), outputAlleles); @@ -506,7 +480,9 @@ public abstract class GenotypingEngine, Unif * Which annotations to add to the 
output VCF file. See the VariantAnnotator -list argument to view available annotations. */ @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) - protected List annotationsToUse = new ArrayList(); + protected List annotationsToUse = new ArrayList<>(); /** * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments, * so annotations will be excluded even if they are explicitly included with the other options. */ @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) - protected List annotationsToExclude = new ArrayList(); + protected List annotationsToExclude = new ArrayList<>(); /** * If specified, all available annotations in the group will be applied. See the VariantAnnotator -list argument to view available groups. @@ -218,11 +221,6 @@ public class UnifiedGenotyper extends LocusWalker, Unif // the calculation arguments private UnifiedGenotypingEngine genotypingEngine = null; - // the annotation engine - private VariantAnnotatorEngine annotationEngine; - - private Set samples; - // enable deletions in the pileup @Override public boolean includeReadsWithDeletionAtLoci() { return true; } @@ -256,17 +254,22 @@ public class UnifiedGenotyper extends LocusWalker, Unif * **/ public void initialize() { + super.initialize(); + final GenomeAnalysisEngine toolkit = getToolkit(); + final Set sampleNameSet; if ( UAC.TREAT_ALL_READS_AS_SINGLE_POOL ) { - samples.add(GenotypeLikelihoodsCalculationModel.DUMMY_SAMPLE_NAME); + sampleNameSet = Collections.singleton(GenotypeLikelihoodsCalculationModel.DUMMY_SAMPLE_NAME); } else { // get all of the unique sample names - samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + sampleNameSet = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()); if ( UAC.referenceSampleName != null ) - 
samples.remove(UAC.referenceSampleName); + sampleNameSet.remove(UAC.referenceSampleName); } + final SampleList samples = new IndexedSampleList(sampleNameSet); + if ( UAC.CONTAMINATION_FRACTION_FILE != null ) - UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, samples, logger)); + UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, sampleNameSet, logger)); // check for a bad max alleles value if ( UAC.genotypeArgs.MAX_ALTERNATE_ALLELES > GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) @@ -282,8 +285,8 @@ public class UnifiedGenotyper extends LocusWalker, Unif if ( verboseWriter != null ) verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tMLE\tMAP"); - annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); - genotypingEngine = new UnifiedGenotypingEngine(getToolkit(), UAC, samples); + final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); + genotypingEngine = new UnifiedGenotypingEngine(UAC, samples, toolkit.getGenomeLocParser(), toolkit.getArguments().BAQMode); genotypingEngine.setVerboseWriter(verboseWriter); genotypingEngine.setAnnotationEngine(annotationEngine); @@ -298,11 +301,11 @@ public class UnifiedGenotyper extends LocusWalker, Unif final Set samplesForHeader; if ( ! onlyEmitSamples.isEmpty() ) { // make sure that onlyEmitSamples is a subset of samples - if ( ! samples.containsAll(onlyEmitSamples) ) + if ( ! 
sampleNameSet.containsAll(onlyEmitSamples) ) throw new UserException.BadArgumentValue("onlyEmitSamples", "must be a strict subset of the samples in the BAM files but is wasn't"); samplesForHeader = onlyEmitSamples; } else { - samplesForHeader = samples; + samplesForHeader = sampleNameSet; } writer.writeHeader(new VCFHeader(headerInfo, samplesForHeader)); } @@ -310,7 +313,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif public static Set getHeaderInfo(final UnifiedArgumentCollection UAC, final VariantAnnotatorEngine annotationEngine, final DbsnpArgumentCollection dbsnp) { - Set headerInfo = new HashSet(); + final Set headerInfo = new HashSet<>(); // all annotation fields from VariantAnnotatorEngine if ( annotationEngine != null ) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java index 36ac2e5bf..22bc09e98 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java @@ -45,12 +45,16 @@ */ package org.broadinstitute.gatk.tools.walkers.genotyper; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.GenotypesContext; +import htsjdk.variant.variantcontext.VariantContext; import org.apache.log4j.Logger; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.contexts.AlignmentContext; import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; import org.broadinstitute.gatk.engine.contexts.ReferenceContext; import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.genotyping.SampleList; import 
org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalcResult; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.GenomeLocParser; @@ -62,9 +66,6 @@ import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.GenotypesContext; -import htsjdk.variant.variantcontext.VariantContext; import java.io.PrintStream; import java.lang.reflect.Constructor; @@ -88,7 +89,6 @@ public class UnifiedGenotypingEngine extends GenotypingEngineThe new engine won't emmit annotations, will use the full sample set and will not produce additional verbose - * output

+ * Creates a new unified genotyping given the UG configuration parameters and the GA engine. + * @param configuration the UG configuration. + * @param toolkit the GA engine. * - * @param toolkit reference to the enclosing genome analysis engine. - * @param configuration configuration object. - * - * @throws IllegalArgumentException if either {@code toolkit} or {@code UAC} is {@code null}. + * @throws NullPointerException if either {@code configuration} or {@code toolkit} is {@code null}. */ - public UnifiedGenotypingEngine(final GenomeAnalysisEngine toolkit, final UnifiedArgumentCollection configuration) { - this(toolkit, configuration, null); + public UnifiedGenotypingEngine(final UnifiedArgumentCollection configuration, + final GenomeAnalysisEngine toolkit) { + this(configuration,toolkit.getSampleList(),toolkit.getGenomeLocParser(),toolkit.getArguments().BAQMode); } + /** - * Constructs a new Unified-Genotyper engine. + * Creates a new unified genotyping given the UG configuration parameters, the targeted set of samples and + * a genome location parser. * - * @param toolkit reference to the enclosing genome analysis engine. - * @param configuration configuration object. - * @param sampleNames subset of sample names to work on. If {@code null}, all it will use the {@code toolkit} full sample set. + * @param configuration the UG configuration. + * @param samples {@inheritDoc} + * @param baqCalculationMode the BAQ calculation mode. * - * @throws IllegalArgumentException if either {@code toolkit} or {@code UAC} is {@code null}. + * @throws NullPointerException if any of {@code configuration}, {@code samples} or {@code genomeLocParser} is {@code null}. + * + * @throws IllegalArgumentException if {@code baqCalculationMode} is {@code null}. 
*/ - public UnifiedGenotypingEngine(final GenomeAnalysisEngine toolkit, final UnifiedArgumentCollection configuration, - final Set sampleNames) { + public UnifiedGenotypingEngine(final UnifiedArgumentCollection configuration, + final SampleList samples, final GenomeLocParser genomeLocParser, + final BAQ.CalculationMode baqCalculationMode) { - super(toolkit,configuration,sampleNames); + super(configuration,samples,genomeLocParser); - this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF; - genomeLocParser = toolkit.getGenomeLocParser(); + if (baqCalculationMode == null) + throw new IllegalArgumentException("the BAQ calculation mode cannot be null"); + + this.BAQEnabledOnCMDLine = baqCalculationMode != BAQ.CalculationMode.OFF; determineGLModelsToUse(); @@ -302,7 +308,8 @@ public class UnifiedGenotypingEngine extends GenotypingEngine perReadAlleleLikelihoodMap) { - return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap); + return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, + genomeLocParser != null || refContext == null ? 
genomeLocParser : refContext.getGenomeLocParser(), perReadAlleleLikelihoodMap); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalcFactory.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalcFactory.java index e4cbf0b5d..ad4ce979a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalcFactory.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalcFactory.java @@ -48,10 +48,8 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc; import org.apache.log4j.Logger; import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection; -import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.classloader.PluginManager; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; import java.lang.reflect.Constructor; import java.util.LinkedList; @@ -105,6 +103,24 @@ public class AFCalcFactory { } public static Calculation getDefaultModel() { return EXACT_INDEPENDENT; } + + /** + * Returns the best (fastest) model give the required ploidy and alternative allele count. + * @param requiredPloidy required ploidy + * @param requiredAlternativeAlleleCount required alternative allele count. + * @param preferredModel a preferred mode if any. A {@code null} indicate that we should be try to use the default instead. + * @return never {@code null} + */ + public static Calculation getBestModel(final int requiredPloidy, final int requiredAlternativeAlleleCount, final Calculation preferredModel) { + final Calculation preferred = preferredModel == null ? 
getDefaultModel() : preferredModel; + if (preferred.usableForParams(requiredPloidy,requiredAlternativeAlleleCount)) + return preferred; + if (EXACT_INDEPENDENT.usableForParams(requiredPloidy,requiredAlternativeAlleleCount)) + return EXACT_INDEPENDENT; + if (EXACT_REFERENCE.usableForParams(requiredPloidy,requiredAlternativeAlleleCount)) + return EXACT_REFERENCE; + return EXACT_GENERAL_PLOIDY; + } } private static final Map> afClasses; @@ -137,25 +153,10 @@ public class AFCalcFactory { public static AFCalc createAFCalc(final StandardCallerArgumentCollection UAC, final int nSamples, final Logger logger) { - final int maxAltAlleles = UAC.genotypeArgs.MAX_ALTERNATE_ALLELES; - if ( ! UAC.AFmodel.usableForParams(UAC.genotypeArgs.samplePloidy, maxAltAlleles) ) { - logger.info("Requested ploidy " + UAC.genotypeArgs.samplePloidy + " maxAltAlleles " + maxAltAlleles + " not supported by requested model " + UAC.AFmodel + " looking for an option"); - final List supportingCalculations = new LinkedList(); - for ( final Calculation calc : Calculation.values() ) { - if ( calc.usableForParams(UAC.genotypeArgs.samplePloidy, maxAltAlleles) ) - supportingCalculations.add(calc); - } + final Calculation afCalculationModel = Calculation.getBestModel(UAC.genotypeArgs.samplePloidy,UAC.genotypeArgs.MAX_ALTERNATE_ALLELES, + UAC.requestedAlleleFrequencyCalculationModel); - if ( supportingCalculations.isEmpty() ) - throw new UserException("no AFCalculation model found that supports ploidy of " + UAC.genotypeArgs.samplePloidy + " and max alt alleles " + maxAltAlleles); - else if ( supportingCalculations.size() > 1 ) - logger.debug("Warning, multiple supporting AFCalcs found " + Utils.join(",", supportingCalculations) + " choosing first arbitrarily"); - else - UAC.AFmodel = supportingCalculations.get(0); - logger.info("Selecting model " + UAC.AFmodel); - } - - final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, maxAltAlleles, UAC.genotypeArgs.samplePloidy); + final AFCalc calc = 
createAFCalc(afCalculationModel, nSamples, UAC.genotypeArgs.MAX_ALTERNATE_ALLELES, UAC.genotypeArgs.samplePloidy); if ( logger != null ) calc.setLogger(logger); if ( UAC.exactCallsLog != null ) calc.enableProcessLog(UAC.exactCallsLog); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 500cdf4ce..98df248aa 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -56,7 +56,7 @@ import htsjdk.variant.variantcontext.*; import java.util.*; public class GeneralPloidyExactAFCalc extends ExactAFCalc { - static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them + static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 100; // if PL vectors longer than this # of elements, don't log them private final int ploidy; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java index ace4f22d2..0008f5b28 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java @@ -47,6 +47,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import org.apache.log4j.Logger; +import 
org.broadinstitute.gatk.genotyping.SampleList; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.SeqGraph; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; @@ -113,7 +114,7 @@ public class GraphBasedLikelihoodCalculationEngine implements ReadLikelihoodCalc } @Override - public ReadLikelihoods computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, final List samples, final Map> perSampleReadList) { + public ReadLikelihoods computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, final SampleList samples, final Map> perSampleReadList) { final GraphBasedLikelihoodCalculationEngineInstance graphLikelihoodEngine = new GraphBasedLikelihoodCalculationEngineInstance(assemblyResultSet, hmm,log10GlobalReadMismappingRate,heterogeneousKmerSizeResolution); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java index 7d4b3db1a..e2229cd50 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java @@ -46,7 +46,11 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; +import htsjdk.variant.variantcontext.Allele; import org.apache.log4j.Logger; +import org.broadinstitute.gatk.genotyping.AlleleList; +import org.broadinstitute.gatk.genotyping.IndexedAlleleList; +import org.broadinstitute.gatk.genotyping.SampleList; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.MultiSampleEdge; import 
org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.Path; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.Route; @@ -61,7 +65,6 @@ import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.pairhmm.FlexibleHMM; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import htsjdk.variant.variantcontext.Allele; import java.util.*; @@ -233,12 +236,13 @@ public class GraphBasedLikelihoodCalculationEngineInstance { * @return never {@code null}, and with at least one entry for input sample (keys in {@code perSampleReadList}. * The value maps can be potentially empty though. */ - public ReadLikelihoods computeReadLikelihoods(final List haplotypes, final List samples, + public ReadLikelihoods computeReadLikelihoods(final List haplotypes, final SampleList samples, final Map> perSampleReadList) { // General preparation on the input haplotypes: - final ReadLikelihoods result = new ReadLikelihoods<>(samples, haplotypes, perSampleReadList); final List sortedHaplotypes = new ArrayList<>(haplotypes); Collections.sort(sortedHaplotypes, Haplotype.ALPHANUMERICAL_COMPARATOR); + final AlleleList alleles = new IndexedAlleleList<>(sortedHaplotypes); + final ReadLikelihoods result = new ReadLikelihoods<>(samples, alleles, perSampleReadList); // The actual work: final int sampleCount = result.sampleCount(); @@ -315,7 +319,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance { private void calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(final int haplotypeIndex, final ReadLikelihoods.Matrix likelihoods, final Map> costsEndingByVertex) { - final Haplotype haplotype = likelihoods.allele(haplotypeIndex); + final Haplotype haplotype = likelihoods.alleleAt(haplotypeIndex); final HaplotypeRoute haplotypeRoute = haplotypeGraph.getHaplotypeRoute(haplotype); final Set haplotypeVertices = haplotypeRoute.vertexSet(); final Map readCostByRead = new 
HashMap<>(); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index 2ee5752f0..5e6a519ea 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -52,6 +52,7 @@ import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; import org.broadinstitute.gatk.engine.contexts.AlignmentContext; import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; @@ -64,15 +65,12 @@ import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; import org.broadinstitute.gatk.engine.iterators.ReadTransformer; import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.genotyping.*; import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.genotyper.*; -import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.utils.*; import 
org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; @@ -606,7 +604,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // the minimum length of a read we'd consider using for genotyping private final static int MIN_READ_LENGTH = 10; - private List samplesList; + private SampleList samplesList; private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file private final static Allele FAKE_ALT_ALLELE = Allele.create("", false); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file @@ -626,9 +624,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In public void initialize() { super.initialize(); - if (SCAC.genotypeArgs.samplePloidy != HomoSapiensConstants.DEFAULT_PLOIDY) - throw new UserException.BadArgumentValue("-ploidy", "" + SCAC.genotypeArgs.samplePloidy + "; currently HaplotypeCaller only supports diploid sample analysis (-ploidy 2)"); - if (dontGenotype && emitReferenceConfidence()) throw new UserException("You cannot request gVCF output and do not genotype at the same time"); @@ -656,12 +651,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In logger.info("Disabling physical phasing, which is supported only for reference-model confidence output"); } - if ( SCAC.AFmodel == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) - throw new UserException.BadArgumentValue("pnrm", "HaplotypeCaller doesn't currently support " + SCAC.AFmodel); - - - samplesList = new ArrayList<>(SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader())); - Set samplesSet = new LinkedHashSet<>(samplesList); + final GenomeAnalysisEngine toolkit = getToolkit(); + samplesList = toolkit.getReadSampleList(); + final Set sampleSet = 
SampleListUtils.asSet(samplesList); // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested final UnifiedArgumentCollection simpleUAC = SCAC.cloneTo(UnifiedArgumentCollection.class); @@ -672,20 +664,24 @@ public class HaplotypeCaller extends ActiveRegionWalker, In simpleUAC.CONTAMINATION_FRACTION = 0.0; simpleUAC.CONTAMINATION_FRACTION_FILE = null; simpleUAC.exactCallsLog = null; - activeRegionEvaluationGenotyperEngine = new UnifiedGenotypingEngine(getToolkit(), simpleUAC, samplesSet); + // Seems that at least with some test data we can lose genuine haploid variation if we use + // UGs engine with ploidy == 1 + simpleUAC.genotypeArgs.samplePloidy = Math.max(2,SCAC.genotypeArgs.samplePloidy); + activeRegionEvaluationGenotyperEngine = new UnifiedGenotypingEngine(simpleUAC, toolkit); activeRegionEvaluationGenotyperEngine.setLogger(logger); if( SCAC.CONTAMINATION_FRACTION_FILE != null ) - SCAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(SCAC.CONTAMINATION_FRACTION_FILE, SCAC.CONTAMINATION_FRACTION, samplesSet, logger)); + SCAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(SCAC.CONTAMINATION_FRACTION_FILE, SCAC.CONTAMINATION_FRACTION, sampleSet, logger)); if( SCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES && consensusMode ) throw new UserException("HaplotypeCaller cannot be run in both GENOTYPE_GIVEN_ALLELES mode and in consensus mode. 
Please choose one or the other."); - genotypingEngine = new HaplotypeCallerGenotypingEngine( getToolkit(), SCAC, !doNotRunPhysicalPhasing); + final GenomeLocParser genomeLocParser = toolkit.getGenomeLocParser(); + genotypingEngine = new HaplotypeCallerGenotypingEngine( SCAC, samplesList, genomeLocParser, !doNotRunPhysicalPhasing); // initialize the output VCF header final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); - Set headerInfo = new HashSet<>(); + final Set headerInfo = new HashSet<>(); headerInfo.addAll(genotypingEngine.getAppropriateVCFInfoHeaders()); // all annotation fields from VariantAnnotatorEngine @@ -707,13 +703,20 @@ public class HaplotypeCaller extends ActiveRegionWalker, In headerInfo.add(new VCFFormatHeaderLine(HAPLOTYPE_CALLER_PHASING_GT_KEY, 1, VCFHeaderLineType.String, "Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another")); } + if (SCAC.genotypeArgs.samplePloidy != HomoSapiensConstants.DEFAULT_PLOIDY) { + if (SCAC.emitReferenceConfidence != ReferenceConfidenceMode.NONE) + throw new UserException.BadArgumentValue("ERC", "For now ploidies different that 2 are not allow for GVCF or BP_RESOLUTION outputs"); + headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the alternate allele count, in the same order as listed, for each individual sample")); + headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the alternate allele fraction, in the same order as listed, for each individual sample")); + } + // FILTER fields are added unconditionally as it's not always 100% certain the circumstances // where the filters are used. 
For example, in emitting all sites the lowQual field is used headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotypingEngine.LOW_QUAL_FILTER_NAME, "Low quality")); - initializeReferenceConfidenceModel(samplesSet, headerInfo); + initializeReferenceConfidenceModel(samplesList, headerInfo); - vcfWriter.writeHeader(new VCFHeader(headerInfo, samplesSet)); + vcfWriter.writeHeader(new VCFHeader(headerInfo, sampleSet)); try { // fasta reference reader to supplement the edges of the reference sequence @@ -771,10 +774,11 @@ public class HaplotypeCaller extends ActiveRegionWalker, In SCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES,emitReferenceConfidence()); } - private void initializeReferenceConfidenceModel(final Set samples, final Set headerInfo) { + private void initializeReferenceConfidenceModel(final SampleList samples, final Set headerInfo) { referenceConfidenceModel = new ReferenceConfidenceModel(getToolkit().getGenomeLocParser(), samples, getToolkit().getSAMFileHeader(), indelSizeToEliminateInRefModel); if ( emitReferenceConfidence() ) { - if ( samples.size() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently"); + if ( samples.sampleCount() != 1 ) + throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently"); headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines()); if ( SCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) { // a kluge to enforce the use of this indexing strategy @@ -784,7 +788,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } try { - vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands); + vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands,SCAC.genotypeArgs.samplePloidy); } catch ( IllegalArgumentException e ) { throw new UserException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage()); } @@ -857,8 +861,12 @@ public class HaplotypeCaller extends 
ActiveRegionWalker, In final Map splitContexts = AlignmentContextUtils.splitContextBySampleName(context); final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size()); final MathUtils.RunningAverage averageHQSoftClips = new MathUtils.RunningAverage(); + final GenotypingModel genotypingModel = genotypingEngine.getGenotypingModel(); for( final Map.Entry sample : splitContexts.entrySet() ) { - final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sample.getValue().getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, averageHQSoftClips).genotypeLikelihoods; + final String sampleName = sample.getKey(); + // The ploidy here is not dictated by the sample but by the simple genotyping-engine used to determine whether regions are active or not. + final int activeRegionDetectionHackishSamplePloidy = activeRegionEvaluationGenotyperEngine.getConfiguration().genotypeArgs.samplePloidy; + final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sampleName,activeRegionDetectionHackishSamplePloidy,genotypingModel,sample.getValue().getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, averageHQSoftClips).genotypeLikelihoods; genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() ); } @@ -969,8 +977,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In regionForGenotyping.getLocation(), getToolkit().getGenomeLocParser(), metaDataTracker, - ( consensusMode ? Collections.emptyList() : givenAlleles ), - emitReferenceConfidence() ); + (consensusMode ? Collections.emptyList() : givenAlleles), + emitReferenceConfidence()); // TODO -- must disable if we are doing NCT, or set the output type of ! presorted if ( bamWriter != null ) { @@ -997,7 +1005,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // output variant containing region. 
result.addAll(referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(), calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping, - readLikelihoods, calledHaplotypes.getCalls())); + readLikelihoods, genotypingEngine.getPloidyModel(), genotypingEngine.getGenotypingModel(), calledHaplotypes.getCalls())); // output right-flanking non-variant section: if (trimmingResult.hasRightFlankingRegion()) result.addAll(referenceModelForNoVariation(trimmingResult.nonVariantRightFlankRegion(),false)); @@ -1110,7 +1118,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final List haplotypes = Collections.singletonList(refHaplotype); return referenceConfidenceModel.calculateRefConfidence(refHaplotype, haplotypes, paddedLoc, region, createDummyStratifiedReadMap(refHaplotype, samplesList, region), - Collections.emptyList()); + genotypingEngine.getPloidyModel(), genotypingEngine.getGenotypingModel(), Collections.emptyList()); } else return NO_CALLS; } @@ -1123,11 +1131,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In * @return a map from sample -> PerReadAlleleLikelihoodMap that maps each read to ref */ public static ReadLikelihoods createDummyStratifiedReadMap(final Haplotype refHaplotype, - final List samples, + final SampleList samples, final ActiveRegion region) { - return new ReadLikelihoods<>(samples, Collections.singletonList(refHaplotype), + return new ReadLikelihoods<>(samples, new IndexedAlleleList<>(refHaplotype), splitReadsBySample(samples, region.getReads())); - } @@ -1235,18 +1242,15 @@ public class HaplotypeCaller extends ActiveRegionWalker, In return splitReadsBySample(samplesList, reads); } - public static Map> splitReadsBySample( final List samplesList, final Collection reads ) { + private static Map> splitReadsBySample( final SampleList samplesList, final Collection reads ) { final Map> returnMap = new HashMap<>(); - for( final String sample : samplesList) { - List 
readList = returnMap.get( sample ); - if( readList == null ) { - readList = new ArrayList<>(); - returnMap.put(sample, readList); - } - } - for( final GATKSAMRecord read : reads ) { + final int sampleCount = samplesList.sampleCount(); + for (int i = 0; i < sampleCount; i++) + returnMap.put(samplesList.sampleAt(i), new ArrayList()); + + for( final GATKSAMRecord read : reads ) returnMap.get(read.getReadGroup().getSample()).add(read); - } + return returnMap; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java index 321c48166..97bb9efa8 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java @@ -45,9 +45,9 @@ */ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; +import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Advanced; import org.broadinstitute.gatk.utils.commandline.Argument; -import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection; /** * Set of arguments for the {@link HaplotypeCaller} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index 77c308cfc..76c8dc772 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -48,8 +48,9 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import htsjdk.variant.variantcontext.*; import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.genotyping.*; import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingEngine; import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingOutputMode; @@ -64,7 +65,6 @@ import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.haplotype.MergeVariantsAcrossHaplotypes; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import htsjdk.variant.variantcontext.*; import java.util.*; @@ -80,28 +80,33 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine sampleNames) { - super(toolkit,configuration,sampleNames); - doPhysicalPhasing = true; + public HaplotypeCallerGenotypingEngine(final HaplotypeCallerArgumentCollection configuration, final SampleList samples, final GenomeLocParser genomeLocParser) { + this(configuration,samples,genomeLocParser,false); } - /** * Change the merge variant across haplotypes for this engine. 
* @@ -198,9 +203,9 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine alleleList = new ArrayList<>(); - alleleList.addAll(mergedVC.getAlleles()); - alleleList.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - vcb.alleles(alleleList); - mergedVC = vcb.make(); - } + if (emitReferenceConfidence) + mergedVC = addNonRefSymbolicAllele(mergedVC); final Map mergeMap = new LinkedHashMap<>(); mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele @@ -264,7 +265,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine originalList = mergedVC.getAlleles(); + final List alleleList = new ArrayList<>(originalList.size() + 1); + alleleList.addAll(mergedVC.getAlleles()); + alleleList.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + vcb.alleles(alleleList); + return vcb.make(); + } + // Builds the read-likelihoods collection to use for annotation considering user arguments and the collection // used for genotyping. 
private ReadLikelihoods prepareReadAlleleLikelihoodsForAnnotation( @@ -653,22 +664,14 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine readLikelihoods, final VariantContext mergedVC ) { - final GenotypesContext genotypes = GenotypesContext.create(readLikelihoods.sampleCount()); - // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - for (final String sample : readLikelihoods.samples() ) { - final int numHaplotypes = mergedVC.getAlleles().size(); - final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, readLikelihoods, mergedVC.getAlleles(), true); - int glIndex = 0; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC - } - } - logger.debug(" Likelihoods for sample " + sample + " : " + Arrays.toString(genotypeLikelihoods)); - genotypes.add(new GenotypeBuilder(sample).alleles(NO_CALL).PL(genotypeLikelihoods).make()); - } - return genotypes; + final List vcAlleles = mergedVC.getAlleles(); + final AlleleList alleleList = readLikelihoods.alleleCount() == vcAlleles.size() ? 
readLikelihoods : new IndexedAlleleList<>(vcAlleles); + final GenotypingLikelihoods likelihoods = genotypingModel.calculateLikelihoods(alleleList,new GenotypingData<>(ploidyModel,readLikelihoods)); + final int sampleCount = samples.sampleCount(); + final GenotypesContext result = GenotypesContext.create(sampleCount); + for (int s = 0; s < sampleCount; s++) + result.add(new GenotypeBuilder(samples.sampleAt(s)).alleles(NO_CALL).PL(likelihoods.sampleLikelihoods(s).getAsPLs()).make()); + return result; } /** @@ -779,4 +782,22 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine computeReadLikelihoods( final AssemblyResultSet assemblyResultSet, final List samples, final Map> perSampleReadList ) { + public ReadLikelihoods computeReadLikelihoods( final AssemblyResultSet assemblyResultSet, final SampleList samples, final Map> perSampleReadList ) { - final List haplotypes = assemblyResultSet.getHaplotypeList(); + final List haplotypeList = assemblyResultSet.getHaplotypeList(); + final AlleleList haplotypes = new IndexedAlleleList<>(haplotypeList); // configure the HMM - initializePairHMM(haplotypes, perSampleReadList); + initializePairHMM(haplotypeList, perSampleReadList); // Add likelihoods for each sample's reads to our result final ReadLikelihoods result = new ReadLikelihoods<>(samples, haplotypes, perSampleReadList); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java index bd72a764d..aa7305bcf 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java @@ -46,11 +46,14 @@ package 
org.broadinstitute.gatk.tools.walkers.haplotypecaller; +import htsjdk.variant.variantcontext.Allele; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.genotyping.AlleleList; +import org.broadinstitute.gatk.genotyping.IndexedAlleleList; +import org.broadinstitute.gatk.genotyping.SampleList; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import htsjdk.variant.variantcontext.Allele; import java.util.HashMap; import java.util.List; @@ -63,17 +66,15 @@ import java.util.Random; public class RandomLikelihoodCalculationEngine implements ReadLikelihoodCalculationEngine { @Override - public ReadLikelihoods computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, - final List samples, + public ReadLikelihoods computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, + final SampleList samples, final Map> reads) { - final List haplotypes = assemblyResultSet.getHaplotypeList(); + final AlleleList haplotypes = new IndexedAlleleList<>(assemblyResultSet.getHaplotypeList()); final ReadLikelihoods result = new ReadLikelihoods(samples, haplotypes, reads); - final Map alleles = new HashMap<>(haplotypes.size()); - for (final Haplotype haplotype : haplotypes) - alleles.put(haplotype,Allele.create(haplotype,false)); + final Map alleles = new HashMap<>(haplotypes.alleleCount()); final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - final int sampleCount = samples.size(); - final int alleleCount = alleles.size(); + final int sampleCount = samples.sampleCount(); + final int alleleCount = haplotypes.alleleCount(); for (int i = 0; i < sampleCount; i++) { final List sampleReads = result.sampleReads(i); final int readCount = sampleReads.size(); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java index 6119cba1c..ccc0c18b8 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java @@ -46,6 +46,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; +import org.broadinstitute.gatk.genotyping.SampleList; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; @@ -81,6 +82,7 @@ public interface ReadLikelihoodCalculationEngine { * active region assembly process. * * @param assemblyResultSet the input assembly results. + * @param samples the list of targeted samples. * @param perSampleReadList the input read sets stratified per sample. * * @throws NullPointerException if either parameter is {@code null}. @@ -88,7 +90,7 @@ public interface ReadLikelihoodCalculationEngine { * @return never {@code null}, and with at least one entry for input sample (keys in {@code perSampleReadList}. * The value maps can be potentially empty though. 
*/ - public ReadLikelihoods computeReadLikelihoods(AssemblyResultSet assemblyResultSet, List samples, + public ReadLikelihoods computeReadLikelihoods(AssemblyResultSet assemblyResultSet, SampleList samples, Map> perSampleReadList); public void close(); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RefVsAnyResult.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RefVsAnyResult.java index 8726c9365..bb3b64f93 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RefVsAnyResult.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RefVsAnyResult.java @@ -46,6 +46,8 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; +import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants; + /** * Holds information about a genotype call of a single sample reference vs. any non-ref event * @@ -58,7 +60,7 @@ final class RefVsAnyResult { /** * The genotype likelihoods for ref/ref ref/non-ref non-ref/non-ref */ - final double[] genotypeLikelihoods = new double[3]; + final double[] genotypeLikelihoods; /** * AD field value for ref / non-ref @@ -74,7 +76,31 @@ final class RefVsAnyResult { * Cap the het and hom var likelihood values by the hom ref likelihood. */ protected void capByHomRefLikelihood() { - genotypeLikelihoods[1] = Math.min(genotypeLikelihoods[0], genotypeLikelihoods[1]); - genotypeLikelihoods[2] = Math.min(genotypeLikelihoods[0], genotypeLikelihoods[2]); + final int likelihoodCount = genotypeLikelihoods.length; + for (int i = 1; i < likelihoodCount; i++) + genotypeLikelihoods[i] = Math.min(genotypeLikelihoods[0],genotypeLikelihoods[i]); } + + /** + * Creates a new ref-vs-alt result assuming 3 as the number of genotype likelihoods (human ploidy. 
+ */ + @Deprecated + public RefVsAnyResult() { + genotypeLikelihoods = + new double[(HomoSapiensConstants.DEFAULT_PLOIDY * (HomoSapiensConstants.DEFAULT_PLOIDY + 1)) >> 1]; + } + + /** + * Creates a new ref-vs-alt result indicating the genotype likelihood vector capacity. + * @param likelihoodCapacity the required capacity of the likelihood array, should match the possible number of + * genotypes given the number of alleles (always 2), ploidy (arbitrary) less the genotyping + * model non-sense genotype count if applies. + * @throws IllegalArgumentException if {@code likelihoodCapacity} is negative. + */ + public RefVsAnyResult(final int likelihoodCapacity) { + if (likelihoodCapacity < 0) + throw new IllegalArgumentException("likelihood capacity is negative"); + genotypeLikelihoods = new double[likelihoodCapacity]; + } + } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java index dc9512eee..116b27009 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java @@ -51,11 +51,13 @@ import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFSimpleHeaderLine; import org.broadinstitute.gatk.engine.contexts.AlignmentContext; +import org.broadinstitute.gatk.genotyping.*; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; +import 
org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; @@ -85,8 +87,8 @@ public class ReferenceConfidenceModel { public final static String ALTERNATE_ALLELE_STRING = "ALT"; // arbitrary alternate allele private final GenomeLocParser genomeLocParser; - private final Set samples; - private final SAMFileHeader header; // TODO -- really shouldn't depend on this + + private final SampleList samples; private final int indelInformativeDepthIndelSize; private final static boolean WRITE_DEBUGGING_BAM = false; @@ -103,18 +105,17 @@ public class ReferenceConfidenceModel { * @param indelInformativeDepthIndelSize the max size of indels to consider when calculating indel informative depths */ public ReferenceConfidenceModel(final GenomeLocParser genomeLocParser, - final Set samples, + final SampleList samples, final SAMFileHeader header, final int indelInformativeDepthIndelSize) { if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null"); if ( samples == null ) throw new IllegalArgumentException("samples cannot be null"); - if ( samples.isEmpty() ) throw new IllegalArgumentException("samples cannot be empty"); + if ( samples.sampleCount() == 0) throw new IllegalArgumentException("samples cannot be empty"); if ( header == null ) throw new IllegalArgumentException("header cannot be empty"); if ( indelInformativeDepthIndelSize < 0) throw new IllegalArgumentException("indelInformativeDepthIndelSize must be >= 1 but got " + indelInformativeDepthIndelSize); this.genomeLocParser = genomeLocParser; this.samples = samples; - this.header = header; this.indelInformativeDepthIndelSize = indelInformativeDepthIndelSize; if ( WRITE_DEBUGGING_BAM ) { @@ -124,8 +125,6 @@ public class ReferenceConfidenceModel { } else { debuggingWriter = null; } - - 
initializeIndelPLCache(); } /** @@ -151,7 +150,7 @@ public class ReferenceConfidenceModel { /** * Calculate the reference confidence for a single sample given the its read data * - * Returns a list of variant contexts, one for each position in the activeregion.getLoc(), each containing + * Returns a list of variant contexts, one for each position in the {@code activeRegion.getLoc()}, each containing * detailed information about the certainty that the sample is hom-ref for each base in the region. * * @@ -162,6 +161,8 @@ public class ReferenceConfidenceModel { * @param paddedReferenceLoc the location of refHaplotype (which might be larger than activeRegion.getLoc()) * @param activeRegion the active region we want to get the reference confidence over * @param readLikelihoods a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes + * @param ploidyModel indicate the ploidy of each sample in {@code stratifiedReadMap}. + * @param model genotyping model. * @param variantCalls calls made in this region. 
The return result will contain any variant call in this list in the * correct order by genomic position, and any variant in this list will stop us emitting a ref confidence * under any position it covers (for snps and insertions that is 1 bp, but for deletions its the entire ref span) @@ -173,6 +174,8 @@ public class ReferenceConfidenceModel { final GenomeLoc paddedReferenceLoc, final ActiveRegion activeRegion, final ReadLikelihoods readLikelihoods, + final PloidyModel ploidyModel, + final GenotypingModel model, final List variantCalls) { if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); @@ -182,12 +185,15 @@ public class ReferenceConfidenceModel { if ( readLikelihoods == null ) throw new IllegalArgumentException("readLikelihoods cannot be null"); if ( readLikelihoods.sampleCount() != 1 ) throw new IllegalArgumentException("readLikelihoods must contain exactly one sample but it contained " + readLikelihoods.sampleCount()); if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion location size " + activeRegion.getLocation().size() + " are different"); + if ( ploidyModel == null) throw new IllegalArgumentException("the ploidy model cannot be null"); + if ( model == null) throw new IllegalArgumentException("the genotyping model cannot be null"); + final int ploidy = ploidyModel.samplePloidy(0); // the first sample = the only sample in reference-confidence mode. 
final GenomeLoc refSpan = activeRegion.getLocation(); final List refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, activeRegion, refSpan, readLikelihoods); final byte[] ref = refHaplotype.getBases(); final List results = new ArrayList<>(refSpan.size()); - final String sampleName = readLikelihoods.sample(0); + final String sampleName = readLikelihoods.sampleAt(0); final int globalRefOffset = refSpan.getStart() - activeRegion.getExtendedLoc().getStart(); for ( final ReadBackedPileup pileup : refPileups ) { @@ -201,20 +207,20 @@ public class ReferenceConfidenceModel { // otherwise emit a reference confidence variant context final int refOffset = offset + globalRefOffset; final byte refBase = ref[refOffset]; - final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, (byte)6, null); + final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(sampleName,ploidy,model,pileup, refBase, (byte)6, null); homRefCalc.capByHomRefLikelihood(); final Allele refAllele = Allele.create(refBase, true); final List refSiteAlleles = Arrays.asList(refAllele, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); final VariantContextBuilder vcb = new VariantContextBuilder("HC", curPos.getContig(), curPos.getStart(), curPos.getStart(), refSiteAlleles); - final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Arrays.asList(refAllele, refAllele)); + final GenotypeBuilder gb = new GenotypeBuilder(sampleName, GATKVariantContextUtils.homozygousAlleleList(refAllele, ploidy)); gb.AD(homRefCalc.AD_Ref_Any); gb.DP(homRefCalc.getDP()); // genotype likelihood calculation final GenotypeLikelihoods snpGLs = GenotypeLikelihoods.fromLog10Likelihoods(homRefCalc.genotypeLikelihoods); final int nIndelInformativeReads = calcNIndelInformativeReads(pileup, refOffset, ref, indelInformativeDepthIndelSize); - final GenotypeLikelihoods indelGLs = getIndelPLs(nIndelInformativeReads); + final GenotypeLikelihoods indelGLs = 
getIndelPLs(ploidy,nIndelInformativeReads); // now that we have the SNP and indel GLs, we take the one with the least confidence, // as this is the most conservative estimate of our certainty that we are hom-ref. @@ -251,23 +257,51 @@ public class ReferenceConfidenceModel { * Get indel PLs corresponding to seeing N nIndelInformativeReads at this site * * @param nInformativeReads the number of reads that inform us about being ref without an indel at this site + * @param ploidy the requested ploidy. * @return non-null GenotypeLikelihoods given N */ - protected final GenotypeLikelihoods getIndelPLs(final int nInformativeReads) { - return indelPLCache[nInformativeReads > MAX_N_INDEL_INFORMATIVE_READS ? MAX_N_INDEL_INFORMATIVE_READS : nInformativeReads]; + protected final GenotypeLikelihoods getIndelPLs(final int ploidy, final int nInformativeReads) { + if (ploidy > MAX_N_INDEL_PLOIDY) + throw new IllegalArgumentException("you have hit a current limitation of the GVCF output model that cannot handle ploidies larger than " + MAX_N_INDEL_PLOIDY + " , please let the GATK team about it: " + ploidy); + return indelPLCache(ploidy, nInformativeReads > MAX_N_INDEL_INFORMATIVE_READS ? 
MAX_N_INDEL_INFORMATIVE_READS : nInformativeReads); } protected static final int MAX_N_INDEL_INFORMATIVE_READS = 40; // more than this is overkill because GQs are capped at 99 anyway - private static final GenotypeLikelihoods[] indelPLCache = new GenotypeLikelihoods[MAX_N_INDEL_INFORMATIVE_READS + 1]; + private static final int MAX_N_INDEL_PLOIDY = 20; + private static final GenotypeLikelihoods[][] indelPLCache = new GenotypeLikelihoods[MAX_N_INDEL_PLOIDY][]; private static final double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp - private void initializeIndelPLCache() { - for( int nInformativeReads = 0; nInformativeReads <= MAX_N_INDEL_INFORMATIVE_READS; nInformativeReads++ ) { - final double homRef = 0.0; - final double het = MathUtils.LOG_ONE_HALF * nInformativeReads; - final double homVar = INDEL_ERROR_RATE * nInformativeReads; - indelPLCache[nInformativeReads] = GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar}); + private final GenotypeLikelihoods indelPLCache(final int ploidy, final int nInformativeReads) { + GenotypeLikelihoods[] indelPLCacheByPloidy = indelPLCache[ploidy]; + if (indelPLCacheByPloidy == null) + return initializeIndelPLCache(ploidy)[nInformativeReads]; + else + return indelPLCacheByPloidy[nInformativeReads]; + } + + private synchronized GenotypeLikelihoods[] initializeIndelPLCache(final int ploidy) { + // Double-check whether another thread has done the initialization. 
+ if (indelPLCache[ploidy] != null) + return indelPLCache[ploidy]; + + final double denominator = - MathUtils.Log10Cache.get(ploidy); + final GenotypeLikelihoods[] result = new GenotypeLikelihoods[MAX_N_INDEL_INFORMATIVE_READS + 1]; + result[0] = GenotypeLikelihoods.fromLog10Likelihoods(new double[ploidy + 1]); + for( int nInformativeReads = 1; nInformativeReads <= MAX_N_INDEL_INFORMATIVE_READS; nInformativeReads++ ) { + final byte indelQual = (byte) Math.round((INDEL_ERROR_RATE * -10)); + final double refLikelihood = QualityUtils.qualToProbLog10(indelQual); + final double altLikelihood = QualityUtils.qualToErrorProbLog10(indelQual); + double[] PLs = new double[ploidy + 1]; + PLs[0] = nInformativeReads * refLikelihood; + for (int altCount = 1; altCount <= ploidy; altCount++) { + final double refLikelihoodAccum = refLikelihood + MathUtils.Log10Cache.get(ploidy - altCount); + final double altLikelihoodAccum = altLikelihood + MathUtils.Log10Cache.get(altCount); + PLs[altCount] = nInformativeReads * (MathUtils.approximateLog10SumLog10(refLikelihoodAccum ,altLikelihoodAccum) + denominator); + } + result[nInformativeReads] = GenotypeLikelihoods.fromLog10Likelihoods(PLs); } + indelPLCache[ploidy] = result; + return result; } /** @@ -279,6 +313,7 @@ public class ReferenceConfidenceModel { * @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips * @return a RefVsAnyResult genotype call */ + @Deprecated public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) { final RefVsAnyResult result = new RefVsAnyResult(); @@ -305,6 +340,73 @@ public class ReferenceConfidenceModel { return result; } + /** + * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt + * + * @param sampleName target sample name. 
+ * @param ploidy target sample ploidy. + * @param genotypingModel model to calculate likelihoods and genotypes. + * @param pileup the read backed pileup containing the data we want to evaluate + * @param refBase the reference base at this pileup position + * @param minBaseQual the min base quality for a read in the pileup at the pileup position to be included in the calculation + * @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips + * @return a RefVsAnyResult genotype call. + */ + public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final String sampleName, final int ploidy, + final GenotypingModel genotypingModel, + final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) { + final AlleleList alleleList = new IndexedAlleleList<>(Allele.create(refBase,true),GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + // Note that the sample name is rather irrelevant as this information is never used; it just needs to be the same in both lines below. + + final int maximumReadCount = pileup.getReads().size(); + + final List reads = new ArrayList<>(maximumReadCount); + final double[][] likelihoods = new double[2][maximumReadCount]; + final int[] adCounts = new int[2]; + int nextIndex = 0; + for (final PileupElement p : pileup) { + final byte qual = p.isDeletion() ? 
REF_MODEL_DELETION_QUAL : p.getQual(); + if (!p.isDeletion() && qual <= minBaseQual) + continue; + final GATKSAMRecord read = p.getRead(); + reads.add(read); + final boolean isAlt = p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() + || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip(); + final int bestAllele; + final int worstAllele; + if (isAlt) { + bestAllele = 1; + worstAllele = 0; + } else { + bestAllele = 0; + worstAllele = 1; + } + + likelihoods[bestAllele][nextIndex] = QualityUtils.qualToProbLog10(qual); + likelihoods[worstAllele][nextIndex++] = QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD; + adCounts[bestAllele]++; + if (isAlt && hqSoftClips != null && p.isNextToSoftClip()) + hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(read, (byte) 28)); + } + + final Map> sampleToReads = Collections.singletonMap(sampleName,reads); + final ReadLikelihoods readLikelihoods = new ReadLikelihoods<>(new IndexedSampleList(sampleName),alleleList,sampleToReads); + final ReadLikelihoods.Matrix sampleLikelihoods = readLikelihoods.sampleMatrix(0); + final int readCount = sampleLikelihoods.readCount(); + for (int i = 0; i < readCount; i++) { + sampleLikelihoods.set(0,i,likelihoods[0][i]); + sampleLikelihoods.set(1,i,likelihoods[1][i]); + } + + final PloidyModel ploidyModel = new HomogeneousPloidyModel(new IndexedSampleList(sampleName),ploidy); + final GenotypingLikelihoods genotypingLikelihoods = genotypingModel.calculateLikelihoods(alleleList, new GenotypingData<>(ploidyModel, readLikelihoods)); + final double[] genotypeLikelihoodArray = genotypingLikelihoods.sampleLikelihoods(0).getAsVector(); + final RefVsAnyResult result = new RefVsAnyResult(genotypeLikelihoodArray.length); + System.arraycopy(genotypeLikelihoodArray,0,result.genotypeLikelihoods,0,genotypeLikelihoodArray.length); + System.arraycopy(adCounts,0,result.AD_Ref_Any,0,2); + return result; + } + /** * Get a list of 
pileups that span the entire active region span, in order, one for each position */ @@ -330,7 +432,7 @@ public class ReferenceConfidenceModel { debuggingWriter.addAlignment(read); final LocusIteratorByState libs = new LocusIteratorByState(reads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING, - true, genomeLocParser, samples, false); + true, genomeLocParser, SampleListUtils.asSet(samples), false); final List pileups = new LinkedList<>(); final int startPos = activeRegionSpan.getStart(); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java index 6f71868bb..a798c760f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java @@ -49,6 +49,9 @@ package org.broadinstitute.gatk.tools.walkers.indels; import com.google.java.contract.Ensures; import htsjdk.variant.variantcontext.Allele; import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.genotyping.AlleleList; +import org.broadinstitute.gatk.genotyping.IndexedAlleleList; +import org.broadinstitute.gatk.genotyping.IndexedSampleList; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.clipping.ReadClipper; import org.broadinstitute.gatk.utils.exceptions.UserException; @@ -444,10 +447,10 @@ public class PairHMMIndelErrorModel { // Apparently more than one allele can map to the same haplotype after trimming final Set distinctHaplotypesSet = new LinkedHashSet<>(trimmedHaplotypeMap.values()); - final List distinctHaplotypesList = Arrays.asList(distinctHaplotypesSet.toArray(new Haplotype[distinctHaplotypesSet.size()])); + final AlleleList distinctHaplotypesList = new 
IndexedAlleleList<>(distinctHaplotypesSet.toArray(new Haplotype[distinctHaplotypesSet.size()])); // Get the likelihoods for our clipped read against each of our trimmed haplotypes. final ReadLikelihoods rl = new ReadLikelihoods<>( - Collections.singletonList("DUMMY_SAMPLE"),distinctHaplotypesList,Collections.singletonMap("DUMMY_SAMPLE",Collections.singletonList(processedRead))); + new IndexedSampleList(Collections.singletonList("DUMMY_SAMPLE")),distinctHaplotypesList,Collections.singletonMap("DUMMY_SAMPLE",Collections.singletonList(processedRead))); final ReadLikelihoods.Matrix dummySampleLikelihoods = rl.sampleMatrix(0); pairHMM.computeLikelihoods(rl.sampleMatrix(0), Collections.singletonList(processedRead), readGCPArrayMap); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java index 432bfc770..f78519f3f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java @@ -46,6 +46,7 @@ package org.broadinstitute.gatk.tools.walkers.validation; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.walkers.*; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; @@ -349,13 +350,14 @@ public class GenotypeAndValidate extends RodWalker= 0) uac.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = emitConf; if (callConf >= 0) uac.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = callConf; + final GenomeAnalysisEngine toolkit = getToolkit(); uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; - snpEngine = new UnifiedGenotypingEngine(getToolkit(), uac); + snpEngine = new 
UnifiedGenotypingEngine(uac,toolkit); // Adding the INDEL calling arguments for UG UnifiedArgumentCollection uac_indel = uac.clone(); uac_indel.GLmodel = GenotypeLikelihoodsCalculationModel.Model.INDEL; - indelEngine = new UnifiedGenotypingEngine(getToolkit(), uac_indel); + indelEngine = new UnifiedGenotypingEngine(uac_indel,toolkit); // make sure we have callConf set to the threshold set by the UAC so we can use it later. callConf = uac.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java index ebcb82444..9d18e9981 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java @@ -46,8 +46,12 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection; import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingEngine; +import org.broadinstitute.gatk.genotyping.IndexedSampleList; +import org.broadinstitute.gatk.genotyping.SampleList; +import org.broadinstitute.gatk.genotyping.SampleListUtils; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; @@ -111,6 +115,7 @@ import java.util.*; */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=-10,stop=10)) +@SuppressWarnings("unused") public class GenotypeGVCFs extends RodWalker implements AnnotatorCompatible, TreeReducible { /** @@ -159,12 
+164,13 @@ public class GenotypeGVCFs extends RodWalker variantCollection : variantCollections ) variants.addAll(variantCollection.getRodBindings()); - final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), variants); - final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + final GenomeAnalysisEngine toolkit = getToolkit(); + final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(toolkit, variants); + final SampleList samples = new IndexedSampleList(SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); // create the genotyping engine - genotypingEngine = new UnifiedGenotypingEngine(getToolkit(), createUAC(), samples); + genotypingEngine = new UnifiedGenotypingEngine(createUAC(), samples, toolkit.getGenomeLocParser(), toolkit.getArguments().BAQMode); // create the annotation engine - annotationEngine = new VariantAnnotatorEngine(Arrays.asList("none"), annotationsToUse, Collections.emptyList(), this, getToolkit()); + annotationEngine = new VariantAnnotatorEngine(Arrays.asList("none"), annotationsToUse, Collections.emptyList(), this, toolkit); // take care of the VCF headers final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); @@ -179,7 +185,8 @@ public class GenotypeGVCFs extends RodWalker sampleNameSet = SampleListUtils.asSet(samples); + final VCFHeader vcfHeader = new VCFHeader(headerLines, sampleNameSet); vcfWriter.writeHeader(vcfHeader); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java index 50e8bd416..e6cc614e6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java @@ -46,6 +46,10 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.genotyping.IndexedSampleList; +import org.broadinstitute.gatk.genotyping.SampleList; +import org.broadinstitute.gatk.genotyping.SampleListUtils; import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; @@ -120,14 +124,18 @@ public class RegenotypeVariants extends RodWalker implements T UAC.genotypingOutputMode = GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES; String trackName = variantCollection.variants.getName(); - Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); - UG_engine = new UnifiedGenotypingEngine(getToolkit(), UAC, samples); + + final GenomeAnalysisEngine toolkit = getToolkit(); + final SampleList samples = + new IndexedSampleList(SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName))); + final Set sampleNameSet = SampleListUtils.asSet(samples); + UG_engine = new UnifiedGenotypingEngine(UAC, samples,toolkit.getGenomeLocParser(),toolkit.getArguments().BAQMode); final Set hInfo = new HashSet(); hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(trackName))); hInfo.addAll(UnifiedGenotyper.getHeaderInfo(UAC, null, null)); - vcfWriter.writeHeader(new VCFHeader(hInfo, samples)); + vcfWriter.writeHeader(new VCFHeader(hInfo, sampleNameSet)); } /** diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java index e55bf4fa0..5659e76a0 100644 --- 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java @@ -46,15 +46,14 @@ package org.broadinstitute.gatk.utils.gvcf; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.*; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import java.util.Collections; import java.util.HashMap; import java.util.LinkedList; import java.util.List; @@ -70,9 +69,7 @@ public class GVCFWriter implements VariantContextWriter { // // static VCF field names // - protected final static String BLOCK_SIZE_INFO_FIELD = "BLOCK_SIZE"; protected final static String MIN_DP_FORMAT_FIELD = "MIN_DP"; - protected final static String MIN_GQ_FORMAT_FIELD = "MIN_GQ"; // // Final fields initialized in constructor @@ -87,6 +84,7 @@ public class GVCFWriter implements VariantContextWriter { String contigOfNextAvailableStart = null; private String sampleName = null; private HomRefBlock currentBlock = null; + private final int defaultPloidy; /** * Is the proposed GQ partitions well-formed? 
@@ -94,7 +92,7 @@ public class GVCFWriter implements VariantContextWriter { * @param GQPartitions proposed GQ partitions * @return a non-null string if something is wrong (string explains issue) */ - protected static List parsePartitions(final List GQPartitions) { + protected static List parsePartitions(final List GQPartitions, final int defaultPloidy) { if ( GQPartitions == null ) throw new IllegalArgumentException("GQpartitions cannot be null"); if ( GQPartitions.isEmpty() ) throw new IllegalArgumentException("GQpartitions cannot be empty"); @@ -104,10 +102,10 @@ public class GVCFWriter implements VariantContextWriter { if ( value == null ) throw new IllegalArgumentException("GQPartitions contains a null integer"); if ( value < lastThreshold ) throw new IllegalArgumentException("GQPartitions is out of order. Last is " + lastThreshold + " but next is " + value); if ( value == lastThreshold ) throw new IllegalArgumentException("GQPartitions is equal elements: Last is " + lastThreshold + " but next is " + value); - result.add(new HomRefBlock(lastThreshold, value)); + result.add(new HomRefBlock(lastThreshold, value,defaultPloidy)); lastThreshold = value; } - result.add(new HomRefBlock(lastThreshold, Integer.MAX_VALUE)); + result.add(new HomRefBlock(lastThreshold, Integer.MAX_VALUE,defaultPloidy)); return result; } @@ -128,11 +126,13 @@ public class GVCFWriter implements VariantContextWriter { * * @param underlyingWriter the ultimate destination of the GVCF records * @param GQPartitions a well-formed list of GQ partitions + * @param defaultPloidy the assumed ploidy for input variant context without one. 
*/ - public GVCFWriter(final VariantContextWriter underlyingWriter, final List GQPartitions) { + public GVCFWriter(final VariantContextWriter underlyingWriter, final List GQPartitions, final int defaultPloidy) { if ( underlyingWriter == null ) throw new IllegalArgumentException("underlyingWriter cannot be null"); this.underlyingWriter = underlyingWriter; - this.GQPartitions = parsePartitions(GQPartitions); + this.GQPartitions = parsePartitions(GQPartitions,defaultPloidy); + this.defaultPloidy = defaultPloidy; } /** @@ -148,10 +148,6 @@ public class GVCFWriter implements VariantContextWriter { header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); header.addMetaDataLine(new VCFFormatHeaderLine(MIN_DP_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block")); - // These annotations are no longer standard - //header.addMetaDataLine(new VCFInfoHeaderLine(BLOCK_SIZE_INFO_FIELD, 1, VCFHeaderLineType.Integer, "Size of the homozygous reference GVCF block")); - //header.addMetaDataLine(new VCFFormatHeaderLine(MIN_GQ_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum GQ observed within the GVCF block")); - for ( final HomRefBlock partition : GQPartitions ) { header.addMetaDataLine(partition.toVCFHeaderLine()); } @@ -188,27 +184,30 @@ public class GVCFWriter implements VariantContextWriter { * @return a VariantContext to be emitted, or null if non is appropriate */ protected VariantContext addHomRefSite(final VariantContext vc, final Genotype g) { + if ( nextAvailableStart != -1 ) { // don't create blocks while the hom-ref site falls before nextAvailableStart (for deletions) - if ( vc.getStart() <= nextAvailableStart && vc.getChr().equals(contigOfNextAvailableStart) ) { + if ( vc.getStart() <= nextAvailableStart && vc.getChr().equals(contigOfNextAvailableStart) ) return null; - } // otherwise, reset to non-relevant nextAvailableStart = -1; contigOfNextAvailableStart = null; } - if ( currentBlock == null ) { - 
currentBlock = createNewBlock(vc, g); - return null; - } else if ( currentBlock.withinBounds(g.getGQ()) ) { + final VariantContext result; + if (genotypeCanBeMergedInCurrentBlock(g)) { currentBlock.add(vc.getStart(), g); - return null; + result = null; } else { - final VariantContext result = blockToVCF(currentBlock); + result = blockToVCF(currentBlock); currentBlock = createNewBlock(vc, g); - return result; } + return result; + } + + private boolean genotypeCanBeMergedInCurrentBlock(final Genotype g) { + return currentBlock != null && currentBlock.withinBounds(g.getGQ()) && currentBlock.getPloidy() == g.getPloidy() + && (currentBlock.getMinPLs() == null || !g.hasPL() || (currentBlock.getMinPLs().length == g.getPL().length)); } /** @@ -226,21 +225,20 @@ public class GVCFWriter implements VariantContextWriter { * Convert a HomRefBlock into a VariantContext * * @param block the block to convert - * @return a VariantContext representing the gVCF encoding for this block + * @return a VariantContext representing the gVCF encoding for this block. + * It will return {@code null} if input {@code block} is {@code null}, indicating that there + * is no variant-context to be output into the VCF. 
*/ private VariantContext blockToVCF(final HomRefBlock block) { - if ( block == null ) throw new IllegalArgumentException("block cannot be null"); + if ( block == null ) return null; final VariantContextBuilder vcb = new VariantContextBuilder(block.getStartingVC()); vcb.attributes(new HashMap(2)); // clear the attributes vcb.stop(block.getStop()); vcb.attribute(VCFConstants.END_KEY, block.getStop()); - // This annotation is no longer standard - //vcb.attribute(BLOCK_SIZE_INFO_FIELD, block.getSize()); - // create the single Genotype with GQ and DP annotations - final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Collections.nCopies(2, block.getRef())); + final GenotypeBuilder gb = new GenotypeBuilder(sampleName, GATKVariantContextUtils.homozygousAlleleList(block.getRef(),block.getPloidy())); gb.noAD().noPL().noAttributes(); // clear all attributes gb.GQ(block.getMedianGQ()); gb.DP(block.getMedianDP()); @@ -269,10 +267,12 @@ public class GVCFWriter implements VariantContextWriter { break; } } - if ( partition == null ) throw new IllegalStateException("GQ " + g + " from " + vc + " didn't fit into any partition " + partition); + + if ( partition == null ) + throw new IllegalStateException("GQ " + g + " from " + vc + " didn't fit into any partition"); // create the block, add g to it, and return it for use - final HomRefBlock block = new HomRefBlock(vc, partition.getGQLowerBound(), partition.getGQUpperBound()); + final HomRefBlock block = new HomRefBlock(vc, partition.getGQLowerBound(), partition.getGQUpperBound(), defaultPloidy); block.add(vc.getStart(), g); return block; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlock.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlock.java index 179217b91..200eb1366 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlock.java +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlock.java @@ -46,11 +46,11 @@ package org.broadinstitute.gatk.utils.gvcf; -import org.broadinstitute.gatk.utils.MathUtils; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeaderLine; +import org.broadinstitute.gatk.utils.MathUtils; import java.util.ArrayList; import java.util.List; @@ -75,6 +75,7 @@ final class HomRefBlock { final private List GQs = new ArrayList<>(100); final private List DPs = new ArrayList<>(100); private final Allele ref; + private final int ploidy; /** * Create a new HomRefBlock @@ -83,7 +84,7 @@ final class HomRefBlock { * @param minGQ the minGQ (inclusive) to use in this band * @param maxGQ the maxGQ (exclusive) to use in this band */ - public HomRefBlock(final VariantContext startingVC, int minGQ, int maxGQ) { + public HomRefBlock(final VariantContext startingVC, final int minGQ, final int maxGQ, final int defaultPloidy) { if ( startingVC == null ) throw new IllegalArgumentException("startingVC cannot be null"); if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ); @@ -92,6 +93,7 @@ final class HomRefBlock { this.ref = startingVC.getReference(); this.minGQ = minGQ; this.maxGQ = maxGQ; + this.ploidy = startingVC.getMaxPloidy(defaultPloidy); } /** @@ -100,7 +102,7 @@ final class HomRefBlock { * @param minGQ the minGQ (inclusive) to use in this band * @param maxGQ the maxGQ (exclusive) to use in this band */ - public HomRefBlock(int minGQ, int maxGQ) { + public HomRefBlock(final int minGQ, final int maxGQ, final int ploidy) { if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ); this.startingVC = null; @@ -108,6 +110,7 @@ final class HomRefBlock { this.ref = null; this.minGQ = minGQ; this.maxGQ = maxGQ; + this.ploidy = ploidy; 
} /** @@ -119,19 +122,18 @@ final class HomRefBlock { if ( ! g.hasGQ() ) throw new IllegalArgumentException("g must have GQ field"); if ( ! g.hasPL() ) throw new IllegalArgumentException("g must have PL field"); if ( pos != stop + 1 ) throw new IllegalArgumentException("adding genotype at pos " + pos + " isn't contiguous with previous stop " + stop); + if ( g.getPloidy() != ploidy) + throw new IllegalArgumentException("cannot add a genotype with a different ploidy: " + g.getPloidy() + " != " + ploidy); - if( minPLs == null ) { // if the minPLs vector has not been set yet, create it here by copying the provided genotype's PLs + if( minPLs == null ) + minPLs = g.getPL(); + else { // otherwise take the min with the provided genotype's PLs final int[] PL = g.getPL(); - if( PL.length == 3 ) { - minPLs = PL.clone(); - } - } else { // otherwise take the min with the provided genotype's PLs - final int[] PL = g.getPL(); - if( PL.length == 3 ) { - minPLs[0] = Math.min(minPLs[0], PL[0]); - minPLs[1] = Math.min(minPLs[1], PL[1]); - minPLs[2] = Math.min(minPLs[2], PL[2]); - } + if (PL.length != minPLs.length) + throw new IllegalStateException("trying to merge different PL array sizes: " + PL.length + " != " + minPLs.length); + for (int i = 0; i < PL.length; i++) + if (minPLs[i] > PL[i]) + minPLs[i] = PL[i]; } stop = pos; GQs.add(Math.min(g.getGQ(), 99)); // cap the GQs by the max. of 99 emission @@ -182,4 +184,12 @@ final class HomRefBlock { public VCFHeaderLine toVCFHeaderLine() { return new VCFHeaderLine("GVCFBlock", "minGQ=" + getGQLowerBound() + "(inclusive),maxGQ=" + getGQUpperBound() + "(exclusive)"); } + + /** + * Get the ploidy of this hom-ref block. 
+ * @return + */ + public int getPloidy() { + return ploidy; + } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java index 2c17d4d5c..af93892a7 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java @@ -48,6 +48,9 @@ package org.broadinstitute.gatk.utils.haplotype; import com.google.java.contract.Requires; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.genotyping.AlleleList; +import org.broadinstitute.gatk.genotyping.AlleleListUtils; +import org.broadinstitute.gatk.genotyping.SampleListUtils; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; @@ -77,7 +80,9 @@ public class HaplotypeLDCalculator { @SuppressWarnings("unchecked") protected HaplotypeLDCalculator() { haplotypes = Collections.emptyList(); - readLikelihoods = new ReadLikelihoods<>((List)Collections.EMPTY_LIST, (List)Collections.EMPTY_LIST, Collections.EMPTY_MAP); + final AlleleList alleleList = AlleleListUtils.emptyList(); + readLikelihoods = new ReadLikelihoods<>(SampleListUtils.emptyList(), + alleleList, Collections.EMPTY_MAP); } public HaplotypeLDCalculator(final List haplotypes, final ReadLikelihoods haplotypeReadMap) { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java index d17c12b30..51f1e04a2 100644 --- 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java @@ -50,15 +50,17 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; // the imports for unit testing. -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.Utils; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.GenotypeLikelihoods; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.genotyping.SampleList; +import org.broadinstitute.gatk.genotyping.SampleListUtils; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.Utils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -76,7 +78,7 @@ public class UnifiedGenotyperEngineUnitTest extends BaseTest { engine.setArguments(new GATKArgumentCollection()); final UnifiedArgumentCollection args = new UnifiedArgumentCollection(); - final Set fakeSamples = Collections.singleton("fake"); + final SampleList fakeSamples = SampleListUtils.singletonList("fake"); ugEngine = new UnifiedGenotypingEngine(engine, args,fakeSamples); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java index 66af674e9..d36b1af16 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java @@ -47,6 +47,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import com.google.caliper.Param; import com.google.caliper.SimpleBenchmark; +import org.broadinstitute.gatk.genotyping.SampleListUtils; import org.broadinstitute.gatk.utils.pairhmm.ActiveRegionTestDataSet; import org.broadinstitute.gatk.utils.pairhmm.FastLoglessPairHMM; import org.broadinstitute.gatk.utils.pairhmm.PairHMM; @@ -112,7 +113,7 @@ public class HCLikelihoodCalculationEnginesBenchmark extends SimpleBenchmark { public void timeGraphBasedLikelihoods(final int reps) { for (int i = 0; i < reps; i++) { final GraphBasedLikelihoodCalculationEngineInstance rtlce = new GraphBasedLikelihoodCalculationEngineInstance(dataSet.assemblyResultSet(), new FastLoglessPairHMM((byte)10),Double.NEGATIVE_INFINITY,HeterogeneousKmerSizeResolution.COMBO_MAX); - rtlce.computeReadLikelihoods(dataSet.haplotypeList(), Collections.singletonList("anonymous"), Collections.singletonMap("anonymous", dataSet.readList())); + rtlce.computeReadLikelihoods(dataSet.haplotypeList(), SampleListUtils.singletonList("anonymous"), Collections.singletonMap("anonymous", dataSet.readList())); } } @@ -121,7 +122,7 @@ public class HCLikelihoodCalculationEnginesBenchmark extends SimpleBenchmark { for (int i = 0; i < reps; i++) { final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine((byte) 10, PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3, true, PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.NONE); - 
engine.computeReadLikelihoods(dataSet.assemblyResultSet(), Collections.singletonList("anonymous"), Collections.singletonMap("anonymous", dataSet.readList())); + engine.computeReadLikelihoods(dataSet.assemblyResultSet(), SampleListUtils.singletonList("anonymous"), Collections.singletonMap("anonymous", dataSet.readList())); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java index db6923feb..cb946aafe 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java @@ -47,6 +47,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.genotyping.SampleListUtils; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.HaplotypeGraph; import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; @@ -262,7 +263,7 @@ public class ReadThreadingLikelihoodCalculationEngineUnitTest extends ActiveRegi dataSet = (ActiveRegionTestDataSet) params[0]; if (INTRODUCE_READ_ERRORS) dataSet.introduceErrors(new Random(13)); graphEngine = new GraphBasedLikelihoodCalculationEngineInstance(dataSet.assemblyResultSet(),hmm,Double.NEGATIVE_INFINITY, HeterogeneousKmerSizeResolution.COMBO_MAX); - graphLks = graphEngine.computeReadLikelihoods(dataSet.haplotypeList(),Collections.singletonList("anonymous"),Collections.singletonMap("anonymous",dataSet.readList())).toPerReadAlleleLikelihoodMap(0); + 
graphLks = graphEngine.computeReadLikelihoods(dataSet.haplotypeList(), SampleListUtils.singletonList("anonymous"),Collections.singletonMap("anonymous",dataSet.readList())).toPerReadAlleleLikelihoodMap(0); // clip reads at the anchors. final Map clippedReads = anchorClippedReads(graphEngine.getHaplotypeGraph(),dataSet.readList()); @@ -272,7 +273,7 @@ public class ReadThreadingLikelihoodCalculationEngineUnitTest extends ActiveRegi clippedReadList.add(clippedReads.containsKey(r) ? clippedReads.get(r) : r); } - loglessLks = fullPairHMM.computeReadLikelihoods(dataSet.assemblyResultSet(),Collections.singletonList("anonymous"),Collections.singletonMap("anonymous",clippedReadList)).toPerReadAlleleLikelihoodMap(0); + loglessLks = fullPairHMM.computeReadLikelihoods(dataSet.assemblyResultSet(),SampleListUtils.singletonList("anonymous"),Collections.singletonMap("anonymous",clippedReadList)).toPerReadAlleleLikelihoodMap(0); // Change clipped by unclipped in the resulting likelihood map. for (final GATKSAMRecord r : clippedReads.keySet()) { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java index 08edeee98..7c2cd8727 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java @@ -47,11 +47,12 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import htsjdk.samtools.SAMFileHeader; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.UnvalidatingGenomeLoc; -import 
org.broadinstitute.gatk.utils.Utils; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeLikelihoods; +import htsjdk.variant.variantcontext.GenotypeType; +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.genotyping.*; +import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; @@ -61,7 +62,7 @@ import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import htsjdk.variant.variantcontext.*; +import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; @@ -75,7 +76,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { final String RGID = "ID1"; GATKSAMReadGroupRecord rg; final String sample = "NA12878"; - final Set samples = Collections.singleton(sample); + final SampleList samples = SampleListUtils.singletonList(sample); SAMFileHeader header; ReferenceConfidenceModel model; @@ -179,12 +180,12 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { @Test public void testIndelLikelihoods() { - GenotypeLikelihoods prev = model.getIndelPLs(0); + GenotypeLikelihoods prev = model.getIndelPLs(HomoSapiensConstants.DEFAULT_PLOIDY,0); Assert.assertEquals(prev.getAsPLs(), new int[]{0, 0, 0}); Assert.assertEquals(-10 * prev.getLog10GQ(GenotypeType.HOM_REF), 0.0); for ( int i = 1; i <= ReferenceConfidenceModel.MAX_N_INDEL_INFORMATIVE_READS; i++ ) { - final GenotypeLikelihoods current = model.getIndelPLs(i); + final GenotypeLikelihoods current = model.getIndelPLs(HomoSapiensConstants.DEFAULT_PLOIDY,i); 
final double prevGQ = -10 * prev.getLog10GQ(GenotypeType.HOM_REF); final double currGQ = -10 * current.getLog10GQ(GenotypeType.HOM_REF); Assert.assertTrue(prevGQ < currGQ, "GQ Failed with prev " + prev + " curr " + current + " at " + i); @@ -288,10 +289,12 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { data.getActiveRegion().add(data.makeRead(0, data.getRefLength())); } - final ReadLikelihoods likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); + final ReadLikelihoods likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), samples, data.getActiveRegion()); + final PloidyModel ploidyModel = new HomogeneousPloidyModel(samples,2); + final GenotypingModel genotypingModel = new InfiniteRandomMatingPopulationModel(); final List expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads); - final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); + final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, ploidyModel, genotypingModel, calls); checkReferenceModelResult(data, contexts, expectedDPs, calls); } @@ -304,12 +307,14 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { final List haplotypes = Arrays.asList(data.getRefHap()); final List calls = Collections.emptyList(); + final PloidyModel ploidyModel = new HomogeneousPloidyModel(samples,2); + final GenotypingModel genotypingModel = new InfiniteRandomMatingPopulationModel(); data.getActiveRegion().add(data.makeRead(start, readLen)); - final ReadLikelihoods likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); + final ReadLikelihoods likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), 
samples, data.getActiveRegion()); final List expectedDPs = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), 0)); for ( int i = start; i < readLen + start; i++ ) expectedDPs.set(i, 1); - final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); + final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, ploidyModel, genotypingModel, calls); checkReferenceModelResult(data, contexts, expectedDPs, calls); } } @@ -340,10 +345,12 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { data.getActiveRegion().add(data.makeRead(0, data.getRefLength())); } - final ReadLikelihoods likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); + final ReadLikelihoods likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), samples, data.getActiveRegion()); + final PloidyModel ploidyModel = new HomogeneousPloidyModel(samples,HomoSapiensConstants.DEFAULT_PLOIDY); + final GenotypingModel genotypingModel = new InfiniteRandomMatingPopulationModel(); final List expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads); - final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); + final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, ploidyModel, genotypingModel, calls); checkReferenceModelResult(data, contexts, expectedDPs, calls); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java 
index 61a6860fd..33e1a4758 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java @@ -48,6 +48,8 @@ package org.broadinstitute.gatk.utils.genotyper; import htsjdk.samtools.SAMFileHeader; import htsjdk.variant.variantcontext.Allele; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.genotyping.IndexedAlleleList; +import org.broadinstitute.gatk.genotyping.IndexedSampleList; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; @@ -72,7 +74,7 @@ public class ReadLikelihoodsUnitTest @Test(dataProvider = "dataSets") public void testInstantiationAndQuery(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods result = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final ReadLikelihoods result = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); Assert.assertEquals(result.sampleCount(), samples.length); Assert.assertEquals(result.alleleCount(), alleles.length); @@ -85,7 +87,7 @@ public class ReadLikelihoodsUnitTest @Test(dataProvider = "dataSets") public void testLikelihoodFillingAndQuery(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods result = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final ReadLikelihoods result = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); final double[][][] likelihoods = fillWithRandomLikelihoods(samples, alleles, result); testLikelihoodMatrixQueries(samples, result, likelihoods); } @@ -106,7 +108,7 @@ public class ReadLikelihoodsUnitTest @Test(dataProvider = "dataSets") 
public void testBestAlleles(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); fillWithRandomLikelihoods(samples,alleles,original); final int alleleCount = alleles.length; for (int s = 0; s < samples.length; s++) { @@ -146,7 +148,7 @@ public class ReadLikelihoodsUnitTest @Test(dataProvider = "dataSets") public void testBestAlleleMap(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); fillWithRandomLikelihoods(samples,alleles,original); final Map> expected = new HashMap<>(alleles.length); for (final Allele allele : alleles) @@ -171,7 +173,7 @@ public class ReadLikelihoodsUnitTest } } if ((bestAlleleLk - secondBestAlleleLk) > ReadLikelihoods.BestAllele.INFORMATIVE_THRESHOLD) - expected.get(alleles[bestAlleleIndex]).add(sampleMatrix.read(r)); + expected.get(alleles[bestAlleleIndex]).add(sampleMatrix.readAt(r)); } } @@ -189,7 +191,7 @@ public class ReadLikelihoodsUnitTest @Test(dataProvider = "dataSets") public void testFilterPoorlyModeledReads(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); for (int s = 0; s < samples.length; s++) { final int sampleReadCount = original.sampleReadCount(s); @@ -220,7 +222,7 @@ public class ReadLikelihoodsUnitTest @Test(dataProvider = "dataSets") public void 
testFilterReadsToOverlap(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); final GenomeLoc evenReadOverlap = locParser.createGenomeLoc(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(),EVEN_READ_START ,EVEN_READ_START ); fillWithRandomLikelihoods(samples,alleles,original); final ReadLikelihoods result = original.clone(); @@ -231,7 +233,7 @@ public class ReadLikelihoodsUnitTest newLikelihoods[s][a] = new double[(original.sampleReadCount(s) + 1) / 2]; final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s); for (int r = 0; r < newLikelihoods[s][a].length; r++) { - Assert.assertEquals(result.readIndex(s,sampleMatrix.read(r << 1)),r); + Assert.assertEquals(result.readIndex(s,sampleMatrix.readAt(r << 1)),r); newLikelihoods[s][a][r] = sampleMatrix.get(a, r << 1); } } @@ -240,14 +242,14 @@ public class ReadLikelihoodsUnitTest @Test(dataProvider = "marginalizationDataSets") public void testMarginalizationWithOverlap(final String[] samples, final Allele[] alleles, final Map> reads, final Map> newToOldAlleleMapping) { - final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); final GenomeLoc evenReadOverlap = locParser.createGenomeLoc(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(),EVEN_READ_START ,EVEN_READ_START ); fillWithRandomLikelihoods(samples, alleles, original); final ReadLikelihoods marginalized = original.marginalize(newToOldAlleleMapping,evenReadOverlap); Assert.assertNotNull(marginalized); 
Assert.assertEquals(newToOldAlleleMapping.size(),marginalized.alleleCount()); for (int a = 0; a < marginalized.alleleCount(); a++) { - final List oldAlleles = newToOldAlleleMapping.get(marginalized.allele(a)); + final List oldAlleles = newToOldAlleleMapping.get(marginalized.alleleAt(a)); Assert.assertNotNull(oldAlleles); for (int s = 0; s < samples.length; s++) { final ReadLikelihoods.Matrix oldSmapleLikelihoods = original.sampleMatrix(s); @@ -268,13 +270,13 @@ public class ReadLikelihoodsUnitTest @Test(dataProvider = "marginalizationDataSets") public void testMarginalization(final String[] samples, final Allele[] alleles, final Map> reads, final Map> newToOldAlleleMapping) { - final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); fillWithRandomLikelihoods(samples, alleles, original); final ReadLikelihoods marginalized = original.marginalize(newToOldAlleleMapping); Assert.assertNotNull(marginalized); Assert.assertEquals(newToOldAlleleMapping.size(),marginalized.alleleCount()); for (int a = 0; a < marginalized.alleleCount(); a++) { - final List oldAlleles = newToOldAlleleMapping.get(marginalized.allele(a)); + final List oldAlleles = newToOldAlleleMapping.get(marginalized.alleleAt(a)); Assert.assertNotNull(oldAlleles); for (int s = 0; s < samples.length; s++) { final ReadLikelihoods.Matrix oldSmapleLikelihoods = original.sampleMatrix(s); @@ -295,7 +297,7 @@ public class ReadLikelihoodsUnitTest @Test(dataProvider = "dataSets") public void testNormalizeBestToZero(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); final 
double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); final ReadLikelihoods result= original.clone(); result.normalizeLikelihoods(true, Double.NEGATIVE_INFINITY); @@ -321,7 +323,7 @@ public class ReadLikelihoodsUnitTest @Test(dataProvider = "dataSets") public void testNormalizeCapWorstLK(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); final ReadLikelihoods result= original.clone(); result.normalizeLikelihoods(false, - 0.001); @@ -354,7 +356,7 @@ public class ReadLikelihoodsUnitTest @Test(dataProvider = "dataSets") public void testNormalizeCapWorstLKAndBestToZero(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); final ReadLikelihoods result= original.clone(); result.normalizeLikelihoods(true, - 0.001); @@ -390,7 +392,7 @@ public class ReadLikelihoodsUnitTest @Test(dataProvider = "dataSets") public void testAddMissingAlleles(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); final 
ReadLikelihoods result = original.clone(); @@ -411,8 +413,8 @@ public class ReadLikelihoodsUnitTest Assert.assertEquals(original.alleleCount() + 1, result.alleleCount()); // We add too more amongst exisisting alleles: - result.addMissingAlleles(Arrays.asList(newTwo = Allele.create("ATATATTATATTAATATT".getBytes(), false),result.allele(1), - result.allele(0),newThree = Allele.create("TGTGTGTATTG".getBytes(),false),Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-6.54321); + result.addMissingAlleles(Arrays.asList(newTwo = Allele.create("ATATATTATATTAATATT".getBytes(), false),result.alleleAt(1), + result.alleleAt(0),newThree = Allele.create("TGTGTGTATTG".getBytes(),false),Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-6.54321); Assert.assertEquals(original.alleleCount()+3,result.alleleCount()); @@ -439,7 +441,7 @@ public class ReadLikelihoodsUnitTest @Test(dataProvider = "dataSets") public void testAddNonRefAllele(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); final ReadLikelihoods result = original.clone(); result.addNonReferenceAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); @@ -473,13 +475,13 @@ public class ReadLikelihoodsUnitTest private void testLikelihoodMatrixQueries(String[] samples, ReadLikelihoods result, final double[][][] likelihoods) { for (final String sample : samples) { final int sampleIndex = result.sampleIndex(sample); - final double[][] likelihoodMatrix = result.sampleValues(sampleIndex); final int sampleReadCount = result.sampleReadCount(sampleIndex); - Assert.assertEquals(result.alleleCount(), likelihoodMatrix.length); - for (int a = 0; a < likelihoodMatrix.length; a++) 
{ - Assert.assertEquals(likelihoodMatrix[a].length,sampleReadCount); + final int alleleCount = result.alleleCount(); + Assert.assertEquals(result.alleleCount(), alleleCount); + for (int a = 0; a < alleleCount; a++) { + Assert.assertEquals(result.sampleReadCount(0),sampleReadCount); for (int r = 0; r < sampleReadCount; r++) - Assert.assertEquals(likelihoodMatrix[a][r], + Assert.assertEquals(result.sampleMatrix(0).get(a,r), likelihoods == null ? 0.0 : likelihoods[sampleIndex][a][r], EPSILON); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java index 3228e87d6..1f0280c82 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java @@ -46,14 +46,14 @@ package org.broadinstitute.gatk.utils.gvcf; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.ReferenceConfidenceModel; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants; import org.testng.Assert; import org.testng.annotations.BeforeMethod; import org.testng.annotations.DataProvider; @@ -100,21 +100,21 @@ public class GVCFWriterUnitTest extends BaseTest { @Test public void testHeaderWriting() { - final GVCFWriter writer = new GVCFWriter(mockWriter, 
standardPartition); + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY); writer.writeHeader(new VCFHeader()); Assert.assertTrue(mockWriter.headerWritten); } @Test public void testClose() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY); writer.close(); Assert.assertTrue(mockWriter.closed); } @Test public void testCloseWithoutClosingUnderlyingWriter() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY); writer.close(false); Assert.assertFalse(mockWriter.closed); } @@ -164,7 +164,7 @@ public class GVCFWriterUnitTest extends BaseTest { @Test public void testCloseEmitsLastVariant() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY); writer.add(makeHomRef("20", 1, 30)); Assert.assertEquals(mockWriter.emitted.size(), 0); @@ -176,7 +176,7 @@ public class GVCFWriterUnitTest extends BaseTest { @Test public void testCloseDoesntEmitsLastVariantWhenNonRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY); writer.add(makeNonRef("20", 1, 30)); Assert.assertEquals(mockWriter.emitted.size(), 1); @@ -188,7 +188,7 @@ public class GVCFWriterUnitTest extends BaseTest { @Test public void testCrossingContigBoundaryRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY); writer.add(makeHomRef("20", 1, 30)); writer.add(makeHomRef("20", 2, 30)); @@ -204,7 
+204,7 @@ public class GVCFWriterUnitTest extends BaseTest { @Test public void testCrossingContigBoundaryToLowerPositionsRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY); writer.add(makeHomRef("20", 30, 30)); writer.add(makeHomRef("20", 31, 30)); @@ -220,7 +220,7 @@ public class GVCFWriterUnitTest extends BaseTest { @Test public void testCrossingContigBoundaryFromNonRefToLowerPositionsRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY); writer.add(makeNonRef("20", 20, 30)); Assert.assertEquals(mockWriter.emitted.size(), 1); @@ -235,7 +235,7 @@ public class GVCFWriterUnitTest extends BaseTest { @Test public void testCrossingContigBoundaryNonRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY); writer.add(makeHomRef("20", 1, 30)); writer.add(makeHomRef("20", 2, 30)); @@ -248,7 +248,7 @@ public class GVCFWriterUnitTest extends BaseTest { @Test public void testCrossingContigBoundaryNonRefThenNonRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY); writer.add(makeNonRef("20", 1, 30)); Assert.assertEquals(mockWriter.emitted.size(), 1); @@ -283,7 +283,7 @@ public class GVCFWriterUnitTest extends BaseTest { @Test public void testVariantForcesNonRef() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY); writer.add(makeHomRef("20", 1, 30)); writer.add(makeHomRef("20", 2, 30)); @@ 
-300,7 +300,7 @@ public class GVCFWriterUnitTest extends BaseTest { @Test public void testEmittingTwoBands() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY); writer.add(makeHomRef("20", 1, 0)); writer.add(makeHomRef("20", 2, 0)); @@ -315,7 +315,7 @@ public class GVCFWriterUnitTest extends BaseTest { @Test public void testNonContiguousBlocks() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY); writer.add(makeHomRef("20", 1, 0)); writer.add(makeHomRef("20", 2, 0)); @@ -329,7 +329,7 @@ public class GVCFWriterUnitTest extends BaseTest { @Test public void testDeletion() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY); writer.add(makeHomRef("20", 1, 0)); writer.add(makeHomRef("20", 2, 0)); @@ -347,7 +347,7 @@ public class GVCFWriterUnitTest extends BaseTest { @Test public void testHomRefAlt() { - final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, 2); writer.add(makeHomRef("20", 1, 0)); writer.add(makeHomRef("20", 2, 0)); @@ -383,7 +383,7 @@ public class GVCFWriterUnitTest extends BaseTest { @Test(dataProvider = "BandPartitionData") public void testMyData(final List partitions, final boolean expectedGood) { try { - GVCFWriter.parsePartitions(partitions); + GVCFWriter.parsePartitions(partitions,2); Assert.assertTrue(expectedGood, "Expected to fail but didn't"); } catch ( Exception e ) { Assert.assertTrue(! 
expectedGood, "Expected to succeed but failed with message " + e.getMessage()); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlockUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlockUnitTest.java index 36d3aa09e..00d5d6984 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlockUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlockUnitTest.java @@ -46,11 +46,11 @@ package org.broadinstitute.gatk.utils.gvcf; -import org.broadinstitute.gatk.utils.BaseTest; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.gatk.utils.BaseTest; import org.testng.Assert; import org.testng.annotations.BeforeMethod; import org.testng.annotations.DataProvider; @@ -58,7 +58,6 @@ import org.testng.annotations.Test; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.List; public class HomRefBlockUnitTest extends BaseTest { @@ -71,7 +70,7 @@ public class HomRefBlockUnitTest extends BaseTest { @Test public void testBasicConstruction() { - final HomRefBlock band = new HomRefBlock(vc, 10, 20); + final HomRefBlock band = new HomRefBlock(vc, 10, 20, 2); Assert.assertSame(band.getStartingVC(), vc); Assert.assertEquals(band.getRef(), vc.getReference()); Assert.assertEquals(band.getGQLowerBound(), 10); @@ -86,7 +85,7 @@ public class HomRefBlockUnitTest extends BaseTest { @Test public void testMinMedian() { //TODO - might be better to make this test use a data provider? 
- final HomRefBlock band = new HomRefBlock(vc, 10, 20); + final HomRefBlock band = new HomRefBlock(vc, 10, 20,2); final GenotypeBuilder gb = new GenotypeBuilder("NA12878"); int pos = vc.getStart(); @@ -117,7 +116,7 @@ public class HomRefBlockUnitTest extends BaseTest { @Test public void testBigGQIsCapped() { - final HomRefBlock band = new HomRefBlock(vc, 10, 20); + final HomRefBlock band = new HomRefBlock(vc, 10, 20,2); final GenotypeBuilder gb = new GenotypeBuilder("NA12878"); band.add(vc.getStart(), gb.DP(1000).GQ(1000).PL(new int[]{0,10,100}).make()); @@ -126,7 +125,7 @@ public class HomRefBlockUnitTest extends BaseTest { @Test(expectedExceptions = IllegalArgumentException.class) public void testBadAdd() { - final HomRefBlock band = new HomRefBlock(vc, 10, 20); + final HomRefBlock band = new HomRefBlock(vc, 10, 20,2); final GenotypeBuilder gb = new GenotypeBuilder("NA12878"); band.add(vc.getStart() + 10, gb.DP(10).GQ(11).PL(new int[]{0,10,100}).make()); @@ -156,7 +155,7 @@ public class HomRefBlockUnitTest extends BaseTest { @Test(dataProvider = "ContiguousData") public void testIsContiguous(final String contig, final int pos, final boolean expected) { - final HomRefBlock band = new HomRefBlock(vc, 10, 20); + final HomRefBlock band = new HomRefBlock(vc, 10, 20,2); final VariantContext testVC = new VariantContextBuilder(vc).chr(contig).start(pos).stop(pos).make(); Assert.assertEquals(band.isContiguous(testVC), expected); } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java index 8e5ae61fb..feb41712b 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java @@ -26,14 +26,13 @@ package org.broadinstitute.gatk.engine; import com.google.java.contract.Ensures; 
-import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.variant.vcf.VCFConstants; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.walkers.*; -import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; import org.broadinstitute.gatk.engine.datasources.reads.*; @@ -55,8 +54,12 @@ import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; import org.broadinstitute.gatk.engine.samples.SampleDB; import org.broadinstitute.gatk.engine.samples.SampleDBBuilder; +import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.genotyping.IndexedSampleList; +import org.broadinstitute.gatk.genotyping.SampleList; import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.classloader.PluginManager; +import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.interval.IntervalUtils; @@ -64,7 +67,6 @@ import org.broadinstitute.gatk.utils.progressmeter.ProgressMeter; import org.broadinstitute.gatk.utils.recalibration.BQSRArgumentSet; import org.broadinstitute.gatk.utils.text.XReadLines; import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; -import htsjdk.variant.vcf.VCFConstants; import java.io.File; import java.io.FileNotFoundException; @@ -1138,7 +1140,7 @@ public class GenomeAnalysisEngine { * Returns data source objects 
encapsulating all rod data; * individual rods can be accessed through the returned data source objects. * - * @return the rods data sources + * @return the rods data sources, never {@code null}. */ public List getRodDataSources() { return this.rodDataSources; @@ -1254,4 +1256,20 @@ public class GenomeAnalysisEngine { runtimeLimitInNanoseconds = TimeUnit.NANOSECONDS.convert(args.maxRuntime, args.maxRuntimeUnits); } } + + /** + * Returns the sample list including all samples. + * @return never {@code null}. + */ + public SampleList getSampleList() { + return new IndexedSampleList(getSampleDB().getSampleNames()); + } + + /** + * Returns the sample list including samples in read inputs. + * @return never {@code null}. + */ + public SampleList getReadSampleList() { + return new IndexedSampleList(SampleUtils.getSAMFileSamples(getSAMFileHeader())); + } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/genotyping/AlleleListUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/genotyping/AlleleListUtils.java index 380a20379..dfdfe0945 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/genotyping/AlleleListUtils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/genotyping/AlleleListUtils.java @@ -37,6 +37,24 @@ import java.util.List; */ public class AlleleListUtils { + @SuppressWarnings("unchecked") + private static final AlleleList EMPTY_LIST = new AlleleList() { + @Override + public int alleleCount() { + return 0; + } + + @Override + public int alleleIndex(final Allele allele) { + return -1; + } + + @Override + public Allele alleleAt(final int index) { + throw new IllegalArgumentException("allele index is out of range"); + } + }; + /** * Checks whether two allele lists are in fact the same. * @param first one list to compare. @@ -107,6 +125,16 @@ public class AlleleListUtils { return new AsList(list); } + /** + * Returns an unmodifiable empty allele-list. + * @param
<A> the allele class. + * @return never {@code null}. + */ + @SuppressWarnings("unchecked") + public static final <A extends Allele> AlleleList<A> emptyList() { + return EMPTY_LIST; + } + /** * Simple list view of a sample-list. */ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/genotyping/SampleListUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/genotyping/SampleListUtils.java index e151d03da..f3787568a 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/genotyping/SampleListUtils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/genotyping/SampleListUtils.java @@ -34,6 +34,33 @@ import java.util.*; */ public class SampleListUtils { + private static final SampleList EMPTY_LIST = new SampleList() { + + @Override + public int sampleCount() { + return 0; + } + + @Override + public int sampleIndex(String sample) { + return -1; + } + + @Override + public String sampleAt(final int sampleIndex) { + throw new IllegalArgumentException("index is out of valid range"); + } + }; + + /** + * Empty list. + * + * @return never {@code null} + */ + public static SampleList emptyList() { + return EMPTY_LIST; + } + /** * Checks whether two sample lists are in fact the same. * @param first one list to compare. 
diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/SampleUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/SampleUtils.java index ccb573414..77fc17083 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/SampleUtils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/SampleUtils.java @@ -60,9 +60,9 @@ public class SampleUtils { * @param header the sam file header * @return list of strings representing the sample names */ - public static Set<String> getSAMFileSamples(SAMFileHeader header) { + public static Set<String> getSAMFileSamples(final SAMFileHeader header) { // get all of the unique sample names - Set<String> samples = new TreeSet<String>(); + final Set<String> samples = new TreeSet<String>(); List<SAMReadGroupRecord> readGroups = header.getReadGroups(); for ( SAMReadGroupRecord readGroup : readGroups ) samples.add(readGroup.getSample()); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java index 97cbebd09..2fdf657cc 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java @@ -85,6 +85,24 @@ public class GATKVariantContextUtils { return true; } + /** + * Returns a homozygous call allele list given the only allele and the ploidy. + * + * @param allele the only allele in the allele list. + * @param ploidy the ploidy of the resulting allele list. + * + * @throws IllegalArgumentException if {@code allele} is {@code null} or ploidy is negative. + * + * @return never {@code null}. 
+ */ + public static List<Allele> homozygousAlleleList(final Allele allele, final int ploidy) { + if (allele == null || ploidy < 0) + throw new IllegalArgumentException(); + + // Use a tailored inner class to implement the list: + return Collections.nCopies(ploidy,allele); + } + public enum GenotypeMergeType { /** * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD.