From 9ee9da36bb206d57b07987bdc10c9bbda69f3516 Mon Sep 17 00:00:00 2001
From: Valentin Ruano-Rubio
Date: Mon, 18 Aug 2014 15:29:28 -0400
Subject: [PATCH] Generalize the calculation of the genotype likelihoods in HC
 to cope with haploid and multi-ploidy cases. Change several walkers to use the
 new sample and allele closed lists and the new GenotypingEngine constructor
 signatures. Rebase adoption of the new calculation system in walkers.
---
.../StandardCallerArgumentCollection.java | 2 +-
.../InfiniteRandomMatingPopulationModel.java | 45 +++++-
.../GeneralPloidyGenotypeLikelihoods.java | 24 ++-
.../walkers/genotyper/GenotypingEngine.java | 64 +++-----
.../walkers/genotyper/UnifiedGenotyper.java | 35 +++--
.../genotyper/UnifiedGenotypingEngine.java | 55 ++++---
.../genotyper/afcalc/AFCalcFactory.java | 41 ++---
.../afcalc/GeneralPloidyExactAFCalc.java | 2 +-
...GraphBasedLikelihoodCalculationEngine.java | 3 +-
...edLikelihoodCalculationEngineInstance.java | 12 +-
.../haplotypecaller/HaplotypeCaller.java | 88 ++++++-----
.../HaplotypeCallerArgumentCollection.java | 2 +-
.../HaplotypeCallerGenotypingEngine.java | 105 ++++++++-----
.../PairHMMLikelihoodCalculationEngine.java | 12 +-
.../RandomLikelihoodCalculationEngine.java | 19 +--
.../ReadLikelihoodCalculationEngine.java | 4 +-
.../haplotypecaller/RefVsAnyResult.java | 32 +++-
.../ReferenceConfidenceModel.java | 146 +++++++++++++++---
.../indels/PairHMMIndelErrorModel.java | 7 +-
.../validation/GenotypeAndValidate.java | 6 +-
.../walkers/variantutils/GenotypeGVCFs.java | 17 +-
.../variantutils/RegenotypeVariants.java | 14 +-
.../gatk/utils/gvcf/GVCFWriter.java | 60 +++----
.../gatk/utils/gvcf/HomRefBlock.java | 38 +++--
.../haplotype/HaplotypeLDCalculator.java | 7 +-
.../UnifiedGenotyperEngineUnitTest.java | 14 +-
...LikelihoodCalculationEnginesBenchmark.java | 5 +-
...ngLikelihoodCalculationEngineUnitTest.java | 5 +-
.../ReferenceConfidenceModelUnitTest.java | 37 +++--
.../genotyper/ReadLikelihoodsUnitTest.java | 50 +++---
.../gatk/utils/gvcf/GVCFWriterUnitTest.java | 40 ++---
.../gatk/utils/gvcf/HomRefBlockUnitTest.java | 13 +-
.../gatk/engine/GenomeAnalysisEngine.java | 30 +++-
.../gatk/genotyping/AlleleListUtils.java | 28 ++++
.../gatk/genotyping/SampleListUtils.java | 27 ++++
.../gatk/utils/SampleUtils.java | 4 +-
.../variant/GATKVariantContextUtils.java | 18 +++
37 files changed, 712 insertions(+), 399 deletions(-)
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java
index 7c69ab014..3f7794b24 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java
@@ -145,7 +145,7 @@ public class StandardCallerArgumentCollection implements Cloneable {
*/
@Hidden
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
- public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel();
+ public AFCalcFactory.Calculation requestedAlleleFrequencyCalculationModel;
@Hidden
@Argument(shortName = "logExactCalls", doc="x", required=false)
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/genotyping/InfiniteRandomMatingPopulationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/genotyping/InfiniteRandomMatingPopulationModel.java
index 6b0594ee0..f476d940c 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/genotyping/InfiniteRandomMatingPopulationModel.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/genotyping/InfiniteRandomMatingPopulationModel.java
@@ -68,6 +68,28 @@ import java.util.List;
*/
public class InfiniteRandomMatingPopulationModel implements GenotypingModel {
+ private final int cachePloidyCapacity;
+ private final int cacheAlleleCountCapacity;
+    private ThreadLocal<GenotypeLikelihoodCalculator[][]> likelihoodCalculators;
+
+ /**
+ * Create a new infinite model instance.
+ */
+ public InfiniteRandomMatingPopulationModel() {
+ this(10,50);
+ }
+
+ public InfiniteRandomMatingPopulationModel(final int calculatorCachePloidyCapacity, final int calculatorCacheAlleleCapacity) {
+ cachePloidyCapacity = calculatorCachePloidyCapacity;
+        cacheAlleleCountCapacity = calculatorCacheAlleleCapacity;
+        likelihoodCalculators = new ThreadLocal<GenotypeLikelihoodCalculator[][]>( ) {
+ @Override
+ public GenotypeLikelihoodCalculator[][] initialValue() {
+ return new GenotypeLikelihoodCalculator[calculatorCachePloidyCapacity][calculatorCacheAlleleCapacity];
+ }
+ };
+ }
+
@Override
public GenotypingLikelihoods calculateLikelihoods(final AlleleList genotypingAlleles, final GenotypingData data) {
if (genotypingAlleles == null)
@@ -99,18 +121,27 @@ public class InfiniteRandomMatingPopulationModel implements GenotypingModel {
}
private GenotypingLikelihoods singleSampleLikelihoods(final AlleleList genotypingAlleles,
- final GenotypingData data,
- final AlleleLikelihoodMatrixMapper alleleLikelihoodMatrixMapper) {
+ final GenotypingData data,
+ final AlleleLikelihoodMatrixMapper alleleLikelihoodMatrixMapper) {
final PloidyModel ploidyModel = data.ploidyModel();
final int samplePloidy = ploidyModel.samplePloidy(0);
final int alleleCount = genotypingAlleles.alleleCount();
- final GenotypeLikelihoodCalculator likelihoodsCalculator =
- GenotypeLikelihoodCalculator.getInstance(samplePloidy,alleleCount);
+ final GenotypeLikelihoodCalculator likelihoodsCalculator = getLikelihoodsCalculator(samplePloidy,alleleCount);
final ReadLikelihoods.Matrix sampleLikelihoods = alleleLikelihoodMatrixMapper.map(data.readLikelihoods().sampleMatrix(0));
final List genotypeLikelihoods = Collections.singletonList(likelihoodsCalculator.genotypeLikelihoods(sampleLikelihoods));
return new GenotypingLikelihoods<>(genotypingAlleles,ploidyModel,genotypeLikelihoods);
}
+ private GenotypeLikelihoodCalculator getLikelihoodsCalculator(final int samplePloidy, final int alleleCount) {
+        if (samplePloidy >= cachePloidyCapacity)
+ return GenotypeLikelihoodCalculators.getInstance(samplePloidy, alleleCount);
+ else if (alleleCount >= cacheAlleleCountCapacity)
+ return GenotypeLikelihoodCalculators.getInstance(samplePloidy, alleleCount);
+ final GenotypeLikelihoodCalculator[][] cache = likelihoodCalculators.get();
+ final GenotypeLikelihoodCalculator result = cache[samplePloidy][alleleCount];
+ return result != null ? result : (cache[samplePloidy][alleleCount] = GenotypeLikelihoodCalculators.getInstance(samplePloidy, alleleCount));
+ }
+
private GenotypingLikelihoods multiSampleHeterogeneousPloidyModelLikelihoods(final AlleleList genotypingAlleles,
final GenotypingData data,
final AlleleLikelihoodMatrixMapper alleleLikelihoodMatrixMapper,
@@ -120,8 +151,7 @@ public class InfiniteRandomMatingPopulationModel implements GenotypingModel {
final int alleleCount = genotypingAlleles.alleleCount();
for (int i = 0; i < sampleCount; i++) {
final int samplePloidy = ploidyModel.samplePloidy(i);
- final GenotypeLikelihoodCalculator likelihoodsCalculator =
- GenotypeLikelihoodCalculator.getInstance(samplePloidy,alleleCount);
+ final GenotypeLikelihoodCalculator likelihoodsCalculator = getLikelihoodsCalculator(samplePloidy,alleleCount);
final ReadLikelihoods.Matrix sampleLikelihoods = alleleLikelihoodMatrixMapper.map(data.readLikelihoods().sampleMatrix(i));
genotypeLikelihoods.add(likelihoodsCalculator.genotypeLikelihoods(sampleLikelihoods));
}
@@ -136,8 +166,7 @@ public class InfiniteRandomMatingPopulationModel implements GenotypingModel {
final int samplePloidy = ploidyModel.samplePloidy(0);
final List genotypeLikelihoods = new ArrayList<>(sampleCount);
final int alleleCount = genotypingAlleles.alleleCount();
- final GenotypeLikelihoodCalculator likelihoodsCalculator =
- GenotypeLikelihoodCalculator.getInstance(samplePloidy,alleleCount);
+ final GenotypeLikelihoodCalculator likelihoodsCalculator = getLikelihoodsCalculator(samplePloidy,alleleCount);
for (int i = 0; i < sampleCount; i++) {
final ReadLikelihoods.Matrix sampleLikelihoods = alleleLikelihoodMatrixMapper.map(data.readLikelihoods().sampleMatrix(i));
genotypeLikelihoods.add(likelihoodsCalculator.genotypeLikelihoods(sampleLikelihoods));
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java
index 0495db541..5abb1cd85 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java
@@ -47,16 +47,19 @@
package org.broadinstitute.gatk.tools.walkers.genotyper;
import htsjdk.samtools.SAMUtils;
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.GenotypeLikelihoods;
+import htsjdk.variant.vcf.VCFConstants;
+import org.broadinstitute.gatk.genotyping.GenotypeAlleleCounts;
+import org.broadinstitute.gatk.genotyping.GenotypeLikelihoodCalculator;
+import org.broadinstitute.gatk.genotyping.GenotypeLikelihoodCalculators;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.ExactACcounts;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.ExactACset;
import org.broadinstitute.gatk.utils.MathUtils;
-import htsjdk.variant.vcf.VCFConstants;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
-import htsjdk.variant.variantcontext.Allele;
-import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import java.util.*;
@@ -424,18 +427,9 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
*/
public static int[] getAlleleCountFromPLIndex(final int nAlleles, final int numChromosomes, final int PLindex) {
- // todo - another brain-dead inefficient implementation, can do much better by computing in closed form
- final SumIterator iterator = new SumIterator(nAlleles,numChromosomes);
- while (iterator.hasNext()) {
- final int[] plVec = iterator.getCurrentVector();
- if (iterator.getLinearIndex() == PLindex)
- return plVec;
-
- iterator.next();
- }
-
- return null;
-
+ final GenotypeLikelihoodCalculator calculator = GenotypeLikelihoodCalculators.getInstance(numChromosomes, nAlleles);
+ final GenotypeAlleleCounts alleleCounts = calculator.genotypeAlleleCountsAt(PLindex);
+ return alleleCounts.alleleCountsByIndex(nAlleles - 1);
}
/*
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java
index 97a98e704..8225ba9a5 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java
@@ -48,15 +48,17 @@ package org.broadinstitute.gatk.tools.walkers.genotyper;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
+import htsjdk.variant.variantcontext.*;
+import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.apache.log4j.Logger;
-import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
+import org.broadinstitute.gatk.genotyping.SampleList;
import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalc;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalcFactory;
@@ -69,8 +71,6 @@ import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
-import htsjdk.variant.variantcontext.*;
-import htsjdk.variant.vcf.VCFConstants;
import java.util.*;
@@ -86,22 +86,20 @@ public abstract class GenotypingEngine sampleNames;
+ protected final SampleList samples;
private final double[] log10AlleleFrequencyPriorsSNPs;
private final double[] log10AlleleFrequencyPriorsIndels;
- private final GenomeLocParser genomeLocParser;
+ protected final GenomeLocParser genomeLocParser;
// the model used for calculating p(non-ref)
protected ThreadLocal afcm = new ThreadLocal() {
@@ -111,59 +109,32 @@ public abstract class GenotypingEngine resolveSampleNamesFromToolkit(final GenomeAnalysisEngine toolkit) {
- if (toolkit == null)
- throw new IllegalArgumentException("the toolkit cannot be null");
- return new LinkedHashSet<>(toolkit.getSampleDB().getSampleNames());
- }
/**
* Construct a new genotyper engine, on a specific subset of samples.
*
- * @param toolkit reference to the genome-analysis toolkit.
* @param configuration engine configuration object.
- * @param sampleNames subset of sample to work on identified by their names. If {@code null}, the full toolkit
+ * @param samples subset of sample to work on identified by their names. If {@code null}, the full toolkit
* sample set will be used instead.
+ * @param genomeLocParser the genome-loc-parser
*
- * @throws IllegalArgumentException if either {@code toolkit} or {@code configuration} is {@code null}.
+ * @throws IllegalArgumentException if any of {@code samples}, {@code configuration} or {@code genomeLocParser} is {@code null}.
*/
- protected GenotypingEngine(final GenomeAnalysisEngine toolkit, final Config configuration,final Set sampleNames) {
- if (toolkit == null)
- throw new IllegalArgumentException("the toolkit cannot be null");
+ protected GenotypingEngine(final Config configuration,final SampleList samples, final GenomeLocParser genomeLocParser) {
+
if (configuration == null)
throw new IllegalArgumentException("the configuration cannot be null");
this.configuration = configuration;
logger = Logger.getLogger(getClass());
- this.toolkit = toolkit;
- this.sampleNames = sampleNames != null ? sampleNames : toolkit.getSampleDB().getSampleNames();
- numberOfGenomes = this.sampleNames.size() * configuration.genotypeArgs.samplePloidy;
+ this.samples = samples;
+ numberOfGenomes = this.samples.sampleCount() * configuration.genotypeArgs.samplePloidy;
MathUtils.Log10Cache.ensureCacheContains(numberOfGenomes * 2);
log10AlleleFrequencyPriorsSNPs = computeAlleleFrequencyPriors(numberOfGenomes,
configuration.genotypeArgs.snpHeterozygosity,configuration.genotypeArgs.inputPrior);
log10AlleleFrequencyPriorsIndels = computeAlleleFrequencyPriors(numberOfGenomes,
configuration.genotypeArgs.indelHeterozygosity,configuration.genotypeArgs.inputPrior);
- genomeLocParser = toolkit.getGenomeLocParser();
+ this.genomeLocParser = genomeLocParser;
}
/**
@@ -257,7 +228,7 @@ public abstract class GenotypingEngine outputAlleles = outputAlternativeAlleles.outputAlleles(vc.getReference());
final VariantContextBuilder builder = new VariantContextBuilder(callSourceString(), loc.getContig(), loc.getStart(), loc.getStop(), outputAlleles);
@@ -506,7 +480,9 @@ public abstract class GenotypingEngine, Unif
* Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations.
*/
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
- protected List annotationsToUse = new ArrayList();
+ protected List annotationsToUse = new ArrayList<>();
/**
* Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments,
* so annotations will be excluded even if they are explicitly included with the other options.
*/
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
- protected List annotationsToExclude = new ArrayList();
+ protected List annotationsToExclude = new ArrayList<>();
/**
* If specified, all available annotations in the group will be applied. See the VariantAnnotator -list argument to view available groups.
@@ -218,11 +221,6 @@ public class UnifiedGenotyper extends LocusWalker, Unif
// the calculation arguments
private UnifiedGenotypingEngine genotypingEngine = null;
- // the annotation engine
- private VariantAnnotatorEngine annotationEngine;
-
- private Set samples;
-
// enable deletions in the pileup
@Override
public boolean includeReadsWithDeletionAtLoci() { return true; }
@@ -256,17 +254,22 @@ public class UnifiedGenotyper extends LocusWalker, Unif
*
**/
public void initialize() {
+ super.initialize();
+ final GenomeAnalysisEngine toolkit = getToolkit();
+ final Set sampleNameSet;
if ( UAC.TREAT_ALL_READS_AS_SINGLE_POOL ) {
- samples.add(GenotypeLikelihoodsCalculationModel.DUMMY_SAMPLE_NAME);
+ sampleNameSet = Collections.singleton(GenotypeLikelihoodsCalculationModel.DUMMY_SAMPLE_NAME);
} else {
// get all of the unique sample names
- samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
+ sampleNameSet = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader());
if ( UAC.referenceSampleName != null )
- samples.remove(UAC.referenceSampleName);
+ sampleNameSet.remove(UAC.referenceSampleName);
}
+ final SampleList samples = new IndexedSampleList(sampleNameSet);
+
if ( UAC.CONTAMINATION_FRACTION_FILE != null )
- UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, samples, logger));
+ UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, sampleNameSet, logger));
// check for a bad max alleles value
if ( UAC.genotypeArgs.MAX_ALTERNATE_ALLELES > GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED)
@@ -282,8 +285,8 @@ public class UnifiedGenotyper extends LocusWalker, Unif
if ( verboseWriter != null )
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tMLE\tMAP");
- annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
- genotypingEngine = new UnifiedGenotypingEngine(getToolkit(), UAC, samples);
+ final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
+ genotypingEngine = new UnifiedGenotypingEngine(UAC, samples, toolkit.getGenomeLocParser(), toolkit.getArguments().BAQMode);
genotypingEngine.setVerboseWriter(verboseWriter);
genotypingEngine.setAnnotationEngine(annotationEngine);
@@ -298,11 +301,11 @@ public class UnifiedGenotyper extends LocusWalker, Unif
final Set samplesForHeader;
if ( ! onlyEmitSamples.isEmpty() ) {
// make sure that onlyEmitSamples is a subset of samples
- if ( ! samples.containsAll(onlyEmitSamples) )
+ if ( ! sampleNameSet.containsAll(onlyEmitSamples) )
throw new UserException.BadArgumentValue("onlyEmitSamples", "must be a strict subset of the samples in the BAM files but is wasn't");
samplesForHeader = onlyEmitSamples;
} else {
- samplesForHeader = samples;
+ samplesForHeader = sampleNameSet;
}
writer.writeHeader(new VCFHeader(headerInfo, samplesForHeader));
}
@@ -310,7 +313,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif
public static Set getHeaderInfo(final UnifiedArgumentCollection UAC,
final VariantAnnotatorEngine annotationEngine,
final DbsnpArgumentCollection dbsnp) {
- Set headerInfo = new HashSet();
+ final Set headerInfo = new HashSet<>();
// all annotation fields from VariantAnnotatorEngine
if ( annotationEngine != null )
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java
index 36ac2e5bf..22bc09e98 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java
@@ -45,12 +45,16 @@
*/
package org.broadinstitute.gatk.tools.walkers.genotyper;
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.GenotypesContext;
+import htsjdk.variant.variantcontext.VariantContext;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
+import org.broadinstitute.gatk.genotyping.SampleList;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalcResult;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.GenomeLocParser;
@@ -62,9 +66,6 @@ import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
-import htsjdk.variant.variantcontext.Allele;
-import htsjdk.variant.variantcontext.GenotypesContext;
-import htsjdk.variant.variantcontext.VariantContext;
import java.io.PrintStream;
import java.lang.reflect.Constructor;
@@ -88,7 +89,6 @@ public class UnifiedGenotypingEngine extends GenotypingEngineThe new engine won't emmit annotations, will use the full sample set and will not produce additional verbose
- * output
+ * Creates a new unified genotyping given the UG configuration parameters and the GA engine.
+ * @param configuration the UG configuration.
+ * @param toolkit the GA engine.
*
- * @param toolkit reference to the enclosing genome analysis engine.
- * @param configuration configuration object.
- *
- * @throws IllegalArgumentException if either {@code toolkit} or {@code UAC} is {@code null}.
+ * @throws NullPointerException if either {@code configuration} or {@code toolkit} is {@code null}.
*/
- public UnifiedGenotypingEngine(final GenomeAnalysisEngine toolkit, final UnifiedArgumentCollection configuration) {
- this(toolkit, configuration, null);
+ public UnifiedGenotypingEngine(final UnifiedArgumentCollection configuration,
+ final GenomeAnalysisEngine toolkit) {
+ this(configuration,toolkit.getSampleList(),toolkit.getGenomeLocParser(),toolkit.getArguments().BAQMode);
}
+
/**
- * Constructs a new Unified-Genotyper engine.
+ * Creates a new unified genotyping given the UG configuration parameters, the targeted set of samples and
+ * a genome location parser.
*
- * @param toolkit reference to the enclosing genome analysis engine.
- * @param configuration configuration object.
- * @param sampleNames subset of sample names to work on. If {@code null}, all it will use the {@code toolkit} full sample set.
+ * @param configuration the UG configuration.
+ * @param samples {@inheritDoc}
+ * @param baqCalculationMode the BAQ calculation mode.
*
- * @throws IllegalArgumentException if either {@code toolkit} or {@code UAC} is {@code null}.
+ * @throws NullPointerException if any of {@code configuration}, {@code samples} or {@code genomeLocParser} is {@code null}.
+ *
+ * @throws IllegalArgumentException if {@code baqCalculationMode} is {@code null}.
*/
- public UnifiedGenotypingEngine(final GenomeAnalysisEngine toolkit, final UnifiedArgumentCollection configuration,
- final Set sampleNames) {
+ public UnifiedGenotypingEngine(final UnifiedArgumentCollection configuration,
+ final SampleList samples, final GenomeLocParser genomeLocParser,
+ final BAQ.CalculationMode baqCalculationMode) {
- super(toolkit,configuration,sampleNames);
+ super(configuration,samples,genomeLocParser);
- this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF;
- genomeLocParser = toolkit.getGenomeLocParser();
+ if (baqCalculationMode == null)
+ throw new IllegalArgumentException("the BAQ calculation mode cannot be null");
+
+ this.BAQEnabledOnCMDLine = baqCalculationMode != BAQ.CalculationMode.OFF;
determineGLModelsToUse();
@@ -302,7 +308,8 @@ public class UnifiedGenotypingEngine extends GenotypingEngine perReadAlleleLikelihoodMap) {
- return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap);
+ return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine,
+ genomeLocParser != null || refContext == null ? genomeLocParser : refContext.getGenomeLocParser(), perReadAlleleLikelihoodMap);
}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalcFactory.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalcFactory.java
index e4cbf0b5d..ad4ce979a 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalcFactory.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalcFactory.java
@@ -48,10 +48,8 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection;
-import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.classloader.PluginManager;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
-import org.broadinstitute.gatk.utils.exceptions.UserException;
import java.lang.reflect.Constructor;
import java.util.LinkedList;
@@ -105,6 +103,24 @@ public class AFCalcFactory {
}
public static Calculation getDefaultModel() { return EXACT_INDEPENDENT; }
+
+ /**
+ * Returns the best (fastest) model give the required ploidy and alternative allele count.
+ * @param requiredPloidy required ploidy
+ * @param requiredAlternativeAlleleCount required alternative allele count.
+ * @param preferredModel a preferred mode if any. A {@code null} indicate that we should be try to use the default instead.
+ * @return never {@code null}
+ */
+ public static Calculation getBestModel(final int requiredPloidy, final int requiredAlternativeAlleleCount, final Calculation preferredModel) {
+ final Calculation preferred = preferredModel == null ? getDefaultModel() : preferredModel;
+ if (preferred.usableForParams(requiredPloidy,requiredAlternativeAlleleCount))
+ return preferred;
+ if (EXACT_INDEPENDENT.usableForParams(requiredPloidy,requiredAlternativeAlleleCount))
+ return EXACT_INDEPENDENT;
+ if (EXACT_REFERENCE.usableForParams(requiredPloidy,requiredAlternativeAlleleCount))
+ return EXACT_REFERENCE;
+ return EXACT_GENERAL_PLOIDY;
+ }
}
private static final Map> afClasses;
@@ -137,25 +153,10 @@ public class AFCalcFactory {
public static AFCalc createAFCalc(final StandardCallerArgumentCollection UAC,
final int nSamples,
final Logger logger) {
- final int maxAltAlleles = UAC.genotypeArgs.MAX_ALTERNATE_ALLELES;
- if ( ! UAC.AFmodel.usableForParams(UAC.genotypeArgs.samplePloidy, maxAltAlleles) ) {
- logger.info("Requested ploidy " + UAC.genotypeArgs.samplePloidy + " maxAltAlleles " + maxAltAlleles + " not supported by requested model " + UAC.AFmodel + " looking for an option");
- final List supportingCalculations = new LinkedList();
- for ( final Calculation calc : Calculation.values() ) {
- if ( calc.usableForParams(UAC.genotypeArgs.samplePloidy, maxAltAlleles) )
- supportingCalculations.add(calc);
- }
+ final Calculation afCalculationModel = Calculation.getBestModel(UAC.genotypeArgs.samplePloidy,UAC.genotypeArgs.MAX_ALTERNATE_ALLELES,
+ UAC.requestedAlleleFrequencyCalculationModel);
- if ( supportingCalculations.isEmpty() )
- throw new UserException("no AFCalculation model found that supports ploidy of " + UAC.genotypeArgs.samplePloidy + " and max alt alleles " + maxAltAlleles);
- else if ( supportingCalculations.size() > 1 )
- logger.debug("Warning, multiple supporting AFCalcs found " + Utils.join(",", supportingCalculations) + " choosing first arbitrarily");
- else
- UAC.AFmodel = supportingCalculations.get(0);
- logger.info("Selecting model " + UAC.AFmodel);
- }
-
- final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, maxAltAlleles, UAC.genotypeArgs.samplePloidy);
+ final AFCalc calc = createAFCalc(afCalculationModel, nSamples, UAC.genotypeArgs.MAX_ALTERNATE_ALLELES, UAC.genotypeArgs.samplePloidy);
if ( logger != null ) calc.setLogger(logger);
if ( UAC.exactCallsLog != null ) calc.enableProcessLog(UAC.exactCallsLog);
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java
index 500cdf4ce..98df248aa 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java
@@ -56,7 +56,7 @@ import htsjdk.variant.variantcontext.*;
import java.util.*;
public class GeneralPloidyExactAFCalc extends ExactAFCalc {
- static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
+ static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 100; // if PL vectors longer than this # of elements, don't log them
private final int ploidy;
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java
index ace4f22d2..0008f5b28 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java
@@ -47,6 +47,7 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import org.apache.log4j.Logger;
+import org.broadinstitute.gatk.genotyping.SampleList;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.SeqGraph;
import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
@@ -113,7 +114,7 @@ public class GraphBasedLikelihoodCalculationEngine implements ReadLikelihoodCalc
}
@Override
- public ReadLikelihoods computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, final List samples, final Map> perSampleReadList) {
+ public ReadLikelihoods computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, final SampleList samples, final Map> perSampleReadList) {
final GraphBasedLikelihoodCalculationEngineInstance graphLikelihoodEngine =
new GraphBasedLikelihoodCalculationEngineInstance(assemblyResultSet,
hmm,log10GlobalReadMismappingRate,heterogeneousKmerSizeResolution);
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java
index 7d4b3db1a..e2229cd50 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java
@@ -46,7 +46,11 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
+import htsjdk.variant.variantcontext.Allele;
import org.apache.log4j.Logger;
+import org.broadinstitute.gatk.genotyping.AlleleList;
+import org.broadinstitute.gatk.genotyping.IndexedAlleleList;
+import org.broadinstitute.gatk.genotyping.SampleList;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.MultiSampleEdge;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.Path;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.Route;
@@ -61,7 +65,6 @@ import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.pairhmm.FlexibleHMM;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
-import htsjdk.variant.variantcontext.Allele;
import java.util.*;
@@ -233,12 +236,13 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
* @return never {@code null}, and with at least one entry for input sample (keys in {@code perSampleReadList}.
* The value maps can be potentially empty though.
*/
- public ReadLikelihoods computeReadLikelihoods(final List haplotypes, final List samples,
+ public ReadLikelihoods computeReadLikelihoods(final List haplotypes, final SampleList samples,
final Map> perSampleReadList) {
// General preparation on the input haplotypes:
- final ReadLikelihoods result = new ReadLikelihoods<>(samples, haplotypes, perSampleReadList);
final List sortedHaplotypes = new ArrayList<>(haplotypes);
Collections.sort(sortedHaplotypes, Haplotype.ALPHANUMERICAL_COMPARATOR);
+ final AlleleList alleles = new IndexedAlleleList<>(sortedHaplotypes);
+ final ReadLikelihoods result = new ReadLikelihoods<>(samples, alleles, perSampleReadList);
// The actual work:
final int sampleCount = result.sampleCount();
@@ -315,7 +319,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
private void calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(final int haplotypeIndex,
final ReadLikelihoods.Matrix likelihoods,
final Map> costsEndingByVertex) {
- final Haplotype haplotype = likelihoods.allele(haplotypeIndex);
+ final Haplotype haplotype = likelihoods.alleleAt(haplotypeIndex);
final HaplotypeRoute haplotypeRoute = haplotypeGraph.getHaplotypeRoute(haplotype);
final Set haplotypeVertices = haplotypeRoute.vertexSet();
final Map readCostByRead = new HashMap<>();
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java
index 2ee5752f0..5e6a519ea 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java
@@ -52,6 +52,7 @@ import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.*;
import org.broadinstitute.gatk.engine.CommandLineGATK;
+import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
@@ -64,15 +65,12 @@ import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter;
import org.broadinstitute.gatk.engine.iterators.ReadTransformer;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.walkers.*;
+import org.broadinstitute.gatk.genotyping.*;
import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.genotyper.*;
-import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalcFactory;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
-import org.broadinstitute.gatk.utils.GenomeLoc;
-import org.broadinstitute.gatk.utils.MathUtils;
-import org.broadinstitute.gatk.utils.QualityUtils;
-import org.broadinstitute.gatk.utils.SampleUtils;
+import org.broadinstitute.gatk.utils.*;
import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState;
import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState;
@@ -606,7 +604,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// the minimum length of a read we'd consider using for genotyping
private final static int MIN_READ_LENGTH = 10;
- private List samplesList;
+ private SampleList samplesList;
private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
private final static Allele FAKE_ALT_ALLELE = Allele.create("", false); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
@@ -626,9 +624,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
public void initialize() {
super.initialize();
- if (SCAC.genotypeArgs.samplePloidy != HomoSapiensConstants.DEFAULT_PLOIDY)
- throw new UserException.BadArgumentValue("-ploidy", "" + SCAC.genotypeArgs.samplePloidy + "; currently HaplotypeCaller only supports diploid sample analysis (-ploidy 2)");
-
if (dontGenotype && emitReferenceConfidence())
throw new UserException("You cannot request gVCF output and do not genotype at the same time");
@@ -656,12 +651,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
logger.info("Disabling physical phasing, which is supported only for reference-model confidence output");
}
- if ( SCAC.AFmodel == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY )
- throw new UserException.BadArgumentValue("pnrm", "HaplotypeCaller doesn't currently support " + SCAC.AFmodel);
-
-
- samplesList = new ArrayList<>(SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()));
- Set samplesSet = new LinkedHashSet<>(samplesList);
+ final GenomeAnalysisEngine toolkit = getToolkit();
+ samplesList = toolkit.getReadSampleList();
+ final Set sampleSet = SampleListUtils.asSet(samplesList);
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
final UnifiedArgumentCollection simpleUAC = SCAC.cloneTo(UnifiedArgumentCollection.class);
@@ -672,20 +664,24 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
simpleUAC.CONTAMINATION_FRACTION = 0.0;
simpleUAC.CONTAMINATION_FRACTION_FILE = null;
simpleUAC.exactCallsLog = null;
- activeRegionEvaluationGenotyperEngine = new UnifiedGenotypingEngine(getToolkit(), simpleUAC, samplesSet);
+ // Seems that at least with some test data we can lose genuine haploid variation if we use
+ // the UG's engine with ploidy == 1
+ simpleUAC.genotypeArgs.samplePloidy = Math.max(2,SCAC.genotypeArgs.samplePloidy);
+ activeRegionEvaluationGenotyperEngine = new UnifiedGenotypingEngine(simpleUAC, toolkit);
activeRegionEvaluationGenotyperEngine.setLogger(logger);
if( SCAC.CONTAMINATION_FRACTION_FILE != null )
- SCAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(SCAC.CONTAMINATION_FRACTION_FILE, SCAC.CONTAMINATION_FRACTION, samplesSet, logger));
+ SCAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(SCAC.CONTAMINATION_FRACTION_FILE, SCAC.CONTAMINATION_FRACTION, sampleSet, logger));
if( SCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES && consensusMode )
throw new UserException("HaplotypeCaller cannot be run in both GENOTYPE_GIVEN_ALLELES mode and in consensus mode. Please choose one or the other.");
- genotypingEngine = new HaplotypeCallerGenotypingEngine( getToolkit(), SCAC, !doNotRunPhysicalPhasing);
+ final GenomeLocParser genomeLocParser = toolkit.getGenomeLocParser();
+ genotypingEngine = new HaplotypeCallerGenotypingEngine( SCAC, samplesList, genomeLocParser, !doNotRunPhysicalPhasing);
// initialize the output VCF header
final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
- Set headerInfo = new HashSet<>();
+ final Set headerInfo = new HashSet<>();
headerInfo.addAll(genotypingEngine.getAppropriateVCFInfoHeaders());
// all annotation fields from VariantAnnotatorEngine
@@ -707,13 +703,20 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
headerInfo.add(new VCFFormatHeaderLine(HAPLOTYPE_CALLER_PHASING_GT_KEY, 1, VCFHeaderLineType.String, "Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another"));
}
+ if (SCAC.genotypeArgs.samplePloidy != HomoSapiensConstants.DEFAULT_PLOIDY) {
+ if (SCAC.emitReferenceConfidence != ReferenceConfidenceMode.NONE)
+ throw new UserException.BadArgumentValue("ERC", "For now ploidies different that 2 are not allow for GVCF or BP_RESOLUTION outputs");
+ headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the alternate allele count, in the same order as listed, for each individual sample"));
+ headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the alternate allele fraction, in the same order as listed, for each individual sample"));
+ }
+
// FILTER fields are added unconditionally as it's not always 100% certain the circumstances
// where the filters are used. For example, in emitting all sites the lowQual field is used
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotypingEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
- initializeReferenceConfidenceModel(samplesSet, headerInfo);
+ initializeReferenceConfidenceModel(samplesList, headerInfo);
- vcfWriter.writeHeader(new VCFHeader(headerInfo, samplesSet));
+ vcfWriter.writeHeader(new VCFHeader(headerInfo, sampleSet));
try {
// fasta reference reader to supplement the edges of the reference sequence
@@ -771,10 +774,11 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
SCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES,emitReferenceConfidence());
}
- private void initializeReferenceConfidenceModel(final Set samples, final Set headerInfo) {
+ private void initializeReferenceConfidenceModel(final SampleList samples, final Set headerInfo) {
referenceConfidenceModel = new ReferenceConfidenceModel(getToolkit().getGenomeLocParser(), samples, getToolkit().getSAMFileHeader(), indelSizeToEliminateInRefModel);
if ( emitReferenceConfidence() ) {
- if ( samples.size() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently");
+ if ( samples.sampleCount() != 1 )
+ throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently");
headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines());
if ( SCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) {
// a kluge to enforce the use of this indexing strategy
@@ -784,7 +788,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
}
try {
- vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands);
+ vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands,SCAC.genotypeArgs.samplePloidy);
} catch ( IllegalArgumentException e ) {
throw new UserException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage());
}
@@ -857,8 +861,12 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
final Map splitContexts = AlignmentContextUtils.splitContextBySampleName(context);
final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size());
final MathUtils.RunningAverage averageHQSoftClips = new MathUtils.RunningAverage();
+ final GenotypingModel genotypingModel = genotypingEngine.getGenotypingModel();
for( final Map.Entry sample : splitContexts.entrySet() ) {
- final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sample.getValue().getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, averageHQSoftClips).genotypeLikelihoods;
+ final String sampleName = sample.getKey();
+ // The ploidy here is not dictated by the sample but by the simple genotyping-engine used to determine whether regions are active or not.
+ final int activeRegionDetectionHackishSamplePloidy = activeRegionEvaluationGenotyperEngine.getConfiguration().genotypeArgs.samplePloidy;
+ final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sampleName,activeRegionDetectionHackishSamplePloidy,genotypingModel,sample.getValue().getBasePileup(), ref.getBase(), MIN_BASE_QUALTY_SCORE, averageHQSoftClips).genotypeLikelihoods;
genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() );
}
@@ -969,8 +977,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
regionForGenotyping.getLocation(),
getToolkit().getGenomeLocParser(),
metaDataTracker,
- ( consensusMode ? Collections.emptyList() : givenAlleles ),
- emitReferenceConfidence() );
+ (consensusMode ? Collections.emptyList() : givenAlleles),
+ emitReferenceConfidence());
// TODO -- must disable if we are doing NCT, or set the output type of ! presorted
if ( bamWriter != null ) {
@@ -997,7 +1005,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// output variant containing region.
result.addAll(referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(),
calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping,
- readLikelihoods, calledHaplotypes.getCalls()));
+ readLikelihoods, genotypingEngine.getPloidyModel(), genotypingEngine.getGenotypingModel(), calledHaplotypes.getCalls()));
// output right-flanking non-variant section:
if (trimmingResult.hasRightFlankingRegion())
result.addAll(referenceModelForNoVariation(trimmingResult.nonVariantRightFlankRegion(),false));
@@ -1110,7 +1118,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
final List haplotypes = Collections.singletonList(refHaplotype);
return referenceConfidenceModel.calculateRefConfidence(refHaplotype, haplotypes,
paddedLoc, region, createDummyStratifiedReadMap(refHaplotype, samplesList, region),
- Collections.emptyList());
+ genotypingEngine.getPloidyModel(), genotypingEngine.getGenotypingModel(), Collections.emptyList());
} else
return NO_CALLS;
}
@@ -1123,11 +1131,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
* @return a map from sample -> PerReadAlleleLikelihoodMap that maps each read to ref
*/
public static ReadLikelihoods createDummyStratifiedReadMap(final Haplotype refHaplotype,
- final List samples,
+ final SampleList samples,
final ActiveRegion region) {
- return new ReadLikelihoods<>(samples, Collections.singletonList(refHaplotype),
+ return new ReadLikelihoods<>(samples, new IndexedAlleleList<>(refHaplotype),
splitReadsBySample(samples, region.getReads()));
-
}
@@ -1235,18 +1242,15 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
return splitReadsBySample(samplesList, reads);
}
- public static Map> splitReadsBySample( final List samplesList, final Collection reads ) {
+ private static Map> splitReadsBySample( final SampleList samplesList, final Collection reads ) {
final Map> returnMap = new HashMap<>();
- for( final String sample : samplesList) {
- List readList = returnMap.get( sample );
- if( readList == null ) {
- readList = new ArrayList<>();
- returnMap.put(sample, readList);
- }
- }
- for( final GATKSAMRecord read : reads ) {
+ final int sampleCount = samplesList.sampleCount();
+ for (int i = 0; i < sampleCount; i++)
+ returnMap.put(samplesList.sampleAt(i), new ArrayList());
+
+ for( final GATKSAMRecord read : reads )
returnMap.get(read.getReadGroup().getSample()).add(read);
- }
+
return returnMap;
}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java
index 321c48166..97bb9efa8 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java
@@ -45,9 +45,9 @@
*/
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
+import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection;
import org.broadinstitute.gatk.utils.commandline.Advanced;
import org.broadinstitute.gatk.utils.commandline.Argument;
-import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection;
/**
* Set of arguments for the {@link HaplotypeCaller}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java
index 77c308cfc..76c8dc772 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java
@@ -48,8 +48,9 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
-import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
+import htsjdk.variant.variantcontext.*;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
+import org.broadinstitute.gatk.genotyping.*;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingEngine;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingOutputMode;
@@ -64,7 +65,6 @@ import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.haplotype.MergeVariantsAcrossHaplotypes;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
-import htsjdk.variant.variantcontext.*;
import java.util.*;
@@ -80,28 +80,33 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine sampleNames) {
- super(toolkit,configuration,sampleNames);
- doPhysicalPhasing = true;
+ public HaplotypeCallerGenotypingEngine(final HaplotypeCallerArgumentCollection configuration, final SampleList samples, final GenomeLocParser genomeLocParser) {
+ this(configuration,samples,genomeLocParser,false);
}
-
/**
* Change the merge variant across haplotypes for this engine.
*
@@ -198,9 +203,9 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine alleleList = new ArrayList<>();
- alleleList.addAll(mergedVC.getAlleles());
- alleleList.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
- vcb.alleles(alleleList);
- mergedVC = vcb.make();
- }
+ if (emitReferenceConfidence)
+ mergedVC = addNonRefSymbolicAllele(mergedVC);
final Map mergeMap = new LinkedHashMap<>();
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
@@ -264,7 +265,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine originalList = mergedVC.getAlleles();
+ final List alleleList = new ArrayList<>(originalList.size() + 1);
+ alleleList.addAll(mergedVC.getAlleles());
+ alleleList.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
+ vcb.alleles(alleleList);
+ return vcb.make();
+ }
+
// Builds the read-likelihoods collection to use for annotation considering user arguments and the collection
// used for genotyping.
private ReadLikelihoods prepareReadAlleleLikelihoodsForAnnotation(
@@ -653,22 +664,14 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine readLikelihoods, final VariantContext mergedVC ) {
- final GenotypesContext genotypes = GenotypesContext.create(readLikelihoods.sampleCount());
- // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
- for (final String sample : readLikelihoods.samples() ) {
- final int numHaplotypes = mergedVC.getAlleles().size();
- final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
- final double[][] haplotypeLikelihoodMatrix = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, readLikelihoods, mergedVC.getAlleles(), true);
- int glIndex = 0;
- for( int iii = 0; iii < numHaplotypes; iii++ ) {
- for( int jjj = 0; jjj <= iii; jjj++ ) {
- genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
- }
- }
- logger.debug(" Likelihoods for sample " + sample + " : " + Arrays.toString(genotypeLikelihoods));
- genotypes.add(new GenotypeBuilder(sample).alleles(NO_CALL).PL(genotypeLikelihoods).make());
- }
- return genotypes;
+ final List vcAlleles = mergedVC.getAlleles();
+ final AlleleList alleleList = readLikelihoods.alleleCount() == vcAlleles.size() ? readLikelihoods : new IndexedAlleleList<>(vcAlleles);
+ final GenotypingLikelihoods likelihoods = genotypingModel.calculateLikelihoods(alleleList,new GenotypingData<>(ploidyModel,readLikelihoods));
+ final int sampleCount = samples.sampleCount();
+ final GenotypesContext result = GenotypesContext.create(sampleCount);
+ for (int s = 0; s < sampleCount; s++)
+ result.add(new GenotypeBuilder(samples.sampleAt(s)).alleles(NO_CALL).PL(likelihoods.sampleLikelihoods(s).getAsPLs()).make());
+ return result;
}
/**
@@ -779,4 +782,22 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine computeReadLikelihoods( final AssemblyResultSet assemblyResultSet, final List samples, final Map> perSampleReadList ) {
+ public ReadLikelihoods computeReadLikelihoods( final AssemblyResultSet assemblyResultSet, final SampleList samples, final Map> perSampleReadList ) {
- final List haplotypes = assemblyResultSet.getHaplotypeList();
+ final List haplotypeList = assemblyResultSet.getHaplotypeList();
+ final AlleleList haplotypes = new IndexedAlleleList<>(haplotypeList);
// configure the HMM
- initializePairHMM(haplotypes, perSampleReadList);
+ initializePairHMM(haplotypeList, perSampleReadList);
// Add likelihoods for each sample's reads to our result
final ReadLikelihoods result = new ReadLikelihoods<>(samples, haplotypes, perSampleReadList);
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java
index bd72a764d..aa7305bcf 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java
@@ -46,11 +46,14 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
+import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
+import org.broadinstitute.gatk.genotyping.AlleleList;
+import org.broadinstitute.gatk.genotyping.IndexedAlleleList;
+import org.broadinstitute.gatk.genotyping.SampleList;
import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
-import htsjdk.variant.variantcontext.Allele;
import java.util.HashMap;
import java.util.List;
@@ -63,17 +66,15 @@ import java.util.Random;
public class RandomLikelihoodCalculationEngine implements ReadLikelihoodCalculationEngine {
@Override
- public ReadLikelihoods computeReadLikelihoods(final AssemblyResultSet assemblyResultSet,
- final List samples,
+ public ReadLikelihoods computeReadLikelihoods(final AssemblyResultSet assemblyResultSet,
+ final SampleList samples,
final Map> reads) {
- final List haplotypes = assemblyResultSet.getHaplotypeList();
+ final AlleleList haplotypes = new IndexedAlleleList<>(assemblyResultSet.getHaplotypeList());
final ReadLikelihoods result = new ReadLikelihoods(samples, haplotypes, reads);
- final Map alleles = new HashMap<>(haplotypes.size());
- for (final Haplotype haplotype : haplotypes)
- alleles.put(haplotype,Allele.create(haplotype,false));
+ final Map alleles = new HashMap<>(haplotypes.alleleCount());
final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
- final int sampleCount = samples.size();
- final int alleleCount = alleles.size();
+ final int sampleCount = samples.sampleCount();
+ final int alleleCount = haplotypes.alleleCount();
for (int i = 0; i < sampleCount; i++) {
final List sampleReads = result.sampleReads(i);
final int readCount = sampleReads.size();
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java
index 6119cba1c..ccc0c18b8 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java
@@ -46,6 +46,7 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
+import org.broadinstitute.gatk.genotyping.SampleList;
import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
@@ -81,6 +82,7 @@ public interface ReadLikelihoodCalculationEngine {
* active region assembly process.
*
* @param assemblyResultSet the input assembly results.
+ * @param samples the list of targeted samples.
* @param perSampleReadList the input read sets stratified per sample.
*
* @throws NullPointerException if either parameter is {@code null}.
@@ -88,7 +90,7 @@ public interface ReadLikelihoodCalculationEngine {
* @return never {@code null}, and with at least one entry for input sample (keys in {@code perSampleReadList}.
* The value maps can be potentially empty though.
*/
- public ReadLikelihoods computeReadLikelihoods(AssemblyResultSet assemblyResultSet, List samples,
+ public ReadLikelihoods computeReadLikelihoods(AssemblyResultSet assemblyResultSet, SampleList samples,
Map> perSampleReadList);
public void close();
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RefVsAnyResult.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RefVsAnyResult.java
index 8726c9365..bb3b64f93 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RefVsAnyResult.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RefVsAnyResult.java
@@ -46,6 +46,8 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
+import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants;
+
/**
* Holds information about a genotype call of a single sample reference vs. any non-ref event
*
@@ -58,7 +60,7 @@ final class RefVsAnyResult {
/**
* The genotype likelihoods for ref/ref ref/non-ref non-ref/non-ref
*/
- final double[] genotypeLikelihoods = new double[3];
+ final double[] genotypeLikelihoods;
/**
* AD field value for ref / non-ref
@@ -74,7 +76,31 @@ final class RefVsAnyResult {
* Cap the het and hom var likelihood values by the hom ref likelihood.
*/
protected void capByHomRefLikelihood() {
- genotypeLikelihoods[1] = Math.min(genotypeLikelihoods[0], genotypeLikelihoods[1]);
- genotypeLikelihoods[2] = Math.min(genotypeLikelihoods[0], genotypeLikelihoods[2]);
+ final int likelihoodCount = genotypeLikelihoods.length;
+ for (int i = 1; i < likelihoodCount; i++)
+ genotypeLikelihoods[i] = Math.min(genotypeLikelihoods[0],genotypeLikelihoods[i]);
}
+
+ /**
+ * Creates a new ref-vs-alt result assuming 3 as the number of genotype likelihoods (human ploidy).
+ */
+ @Deprecated
+ public RefVsAnyResult() {
+ genotypeLikelihoods =
+ new double[(HomoSapiensConstants.DEFAULT_PLOIDY * (HomoSapiensConstants.DEFAULT_PLOIDY + 1)) >> 1];
+ }
+
+ /**
+ * Creates a new ref-vs-alt result indicating the genotype likelihood vector capacity.
+ * @param likelihoodCapacity the required capacity of the likelihood array, should match the possible number of
+ * genotypes given the number of alleles (always 2), ploidy (arbitrary) less the genotyping
+ * model nonsense genotype count, if applicable.
+ * @throws IllegalArgumentException if {@code likelihoodCapacity} is negative.
+ */
+ public RefVsAnyResult(final int likelihoodCapacity) {
+ if (likelihoodCapacity < 0)
+ throw new IllegalArgumentException("likelihood capacity is negative");
+ genotypeLikelihoods = new double[likelihoodCapacity];
+ }
+
}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java
index dc9512eee..116b27009 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java
@@ -51,11 +51,13 @@ import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFSimpleHeaderLine;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
+import org.broadinstitute.gatk.genotyping.*;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
+import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState;
@@ -85,8 +87,8 @@ public class ReferenceConfidenceModel {
public final static String ALTERNATE_ALLELE_STRING = "ALT"; // arbitrary alternate allele
private final GenomeLocParser genomeLocParser;
- private final Set samples;
- private final SAMFileHeader header; // TODO -- really shouldn't depend on this
+
+ private final SampleList samples;
private final int indelInformativeDepthIndelSize;
private final static boolean WRITE_DEBUGGING_BAM = false;
@@ -103,18 +105,17 @@ public class ReferenceConfidenceModel {
* @param indelInformativeDepthIndelSize the max size of indels to consider when calculating indel informative depths
*/
public ReferenceConfidenceModel(final GenomeLocParser genomeLocParser,
- final Set samples,
+ final SampleList samples,
final SAMFileHeader header,
final int indelInformativeDepthIndelSize) {
if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null");
if ( samples == null ) throw new IllegalArgumentException("samples cannot be null");
- if ( samples.isEmpty() ) throw new IllegalArgumentException("samples cannot be empty");
+ if ( samples.sampleCount() == 0) throw new IllegalArgumentException("samples cannot be empty");
if ( header == null ) throw new IllegalArgumentException("header cannot be empty");
if ( indelInformativeDepthIndelSize < 0) throw new IllegalArgumentException("indelInformativeDepthIndelSize must be >= 1 but got " + indelInformativeDepthIndelSize);
this.genomeLocParser = genomeLocParser;
this.samples = samples;
- this.header = header;
this.indelInformativeDepthIndelSize = indelInformativeDepthIndelSize;
if ( WRITE_DEBUGGING_BAM ) {
@@ -124,8 +125,6 @@ public class ReferenceConfidenceModel {
} else {
debuggingWriter = null;
}
-
- initializeIndelPLCache();
}
/**
@@ -151,7 +150,7 @@ public class ReferenceConfidenceModel {
/**
* Calculate the reference confidence for a single sample given the its read data
*
- * Returns a list of variant contexts, one for each position in the activeregion.getLoc(), each containing
+ * Returns a list of variant contexts, one for each position in the {@code activeRegion.getLoc()}, each containing
* detailed information about the certainty that the sample is hom-ref for each base in the region.
*
*
@@ -162,6 +161,8 @@ public class ReferenceConfidenceModel {
* @param paddedReferenceLoc the location of refHaplotype (which might be larger than activeRegion.getLoc())
* @param activeRegion the active region we want to get the reference confidence over
* @param readLikelihoods a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes
+     * @param ploidyModel indicates the ploidy of each sample in {@code readLikelihoods}.
+ * @param model genotyping model.
* @param variantCalls calls made in this region. The return result will contain any variant call in this list in the
* correct order by genomic position, and any variant in this list will stop us emitting a ref confidence
* under any position it covers (for snps and insertions that is 1 bp, but for deletions its the entire ref span)
@@ -173,6 +174,8 @@ public class ReferenceConfidenceModel {
final GenomeLoc paddedReferenceLoc,
final ActiveRegion activeRegion,
final ReadLikelihoods readLikelihoods,
+ final PloidyModel ploidyModel,
+ final GenotypingModel model,
final List variantCalls) {
if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null");
if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null");
@@ -182,12 +185,15 @@ public class ReferenceConfidenceModel {
if ( readLikelihoods == null ) throw new IllegalArgumentException("readLikelihoods cannot be null");
if ( readLikelihoods.sampleCount() != 1 ) throw new IllegalArgumentException("readLikelihoods must contain exactly one sample but it contained " + readLikelihoods.sampleCount());
if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion location size " + activeRegion.getLocation().size() + " are different");
+ if ( ploidyModel == null) throw new IllegalArgumentException("the ploidy model cannot be null");
+ if ( model == null) throw new IllegalArgumentException("the genotyping model cannot be null");
+ final int ploidy = ploidyModel.samplePloidy(0); // the first sample = the only sample in reference-confidence mode.
final GenomeLoc refSpan = activeRegion.getLocation();
final List refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, activeRegion, refSpan, readLikelihoods);
final byte[] ref = refHaplotype.getBases();
final List results = new ArrayList<>(refSpan.size());
- final String sampleName = readLikelihoods.sample(0);
+ final String sampleName = readLikelihoods.sampleAt(0);
final int globalRefOffset = refSpan.getStart() - activeRegion.getExtendedLoc().getStart();
for ( final ReadBackedPileup pileup : refPileups ) {
@@ -201,20 +207,20 @@ public class ReferenceConfidenceModel {
// otherwise emit a reference confidence variant context
final int refOffset = offset + globalRefOffset;
final byte refBase = ref[refOffset];
- final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, (byte)6, null);
+ final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(sampleName,ploidy,model,pileup, refBase, (byte)6, null);
homRefCalc.capByHomRefLikelihood();
final Allele refAllele = Allele.create(refBase, true);
final List refSiteAlleles = Arrays.asList(refAllele, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final VariantContextBuilder vcb = new VariantContextBuilder("HC", curPos.getContig(), curPos.getStart(), curPos.getStart(), refSiteAlleles);
- final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Arrays.asList(refAllele, refAllele));
+ final GenotypeBuilder gb = new GenotypeBuilder(sampleName, GATKVariantContextUtils.homozygousAlleleList(refAllele, ploidy));
gb.AD(homRefCalc.AD_Ref_Any);
gb.DP(homRefCalc.getDP());
// genotype likelihood calculation
final GenotypeLikelihoods snpGLs = GenotypeLikelihoods.fromLog10Likelihoods(homRefCalc.genotypeLikelihoods);
final int nIndelInformativeReads = calcNIndelInformativeReads(pileup, refOffset, ref, indelInformativeDepthIndelSize);
- final GenotypeLikelihoods indelGLs = getIndelPLs(nIndelInformativeReads);
+ final GenotypeLikelihoods indelGLs = getIndelPLs(ploidy,nIndelInformativeReads);
// now that we have the SNP and indel GLs, we take the one with the least confidence,
// as this is the most conservative estimate of our certainty that we are hom-ref.
@@ -251,23 +257,51 @@ public class ReferenceConfidenceModel {
* Get indel PLs corresponding to seeing N nIndelInformativeReads at this site
*
* @param nInformativeReads the number of reads that inform us about being ref without an indel at this site
+ * @param ploidy the requested ploidy.
* @return non-null GenotypeLikelihoods given N
*/
- protected final GenotypeLikelihoods getIndelPLs(final int nInformativeReads) {
- return indelPLCache[nInformativeReads > MAX_N_INDEL_INFORMATIVE_READS ? MAX_N_INDEL_INFORMATIVE_READS : nInformativeReads];
+ protected final GenotypeLikelihoods getIndelPLs(final int ploidy, final int nInformativeReads) {
+ if (ploidy > MAX_N_INDEL_PLOIDY)
+            throw new IllegalArgumentException("you have hit a current limitation of the GVCF output model that cannot handle ploidies larger than " + MAX_N_INDEL_PLOIDY + ", please let the GATK team know about it: " + ploidy);
+ return indelPLCache(ploidy, nInformativeReads > MAX_N_INDEL_INFORMATIVE_READS ? MAX_N_INDEL_INFORMATIVE_READS : nInformativeReads);
}
protected static final int MAX_N_INDEL_INFORMATIVE_READS = 40; // more than this is overkill because GQs are capped at 99 anyway
- private static final GenotypeLikelihoods[] indelPLCache = new GenotypeLikelihoods[MAX_N_INDEL_INFORMATIVE_READS + 1];
+ private static final int MAX_N_INDEL_PLOIDY = 20;
+    private static final GenotypeLikelihoods[][] indelPLCache = new GenotypeLikelihoods[MAX_N_INDEL_PLOIDY + 1][]; // +1: getIndelPLs accepts ploidy == MAX_N_INDEL_PLOIDY, so that index must be addressable
private static final double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp
- private void initializeIndelPLCache() {
- for( int nInformativeReads = 0; nInformativeReads <= MAX_N_INDEL_INFORMATIVE_READS; nInformativeReads++ ) {
- final double homRef = 0.0;
- final double het = MathUtils.LOG_ONE_HALF * nInformativeReads;
- final double homVar = INDEL_ERROR_RATE * nInformativeReads;
- indelPLCache[nInformativeReads] = GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar});
+ private final GenotypeLikelihoods indelPLCache(final int ploidy, final int nInformativeReads) {
+ GenotypeLikelihoods[] indelPLCacheByPloidy = indelPLCache[ploidy];
+ if (indelPLCacheByPloidy == null)
+ return initializeIndelPLCache(ploidy)[nInformativeReads];
+ else
+ return indelPLCacheByPloidy[nInformativeReads];
+ }
+
+ private synchronized GenotypeLikelihoods[] initializeIndelPLCache(final int ploidy) {
+ // Double-check whether another thread has done the initialization.
+ if (indelPLCache[ploidy] != null)
+ return indelPLCache[ploidy];
+
+ final double denominator = - MathUtils.Log10Cache.get(ploidy);
+ final GenotypeLikelihoods[] result = new GenotypeLikelihoods[MAX_N_INDEL_INFORMATIVE_READS + 1];
+ result[0] = GenotypeLikelihoods.fromLog10Likelihoods(new double[ploidy + 1]);
+ for( int nInformativeReads = 1; nInformativeReads <= MAX_N_INDEL_INFORMATIVE_READS; nInformativeReads++ ) {
+ final byte indelQual = (byte) Math.round((INDEL_ERROR_RATE * -10));
+ final double refLikelihood = QualityUtils.qualToProbLog10(indelQual);
+ final double altLikelihood = QualityUtils.qualToErrorProbLog10(indelQual);
+ double[] PLs = new double[ploidy + 1];
+ PLs[0] = nInformativeReads * refLikelihood;
+ for (int altCount = 1; altCount <= ploidy; altCount++) {
+ final double refLikelihoodAccum = refLikelihood + MathUtils.Log10Cache.get(ploidy - altCount);
+ final double altLikelihoodAccum = altLikelihood + MathUtils.Log10Cache.get(altCount);
+ PLs[altCount] = nInformativeReads * (MathUtils.approximateLog10SumLog10(refLikelihoodAccum ,altLikelihoodAccum) + denominator);
+ }
+ result[nInformativeReads] = GenotypeLikelihoods.fromLog10Likelihoods(PLs);
}
+ indelPLCache[ploidy] = result;
+ return result;
}
/**
@@ -279,6 +313,7 @@ public class ReferenceConfidenceModel {
* @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips
* @return a RefVsAnyResult genotype call
*/
+ @Deprecated
public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) {
final RefVsAnyResult result = new RefVsAnyResult();
@@ -305,6 +340,73 @@ public class ReferenceConfidenceModel {
return result;
}
+ /**
+ * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt
+ *
+ * @param sampleName target sample name.
+ * @param ploidy target sample ploidy.
+ * @param genotypingModel model to calculate likelihoods and genotypes.
+ * @param pileup the read backed pileup containing the data we want to evaluate
+ * @param refBase the reference base at this pileup position
+ * @param minBaseQual the min base quality for a read in the pileup at the pileup position to be included in the calculation
+ * @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips
+ * @return a RefVsAnyResult genotype call.
+ */
+ public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final String sampleName, final int ploidy,
+ final GenotypingModel genotypingModel,
+ final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) {
+ final AlleleList alleleList = new IndexedAlleleList<>(Allele.create(refBase,true),GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
+        // Note that the sample name is rather irrelevant, as this information is never used; it just needs to be the same in both lines below.
+
+ final int maximumReadCount = pileup.getReads().size();
+
+ final List reads = new ArrayList<>(maximumReadCount);
+ final double[][] likelihoods = new double[2][maximumReadCount];
+ final int[] adCounts = new int[2];
+ int nextIndex = 0;
+ for (final PileupElement p : pileup) {
+ final byte qual = p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual();
+ if (!p.isDeletion() && qual <= minBaseQual)
+ continue;
+ final GATKSAMRecord read = p.getRead();
+ reads.add(read);
+ final boolean isAlt = p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart()
+ || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip();
+ final int bestAllele;
+ final int worstAllele;
+ if (isAlt) {
+ bestAllele = 1;
+ worstAllele = 0;
+ } else {
+ bestAllele = 0;
+ worstAllele = 1;
+ }
+
+ likelihoods[bestAllele][nextIndex] = QualityUtils.qualToProbLog10(qual);
+ likelihoods[worstAllele][nextIndex++] = QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD;
+ adCounts[bestAllele]++;
+ if (isAlt && hqSoftClips != null && p.isNextToSoftClip())
+ hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(read, (byte) 28));
+ }
+
+ final Map> sampleToReads = Collections.singletonMap(sampleName,reads);
+ final ReadLikelihoods readLikelihoods = new ReadLikelihoods<>(new IndexedSampleList(sampleName),alleleList,sampleToReads);
+ final ReadLikelihoods.Matrix sampleLikelihoods = readLikelihoods.sampleMatrix(0);
+ final int readCount = sampleLikelihoods.readCount();
+ for (int i = 0; i < readCount; i++) {
+ sampleLikelihoods.set(0,i,likelihoods[0][i]);
+ sampleLikelihoods.set(1,i,likelihoods[1][i]);
+ }
+
+ final PloidyModel ploidyModel = new HomogeneousPloidyModel(new IndexedSampleList(sampleName),ploidy);
+ final GenotypingLikelihoods genotypingLikelihoods = genotypingModel.calculateLikelihoods(alleleList, new GenotypingData<>(ploidyModel, readLikelihoods));
+ final double[] genotypeLikelihoodArray = genotypingLikelihoods.sampleLikelihoods(0).getAsVector();
+ final RefVsAnyResult result = new RefVsAnyResult(genotypeLikelihoodArray.length);
+ System.arraycopy(genotypeLikelihoodArray,0,result.genotypeLikelihoods,0,genotypeLikelihoodArray.length);
+ System.arraycopy(adCounts,0,result.AD_Ref_Any,0,2);
+ return result;
+ }
+
/**
* Get a list of pileups that span the entire active region span, in order, one for each position
*/
@@ -330,7 +432,7 @@ public class ReferenceConfidenceModel {
debuggingWriter.addAlignment(read);
final LocusIteratorByState libs = new LocusIteratorByState(reads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING,
- true, genomeLocParser, samples, false);
+ true, genomeLocParser, SampleListUtils.asSet(samples), false);
final List pileups = new LinkedList<>();
final int startPos = activeRegionSpan.getStart();
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java
index 6f71868bb..a798c760f 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java
@@ -49,6 +49,9 @@ package org.broadinstitute.gatk.tools.walkers.indels;
import com.google.java.contract.Ensures;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
+import org.broadinstitute.gatk.genotyping.AlleleList;
+import org.broadinstitute.gatk.genotyping.IndexedAlleleList;
+import org.broadinstitute.gatk.genotyping.IndexedSampleList;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.clipping.ReadClipper;
import org.broadinstitute.gatk.utils.exceptions.UserException;
@@ -444,10 +447,10 @@ public class PairHMMIndelErrorModel {
// Apparently more than one allele can map to the same haplotype after trimming
final Set distinctHaplotypesSet = new LinkedHashSet<>(trimmedHaplotypeMap.values());
- final List distinctHaplotypesList = Arrays.asList(distinctHaplotypesSet.toArray(new Haplotype[distinctHaplotypesSet.size()]));
+ final AlleleList distinctHaplotypesList = new IndexedAlleleList<>(distinctHaplotypesSet.toArray(new Haplotype[distinctHaplotypesSet.size()]));
// Get the likelihoods for our clipped read against each of our trimmed haplotypes.
final ReadLikelihoods rl = new ReadLikelihoods<>(
- Collections.singletonList("DUMMY_SAMPLE"),distinctHaplotypesList,Collections.singletonMap("DUMMY_SAMPLE",Collections.singletonList(processedRead)));
+ new IndexedSampleList(Collections.singletonList("DUMMY_SAMPLE")),distinctHaplotypesList,Collections.singletonMap("DUMMY_SAMPLE",Collections.singletonList(processedRead)));
final ReadLikelihoods.Matrix dummySampleLikelihoods = rl.sampleMatrix(0);
pairHMM.computeLikelihoods(rl.sampleMatrix(0), Collections.singletonList(processedRead), readGCPArrayMap);
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java
index 432bfc770..f78519f3f 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java
@@ -46,6 +46,7 @@
package org.broadinstitute.gatk.tools.walkers.validation;
+import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.walkers.*;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.engine.CommandLineGATK;
@@ -349,13 +350,14 @@ public class GenotypeAndValidate extends RodWalker
         if (emitConf >= 0) uac.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = emitConf;
if (callConf >= 0) uac.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = callConf;
+ final GenomeAnalysisEngine toolkit = getToolkit();
uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
- snpEngine = new UnifiedGenotypingEngine(getToolkit(), uac);
+ snpEngine = new UnifiedGenotypingEngine(uac,toolkit);
// Adding the INDEL calling arguments for UG
UnifiedArgumentCollection uac_indel = uac.clone();
uac_indel.GLmodel = GenotypeLikelihoodsCalculationModel.Model.INDEL;
- indelEngine = new UnifiedGenotypingEngine(getToolkit(), uac_indel);
+ indelEngine = new UnifiedGenotypingEngine(uac_indel,toolkit);
// make sure we have callConf set to the threshold set by the UAC so we can use it later.
callConf = uac.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING;
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java
index ebcb82444..9d18e9981 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java
@@ -46,8 +46,12 @@
package org.broadinstitute.gatk.tools.walkers.variantutils;
+import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingEngine;
+import org.broadinstitute.gatk.genotyping.IndexedSampleList;
+import org.broadinstitute.gatk.genotyping.SampleList;
+import org.broadinstitute.gatk.genotyping.SampleListUtils;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
@@ -111,6 +115,7 @@ import java.util.*;
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=-10,stop=10))
+@SuppressWarnings("unused")
public class GenotypeGVCFs extends RodWalker implements AnnotatorCompatible, TreeReducible {
/**
@@ -159,12 +164,13 @@ public class GenotypeGVCFs extends RodWalker variantCollection : variantCollections )
variants.addAll(variantCollection.getRodBindings());
- final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), variants);
- final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
+ final GenomeAnalysisEngine toolkit = getToolkit();
+ final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(toolkit, variants);
+ final SampleList samples = new IndexedSampleList(SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));
// create the genotyping engine
- genotypingEngine = new UnifiedGenotypingEngine(getToolkit(), createUAC(), samples);
+ genotypingEngine = new UnifiedGenotypingEngine(createUAC(), samples, toolkit.getGenomeLocParser(), toolkit.getArguments().BAQMode);
// create the annotation engine
- annotationEngine = new VariantAnnotatorEngine(Arrays.asList("none"), annotationsToUse, Collections.emptyList(), this, getToolkit());
+ annotationEngine = new VariantAnnotatorEngine(Arrays.asList("none"), annotationsToUse, Collections.emptyList(), this, toolkit);
// take care of the VCF headers
final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
@@ -179,7 +185,8 @@ public class GenotypeGVCFs extends RodWalker sampleNameSet = SampleListUtils.asSet(samples);
+ final VCFHeader vcfHeader = new VCFHeader(headerLines, sampleNameSet);
vcfWriter.writeHeader(vcfHeader);
}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java
index 50e8bd416..e6cc614e6 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java
@@ -46,6 +46,10 @@
package org.broadinstitute.gatk.tools.walkers.variantutils;
+import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
+import org.broadinstitute.gatk.genotyping.IndexedSampleList;
+import org.broadinstitute.gatk.genotyping.SampleList;
+import org.broadinstitute.gatk.genotyping.SampleListUtils;
import org.broadinstitute.gatk.utils.commandline.ArgumentCollection;
import org.broadinstitute.gatk.utils.commandline.Output;
import org.broadinstitute.gatk.engine.CommandLineGATK;
@@ -120,14 +124,18 @@ public class RegenotypeVariants extends RodWalker implements T
UAC.genotypingOutputMode = GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES;
String trackName = variantCollection.variants.getName();
- Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName));
- UG_engine = new UnifiedGenotypingEngine(getToolkit(), UAC, samples);
+
+ final GenomeAnalysisEngine toolkit = getToolkit();
+ final SampleList samples =
+ new IndexedSampleList(SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)));
+ final Set sampleNameSet = SampleListUtils.asSet(samples);
+ UG_engine = new UnifiedGenotypingEngine(UAC, samples,toolkit.getGenomeLocParser(),toolkit.getArguments().BAQMode);
final Set hInfo = new HashSet();
hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(trackName)));
hInfo.addAll(UnifiedGenotyper.getHeaderInfo(UAC, null, null));
- vcfWriter.writeHeader(new VCFHeader(hInfo, samples));
+ vcfWriter.writeHeader(new VCFHeader(hInfo, sampleNameSet));
}
/**
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java
index e55bf4fa0..5659e76a0 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java
@@ -46,15 +46,14 @@
package org.broadinstitute.gatk.utils.gvcf;
-import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.*;
+import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
-import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
@@ -70,9 +69,7 @@ public class GVCFWriter implements VariantContextWriter {
//
// static VCF field names
//
- protected final static String BLOCK_SIZE_INFO_FIELD = "BLOCK_SIZE";
protected final static String MIN_DP_FORMAT_FIELD = "MIN_DP";
- protected final static String MIN_GQ_FORMAT_FIELD = "MIN_GQ";
//
// Final fields initialized in constructor
@@ -87,6 +84,7 @@ public class GVCFWriter implements VariantContextWriter {
String contigOfNextAvailableStart = null;
private String sampleName = null;
private HomRefBlock currentBlock = null;
+ private final int defaultPloidy;
/**
* Is the proposed GQ partitions well-formed?
@@ -94,7 +92,7 @@ public class GVCFWriter implements VariantContextWriter {
* @param GQPartitions proposed GQ partitions
* @return a non-null string if something is wrong (string explains issue)
*/
- protected static List parsePartitions(final List GQPartitions) {
+ protected static List parsePartitions(final List GQPartitions, final int defaultPloidy) {
if ( GQPartitions == null ) throw new IllegalArgumentException("GQpartitions cannot be null");
if ( GQPartitions.isEmpty() ) throw new IllegalArgumentException("GQpartitions cannot be empty");
@@ -104,10 +102,10 @@ public class GVCFWriter implements VariantContextWriter {
if ( value == null ) throw new IllegalArgumentException("GQPartitions contains a null integer");
if ( value < lastThreshold ) throw new IllegalArgumentException("GQPartitions is out of order. Last is " + lastThreshold + " but next is " + value);
if ( value == lastThreshold ) throw new IllegalArgumentException("GQPartitions is equal elements: Last is " + lastThreshold + " but next is " + value);
- result.add(new HomRefBlock(lastThreshold, value));
+ result.add(new HomRefBlock(lastThreshold, value,defaultPloidy));
lastThreshold = value;
}
- result.add(new HomRefBlock(lastThreshold, Integer.MAX_VALUE));
+ result.add(new HomRefBlock(lastThreshold, Integer.MAX_VALUE,defaultPloidy));
return result;
}
@@ -128,11 +126,13 @@ public class GVCFWriter implements VariantContextWriter {
*
* @param underlyingWriter the ultimate destination of the GVCF records
* @param GQPartitions a well-formed list of GQ partitions
+     * @param defaultPloidy the ploidy to assume for input variant contexts that lack one.
*/
- public GVCFWriter(final VariantContextWriter underlyingWriter, final List GQPartitions) {
+ public GVCFWriter(final VariantContextWriter underlyingWriter, final List GQPartitions, final int defaultPloidy) {
if ( underlyingWriter == null ) throw new IllegalArgumentException("underlyingWriter cannot be null");
this.underlyingWriter = underlyingWriter;
- this.GQPartitions = parsePartitions(GQPartitions);
+ this.GQPartitions = parsePartitions(GQPartitions,defaultPloidy);
+ this.defaultPloidy = defaultPloidy;
}
/**
@@ -148,10 +148,6 @@ public class GVCFWriter implements VariantContextWriter {
header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
header.addMetaDataLine(new VCFFormatHeaderLine(MIN_DP_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block"));
- // These annotations are no longer standard
- //header.addMetaDataLine(new VCFInfoHeaderLine(BLOCK_SIZE_INFO_FIELD, 1, VCFHeaderLineType.Integer, "Size of the homozygous reference GVCF block"));
- //header.addMetaDataLine(new VCFFormatHeaderLine(MIN_GQ_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum GQ observed within the GVCF block"));
-
for ( final HomRefBlock partition : GQPartitions ) {
header.addMetaDataLine(partition.toVCFHeaderLine());
}
@@ -188,27 +184,30 @@ public class GVCFWriter implements VariantContextWriter {
* @return a VariantContext to be emitted, or null if non is appropriate
*/
protected VariantContext addHomRefSite(final VariantContext vc, final Genotype g) {
+
if ( nextAvailableStart != -1 ) {
// don't create blocks while the hom-ref site falls before nextAvailableStart (for deletions)
- if ( vc.getStart() <= nextAvailableStart && vc.getChr().equals(contigOfNextAvailableStart) ) {
+ if ( vc.getStart() <= nextAvailableStart && vc.getChr().equals(contigOfNextAvailableStart) )
return null;
- }
// otherwise, reset to non-relevant
nextAvailableStart = -1;
contigOfNextAvailableStart = null;
}
- if ( currentBlock == null ) {
- currentBlock = createNewBlock(vc, g);
- return null;
- } else if ( currentBlock.withinBounds(g.getGQ()) ) {
+ final VariantContext result;
+ if (genotypeCanBeMergedInCurrentBlock(g)) {
currentBlock.add(vc.getStart(), g);
- return null;
+ result = null;
} else {
- final VariantContext result = blockToVCF(currentBlock);
+ result = blockToVCF(currentBlock);
currentBlock = createNewBlock(vc, g);
- return result;
}
+ return result;
+ }
+
+ private boolean genotypeCanBeMergedInCurrentBlock(final Genotype g) {
+ return currentBlock != null && currentBlock.withinBounds(g.getGQ()) && currentBlock.getPloidy() == g.getPloidy()
+ && (currentBlock.getMinPLs() == null || !g.hasPL() || (currentBlock.getMinPLs().length == g.getPL().length));
}
/**
@@ -226,21 +225,20 @@ public class GVCFWriter implements VariantContextWriter {
* Convert a HomRefBlock into a VariantContext
*
* @param block the block to convert
- * @return a VariantContext representing the gVCF encoding for this block
+ * @return a VariantContext representing the gVCF encoding for this block.
+ * It will return {@code null} if input {@code block} is {@code null}, indicating that there
+ * is no variant-context to be output into the VCF.
*/
private VariantContext blockToVCF(final HomRefBlock block) {
- if ( block == null ) throw new IllegalArgumentException("block cannot be null");
+ if ( block == null ) return null;
final VariantContextBuilder vcb = new VariantContextBuilder(block.getStartingVC());
vcb.attributes(new HashMap(2)); // clear the attributes
vcb.stop(block.getStop());
vcb.attribute(VCFConstants.END_KEY, block.getStop());
- // This annotation is no longer standard
- //vcb.attribute(BLOCK_SIZE_INFO_FIELD, block.getSize());
-
// create the single Genotype with GQ and DP annotations
- final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Collections.nCopies(2, block.getRef()));
+ final GenotypeBuilder gb = new GenotypeBuilder(sampleName, GATKVariantContextUtils.homozygousAlleleList(block.getRef(),block.getPloidy()));
gb.noAD().noPL().noAttributes(); // clear all attributes
gb.GQ(block.getMedianGQ());
gb.DP(block.getMedianDP());
@@ -269,10 +267,12 @@ public class GVCFWriter implements VariantContextWriter {
break;
}
}
- if ( partition == null ) throw new IllegalStateException("GQ " + g + " from " + vc + " didn't fit into any partition " + partition);
+
+ if ( partition == null )
+ throw new IllegalStateException("GQ " + g + " from " + vc + " didn't fit into any partition");
// create the block, add g to it, and return it for use
- final HomRefBlock block = new HomRefBlock(vc, partition.getGQLowerBound(), partition.getGQUpperBound());
+ final HomRefBlock block = new HomRefBlock(vc, partition.getGQLowerBound(), partition.getGQUpperBound(), defaultPloidy);
block.add(vc.getStart(), g);
return block;
}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlock.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlock.java
index 179217b91..200eb1366 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlock.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlock.java
@@ -46,11 +46,11 @@
package org.broadinstitute.gatk.utils.gvcf;
-import org.broadinstitute.gatk.utils.MathUtils;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeaderLine;
+import org.broadinstitute.gatk.utils.MathUtils;
import java.util.ArrayList;
import java.util.List;
@@ -75,6 +75,7 @@ final class HomRefBlock {
final private List GQs = new ArrayList<>(100);
final private List DPs = new ArrayList<>(100);
private final Allele ref;
+ private final int ploidy;
/**
* Create a new HomRefBlock
@@ -83,7 +84,7 @@ final class HomRefBlock {
* @param minGQ the minGQ (inclusive) to use in this band
* @param maxGQ the maxGQ (exclusive) to use in this band
*/
- public HomRefBlock(final VariantContext startingVC, int minGQ, int maxGQ) {
+ public HomRefBlock(final VariantContext startingVC, final int minGQ, final int maxGQ, final int defaultPloidy) {
if ( startingVC == null ) throw new IllegalArgumentException("startingVC cannot be null");
if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ);
@@ -92,6 +93,7 @@ final class HomRefBlock {
this.ref = startingVC.getReference();
this.minGQ = minGQ;
this.maxGQ = maxGQ;
+ this.ploidy = startingVC.getMaxPloidy(defaultPloidy);
}
/**
@@ -100,7 +102,7 @@ final class HomRefBlock {
* @param minGQ the minGQ (inclusive) to use in this band
* @param maxGQ the maxGQ (exclusive) to use in this band
*/
- public HomRefBlock(int minGQ, int maxGQ) {
+ public HomRefBlock(final int minGQ, final int maxGQ, final int ploidy) {
if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ);
this.startingVC = null;
@@ -108,6 +110,7 @@ final class HomRefBlock {
this.ref = null;
this.minGQ = minGQ;
this.maxGQ = maxGQ;
+ this.ploidy = ploidy;
}
/**
@@ -119,19 +122,18 @@ final class HomRefBlock {
if ( ! g.hasGQ() ) throw new IllegalArgumentException("g must have GQ field");
if ( ! g.hasPL() ) throw new IllegalArgumentException("g must have PL field");
if ( pos != stop + 1 ) throw new IllegalArgumentException("adding genotype at pos " + pos + " isn't contiguous with previous stop " + stop);
+ if ( g.getPloidy() != ploidy)
+ throw new IllegalArgumentException("cannot add a genotype with a different ploidy: " + g.getPloidy() + " != " + ploidy);
- if( minPLs == null ) { // if the minPLs vector has not been set yet, create it here by copying the provided genotype's PLs
+ if( minPLs == null )
+ minPLs = g.getPL();
+ else { // otherwise take the min with the provided genotype's PLs
final int[] PL = g.getPL();
- if( PL.length == 3 ) {
- minPLs = PL.clone();
- }
- } else { // otherwise take the min with the provided genotype's PLs
- final int[] PL = g.getPL();
- if( PL.length == 3 ) {
- minPLs[0] = Math.min(minPLs[0], PL[0]);
- minPLs[1] = Math.min(minPLs[1], PL[1]);
- minPLs[2] = Math.min(minPLs[2], PL[2]);
- }
+ if (PL.length != minPLs.length)
+ throw new IllegalStateException("trying to merge different PL array sizes: " + PL.length + " != " + minPLs.length);
+ for (int i = 0; i < PL.length; i++)
+ if (minPLs[i] > PL[i])
+ minPLs[i] = PL[i];
}
stop = pos;
GQs.add(Math.min(g.getGQ(), 99)); // cap the GQs by the max. of 99 emission
@@ -182,4 +184,12 @@ final class HomRefBlock {
public VCFHeaderLine toVCFHeaderLine() {
return new VCFHeaderLine("GVCFBlock", "minGQ=" + getGQLowerBound() + "(inclusive),maxGQ=" + getGQUpperBound() + "(exclusive)");
}
+
+ /**
+ * Get the ploidy of this hom-ref block.
+ * @return the ploidy of this hom-ref block.
+ */
+ public int getPloidy() {
+ return ploidy;
+ }
}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java
index 2c17d4d5c..af93892a7 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java
@@ -48,6 +48,9 @@ package org.broadinstitute.gatk.utils.haplotype;
import com.google.java.contract.Requires;
import htsjdk.variant.variantcontext.VariantContext;
+import org.broadinstitute.gatk.genotyping.AlleleList;
+import org.broadinstitute.gatk.genotyping.AlleleListUtils;
+import org.broadinstitute.gatk.genotyping.SampleListUtils;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
@@ -77,7 +80,9 @@ public class HaplotypeLDCalculator {
@SuppressWarnings("unchecked")
protected HaplotypeLDCalculator() {
haplotypes = Collections.emptyList();
- readLikelihoods = new ReadLikelihoods<>((List)Collections.EMPTY_LIST, (List)Collections.EMPTY_LIST, Collections.EMPTY_MAP);
+ final AlleleList alleleList = AlleleListUtils.emptyList();
+ readLikelihoods = new ReadLikelihoods<>(SampleListUtils.emptyList(),
+ alleleList, Collections.EMPTY_MAP);
}
public HaplotypeLDCalculator(final List haplotypes, final ReadLikelihoods haplotypeReadMap) {
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java
index d17c12b30..51f1e04a2 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java
@@ -50,15 +50,17 @@ package org.broadinstitute.gatk.tools.walkers.genotyper;
// the imports for unit testing.
-import org.broadinstitute.gatk.utils.BaseTest;
-import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
-import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection;
-import org.broadinstitute.gatk.utils.MathUtils;
-import org.broadinstitute.gatk.utils.Utils;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
+import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
+import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection;
+import org.broadinstitute.gatk.genotyping.SampleList;
+import org.broadinstitute.gatk.genotyping.SampleListUtils;
+import org.broadinstitute.gatk.utils.BaseTest;
+import org.broadinstitute.gatk.utils.MathUtils;
+import org.broadinstitute.gatk.utils.Utils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
@@ -76,7 +78,7 @@ public class UnifiedGenotyperEngineUnitTest extends BaseTest {
engine.setArguments(new GATKArgumentCollection());
final UnifiedArgumentCollection args = new UnifiedArgumentCollection();
- final Set fakeSamples = Collections.singleton("fake");
+ final SampleList fakeSamples = SampleListUtils.singletonList("fake");
ugEngine = new UnifiedGenotypingEngine(engine, args,fakeSamples);
}
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java
index 66af674e9..d36b1af16 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java
@@ -47,6 +47,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import com.google.caliper.Param;
import com.google.caliper.SimpleBenchmark;
+import org.broadinstitute.gatk.genotyping.SampleListUtils;
import org.broadinstitute.gatk.utils.pairhmm.ActiveRegionTestDataSet;
import org.broadinstitute.gatk.utils.pairhmm.FastLoglessPairHMM;
import org.broadinstitute.gatk.utils.pairhmm.PairHMM;
@@ -112,7 +113,7 @@ public class HCLikelihoodCalculationEnginesBenchmark extends SimpleBenchmark {
public void timeGraphBasedLikelihoods(final int reps) {
for (int i = 0; i < reps; i++) {
final GraphBasedLikelihoodCalculationEngineInstance rtlce = new GraphBasedLikelihoodCalculationEngineInstance(dataSet.assemblyResultSet(), new FastLoglessPairHMM((byte)10),Double.NEGATIVE_INFINITY,HeterogeneousKmerSizeResolution.COMBO_MAX);
- rtlce.computeReadLikelihoods(dataSet.haplotypeList(), Collections.singletonList("anonymous"), Collections.singletonMap("anonymous", dataSet.readList()));
+ rtlce.computeReadLikelihoods(dataSet.haplotypeList(), SampleListUtils.singletonList("anonymous"), Collections.singletonMap("anonymous", dataSet.readList()));
}
}
@@ -121,7 +122,7 @@ public class HCLikelihoodCalculationEnginesBenchmark extends SimpleBenchmark {
for (int i = 0; i < reps; i++) {
final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine((byte) 10,
PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3, true, PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.NONE);
- engine.computeReadLikelihoods(dataSet.assemblyResultSet(), Collections.singletonList("anonymous"), Collections.singletonMap("anonymous", dataSet.readList()));
+ engine.computeReadLikelihoods(dataSet.assemblyResultSet(), SampleListUtils.singletonList("anonymous"), Collections.singletonMap("anonymous", dataSet.readList()));
}
}
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java
index db6923feb..cb946aafe 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java
@@ -47,6 +47,7 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import htsjdk.variant.variantcontext.Allele;
+import org.broadinstitute.gatk.genotyping.SampleListUtils;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.HaplotypeGraph;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
@@ -262,7 +263,7 @@ public class ReadThreadingLikelihoodCalculationEngineUnitTest extends ActiveRegi
dataSet = (ActiveRegionTestDataSet) params[0];
if (INTRODUCE_READ_ERRORS) dataSet.introduceErrors(new Random(13));
graphEngine = new GraphBasedLikelihoodCalculationEngineInstance(dataSet.assemblyResultSet(),hmm,Double.NEGATIVE_INFINITY, HeterogeneousKmerSizeResolution.COMBO_MAX);
- graphLks = graphEngine.computeReadLikelihoods(dataSet.haplotypeList(),Collections.singletonList("anonymous"),Collections.singletonMap("anonymous",dataSet.readList())).toPerReadAlleleLikelihoodMap(0);
+ graphLks = graphEngine.computeReadLikelihoods(dataSet.haplotypeList(), SampleListUtils.singletonList("anonymous"),Collections.singletonMap("anonymous",dataSet.readList())).toPerReadAlleleLikelihoodMap(0);
// clip reads at the anchors.
final Map clippedReads = anchorClippedReads(graphEngine.getHaplotypeGraph(),dataSet.readList());
@@ -272,7 +273,7 @@ public class ReadThreadingLikelihoodCalculationEngineUnitTest extends ActiveRegi
clippedReadList.add(clippedReads.containsKey(r) ? clippedReads.get(r) : r);
}
- loglessLks = fullPairHMM.computeReadLikelihoods(dataSet.assemblyResultSet(),Collections.singletonList("anonymous"),Collections.singletonMap("anonymous",clippedReadList)).toPerReadAlleleLikelihoodMap(0);
+ loglessLks = fullPairHMM.computeReadLikelihoods(dataSet.assemblyResultSet(),SampleListUtils.singletonList("anonymous"),Collections.singletonMap("anonymous",clippedReadList)).toPerReadAlleleLikelihoodMap(0);
// Change clipped by unclipped in the resulting likelihood map.
for (final GATKSAMRecord r : clippedReads.keySet()) {
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java
index 08edeee98..7c2cd8727 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java
@@ -47,11 +47,12 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import htsjdk.samtools.SAMFileHeader;
-import org.broadinstitute.gatk.utils.BaseTest;
-import org.broadinstitute.gatk.utils.GenomeLoc;
-import org.broadinstitute.gatk.utils.GenomeLocParser;
-import org.broadinstitute.gatk.utils.UnvalidatingGenomeLoc;
-import org.broadinstitute.gatk.utils.Utils;
+import htsjdk.variant.variantcontext.Genotype;
+import htsjdk.variant.variantcontext.GenotypeLikelihoods;
+import htsjdk.variant.variantcontext.GenotypeType;
+import htsjdk.variant.variantcontext.VariantContext;
+import org.broadinstitute.gatk.genotyping.*;
+import org.broadinstitute.gatk.utils.*;
import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
@@ -61,7 +62,7 @@ import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
-import htsjdk.variant.variantcontext.*;
+import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.BeforeMethod;
@@ -75,7 +76,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest {
final String RGID = "ID1";
GATKSAMReadGroupRecord rg;
final String sample = "NA12878";
- final Set samples = Collections.singleton(sample);
+ final SampleList samples = SampleListUtils.singletonList(sample);
SAMFileHeader header;
ReferenceConfidenceModel model;
@@ -179,12 +180,12 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest {
@Test
public void testIndelLikelihoods() {
- GenotypeLikelihoods prev = model.getIndelPLs(0);
+ GenotypeLikelihoods prev = model.getIndelPLs(HomoSapiensConstants.DEFAULT_PLOIDY,0);
Assert.assertEquals(prev.getAsPLs(), new int[]{0, 0, 0});
Assert.assertEquals(-10 * prev.getLog10GQ(GenotypeType.HOM_REF), 0.0);
for ( int i = 1; i <= ReferenceConfidenceModel.MAX_N_INDEL_INFORMATIVE_READS; i++ ) {
- final GenotypeLikelihoods current = model.getIndelPLs(i);
+ final GenotypeLikelihoods current = model.getIndelPLs(HomoSapiensConstants.DEFAULT_PLOIDY,i);
final double prevGQ = -10 * prev.getLog10GQ(GenotypeType.HOM_REF);
final double currGQ = -10 * current.getLog10GQ(GenotypeType.HOM_REF);
Assert.assertTrue(prevGQ < currGQ, "GQ Failed with prev " + prev + " curr " + current + " at " + i);
@@ -288,10 +289,12 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest {
data.getActiveRegion().add(data.makeRead(0, data.getRefLength()));
}
- final ReadLikelihoods likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion());
+ final ReadLikelihoods likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), samples, data.getActiveRegion());
+ final PloidyModel ploidyModel = new HomogeneousPloidyModel(samples,2);
+ final GenotypingModel genotypingModel = new InfiniteRandomMatingPopulationModel();
final List expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads);
- final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls);
+ final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, ploidyModel, genotypingModel, calls);
checkReferenceModelResult(data, contexts, expectedDPs, calls);
}
@@ -304,12 +307,14 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest {
final List haplotypes = Arrays.asList(data.getRefHap());
final List calls = Collections.emptyList();
+ final PloidyModel ploidyModel = new HomogeneousPloidyModel(samples,2);
+ final GenotypingModel genotypingModel = new InfiniteRandomMatingPopulationModel();
data.getActiveRegion().add(data.makeRead(start, readLen));
- final ReadLikelihoods likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion());
+ final ReadLikelihoods likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), samples, data.getActiveRegion());
final List expectedDPs = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), 0));
for ( int i = start; i < readLen + start; i++ ) expectedDPs.set(i, 1);
- final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls);
+ final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, ploidyModel, genotypingModel, calls);
checkReferenceModelResult(data, contexts, expectedDPs, calls);
}
}
@@ -340,10 +345,12 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest {
data.getActiveRegion().add(data.makeRead(0, data.getRefLength()));
}
- final ReadLikelihoods likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion());
+ final ReadLikelihoods likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), samples, data.getActiveRegion());
+ final PloidyModel ploidyModel = new HomogeneousPloidyModel(samples,HomoSapiensConstants.DEFAULT_PLOIDY);
+ final GenotypingModel genotypingModel = new InfiniteRandomMatingPopulationModel();
final List expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads);
- final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls);
+ final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, ploidyModel, genotypingModel, calls);
checkReferenceModelResult(data, contexts, expectedDPs, calls);
}
}
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java
index 61a6860fd..33e1a4758 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java
@@ -48,6 +48,8 @@ package org.broadinstitute.gatk.utils.genotyper;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
+import org.broadinstitute.gatk.genotyping.IndexedAlleleList;
+import org.broadinstitute.gatk.genotyping.IndexedSampleList;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils;
@@ -72,7 +74,7 @@ public class ReadLikelihoodsUnitTest
@Test(dataProvider = "dataSets")
public void testInstantiationAndQuery(final String[] samples, final Allele[] alleles, final Map> reads) {
- final ReadLikelihoods result = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
+ final ReadLikelihoods result = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads);
Assert.assertEquals(result.sampleCount(), samples.length);
Assert.assertEquals(result.alleleCount(), alleles.length);
@@ -85,7 +87,7 @@ public class ReadLikelihoodsUnitTest
@Test(dataProvider = "dataSets")
public void testLikelihoodFillingAndQuery(final String[] samples, final Allele[] alleles, final Map> reads) {
- final ReadLikelihoods result = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
+ final ReadLikelihoods result = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads);
final double[][][] likelihoods = fillWithRandomLikelihoods(samples, alleles, result);
testLikelihoodMatrixQueries(samples, result, likelihoods);
}
@@ -106,7 +108,7 @@ public class ReadLikelihoodsUnitTest
@Test(dataProvider = "dataSets")
public void testBestAlleles(final String[] samples, final Allele[] alleles, final Map> reads) {
- final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
+ final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads);
fillWithRandomLikelihoods(samples,alleles,original);
final int alleleCount = alleles.length;
for (int s = 0; s < samples.length; s++) {
@@ -146,7 +148,7 @@ public class ReadLikelihoodsUnitTest
@Test(dataProvider = "dataSets")
public void testBestAlleleMap(final String[] samples, final Allele[] alleles, final Map> reads) {
- final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
+ final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads);
fillWithRandomLikelihoods(samples,alleles,original);
final Map> expected = new HashMap<>(alleles.length);
for (final Allele allele : alleles)
@@ -171,7 +173,7 @@ public class ReadLikelihoodsUnitTest
}
}
if ((bestAlleleLk - secondBestAlleleLk) > ReadLikelihoods.BestAllele.INFORMATIVE_THRESHOLD)
- expected.get(alleles[bestAlleleIndex]).add(sampleMatrix.read(r));
+ expected.get(alleles[bestAlleleIndex]).add(sampleMatrix.readAt(r));
}
}
@@ -189,7 +191,7 @@ public class ReadLikelihoodsUnitTest
@Test(dataProvider = "dataSets")
public void testFilterPoorlyModeledReads(final String[] samples, final Allele[] alleles, final Map> reads) {
- final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
+ final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads);
for (int s = 0; s < samples.length; s++) {
final int sampleReadCount = original.sampleReadCount(s);
@@ -220,7 +222,7 @@ public class ReadLikelihoodsUnitTest
@Test(dataProvider = "dataSets")
public void testFilterReadsToOverlap(final String[] samples, final Allele[] alleles, final Map> reads) {
- final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
+ final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads);
final GenomeLoc evenReadOverlap = locParser.createGenomeLoc(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(),EVEN_READ_START ,EVEN_READ_START );
fillWithRandomLikelihoods(samples,alleles,original);
final ReadLikelihoods result = original.clone();
@@ -231,7 +233,7 @@ public class ReadLikelihoodsUnitTest
newLikelihoods[s][a] = new double[(original.sampleReadCount(s) + 1) / 2];
final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s);
for (int r = 0; r < newLikelihoods[s][a].length; r++) {
- Assert.assertEquals(result.readIndex(s,sampleMatrix.read(r << 1)),r);
+ Assert.assertEquals(result.readIndex(s,sampleMatrix.readAt(r << 1)),r);
newLikelihoods[s][a][r] = sampleMatrix.get(a, r << 1);
}
}
@@ -240,14 +242,14 @@ public class ReadLikelihoodsUnitTest
@Test(dataProvider = "marginalizationDataSets")
public void testMarginalizationWithOverlap(final String[] samples, final Allele[] alleles, final Map> reads, final Map> newToOldAlleleMapping) {
- final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
+ final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads);
final GenomeLoc evenReadOverlap = locParser.createGenomeLoc(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(),EVEN_READ_START ,EVEN_READ_START );
fillWithRandomLikelihoods(samples, alleles, original);
final ReadLikelihoods marginalized = original.marginalize(newToOldAlleleMapping,evenReadOverlap);
Assert.assertNotNull(marginalized);
Assert.assertEquals(newToOldAlleleMapping.size(),marginalized.alleleCount());
for (int a = 0; a < marginalized.alleleCount(); a++) {
- final List oldAlleles = newToOldAlleleMapping.get(marginalized.allele(a));
+ final List oldAlleles = newToOldAlleleMapping.get(marginalized.alleleAt(a));
Assert.assertNotNull(oldAlleles);
for (int s = 0; s < samples.length; s++) {
final ReadLikelihoods.Matrix oldSmapleLikelihoods = original.sampleMatrix(s);
@@ -268,13 +270,13 @@ public class ReadLikelihoodsUnitTest
@Test(dataProvider = "marginalizationDataSets")
public void testMarginalization(final String[] samples, final Allele[] alleles, final Map> reads, final Map> newToOldAlleleMapping) {
- final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
+ final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads);
fillWithRandomLikelihoods(samples, alleles, original);
final ReadLikelihoods marginalized = original.marginalize(newToOldAlleleMapping);
Assert.assertNotNull(marginalized);
Assert.assertEquals(newToOldAlleleMapping.size(),marginalized.alleleCount());
for (int a = 0; a < marginalized.alleleCount(); a++) {
- final List oldAlleles = newToOldAlleleMapping.get(marginalized.allele(a));
+ final List oldAlleles = newToOldAlleleMapping.get(marginalized.alleleAt(a));
Assert.assertNotNull(oldAlleles);
for (int s = 0; s < samples.length; s++) {
final ReadLikelihoods.Matrix oldSmapleLikelihoods = original.sampleMatrix(s);
@@ -295,7 +297,7 @@ public class ReadLikelihoodsUnitTest
@Test(dataProvider = "dataSets")
public void testNormalizeBestToZero(final String[] samples, final Allele[] alleles, final Map> reads) {
- final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
+ final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads);
final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original);
final ReadLikelihoods result= original.clone();
result.normalizeLikelihoods(true, Double.NEGATIVE_INFINITY);
@@ -321,7 +323,7 @@ public class ReadLikelihoodsUnitTest
@Test(dataProvider = "dataSets")
public void testNormalizeCapWorstLK(final String[] samples, final Allele[] alleles, final Map> reads) {
- final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
+ final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads);
final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original);
final ReadLikelihoods result= original.clone();
result.normalizeLikelihoods(false, - 0.001);
@@ -354,7 +356,7 @@ public class ReadLikelihoodsUnitTest
@Test(dataProvider = "dataSets")
public void testNormalizeCapWorstLKAndBestToZero(final String[] samples, final Allele[] alleles, final Map> reads) {
- final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
+ final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads);
final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original);
final ReadLikelihoods result= original.clone();
result.normalizeLikelihoods(true, - 0.001);
@@ -390,7 +392,7 @@ public class ReadLikelihoodsUnitTest
@Test(dataProvider = "dataSets")
public void testAddMissingAlleles(final String[] samples, final Allele[] alleles, final Map> reads) {
- final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
+ final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads);
final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original);
final ReadLikelihoods result = original.clone();
@@ -411,8 +413,8 @@ public class ReadLikelihoodsUnitTest
Assert.assertEquals(original.alleleCount() + 1, result.alleleCount());
// We add too more amongst exisisting alleles:
- result.addMissingAlleles(Arrays.asList(newTwo = Allele.create("ATATATTATATTAATATT".getBytes(), false),result.allele(1),
- result.allele(0),newThree = Allele.create("TGTGTGTATTG".getBytes(),false),Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-6.54321);
+ result.addMissingAlleles(Arrays.asList(newTwo = Allele.create("ATATATTATATTAATATT".getBytes(), false),result.alleleAt(1),
+ result.alleleAt(0),newThree = Allele.create("TGTGTGTATTG".getBytes(),false),Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-6.54321);
Assert.assertEquals(original.alleleCount()+3,result.alleleCount());
@@ -439,7 +441,7 @@ public class ReadLikelihoodsUnitTest
@Test(dataProvider = "dataSets")
public void testAddNonRefAllele(final String[] samples, final Allele[] alleles, final Map> reads) {
- final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
+ final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads);
final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original);
final ReadLikelihoods result = original.clone();
result.addNonReferenceAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
@@ -473,13 +475,13 @@ public class ReadLikelihoodsUnitTest
private void testLikelihoodMatrixQueries(String[] samples, ReadLikelihoods result, final double[][][] likelihoods) {
for (final String sample : samples) {
final int sampleIndex = result.sampleIndex(sample);
- final double[][] likelihoodMatrix = result.sampleValues(sampleIndex);
final int sampleReadCount = result.sampleReadCount(sampleIndex);
- Assert.assertEquals(result.alleleCount(), likelihoodMatrix.length);
- for (int a = 0; a < likelihoodMatrix.length; a++) {
- Assert.assertEquals(likelihoodMatrix[a].length,sampleReadCount);
+ final int alleleCount = result.alleleCount();
+ Assert.assertEquals(result.alleleCount(), alleleCount);
+ for (int a = 0; a < alleleCount; a++) {
+ Assert.assertEquals(result.sampleReadCount(sampleIndex),sampleReadCount);
for (int r = 0; r < sampleReadCount; r++)
- Assert.assertEquals(likelihoodMatrix[a][r],
+ Assert.assertEquals(result.sampleMatrix(sampleIndex).get(a,r),
likelihoods == null ? 0.0 : likelihoods[sampleIndex][a][r], EPSILON);
}
}
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java
index 3228e87d6..1f0280c82 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java
@@ -46,14 +46,14 @@
package org.broadinstitute.gatk.utils.gvcf;
-import org.broadinstitute.gatk.utils.BaseTest;
-import org.broadinstitute.gatk.tools.walkers.haplotypecaller.ReferenceConfidenceModel;
-import org.broadinstitute.gatk.utils.Utils;
-import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
+import org.broadinstitute.gatk.utils.BaseTest;
+import org.broadinstitute.gatk.utils.Utils;
+import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
+import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants;
import org.testng.Assert;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.DataProvider;
@@ -100,21 +100,21 @@ public class GVCFWriterUnitTest extends BaseTest {
@Test
public void testHeaderWriting() {
- final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition);
+ final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY);
writer.writeHeader(new VCFHeader());
Assert.assertTrue(mockWriter.headerWritten);
}
@Test
public void testClose() {
- final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition);
+ final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY);
writer.close();
Assert.assertTrue(mockWriter.closed);
}
@Test
public void testCloseWithoutClosingUnderlyingWriter() {
- final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition);
+ final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY);
writer.close(false);
Assert.assertFalse(mockWriter.closed);
}
@@ -164,7 +164,7 @@ public class GVCFWriterUnitTest extends BaseTest {
@Test
public void testCloseEmitsLastVariant() {
- final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition);
+ final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY);
writer.add(makeHomRef("20", 1, 30));
Assert.assertEquals(mockWriter.emitted.size(), 0);
@@ -176,7 +176,7 @@ public class GVCFWriterUnitTest extends BaseTest {
@Test
public void testCloseDoesntEmitsLastVariantWhenNonRef() {
- final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition);
+ final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY);
writer.add(makeNonRef("20", 1, 30));
Assert.assertEquals(mockWriter.emitted.size(), 1);
@@ -188,7 +188,7 @@ public class GVCFWriterUnitTest extends BaseTest {
@Test
public void testCrossingContigBoundaryRef() {
- final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition);
+ final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY);
writer.add(makeHomRef("20", 1, 30));
writer.add(makeHomRef("20", 2, 30));
@@ -204,7 +204,7 @@ public class GVCFWriterUnitTest extends BaseTest {
@Test
public void testCrossingContigBoundaryToLowerPositionsRef() {
- final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition);
+ final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY);
writer.add(makeHomRef("20", 30, 30));
writer.add(makeHomRef("20", 31, 30));
@@ -220,7 +220,7 @@ public class GVCFWriterUnitTest extends BaseTest {
@Test
public void testCrossingContigBoundaryFromNonRefToLowerPositionsRef() {
- final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition);
+ final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY);
writer.add(makeNonRef("20", 20, 30));
Assert.assertEquals(mockWriter.emitted.size(), 1);
@@ -235,7 +235,7 @@ public class GVCFWriterUnitTest extends BaseTest {
@Test
public void testCrossingContigBoundaryNonRef() {
- final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition);
+ final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY);
writer.add(makeHomRef("20", 1, 30));
writer.add(makeHomRef("20", 2, 30));
@@ -248,7 +248,7 @@ public class GVCFWriterUnitTest extends BaseTest {
@Test
public void testCrossingContigBoundaryNonRefThenNonRef() {
- final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition);
+ final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY);
writer.add(makeNonRef("20", 1, 30));
Assert.assertEquals(mockWriter.emitted.size(), 1);
@@ -283,7 +283,7 @@ public class GVCFWriterUnitTest extends BaseTest {
@Test
public void testVariantForcesNonRef() {
- final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition);
+ final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY);
writer.add(makeHomRef("20", 1, 30));
writer.add(makeHomRef("20", 2, 30));
@@ -300,7 +300,7 @@ public class GVCFWriterUnitTest extends BaseTest {
@Test
public void testEmittingTwoBands() {
- final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition);
+ final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY);
writer.add(makeHomRef("20", 1, 0));
writer.add(makeHomRef("20", 2, 0));
@@ -315,7 +315,7 @@ public class GVCFWriterUnitTest extends BaseTest {
@Test
public void testNonContiguousBlocks() {
- final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition);
+ final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY);
writer.add(makeHomRef("20", 1, 0));
writer.add(makeHomRef("20", 2, 0));
@@ -329,7 +329,7 @@ public class GVCFWriterUnitTest extends BaseTest {
@Test
public void testDeletion() {
- final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition);
+ final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, HomoSapiensConstants.DEFAULT_PLOIDY);
writer.add(makeHomRef("20", 1, 0));
writer.add(makeHomRef("20", 2, 0));
@@ -347,7 +347,7 @@ public class GVCFWriterUnitTest extends BaseTest {
@Test
public void testHomRefAlt() {
- final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition);
+ final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition, 2);
writer.add(makeHomRef("20", 1, 0));
writer.add(makeHomRef("20", 2, 0));
@@ -383,7 +383,7 @@ public class GVCFWriterUnitTest extends BaseTest {
@Test(dataProvider = "BandPartitionData")
public void testMyData(final List<Integer> partitions, final boolean expectedGood) {
try {
- GVCFWriter.parsePartitions(partitions);
+ GVCFWriter.parsePartitions(partitions,2);
Assert.assertTrue(expectedGood, "Expected to fail but didn't");
} catch ( Exception e ) {
Assert.assertTrue(! expectedGood, "Expected to succeed but failed with message " + e.getMessage());
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlockUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlockUnitTest.java
index 36d3aa09e..00d5d6984 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlockUnitTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlockUnitTest.java
@@ -46,11 +46,11 @@
package org.broadinstitute.gatk.utils.gvcf;
-import org.broadinstitute.gatk.utils.BaseTest;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
+import org.broadinstitute.gatk.utils.BaseTest;
import org.testng.Assert;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.DataProvider;
@@ -58,7 +58,6 @@ import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Collections;
import java.util.List;
public class HomRefBlockUnitTest extends BaseTest {
@@ -71,7 +70,7 @@ public class HomRefBlockUnitTest extends BaseTest {
@Test
public void testBasicConstruction() {
- final HomRefBlock band = new HomRefBlock(vc, 10, 20);
+ final HomRefBlock band = new HomRefBlock(vc, 10, 20, 2);
Assert.assertSame(band.getStartingVC(), vc);
Assert.assertEquals(band.getRef(), vc.getReference());
Assert.assertEquals(band.getGQLowerBound(), 10);
@@ -86,7 +85,7 @@ public class HomRefBlockUnitTest extends BaseTest {
@Test
public void testMinMedian() {
//TODO - might be better to make this test use a data provider?
- final HomRefBlock band = new HomRefBlock(vc, 10, 20);
+ final HomRefBlock band = new HomRefBlock(vc, 10, 20,2);
final GenotypeBuilder gb = new GenotypeBuilder("NA12878");
int pos = vc.getStart();
@@ -117,7 +116,7 @@ public class HomRefBlockUnitTest extends BaseTest {
@Test
public void testBigGQIsCapped() {
- final HomRefBlock band = new HomRefBlock(vc, 10, 20);
+ final HomRefBlock band = new HomRefBlock(vc, 10, 20,2);
final GenotypeBuilder gb = new GenotypeBuilder("NA12878");
band.add(vc.getStart(), gb.DP(1000).GQ(1000).PL(new int[]{0,10,100}).make());
@@ -126,7 +125,7 @@ public class HomRefBlockUnitTest extends BaseTest {
@Test(expectedExceptions = IllegalArgumentException.class)
public void testBadAdd() {
- final HomRefBlock band = new HomRefBlock(vc, 10, 20);
+ final HomRefBlock band = new HomRefBlock(vc, 10, 20,2);
final GenotypeBuilder gb = new GenotypeBuilder("NA12878");
band.add(vc.getStart() + 10, gb.DP(10).GQ(11).PL(new int[]{0,10,100}).make());
@@ -156,7 +155,7 @@ public class HomRefBlockUnitTest extends BaseTest {
@Test(dataProvider = "ContiguousData")
public void testIsContiguous(final String contig, final int pos, final boolean expected) {
- final HomRefBlock band = new HomRefBlock(vc, 10, 20);
+ final HomRefBlock band = new HomRefBlock(vc, 10, 20,2);
final VariantContext testVC = new VariantContextBuilder(vc).chr(contig).start(pos).stop(pos).make();
Assert.assertEquals(band.isContiguous(testVC), expected);
}
diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java
index 8e5ae61fb..feb41712b 100644
--- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java
+++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java
@@ -26,14 +26,13 @@
package org.broadinstitute.gatk.engine;
import com.google.java.contract.Ensures;
-import htsjdk.samtools.reference.IndexedFastaSequenceFile;
-import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
+import htsjdk.samtools.reference.IndexedFastaSequenceFile;
+import htsjdk.samtools.reference.ReferenceSequenceFile;
+import htsjdk.variant.vcf.VCFConstants;
import org.apache.log4j.Logger;
-import org.broadinstitute.gatk.engine.walkers.*;
-import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection;
import org.broadinstitute.gatk.engine.arguments.ValidationExclusion;
import org.broadinstitute.gatk.engine.datasources.reads.*;
@@ -55,8 +54,12 @@ import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet;
import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation;
import org.broadinstitute.gatk.engine.samples.SampleDB;
import org.broadinstitute.gatk.engine.samples.SampleDBBuilder;
+import org.broadinstitute.gatk.engine.walkers.*;
+import org.broadinstitute.gatk.genotyping.IndexedSampleList;
+import org.broadinstitute.gatk.genotyping.SampleList;
import org.broadinstitute.gatk.utils.*;
import org.broadinstitute.gatk.utils.classloader.PluginManager;
+import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.interval.IntervalUtils;
@@ -64,7 +67,6 @@ import org.broadinstitute.gatk.utils.progressmeter.ProgressMeter;
import org.broadinstitute.gatk.utils.recalibration.BQSRArgumentSet;
import org.broadinstitute.gatk.utils.text.XReadLines;
import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor;
-import htsjdk.variant.vcf.VCFConstants;
import java.io.File;
import java.io.FileNotFoundException;
@@ -1138,7 +1140,7 @@ public class GenomeAnalysisEngine {
* Returns data source objects encapsulating all rod data;
* individual rods can be accessed through the returned data source objects.
*
- * @return the rods data sources
+ * @return the rods data sources, never {@code null}.
*/
public List<ReferenceOrderedDataSource> getRodDataSources() {
return this.rodDataSources;
@@ -1254,4 +1256,20 @@ public class GenomeAnalysisEngine {
runtimeLimitInNanoseconds = TimeUnit.NANOSECONDS.convert(args.maxRuntime, args.maxRuntimeUnits);
}
}
+
+ /**
+ * Returns the sample list including all samples.
+ * @return never {@code null}.
+ */
+ public SampleList getSampleList() {
+ return new IndexedSampleList(getSampleDB().getSampleNames());
+ }
+
+ /**
+ * Returns the sample list including samples in read inputs.
+ * @return never {@code null}.
+ */
+ public SampleList getReadSampleList() {
+ return new IndexedSampleList(SampleUtils.getSAMFileSamples(getSAMFileHeader()));
+ }
}
diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/genotyping/AlleleListUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/genotyping/AlleleListUtils.java
index 380a20379..dfdfe0945 100644
--- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/genotyping/AlleleListUtils.java
+++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/genotyping/AlleleListUtils.java
@@ -37,6 +37,24 @@ import java.util.List;
*/
public class AlleleListUtils {
+ @SuppressWarnings("unchecked")
+ private static final AlleleList EMPTY_LIST = new AlleleList() {
+ @Override
+ public int alleleCount() {
+ return 0;
+ }
+
+ @Override
+ public int alleleIndex(final Allele allele) {
+ return -1;
+ }
+
+ @Override
+ public Allele alleleAt(final int index) {
+ throw new IllegalArgumentException("allele index is out of range");
+ }
+ };
+
/**
* Checks whether two allele lists are in fact the same.
* @param first one list to compare.
@@ -107,6 +125,16 @@ public class AlleleListUtils {
return new AsList(list);
}
+ /**
+ * Returns an unmodifiable empty allele-list.
+ * @param <A> the allele class.
+ * @return never {@code null}.
+ */
+ @SuppressWarnings("unchecked")
+ public static <A extends Allele> AlleleList<A> emptyList() {
+ return (AlleleList<A>) EMPTY_LIST;
+ }
+
/**
* Simple list view of a sample-list.
*/
diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/genotyping/SampleListUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/genotyping/SampleListUtils.java
index e151d03da..f3787568a 100644
--- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/genotyping/SampleListUtils.java
+++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/genotyping/SampleListUtils.java
@@ -34,6 +34,33 @@ import java.util.*;
*/
public class SampleListUtils {
+ private static final SampleList EMPTY_LIST = new SampleList() {
+
+ @Override
+ public int sampleCount() {
+ return 0;
+ }
+
+ @Override
+ public int sampleIndex(String sample) {
+ return -1;
+ }
+
+ @Override
+ public String sampleAt(final int sampleIndex) {
+ throw new IllegalArgumentException("index is out of valid range");
+ }
+ };
+
+ /**
+ * Returns an unmodifiable empty sample-list.
+ *
+ * @return never {@code null}
+ */
+ public static SampleList emptyList() {
+ return EMPTY_LIST;
+ }
+
/**
* Checks whether two sample lists are in fact the same.
* @param first one list to compare.
diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/SampleUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/SampleUtils.java
index ccb573414..77fc17083 100644
--- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/SampleUtils.java
+++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/SampleUtils.java
@@ -60,9 +60,9 @@ public class SampleUtils {
* @param header the sam file header
* @return list of strings representing the sample names
*/
- public static Set<String> getSAMFileSamples(SAMFileHeader header) {
+ public static Set<String> getSAMFileSamples(final SAMFileHeader header) {
// get all of the unique sample names
- Set<String> samples = new TreeSet<String>();
+ final Set<String> samples = new TreeSet<String>();
List<SAMReadGroupRecord> readGroups = header.getReadGroups();
for ( SAMReadGroupRecord readGroup : readGroups )
samples.add(readGroup.getSample());
diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java
index 97cbebd09..2fdf657cc 100644
--- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java
+++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java
@@ -85,6 +85,24 @@ public class GATKVariantContextUtils {
return true;
}
+ /**
+ * Returns a homozygous call allele list given the only allele and the ploidy.
+ *
+ * @param allele the only allele in the allele list.
+ * @param ploidy the ploidy of the resulting allele list.
+ *
+ * @throws IllegalArgumentException if {@code allele} is {@code null} or ploidy is negative.
+ *
+ * @return never {@code null}.
+ */
+ public static List<Allele> homozygousAlleleList(final Allele allele, final int ploidy) {
+ if (allele == null || ploidy < 0)
+ throw new IllegalArgumentException();
+
+ // Use a tailored inner class to implement the list:
+ return Collections.nCopies(ploidy,allele);
+ }
+
public enum GenotypeMergeType {
/**
* Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD.