From 427645162bcf4c770c92bae0ed4d8add4b54ba0c Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Mon, 13 Jun 2016 11:39:46 -0400 Subject: [PATCH] SelectVariants works with non-diploids --- .../afcalc/DiploidExactAFCalculator.java | 6 +- .../variantutils/FamilyLikelihoodsUtils.java | 2 +- .../PosteriorLikelihoodsUtils.java | 2 +- .../SelectVariantsIntegrationTest.java | 49 ++- ...SelectVariantsParallelIntegrationTest.java | 2 +- .../VCFStreamingIntegrationTest.java | 2 +- .../walkers/variantutils/SelectVariants.java | 27 +- .../variantutils/VCFIntegrationTest.java | 6 +- .../variant/GATKVariantContextUtils.java | 166 +++++----- .../GATKVariantContextUtilsUnitTest.java | 302 +++++++++++++----- 10 files changed, 372 insertions(+), 192 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/DiploidExactAFCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/DiploidExactAFCalculator.java index 49f1554ef..a3c7eba2f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/DiploidExactAFCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/DiploidExactAFCalculator.java @@ -110,7 +110,7 @@ public abstract class DiploidExactAFCalculator extends ExactAFCalculator { @Override protected GenotypesContext reduceScopeGenotypes(final VariantContext vc, final int defaultPloidy, final List allelesToUse) { - return GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL); + return GATKVariantContextUtils.subsetAlleles(vc, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL); } @Override @@ -346,8 +346,8 @@ public abstract class DiploidExactAFCalculator extends ExactAFCalculator { if (defaultPloidy != 2) throw new IllegalArgumentException("cannot support ploidy different than 2 and the default ploidy is " + defaultPloidy); return allelesToUse.size() == 1 - ? GATKVariantContextUtils.subsetToRefOnly(vc, 2) - : GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, + ? GATKVariantContextUtils.subsetToRefOnly(vc, defaultPloidy) + : GATKVariantContextUtils.subsetAlleles(vc, allelesToUse, assignGenotypes ? GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN : GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL); } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/FamilyLikelihoodsUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/FamilyLikelihoodsUtils.java index c451c4a1a..9049e5c79 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/FamilyLikelihoodsUtils.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/FamilyLikelihoodsUtils.java @@ -165,7 +165,7 @@ public class FamilyLikelihoodsUtils { //final double[] log10Posteriors = MathUtils.toLog10(normalizedPosteriors); //update genotype types based on posteriors - GATKVariantContextUtils.updateGenotypeAfterSubsetting(vc.getAlleles(), builder, + GATKVariantContextUtils.updateGenotypeAfterSubsetting(vc.getAlleles(), genotype.getPloidy(), builder, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, log10Posteriors, vc.getAlleles()); builder.attribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY, diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/PosteriorLikelihoodsUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/PosteriorLikelihoodsUtils.java index 2dd92578c..feb0e4650 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/PosteriorLikelihoodsUtils.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/PosteriorLikelihoodsUtils.java @@ -153,7 +153,7 @@ public class PosteriorLikelihoodsUtils { final GenotypeBuilder builder = new GenotypeBuilder(vc1.getGenotype(genoIdx)); builder.phased(vc1.getGenotype(genoIdx).isPhased()); if ( posteriors.get(genoIdx) != null ) { - GATKVariantContextUtils.updateGenotypeAfterSubsetting(vc1.getAlleles(), builder, + GATKVariantContextUtils.updateGenotypeAfterSubsetting(vc1.getAlleles(), vc1.getMaxPloidy(2), builder, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, posteriors.get(genoIdx), vc1.getAlleles()); builder.attribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY, Utils.listFromPrimitives(GenotypeLikelihoods.fromLog10Likelihoods(posteriors.get(genoIdx)).getAsPLs())); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java index b434d97d3..6edd72240 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -304,7 +304,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --minIndelSize 2", 1, - Arrays.asList("ed9dc00d0551630a2eed9e81a2a357d3") + Arrays.asList("ad0965edb1dbd30060afd21ba9f11bf7") ); executeTest("testMinIndelLengthSelection--" + testFile, spec); @@ -395,7 +395,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b37KGReference + " --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("c78a65b41edbdd386211042e8f65220b") + Arrays.asList("1fc77d7f47e75a24222a358c69de7f3d") ); executeTest("testNoGTs--" + testFile, spec); @@ -606,7 +606,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -IDs " + idFile + " --variant " + testFile), 1, - Arrays.asList("c6632b63617162455f02670174a2322a") + Arrays.asList("da1117cba380345c622a6d8e52c2270b") ); spec.disableShadowBCF(); executeTest("testKeepSelectionID--" + testFile, spec); @@ -641,7 +641,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -xlSelectType SNP --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("ed9dc00d0551630a2eed9e81a2a357d3") + Arrays.asList("ad0965edb1dbd30060afd21ba9f11bf7") ); executeTest("testExcludeSelectionType--" + testFile, spec); @@ -782,7 +782,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header -sn NA12891 -trimAlternates", 1, - ReviewedGATKException.class); + Arrays.asList("7880f8a1dfae1804998b6a994574e734")); spec.disableShadowBCF(); executeTest("testSACNonDiploid", spec); } @@ -838,4 +838,43 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testMaxNoCall0_5", spec); } + + @Test + public void testHaploid() { + final String testfile = privateTestDir + "haploid-multisample.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header -sn HG00610 -select 'DP > 7'", + 1, + Arrays.asList("bc6caa81836f4c94a1216babd0c1ac72")); + spec.disableShadowBCF(); + + executeTest("testHaploid", spec); + } + + @Test + public void testTetraploid() { + final String testfile = privateTestDir + "tetraploid-multisample.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header -sn NA18486 -select 'DP > 19'", + 1, + Arrays.asList("4fcfa5e0ba5d39ca9f0593aa0c0f7a63")); + spec.disableShadowBCF(); + + executeTest("testTetraploid", spec); + } + + @Test + public void testTetraDiploid() { + final String testfile = privateTestDir + "tetra-diploid.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header -sn NA12878 -select 'DP > 48' -trimAlternates", + 1, + Arrays.asList("709782f7a07cd500d41370e6a275fcdf")); + spec.disableShadowBCF(); + + executeTest("testTetraDiploid", spec); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsParallelIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsParallelIntegrationTest.java index 45677ffee..fb2c68f1a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsParallelIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsParallelIntegrationTest.java @@ -95,7 +95,7 @@ public class SelectVariantsParallelIntegrationTest extends WalkerTest { { // new tests on b37 using testdir VCF final String testfile = privateTestDir + "NA12878.hg19.example1.vcf"; final String args = "-select 'DP > 30' -V " + testfile; - new ParallelSelectTestProvider(b37KGReference, args, "64f9258e9e3024b6361abbeeeefafee9", nt); + new ParallelSelectTestProvider(b37KGReference, args, "51645037428729c3a9fa0e25fc2104ad", nt); } { // AD and PL decoding race condition final String testfile = privateTestDir + "race_condition.vcf"; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VCFStreamingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VCFStreamingIntegrationTest.java index 0aa2aedcc..c2cd539d6 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -87,7 +87,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " --no_cmdline_in_header " + " -o %s", 1, - Arrays.asList("f9f6418698f967ba7ca451ac1fb4cc8d") + Arrays.asList("94057d7a98c1af0a7490540ea1d9b247") ); executeTest("testSimpleVCFStreaming", spec); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java index f177b7393..899346924 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java @@ -626,7 +626,8 @@ public class SelectVariants extends RodWalker implements TreeR private Set IDsToRemove = null; private Map vcfRods; - private final List diploidNoCallAlleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + private final List diploidNoCallAlleles = GATKVariantContextUtils.noCallAlleles(2); + private final Map ploidyToNumberOfAlleles = new HashMap(); /** * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher @@ -844,12 +845,24 @@ public class SelectVariants extends RodWalker implements TreeR continue; } - if (considerNoCallGenotypes()) { + if (considerNoCallGenotypes()) { final int numNoCallSamples = numNoCallGenotypes(vc); final double fractionNoCallGenotypes = samples.isEmpty() ? 0.0 : ((double) numNoCallSamples) / samples.size(); if (numNoCallSamples > maxNOCALLnumber || fractionNoCallGenotypes > maxNOCALLfraction) continue; - } + } + + // Initialize cache of PL index to a list of alleles for any ploidy + if (vc.getType() != VariantContext.Type.NO_VARIATION) { + for (final Genotype g : vc.getGenotypes()) { + if (g.getPloidy() != 0) { + if (!ploidyToNumberOfAlleles.containsKey(g.getPloidy()) || ploidyToNumberOfAlleles.get(g.getPloidy()) < vc.getNAlleles()) { + GenotypeLikelihoods.initializeAnyploidPLIndexToAlleleIndices(vc.getNAlleles() - 1, g.getPloidy()); + ploidyToNumberOfAlleles.put(g.getPloidy(), vc.getNAlleles()); + } + } + } + } VariantContext sub = subsetRecord(vc, preserveAlleles, removeUnusedAlternates); @@ -1089,7 +1102,9 @@ public class SelectVariants extends RodWalker implements TreeR for ( Genotype genotype : newGC ) { //Set genotype to no call if it falls in the fraction. if(fractionGenotypes>0 && randomGenotypes.nextDouble() noCallAlleles = (genotype.getPloidy() == 2 ? diploidNoCallAlleles : + GATKVariantContextUtils.noCallAlleles(genotype.getPloidy())); + genotypes.add(new GenotypeBuilder(genotype).alleles(noCallAlleles).noGQ().make()); } else{ genotypes.add(genotype); @@ -1137,7 +1152,9 @@ public class SelectVariants extends RodWalker implements TreeR for ( final Genotype g : vc.getGenotypes() ) { if ( g.isCalled() && g.isFiltered() ) { haveFilteredNoCallAlleles = true; - genotypes.add(new GenotypeBuilder(g).alleles(diploidNoCallAlleles).make()); + final List noCallAlleles = (g.getPloidy() == 2 ? diploidNoCallAlleles : + GATKVariantContextUtils.noCallAlleles(g.getPloidy())); + genotypes.add(new GenotypeBuilder(g).alleles(noCallAlleles).make()); } else { // increment the number called alleles and called alternate alleles diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VCFIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VCFIntegrationTest.java index d7ba67fd0..878316db8 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VCFIntegrationTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VCFIntegrationTest.java @@ -109,7 +109,7 @@ public class VCFIntegrationTest extends WalkerTest { String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; String test1 = baseCommand + "-T SelectVariants -V " + testVCF; - WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("122340e3dc333d2b4b79c5c0c443a3fe")); + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("8e35e142fea8891f72e577e7b1526756")); executeTest("Test reading and writing samtools vcf", spec1); } @@ -118,7 +118,7 @@ public class VCFIntegrationTest extends WalkerTest { String testVCF = privateTestDir + "ex2.vcf"; String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s "; String test1 = baseCommand + "-T SelectVariants -V " + testVCF; - WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("db565efb14b2fe5f00a11762751d2476")); + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("5245b967ac66b29c8e836bfa5b5b4643")); executeTest("Test writing samtools WEx BCF example", spec1); } @@ -360,7 +360,7 @@ public class VCFIntegrationTest extends WalkerTest { " -o %s "; final String name = "testBlockCompressedInput: " + testSpec.toString(); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine, 1, Arrays.asList("ce9c0bf31ee9452ac4a12a59d5814545")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine, 1, Arrays.asList("1cc0ab636f33b9105d54bdaf2b80ad28")); executeTest(name, spec); } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java index 3bf41d947..407a49570 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java @@ -48,14 +48,6 @@ public class GATKVariantContextUtils { public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. - /** - * Diploid NO_CALL allele list... - * - * @deprecated you should use {@link #noCallAlleles(int)} instead. It indicates the presence of a hardcoded diploid assumption which is bad. - */ - @Deprecated - public final static List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); - public final static String MERGE_FILTER_PREFIX = "filterIn"; public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; @@ -600,11 +592,11 @@ public class GATKVariantContextUtils { * @param vc variant context with genotype likelihoods * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** * @param assignGenotypes assignment strategy for the (subsetted) PLs - * @return a new non-null GenotypesContext + * @return a new non-null GenotypesContext with subsetted alleles */ - public static GenotypesContext subsetDiploidAlleles(final VariantContext vc, - final List allelesToUse, - final GenotypeAssignmentMethod assignGenotypes) { + public static GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final GenotypeAssignmentMethod assignGenotypes) { if ( vc == null ) throw new IllegalArgumentException("the VariantContext cannot be null"); if ( allelesToUse == null ) throw new IllegalArgumentException("the alleles to use cannot be null"); if ( allelesToUse.isEmpty() ) throw new IllegalArgumentException("must have alleles to use"); @@ -615,7 +607,7 @@ public class GATKVariantContextUtils { if (vc.getGenotypes().isEmpty()) return GenotypesContext.create(); // find the likelihoods indexes to use from the used alternate alleles - final List likelihoodIndexesToUse = determineDiploidLikelihoodIndexesToUse(vc, allelesToUse); + final List> likelihoodIndexesToUse = determineLikelihoodIndexesToUse(vc, allelesToUse); // find the strand allele count indexes to use from the used alternate alleles final List sacIndexesToUse = determineSACIndexesToUse(vc, allelesToUse); @@ -625,26 +617,26 @@ public class GATKVariantContextUtils { } /** - * Find the likelihood indexes to use for a selected set of diploid alleles + * Find the likelihood indexes to use for a selected set of alleles * * @param originalVC the original VariantContext * @param allelesToUse the subset of alleles to use * @return a list of PL indexes to use or null if none */ - private static List determineDiploidLikelihoodIndexesToUse(final VariantContext originalVC, final List allelesToUse) { + private static List> determineLikelihoodIndexesToUse(final VariantContext originalVC, final List allelesToUse) { if ( originalVC == null) throw new IllegalArgumentException("the original VariantContext cannot be null"); if ( allelesToUse == null ) throw new IllegalArgumentException("the alleles to use cannot be null"); // the bitset representing the allele indexes we want to keep - final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); + final BitSet alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, // then we can keep the PLs as is; otherwise, we determine which ones to keep - if ( MathUtils.countOccurrences(true, alleleIndexesToUse) == alleleIndexesToUse.length ) + if ( alleleIndexesToUse.cardinality() == alleleIndexesToUse.size() ) return null; - return getDiploidLikelihoodIndexes(originalVC, alleleIndexesToUse); + return getLikelihoodIndexes(originalVC, alleleIndexesToUse); } /** @@ -654,17 +646,17 @@ public class GATKVariantContextUtils { * @param allelesToUse the subset of alleles to use * @return a list of SAC indexes to use or null if none */ - public static List determineSACIndexesToUse(final VariantContext originalVC, final List allelesToUse) { + public static List determineSACIndexesToUse(final VariantContext originalVC, final List allelesToUse) { if ( originalVC == null ) throw new IllegalArgumentException("the original VC cannot be null"); if ( allelesToUse == null ) throw new IllegalArgumentException("the alleles to use cannot be null"); // the bitset representing the allele indexes we want to keep - final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); + final BitSet alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, // then we can keep the SACs as is; otherwise, we determine which ones to keep - if (MathUtils.countOccurrences(true, alleleIndexesToUse) == alleleIndexesToUse.length) + if (alleleIndexesToUse.cardinality() == alleleIndexesToUse.size()) return null; return getSACIndexes(alleleIndexesToUse); @@ -675,32 +667,24 @@ public class GATKVariantContextUtils { * * @param originalVC the original VariantContext * @param alleleIndexesToUse the bitset representing the alleles to use (@see #getAlleleIndexBitset) - * @return a non-null List + * @return likelihoods indexes for each genotype */ - private static List getDiploidLikelihoodIndexes(final VariantContext originalVC, final boolean[] alleleIndexesToUse) { + private static List> getLikelihoodIndexes(final VariantContext originalVC, BitSet alleleIndexesToUse) { - if (originalVC == null) throw new IllegalArgumentException("the original VC cannot be null"); - if (alleleIndexesToUse == null) throw new IllegalArgumentException("the alleles to use cannot be null"); + final List> likelihoodIndexesPerGenotype = new ArrayList>(10); - // All samples must be diploid - for ( final Genotype g : originalVC.getGenotypes() ){ - if ( g.getPloidy() != DEFAULT_PLOIDY ) - throw new ReviewedGATKException("All samples must be diploid"); + for (final Genotype g : originalVC.getGenotypes()) { + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(originalVC.getNAlleles(), g.getPloidy()); + final List likelihoodIndexes = new ArrayList<>(30); + for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { + // consider this entry only if all the alleles are good + if ( GenotypeLikelihoods.getAlleles(PLindex, g.getPloidy()).stream().allMatch(i -> alleleIndexesToUse.get(i)) ) + likelihoodIndexes.add(PLindex); + } + likelihoodIndexesPerGenotype.add(likelihoodIndexes); } - final List result = new ArrayList<>(30); - - // numLikelihoods takes total # of alleles. - final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(originalVC.getNAlleles(), DEFAULT_PLOIDY); - - for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { - final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - // consider this entry only if both of the alleles are good - if ( alleleIndexesToUse[alleles.alleleIndex1] && alleleIndexesToUse[alleles.alleleIndex2] ) - result.add(PLindex); - } - - return result; + return likelihoodIndexesPerGenotype; } /** @@ -709,15 +693,15 @@ public class GATKVariantContextUtils { * @param alleleIndexesToUse the bitset representing the alleles to use (@see #getAlleleIndexBitset) * @return a non-null List */ - private static List getSACIndexes(final boolean[] alleleIndexesToUse) { + private static List getSACIndexes(final BitSet alleleIndexesToUse) { if (alleleIndexesToUse == null) throw new IllegalArgumentException("the alleles to use cannot be null"); - if (alleleIndexesToUse.length == 0) throw new IllegalArgumentException("cannot have no alleles to use"); + if (alleleIndexesToUse.isEmpty()) throw new IllegalArgumentException("cannot have no alleles to use"); - final List result = new ArrayList<>(2 * alleleIndexesToUse.length); + final List result = new ArrayList<>(2 * alleleIndexesToUse.size()); - for (int SACindex = 0; SACindex < alleleIndexesToUse.length; SACindex++) { - if (alleleIndexesToUse[SACindex]) { + for (int SACindex = 0; SACindex < alleleIndexesToUse.size(); SACindex++) { + if (alleleIndexesToUse.get(SACindex)) { result.add(2 * SACindex); result.add(2 * SACindex + 1); } @@ -734,19 +718,19 @@ public class GATKVariantContextUtils { * @param allelesToUse the list of alleles to keep * @return non-null bitset */ - private static boolean[] getAlleleIndexBitset(final VariantContext originalVC, final List allelesToUse) { + private static BitSet getAlleleIndexBitset(final VariantContext originalVC, final List allelesToUse) { if (originalVC == null) throw new IllegalArgumentException("the original VC cannot be null"); if (allelesToUse == null) throw new IllegalArgumentException("the alleles to use cannot be null"); final int numOriginalAltAlleles = originalVC.getNAlleles() - 1; - final boolean[] alleleIndexesToKeep = new boolean[numOriginalAltAlleles + 1]; + final BitSet alleleIndexesToKeep = new BitSet(numOriginalAltAlleles + 1); - // the reference Allele is definitely still used - alleleIndexesToKeep[0] = true; + // the reference Allele is always used + alleleIndexesToKeep.set(0); for (int i = 0; i < numOriginalAltAlleles; i++) { if (allelesToUse.contains(originalVC.getAlternateAllele(i))) - alleleIndexesToKeep[i + 1] = true; + alleleIndexesToKeep.set(i + 1); } return alleleIndexesToKeep; @@ -813,7 +797,7 @@ public class GATKVariantContextUtils { * @param originalGs the original GenotypesContext * @param originalVC the original VariantContext * @param allelesToUse the actual alleles to use with the new Genotypes - * @param likelihoodIndexesToUse the indexes in the PL to use given the allelesToUse (@see #determineDiploidLikelihoodIndexesToUse()) + * @param likelihoodIndexesToUse the indexes in the PL to use given the allelesToUse for each genotype (@see #determineLikelihoodIndexesToUse()) * @param sacIndexesToUse the indexes in the SAC to use given the allelesToUse (@see #determineSACIndexesToUse()) * @param assignGenotypes assignment strategy for the (subsetted) PLs * @return a new non-null GenotypesContext @@ -821,7 +805,7 @@ public class GATKVariantContextUtils { private static GenotypesContext createGenotypesWithSubsettedLikelihoods(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse, - final List likelihoodIndexesToUse, + final List> likelihoodIndexesToUse, final List sacIndexesToUse, final GenotypeAssignmentMethod assignGenotypes) { @@ -832,9 +816,6 @@ public class GATKVariantContextUtils { // the new genotypes to create final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); - // make sure we are seeing the expected number of likelihoods per sample - final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(originalVC.getNAlleles(), 2); - // the samples final List sampleIndices = originalGs.getSampleNamesOrderedByName(); @@ -850,6 +831,8 @@ public class GATKVariantContextUtils { newLikelihoods = null; gb.noPL(); } else { + // make sure we are seeing the expected number of likelihoods per sample + final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(originalVC.getNAlleles(), g.getPloidy()); final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); if ( likelihoodIndexesToUse == null ) { newLikelihoods = originalLikelihoods; @@ -857,19 +840,20 @@ public class GATKVariantContextUtils { logger.debug("Wrong number of likelihoods in sample " + g.getSampleName() + " at " + originalVC + " got " + g.getLikelihoodsString() + " but expected " + expectedNumLikelihoods); newLikelihoods = null; } else { - newLikelihoods = new double[likelihoodIndexesToUse.size()]; + newLikelihoods = new double[likelihoodIndexesToUse.get(k).size()]; int newIndex = 0; - for ( final int oldIndex : likelihoodIndexesToUse ) + for ( final int oldIndex : likelihoodIndexesToUse.get(k) ) newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; // might need to re-normalize newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); } - if ( newLikelihoods == null || (originalVC.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0) == 0 && likelihoodsAreUninformative(newLikelihoods) )) + if ( newLikelihoods == null || (originalVC.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0) == 0 && likelihoodsAreUninformative(newLikelihoods) )) { gb.noPL(); - else + } else { gb.PL(newLikelihoods); + } } // create the new strand allele counts array from the used alleles @@ -878,7 +862,7 @@ public class GATKVariantContextUtils { gb.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, newSACs); } - updateGenotypeAfterSubsetting(g.getAlleles(), gb, assignGenotypes, newLikelihoods, allelesToUse); + updateGenotypeAfterSubsetting(g.getAlleles(), g.getPloidy(), gb, assignGenotypes, newLikelihoods, allelesToUse); newGTs.add(gb.make()); } @@ -893,12 +877,14 @@ public class GATKVariantContextUtils { * Add the genotype call (GT) field to GenotypeBuilder using the requested algorithm assignmentMethod * * @param originalGT the original genotype calls, cannot be null + * @param ploidy the number of sets of chromosomes * @param gb the builder where we should put our newly called alleles, cannot be null * @param assignmentMethod the method to use to do the assignment, cannot be null * @param newLikelihoods a vector of likelihoods to use if the method requires PLs, should be log10 likelihoods, cannot be null * @param allelesToUse the alleles we are using for our subsetting */ public static void updateGenotypeAfterSubsetting(final List originalGT, + final int ploidy, final GenotypeBuilder gb, final GenotypeAssignmentMethod assignmentMethod, final double[] newLikelihoods, @@ -911,11 +897,11 @@ public class GATKVariantContextUtils { case DO_NOT_ASSIGN_GENOTYPES: break; case SET_TO_NO_CALL: - gb.alleles(NO_CALL_ALLELES); + gb.alleles(noCallAlleles(ploidy)); gb.noGQ(); break; case SET_TO_NO_CALL_NO_ANNOTATIONS: - gb.alleles(NO_CALL_ALLELES); + gb.alleles(noCallAlleles(ploidy)); gb.noGQ(); gb.noAD(); gb.noPL(); @@ -924,13 +910,16 @@ public class GATKVariantContextUtils { case USE_PLS_TO_ASSIGN: if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) { // if there is no mass on the (new) likelihoods, then just no-call the sample - gb.alleles(NO_CALL_ALLELES); + gb.alleles(noCallAlleles(ploidy)); gb.noGQ(); } else { // find the genotype with maximum likelihoods final int PLindex = MathUtils.maxElementIndex(newLikelihoods); - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - gb.alleles(Arrays.asList(allelesToUse.get(alleles.alleleIndex1), allelesToUse.get(alleles.alleleIndex2))); + final List alleles = new ArrayList<>(); + for ( final Integer alleleIndex : GenotypeLikelihoods.getAlleles(PLindex, ploidy)) { + alleles.add(allelesToUse.get(alleleIndex) ); + } + gb.alleles(alleles); gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); } break; @@ -973,7 +962,7 @@ public class GATKVariantContextUtils { // create the new genotypes for ( final Genotype g : vc.getGenotypes() ) { final int gPloidy = g.getPloidy() == 0 ? ploidy : g.getPloidy(); - final List refAlleles = gPloidy == 2 ? diploidRefAlleles : Collections.nCopies(gPloidy, ref); + final List refAlleles = Collections.nCopies(gPloidy, vc.getReference()); final GenotypeBuilder gb = new GenotypeBuilder(g.getSampleName(), refAlleles); if ( g.hasDP() ) gb.DP(g.getDP()); if ( g.hasGQ() ) gb.GQ(g.getGQ()); @@ -990,7 +979,7 @@ public class GATKVariantContextUtils { * @return genotypes context */ public static GenotypesContext assignDiploidGenotypes(final VariantContext vc) { - return subsetDiploidAlleles(vc, vc.getAlleles(), GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); + return subsetAlleles(vc, vc.getAlleles(), GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); } /** @@ -1079,7 +1068,7 @@ public class GATKVariantContextUtils { genotypeAssignmentMethodUsed != GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL ) addInfoFiledAnnotations(vc, builder, alt, keepOriginalChrCounts); - builder.genotypes(subsetDiploidAlleles(vc, alleles, genotypeAssignmentMethodUsed)); + builder.genotypes(subsetAlleles(vc, alleles, genotypeAssignmentMethodUsed)); final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true); biallelics.add(trimmed); } @@ -1400,7 +1389,7 @@ public class GATKVariantContextUtils { if ( numNewAlleles == numOriginalAlleles ) return oldGs; - return fixDiploidGenotypesFromSubsettedAlleles(oldGs, originalVC, selectedVC.getAlleles()); + return fixGenotypesFromSubsettedAlleles(oldGs, originalVC, selectedVC.getAlleles()); } /** @@ -1411,14 +1400,14 @@ public class GATKVariantContextUtils { * @param allelesToUse the new subset of alleles to use * @return a new non-null GenotypesContext */ - static private GenotypesContext fixDiploidGenotypesFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { + static private GenotypesContext fixGenotypesFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { if ( originalGs == null ) throw new IllegalArgumentException("the selected GenotypesContext cannot be null"); if ( originalVC == null ) throw new IllegalArgumentException("the original VariantContext cannot be null"); if ( allelesToUse == null ) throw new IllegalArgumentException("the alleles to use cannot be null"); // find the likelihoods indexes to use from the used alternate alleles - final List likelihoodIndexesToUse = determineDiploidLikelihoodIndexesToUse(originalVC, allelesToUse); + final List> likelihoodIndexesToUse = determineLikelihoodIndexesToUse(originalVC, allelesToUse); // find the strand allele count indexes to use from the used alternate alleles final List sacIndexesToUse = determineSACIndexesToUse(originalVC, allelesToUse); @@ -1441,7 +1430,7 @@ public class GATKVariantContextUtils { if (allelesToUse == null) throw new IllegalArgumentException("the alleles to use list cannot be null"); // the bitset representing the allele indexes we want to keep - final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); + final BitSet alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); // the new genotypes to create final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); @@ -1452,7 +1441,7 @@ public class GATKVariantContextUtils { // create the new genotypes for ( int k = 0; k < originalGs.size(); k++ ) { final Genotype g = originalGs.get(sampleIndices.get(k)); - newGTs.add(fixAD(g, alleleIndexesToUse, allelesToUse.size())); + newGTs.add(fixAD(g, alleleIndexesToUse)); } return newGTs; @@ -1463,10 +1452,9 @@ public class GATKVariantContextUtils { * * @param genotype the original Genotype * @param alleleIndexesToUse a bitset describing whether or not to keep a given index - * @param nAllelesToUse how many alleles we are keeping * @return a non-null Genotype */ - private static Genotype fixAD(final Genotype genotype, final boolean[] alleleIndexesToUse, final int nAllelesToUse) { + private static Genotype fixAD(final Genotype genotype, final BitSet alleleIndexesToUse) { // if it ain't broke don't fix it if ( !genotype.hasAD() ) return genotype; @@ -1474,18 +1462,14 @@ public class GATKVariantContextUtils { final GenotypeBuilder builder = new GenotypeBuilder(genotype); final int[] oldAD = genotype.getAD(); - if ( oldAD.length != alleleIndexesToUse.length ) { - builder.noAD(); - } else { - final int[] newAD = new int[nAllelesToUse]; - int currentIndex = 0; - for ( int i = 0; i < oldAD.length; i++ ) { - if ( alleleIndexesToUse[i] ) - newAD[currentIndex++] = oldAD[i]; - } - builder.AD(newAD); + final int[] newAD = new int[alleleIndexesToUse.cardinality()]; + + int currentIndex = 0; + for ( int i = alleleIndexesToUse.nextSetBit(0); i >= 0; i = alleleIndexesToUse.nextSetBit(i+1) ) { + newAD[currentIndex++] = oldAD[i]; } - return builder.make(); + + return builder.AD(newAD).make(); } private static Allele determineReferenceAllele(final List VCs) { @@ -2247,11 +2231,11 @@ public class GATKVariantContextUtils { final Object[] splitOriginalField = getVAttributeValues(vc.getAttribute(infoFieldName)); // subset the field - final boolean[] alleleIndexesToUse = getAlleleIndexBitset(vc, Arrays.asList(altAllele)); + final BitSet alleleIndexesToUse = getAlleleIndexBitset(vc, Arrays.asList(altAllele)); // skip the first allele, which is the reference - for (int i = 1; i < alleleIndexesToUse.length; i++) { - if (alleleIndexesToUse[i] == true) + for (int i = 1; i < alleleIndexesToUse.size(); i++) { + if (alleleIndexesToUse.get(i)) return splitOriginalField[i-1]; } diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java index 33684cda1..f875d2e8f 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -554,8 +554,8 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { // final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); // // final VariantContext merged = VariantContextUtils.simpleMerge( -// Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, -// VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false, false); +// Arrays.asList(vc1, vc2), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, +// GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false); // } // -------------------------------------------------------------------------------- @@ -835,7 +835,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { @Test(enabled = !DEBUG) public void testRepeatAllele() { - Allele nullR = Allele.create("A", true); + Allele nullR = Aref; Allele nullA = Allele.create("A", false); Allele atc = Allele.create("AATC", false); Allele atcatc = Allele.create("AATCATC", false); @@ -1126,24 +1126,25 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { // // -------------------------------------------------------------------------------- - @DataProvider(name = "subsetDiploidAllelesData") - public Object[][] makesubsetDiploidAllelesData() { + @DataProvider(name = "SubsetAllelesData") + public Object[][] makesubsetAllelesData() { List tests = new ArrayList<>(); - final Allele A = Allele.create("A", true); - final Allele C = Allele.create("C"); - final Allele G = Allele.create("G"); - - final List AA = Arrays.asList(A,A); - final List AC = Arrays.asList(A,C); + final List AA = Arrays.asList(Aref,Aref); + final List AC = Arrays.asList(Aref,C); final List CC = Arrays.asList(C,C); - final List AG = Arrays.asList(A,G); - final List CG = Arrays.asList(C,G); + final List AG = Arrays.asList(Aref,G); final List GG = Arrays.asList(G,G); - final List ACG = Arrays.asList(A,C,G); + final List ACG = Arrays.asList(Aref,C,G); final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make(); + // haploid, one alt allele + final double[] haploidRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.1}); + final double[] haploidAltPL = MathUtils.normalizeFromRealSpace(new double[]{0.1, 0.9}); + final double[] haploidUninformative = new double[]{0, 0}; + + // diploid, one alt allele final double[] homRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.09, 0.01}); final double[] hetPL = MathUtils.normalizeFromRealSpace(new double[]{0.09, 0.9, 0.01}); final double[] homVarPL = MathUtils.normalizeFromRealSpace(new double[]{0.01, 0.09, 0.9}); @@ -1151,37 +1152,47 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(50).make(); - // make sure we don't screw up the simple case - final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{5, 10, 15, 20}).GQ(8).make(); - final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10, 2}).PL(hetPL).attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{5, 10, 15, 20}).GQ(8).make(); - final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10, 2}).PL(homVarPL).attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{5, 10, 15, 20}).GQ(8).make(); + // the simple case where no selection occurs + final Genotype aHaploidGT = new GenotypeBuilder(base).alleles(Arrays.asList(Aref)).AD(new int[]{10,2}).PL(haploidRefPL).GQ(8).make(); + final Genotype cHaploidGT = new GenotypeBuilder(base).alleles(Arrays.asList(C)).AD(new int[]{10,2}).PL(haploidAltPL).GQ(8).make(); + final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); + final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); + final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make(); + // haploid + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aHaploidGT).make(), AC, Arrays.asList(new GenotypeBuilder(aHaploidGT).make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(cHaploidGT).make(), AC, Arrays.asList(new GenotypeBuilder(cHaploidGT).make())}); + // diploid tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), AC, Arrays.asList(new GenotypeBuilder(aaGT).make())}); tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), AC, Arrays.asList(new GenotypeBuilder(acGT).make())}); tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(ccGT).make(), AC, Arrays.asList(new GenotypeBuilder(ccGT).make())}); - // uninformative test case + // uninformative test cases + // diploid final Genotype uninformativeGT = new GenotypeBuilder(base).alleles(CC).PL(uninformative).GQ(0).make(); - final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noPL().noGQ().make(); + final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.noCallAlleles(2)).noPL().noGQ().make(); tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), AC, Arrays.asList(emptyGT)}); + // haploid + final Genotype haploidUninformativeGT = new GenotypeBuilder(base).alleles(Arrays.asList(C)).PL(haploidUninformative).GQ(0).make(); + final Genotype haplpoidEmptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.noCallAlleles(1)).noPL().noGQ().make(); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(haploidUninformativeGT).make(), AC, Arrays.asList(haplpoidEmptyGT)}); - // actually subsetting down from multiple alt values + // subsetting from 3 to 2 alleles + // diploid PL order is: AA, AC, CC, AG, CG, GG (00, 01, 11, 02, 12, 22) final double[] homRef3AllelesPL = new double[]{0, -10, -20, -30, -40, -50}; final double[] hetRefC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50}; final double[] homC3AllelesPL = new double[]{-20, -10, 0, -30, -40, -50}; final double[] hetRefG3AllelesPL = new double[]{-20, -10, -30, 0, -40, -50}; - final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG - final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG + final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; + final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; tests.add(new Object[]{ new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(homRef3AllelesPL).make()).make(), AC, Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).GQ(100).make())}); - tests.add(new Object[]{ new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(hetRefC3AllelesPL).make()).make(), AC, Arrays.asList(new GenotypeBuilder(base).alleles(AC).PL(new double[]{-10, 0, -20}).GQ(100).make())}); - tests.add(new Object[]{ new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(homC3AllelesPL).make()).make(), AC, @@ -1190,7 +1201,6 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(hetRefG3AllelesPL).make()).make(), AG, Arrays.asList(new GenotypeBuilder(base).alleles(AG).PL(new double[]{-20, 0, -50}).GQ(200).make())}); - // wow, scary -- bad output but discussed with Eric and we think this is the only thing that can be done tests.add(new Object[]{ new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(hetCG3AllelesPL).make()).make(), @@ -1202,14 +1212,36 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { AG, Arrays.asList(new GenotypeBuilder(base).alleles(GG).PL(new double[]{-20, -40, 0}).GQ(200).make())}); + // haploid PL order is: A, C, G (0, 1, 2) + final double[] haploidRef3AllelesPL = new double[]{0, -10, -20}; + final double[] haploidAltC3AllelesPL = new double[]{-10, 0, -20}; + final double[] haploidAltG3AllelesPL = new double[]{-20, -10, 0}; + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(Arrays.asList(Aref)).PL(haploidRef3AllelesPL).make()).make(), + AC, + Arrays.asList(new GenotypeBuilder(base).alleles(Arrays.asList(Aref)).PL(new double[]{0, -10}).GQ(100).make())}); + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(Arrays.asList(C)).PL(haploidAltC3AllelesPL).make()).make(), + AC, + Arrays.asList(new GenotypeBuilder(base).alleles(Arrays.asList(C)).PL(new double[]{-10, 0}).GQ(100).make())}); + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(Arrays.asList(G)).PL(haploidAltG3AllelesPL).make()).make(), + AG, + Arrays.asList(new GenotypeBuilder(base).alleles(Arrays.asList(G)).PL(new double[]{-20, 0}).GQ(200).make())}); + return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "subsetDiploidAllelesData") - public void testsubsetDiploidAllelesData(final VariantContext inputVC, - final List allelesToUse, - final List expectedGenotypes) { - final GenotypesContext actual = GATKVariantContextUtils.subsetDiploidAlleles(inputVC, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); + @Test(dataProvider = "SubsetAllelesData") + public void testSubsetAlleles(final VariantContext inputVC, + final List allelesToUse, + final List expectedGenotypes) { + // initialize cache of allele anyploid indices + for (final Genotype genotype : inputVC.getGenotypes()) { + GenotypeLikelihoods.initializeAnyploidPLIndexToAlleleIndices(inputVC.getNAlleles() - 1, genotype.getPloidy()); + } + + final GenotypesContext actual = GATKVariantContextUtils.subsetAlleles(inputVC, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); Assert.assertEquals(actual.size(), expectedGenotypes.size()); for ( final Genotype expected : expectedGenotypes ) { @@ -1221,71 +1253,134 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { @DataProvider(name = "UpdateGenotypeAfterSubsettingData") public Object[][] makeUpdateGenotypeAfterSubsettingData() { - List tests = new ArrayList(); + final List tests = new ArrayList<>(); - final Allele A = Allele.create("A", true); - final Allele C = Allele.create("C"); - final Allele G = Allele.create("G"); - - final List AA = Arrays.asList(A,A); - final List AC = Arrays.asList(A,C); + final List AA = Arrays.asList(Aref,Aref); + final List AC = Arrays.asList(Aref,C); final List CC = Arrays.asList(C,C); - final List AG = Arrays.asList(A,G); + final List AG = Arrays.asList(Aref,G); final List CG = Arrays.asList(C,G); final List GG = Arrays.asList(G,G); - final List ACG = Arrays.asList(A,C,G); - final List> allSubsetAlleles = Arrays.asList(AC,AG,ACG); + final List AAA = Arrays.asList(Aref,Aref,Aref); + final List AAC = Arrays.asList(Aref,Aref,C); + final List ACC = Arrays.asList(Aref,C,C); + final List CCC = Arrays.asList(C,C,C); + final List AAG = Arrays.asList(Aref,Aref,G); + final List ACG = Arrays.asList(Aref,C,G); + final List CCG = Arrays.asList(C,C,G); + final List AGG = Arrays.asList(Aref,G,G); + final List CGG = Arrays.asList(C,G,G); + final List GGG = Arrays.asList(G,G,G); + final List> allDiploidSubsetAlleles = Arrays.asList(AC,AG,ACG); + final List> allTriploidSubsetAlleles = Arrays.asList(AAA,AAC,ACC,CCC,AAG,ACG,CCG,AGG,CGG,GGG); + // for P=1, the index of the genotype a is a + final double[] aRefPL = new double[]{0.9, 0.09, 0.01}; + final double[] cPL = new double[]{0.09, 0.9, 0.01}; + final double[] gPL = new double[]{0.01, 0.09, 0.9}; + final List allHaploidPLs = Arrays.asList(aRefPL, cPL, gPL); + final List> allHaploidSubsetAlleles = Arrays.asList(Arrays.asList(Aref), Arrays.asList(G)); + + // for P=2 and N=1, the ordering is 00,01,11 final double[] homRefPL = new double[]{0.9, 0.09, 0.01}; final double[] hetPL = new double[]{0.09, 0.9, 0.01}; final double[] homVarPL = new double[]{0.01, 0.09, 0.9}; final double[] uninformative = new double[]{0.33, 0.33, 0.33}; - final List allPLs = Arrays.asList(homRefPL, hetPL, homVarPL, uninformative); + final List allDiploidPLs = Arrays.asList(homRefPL, hetPL, homVarPL, uninformative); - for ( final List alleles : allSubsetAlleles ) { - for ( final double[] pls : allPLs ) { - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL, pls, AA, alleles, GATKVariantContextUtils.NO_CALL_ALLELES}); - } + // for P=3 and N=2, the ordering is 000, 001, 011, 111, 002, 012, 112, 022, 122, 222 + final double[] aaaPL = new double[]{0.9, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09}; + final double[] aacPL = new double[]{0.01, 0.9, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09}; + final double[] accPL = new double[]{0.01, 0.02, 0.9, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09}; + final double[] cccPL = new double[]{0.01, 0.02, 0.03, 0.9, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09}; + final double[] aagPL = new double[]{0.01, 0.02, 0.03, 0.04, 0.9, 0.05, 0.06, 0.07, 0.08, 0.09}; + final double[] acgPL = new double[]{0.01, 0.02, 0.03, 0.04, 0.05, 0.9, 0.06, 0.07, 0.08, 0.09}; + final double[] ccgPL = new double[]{0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.9, 0.07, 0.08, 0.09}; + final double[] aggPL = new double[]{0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.9, 0.08, 0.09}; + final double[] cggPL = new double[]{0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.9, 0.09}; + final double[] gggPL = new double[]{0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.9}; + final double[] uninformativeTriploid = new double[]{0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1}; + final List allTriploidPLs = Arrays.asList(homRefPL, hetPL, homVarPL, uninformativeTriploid); + + + for ( final List alleles : allHaploidSubsetAlleles ) { + tests.add(new Object[]{1, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL, allHaploidPLs.get(0), Arrays.asList(Aref), alleles, GATKVariantContextUtils.noCallAlleles(1)}); } + for ( final List alleles : allDiploidSubsetAlleles ) { + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL, allDiploidPLs.get(0), AA, alleles, GATKVariantContextUtils.noCallAlleles(2)}); + } + + for ( final List alleles : allTriploidSubsetAlleles ) { + tests.add(new Object[]{3, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL, allTriploidPLs.get(0), AAA, alleles, GATKVariantContextUtils.noCallAlleles(3)}); + } + + final List originalHaploidGT = Arrays.asList(Aref, C, G ); + tests.add(new Object[]{1, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, aRefPL, originalHaploidGT, originalHaploidGT, Arrays.asList(Aref)}); + tests.add(new Object[]{1, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, cPL, originalHaploidGT, originalHaploidGT, Arrays.asList(C)}); + tests.add(new Object[]{1, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, gPL, originalHaploidGT, originalHaploidGT, Arrays.asList(G)}); + for ( final List originalGT : Arrays.asList(AA, AC, CC, AG, CG, GG) ) { - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homRefPL, originalGT, AC, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, hetPL, originalGT, AC, AC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homVarPL, originalGT, AC, CC}); -// tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, uninformative, AA, AC, GATKVariantContextUtils.NO_CALL_ALLELES}); + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homRefPL, originalGT, AC, AA}); + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, hetPL, originalGT, AC, AC}); + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homVarPL, originalGT, AC, CC}); } - for ( final double[] pls : allPLs ) { - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, AC, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, AC, AC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, AC, CC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, AC, AC}); - - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, AG, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, AG, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, AG, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, AG, AG}); - - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, ACG, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, ACG, AC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, ACG, CC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AG, ACG, AG}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, ACG, CG}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, GG, ACG, GG}); + for ( final List originalGT : allTriploidSubsetAlleles) { + tests.add(new Object[]{3, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, aaaPL, originalGT, ACG, AAA}); + tests.add(new Object[]{3, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, aacPL, originalGT, ACG, AAC}); + tests.add(new Object[]{3, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, accPL, originalGT, ACG, ACC}); + tests.add(new Object[]{3, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, cccPL, originalGT, ACG, CCC}); + tests.add(new Object[]{3, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, aagPL, originalGT, ACG, AAG}); + tests.add(new Object[]{3, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, acgPL, originalGT, ACG, ACG}); + tests.add(new Object[]{3, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, ccgPL, originalGT, ACG, CCG}); + tests.add(new Object[]{3, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, aggPL, originalGT, ACG, AGG}); + tests.add(new Object[]{3, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, cggPL, originalGT, ACG, CCG}); + tests.add(new Object[]{3, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, gggPL, originalGT, ACG, GGG}); } + tests.add(new Object[]{1, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allHaploidPLs.get(0), Arrays.asList(Aref, C, G), Arrays.asList(Aref), Arrays.asList(Aref)}); + tests.add(new Object[]{1, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allHaploidPLs.get(0), Arrays.asList(Aref, C, G), Arrays.asList(C), Arrays.asList(C)}); + tests.add(new Object[]{1, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allHaploidPLs.get(0), Arrays.asList(Aref, C, G), Arrays.asList(G), Arrays.asList(G)}); + + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allDiploidPLs.get(0), AA, AC, AA}); + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allDiploidPLs.get(0), AC, AC, AC}); + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allDiploidPLs.get(0), CC, AC, CC}); + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allDiploidPLs.get(0), CG, AC, AC}); + + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allDiploidPLs.get(0), AA, AG, AA}); + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allDiploidPLs.get(0), AC, AG, AA}); + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allDiploidPLs.get(0), CC, AG, AA}); + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allDiploidPLs.get(0), CG, AG, AG}); + + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allDiploidPLs.get(0), AA, ACG, AA}); + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allDiploidPLs.get(0), AC, ACG, AC}); + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allDiploidPLs.get(0), CC, ACG, CC}); + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allDiploidPLs.get(0), AG, ACG, AG}); + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allDiploidPLs.get(0), CG, ACG, CG}); + tests.add(new Object[]{2, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allDiploidPLs.get(0), GG, ACG, GG}); + + tests.add(new Object[]{3, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allTriploidPLs.get(0), AAA, AAC, AAA}); + tests.add(new Object[]{3, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allTriploidPLs.get(0), ACC, AAC, ACC}); + tests.add(new Object[]{3, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allTriploidPLs.get(0), AAC, AAC, AAC}); + tests.add(new Object[]{3, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allTriploidPLs.get(0), AAC, ACG, AAC}); + tests.add(new Object[]{3, GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, allTriploidPLs.get(0), GGG, AAA, AAA}); + return tests.toArray(new Object[][]{}); } @Test(enabled = !DEBUG, dataProvider = "UpdateGenotypeAfterSubsettingData") - public void testUpdateGenotypeAfterSubsetting(final GATKVariantContextUtils.GenotypeAssignmentMethod mode, + public void testUpdateGenotypeAfterSubsetting(final int ploidy, + final GATKVariantContextUtils.GenotypeAssignmentMethod mode, final double[] likelihoods, final List originalGT, final List allelesToUse, final List expectedAlleles) { + final int numAltAlleles = originalGT.size() > 1 ? originalGT.size() - 1 : 1; + GenotypeLikelihoods.initializeAnyploidPLIndexToAlleleIndices(numAltAlleles, ploidy); final GenotypeBuilder gb = new GenotypeBuilder("test"); final double[] log10Likelhoods = MathUtils.normalizeFromLog10(likelihoods, true, false); - GATKVariantContextUtils.updateGenotypeAfterSubsetting(originalGT, gb, mode, log10Likelhoods, allelesToUse); + GATKVariantContextUtils.updateGenotypeAfterSubsetting(originalGT, ploidy, gb, mode, log10Likelhoods, allelesToUse); final Genotype g = gb.make(); Assert.assertEquals(new HashSet<>(g.getAlleles()), new HashSet<>(expectedAlleles)); } @@ -1331,17 +1426,11 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { public Object[][] makeUpdatePLsSACsAndADData() { List tests = new ArrayList<>(); - final Allele A = Allele.create("A", true); - final Allele C = Allele.create("C"); - final Allele G = Allele.create("G"); - - final List AA = Arrays.asList(A,A); - final List AC = Arrays.asList(A,C); + final List AA = Arrays.asList(Aref,Aref); + final List AC = Arrays.asList(Aref,C); final List CC = Arrays.asList(C,C); - final List AG = Arrays.asList(A,G); - final List CG = Arrays.asList(C,G); - final List GG = Arrays.asList(G,G); - final List ACG = Arrays.asList(A,C,G); + final List AG = Arrays.asList(Aref,G); + final List ACG = Arrays.asList(Aref,C,G); final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make(); @@ -1352,7 +1441,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(100).make(); - // make sure we don't screw up the simple case where no selection happens + // make sure we don't screw up the simple case where no selection occurs final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{5, 10, 15, 20}).GQ(8).make(); final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10, 2}).PL(hetPL).attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{5, 10, 15, 20}).GQ(8).make(); final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10, 2}).PL(homVarPL).attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{5, 10, 15, 20}).GQ(8).make(); @@ -1364,7 +1453,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { // uninformative test cases final Genotype uninformativeGT = new GenotypeBuilder(base).alleles(CC).noAD().PL(uninformative).GQ(0).make(); tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(uninformativeGT)}); - final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noAD().noPL().noGQ().make(); + final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.noCallAlleles(2)).noAD().noPL().noGQ().make(); tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(emptyGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(emptyGT)}); // actually subsetting down from multiple alt values @@ -1375,6 +1464,14 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG + final double[] haploidRef3AllelesPL = new double[]{0, -10, -20}; + final double[] haploidAltC3AllelesPL = new double[]{-10, 0, -20}; + final double[] haploidAltG3AllelesPL = new double[]{-20, -10, 0}; + + // for P=3 and N=2, the ordering is 000, 001, 011, 111, 002, 012, 112, 022, 122, 222 + final double[] triploidRef3AllelesPL = new double[]{0, -10, -20, -30, -40, -50, -60, -70, -80, -90}; + final double[] tripoidAltC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50, -60, -70, -80, -90}; + final int[] homRef3AllelesAD = new int[]{20, 0, 1}; final int[] hetRefC3AllelesAD = new int[]{10, 10, 1}; final int[] homC3AllelesAD = new int[]{0, 20, 1}; @@ -1389,6 +1486,44 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final int[] hetCG3AllelesSAC = new int[]{0, 0, 12, 12, 11, 11}; // AA, AC, CC, AG, CG, GG final int[] homG3AllelesSAC = new int[]{0, 0, 1, 1, 21, 21}; // AA, AC, CC, AG, CG, GG + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(Arrays.asList(Aref)).AD(homRef3AllelesAD).PL(haploidRef3AllelesPL). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, homRef3AllelesSAC).make()).make(), + new VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(Arrays.asList(Aref)).PL(new double[]{0, -10}).AD(new int[]{20, 0}). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{20, 19, 0, 1}).GQ(100).make())}); + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(Arrays.asList(Aref)).AD(hetRefC3AllelesAD).PL(haploidAltC3AllelesPL). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, hetRefC3AllelesSAC).make()).make(), + new VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(Arrays.asList(Aref)).PL(new double[]{-10, 0}).AD(new int[]{10, 10}). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{10, 9, 10, 9}).GQ(100).make())}); + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(Arrays.asList(Aref)).AD(homC3AllelesAD).PL(haploidAltG3AllelesPL). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, homC3AllelesSAC).make()).make(), + new VariantContextBuilder(vcBase).alleles(AG).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(Arrays.asList(Aref)).PL(new double[]{-20, 0}).AD(new int[]{0, 1}). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{0, 0, 1, 1}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(Arrays.asList(Aref, Aref, Aref)).AD(homRef3AllelesAD).PL(triploidRef3AllelesPL). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, homRef3AllelesSAC).make()).make(), + new VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(Arrays.asList(Aref, Aref, Aref)).PL(new double[]{0, -10, -20, -30}).AD(new int[]{20, 0}). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{20, 19, 0, 1}).GQ(100).make())}); + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(Arrays.asList(Aref, Aref, C)).AD(hetRefC3AllelesAD).PL(tripoidAltC3AllelesPL). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, hetRefC3AllelesSAC).make()).make(), + new VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(Arrays.asList(Aref, Aref, C)).PL(new double[]{-10, 0, -20, -30}).AD(new int[]{10, 10}). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{10, 9, 10, 9}).GQ(100).make())}); + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(Arrays.asList(Aref, Aref, G)).AD(homRef3AllelesAD).PL(triploidRef3AllelesPL). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, homC3AllelesSAC).make()).make(), + new VariantContextBuilder(vcBase).alleles(AG).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(Arrays.asList(Aref, Aref, G)).PL(new double[]{0, -40, -70, -90}).AD(new int[]{20, 1}). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{0, 0, 1, 1}).GQ(100).make())}); + tests.add(new Object[]{ new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homRef3AllelesAD).PL(homRef3AllelesPL). attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, homRef3AllelesSAC).make()).make(), @@ -1433,6 +1568,11 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { public void testUpdatePLsAndADData(final VariantContext originalVC, final VariantContext selectedVC, final List expectedGenotypes) { + // initialize cache of allele anyploid indices + for (final Genotype genotype : originalVC.getGenotypes()) { + GenotypeLikelihoods.initializeAnyploidPLIndexToAlleleIndices(originalVC.getNAlleles() - 1, genotype.getPloidy()); + } + final VariantContext selectedVCwithGTs = new VariantContextBuilder(selectedVC).genotypes(originalVC.getGenotypes()).make(); final GenotypesContext actual = GATKVariantContextUtils.updatePLsSACsAD(selectedVCwithGTs, originalVC);