From dd5674b3b8c088e7dbb0b7e1822f4e55d02f7315 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 12 Jun 2013 12:43:19 -0400 Subject: [PATCH] Add genotyping accuracy assessment to AssessNA12878 -- Now table looks like: Name VariantType AssessmentType Count variant SNPS TRUE_POSITIVE 1220 variant SNPS FALSE_POSITIVE 0 variant SNPS FALSE_NEGATIVE 1 variant SNPS TRUE_NEGATIVE 150 variant SNPS CALLED_NOT_IN_DB_AT_ALL 0 variant SNPS HET_CONCORDANCE 100.00 variant SNPS HOMVAR_CONCORDANCE 99.63 variant INDELS TRUE_POSITIVE 273 variant INDELS FALSE_POSITIVE 0 variant INDELS FALSE_NEGATIVE 15 variant INDELS TRUE_NEGATIVE 79 variant INDELS CALLED_NOT_IN_DB_AT_ALL 2 variant INDELS HET_CONCORDANCE 98.67 variant INDELS HOMVAR_CONCORDANCE 89.58 -- Rewrite / refactored parts of subsetDiploidAlleles in GATKVariantContextUtils to have a BEST_MATCH assignment method that does it's best to simply match the genotype after subsetting to a set of alleles. So if the original GT was A/B and you subset to A/B it remains A/B but if you subset to A/C you get A/A. This means that het-alt B/C genotypes become A/B and A/C when subsetting to bi-allelics which is the convention in the KB. Add lots of unit tests for this functions (from 0 previously) -- BadSites in Assessment now emits TP sites with discordant genotypes with the type GENOTYPE_DISCORDANCE and tags the expected genotype in the info field as ExpectedGenotype, such as this record: 20 10769255 . A ATGTG 165.73 . ExpectedGenotype=HOM_VAR;SupportingCallsets=ebanks,depristo,CEUTrio_best_practices;WHY=GENOTYPE_DISCORDANCE GT:AD:DP:GQ:PL 0/1:1,9:10:6:360,0,6 Indicating that the call was a HET but the expected result was HOM_VAR -- Forbid subsetting of diploid genotypes to just a single allele. -- Added subsetToRef as a separate specific function. Use that in the DiploidExactAFCalc in the case that you need to reduce yourself to ref only. Preserves DP in the genotype field when this is possible, so a few integration tests have changed for the UG --- .../genotyper/afcalc/DiploidExactAFCalc.java | 7 +- ...dGenotyperIndelCallingIntegrationTest.java | 2 +- ...GenotyperNormalCallingIntegrationTest.java | 4 +- .../variant/GATKVariantContextUtils.java | 206 ++++++++++++---- .../GATKVariantContextUtilsUnitTest.java | 233 ++++++++++++++++-- 5 files changed, 380 insertions(+), 72 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java index 170b6e250..2ece18002 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java @@ -106,7 +106,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { alleles.add(vc.getReference()); alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles())); builder.alleles(alleles); - builder.genotypes(GATKVariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); + builder.genotypes(GATKVariantContextUtils.subsetDiploidAlleles(vc, alleles, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL)); return builder.make(); } else { return vc; @@ -352,6 +352,9 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { final List allelesToUse, final boolean assignGenotypes, final int ploidy) { - return GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes); + return allelesToUse.size() == 1 + ? GATKVariantContextUtils.subsetToRefOnly(vc, ploidy) + : GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, + assignGenotypes ? GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN : GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index 98a482c6f..64a27c4c3 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("294183823d678d3668f4fa98b4de6e06")); + Arrays.asList("facac578891a4f2be63ddd5ba6b9096b")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index bf4316415..f7c5e6fd5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("5e8f1fa88dc93320cc0e75e9fe6e153b")); + Arrays.asList("474dfb943a307c86cabe2043970c58f3")); executeTest("test MultiSample Pilot1", spec); } @@ -80,7 +80,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("60115af273fde49c76d4df6c9c0f6501")); + Arrays.asList("3e646003c5b93da80c7d8e5d0ff2ee4e")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index b5a6e82a0..3bc5da82f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -45,7 +45,7 @@ public class GATKVariantContextUtils { public static final int DEFAULT_PLOIDY = 2; public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. - private static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + protected static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); public final static String MERGE_FILTER_PREFIX = "filterIn"; public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; @@ -421,6 +421,37 @@ public class GATKVariantContextUtils { return true; // we passed all tests, we matched } + public enum GenotypeAssignmentMethod { + /** + * set all of the genotype GT values to NO_CALL + */ + SET_TO_NO_CALL, + + /** + * Use the subsetted PLs to greedily assigned genotypes + */ + USE_PLS_TO_ASSIGN, + + /** + * Try to match the original GT calls, if at all possible + * + * Suppose I have 3 alleles: A/B/C and the following samples: + * + * original_GT best_match to A/B best_match to A/C + * S1 => A/A A/A A/A + * S2 => A/B A/B A/A + * S3 => B/B B/B A/A + * S4 => B/C A/B A/C + * S5 => C/C A/A C/C + * + * Basically, all alleles not in the subset map to ref. It means that het-alt genotypes + * when split into 2 bi-allelic variants will be het in each, which is good in some cases, + * rather than the undetermined behavior when using the PLs to assign, which could result + * in hom-var or hom-ref for each, depending on the exact PL values. + */ + BEST_MATCH_TO_ORIGINAL + } + /** * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) * @@ -430,22 +461,23 @@ public class GATKVariantContextUtils { * @return genotypes */ public static GenotypesContext subsetDiploidAlleles(final VariantContext vc, - final List allelesToUse, - final boolean assignGenotypes) { + final List allelesToUse, + final GenotypeAssignmentMethod assignGenotypes) { + if ( allelesToUse.get(0).isNonReference() ) throw new IllegalArgumentException("First allele must be the reference allele"); + if ( allelesToUse.size() == 1 ) throw new IllegalArgumentException("Cannot subset to only 1 alt allele"); // the genotypes with PLs final GenotypesContext oldGTs = vc.getGenotypes(); // the new genotypes to create final GenotypesContext newGTs = GenotypesContext.create(); + // optimization: if no input genotypes, just exit - if (oldGTs.isEmpty()) - return newGTs; + if (oldGTs.isEmpty()) return newGTs; // samples final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); - // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), 2); @@ -456,8 +488,8 @@ public class GATKVariantContextUtils { // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, // then we can keep the PLs as is; otherwise, we determine which ones to keep - if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) { - likelihoodIndexesToUse = new ArrayList(30); + if ( numNewAltAlleles != numOriginalAltAlleles ) { + likelihoodIndexesToUse = new ArrayList<>(30); final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles]; for ( int i = 0; i < numOriginalAltAlleles; i++ ) { @@ -478,55 +510,127 @@ public class GATKVariantContextUtils { // create the new genotypes for ( int k = 0; k < oldGTs.size(); k++ ) { final Genotype g = oldGTs.get(sampleIndices.get(k)); - if ( !g.hasLikelihoods() ) { - newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); - continue; - } + final GenotypeBuilder gb = new GenotypeBuilder(g); // create the new likelihoods array from the alleles we are allowed to use - final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); double[] newLikelihoods; - if ( likelihoodIndexesToUse == null ) { - newLikelihoods = originalLikelihoods; - } else if ( originalLikelihoods.length != expectedNumLikelihoods ) { - logger.warn("Wrong number of likelihoods in sample " + g.getSampleName() + " at " + vc + " got " + g.getLikelihoodsString() + " but expected " + expectedNumLikelihoods); + if ( !g.hasLikelihoods() ) { + // we don't have any likelihoods, so we null out PLs and make G ./. newLikelihoods = null; + gb.noPL(); } else { - newLikelihoods = new double[likelihoodIndexesToUse.size()]; - int newIndex = 0; - for ( int oldIndex : likelihoodIndexesToUse ) - newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; + final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); + if ( likelihoodIndexesToUse == null ) { + newLikelihoods = originalLikelihoods; + } else if ( originalLikelihoods.length != expectedNumLikelihoods ) { + logger.warn("Wrong number of likelihoods in sample " + g.getSampleName() + " at " + vc + " got " + g.getLikelihoodsString() + " but expected " + expectedNumLikelihoods); + newLikelihoods = null; + } else { + newLikelihoods = new double[likelihoodIndexesToUse.size()]; + int newIndex = 0; + for ( int oldIndex : likelihoodIndexesToUse ) + newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; - // might need to re-normalize - newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); - } + // might need to re-normalize + newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); + } - // if there is no mass on the (new) likelihoods, then just no-call the sample - if ( newLikelihoods != null && MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { - newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); - } - else { - final GenotypeBuilder gb = new GenotypeBuilder(g); - - if ( newLikelihoods == null || numNewAltAlleles == 0 ) + if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) gb.noPL(); else gb.PL(newLikelihoods); - - // if we weren't asked to assign a genotype, then just no-call the sample - if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { - gb.alleles(NO_CALL_ALLELES); - } - else { - // find the genotype with maximum likelihoods - int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - - gb.alleles(Arrays.asList(allelesToUse.get(alleles.alleleIndex1), allelesToUse.get(alleles.alleleIndex2))); - if ( numNewAltAlleles != 0 ) gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); - } - newGTs.add(gb.make()); } + + updateGenotypeAfterSubsetting(g.getAlleles(), gb, assignGenotypes, newLikelihoods, allelesToUse); + newGTs.add(gb.make()); + } + + return newGTs; + } + + private static boolean likelihoodsAreUninformative(final double[] likelihoods) { + return MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL; + } + + /** + * Add the genotype call (GT) field to GenotypeBuilder using the requested algorithm assignmentMethod + * + * @param originalGT the original genotype calls, cannot be null + * @param gb the builder where we should put our newly called alleles, cannot be null + * @param assignmentMethod the method to use to do the assignment, cannot be null + * @param newLikelihoods a vector of likelihoods to use if the method requires PLs, should be log10 likelihoods, cannot be null + * @param allelesToUse the alleles we are using for our subsetting + */ + protected static void updateGenotypeAfterSubsetting(final List originalGT, + final GenotypeBuilder gb, + final GenotypeAssignmentMethod assignmentMethod, + final double[] newLikelihoods, + final List allelesToUse) { + gb.noAD(); + switch ( assignmentMethod ) { + case SET_TO_NO_CALL: + gb.alleles(NO_CALL_ALLELES); + gb.noGQ(); + break; + case USE_PLS_TO_ASSIGN: + if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) { + // if there is no mass on the (new) likelihoods, then just no-call the sample + gb.alleles(NO_CALL_ALLELES); + gb.noGQ(); + } else { + // find the genotype with maximum likelihoods + final int PLindex = MathUtils.maxElementIndex(newLikelihoods); + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + gb.alleles(Arrays.asList(allelesToUse.get(alleles.alleleIndex1), allelesToUse.get(alleles.alleleIndex2))); + gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); + } + break; + case BEST_MATCH_TO_ORIGINAL: + final List best = new LinkedList<>(); + final Allele ref = allelesToUse.get(0); // WARNING -- should be checked in input argument + for ( final Allele originalAllele : originalGT ) { + best.add(allelesToUse.contains(originalAllele) ? originalAllele : ref); + } + gb.noGQ(); + gb.noPL(); + gb.alleles(best); + break; + } + } + + /** + * Subset the samples in VC to reference only information with ref call alleles + * + * Preserves DP if present + * + * @param vc the variant context to subset down to + * @param ploidy ploidy to use if a genotype doesn't have any alleles + * @return a GenotypesContext + */ + public static GenotypesContext subsetToRefOnly(final VariantContext vc, final int ploidy) { + if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); + if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be >= 1 but got " + ploidy); + + // the genotypes with PLs + final GenotypesContext oldGTs = vc.getGenotypes(); + + // optimization: if no input genotypes, just exit + if (oldGTs.isEmpty()) return oldGTs; + + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(); + + final Allele ref = vc.getReference(); + final List diploidRefAlleles = Arrays.asList(ref, ref); + + // create the new genotypes + for ( final Genotype g : vc.getGenotypes() ) { + final int gPloidy = g.getPloidy() == 0 ? ploidy : g.getPloidy(); + final List refAlleles = gPloidy == 2 ? diploidRefAlleles : Collections.nCopies(gPloidy, ref); + final GenotypeBuilder gb = new GenotypeBuilder(g.getSampleName(), refAlleles); + if ( g.hasDP() ) gb.DP(g.getDP()); + if ( g.hasGQ() ) gb.GQ(g.getGQ()); + newGTs.add(gb.make()); } return newGTs; @@ -539,7 +643,7 @@ public class GATKVariantContextUtils { * @return genotypes context */ public static GenotypesContext assignDiploidGenotypes(final VariantContext vc) { - return subsetDiploidAlleles(vc, vc.getAlleles(), true); + return subsetDiploidAlleles(vc, vc.getAlleles(), GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); } /** @@ -557,7 +661,7 @@ public class GATKVariantContextUtils { * @return a list of bi-allelic (or monomorphic) variant context */ public static List splitVariantContextToBiallelics(final VariantContext vc) { - return splitVariantContextToBiallelics(vc, false); + return splitVariantContextToBiallelics(vc, false, GenotypeAssignmentMethod.SET_TO_NO_CALL); } /** @@ -575,18 +679,18 @@ public class GATKVariantContextUtils { * @param trimLeft if true, we will also left trim alleles, potentially moving the resulting vcs forward on the genome * @return a list of bi-allelic (or monomorphic) variant context */ - public static List splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft) { + public static List splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod) { if ( ! vc.isVariant() || vc.isBiallelic() ) // non variant or biallelics already satisfy the contract return Collections.singletonList(vc); else { - final List biallelics = new LinkedList(); + final List biallelics = new LinkedList<>(); for ( final Allele alt : vc.getAlternateAlleles() ) { VariantContextBuilder builder = new VariantContextBuilder(vc); final List alleles = Arrays.asList(vc.getReference(), alt); builder.alleles(alleles); - builder.genotypes(subsetDiploidAlleles(vc, alleles, false)); + builder.genotypes(subsetDiploidAlleles(vc, alleles, genotypeAssignmentMethod)); VariantContextUtils.calculateChromosomeCounts(builder, true); final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true); biallelics.add(trimmed); diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java index fcc7c7998..937698d82 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.variant; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.variantcontext.*; @@ -39,6 +40,7 @@ import org.testng.annotations.Test; import java.util.*; public class GATKVariantContextUtilsUnitTest extends BaseTest { + private final static boolean DEBUG = false; Allele Aref, T, C, G, Cref, ATC, ATCATC; @@ -168,7 +170,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { return MergeAllelesTest.getTests(MergeAllelesTest.class); } - @Test(dataProvider = "mergeAlleles") + @Test(enabled = !DEBUG, dataProvider = "mergeAlleles") public void testMergeAlleles(MergeAllelesTest cfg) { final List inputs = new ArrayList(); @@ -229,7 +231,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { return SimpleMergeRSIDTest.getTests(SimpleMergeRSIDTest.class); } - @Test(dataProvider = "simplemergersiddata") + @Test(enabled = !DEBUG, dataProvider = "simplemergersiddata") public void testRSIDMerge(SimpleMergeRSIDTest cfg) { VariantContext snpVC1 = makeVC("snpvc1", Arrays.asList(Aref, T)); final List inputs = new ArrayList(); @@ -352,7 +354,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { return MergeFilteredTest.getTests(MergeFilteredTest.class); } - @Test(dataProvider = "mergeFiltered") + @Test(enabled = !DEBUG, dataProvider = "mergeFiltered") public void testMergeFiltered(MergeFilteredTest cfg) { final List priority = vcs2priority(cfg.inputs); final VariantContext merged = GATKVariantContextUtils.simpleMerge( @@ -479,7 +481,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { return MergeGenotypesTest.getTests(MergeGenotypesTest.class); } - @Test(dataProvider = "mergeGenotypes") + @Test(enabled = !DEBUG, dataProvider = "mergeGenotypes") public void testMergeGenotypes(MergeGenotypesTest cfg) { final VariantContext merged = GATKVariantContextUtils.simpleMerge( cfg.inputs, cfg.priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, @@ -517,7 +519,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { } } - @Test + @Test(enabled = !DEBUG) public void testMergeGenotypesUniquify() { final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); @@ -547,7 +549,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { // // -------------------------------------------------------------------------------- - @Test + @Test(enabled = !DEBUG) public void testAnnotationSet() { for ( final boolean annotate : Arrays.asList(true, false)) { for ( final String set : Arrays.asList("set", "combine", "x")) { @@ -618,7 +620,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class); } - @Test(dataProvider = "ReverseClippingPositionTestProvider") + @Test(enabled = !DEBUG, dataProvider = "ReverseClippingPositionTestProvider") public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) { int result = GATKVariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes()); Assert.assertEquals(result, cfg.expectedClip); @@ -706,7 +708,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "SplitBiallelics") + @Test(enabled = !DEBUG, dataProvider = "SplitBiallelics") public void testSplitBiallelicsNoGenotypes(final VariantContext vc, final List expectedBiallelics) { final List biallelics = GATKVariantContextUtils.splitVariantContextToBiallelics(vc); Assert.assertEquals(biallelics.size(), expectedBiallelics.size()); @@ -717,7 +719,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { } } - @Test(dataProvider = "SplitBiallelics", dependsOnMethods = "testSplitBiallelicsNoGenotypes") + @Test(enabled = !DEBUG, dataProvider = "SplitBiallelics", dependsOnMethods = "testSplitBiallelicsNoGenotypes") public void testSplitBiallelicsGenotypes(final VariantContext vc, final List expectedBiallelics) { final List genotypes = new ArrayList(); @@ -745,7 +747,6 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { } } - // -------------------------------------------------------------------------------- // // Test repeats @@ -810,14 +811,14 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { return RepeatDetectorTest.getTests(RepeatDetectorTest.class); } - @Test(dataProvider = "RepeatDetectorTest") + @Test(enabled = !DEBUG, dataProvider = "RepeatDetectorTest") public void testRepeatDetectorTest(RepeatDetectorTest cfg) { // test alleles are equal Assert.assertEquals(GATKVariantContextUtils.isTandemRepeat(cfg.vc, cfg.ref.getBytes()), cfg.isTrueRepeat); } - @Test + @Test(enabled = !DEBUG) public void testRepeatAllele() { Allele nullR = Allele.create("A", true); Allele nullA = Allele.create("A", false); @@ -940,7 +941,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "ForwardClippingData") + @Test(enabled = !DEBUG, dataProvider = "ForwardClippingData") public void testForwardClipping(final List alleleStrings, final int expectedClip) { final List alleles = new LinkedList(); for ( final String alleleString : alleleStrings ) @@ -975,7 +976,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "ClipAlleleTest") + @Test(enabled = !DEBUG, dataProvider = "ClipAlleleTest") public void testClipAlleles(final List alleleStrings, final List expected, final int numLeftClipped) { final int start = 10; final VariantContext unclipped = GATKVariantContextUtils.makeFromAlleles("test", "20", start, alleleStrings); @@ -1019,7 +1020,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "PrimitiveAlleleSplittingData") + @Test(enabled = !DEBUG, dataProvider = "PrimitiveAlleleSplittingData") public void testPrimitiveAlleleSplitting(final String ref, final String alt, final int expectedSplit, final List variantPositions) { final int start = 10; @@ -1066,7 +1067,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "AlleleRemappingData") + @Test(enabled = !DEBUG, dataProvider = "AlleleRemappingData") public void testAlleleRemapping(final Map alleleMap, final int numGenotypes) { final GATKVariantContextUtils.AlleleMapper alleleMapper = new GATKVariantContextUtils.AlleleMapper(alleleMap); @@ -1102,4 +1103,204 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { return gc; } + + // -------------------------------------------------------------------------------- + // + // Test subsetDiploidAlleles + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "subsetDiploidAllelesData") + public Object[][] makesubsetDiploidAllelesData() { + List tests = new ArrayList<>(); + + final Allele A = Allele.create("A", true); + final Allele C = Allele.create("C"); + final Allele G = Allele.create("G"); + + final List AA = Arrays.asList(A,A); + final List AC = Arrays.asList(A,C); + final List CC = Arrays.asList(C,C); + final List AG = Arrays.asList(A,G); + final List CG = Arrays.asList(C,G); + final List GG = Arrays.asList(G,G); + final List ACG = Arrays.asList(A,C,G); + + final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make(); + + final double[] homRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.09, 0.01}); + final double[] hetPL = MathUtils.normalizeFromRealSpace(new double[]{0.09, 0.9, 0.01}); + final double[] homVarPL = MathUtils.normalizeFromRealSpace(new double[]{0.01, 0.09, 0.9}); + final double[] uninformative = new double[]{0, 0, 0}; + + final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(50).make(); + + // make sure we don't screw up the simple case + final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); + final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); + final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make(); + + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), AC, Arrays.asList(new GenotypeBuilder(aaGT).noAD().make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), AC, Arrays.asList(new GenotypeBuilder(acGT).noAD().make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(ccGT).make(), AC, Arrays.asList(new GenotypeBuilder(ccGT).noAD().make())}); + + // uninformative test case + final Genotype uninformativeGT = new GenotypeBuilder(base).alleles(CC).noAD().PL(uninformative).GQ(0).make(); + final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noAD().noPL().noGQ().make(); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), AC, Arrays.asList(emptyGT)}); + + // actually subsetting down from multiple alt values + final double[] homRef3AllelesPL = new double[]{0, -10, -20, -30, -40, -50}; + final double[] hetRefC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50}; + final double[] homC3AllelesPL = new double[]{-20, -10, 0, -30, -40, -50}; + final double[] hetRefG3AllelesPL = new double[]{-20, -10, -30, 0, -40, -50}; + final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG + final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(homRef3AllelesPL).make()).make(), + AC, + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).noAD().GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(hetRefC3AllelesPL).make()).make(), + AC, + Arrays.asList(new GenotypeBuilder(base).alleles(AC).PL(new double[]{-10, 0, -20}).noAD().GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(homC3AllelesPL).make()).make(), + AC, + Arrays.asList(new GenotypeBuilder(base).alleles(CC).PL(new double[]{-20, -10, 0}).noAD().GQ(100).make())}); + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(hetRefG3AllelesPL).make()).make(), + AG, + Arrays.asList(new GenotypeBuilder(base).alleles(AG).PL(new double[]{-20, 0, -50}).noAD().GQ(200).make())}); + + // wow, scary -- bad output but discussed with Eric and we think this is the only thing that can be done + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(hetCG3AllelesPL).make()).make(), + AG, + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).noAD().GQ(200).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(homG3AllelesPL).make()).make(), + AG, + Arrays.asList(new GenotypeBuilder(base).alleles(GG).PL(new double[]{-20, -40, 0}).noAD().GQ(200).make())}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "subsetDiploidAllelesData") + public void testsubsetDiploidAllelesData(final VariantContext inputVC, + final List allelesToUse, + final List expectedGenotypes) { + final GenotypesContext actual = GATKVariantContextUtils.subsetDiploidAlleles(inputVC, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); + + Assert.assertEquals(actual.size(), expectedGenotypes.size()); + for ( final Genotype expected : expectedGenotypes ) { + final Genotype actualGT = actual.get(expected.getSampleName()); + Assert.assertNotNull(actualGT); + assertGenotypesAreEqual(actualGT, expected); + } + } + + @DataProvider(name = "UpdateGenotypeAfterSubsettingData") + public Object[][] makeUpdateGenotypeAfterSubsettingData() { + List tests = new ArrayList(); + + final Allele A = Allele.create("A", true); + final Allele C = Allele.create("C"); + final Allele G = Allele.create("G"); + + final List AA = Arrays.asList(A,A); + final List AC = Arrays.asList(A,C); + final List CC = Arrays.asList(C,C); + final List AG = Arrays.asList(A,G); + final List CG = Arrays.asList(C,G); + final List GG = Arrays.asList(G,G); + final List ACG = Arrays.asList(A,C,G); + final List> allSubsetAlleles = Arrays.asList(AC,AG,ACG); + + final double[] homRefPL = new double[]{0.9, 0.09, 0.01}; + final double[] hetPL = new double[]{0.09, 0.9, 0.01}; + final double[] homVarPL = new double[]{0.01, 0.09, 0.9}; + final double[] uninformative = new double[]{0.33, 0.33, 0.33}; + final List allPLs = Arrays.asList(homRefPL, hetPL, homVarPL, uninformative); + + for ( final List alleles : allSubsetAlleles ) { + for ( final double[] pls : allPLs ) { + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL, pls, AA, alleles, GATKVariantContextUtils.NO_CALL_ALLELES}); + } + } + + for ( final List originalGT : Arrays.asList(AA, AC, CC, AG, CG, GG) ) { + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homRefPL, originalGT, AC, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, hetPL, originalGT, AC, AC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homVarPL, originalGT, AC, CC}); +// tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, uninformative, AA, AC, GATKVariantContextUtils.NO_CALL_ALLELES}); + } + + for ( final double[] pls : allPLs ) { + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, AC, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, AC, AC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, AC, CC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, AC, AC}); + + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, AG, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, AG, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, AG, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, AG, AG}); + + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, ACG, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, ACG, AC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, ACG, CC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AG, ACG, AG}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, ACG, CG}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, GG, ACG, GG}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "UpdateGenotypeAfterSubsettingData") + public void testUpdateGenotypeAfterSubsetting(final GATKVariantContextUtils.GenotypeAssignmentMethod mode, + final double[] likelihoods, + final List originalGT, + final List allelesToUse, + final List expectedAlleles) { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + final double[] log10Likelhoods = MathUtils.normalizeFromLog10(likelihoods, true, false); + GATKVariantContextUtils.updateGenotypeAfterSubsetting(originalGT, gb, mode, log10Likelhoods, allelesToUse); + final Genotype g = gb.make(); + Assert.assertEquals(new HashSet<>(g.getAlleles()), new HashSet<>(expectedAlleles)); + } + + @Test(enabled = !DEBUG) + public void testSubsetToRef() { + final Map tests = new LinkedHashMap<>(); + + for ( final List alleles : Arrays.asList(Arrays.asList(Aref), Arrays.asList(C), Arrays.asList(Aref, C), Arrays.asList(Aref, C, C) ) ) { + for ( final String name : Arrays.asList("test1", "test2") ) { + final GenotypeBuilder builder = new GenotypeBuilder(name, alleles); + builder.DP(10); + builder.GQ(30); + builder.AD(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? new int[]{1, 2} : new int[]{1, 2, 3})); + builder.PL(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? new int[]{1,2} : new int[]{1,2,3})); + final List refs = Collections.nCopies(alleles.size(), Aref); + tests.put(builder.make(), builder.alleles(refs).noAD().noPL().make()); + } + } + + for ( final int n : Arrays.asList(1, 2, 3) ) { + for ( final List genotypes : Utils.makePermutations(new ArrayList<>(tests.keySet()), n, false) ) { + final VariantContext vc = new VariantContextBuilder("test", "20", 1, 1, Arrays.asList(Aref, C)).genotypes(genotypes).make(); + final GenotypesContext gc = GATKVariantContextUtils.subsetToRefOnly(vc, 2); + + Assert.assertEquals(gc.size(), genotypes.size()); + for ( int i = 0; i < genotypes.size(); i++ ) { +// logger.warn("Testing " + genotypes.get(i) + " => " + gc.get(i) + " " + tests.get(genotypes.get(i))); + assertGenotypesAreEqual(gc.get(i), tests.get(genotypes.get(i))); + } + } + } + } } \ No newline at end of file