From c167fb5fdf0b9c11a119c8397f3af3e33c9cdddb Mon Sep 17 00:00:00 2001 From: Valentin Ruano-Rubio Date: Fri, 14 Feb 2014 16:34:13 -0500 Subject: [PATCH] Fixing GenotypesGVCF. Bug uncovered by some untrimmed alleles in the single sample pipeline output. Notice however does not fix the untrimmed alleles in general. Story: https://www.pivotaltracker.com/story/show/65481104 Changes: 1. Fixed the bug itself. 2. Fixed non-working tests (sliently skipped due to exception in dataProvider). --- .../GenotypeGVCFsIntegrationTest.java | 4 +- .../variant/GATKVariantContextUtils.java | 153 +++--- .../GATKVariantContextUtilsUnitTest.java | 459 +++++++++--------- 3 files changed, 322 insertions(+), 294 deletions(-) diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index fff353c60..741141118 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -65,7 +65,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-20,000,000", b37KGReference), 1, - Arrays.asList("ebda39d3343b34d21490a78284ed88e8")); + Arrays.asList("9c618890c03ee9cae1d269039fc29506")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -89,7 +89,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference), 1, - Arrays.asList("8eeda24a07f66d67b7639a31fda5c903")); + Arrays.asList("27f3e4700cf836c23a9af2dc1d1bbecb")); executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec); } diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index a6caea597..fb5564ab3 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -774,6 +774,9 @@ public class GATKVariantContextUtils { return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g; } + //TODO consider refactor variant-context merging code so that we share as much as possible between + //TODO simpleMerge and referenceConfidenceMerge + //TODO likely using a separate helper class or hierarchy. /** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with @@ -1051,31 +1054,45 @@ public class GATKVariantContextUtils { final String name = first.getSource(); // ref allele - final Allele refAllele = determineReferenceAlleleGiveReferenceBase(VCs, loc, refBase); + final Allele refAllele = determineReferenceAlleleGivenReferenceBase(VCs, loc, refBase); if ( refAllele == null ) return null; - // alt alleles - final AlleleMapper alleleMapper = determineAlternateAlleleMapping(VCs, refAllele, loc); - // the allele list will not include the symbolic allele, so add it if needed - if ( !removeNonRefSymbolicAllele ) - alleleMapper.map.put(NON_REF_SYMBOLIC_ALLELE, NON_REF_SYMBOLIC_ALLELE); - final List alleles = getAllelesListFromMapper(refAllele, alleleMapper); + // FinalAlleleSet contains the alleles of the new resulting VC. + // Using linked set in order to guaranteed an stable order: + final LinkedHashSet finalAlleleSet = new LinkedHashSet<>(10); + // Reference goes first: + finalAlleleSet.add(refAllele); final Map attributes = new LinkedHashMap<>(); final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id - int depth = 0; final Map> annotationMap = new LinkedHashMap<>(); - GenotypesContext genotypes = GenotypesContext.create(); + final GenotypesContext genotypes = GenotypesContext.create(); + final int variantContextCount = VCs.size(); + // In this list we hold the mapping of each variant context alleles. + final List>> vcAndNewAllelePairs = new ArrayList<>(variantContextCount); // cycle through and add info from the other VCs for ( final VariantContext vc : VCs ) { // if this context doesn't start at the current location then it must be a spanning event (deletion or ref block) final boolean isSpanningEvent = loc.getStart() != vc.getStart(); - final List remappedAlleles = isSpanningEvent ? replaceWithNoCalls(vc.getAlleles()) : alleleMapper.remap(vc.getAlleles()); - mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, alleles); + + vcAndNewAllelePairs.add(new Pair<>(vc,isSpanningEvent ? replaceWithNoCalls(vc.getAlleles()) + : remapAlleles(vc.getAlleles(), refAllele, finalAlleleSet))); + } + + // Add to the end if at all required in in the output. + if (!removeNonRefSymbolicAllele) finalAlleleSet.add(NON_REF_SYMBOLIC_ALLELE); + + final List allelesList = new ArrayList<>(finalAlleleSet); + + for ( final Pair> pair : vcAndNewAllelePairs ) { + final VariantContext vc = pair.getFirst(); + final List remappedAlleles = pair.getSecond(); + + mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList); // special case DP (add it up) for all events if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) @@ -1083,7 +1100,7 @@ public class GATKVariantContextUtils { else if ( vc.getNSamples() == 1 && vc.getGenotype(0).hasExtendedAttribute("MIN_DP") ) // handle the gVCF case from the HaplotypeCaller depth += vc.getGenotype(0).getAttributeAsInt("MIN_DP", 0); - if ( isSpanningEvent ) + if ( loc.getStart() != vc.getStart() ) continue; // special case ID (just preserve it) @@ -1108,8 +1125,8 @@ public class GATKVariantContextUtils { final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); - final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(alleles) - .chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(alleles, loc.getStart(), loc.getStart()) + final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(allelesList) + .chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(allelesList, loc.getStart(), loc.getStart()) .genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to regenotype later return builder.make(); @@ -1123,27 +1140,13 @@ public class GATKVariantContextUtils { * @param refBase the reference allele to use if all contexts in the VC are spanning * @return new Allele or null if no reference allele/base is available */ - private static Allele determineReferenceAlleleGiveReferenceBase(final List VCs, final GenomeLoc loc, final Byte refBase) { + private static Allele determineReferenceAlleleGivenReferenceBase(final List VCs, final GenomeLoc loc, final Byte refBase) { final Allele refAllele = determineReferenceAllele(VCs, loc); if ( refAllele == null ) return ( refBase == null ? null : Allele.create(refBase, true) ); return refAllele; } - /** - * Creates an alleles list given a reference allele and a mapper - * - * @param refAllele the reference allele - * @param alleleMapper the allele mapper - * @return a non-null, non-empty list of Alleles - */ - private static List getAllelesListFromMapper(final Allele refAllele, final AlleleMapper alleleMapper) { - final List alleles = new ArrayList<>(); - alleles.add(refAllele); - alleles.addAll(alleleMapper.getUniqueMappedAlleles()); - return alleles; - } - /** * Remove the stale attributes from the merged set * @@ -1157,7 +1160,6 @@ public class GATKVariantContextUtils { attributes.remove(VCFConstants.MLE_ALLELE_FREQUENCY_KEY); attributes.remove(VCFConstants.END_KEY); } - /** * Adds attributes to the global map from the new context in a sophisticated manner * @@ -1207,6 +1209,54 @@ public class GATKVariantContextUtils { return it1.hasNext() || it2.hasNext(); } + //TODO as part of a larger refactoring effort remapAlleles can be merged with createAlleleMapping. + /** + * This method does a couple of things: + *
  • + * remaps the vc alleles considering the differences between the final reference allele and its own reference,
  • + *
  • + * collects alternative alleles present in variant context and add them to the {@code finalAlleles} set. + *
+ * + * @param vcAlleles the variant context allele list. + * @param refAllele final reference allele. + * @param finalAlleles where to add the final set of non-ref called alleles. + * @return never {@code null} + */ + private static List remapAlleles(final List vcAlleles, final Allele refAllele, final LinkedHashSet finalAlleles) { + final Allele vcRef = vcAlleles.get(0); + if (!vcRef.isReference()) throw new IllegalStateException("the first allele of the vc allele list must be reference"); + final byte[] refBases = refAllele.getBases(); + final int extraBaseCount = refBases.length - vcRef.getBases().length; + if (extraBaseCount < 0) throw new IllegalStateException("the wrong reference was selected"); + final List result = new ArrayList<>(vcAlleles.size()); + + for (final Allele a : vcAlleles) { + if (a.isReference()) { + result.add(refAllele); + } else if (a.isSymbolic()) { + result.add(a); + // we always skip when adding to finalAlleles this is done outside if applies. + if (!a.equals(NON_REF_SYMBOLIC_ALLELE)) + finalAlleles.add(a); + } else if (a.isCalled()) { + final Allele newAllele; + if (extraBaseCount > 0) { + final byte[] oldBases = a.getBases(); + final byte[] newBases = Arrays.copyOf(oldBases,oldBases.length + extraBaseCount); + System.arraycopy(refBases,refBases.length - extraBaseCount,newBases,oldBases.length,extraBaseCount); + newAllele = Allele.create(newBases,false); + } else + newAllele = a; + result.add(newAllele); + finalAlleles.add(newAllele); + } else { // NO_CALL and strange miscellanea + result.add(a); + } + } + return result; + } + public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) { final GenotypesContext newGs = GenotypesContext.create(genotypes.size()); @@ -1346,50 +1396,10 @@ public class GATKVariantContextUtils { return ref; } - static private boolean contextMatchesLoc(final VariantContext vc, final GenomeLoc loc) { + public static boolean contextMatchesLoc(final VariantContext vc, final GenomeLoc loc) { return loc == null || loc.getStart() == vc.getStart(); } - /** - * Given the reference allele, determines the mapping for common alternate alleles in the list of VariantContexts. - * - * @param VCs the list of VariantContexts - * @param refAllele the reference allele - * @param loc if not null, ignore records that do not begin at this start location - * @return non-null AlleleMapper - */ - static private AlleleMapper determineAlternateAlleleMapping(final List VCs, final Allele refAllele, final GenomeLoc loc) { - final Map map = new HashMap<>(); - - for ( final VariantContext vc : VCs ) { - if ( contextMatchesLoc(vc, loc) ) - addAllAlternateAllelesToMap(vc, refAllele, map); - } - - return new AlleleMapper(map); - } - - /** - * Adds all of the alternate alleles from the VariantContext to the allele mapping (for use in creating the AlleleMapper) - * - * @param vc the VariantContext - * @param refAllele the reference allele - * @param map the allele mapping to populate - */ - static private void addAllAlternateAllelesToMap(final VariantContext vc, final Allele refAllele, final Map map) { - // if the ref allele matches, then just add the alts as is - if ( refAllele.equals(vc.getReference()) ) { - for ( final Allele altAllele : vc.getAlternateAlleles() ) { - // ignore symbolic alleles - if ( ! altAllele.isSymbolic() ) - map.put(altAllele, altAllele); - } - } - else { - map.putAll(createAlleleMapping(refAllele, vc, map.values())); - } - } - static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc, final Set allAlleles) { if ( refAllele.equals(vc.getReference()) ) return new AlleleMapper(vc); @@ -1977,7 +1987,6 @@ public class GATKVariantContextUtils { return new VariantContextBuilder(vc).genotypes(newGenotypes).make(); } - protected static class AlleleMapper { private VariantContext vc = null; private Map map = null; diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java index 953f72796..efc701a6d 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -42,6 +42,10 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { private final static boolean DEBUG = false; Allele Aref, T, C, G, Cref, ATC, ATCATC; + Allele ATCATCT; + Allele ATref; + Allele Anoref; + Allele GT; @BeforeSuite public void setup() { @@ -53,6 +57,10 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { G = Allele.create("G"); ATC = Allele.create("ATC"); ATCATC = Allele.create("ATCATC"); + ATCATCT = Allele.create("ATCATCT"); + ATref = Allele.create("AT",true); + Anoref = Allele.create("A",false); + GT = Allele.create("GT",false); } private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, int... pls) { @@ -86,7 +94,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { private VariantContext makeVC(String source, List alleles, Collection genotypes, Set filters) { int start = 10; - int stop = start; // alleles.contains(ATC) ? start + 3 : start; + int stop = start + alleles.get(0).length() - 1; // alleles.contains(ATC) ? start + 3 : start; return new VariantContextBuilder(source, "1", start, stop, alleles).genotypes(genotypes).filters(filters).make(); } @@ -138,6 +146,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { Arrays.asList(Aref, C, T)); new MergeAllelesTest(Arrays.asList(Aref, C, T), Arrays.asList(Aref, C, T)); + new MergeAllelesTest(Arrays.asList(Aref, T, C), Arrays.asList(Aref, T, C)); new MergeAllelesTest(Arrays.asList(Aref, T, C), @@ -162,6 +171,10 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { Arrays.asList(Aref, ATCATC), Arrays.asList(Aref, ATC, ATCATC)); + new MergeAllelesTest(Arrays.asList(ATref, ATC, Anoref, G), + Arrays.asList(Aref, ATCATC, G), + Arrays.asList(ATref, ATC, Anoref, G, ATCATCT, GT)); + return MergeAllelesTest.getTests(MergeAllelesTest.class); } @@ -182,6 +195,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false); + Assert.assertEquals(merged.getAlleles().size(),cfg.expected.size()); Assert.assertEquals(merged.getAlleles(), cfg.expected); } @@ -1413,224 +1427,6 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { // // -------------------------------------------------------------------------------- - @DataProvider(name = "generatePLsData") - public Object[][] makeGeneratePLsData() { - final List tests = new ArrayList<>(); - - for ( int originalAlleles = 2; originalAlleles <= 5; originalAlleles++ ) { - for ( int swapPosition1 = 0; swapPosition1 < originalAlleles; swapPosition1++ ) { - for ( int swapPosition2 = swapPosition1+1; swapPosition2 < originalAlleles; swapPosition2++ ) { - final int[] indexes = new int[originalAlleles]; - for ( int i = 0; i < originalAlleles; i++ ) - indexes[i] = i; - indexes[swapPosition1] = swapPosition2; - indexes[swapPosition2] = swapPosition1; - tests.add(new Object[]{originalAlleles, indexes}); - } - } - } - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "generatePLsData") - public void testGeneratePLs(final int numOriginalAlleles, final int[] indexOrdering) { - - final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(numOriginalAlleles, 2); - final int[] PLs = new int[numLikelihoods]; - for ( int i = 0; i < numLikelihoods; i++ ) - PLs[i] = i; - - final List alleles = new ArrayList<>(numOriginalAlleles); - alleles.add(Allele.create("A", true)); - for ( int i = 1; i < numOriginalAlleles; i++ ) - alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); - final Genotype genotype = new GenotypeBuilder("foo", alleles).PL(PLs).make(); - - final int[] newPLs = GATKVariantContextUtils.generatePLs(genotype, indexOrdering); - - Assert.assertEquals(newPLs.length, numLikelihoods); - - final int[] expectedPLs = new int[numLikelihoods]; - for ( int i = 0; i < numOriginalAlleles; i++ ) { - for ( int j = i; j < numOriginalAlleles; j++ ) { - final int index = GenotypeLikelihoods.calculatePLindex(i, j); - final int value = GATKVariantContextUtils.calculatePLindexFromUnorderedIndexes(indexOrdering[i], indexOrdering[j]); - expectedPLs[index] = value; - } - } - - for ( int i = 0; i < numLikelihoods; i++ ) { - Assert.assertEquals(newPLs[i], expectedPLs[i]); - } - } - - @Test(expectedExceptions = UserException.class) - public void testGetIndexesOfRelevantAllelesWithNoALT() { - - final List alleles1 = new ArrayList<>(1); - alleles1.add(Allele.create("A", true)); - final List alleles2 = new ArrayList<>(1); - alleles2.add(Allele.create("A", true)); - GATKVariantContextUtils.getIndexesOfRelevantAlleles(alleles1, alleles2, -1); - Assert.fail("We should have thrown an exception because the allele was not present"); - } - - @DataProvider(name = "getIndexesOfRelevantAllelesData") - public Object[][] makeGetIndexesOfRelevantAllelesData() { - final int totalAlleles = 5; - final List alleles = new ArrayList<>(totalAlleles); - alleles.add(Allele.create("A", true)); - for ( int i = 1; i < totalAlleles; i++ ) - alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); - - final List tests = new ArrayList<>(); - - for ( int alleleIndex = 0; alleleIndex < totalAlleles; alleleIndex++ ) { - tests.add(new Object[]{alleleIndex, alleles}); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "getIndexesOfRelevantAllelesData") - public void testGetIndexesOfRelevantAlleles(final int allelesIndex, final List allAlleles) { - final List myAlleles = new ArrayList<>(3); - - // always add the reference and alleles - myAlleles.add(allAlleles.get(0)); - myAlleles.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - // optionally add another alternate allele - if ( allelesIndex > 0 ) - myAlleles.add(allAlleles.get(allelesIndex)); - - final int[] indexes = GATKVariantContextUtils.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1); - - Assert.assertEquals(indexes.length, allAlleles.size()); - - for ( int i = 0; i < allAlleles.size(); i++ ) { - if ( i == 0 ) - Assert.assertEquals(indexes[i], 0); // ref should always match - else if ( i == allelesIndex ) - Assert.assertEquals(indexes[i], 2); // allele - else - Assert.assertEquals(indexes[i], 1); // - } - } - - @DataProvider(name = "referenceConfidenceMergeData") - public Object[][] makeReferenceConfidenceMergeData() { - final List tests = new ArrayList<>(); - - final int start = 10; - final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start); - final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make(); - final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make(); - - final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73}; - final int[] reorderedSecondAllelePLs = new int[]{30, 71, 73, 20, 72, 10}; - - final List noCalls = new ArrayList<>(2); - noCalls.add(Allele.NO_CALL); - noCalls.add(Allele.NO_CALL); - - final List A_ALT = Arrays.asList(Aref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).alleles(noCalls).make(); - final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make(); - final Allele AAref = Allele.create("AA", true); - final List AA_ALT = Arrays.asList(AAref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).alleles(noCalls).make(); - final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make(); - final List A_C = Arrays.asList(Aref, C); - final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10}).alleles(noCalls).make(); - final List A_C_ALT = Arrays.asList(Aref, C, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).alleles(noCalls).make(); - final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make(); - final List A_G_ALT = Arrays.asList(Aref, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).alleles(noCalls).make(); - final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make(); - final List A_C_G = Arrays.asList(Aref, C, G); - final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make(); - final List A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).alleles(noCalls).make(); - final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make(); - final List A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).alleles(noCalls).make(); - final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make(); - final Allele A = Allele.create("A", false); - final List AA_A_ALT = Arrays.asList(AAref, A, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make(); - final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make(); - - // first test the case of a single record - tests.add(new Object[]{Arrays.asList(vcA_C_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).alleles(noCalls).make()}); - - // now, test pairs: - // a SNP with another SNP - tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_G_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); - // a SNP with an indel - tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_ATC_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); - // a SNP with 2 SNPs - tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_C_G_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()}); - // a SNP with a ref record - tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()}); - - // spanning records: - // a SNP with a spanning ref record - tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcAA_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()}); - // a SNP with a spanning deletion - tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcAA_A_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).alleles(noCalls).make()).make()}); - - // combination of all - tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC, G)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).make(), - new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).make(), - new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).make(), - new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 71, 72, 74, 20, 10, 73, 30}).make(), - new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000}).make(), - new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).make(), - new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).make()).make()}); - - // just spanning ref contexts, trying both instances where we want/do not want ref-only contexts - tests.add(new Object[]{Arrays.asList(vcAA_ALT), - loc, false, - null}); - tests.add(new Object[]{Arrays.asList(vcAA_ALT), - loc, true, - new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).make()).make()}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "referenceConfidenceMergeData") - public void testReferenceConfidenceMerge(final List toMerge, final GenomeLoc loc, final boolean returnSiteEvenIfMonomorphic, final VariantContext expectedResult) { - final VariantContext result = GATKVariantContextUtils.referenceConfidenceMerge(toMerge, loc, returnSiteEvenIfMonomorphic ? (byte)'A' : null, true); - if ( result == null ) { - Assert.assertTrue(expectedResult == null); - return; - } - Assert.assertEquals(result.getAlleles(), expectedResult.getAlleles()); - Assert.assertEquals(result.getNSamples(), expectedResult.getNSamples()); - for ( final Genotype expectedGenotype : expectedResult.getGenotypes() ) { - Assert.assertTrue(result.hasGenotype(expectedGenotype.getSampleName()), "Missing " + expectedGenotype.getSampleName()); - // use string comparisons to test equality for now - Assert.assertEquals(result.getGenotype(expectedGenotype.getSampleName()).toString(), expectedGenotype.toString()); - } - } @Test(dataProvider = "indexOfAlleleData") public void testIndexOfAllele(final Allele reference, final List altAlleles, final List otherAlleles) { @@ -1709,14 +1505,237 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { }; } + + @Test(dataProvider = "generatePLsData") + public void testGeneratePLs(final int numOriginalAlleles, final int[] indexOrdering) { + + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(numOriginalAlleles, 2); + final int[] PLs = new int[numLikelihoods]; + for ( int i = 0; i < numLikelihoods; i++ ) + PLs[i] = i; + + final List alleles = new ArrayList<>(numOriginalAlleles); + alleles.add(Allele.create("A", true)); + for ( int i = 1; i < numOriginalAlleles; i++ ) + alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); + final Genotype genotype = new GenotypeBuilder("foo", alleles).PL(PLs).make(); + + final int[] newPLs = GATKVariantContextUtils.generatePLs(genotype, indexOrdering); + + Assert.assertEquals(newPLs.length, numLikelihoods); + + final int[] expectedPLs = new int[numLikelihoods]; + for ( int i = 0; i < numOriginalAlleles; i++ ) { + for ( int j = i; j < numOriginalAlleles; j++ ) { + final int index = GenotypeLikelihoods.calculatePLindex(i, j); + final int value = GATKVariantContextUtils.calculatePLindexFromUnorderedIndexes(indexOrdering[i], indexOrdering[j]); + expectedPLs[index] = value; + } + } + + for ( int i = 0; i < numLikelihoods; i++ ) { + Assert.assertEquals(newPLs[i], expectedPLs[i]); + } + } + + @Test(dataProvider = "referenceConfidenceMergeData") + public void testReferenceConfidenceMerge(final String testID, final List toMerge, final GenomeLoc loc, final boolean returnSiteEvenIfMonomorphic, final VariantContext expectedResult) { + final VariantContext result = GATKVariantContextUtils.referenceConfidenceMerge(toMerge, loc, returnSiteEvenIfMonomorphic ? (byte) 'A' : null, true); + if ( result == null ) { + Assert.assertTrue(expectedResult == null); + return; + } + Assert.assertEquals(result.getAlleles(), expectedResult.getAlleles(),testID); + Assert.assertEquals(result.getNSamples(), expectedResult.getNSamples(),testID); + for ( final Genotype expectedGenotype : expectedResult.getGenotypes() ) { + Assert.assertTrue(result.hasGenotype(expectedGenotype.getSampleName()), "Missing " + expectedGenotype.getSampleName()); + // use string comparisons to test equality for now + Assert.assertEquals(result.getGenotype(expectedGenotype.getSampleName()).toString(), expectedGenotype.toString()); + } + } + @Test public void testGenerateADWithNewAlleles() { final int[] originalAD = new int[] {1,2,0}; final int[] indexesOfRelevantAlleles = new int[] {0,1,2,2}; - + final int[] newAD = GATKVariantContextUtils.generateAD(originalAD, indexesOfRelevantAlleles); Assert.assertEquals(newAD, new int[]{1,2,0,0}); } + + + @Test(expectedExceptions = UserException.class) + public void testGetIndexesOfRelevantAllelesWithNoALT() { + + final List alleles1 = new ArrayList<>(1); + alleles1.add(Allele.create("A", true)); + final List alleles2 = new ArrayList<>(1); + alleles2.add(Allele.create("A", true)); + GATKVariantContextUtils.getIndexesOfRelevantAlleles(alleles1, alleles2, -1); + Assert.fail("We should have thrown an exception because the allele was not present"); + } + + @Test(dataProvider = "getIndexesOfRelevantAllelesData") + public void testGetIndexesOfRelevantAlleles(final int allelesIndex, final List allAlleles) { + final List myAlleles = new ArrayList<>(3); + + // always add the reference and alleles + myAlleles.add(allAlleles.get(0)); + myAlleles.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + // optionally add another alternate allele + if ( allelesIndex > 0 ) + myAlleles.add(allAlleles.get(allelesIndex)); + + final int[] indexes = GATKVariantContextUtils.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1); + + Assert.assertEquals(indexes.length, allAlleles.size()); + + for ( int i = 0; i < allAlleles.size(); i++ ) { + if ( i == 0 ) + Assert.assertEquals(indexes[i], 0); // ref should always match + else if ( i == allelesIndex ) + Assert.assertEquals(indexes[i], 2); // allele + else + Assert.assertEquals(indexes[i], 1); // + } + } + + + @DataProvider(name = "getIndexesOfRelevantAllelesData") + public Object[][] makeGetIndexesOfRelevantAllelesData() { + final int totalAlleles = 5; + final List alleles = new ArrayList<>(totalAlleles); + alleles.add(Allele.create("A", true)); + for ( int i = 1; i < totalAlleles; i++ ) + alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); + + final List tests = new ArrayList<>(); + + for ( int alleleIndex = 0; alleleIndex < totalAlleles; alleleIndex++ ) { + tests.add(new Object[]{alleleIndex, alleles}); + } + + return tests.toArray(new Object[][]{}); + } + + @DataProvider(name = "referenceConfidenceMergeData") + public Object[][] makeReferenceConfidenceMergeData() { + final List tests = new ArrayList<>(); + final int start = 10; + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start); + final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make(); + final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make(); + + final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73}; + final int[] reorderedSecondAllelePLs = new int[]{30, 71, 73, 20, 72, 10}; + + final List noCalls = new ArrayList<>(2); + noCalls.add(Allele.NO_CALL); + noCalls.add(Allele.NO_CALL); + + final List A_ALT = Arrays.asList(Aref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).alleles(noCalls).make(); + final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make(); + final Allele AAref = Allele.create("AA", true); + final List AA_ALT = Arrays.asList(AAref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).alleles(noCalls).make(); + final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make(); + final List A_C = Arrays.asList(Aref, C); + final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10}).alleles(noCalls).make(); + final List A_C_ALT = Arrays.asList(Aref, C, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make(); + final List A_G_ALT = Arrays.asList(Aref, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make(); + final List A_C_G = Arrays.asList(Aref, C, G); + final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make(); + final List A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).alleles(noCalls).make(); + final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make(); + final List A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make(); + final Allele A = Allele.create("A", false); + final List AA_A_ALT = Arrays.asList(AAref, A, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make(); + + // first test the case of a single record + tests.add(new Object[]{"test00",Arrays.asList(vcA_C_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make()}); + + // now, test pairs: + // a SNP with another SNP + tests.add(new Object[]{"test01",Arrays.asList(vcA_C_ALT, vcA_G_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); + // a SNP with an indel + tests.add(new Object[]{"test02",Arrays.asList(vcA_C_ALT, vcA_ATC_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); + // a SNP with 2 SNPs + tests.add(new Object[]{"test03",Arrays.asList(vcA_C_ALT, vcA_C_G_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()}); + // a SNP with a ref record + tests.add(new Object[]{"test04",Arrays.asList(vcA_C_ALT, vcA_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()}); + + // spanning records: + // a SNP with a spanning ref record + tests.add(new Object[]{"test05",Arrays.asList(vcA_C_ALT, vcAA_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()}); + // a SNP with a spanning deletion + tests.add(new Object[]{"test06",Arrays.asList(vcA_C_ALT, vcAA_A_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).alleles(noCalls).make()).make()}); + + // combination of all + tests.add(new Object[]{"test07",Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).alleles(noCalls).make(), + new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).alleles(noCalls).make(), + new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).alleles(noCalls).make(), + new GenotypeBuilder("A_C_G").PL(new int[]{40,20,30,20,10,30,71,72,73,74}).alleles(noCalls).make(), + new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000}).alleles(noCalls).make(), + new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).alleles(noCalls).make(), + new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).alleles(noCalls).make()).make()}); + + // just spanning ref contexts, trying both instances where we want/do not want ref-only contexts + tests.add(new Object[]{"test08",Arrays.asList(vcAA_ALT), + + loc, false, + null}); + tests.add(new Object[]{"test09", Arrays.asList(vcAA_ALT), + loc, true, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).alleles(noCalls).make()).make()}); + + final Object[][] result = tests.toArray(new Object[][]{}); + return result; + } + + @DataProvider(name = "generatePLsData") + public Object[][] makeGeneratePLsData() { + final List tests = new ArrayList<>(); + + for ( int originalAlleles = 2; originalAlleles <= 5; originalAlleles++ ) { + for ( int swapPosition1 = 0; swapPosition1 < originalAlleles; swapPosition1++ ) { + for ( int swapPosition2 = swapPosition1+1; swapPosition2 < originalAlleles; swapPosition2++ ) { + final int[] indexes = new int[originalAlleles]; + for ( int i = 0; i < originalAlleles; i++ ) + indexes[i] = i; + indexes[swapPosition1] = swapPosition2; + indexes[swapPosition2] = swapPosition1; + tests.add(new Object[]{originalAlleles, indexes}); + } + } + } + return tests.toArray(new Object[][]{}); + } }