From 0323caefc87b9f82ddaa0a114e7cd769971b5727 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 7 Jan 2014 15:50:21 -0500 Subject: [PATCH] Added some bug fixes to the gVCF merging code after finally getting some real data to play with. Still under construction, awaiting more test data from Valentin. --- .../variant/GATKVariantContextUtils.java | 38 +++++++++---------- .../GATKVariantContextUtilsUnitTest.java | 7 ++-- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index c36d7f888..7d4d66f7c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -33,6 +33,7 @@ import org.broad.tribble.TribbleException; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.vcf.VCFConstants; @@ -1064,7 +1065,6 @@ public class GATKVariantContextUtils { final Set inconsistentAttributes = new HashSet<>(); final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id - VariantContext longestVC = first; int depth = 0; final Map> annotationMap = new LinkedHashMap<>(); GenotypesContext genotypes = GenotypesContext.create(); @@ -1084,10 +1084,6 @@ public class GATKVariantContextUtils { if ( isSpanningEvent ) continue; - // keep track of the longest location that starts here - if ( VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC) ) - longestVC = vc; - // special case ID (just preserve it) if ( vc.hasID() ) rsIDs.add(vc.getID()); @@ -1105,15 +1101,15 @@ public class GATKVariantContextUtils { if ( depth > 0 ) attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); + // remove stale AC and AF based attributes + removeStaleAttributesAfterMerge(attributes); + final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(alleles) - .loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd()) + .chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(alleles, loc.getStart()) .genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to regenotype later - // remove stale AC and AF based attributes - removeStaleAttributesAfterMerge(builder); - return builder.make(); } @@ -1147,16 +1143,17 @@ public class GATKVariantContextUtils { } /** - * Remove the stale attributes from the merged VariantContext (builder) + * Remove the stale attributes from the merged set * - * @param builder the VC builder + * @param attributes the attribute map */ - private static void removeStaleAttributesAfterMerge(final VariantContextBuilder builder) { - builder.rmAttributes(Arrays.asList(VCFConstants.ALLELE_COUNT_KEY, - VCFConstants.ALLELE_FREQUENCY_KEY, - VCFConstants.ALLELE_NUMBER_KEY, - VCFConstants.MLE_ALLELE_COUNT_KEY, - VCFConstants.MLE_ALLELE_FREQUENCY_KEY)); + private static void removeStaleAttributesAfterMerge(final Map attributes) { + attributes.remove(VCFConstants.ALLELE_COUNT_KEY); + attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY); + attributes.remove(VCFConstants.ALLELE_NUMBER_KEY); + attributes.remove(VCFConstants.MLE_ALLELE_COUNT_KEY); + attributes.remove(VCFConstants.MLE_ALLELE_FREQUENCY_KEY); + attributes.remove(VCFConstants.END_KEY); } /** @@ -1544,7 +1541,7 @@ public class GATKVariantContextUtils { final String name = g.getSampleName(); if ( !mergedGenotypes.containsSample(name) ) { // we need to modify it even if it already contains all of the alleles because we need to purge the PLs out anyways - final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles); + final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart()); final int[] PLs = generatePLs(g, indexesOfRelevantAlleles); // note that we set the alleles to null here (as we expect it to be re-genotyped) final Genotype newG = new GenotypeBuilder(g).name(name).alleles(null).PL(PLs).noAD().noGQ().make(); @@ -1559,15 +1556,16 @@ public class GATKVariantContextUtils { * * @param remappedAlleles the list of alleles to evaluate * @param targetAlleles the target list of alleles + * @param position position to use for error messages * @return non-null array of ints representing indexes */ - protected static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles) { + protected static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles, final int position) { if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty"); if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty"); if ( !remappedAlleles.contains(NON_REF_SYMBOLIC_ALLELE) ) - throw new IllegalArgumentException("The list of input alleles must contain " + NON_REF_SYMBOLIC_ALLELE + " as an allele; please use the Haplotype Caller with gVCF output to generate appropriate records"); + throw new UserException("The list of input alleles must contain " + NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records"); final int indexOfGenericAlt = remappedAlleles.indexOf(NON_REF_SYMBOLIC_ALLELE); final int[] indexMapping = new int[targetAlleles.size()]; diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java index 23a24e180..6672e3264 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -29,6 +29,7 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.variant.variantcontext.*; import org.testng.Assert; import org.testng.annotations.BeforeSuite; @@ -1463,14 +1464,14 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { } } - @Test(expectedExceptions = IllegalArgumentException.class) + @Test(expectedExceptions = UserException.class) public void testGetIndexesOfRelevantAllelesWithNoALT() { final List alleles1 = new ArrayList<>(1); alleles1.add(Allele.create("A", true)); final List alleles2 = new ArrayList<>(1); alleles2.add(Allele.create("A", true)); - GATKVariantContextUtils.getIndexesOfRelevantAlleles(alleles1, alleles2); + GATKVariantContextUtils.getIndexesOfRelevantAlleles(alleles1, alleles2, -1); Assert.fail("We should have thrown an exception because the allele was not present"); } @@ -1502,7 +1503,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { if ( allelesIndex > 0 ) myAlleles.add(allAlleles.get(allelesIndex)); - final int[] indexes = GATKVariantContextUtils.getIndexesOfRelevantAlleles(myAlleles, allAlleles); + final int[] indexes = GATKVariantContextUtils.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1); Assert.assertEquals(indexes.length, allAlleles.size());