From 1f23d99dfa83d37362d899d4a5bc1cc9d10fa846 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 20 Apr 2012 17:00:05 -0400 Subject: [PATCH] If we are subsetting alleles in the UG (either because there were too many or because some were not polymorphic), then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). Thanks to Ryan for reporting this. Only one of the integration tests had even partially covered this case, so I added one that did. --- .../genotyper/UnifiedGenotyperEngine.java | 5 ++ .../utils/codecs/vcf/AbstractVCFCodec.java | 25 +++++---- .../variantcontext/VariantContextUtils.java | 53 ++++++++++++++++--- .../UnifiedGenotyperIntegrationTest.java | 10 +++- .../utils/codecs/vcf/VCFCodecUnitTest.java | 2 +- 5 files changed, 76 insertions(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 94d340926..caa3a6b6b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -417,6 +417,11 @@ public class UnifiedGenotyperEngine { builder.attributes(attributes); VariantContext vcCall = builder.make(); + // if we are subsetting alleles (either because there were too many or because some were not polymorphic) + // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). + if ( myAlleles.size() != vc.getAlleles().size() ) + vcCall = VariantContextUtils.reverseTrimAlleles(vcCall); + if ( annotationEngine != null && !limitedContext && rawContext.hasBasePileup() ) { // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations final ReadBackedPileup pileup = rawContext.getBasePileup(); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index a1127e35d..c2cbf23fb 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -617,10 +617,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { return true; } - public static int computeForwardClipping(List unclippedAlleles, String ref) { + public static int computeForwardClipping(final List unclippedAlleles, final byte ref0) { boolean clipping = true; int symbolicAlleleCount = 0; - final byte ref0 = (byte)ref.charAt(0); for ( Allele a : unclippedAlleles ) { if ( a.isSymbolic() ) { @@ -638,7 +637,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 1 : 0; } - protected static int computeReverseClipping(List unclippedAlleles, String ref, int forwardClipping, int lineNo) { + public static int computeReverseClipping(final List unclippedAlleles, final byte[] ref, final int forwardClipping, final boolean allowFullClip, final int lineNo) { int clipping = 0; boolean stillClipping = true; @@ -650,14 +649,20 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). if ( a.length() - clipping == 0 ) - return clipping - 1; + return clipping - (allowFullClip ? 0 : 1); - if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) + if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) { stillClipping = false; - else if ( ref.length() == clipping ) - generateException("bad alleles encountered", lineNo); - else if ( a.getBases()[a.length()-clipping-1] != ((byte)ref.charAt(ref.length()-clipping-1)) ) + } + else if ( ref.length == clipping ) { + if ( allowFullClip ) + stillClipping = false; + else + generateException("bad alleles encountered", lineNo); + } + else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) { stillClipping = false; + } } if ( stillClipping ) clipping++; @@ -678,8 +683,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { */ protected static int clipAlleles(int position, String ref, List unclippedAlleles, List clippedAlleles, int lineNo) { - int forwardClipping = computeForwardClipping(unclippedAlleles, ref); - int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo); + int forwardClipping = computeForwardClipping(unclippedAlleles, (byte)ref.charAt(0)); + int reverseClipping = computeReverseClipping(unclippedAlleles, ref.getBytes(), forwardClipping, false, lineNo); if ( clippedAlleles != null ) { for ( Allele a : unclippedAlleles ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index cbaf705b4..de5deef57 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -612,7 +612,7 @@ public class VariantContextUtils { continue; if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) { if ( ! genotypes.isEmpty() ) - logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", + logger.debug(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); genotypes = stripPLs(genotypes); // this will remove stale AC,AF attributed from vc @@ -714,8 +714,7 @@ public class VariantContextUtils { else if (refAllele.isNull()) trimVC = false; else { - trimVC = (AbstractVCFCodec.computeForwardClipping(new ArrayList(inputVC.getAlternateAlleles()), - inputVC.getReference().getDisplayString()) > 0); + trimVC = (AbstractVCFCodec.computeForwardClipping(inputVC.getAlternateAlleles(), (byte)inputVC.getReference().getDisplayString().charAt(0)) > 0); } // nothing to do if we don't need to trim bases @@ -723,9 +722,6 @@ public class VariantContextUtils { List alleles = new ArrayList(); GenotypesContext genotypes = GenotypesContext.create(); - // set the reference base for indels in the attributes - Map attributes = new TreeMap(inputVC.getAttributes()); - Map originalToTrimmedAlleleMap = new HashMap(); for (final Allele a : inputVC.getAlleles()) { @@ -768,12 +764,55 @@ public class VariantContextUtils { } final VariantContextBuilder builder = new VariantContextBuilder(inputVC); - return builder.alleles(alleles).genotypes(genotypes).attributes(attributes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make(); + return builder.alleles(alleles).genotypes(genotypes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make(); } return inputVC; } + public static VariantContext reverseTrimAlleles(VariantContext inputVC) { + // see if we need to trim common reference base from all alleles + + final int trimExtent = AbstractVCFCodec.computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, true, -1); + if ( trimExtent <= 0 ) + return inputVC; + + final List alleles = new ArrayList(); + GenotypesContext genotypes = GenotypesContext.create(); + + Map originalToTrimmedAlleleMap = new HashMap(); + + for (final Allele a : inputVC.getAlleles()) { + if (a.isSymbolic()) { + alleles.add(a); + originalToTrimmedAlleleMap.put(a, a); + } else { + // get bases for current allele and create a new one with trimmed bases + byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, a.length()-trimExtent); + Allele trimmedAllele = Allele.create(newBases, a.isReference()); + alleles.add(trimmedAllele); + originalToTrimmedAlleleMap.put(a, trimmedAllele); + } + } + + // now we can recreate new genotypes with trimmed alleles + for ( final Genotype genotype : inputVC.getGenotypes() ) { + + List originalAlleles = genotype.getAlleles(); + List trimmedAlleles = new ArrayList(); + for ( final Allele a : originalAlleles ) { + if ( a.isCalled() ) + trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); + else + trimmedAlleles.add(Allele.NO_CALL); + } + genotypes.add(Genotype.modifyAlleles(genotype, trimmedAlleles)); + } + + final VariantContextBuilder builder = new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length()); + return builder.alleles(alleles).genotypes(genotypes).make(); + } + public static GenotypesContext stripPLs(GenotypesContext genotypes) { GenotypesContext newGs = GenotypesContext.create(genotypes.size()); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 067e9088c..e95284190 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, - Arrays.asList("e948543b83bfd0640fcb994d72f8e234")); + Arrays.asList("ec907c65da5ed9b6046404b0f81422d4")); executeTest("test Multiple SNP alleles", spec); } @@ -74,6 +74,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test bad read", spec); } + @Test + public void testReverseTrim() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, + Arrays.asList("a70593bbb5042e2d0e46e3c932cae170")); + executeTest("test reverse trim", spec); + } + // -------------------------------------------------------------------------------------------------------------- // // testing compressed output diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java index 7681ed7d1..e0fb1b876 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java @@ -85,7 +85,7 @@ public class VCFCodecUnitTest extends BaseTest { @Test(dataProvider = "AlleleClippingTestProvider") public void TestAlleleClipping(AlleleClippingTestProvider cfg) { - int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref, 0, 1); + int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false, 1); Assert.assertEquals(result, cfg.expectedClip); } } \ No newline at end of file