From f1c5510ec09565dbeb4493550ec5220433611c99 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 20 Apr 2012 14:30:04 -0400 Subject: [PATCH 1/3] When running SelectVariants with the excludeNonVariants option, remove alleles from the ALT field that are no longer polymorphic. --- .../walkers/variantutils/SelectVariants.java | 26 +++++++++++-------- .../SelectVariantsIntegrationTest.java | 12 +++++++++ 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 204851e1f..42a40cde5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -189,7 +189,7 @@ public class SelectVariants extends RodWalker implements TreeR * or the sample is called reference in this track. */ @Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this comparison track", required=false) - private RodBinding discordanceTrack; + protected RodBinding discordanceTrack; /** * A site is considered concordant if (1) we are not looking for specific samples and there is a variant called @@ -197,7 +197,7 @@ public class SelectVariants extends RodWalker implements TreeR * concordance track and they have the sample genotype call. */ @Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this comparison track", required=false) - private RodBinding concordanceTrack; + protected RodBinding concordanceTrack; @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; @@ -230,10 +230,10 @@ public class SelectVariants extends RodWalker implements TreeR public ArrayList SELECT_EXPRESSIONS = new ArrayList(); @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false) - private boolean EXCLUDE_NON_VARIANTS = false; + protected boolean EXCLUDE_NON_VARIANTS = false; @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false) - private boolean EXCLUDE_FILTERED = false; + protected boolean EXCLUDE_FILTERED = false; /** @@ -257,23 +257,23 @@ public class SelectVariants extends RodWalker implements TreeR private Boolean MENDELIAN_VIOLATIONS = false; @Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) - private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; + protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; /** * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory * given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants. */ @Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false) - private int numRandom = 0; + protected int numRandom = 0; /** * This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions. */ @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false) - private double fractionRandom = 0; + protected double fractionRandom = 0; @Argument(fullName="remove_fraction_genotypes", shortName="fractionGenotypes", doc="Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall", required=false) - private double fractionGenotypes = 0; + protected double fractionGenotypes = 0; /** * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria. @@ -508,7 +508,7 @@ public class SelectVariants extends RodWalker implements TreeR if (!selectedTypes.contains(vc.getType())) continue; - VariantContext sub = subsetRecord(vc, samples); + VariantContext sub = subsetRecord(vc, samples, EXCLUDE_NON_VARIANTS); if ( (sub.isPolymorphicInSamples() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) { boolean failedJexlMatch = false; for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) { @@ -645,11 +645,15 @@ public class SelectVariants extends RodWalker implements TreeR * @param samples the samples to extract * @return the subsetted VariantContext */ - private VariantContext subsetRecord(VariantContext vc, Set samples) { + private VariantContext subsetRecord(final VariantContext vc, final Set samples, final boolean excludeNonVariants) { if ( samples == null || samples.isEmpty() ) return vc; - final VariantContext sub = vc.subContextFromSamples(samples, vc.getAlleles()); + final VariantContext sub; + if ( excludeNonVariants ) + sub = vc.subContextFromSamples(samples); // strip out the alternate alleles that aren't being used + else + sub = vc.subContextFromSamples(samples, vc.getAlleles()); VariantContextBuilder builder = new VariantContextBuilder(sub); GenotypesContext newGC = sub.getGenotypes(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 900e3d489..973588cf0 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -163,4 +163,16 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testParallelization (4 threads)--" + testfile, spec); } + + @Test + public void testSelectFromMultiAllelic() { + String testfile = validationDataLocation + "multi-allelic.bi-allelicInGIH.vcf"; + String samplesFile = validationDataLocation + "GIH.samples.list"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b37KGReference + " -o %s -NO_HEADER -sf " + samplesFile + " --excludeNonVariants --variant " + testfile, + 1, + Arrays.asList("3fb50cc1c955491048108956d7087c35") + ); + executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec); + } } From 1f23d99dfa83d37362d899d4a5bc1cc9d10fa846 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 20 Apr 2012 17:00:05 -0400 Subject: [PATCH 2/3] If we are subsetting alleles in the UG (either because there were too many or because some were not polymorphic), then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). Thanks to Ryan for reporting this. Only one of the integration tests had even partially covered this case, so I added one that did. --- .../genotyper/UnifiedGenotyperEngine.java | 5 ++ .../utils/codecs/vcf/AbstractVCFCodec.java | 25 +++++---- .../variantcontext/VariantContextUtils.java | 53 ++++++++++++++++--- .../UnifiedGenotyperIntegrationTest.java | 10 +++- .../utils/codecs/vcf/VCFCodecUnitTest.java | 2 +- 5 files changed, 76 insertions(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 94d340926..caa3a6b6b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -417,6 +417,11 @@ public class UnifiedGenotyperEngine { builder.attributes(attributes); VariantContext vcCall = builder.make(); + // if we are subsetting alleles (either because there were too many or because some were not polymorphic) + // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). + if ( myAlleles.size() != vc.getAlleles().size() ) + vcCall = VariantContextUtils.reverseTrimAlleles(vcCall); + if ( annotationEngine != null && !limitedContext && rawContext.hasBasePileup() ) { // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations final ReadBackedPileup pileup = rawContext.getBasePileup(); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index a1127e35d..c2cbf23fb 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -617,10 +617,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { return true; } - public static int computeForwardClipping(List unclippedAlleles, String ref) { + public static int computeForwardClipping(final List unclippedAlleles, final byte ref0) { boolean clipping = true; int symbolicAlleleCount = 0; - final byte ref0 = (byte)ref.charAt(0); for ( Allele a : unclippedAlleles ) { if ( a.isSymbolic() ) { @@ -638,7 +637,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 1 : 0; } - protected static int computeReverseClipping(List unclippedAlleles, String ref, int forwardClipping, int lineNo) { + public static int computeReverseClipping(final List unclippedAlleles, final byte[] ref, final int forwardClipping, final boolean allowFullClip, final int lineNo) { int clipping = 0; boolean stillClipping = true; @@ -650,14 +649,20 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). if ( a.length() - clipping == 0 ) - return clipping - 1; + return clipping - (allowFullClip ? 0 : 1); - if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) + if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) { stillClipping = false; - else if ( ref.length() == clipping ) - generateException("bad alleles encountered", lineNo); - else if ( a.getBases()[a.length()-clipping-1] != ((byte)ref.charAt(ref.length()-clipping-1)) ) + } + else if ( ref.length == clipping ) { + if ( allowFullClip ) + stillClipping = false; + else + generateException("bad alleles encountered", lineNo); + } + else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) { stillClipping = false; + } } if ( stillClipping ) clipping++; @@ -678,8 +683,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { */ protected static int clipAlleles(int position, String ref, List unclippedAlleles, List clippedAlleles, int lineNo) { - int forwardClipping = computeForwardClipping(unclippedAlleles, ref); - int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo); + int forwardClipping = computeForwardClipping(unclippedAlleles, (byte)ref.charAt(0)); + int reverseClipping = computeReverseClipping(unclippedAlleles, ref.getBytes(), forwardClipping, false, lineNo); if ( clippedAlleles != null ) { for ( Allele a : unclippedAlleles ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index cbaf705b4..de5deef57 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -612,7 +612,7 @@ public class VariantContextUtils { continue; if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) { if ( ! genotypes.isEmpty() ) - logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", + logger.debug(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); genotypes = stripPLs(genotypes); // this will remove stale AC,AF attributed from vc @@ -714,8 +714,7 @@ public class VariantContextUtils { else if (refAllele.isNull()) trimVC = false; else { - trimVC = (AbstractVCFCodec.computeForwardClipping(new ArrayList(inputVC.getAlternateAlleles()), - inputVC.getReference().getDisplayString()) > 0); + trimVC = (AbstractVCFCodec.computeForwardClipping(inputVC.getAlternateAlleles(), (byte)inputVC.getReference().getDisplayString().charAt(0)) > 0); } // nothing to do if we don't need to trim bases @@ -723,9 +722,6 @@ public class VariantContextUtils { List alleles = new ArrayList(); GenotypesContext genotypes = GenotypesContext.create(); - // set the reference base for indels in the attributes - Map attributes = new TreeMap(inputVC.getAttributes()); - Map originalToTrimmedAlleleMap = new HashMap(); for (final Allele a : inputVC.getAlleles()) { @@ -768,12 +764,55 @@ public class VariantContextUtils { } final VariantContextBuilder builder = new VariantContextBuilder(inputVC); - return builder.alleles(alleles).genotypes(genotypes).attributes(attributes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make(); + return builder.alleles(alleles).genotypes(genotypes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make(); } return inputVC; } + public static VariantContext reverseTrimAlleles(VariantContext inputVC) { + // see if we need to trim common reference base from all alleles + + final int trimExtent = AbstractVCFCodec.computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, true, -1); + if ( trimExtent <= 0 ) + return inputVC; + + final List alleles = new ArrayList(); + GenotypesContext genotypes = GenotypesContext.create(); + + Map originalToTrimmedAlleleMap = new HashMap(); + + for (final Allele a : inputVC.getAlleles()) { + if (a.isSymbolic()) { + alleles.add(a); + originalToTrimmedAlleleMap.put(a, a); + } else { + // get bases for current allele and create a new one with trimmed bases + byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, a.length()-trimExtent); + Allele trimmedAllele = Allele.create(newBases, a.isReference()); + alleles.add(trimmedAllele); + originalToTrimmedAlleleMap.put(a, trimmedAllele); + } + } + + // now we can recreate new genotypes with trimmed alleles + for ( final Genotype genotype : inputVC.getGenotypes() ) { + + List originalAlleles = genotype.getAlleles(); + List trimmedAlleles = new ArrayList(); + for ( final Allele a : originalAlleles ) { + if ( a.isCalled() ) + trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); + else + trimmedAlleles.add(Allele.NO_CALL); + } + genotypes.add(Genotype.modifyAlleles(genotype, trimmedAlleles)); + } + + final VariantContextBuilder builder = new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length()); + return builder.alleles(alleles).genotypes(genotypes).make(); + } + public static GenotypesContext stripPLs(GenotypesContext genotypes) { GenotypesContext newGs = GenotypesContext.create(genotypes.size()); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 067e9088c..e95284190 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, - Arrays.asList("e948543b83bfd0640fcb994d72f8e234")); + Arrays.asList("ec907c65da5ed9b6046404b0f81422d4")); executeTest("test Multiple SNP alleles", spec); } @@ -74,6 +74,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test bad read", spec); } + @Test + public void testReverseTrim() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, + Arrays.asList("a70593bbb5042e2d0e46e3c932cae170")); + executeTest("test reverse trim", spec); + } + // -------------------------------------------------------------------------------------------------------------- // // testing compressed output diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java index 7681ed7d1..e0fb1b876 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java @@ -85,7 +85,7 @@ public class VCFCodecUnitTest extends BaseTest { @Test(dataProvider = "AlleleClippingTestProvider") public void TestAlleleClipping(AlleleClippingTestProvider cfg) { - int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref, 0, 1); + int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false, 1); Assert.assertEquals(result, cfg.expectedClip); } } \ No newline at end of file