diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 94d340926..caa3a6b6b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -417,6 +417,11 @@ public class UnifiedGenotyperEngine { builder.attributes(attributes); VariantContext vcCall = builder.make(); + // if we are subsetting alleles (either because there were too many or because some were not polymorphic) + // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). + if ( myAlleles.size() != vc.getAlleles().size() ) + vcCall = VariantContextUtils.reverseTrimAlleles(vcCall); + if ( annotationEngine != null && !limitedContext && rawContext.hasBasePileup() ) { // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations final ReadBackedPileup pileup = rawContext.getBasePileup(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 204851e1f..42a40cde5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -189,7 +189,7 @@ public class SelectVariants extends RodWalker implements TreeR * or the sample is called reference in this track. */ @Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this comparison track", required=false) - private RodBinding discordanceTrack; + protected RodBinding discordanceTrack; /** * A site is considered concordant if (1) we are not looking for specific samples and there is a variant called @@ -197,7 +197,7 @@ public class SelectVariants extends RodWalker implements TreeR * concordance track and they have the sample genotype call. */ @Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this comparison track", required=false) - private RodBinding concordanceTrack; + protected RodBinding concordanceTrack; @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; @@ -230,10 +230,10 @@ public class SelectVariants extends RodWalker implements TreeR public ArrayList SELECT_EXPRESSIONS = new ArrayList(); @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false) - private boolean EXCLUDE_NON_VARIANTS = false; + protected boolean EXCLUDE_NON_VARIANTS = false; @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false) - private boolean EXCLUDE_FILTERED = false; + protected boolean EXCLUDE_FILTERED = false; /** @@ -257,23 +257,23 @@ public class SelectVariants extends RodWalker implements TreeR private Boolean MENDELIAN_VIOLATIONS = false; @Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) - private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; + protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; /** * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory * given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants. */ @Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false) - private int numRandom = 0; + protected int numRandom = 0; /** * This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions. */ @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false) - private double fractionRandom = 0; + protected double fractionRandom = 0; @Argument(fullName="remove_fraction_genotypes", shortName="fractionGenotypes", doc="Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall", required=false) - private double fractionGenotypes = 0; + protected double fractionGenotypes = 0; /** * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria. @@ -508,7 +508,7 @@ public class SelectVariants extends RodWalker implements TreeR if (!selectedTypes.contains(vc.getType())) continue; - VariantContext sub = subsetRecord(vc, samples); + VariantContext sub = subsetRecord(vc, samples, EXCLUDE_NON_VARIANTS); if ( (sub.isPolymorphicInSamples() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) { boolean failedJexlMatch = false; for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) { @@ -645,11 +645,15 @@ public class SelectVariants extends RodWalker implements TreeR * @param samples the samples to extract * @return the subsetted VariantContext */ - private VariantContext subsetRecord(VariantContext vc, Set samples) { + private VariantContext subsetRecord(final VariantContext vc, final Set samples, final boolean excludeNonVariants) { if ( samples == null || samples.isEmpty() ) return vc; - final VariantContext sub = vc.subContextFromSamples(samples, vc.getAlleles()); + final VariantContext sub; + if ( excludeNonVariants ) + sub = vc.subContextFromSamples(samples); // strip out the alternate alleles that aren't being used + else + sub = vc.subContextFromSamples(samples, vc.getAlleles()); VariantContextBuilder builder = new VariantContextBuilder(sub); GenotypesContext newGC = sub.getGenotypes(); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index a1127e35d..c2cbf23fb 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -617,10 +617,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { return true; } - public static int computeForwardClipping(List unclippedAlleles, String ref) { + public static int computeForwardClipping(final List unclippedAlleles, final byte ref0) { boolean clipping = true; int symbolicAlleleCount = 0; - final byte ref0 = (byte)ref.charAt(0); for ( Allele a : unclippedAlleles ) { if ( a.isSymbolic() ) { @@ -638,7 +637,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 1 : 0; } - protected static int computeReverseClipping(List unclippedAlleles, String ref, int forwardClipping, int lineNo) { + public static int computeReverseClipping(final List unclippedAlleles, final byte[] ref, final int forwardClipping, final boolean allowFullClip, final int lineNo) { int clipping = 0; boolean stillClipping = true; @@ -650,14 +649,20 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). if ( a.length() - clipping == 0 ) - return clipping - 1; + return clipping - (allowFullClip ? 0 : 1); - if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) + if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) { stillClipping = false; - else if ( ref.length() == clipping ) - generateException("bad alleles encountered", lineNo); - else if ( a.getBases()[a.length()-clipping-1] != ((byte)ref.charAt(ref.length()-clipping-1)) ) + } + else if ( ref.length == clipping ) { + if ( allowFullClip ) + stillClipping = false; + else + generateException("bad alleles encountered", lineNo); + } + else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) { stillClipping = false; + } } if ( stillClipping ) clipping++; @@ -678,8 +683,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { */ protected static int clipAlleles(int position, String ref, List unclippedAlleles, List clippedAlleles, int lineNo) { - int forwardClipping = computeForwardClipping(unclippedAlleles, ref); - int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo); + int forwardClipping = computeForwardClipping(unclippedAlleles, (byte)ref.charAt(0)); + int reverseClipping = computeReverseClipping(unclippedAlleles, ref.getBytes(), forwardClipping, false, lineNo); if ( clippedAlleles != null ) { for ( Allele a : unclippedAlleles ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index cbaf705b4..de5deef57 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -612,7 +612,7 @@ public class VariantContextUtils { continue; if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) { if ( ! genotypes.isEmpty() ) - logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", + logger.debug(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); genotypes = stripPLs(genotypes); // this will remove stale AC,AF attributed from vc @@ -714,8 +714,7 @@ public class VariantContextUtils { else if (refAllele.isNull()) trimVC = false; else { - trimVC = (AbstractVCFCodec.computeForwardClipping(new ArrayList(inputVC.getAlternateAlleles()), - inputVC.getReference().getDisplayString()) > 0); + trimVC = (AbstractVCFCodec.computeForwardClipping(inputVC.getAlternateAlleles(), (byte)inputVC.getReference().getDisplayString().charAt(0)) > 0); } // nothing to do if we don't need to trim bases @@ -723,9 +722,6 @@ public class VariantContextUtils { List alleles = new ArrayList(); GenotypesContext genotypes = GenotypesContext.create(); - // set the reference base for indels in the attributes - Map attributes = new TreeMap(inputVC.getAttributes()); - Map originalToTrimmedAlleleMap = new HashMap(); for (final Allele a : inputVC.getAlleles()) { @@ -768,12 +764,55 @@ public class VariantContextUtils { } final VariantContextBuilder builder = new VariantContextBuilder(inputVC); - return builder.alleles(alleles).genotypes(genotypes).attributes(attributes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make(); + return builder.alleles(alleles).genotypes(genotypes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make(); } return inputVC; } + public static VariantContext reverseTrimAlleles(VariantContext inputVC) { + // see if we need to trim common reference base from all alleles + + final int trimExtent = AbstractVCFCodec.computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, true, -1); + if ( trimExtent <= 0 ) + return inputVC; + + final List alleles = new ArrayList(); + GenotypesContext genotypes = GenotypesContext.create(); + + Map originalToTrimmedAlleleMap = new HashMap(); + + for (final Allele a : inputVC.getAlleles()) { + if (a.isSymbolic()) { + alleles.add(a); + originalToTrimmedAlleleMap.put(a, a); + } else { + // get bases for current allele and create a new one with trimmed bases + byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, a.length()-trimExtent); + Allele trimmedAllele = Allele.create(newBases, a.isReference()); + alleles.add(trimmedAllele); + originalToTrimmedAlleleMap.put(a, trimmedAllele); + } + } + + // now we can recreate new genotypes with trimmed alleles + for ( final Genotype genotype : inputVC.getGenotypes() ) { + + List originalAlleles = genotype.getAlleles(); + List trimmedAlleles = new ArrayList(); + for ( final Allele a : originalAlleles ) { + if ( a.isCalled() ) + trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); + else + trimmedAlleles.add(Allele.NO_CALL); + } + genotypes.add(Genotype.modifyAlleles(genotype, trimmedAlleles)); + } + + final VariantContextBuilder builder = new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length()); + return builder.alleles(alleles).genotypes(genotypes).make(); + } + public static GenotypesContext stripPLs(GenotypesContext genotypes) { GenotypesContext newGs = GenotypesContext.create(genotypes.size()); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 067e9088c..e95284190 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, - Arrays.asList("e948543b83bfd0640fcb994d72f8e234")); + Arrays.asList("ec907c65da5ed9b6046404b0f81422d4")); executeTest("test Multiple SNP alleles", spec); } @@ -74,6 +74,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test bad read", spec); } + @Test + public void testReverseTrim() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, + Arrays.asList("a70593bbb5042e2d0e46e3c932cae170")); + executeTest("test reverse trim", spec); + } + // -------------------------------------------------------------------------------------------------------------- // // testing compressed output diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 900e3d489..973588cf0 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -163,4 +163,16 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testParallelization (4 threads)--" + testfile, spec); } + + @Test + public void testSelectFromMultiAllelic() { + String testfile = validationDataLocation + "multi-allelic.bi-allelicInGIH.vcf"; + String samplesFile = validationDataLocation + "GIH.samples.list"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b37KGReference + " -o %s -NO_HEADER -sf " + samplesFile + " --excludeNonVariants --variant " + testfile, + 1, + Arrays.asList("3fb50cc1c955491048108956d7087c35") + ); + executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java index 7681ed7d1..e0fb1b876 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java @@ -85,7 +85,7 @@ public class VCFCodecUnitTest extends BaseTest { @Test(dataProvider = "AlleleClippingTestProvider") public void TestAlleleClipping(AlleleClippingTestProvider cfg) { - int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref, 0, 1); + int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false, 1); Assert.assertEquals(result, cfg.expectedClip); } } \ No newline at end of file