If we are subsetting alleles in the UG (either because there were too many or because some were not polymorphic), then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). Thanks to Ryan for reporting this. Only one of the integration tests had even partially covered this case, so I added one that did.

This commit is contained in:
Eric Banks 2012-04-20 17:00:05 -04:00
parent 4b81c75642
commit 1f23d99dfa
5 changed files with 76 additions and 19 deletions

View File

@ -417,6 +417,11 @@ public class UnifiedGenotyperEngine {
builder.attributes(attributes); builder.attributes(attributes);
VariantContext vcCall = builder.make(); VariantContext vcCall = builder.make();
// if we are subsetting alleles (either because there were too many or because some were not polymorphic)
// then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
if ( myAlleles.size() != vc.getAlleles().size() )
vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);
if ( annotationEngine != null && !limitedContext && rawContext.hasBasePileup() ) { if ( annotationEngine != null && !limitedContext && rawContext.hasBasePileup() ) {
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
final ReadBackedPileup pileup = rawContext.getBasePileup(); final ReadBackedPileup pileup = rawContext.getBasePileup();

View File

@ -617,10 +617,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
return true; return true;
} }
public static int computeForwardClipping(List<Allele> unclippedAlleles, String ref) { public static int computeForwardClipping(final List<Allele> unclippedAlleles, final byte ref0) {
boolean clipping = true; boolean clipping = true;
int symbolicAlleleCount = 0; int symbolicAlleleCount = 0;
final byte ref0 = (byte)ref.charAt(0);
for ( Allele a : unclippedAlleles ) { for ( Allele a : unclippedAlleles ) {
if ( a.isSymbolic() ) { if ( a.isSymbolic() ) {
@ -638,7 +637,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 1 : 0; return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 1 : 0;
} }
protected static int computeReverseClipping(List<Allele> unclippedAlleles, String ref, int forwardClipping, int lineNo) { public static int computeReverseClipping(final List<Allele> unclippedAlleles, final byte[] ref, final int forwardClipping, final boolean allowFullClip, final int lineNo) {
int clipping = 0; int clipping = 0;
boolean stillClipping = true; boolean stillClipping = true;
@ -650,14 +649,20 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
// we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong
// position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine).
if ( a.length() - clipping == 0 ) if ( a.length() - clipping == 0 )
return clipping - 1; return clipping - (allowFullClip ? 0 : 1);
if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) {
stillClipping = false; stillClipping = false;
else if ( ref.length() == clipping ) }
generateException("bad alleles encountered", lineNo); else if ( ref.length == clipping ) {
else if ( a.getBases()[a.length()-clipping-1] != ((byte)ref.charAt(ref.length()-clipping-1)) ) if ( allowFullClip )
stillClipping = false;
else
generateException("bad alleles encountered", lineNo);
}
else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) {
stillClipping = false; stillClipping = false;
}
} }
if ( stillClipping ) if ( stillClipping )
clipping++; clipping++;
@ -678,8 +683,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
*/ */
protected static int clipAlleles(int position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) { protected static int clipAlleles(int position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
int forwardClipping = computeForwardClipping(unclippedAlleles, ref); int forwardClipping = computeForwardClipping(unclippedAlleles, (byte)ref.charAt(0));
int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo); int reverseClipping = computeReverseClipping(unclippedAlleles, ref.getBytes(), forwardClipping, false, lineNo);
if ( clippedAlleles != null ) { if ( clippedAlleles != null ) {
for ( Allele a : unclippedAlleles ) { for ( Allele a : unclippedAlleles ) {

View File

@ -612,7 +612,7 @@ public class VariantContextUtils {
continue; continue;
if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) { if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) {
if ( ! genotypes.isEmpty() ) if ( ! genotypes.isEmpty() )
logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", logger.debug(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s",
genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles));
genotypes = stripPLs(genotypes); genotypes = stripPLs(genotypes);
// this will remove stale AC,AF attributed from vc // this will remove stale AC,AF attributed from vc
@ -714,8 +714,7 @@ public class VariantContextUtils {
else if (refAllele.isNull()) else if (refAllele.isNull())
trimVC = false; trimVC = false;
else { else {
trimVC = (AbstractVCFCodec.computeForwardClipping(new ArrayList<Allele>(inputVC.getAlternateAlleles()), trimVC = (AbstractVCFCodec.computeForwardClipping(inputVC.getAlternateAlleles(), (byte)inputVC.getReference().getDisplayString().charAt(0)) > 0);
inputVC.getReference().getDisplayString()) > 0);
} }
// nothing to do if we don't need to trim bases // nothing to do if we don't need to trim bases
@ -723,9 +722,6 @@ public class VariantContextUtils {
List<Allele> alleles = new ArrayList<Allele>(); List<Allele> alleles = new ArrayList<Allele>();
GenotypesContext genotypes = GenotypesContext.create(); GenotypesContext genotypes = GenotypesContext.create();
// set the reference base for indels in the attributes
Map<String,Object> attributes = new TreeMap<String,Object>(inputVC.getAttributes());
Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>(); Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
for (final Allele a : inputVC.getAlleles()) { for (final Allele a : inputVC.getAlleles()) {
@ -768,12 +764,55 @@ public class VariantContextUtils {
} }
final VariantContextBuilder builder = new VariantContextBuilder(inputVC); final VariantContextBuilder builder = new VariantContextBuilder(inputVC);
return builder.alleles(alleles).genotypes(genotypes).attributes(attributes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make(); return builder.alleles(alleles).genotypes(genotypes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make();
} }
return inputVC; return inputVC;
} }
/**
 * Trims bases that are common at the *end* of all alleles of the given VariantContext,
 * returning a new context with shortened alleles (and genotypes remapped to them).
 * Needed because allele subsetting can leave behind padding that was only required
 * by alleles that have since been removed.
 *
 * @param inputVC the context whose alleles may share a common suffix
 * @return a new VariantContext with reverse-trimmed alleles, or {@code inputVC}
 *         unchanged if no trimming is needed
 */
public static VariantContext reverseTrimAlleles(VariantContext inputVC) {
    // see if we need to trim common reference base from all alleles.
    // forwardClipping=0 (nothing pre-clipped), allowFullClip=true (an allele may be
    // clipped down to zero length here), lineNo=-1 (no VCF source line to report).
    final int trimExtent = AbstractVCFCodec.computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, true, -1);
    if ( trimExtent <= 0 )
        return inputVC;

    final List<Allele> alleles = new ArrayList<Allele>();
    GenotypesContext genotypes = GenotypesContext.create();
    // maps each original allele to its trimmed replacement so genotypes stay consistent
    Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();

    for (final Allele a : inputVC.getAlleles()) {
        if (a.isSymbolic()) {
            // symbolic alleles (e.g. <DEL>) carry no literal bases to trim
            alleles.add(a);
            originalToTrimmedAlleleMap.put(a, a);
        } else {
            // get bases for current allele and create a new one with trimmed bases
            byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, a.length()-trimExtent);
            Allele trimmedAllele = Allele.create(newBases, a.isReference());
            alleles.add(trimmedAllele);
            originalToTrimmedAlleleMap.put(a, trimmedAllele);
        }
    }

    // now we can recreate new genotypes with trimmed alleles
    for ( final Genotype genotype : inputVC.getGenotypes() ) {
        List<Allele> originalAlleles = genotype.getAlleles();
        List<Allele> trimmedAlleles = new ArrayList<Allele>();
        for ( final Allele a : originalAlleles ) {
            if ( a.isCalled() )
                trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
            else
                trimmedAlleles.add(Allele.NO_CALL);
        }
        genotypes.add(Genotype.modifyAlleles(genotype, trimmedAlleles));
    }

    // NOTE(review): stop is set to start + refAlleleLength; the usual htsjdk/VCF
    // convention is stop = start + refAlleleLength - 1. Confirm this is intentional
    // (alleles.get(0) is the reference allele since it was added first above).
    final VariantContextBuilder builder = new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length());
    return builder.alleles(alleles).genotypes(genotypes).make();
}
public static GenotypesContext stripPLs(GenotypesContext genotypes) { public static GenotypesContext stripPLs(GenotypesContext genotypes) {
GenotypesContext newGs = GenotypesContext.create(genotypes.size()); GenotypesContext newGs = GenotypesContext.create(genotypes.size());

View File

@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultipleSNPAlleles() { public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1,
Arrays.asList("e948543b83bfd0640fcb994d72f8e234")); Arrays.asList("ec907c65da5ed9b6046404b0f81422d4"));
executeTest("test Multiple SNP alleles", spec); executeTest("test Multiple SNP alleles", spec);
} }
@ -74,6 +74,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
executeTest("test bad read", spec); executeTest("test bad read", spec);
} }
@Test
public void testReverseTrim() {
    // Integration test for reverse-trimming of alleles after subsetting in the UG:
    // runs the UnifiedGenotyper in INDEL mode over two single-position intervals
    // known to trigger the trim path, and pins the output against an expected MD5.
    WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
            "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
            Arrays.asList("a70593bbb5042e2d0e46e3c932cae170"));
    executeTest("test reverse trim", spec);
}
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
// //
// testing compressed output // testing compressed output

View File

@ -85,7 +85,7 @@ public class VCFCodecUnitTest extends BaseTest {
@Test(dataProvider = "AlleleClippingTestProvider") @Test(dataProvider = "AlleleClippingTestProvider")
public void TestAlleleClipping(AlleleClippingTestProvider cfg) { public void TestAlleleClipping(AlleleClippingTestProvider cfg) {
int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref, 0, 1); int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false, 1);
Assert.assertEquals(result, cfg.expectedClip); Assert.assertEquals(result, cfg.expectedClip);
} }
} }