If we are subsetting alleles in the UG (either because there were too many or because some were not polymorphic), then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). Thanks to Ryan for reporting this. Only one of the integration tests had even partially covered this case, so I added one that did.
This commit is contained in:
parent
4b81c75642
commit
1f23d99dfa
|
|
@ -417,6 +417,11 @@ public class UnifiedGenotyperEngine {
|
||||||
builder.attributes(attributes);
|
builder.attributes(attributes);
|
||||||
VariantContext vcCall = builder.make();
|
VariantContext vcCall = builder.make();
|
||||||
|
|
||||||
|
// if we are subsetting alleles (either because there were too many or because some were not polymorphic)
|
||||||
|
// then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
|
||||||
|
if ( myAlleles.size() != vc.getAlleles().size() )
|
||||||
|
vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);
|
||||||
|
|
||||||
if ( annotationEngine != null && !limitedContext && rawContext.hasBasePileup() ) {
|
if ( annotationEngine != null && !limitedContext && rawContext.hasBasePileup() ) {
|
||||||
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
|
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
|
||||||
final ReadBackedPileup pileup = rawContext.getBasePileup();
|
final ReadBackedPileup pileup = rawContext.getBasePileup();
|
||||||
|
|
|
||||||
|
|
@ -617,10 +617,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static int computeForwardClipping(List<Allele> unclippedAlleles, String ref) {
|
public static int computeForwardClipping(final List<Allele> unclippedAlleles, final byte ref0) {
|
||||||
boolean clipping = true;
|
boolean clipping = true;
|
||||||
int symbolicAlleleCount = 0;
|
int symbolicAlleleCount = 0;
|
||||||
final byte ref0 = (byte)ref.charAt(0);
|
|
||||||
|
|
||||||
for ( Allele a : unclippedAlleles ) {
|
for ( Allele a : unclippedAlleles ) {
|
||||||
if ( a.isSymbolic() ) {
|
if ( a.isSymbolic() ) {
|
||||||
|
|
@ -638,7 +637,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
||||||
return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 1 : 0;
|
return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 1 : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static int computeReverseClipping(List<Allele> unclippedAlleles, String ref, int forwardClipping, int lineNo) {
|
public static int computeReverseClipping(final List<Allele> unclippedAlleles, final byte[] ref, final int forwardClipping, final boolean allowFullClip, final int lineNo) {
|
||||||
int clipping = 0;
|
int clipping = 0;
|
||||||
boolean stillClipping = true;
|
boolean stillClipping = true;
|
||||||
|
|
||||||
|
|
@ -650,14 +649,20 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
||||||
// we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong
|
// we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong
|
||||||
// position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine).
|
// position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine).
|
||||||
if ( a.length() - clipping == 0 )
|
if ( a.length() - clipping == 0 )
|
||||||
return clipping - 1;
|
return clipping - (allowFullClip ? 0 : 1);
|
||||||
|
|
||||||
if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 )
|
if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) {
|
||||||
stillClipping = false;
|
stillClipping = false;
|
||||||
else if ( ref.length() == clipping )
|
}
|
||||||
generateException("bad alleles encountered", lineNo);
|
else if ( ref.length == clipping ) {
|
||||||
else if ( a.getBases()[a.length()-clipping-1] != ((byte)ref.charAt(ref.length()-clipping-1)) )
|
if ( allowFullClip )
|
||||||
|
stillClipping = false;
|
||||||
|
else
|
||||||
|
generateException("bad alleles encountered", lineNo);
|
||||||
|
}
|
||||||
|
else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) {
|
||||||
stillClipping = false;
|
stillClipping = false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if ( stillClipping )
|
if ( stillClipping )
|
||||||
clipping++;
|
clipping++;
|
||||||
|
|
@ -678,8 +683,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
||||||
*/
|
*/
|
||||||
protected static int clipAlleles(int position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
|
protected static int clipAlleles(int position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
|
||||||
|
|
||||||
int forwardClipping = computeForwardClipping(unclippedAlleles, ref);
|
int forwardClipping = computeForwardClipping(unclippedAlleles, (byte)ref.charAt(0));
|
||||||
int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo);
|
int reverseClipping = computeReverseClipping(unclippedAlleles, ref.getBytes(), forwardClipping, false, lineNo);
|
||||||
|
|
||||||
if ( clippedAlleles != null ) {
|
if ( clippedAlleles != null ) {
|
||||||
for ( Allele a : unclippedAlleles ) {
|
for ( Allele a : unclippedAlleles ) {
|
||||||
|
|
|
||||||
|
|
@ -612,7 +612,7 @@ public class VariantContextUtils {
|
||||||
continue;
|
continue;
|
||||||
if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) {
|
if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) {
|
||||||
if ( ! genotypes.isEmpty() )
|
if ( ! genotypes.isEmpty() )
|
||||||
logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s",
|
logger.debug(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s",
|
||||||
genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles));
|
genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles));
|
||||||
genotypes = stripPLs(genotypes);
|
genotypes = stripPLs(genotypes);
|
||||||
// this will remove stale AC,AF attributed from vc
|
// this will remove stale AC,AF attributed from vc
|
||||||
|
|
@ -714,8 +714,7 @@ public class VariantContextUtils {
|
||||||
else if (refAllele.isNull())
|
else if (refAllele.isNull())
|
||||||
trimVC = false;
|
trimVC = false;
|
||||||
else {
|
else {
|
||||||
trimVC = (AbstractVCFCodec.computeForwardClipping(new ArrayList<Allele>(inputVC.getAlternateAlleles()),
|
trimVC = (AbstractVCFCodec.computeForwardClipping(inputVC.getAlternateAlleles(), (byte)inputVC.getReference().getDisplayString().charAt(0)) > 0);
|
||||||
inputVC.getReference().getDisplayString()) > 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// nothing to do if we don't need to trim bases
|
// nothing to do if we don't need to trim bases
|
||||||
|
|
@ -723,9 +722,6 @@ public class VariantContextUtils {
|
||||||
List<Allele> alleles = new ArrayList<Allele>();
|
List<Allele> alleles = new ArrayList<Allele>();
|
||||||
GenotypesContext genotypes = GenotypesContext.create();
|
GenotypesContext genotypes = GenotypesContext.create();
|
||||||
|
|
||||||
// set the reference base for indels in the attributes
|
|
||||||
Map<String,Object> attributes = new TreeMap<String,Object>(inputVC.getAttributes());
|
|
||||||
|
|
||||||
Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
|
Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
|
||||||
|
|
||||||
for (final Allele a : inputVC.getAlleles()) {
|
for (final Allele a : inputVC.getAlleles()) {
|
||||||
|
|
@ -768,12 +764,55 @@ public class VariantContextUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
final VariantContextBuilder builder = new VariantContextBuilder(inputVC);
|
final VariantContextBuilder builder = new VariantContextBuilder(inputVC);
|
||||||
return builder.alleles(alleles).genotypes(genotypes).attributes(attributes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make();
|
return builder.alleles(alleles).genotypes(genotypes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make();
|
||||||
}
|
}
|
||||||
|
|
||||||
return inputVC;
|
return inputVC;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static VariantContext reverseTrimAlleles(VariantContext inputVC) {
|
||||||
|
// see if we need to trim common reference base from all alleles
|
||||||
|
|
||||||
|
final int trimExtent = AbstractVCFCodec.computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, true, -1);
|
||||||
|
if ( trimExtent <= 0 )
|
||||||
|
return inputVC;
|
||||||
|
|
||||||
|
final List<Allele> alleles = new ArrayList<Allele>();
|
||||||
|
GenotypesContext genotypes = GenotypesContext.create();
|
||||||
|
|
||||||
|
Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
|
||||||
|
|
||||||
|
for (final Allele a : inputVC.getAlleles()) {
|
||||||
|
if (a.isSymbolic()) {
|
||||||
|
alleles.add(a);
|
||||||
|
originalToTrimmedAlleleMap.put(a, a);
|
||||||
|
} else {
|
||||||
|
// get bases for current allele and create a new one with trimmed bases
|
||||||
|
byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, a.length()-trimExtent);
|
||||||
|
Allele trimmedAllele = Allele.create(newBases, a.isReference());
|
||||||
|
alleles.add(trimmedAllele);
|
||||||
|
originalToTrimmedAlleleMap.put(a, trimmedAllele);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// now we can recreate new genotypes with trimmed alleles
|
||||||
|
for ( final Genotype genotype : inputVC.getGenotypes() ) {
|
||||||
|
|
||||||
|
List<Allele> originalAlleles = genotype.getAlleles();
|
||||||
|
List<Allele> trimmedAlleles = new ArrayList<Allele>();
|
||||||
|
for ( final Allele a : originalAlleles ) {
|
||||||
|
if ( a.isCalled() )
|
||||||
|
trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
|
||||||
|
else
|
||||||
|
trimmedAlleles.add(Allele.NO_CALL);
|
||||||
|
}
|
||||||
|
genotypes.add(Genotype.modifyAlleles(genotype, trimmedAlleles));
|
||||||
|
}
|
||||||
|
|
||||||
|
final VariantContextBuilder builder = new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length());
|
||||||
|
return builder.alleles(alleles).genotypes(genotypes).make();
|
||||||
|
}
|
||||||
|
|
||||||
public static GenotypesContext stripPLs(GenotypesContext genotypes) {
|
public static GenotypesContext stripPLs(GenotypesContext genotypes) {
|
||||||
GenotypesContext newGs = GenotypesContext.create(genotypes.size());
|
GenotypesContext newGs = GenotypesContext.create(genotypes.size());
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
public void testMultipleSNPAlleles() {
|
public void testMultipleSNPAlleles() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1,
|
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1,
|
||||||
Arrays.asList("e948543b83bfd0640fcb994d72f8e234"));
|
Arrays.asList("ec907c65da5ed9b6046404b0f81422d4"));
|
||||||
executeTest("test Multiple SNP alleles", spec);
|
executeTest("test Multiple SNP alleles", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -74,6 +74,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
executeTest("test bad read", spec);
|
executeTest("test bad read", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testReverseTrim() {
|
||||||
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
|
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
|
||||||
|
Arrays.asList("a70593bbb5042e2d0e46e3c932cae170"));
|
||||||
|
executeTest("test reverse trim", spec);
|
||||||
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// testing compressed output
|
// testing compressed output
|
||||||
|
|
|
||||||
|
|
@ -85,7 +85,7 @@ public class VCFCodecUnitTest extends BaseTest {
|
||||||
|
|
||||||
@Test(dataProvider = "AlleleClippingTestProvider")
|
@Test(dataProvider = "AlleleClippingTestProvider")
|
||||||
public void TestAlleleClipping(AlleleClippingTestProvider cfg) {
|
public void TestAlleleClipping(AlleleClippingTestProvider cfg) {
|
||||||
int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref, 0, 1);
|
int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false, 1);
|
||||||
Assert.assertEquals(result, cfg.expectedClip);
|
Assert.assertEquals(result, cfg.expectedClip);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Loading…
Reference in New Issue