Function to compute whether a VariantContext indel is part of a TandemRepeat

Returns true iff VC is an non-complex indel where every allele represents an expansion or
 contraction of a series of identical bases in the reference.

 The logic of this function is pretty simple.  Take all of the non-null alleles in VC.  For
 each insertion allele of n bases, check if that allele matches the next n reference bases.
 For each deletion allele of n bases, check if this matches the reference bases at n - 2 n,
 as it must necessarily match the first n bases.  If this test returns true for all
 alleles you are a tandem repeat, otherwise you are not.  Note that in this context n is the
 base differences between the ref and alt alleles
This commit is contained in:
Mark DePristo 2012-04-06 15:58:28 -04:00
parent 08fab49d30
commit 52ef4a3e26
2 changed files with 148 additions and 0 deletions

View File

@ -1201,4 +1201,80 @@ public class VariantContextUtils {
final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods);
return new Genotype(originalGT.getSampleName(), myAlleles, qual, null, attrs, false);
}
/**
* Returns true iff VC is an non-complex indel where every allele represents an expansion or
* contraction of a series of identical bases in the reference.
*
* For example, suppose the ref bases are CTCTCTGA, which includes a 3x repeat of CTCTCT
*
* If VC = -/CT, then this function returns true because the CT insertion matches exactly the
* upcoming reference.
* If VC = -/CTA then this function returns false because the CTA isn't a perfect match
*
* Now consider deletions:
*
* If VC = CT/- then again the same logic applies and this returns true
* The case of CTA/- makes no sense because it doesn't actually match the reference bases.
*
* The logic of this function is pretty simple. Take all of the non-null alleles in VC. For
* each insertion allele of n bases, check if that allele matches the next n reference bases.
* For each deletion allele of n bases, check if this matches the reference bases at n - 2 n,
* as it must necessarily match the first n bases. If this test returns true for all
* alleles you are a tandem repeat, otherwise you are not.
*
* @param vc
* @param refBasesStartingAtVCWithPad not this is assumed to include the PADDED reference
* @return
*/
@Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"})
public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) {
final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1);
if ( ! vc.isIndel() ) // only indels are tandem repeats
return false;
final Allele ref = vc.getReference();
for ( final Allele allele : vc.getAlternateAlleles() ) {
if ( ! isRepeatAllele(ref, allele, refBasesStartingAtVCWithoutPad) )
return false;
}
// we've passed all of the tests, so we are a repeat
return true;
}
/**
* Helper function for isTandemRepeat that checks that allele matches somewhere on the reference
* @param ref
* @param alt
* @param refBasesStartingAtVCWithoutPad
* @return
*/
protected static boolean isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad) {
if ( ! Allele.oneIsPrefixOfOther(ref, alt) )
return false; // we require one allele be a prefix of another
if ( ref.length() > alt.length() ) { // we are a deletion
return basesAreRepeated(ref.getBaseString(), alt.getBaseString(), refBasesStartingAtVCWithoutPad, 2);
} else { // we are an insertion
return basesAreRepeated(alt.getBaseString(), ref.getBaseString(), refBasesStartingAtVCWithoutPad, 1);
}
}
protected static boolean basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches) {
final String potentialRepeat = l.substring(s.length()); // skip s bases
for ( int i = 0; i < minNumberOfMatches; i++) {
final int start = i * potentialRepeat.length();
final int end = (i+1) * potentialRepeat.length();
if ( ref.length() < end )
return false; // we ran out of bases to test
final String refSub = ref.substring(start, end);
if ( ! refSub.equals(potentialRepeat) )
return false; // repeat didn't match, fail
}
return true; // we passed all tests, we matched
}
}

View File

@ -589,4 +589,76 @@ public class VariantContextUtilsUnitTest extends BaseTest {
return priority;
}
// --------------------------------------------------------------------------------
//
// Test repeats
//
// --------------------------------------------------------------------------------
private class RepeatDetectorTest extends TestDataProvider {
String ref;
boolean isTrueRepeat;
VariantContext vc;
private RepeatDetectorTest(boolean isTrueRepeat, String ref, String refAlleleString, String ... altAlleleStrings) {
super(RepeatDetectorTest.class);
this.ref = ref;
this.isTrueRepeat = isTrueRepeat;
List<Allele> alleles = new LinkedList<Allele>();
final Allele refAllele = Allele.create(refAlleleString, true);
alleles.add(refAllele);
for ( final String altString: altAlleleStrings) {
final Allele alt = Allele.create(altString, false);
alleles.add(alt);
}
VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, 1 + refAllele.length(), alleles);
this.vc = builder.make();
}
public String toString() {
return String.format("%s refBases=%s trueRepeat=%b vc=%s", super.toString(), ref, isTrueRepeat, vc);
}
}
@DataProvider(name = "RepeatDetectorTest")
public Object[][] makeRepeatDetectorTest() {
new RepeatDetectorTest(true, "AAC", "-", "A");
new RepeatDetectorTest(true, "AAC", "A", "-");
new RepeatDetectorTest(false, "AAC", "AA", "-");
new RepeatDetectorTest(false, "AAC", "-", "C");
new RepeatDetectorTest(false, "AAC", "A", "C");
// running out of ref bases => false
new RepeatDetectorTest(false, "AAC", "-", "CAGTA");
// complex repeats
new RepeatDetectorTest(true, "ATATATC", "-", "AT");
new RepeatDetectorTest(true, "ATATATC", "-", "ATA");
new RepeatDetectorTest(true, "ATATATC", "-", "ATAT");
new RepeatDetectorTest(true, "ATATATC", "AT", "-");
new RepeatDetectorTest(false, "ATATATC", "ATA", "-");
new RepeatDetectorTest(false, "ATATATC", "ATAT", "-");
// multi-allelic
new RepeatDetectorTest(true, "ATATATC", "-", "AT", "ATAT");
new RepeatDetectorTest(true, "ATATATC", "-", "AT", "ATA");
new RepeatDetectorTest(true, "ATATATC", "AT", "-", "ATAT");
new RepeatDetectorTest(true, "ATATATC", "AT", "-", "ATA"); // two As
new RepeatDetectorTest(false, "ATATATC", "AT", "-", "ATC"); // false
new RepeatDetectorTest(false, "ATATATC", "AT", "-", "CC"); // false
new RepeatDetectorTest(false, "ATATATC", "AT", "ATAT", "CC"); // false
return RepeatDetectorTest.getTests(RepeatDetectorTest.class);
}
@Test(dataProvider = "RepeatDetectorTest")
public void testRepeatDetectorTest(RepeatDetectorTest cfg) {
// test alleles are equal
Assert.assertEquals(VariantContextUtils.isTandemRepeat(cfg.vc, cfg.ref.getBytes()), cfg.isTrueRepeat);
}
}