New annotation for indels that describe if they're STR's and their characteristics. If an indel is a STR, 3 fields are added to INFO: STR (boolean), RU = repeat unit (String), RPA = number of repetitions per allele. So, for example, if ATATAT* context gets changed to ATAT and ATATATAT, then RU=AT and RPA=3,2,4. Will be made standard annotation shortly. Added unit tests for new functionality. Pending: refactor VariantContextUtils.isRepeat() to unify code, and fix VariantEval functionality.

This commit is contained in:
Guillermo del Angel 2012-05-17 15:28:19 -04:00
parent 9c6bccfd8b
commit 5189b06468
3 changed files with 277 additions and 0 deletions

View File

@ -0,0 +1,81 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class TandemRepeatAnnotator extends InfoFieldAnnotation /*implements StandardAnnotation*/ {
private static final String STR_PRESENT = "STR";
private static final String REPEAT_UNIT_KEY = "RU";
private static final String REPEATS_PER_ALLELE_KEY = "RPA";
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if ( !vc.isIndel())
return null;
Pair<List<Integer>,byte[]> result = VariantContextUtils.getNumTandemRepeatUnits(vc, ref.getForwardBases());
if (result == null)
return null;
byte[] repeatUnit = result.second;
List<Integer> numUnits = result.first;
Map<String, Object> map = new HashMap<String, Object>();
map.put(STR_PRESENT,true);
map.put(REPEAT_UNIT_KEY,new String(repeatUnit));
map.put(REPEATS_PER_ALLELE_KEY, numUnits);
return map;
}
public static final String[] keyNames = {STR_PRESENT, REPEAT_UNIT_KEY,REPEATS_PER_ALLELE_KEY };
public static final VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(STR_PRESENT, 1, VCFHeaderLineType.Flag, "Variant is a short tandem repeat"),
new VCFInfoHeaderLine(REPEAT_UNIT_KEY, 1, VCFHeaderLineType.String, "Tandem repeat unit (bases)"),
new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Number of times tandem repeat unit is repeated, for each allele (including reference)") };
public List<String> getKeyNames() {
return Arrays.asList(keyNames);
}
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(descriptions); }
}

View File

@ -27,12 +27,14 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlEngine;
import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -1325,6 +1327,127 @@ public class VariantContextUtils {
return true;
}
/**
*
* @param vc
* @param refBasesStartingAtVCWithPad
* @return
*/
@Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"})
public static Pair<List<Integer>,byte[]> getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) {
final boolean VERBOSE = false;
final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1);
if ( ! vc.isIndel() ) // only indels are tandem repeats
return null;
final Allele ref = vc.getReference();
byte[] repeatUnit = null;
final ArrayList<Integer> lengths = new ArrayList<Integer>();
for ( final Allele allele : vc.getAlternateAlleles() ) {
Pair<int[],byte[]> result = getNumTandemRepeatUnits(ref.getBases(), allele.getBases(), refBasesStartingAtVCWithoutPad.getBytes());
final int[] repetitionCount = result.first;
// repetition count = 0 means allele is not a tandem expansion of context
if (repetitionCount[0] == 0 || repetitionCount[1] == 0)
return null;
if (lengths.size() == 0) {
lengths.add(repetitionCount[0]); // add ref allele length only once
}
lengths.add(repetitionCount[1]); // add this alt allele's length
repeatUnit = result.second;
if (VERBOSE) {
System.out.println("RefContext:"+refBasesStartingAtVCWithoutPad);
System.out.println("Ref:"+ref.toString()+" Count:" + String.valueOf(repetitionCount[0]));
System.out.println("Allele:"+allele.toString()+" Count:" + String.valueOf(repetitionCount[1]));
System.out.println("RU:"+new String(repeatUnit));
}
}
return new Pair<List<Integer>, byte[]>(lengths,repeatUnit);
}
protected static Pair<int[],byte[]> getNumTandemRepeatUnits(final byte[] refBases, final byte[] altBases, final byte[] remainingRefContext) {
/* we can't exactly apply same logic as in basesAreRepeated() to compute tandem unit and number of repeated units.
Consider case where ref =ATATAT and we have an insertion of ATAT. Natural description is (AT)3 -> (AT)5.
*/
byte[] longB;
// find first repeat unit based on either ref or alt, whichever is longer
if (altBases.length > refBases.length)
longB = altBases;
else
longB = refBases;
// see if non-null allele (either ref or alt, whichever is longer) can be decomposed into several identical tandem units
// for example, -*,CACA needs to first be decomposed into (CA)2
final int repeatUnitLength = findRepeatedSubstring(longB);
final byte[] repeatUnit = Arrays.copyOf(longB, repeatUnitLength);
final int[] repetitionCount = new int[2];
// repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext));
// repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext));
int repetitionsInRef = findNumberofRepetitions(repeatUnit,refBases);
repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext))-repetitionsInRef;
repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext))-repetitionsInRef;
return new Pair<int[], byte[]>(repetitionCount, repeatUnit);
}
/**
* Find out if a string can be represented as a tandem number of substrings.
* For example ACTACT is a 2-tandem of ACT,
* but ACTACA is not.
*
* @param bases String to be tested
* @return Length of repeat unit, if string can be represented as tandem of substring (if it can't
* be represented as one, it will be just the length of the input string)
*/
protected static int findRepeatedSubstring(byte[] bases) {
int repLength;
for (repLength=1; repLength <=bases.length; repLength++) {
final byte[] candidateRepeatUnit = Arrays.copyOf(bases,repLength);
boolean allBasesMatch = true;
for (int start = repLength; start < bases.length; start += repLength ) {
// check that remaining of string is exactly equal to repeat unit
final byte[] basePiece = Arrays.copyOfRange(bases,start,start+candidateRepeatUnit.length);
if (!Arrays.equals(candidateRepeatUnit, basePiece)) {
allBasesMatch = false;
break;
}
}
if (allBasesMatch)
return repLength;
}
return repLength;
}
/**
* Helper routine that finds number of repetitions a string consists of.
* For example, for string ATAT and repeat unit AT, number of repetitions = 2
* @param repeatUnit Substring
* @param testString String to test
* @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's
*/
protected static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString) {
int numRepeats = 0;
for (int start = 0; start < testString.length; start += repeatUnit.length) {
int end = start + repeatUnit.length;
byte[] unit = Arrays.copyOfRange(testString,start, end);
if(Arrays.equals(unit,repeatUnit))
numRepeats++;
else
return numRepeats;
}
return numRepeats;
}
/**
* Helper function for isTandemRepeat that checks that allele matches somewhere on the reference
* @param ref

View File

@ -7,6 +7,7 @@ package org.broadinstitute.sting.utils.variantcontext;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.collections.Pair;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.BeforeTest;
import org.testng.annotations.BeforeMethod;
@ -467,6 +468,78 @@ public class VariantContextUnitTest extends BaseTest {
Assert.assertNotNull(vc.getFiltersMaybeNull());
}
@Test
public void testRepeatAllele() {
Allele nullR = Allele.create(Allele.NULL_ALLELE_STRING, true);
Allele nullA = Allele.create(Allele.NULL_ALLELE_STRING, false);
Allele atc = Allele.create("ATC", false);
Allele atcatc = Allele.create("ATCATC", false);
Allele ccccR = Allele.create("CCCC", true);
Allele cc = Allele.create("CC", false);
Allele cccccc = Allele.create("CCCCCC", false);
Allele gagaR = Allele.create("GAGA", true);
Allele gagagaga = Allele.create("GAGAGAGA", false);
Pair<List<Integer>,byte[]> result;
byte[] refBytes = "TATCATCATCGGA".getBytes();
Assert.assertEquals(VariantContextUtils.findNumberofRepetitions( "ATG".getBytes(), "ATGATGATGATG".getBytes()),4);
Assert.assertEquals(VariantContextUtils.findNumberofRepetitions( "G".getBytes(), "ATGATGATGATG".getBytes()),0);
Assert.assertEquals(VariantContextUtils.findNumberofRepetitions( "T".getBytes(), "T".getBytes()),1);
Assert.assertEquals(VariantContextUtils.findNumberofRepetitions( "AT".getBytes(), "ATGATGATCATG".getBytes()),1);
Assert.assertEquals(VariantContextUtils.findNumberofRepetitions( "CCC".getBytes(), "CCCCCCCC".getBytes()),2);
Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("ATG".getBytes()),3);
Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("AAA".getBytes()),1);
Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("CACACAC".getBytes()),7);
Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("CACACA".getBytes()),2);
Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("CATGCATG".getBytes()),4);
Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("AATAATA".getBytes()),7);
// -*,ATC, context = ATC ATC ATC : (ATC)3 -> (ATC)4
VariantContext vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(nullR,atc)).make();
result = VariantContextUtils.getNumTandemRepeatUnits(vc,refBytes);
Assert.assertEquals(result.getFirst().toArray()[0],3);
Assert.assertEquals(result.getFirst().toArray()[1],4);
Assert.assertEquals(result.getSecond().length,3);
// ATC*,-,ATCATC
vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(ATCref,nullA,atcatc)).make();
result = VariantContextUtils.getNumTandemRepeatUnits(vc,refBytes);
Assert.assertEquals(result.getFirst().toArray()[0],3);
Assert.assertEquals(result.getFirst().toArray()[1],2);
Assert.assertEquals(result.getFirst().toArray()[2],4);
Assert.assertEquals(result.getSecond().length,3);
// simple non-tandem deletion: CCCC*, -
refBytes = "TCCCCCCCCATG".getBytes();
vc = new VariantContextBuilder("foo", delLoc, 10, 14, Arrays.asList(ccccR,nullA)).make();
result = VariantContextUtils.getNumTandemRepeatUnits(vc,refBytes);
Assert.assertEquals(result.getFirst().toArray()[0],8);
Assert.assertEquals(result.getFirst().toArray()[1],4);
Assert.assertEquals(result.getSecond().length,1);
// CCCC*,CC,-,CCCCCC, context = CCC: (C)7 -> (C)5,(C)3,(C)9
refBytes = "TCCCCCCCAGAGAGAG".getBytes();
vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(ccccR,cc, nullA,cccccc)).make();
result = VariantContextUtils.getNumTandemRepeatUnits(vc,refBytes);
Assert.assertEquals(result.getFirst().toArray()[0],7);
Assert.assertEquals(result.getFirst().toArray()[1],5);
Assert.assertEquals(result.getFirst().toArray()[2],3);
Assert.assertEquals(result.getFirst().toArray()[3],9);
Assert.assertEquals(result.getSecond().length,1);
// GAGA*,-,GAGAGAGA
refBytes = "TGAGAGAGAGATTT".getBytes();
vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(gagaR, nullA,gagagaga)).make();
result = VariantContextUtils.getNumTandemRepeatUnits(vc,refBytes);
Assert.assertEquals(result.getFirst().toArray()[0],5);
Assert.assertEquals(result.getFirst().toArray()[1],3);
Assert.assertEquals(result.getFirst().toArray()[2],7);
Assert.assertEquals(result.getSecond().length,2);
}
@Test
public void testGetGenotypeCounts() {
List<Allele> alleles = Arrays.asList(Aref, T);