New annotation for indels that describe if they're STR's and their characteristics. If an indel is a STR, 3 fields are added to INFO: STR (boolean), RU = repeat unit (String), RPA = number of repetitions per allele. So, for example, if ATATAT* context gets changed to ATAT and ATATATAT, then RU=AT and RPA=3,2,4. Will be made standard annotation shortly. Added unit tests for new functionality. Pending: refactor VariantContextUtils.isRepeat() to unify code, and fix VariantEval functionality.
This commit is contained in:
parent
9c6bccfd8b
commit
5189b06468
|
|
@ -0,0 +1,81 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
public class TandemRepeatAnnotator extends InfoFieldAnnotation /*implements StandardAnnotation*/ {
|
||||
private static final String STR_PRESENT = "STR";
|
||||
private static final String REPEAT_UNIT_KEY = "RU";
|
||||
private static final String REPEATS_PER_ALLELE_KEY = "RPA";
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
if ( !vc.isIndel())
|
||||
return null;
|
||||
|
||||
Pair<List<Integer>,byte[]> result = VariantContextUtils.getNumTandemRepeatUnits(vc, ref.getForwardBases());
|
||||
if (result == null)
|
||||
return null;
|
||||
|
||||
byte[] repeatUnit = result.second;
|
||||
List<Integer> numUnits = result.first;
|
||||
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(STR_PRESENT,true);
|
||||
map.put(REPEAT_UNIT_KEY,new String(repeatUnit));
|
||||
map.put(REPEATS_PER_ALLELE_KEY, numUnits);
|
||||
|
||||
return map;
|
||||
}
|
||||
|
||||
public static final String[] keyNames = {STR_PRESENT, REPEAT_UNIT_KEY,REPEATS_PER_ALLELE_KEY };
|
||||
public static final VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(STR_PRESENT, 1, VCFHeaderLineType.Flag, "Variant is a short tandem repeat"),
|
||||
new VCFInfoHeaderLine(REPEAT_UNIT_KEY, 1, VCFHeaderLineType.String, "Tandem repeat unit (bases)"),
|
||||
new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Number of times tandem repeat unit is repeated, for each allele (including reference)") };
|
||||
|
||||
public List<String> getKeyNames() {
|
||||
return Arrays.asList(keyNames);
|
||||
}
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(descriptions); }
|
||||
|
||||
}
|
||||
|
|
@ -27,12 +27,14 @@ import com.google.java.contract.Ensures;
|
|||
import com.google.java.contract.Requires;
|
||||
import org.apache.commons.jexl2.Expression;
|
||||
import org.apache.commons.jexl2.JexlEngine;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
|
|
@ -1325,6 +1327,127 @@ public class VariantContextUtils {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param vc
|
||||
* @param refBasesStartingAtVCWithPad
|
||||
* @return
|
||||
*/
|
||||
@Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"})
|
||||
public static Pair<List<Integer>,byte[]> getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) {
|
||||
final boolean VERBOSE = false;
|
||||
final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1);
|
||||
if ( ! vc.isIndel() ) // only indels are tandem repeats
|
||||
return null;
|
||||
|
||||
final Allele ref = vc.getReference();
|
||||
|
||||
byte[] repeatUnit = null;
|
||||
final ArrayList<Integer> lengths = new ArrayList<Integer>();
|
||||
|
||||
for ( final Allele allele : vc.getAlternateAlleles() ) {
|
||||
Pair<int[],byte[]> result = getNumTandemRepeatUnits(ref.getBases(), allele.getBases(), refBasesStartingAtVCWithoutPad.getBytes());
|
||||
|
||||
final int[] repetitionCount = result.first;
|
||||
// repetition count = 0 means allele is not a tandem expansion of context
|
||||
if (repetitionCount[0] == 0 || repetitionCount[1] == 0)
|
||||
return null;
|
||||
|
||||
if (lengths.size() == 0) {
|
||||
lengths.add(repetitionCount[0]); // add ref allele length only once
|
||||
}
|
||||
lengths.add(repetitionCount[1]); // add this alt allele's length
|
||||
|
||||
repeatUnit = result.second;
|
||||
if (VERBOSE) {
|
||||
System.out.println("RefContext:"+refBasesStartingAtVCWithoutPad);
|
||||
System.out.println("Ref:"+ref.toString()+" Count:" + String.valueOf(repetitionCount[0]));
|
||||
System.out.println("Allele:"+allele.toString()+" Count:" + String.valueOf(repetitionCount[1]));
|
||||
System.out.println("RU:"+new String(repeatUnit));
|
||||
}
|
||||
}
|
||||
|
||||
return new Pair<List<Integer>, byte[]>(lengths,repeatUnit);
|
||||
}
|
||||
|
||||
protected static Pair<int[],byte[]> getNumTandemRepeatUnits(final byte[] refBases, final byte[] altBases, final byte[] remainingRefContext) {
|
||||
/* we can't exactly apply same logic as in basesAreRepeated() to compute tandem unit and number of repeated units.
|
||||
Consider case where ref =ATATAT and we have an insertion of ATAT. Natural description is (AT)3 -> (AT)5.
|
||||
*/
|
||||
|
||||
byte[] longB;
|
||||
// find first repeat unit based on either ref or alt, whichever is longer
|
||||
if (altBases.length > refBases.length)
|
||||
longB = altBases;
|
||||
else
|
||||
longB = refBases;
|
||||
|
||||
// see if non-null allele (either ref or alt, whichever is longer) can be decomposed into several identical tandem units
|
||||
// for example, -*,CACA needs to first be decomposed into (CA)2
|
||||
final int repeatUnitLength = findRepeatedSubstring(longB);
|
||||
final byte[] repeatUnit = Arrays.copyOf(longB, repeatUnitLength);
|
||||
|
||||
final int[] repetitionCount = new int[2];
|
||||
// repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext));
|
||||
// repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext));
|
||||
int repetitionsInRef = findNumberofRepetitions(repeatUnit,refBases);
|
||||
repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext))-repetitionsInRef;
|
||||
repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext))-repetitionsInRef;
|
||||
|
||||
return new Pair<int[], byte[]>(repetitionCount, repeatUnit);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Find out if a string can be represented as a tandem number of substrings.
|
||||
* For example ACTACT is a 2-tandem of ACT,
|
||||
* but ACTACA is not.
|
||||
*
|
||||
* @param bases String to be tested
|
||||
* @return Length of repeat unit, if string can be represented as tandem of substring (if it can't
|
||||
* be represented as one, it will be just the length of the input string)
|
||||
*/
|
||||
protected static int findRepeatedSubstring(byte[] bases) {
|
||||
|
||||
int repLength;
|
||||
for (repLength=1; repLength <=bases.length; repLength++) {
|
||||
final byte[] candidateRepeatUnit = Arrays.copyOf(bases,repLength);
|
||||
boolean allBasesMatch = true;
|
||||
for (int start = repLength; start < bases.length; start += repLength ) {
|
||||
// check that remaining of string is exactly equal to repeat unit
|
||||
final byte[] basePiece = Arrays.copyOfRange(bases,start,start+candidateRepeatUnit.length);
|
||||
if (!Arrays.equals(candidateRepeatUnit, basePiece)) {
|
||||
allBasesMatch = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (allBasesMatch)
|
||||
return repLength;
|
||||
}
|
||||
|
||||
return repLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper routine that finds number of repetitions a string consists of.
|
||||
* For example, for string ATAT and repeat unit AT, number of repetitions = 2
|
||||
* @param repeatUnit Substring
|
||||
* @param testString String to test
|
||||
* @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's
|
||||
*/
|
||||
protected static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString) {
|
||||
int numRepeats = 0;
|
||||
for (int start = 0; start < testString.length; start += repeatUnit.length) {
|
||||
int end = start + repeatUnit.length;
|
||||
byte[] unit = Arrays.copyOfRange(testString,start, end);
|
||||
if(Arrays.equals(unit,repeatUnit))
|
||||
numRepeats++;
|
||||
else
|
||||
return numRepeats;
|
||||
}
|
||||
return numRepeats;
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function for isTandemRepeat that checks that allele matches somewhere on the reference
|
||||
* @param ref
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ package org.broadinstitute.sting.utils.variantcontext;
|
|||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.BeforeTest;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
|
|
@ -467,6 +468,78 @@ public class VariantContextUnitTest extends BaseTest {
|
|||
Assert.assertNotNull(vc.getFiltersMaybeNull());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRepeatAllele() {
|
||||
Allele nullR = Allele.create(Allele.NULL_ALLELE_STRING, true);
|
||||
Allele nullA = Allele.create(Allele.NULL_ALLELE_STRING, false);
|
||||
Allele atc = Allele.create("ATC", false);
|
||||
Allele atcatc = Allele.create("ATCATC", false);
|
||||
Allele ccccR = Allele.create("CCCC", true);
|
||||
Allele cc = Allele.create("CC", false);
|
||||
Allele cccccc = Allele.create("CCCCCC", false);
|
||||
Allele gagaR = Allele.create("GAGA", true);
|
||||
Allele gagagaga = Allele.create("GAGAGAGA", false);
|
||||
|
||||
Pair<List<Integer>,byte[]> result;
|
||||
byte[] refBytes = "TATCATCATCGGA".getBytes();
|
||||
|
||||
Assert.assertEquals(VariantContextUtils.findNumberofRepetitions( "ATG".getBytes(), "ATGATGATGATG".getBytes()),4);
|
||||
Assert.assertEquals(VariantContextUtils.findNumberofRepetitions( "G".getBytes(), "ATGATGATGATG".getBytes()),0);
|
||||
Assert.assertEquals(VariantContextUtils.findNumberofRepetitions( "T".getBytes(), "T".getBytes()),1);
|
||||
Assert.assertEquals(VariantContextUtils.findNumberofRepetitions( "AT".getBytes(), "ATGATGATCATG".getBytes()),1);
|
||||
Assert.assertEquals(VariantContextUtils.findNumberofRepetitions( "CCC".getBytes(), "CCCCCCCC".getBytes()),2);
|
||||
|
||||
Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("ATG".getBytes()),3);
|
||||
Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("AAA".getBytes()),1);
|
||||
Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("CACACAC".getBytes()),7);
|
||||
Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("CACACA".getBytes()),2);
|
||||
Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("CATGCATG".getBytes()),4);
|
||||
Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("AATAATA".getBytes()),7);
|
||||
|
||||
|
||||
// -*,ATC, context = ATC ATC ATC : (ATC)3 -> (ATC)4
|
||||
VariantContext vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(nullR,atc)).make();
|
||||
result = VariantContextUtils.getNumTandemRepeatUnits(vc,refBytes);
|
||||
Assert.assertEquals(result.getFirst().toArray()[0],3);
|
||||
Assert.assertEquals(result.getFirst().toArray()[1],4);
|
||||
Assert.assertEquals(result.getSecond().length,3);
|
||||
|
||||
// ATC*,-,ATCATC
|
||||
vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(ATCref,nullA,atcatc)).make();
|
||||
result = VariantContextUtils.getNumTandemRepeatUnits(vc,refBytes);
|
||||
Assert.assertEquals(result.getFirst().toArray()[0],3);
|
||||
Assert.assertEquals(result.getFirst().toArray()[1],2);
|
||||
Assert.assertEquals(result.getFirst().toArray()[2],4);
|
||||
Assert.assertEquals(result.getSecond().length,3);
|
||||
|
||||
// simple non-tandem deletion: CCCC*, -
|
||||
refBytes = "TCCCCCCCCATG".getBytes();
|
||||
vc = new VariantContextBuilder("foo", delLoc, 10, 14, Arrays.asList(ccccR,nullA)).make();
|
||||
result = VariantContextUtils.getNumTandemRepeatUnits(vc,refBytes);
|
||||
Assert.assertEquals(result.getFirst().toArray()[0],8);
|
||||
Assert.assertEquals(result.getFirst().toArray()[1],4);
|
||||
Assert.assertEquals(result.getSecond().length,1);
|
||||
|
||||
// CCCC*,CC,-,CCCCCC, context = CCC: (C)7 -> (C)5,(C)3,(C)9
|
||||
refBytes = "TCCCCCCCAGAGAGAG".getBytes();
|
||||
vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(ccccR,cc, nullA,cccccc)).make();
|
||||
result = VariantContextUtils.getNumTandemRepeatUnits(vc,refBytes);
|
||||
Assert.assertEquals(result.getFirst().toArray()[0],7);
|
||||
Assert.assertEquals(result.getFirst().toArray()[1],5);
|
||||
Assert.assertEquals(result.getFirst().toArray()[2],3);
|
||||
Assert.assertEquals(result.getFirst().toArray()[3],9);
|
||||
Assert.assertEquals(result.getSecond().length,1);
|
||||
|
||||
// GAGA*,-,GAGAGAGA
|
||||
refBytes = "TGAGAGAGAGATTT".getBytes();
|
||||
vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(gagaR, nullA,gagagaga)).make();
|
||||
result = VariantContextUtils.getNumTandemRepeatUnits(vc,refBytes);
|
||||
Assert.assertEquals(result.getFirst().toArray()[0],5);
|
||||
Assert.assertEquals(result.getFirst().toArray()[1],3);
|
||||
Assert.assertEquals(result.getFirst().toArray()[2],7);
|
||||
Assert.assertEquals(result.getSecond().length,2);
|
||||
|
||||
}
|
||||
@Test
|
||||
public void testGetGenotypeCounts() {
|
||||
List<Allele> alleles = Arrays.asList(Aref, T);
|
||||
|
|
|
|||
Loading…
Reference in New Issue