diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java new file mode 100644 index 000000000..9937f80d7 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; +import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + + +public class TandemRepeatAnnotator extends InfoFieldAnnotation /*implements StandardAnnotation*/ { + private static final String STR_PRESENT = "STR"; + private static final String REPEAT_UNIT_KEY = "RU"; + private static final String REPEATS_PER_ALLELE_KEY = "RPA"; + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + if ( !vc.isIndel()) + return null; + + Pair,byte[]> result = VariantContextUtils.getNumTandemRepeatUnits(vc, ref.getForwardBases()); + if (result == null) + return null; + + byte[] repeatUnit = result.second; + List numUnits = result.first; + + Map map = new HashMap(); + map.put(STR_PRESENT,true); + map.put(REPEAT_UNIT_KEY,new String(repeatUnit)); + map.put(REPEATS_PER_ALLELE_KEY, numUnits); + + return map; + } + + public static final String[] keyNames = {STR_PRESENT, REPEAT_UNIT_KEY,REPEATS_PER_ALLELE_KEY }; + public static final VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(STR_PRESENT, 1, VCFHeaderLineType.Flag, "Variant is a short tandem repeat"), + new VCFInfoHeaderLine(REPEAT_UNIT_KEY, 1, VCFHeaderLineType.String, "Tandem repeat unit (bases)"), + new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Number of times tandem repeat unit is repeated, for each allele (including reference)") }; + + public List getKeyNames() { + return Arrays.asList(keyNames); + } + + public List getDescriptions() { return Arrays.asList(descriptions); } + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index bda5ed4a1..d5a1a75fd 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -27,12 +27,14 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.commons.jexl2.Expression; import org.apache.commons.jexl2.JexlEngine; +import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -1325,6 +1327,127 @@ public class VariantContextUtils { return true; } + /** + * + * @param vc + * @param refBasesStartingAtVCWithPad + * @return + */ + @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) + public static Pair,byte[]> getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { + final boolean VERBOSE = false; + final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); + if ( ! vc.isIndel() ) // only indels are tandem repeats + return null; + + final Allele ref = vc.getReference(); + + byte[] repeatUnit = null; + final ArrayList lengths = new ArrayList(); + + for ( final Allele allele : vc.getAlternateAlleles() ) { + Pair result = getNumTandemRepeatUnits(ref.getBases(), allele.getBases(), refBasesStartingAtVCWithoutPad.getBytes()); + + final int[] repetitionCount = result.first; + // repetition count = 0 means allele is not a tandem expansion of context + if (repetitionCount[0] == 0 || repetitionCount[1] == 0) + return null; + + if (lengths.size() == 0) { + lengths.add(repetitionCount[0]); // add ref allele length only once + } + lengths.add(repetitionCount[1]); // add this alt allele's length + + repeatUnit = result.second; + if (VERBOSE) { + System.out.println("RefContext:"+refBasesStartingAtVCWithoutPad); + System.out.println("Ref:"+ref.toString()+" Count:" + String.valueOf(repetitionCount[0])); + System.out.println("Allele:"+allele.toString()+" Count:" + String.valueOf(repetitionCount[1])); + System.out.println("RU:"+new String(repeatUnit)); + } + } + + return new Pair, byte[]>(lengths,repeatUnit); + } + + protected static Pair getNumTandemRepeatUnits(final byte[] refBases, final byte[] altBases, final byte[] remainingRefContext) { + /* we can't exactly apply same logic as in basesAreRepeated() to compute tandem unit and number of repeated units. + Consider case where ref =ATATAT and we have an insertion of ATAT. Natural description is (AT)3 -> (AT)5. + */ + + byte[] longB; + // find first repeat unit based on either ref or alt, whichever is longer + if (altBases.length > refBases.length) + longB = altBases; + else + longB = refBases; + + // see if non-null allele (either ref or alt, whichever is longer) can be decomposed into several identical tandem units + // for example, -*,CACA needs to first be decomposed into (CA)2 + final int repeatUnitLength = findRepeatedSubstring(longB); + final byte[] repeatUnit = Arrays.copyOf(longB, repeatUnitLength); + + final int[] repetitionCount = new int[2]; +// repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext)); +// repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext)); + int repetitionsInRef = findNumberofRepetitions(repeatUnit,refBases); + repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext))-repetitionsInRef; + repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext))-repetitionsInRef; + + return new Pair(repetitionCount, repeatUnit); + + } + + /** + * Find out if a string can be represented as a tandem number of substrings. + * For example ACTACT is a 2-tandem of ACT, + * but ACTACA is not. + * + * @param bases String to be tested + * @return Length of repeat unit, if string can be represented as tandem of substring (if it can't + * be represented as one, it will be just the length of the input string) + */ + protected static int findRepeatedSubstring(byte[] bases) { + + int repLength; + for (repLength=1; repLength <=bases.length; repLength++) { + final byte[] candidateRepeatUnit = Arrays.copyOf(bases,repLength); + boolean allBasesMatch = true; + for (int start = repLength; start < bases.length; start += repLength ) { + // check that remaining of string is exactly equal to repeat unit + final byte[] basePiece = Arrays.copyOfRange(bases,start,start+candidateRepeatUnit.length); + if (!Arrays.equals(candidateRepeatUnit, basePiece)) { + allBasesMatch = false; + break; + } + } + if (allBasesMatch) + return repLength; + } + + return repLength; + } + + /** + * Helper routine that finds number of repetitions a string consists of. + * For example, for string ATAT and repeat unit AT, number of repetitions = 2 + * @param repeatUnit Substring + * @param testString String to test + * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's + */ + protected static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString) { + int numRepeats = 0; + for (int start = 0; start < testString.length; start += repeatUnit.length) { + int end = start + repeatUnit.length; + byte[] unit = Arrays.copyOfRange(testString,start, end); + if(Arrays.equals(unit,repeatUnit)) + numRepeats++; + else + return numRepeats; + } + return numRepeats; + } + /** * Helper function for isTandemRepeat that checks that allele matches somewhere on the reference * @param ref diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java index 0a7427df7..3c4f01dd6 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java @@ -7,6 +7,7 @@ package org.broadinstitute.sting.utils.variantcontext; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.collections.Pair; import org.testng.annotations.BeforeSuite; import org.testng.annotations.BeforeTest; import org.testng.annotations.BeforeMethod; @@ -467,6 +468,78 @@ public class VariantContextUnitTest extends BaseTest { Assert.assertNotNull(vc.getFiltersMaybeNull()); } + @Test + public void testRepeatAllele() { + Allele nullR = Allele.create(Allele.NULL_ALLELE_STRING, true); + Allele nullA = Allele.create(Allele.NULL_ALLELE_STRING, false); + Allele atc = Allele.create("ATC", false); + Allele atcatc = Allele.create("ATCATC", false); + Allele ccccR = Allele.create("CCCC", true); + Allele cc = Allele.create("CC", false); + Allele cccccc = Allele.create("CCCCCC", false); + Allele gagaR = Allele.create("GAGA", true); + Allele gagagaga = Allele.create("GAGAGAGA", false); + + Pair,byte[]> result; + byte[] refBytes = "TATCATCATCGGA".getBytes(); + + Assert.assertEquals(VariantContextUtils.findNumberofRepetitions( "ATG".getBytes(), "ATGATGATGATG".getBytes()),4); + Assert.assertEquals(VariantContextUtils.findNumberofRepetitions( "G".getBytes(), "ATGATGATGATG".getBytes()),0); + Assert.assertEquals(VariantContextUtils.findNumberofRepetitions( "T".getBytes(), "T".getBytes()),1); + Assert.assertEquals(VariantContextUtils.findNumberofRepetitions( "AT".getBytes(), "ATGATGATCATG".getBytes()),1); + Assert.assertEquals(VariantContextUtils.findNumberofRepetitions( "CCC".getBytes(), "CCCCCCCC".getBytes()),2); + + Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("ATG".getBytes()),3); + Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("AAA".getBytes()),1); + Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("CACACAC".getBytes()),7); + Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("CACACA".getBytes()),2); + Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("CATGCATG".getBytes()),4); + Assert.assertEquals(VariantContextUtils.findRepeatedSubstring("AATAATA".getBytes()),7); + + + // -*,ATC, context = ATC ATC ATC : (ATC)3 -> (ATC)4 + VariantContext vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(nullR,atc)).make(); + result = VariantContextUtils.getNumTandemRepeatUnits(vc,refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],3); + Assert.assertEquals(result.getFirst().toArray()[1],4); + Assert.assertEquals(result.getSecond().length,3); + + // ATC*,-,ATCATC + vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(ATCref,nullA,atcatc)).make(); + result = VariantContextUtils.getNumTandemRepeatUnits(vc,refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],3); + Assert.assertEquals(result.getFirst().toArray()[1],2); + Assert.assertEquals(result.getFirst().toArray()[2],4); + Assert.assertEquals(result.getSecond().length,3); + + // simple non-tandem deletion: CCCC*, - + refBytes = "TCCCCCCCCATG".getBytes(); + vc = new VariantContextBuilder("foo", delLoc, 10, 14, Arrays.asList(ccccR,nullA)).make(); + result = VariantContextUtils.getNumTandemRepeatUnits(vc,refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],8); + Assert.assertEquals(result.getFirst().toArray()[1],4); + Assert.assertEquals(result.getSecond().length,1); + + // CCCC*,CC,-,CCCCCC, context = CCC: (C)7 -> (C)5,(C)3,(C)9 + refBytes = "TCCCCCCCAGAGAGAG".getBytes(); + vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(ccccR,cc, nullA,cccccc)).make(); + result = VariantContextUtils.getNumTandemRepeatUnits(vc,refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],7); + Assert.assertEquals(result.getFirst().toArray()[1],5); + Assert.assertEquals(result.getFirst().toArray()[2],3); + Assert.assertEquals(result.getFirst().toArray()[3],9); + Assert.assertEquals(result.getSecond().length,1); + + // GAGA*,-,GAGAGAGA + refBytes = "TGAGAGAGAGATTT".getBytes(); + vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(gagaR, nullA,gagagaga)).make(); + result = VariantContextUtils.getNumTandemRepeatUnits(vc,refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],5); + Assert.assertEquals(result.getFirst().toArray()[1],3); + Assert.assertEquals(result.getFirst().toArray()[2],7); + Assert.assertEquals(result.getSecond().length,2); + + } @Test public void testGetGenotypeCounts() { List alleles = Arrays.asList(Aref, T);