diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java index d500a8ef9..a836fbb5e 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java @@ -61,8 +61,8 @@ import java.util.Map; import java.util.Set; public abstract class RepeatCovariate implements ExperimentalCovariate { - final int MAX_REPEAT_LENGTH = 20; - final int MAX_STR_UNIT_LENGTH = 8; + public static final int MAX_REPEAT_LENGTH = 20; + public static final int MAX_STR_UNIT_LENGTH = 8; private final HashMap repeatLookupTable = new HashMap(); private final HashMap repeatReverseLookupTable = new HashMap(); private int nextId = 0; @@ -144,25 +144,17 @@ public abstract class RepeatCovariate implements ExperimentalCovariate { bestRepeatUnit = bestFWRepeatUnit; // arbitrary } else { - // tandem repeat starting forward from current offset - maxRL = maxFW; + // tandem repeat starting forward from current offset. + // It could be the case that best BW unit was differnet from FW unit, but that BW still contains FW unit. + // For example, TTCTT(C) CCC - at (C) place, best BW unit is (TTC)2, best FW unit is (C)3. + // but correct representation at that place might be (C)4. + // Hence, if the FW and BW units don't match, check if BW unit can still be a part of FW unit and add + // representations to total + maxBW = VariantContextUtils.findNumberofRepetitions(bestFWRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); + maxRL = maxFW + maxBW; bestRepeatUnit = bestFWRepeatUnit; } -/* if (maxFW > maxBW) { - // tandem repeat starting forward from current offset - maxRL = maxFW; - bestRepeatUnit = bestFWRepeatUnit; - } - else if (maxFW < maxBW) { - maxRL = maxBW; - bestRepeatUnit = bestBWRepeatUnit; - } */ - /* else { - // maxFW = maxBW but repeat units different: not in a tandem repeat. - maxRL = 1; - bestRepeatUnit = bestBWRepeatUnit; // arbitrary - } */ } diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatLengthCovariateUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java similarity index 81% rename from protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatLengthCovariateUnitTest.java rename to protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java index d08276147..e4311a534 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatLengthCovariateUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java @@ -49,7 +49,7 @@ import com.google.java.contract.Requires; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; +import org.broadinstitute.sting.utils.recalibration.covariates.*; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -65,9 +65,11 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Random; -public class RepeatLengthCovariateUnitTest { +public class RepeatCovariatesUnitTest { - RepeatLengthCovariate covariate; + RepeatLengthCovariate rlCovariate; + RepeatUnitCovariate ruCovariate; + RepeatUnitAndLengthCovariate rurlCovariate; RecalibrationArgumentCollection RAC; @@ -75,8 +77,12 @@ public class RepeatLengthCovariateUnitTest { @BeforeClass public void init() { RAC = new RecalibrationArgumentCollection(); - covariate = new RepeatLengthCovariate(); - covariate.initialize(RAC); + rlCovariate = new RepeatLengthCovariate(); + ruCovariate = new RepeatUnitCovariate(); + rurlCovariate = new RepeatUnitAndLengthCovariate(); + rlCovariate.initialize(RAC); + ruCovariate.initialize(RAC); + rurlCovariate.initialize(RAC); } @@ -136,9 +142,9 @@ public class RepeatLengthCovariateUnitTest { @Test(enabled = true) public void testManyObservations() { final int NUM_UNITS = 10; - final int MAX_REPEAT_UNIT_LENGTH = 6; - final int MAX_NUM_REPETITIONS = 5; - final int NUM_TEST_CASES = 1; + final int MAX_REPEAT_UNIT_LENGTH = RepeatCovariate.MAX_STR_UNIT_LENGTH; + final int MAX_NUM_REPETITIONS = RepeatCovariate.MAX_REPEAT_LENGTH; + final int NUM_TEST_CASES = 100; Random random = new Random(); @@ -161,29 +167,52 @@ public class RepeatLengthCovariateUnitTest { } - // final String readBases = sb.toString(); - final String readBases = "TTTTTTTTAATGGGGGAGGGAGGGAGACTTACTTACTTACTTACTTATCGGAATCGGATATATAACGGCTTTCTTCTTCCCCCCA"; + final String readBases = sb.toString(); System.out.println(readBases); + final int readLength = readBases.length(); - final byte[] readQuals = new byte[readBases.length()]; + final byte[] readQuals = new byte[readLength]; Arrays.fill(readQuals,(byte)30); - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(),readQuals,readBases.length()+"M"); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(),readQuals,readLength+"M"); + + Covariate[] requestedCovariates = new Covariate[3]; + requestedCovariates[0] = rlCovariate; + requestedCovariates[1] = ruCovariate; + requestedCovariates[2] = rurlCovariate; + ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); + + // check that the length is correct + Assert.assertEquals(rc.getMismatchesKeySet().length, readLength); + Assert.assertEquals(rc.getInsertionsKeySet().length, readLength); + Assert.assertEquals(rc.getDeletionsKeySet().length, readLength); - final ReadCovariates values = new ReadCovariates(read.getReadLength(),1); - covariate.recordValues( read, values); - final int fullReadKeySet[][] = values.getKeySet(EventType.BASE_SUBSTITUTION); for (int offset = 0; offset < readBases.length(); offset++) { // recalibrate all bases in the read - final int[] keySet = fullReadKeySet[offset]; - final String val = covariate.formatKey(keySet[0]); - System.out.format("offset %d:%s\n", offset, val); -// final String repeatString = RepeatLengthCovariate.getBasesFromRUandNR(val); -// final String partialRead = readBases.substring(0,offset+1); - // Assert.assertTrue(partialRead.endsWith(repeatString)); - } - //values. - int k=0; + // check RepeatLength + final String rlValM = rlCovariate.formatKey(rc.getMismatchesKeySet(offset)[0]); + final String rlValI = rlCovariate.formatKey(rc.getInsertionsKeySet(offset)[0]); + final String rlValD = rlCovariate.formatKey(rc.getDeletionsKeySet(offset)[0]); + // check RepeatUnit + final String ruValM = ruCovariate.formatKey(rc.getMismatchesKeySet(offset)[1]); + final String ruValI = ruCovariate.formatKey(rc.getInsertionsKeySet(offset)[1]); + final String ruValD = ruCovariate.formatKey(rc.getDeletionsKeySet(offset)[1]); + // check RepeatUnitAndLength + final String rurlValM = rurlCovariate.formatKey(rc.getMismatchesKeySet(offset)[2]); + final String rurlValI = rurlCovariate.formatKey(rc.getInsertionsKeySet(offset)[2]); + final String rurlValD = rurlCovariate.formatKey(rc.getDeletionsKeySet(offset)[2]); + // check all 3 values are identical + Assert.assertEquals(rlValD,rlValI); + Assert.assertEquals(rlValM,rlValI); + Assert.assertEquals(ruValD,ruValI); + Assert.assertEquals(ruValM,ruValI); + Assert.assertEquals(rurlValD,rurlValI); + Assert.assertEquals(rurlValM,rurlValI); + int fw = VariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(offset+1,readLength).getBytes(),true); + int bw = VariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(0,offset+1).getBytes(),false); + Assert.assertEquals(Math.min(fw+bw,RepeatCovariate.MAX_REPEAT_LENGTH),(int)Integer.valueOf(rlValM)); + } + }