From 1d5b29e764662424993a62a1c86bd7aa78759704 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Tue, 29 Jan 2013 15:23:02 -0500 Subject: [PATCH] Unit tests for repeat covariates: generate 100 random reads consisting of tandem repeat units of random content and size, and check that covariates match expected values at all positions in reads. Fixed corner case where value of covariate at border between 2 tandem repeats of different length/content wasn't consistent --- .../covariates/RepeatCovariate.java | 28 +++---- ...est.java => RepeatCovariatesUnitTest.java} | 77 +++++++++++++------ 2 files changed, 63 insertions(+), 42 deletions(-) rename protected/java/test/org/broadinstitute/sting/utils/recalibration/{RepeatLengthCovariateUnitTest.java => RepeatCovariatesUnitTest.java} (81%) diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java index d500a8ef9..a836fbb5e 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java @@ -61,8 +61,8 @@ import java.util.Map; import java.util.Set; public abstract class RepeatCovariate implements ExperimentalCovariate { - final int MAX_REPEAT_LENGTH = 20; - final int MAX_STR_UNIT_LENGTH = 8; + public static final int MAX_REPEAT_LENGTH = 20; + public static final int MAX_STR_UNIT_LENGTH = 8; private final HashMap repeatLookupTable = new HashMap(); private final HashMap repeatReverseLookupTable = new HashMap(); private int nextId = 0; @@ -144,25 +144,17 @@ public abstract class RepeatCovariate implements ExperimentalCovariate { bestRepeatUnit = bestFWRepeatUnit; // arbitrary } else { - // tandem repeat starting forward from current offset - maxRL = maxFW; + // tandem repeat starting forward from current offset. + // It could be the case that best BW unit was differnet from FW unit, but that BW still contains FW unit. + // For example, TTCTT(C) CCC - at (C) place, best BW unit is (TTC)2, best FW unit is (C)3. + // but correct representation at that place might be (C)4. + // Hence, if the FW and BW units don't match, check if BW unit can still be a part of FW unit and add + // representations to total + maxBW = VariantContextUtils.findNumberofRepetitions(bestFWRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); + maxRL = maxFW + maxBW; bestRepeatUnit = bestFWRepeatUnit; } -/* if (maxFW > maxBW) { - // tandem repeat starting forward from current offset - maxRL = maxFW; - bestRepeatUnit = bestFWRepeatUnit; - } - else if (maxFW < maxBW) { - maxRL = maxBW; - bestRepeatUnit = bestBWRepeatUnit; - } */ - /* else { - // maxFW = maxBW but repeat units different: not in a tandem repeat. - maxRL = 1; - bestRepeatUnit = bestBWRepeatUnit; // arbitrary - } */ } diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatLengthCovariateUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java similarity index 81% rename from protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatLengthCovariateUnitTest.java rename to protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java index d08276147..e4311a534 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatLengthCovariateUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RepeatCovariatesUnitTest.java @@ -49,7 +49,7 @@ import com.google.java.contract.Requires; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; +import org.broadinstitute.sting.utils.recalibration.covariates.*; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -65,9 +65,11 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Random; -public class RepeatLengthCovariateUnitTest { +public class RepeatCovariatesUnitTest { - RepeatLengthCovariate covariate; + RepeatLengthCovariate rlCovariate; + RepeatUnitCovariate ruCovariate; + RepeatUnitAndLengthCovariate rurlCovariate; RecalibrationArgumentCollection RAC; @@ -75,8 +77,12 @@ public class RepeatLengthCovariateUnitTest { @BeforeClass public void init() { RAC = new RecalibrationArgumentCollection(); - covariate = new RepeatLengthCovariate(); - covariate.initialize(RAC); + rlCovariate = new RepeatLengthCovariate(); + ruCovariate = new RepeatUnitCovariate(); + rurlCovariate = new RepeatUnitAndLengthCovariate(); + rlCovariate.initialize(RAC); + ruCovariate.initialize(RAC); + rurlCovariate.initialize(RAC); } @@ -136,9 +142,9 @@ public class RepeatLengthCovariateUnitTest { @Test(enabled = true) public void testManyObservations() { final int NUM_UNITS = 10; - final int MAX_REPEAT_UNIT_LENGTH = 6; - final int MAX_NUM_REPETITIONS = 5; - final int NUM_TEST_CASES = 1; + final int MAX_REPEAT_UNIT_LENGTH = RepeatCovariate.MAX_STR_UNIT_LENGTH; + final int MAX_NUM_REPETITIONS = RepeatCovariate.MAX_REPEAT_LENGTH; + final int NUM_TEST_CASES = 100; Random random = new Random(); @@ -161,29 +167,52 @@ public class RepeatLengthCovariateUnitTest { } - // final String readBases = sb.toString(); - final String readBases = "TTTTTTTTAATGGGGGAGGGAGGGAGACTTACTTACTTACTTACTTATCGGAATCGGATATATAACGGCTTTCTTCTTCCCCCCA"; + final String readBases = sb.toString(); System.out.println(readBases); + final int readLength = readBases.length(); - final byte[] readQuals = new byte[readBases.length()]; + final byte[] readQuals = new byte[readLength]; Arrays.fill(readQuals,(byte)30); - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(),readQuals,readBases.length()+"M"); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(),readQuals,readLength+"M"); + + Covariate[] requestedCovariates = new Covariate[3]; + requestedCovariates[0] = rlCovariate; + requestedCovariates[1] = ruCovariate; + requestedCovariates[2] = rurlCovariate; + ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); + + // check that the length is correct + Assert.assertEquals(rc.getMismatchesKeySet().length, readLength); + Assert.assertEquals(rc.getInsertionsKeySet().length, readLength); + Assert.assertEquals(rc.getDeletionsKeySet().length, readLength); - final ReadCovariates values = new ReadCovariates(read.getReadLength(),1); - covariate.recordValues( read, values); - final int fullReadKeySet[][] = values.getKeySet(EventType.BASE_SUBSTITUTION); for (int offset = 0; offset < readBases.length(); offset++) { // recalibrate all bases in the read - final int[] keySet = fullReadKeySet[offset]; - final String val = covariate.formatKey(keySet[0]); - System.out.format("offset %d:%s\n", offset, val); -// final String repeatString = RepeatLengthCovariate.getBasesFromRUandNR(val); -// final String partialRead = readBases.substring(0,offset+1); - // Assert.assertTrue(partialRead.endsWith(repeatString)); - } - //values. - int k=0; + // check RepeatLength + final String rlValM = rlCovariate.formatKey(rc.getMismatchesKeySet(offset)[0]); + final String rlValI = rlCovariate.formatKey(rc.getInsertionsKeySet(offset)[0]); + final String rlValD = rlCovariate.formatKey(rc.getDeletionsKeySet(offset)[0]); + // check RepeatUnit + final String ruValM = ruCovariate.formatKey(rc.getMismatchesKeySet(offset)[1]); + final String ruValI = ruCovariate.formatKey(rc.getInsertionsKeySet(offset)[1]); + final String ruValD = ruCovariate.formatKey(rc.getDeletionsKeySet(offset)[1]); + // check RepeatUnitAndLength + final String rurlValM = rurlCovariate.formatKey(rc.getMismatchesKeySet(offset)[2]); + final String rurlValI = rurlCovariate.formatKey(rc.getInsertionsKeySet(offset)[2]); + final String rurlValD = rurlCovariate.formatKey(rc.getDeletionsKeySet(offset)[2]); + // check all 3 values are identical + Assert.assertEquals(rlValD,rlValI); + Assert.assertEquals(rlValM,rlValI); + Assert.assertEquals(ruValD,ruValI); + Assert.assertEquals(ruValM,ruValI); + Assert.assertEquals(rurlValD,rurlValI); + Assert.assertEquals(rurlValM,rurlValI); + int fw = VariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(offset+1,readLength).getBytes(),true); + int bw = VariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(0,offset+1).getBytes(),false); + Assert.assertEquals(Math.min(fw+bw,RepeatCovariate.MAX_REPEAT_LENGTH),(int)Integer.valueOf(rlValM)); + } + }