Unit tests for repeat covariates: generate 100 random reads consisting of tandem repeat units of random content and size, and check that covariates match expected values at all positions in reads.

Fixed corner case where value of covariate at border between 2 tandem repeats of different length/content wasn't consistent
This commit is contained in:
Guillermo del Angel 2013-01-29 15:23:02 -05:00
parent c11197e361
commit 1d5b29e764
2 changed files with 63 additions and 42 deletions

View File

@ -61,8 +61,8 @@ import java.util.Map;
import java.util.Set;
public abstract class RepeatCovariate implements ExperimentalCovariate {
final int MAX_REPEAT_LENGTH = 20;
final int MAX_STR_UNIT_LENGTH = 8;
public static final int MAX_REPEAT_LENGTH = 20;
public static final int MAX_STR_UNIT_LENGTH = 8;
private final HashMap<String, Integer> repeatLookupTable = new HashMap<String, Integer>();
private final HashMap<Integer, String> repeatReverseLookupTable = new HashMap<Integer, String>();
private int nextId = 0;
@ -144,25 +144,17 @@ public abstract class RepeatCovariate implements ExperimentalCovariate {
bestRepeatUnit = bestFWRepeatUnit; // arbitrary
}
else {
// tandem repeat starting forward from current offset
maxRL = maxFW;
// tandem repeat starting forward from current offset.
// It could be the case that best BW unit was differnet from FW unit, but that BW still contains FW unit.
// For example, TTCTT(C) CCC - at (C) place, best BW unit is (TTC)2, best FW unit is (C)3.
// but correct representation at that place might be (C)4.
// Hence, if the FW and BW units don't match, check if BW unit can still be a part of FW unit and add
// representations to total
maxBW = VariantContextUtils.findNumberofRepetitions(bestFWRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false);
maxRL = maxFW + maxBW;
bestRepeatUnit = bestFWRepeatUnit;
}
/* if (maxFW > maxBW) {
// tandem repeat starting forward from current offset
maxRL = maxFW;
bestRepeatUnit = bestFWRepeatUnit;
}
else if (maxFW < maxBW) {
maxRL = maxBW;
bestRepeatUnit = bestBWRepeatUnit;
} */
/* else {
// maxFW = maxBW but repeat units different: not in a tandem repeat.
maxRL = 1;
bestRepeatUnit = bestBWRepeatUnit; // arbitrary
} */
}

View File

@ -49,7 +49,7 @@ import com.google.java.contract.Requires;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate;
import org.broadinstitute.sting.utils.recalibration.covariates.*;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -65,9 +65,11 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Random;
public class RepeatLengthCovariateUnitTest {
public class RepeatCovariatesUnitTest {
RepeatLengthCovariate covariate;
RepeatLengthCovariate rlCovariate;
RepeatUnitCovariate ruCovariate;
RepeatUnitAndLengthCovariate rurlCovariate;
RecalibrationArgumentCollection RAC;
@ -75,8 +77,12 @@ public class RepeatLengthCovariateUnitTest {
@BeforeClass
public void init() {
RAC = new RecalibrationArgumentCollection();
covariate = new RepeatLengthCovariate();
covariate.initialize(RAC);
rlCovariate = new RepeatLengthCovariate();
ruCovariate = new RepeatUnitCovariate();
rurlCovariate = new RepeatUnitAndLengthCovariate();
rlCovariate.initialize(RAC);
ruCovariate.initialize(RAC);
rurlCovariate.initialize(RAC);
}
@ -136,9 +142,9 @@ public class RepeatLengthCovariateUnitTest {
@Test(enabled = true)
public void testManyObservations() {
final int NUM_UNITS = 10;
final int MAX_REPEAT_UNIT_LENGTH = 6;
final int MAX_NUM_REPETITIONS = 5;
final int NUM_TEST_CASES = 1;
final int MAX_REPEAT_UNIT_LENGTH = RepeatCovariate.MAX_STR_UNIT_LENGTH;
final int MAX_NUM_REPETITIONS = RepeatCovariate.MAX_REPEAT_LENGTH;
final int NUM_TEST_CASES = 100;
Random random = new Random();
@ -161,29 +167,52 @@ public class RepeatLengthCovariateUnitTest {
}
// final String readBases = sb.toString();
final String readBases = "TTTTTTTTAATGGGGGAGGGAGGGAGACTTACTTACTTACTTACTTATCGGAATCGGATATATAACGGCTTTCTTCTTCCCCCCA";
final String readBases = sb.toString();
System.out.println(readBases);
final int readLength = readBases.length();
final byte[] readQuals = new byte[readBases.length()];
final byte[] readQuals = new byte[readLength];
Arrays.fill(readQuals,(byte)30);
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(),readQuals,readBases.length()+"M");
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(),readQuals,readLength+"M");
Covariate[] requestedCovariates = new Covariate[3];
requestedCovariates[0] = rlCovariate;
requestedCovariates[1] = ruCovariate;
requestedCovariates[2] = rurlCovariate;
ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates);
// check that the length is correct
Assert.assertEquals(rc.getMismatchesKeySet().length, readLength);
Assert.assertEquals(rc.getInsertionsKeySet().length, readLength);
Assert.assertEquals(rc.getDeletionsKeySet().length, readLength);
final ReadCovariates values = new ReadCovariates(read.getReadLength(),1);
covariate.recordValues( read, values);
final int fullReadKeySet[][] = values.getKeySet(EventType.BASE_SUBSTITUTION);
for (int offset = 0; offset < readBases.length(); offset++) { // recalibrate all bases in the read
final int[] keySet = fullReadKeySet[offset];
final String val = covariate.formatKey(keySet[0]);
System.out.format("offset %d:%s\n", offset, val);
// final String repeatString = RepeatLengthCovariate.getBasesFromRUandNR(val);
// final String partialRead = readBases.substring(0,offset+1);
// Assert.assertTrue(partialRead.endsWith(repeatString));
}
//values.
int k=0;
// check RepeatLength
final String rlValM = rlCovariate.formatKey(rc.getMismatchesKeySet(offset)[0]);
final String rlValI = rlCovariate.formatKey(rc.getInsertionsKeySet(offset)[0]);
final String rlValD = rlCovariate.formatKey(rc.getDeletionsKeySet(offset)[0]);
// check RepeatUnit
final String ruValM = ruCovariate.formatKey(rc.getMismatchesKeySet(offset)[1]);
final String ruValI = ruCovariate.formatKey(rc.getInsertionsKeySet(offset)[1]);
final String ruValD = ruCovariate.formatKey(rc.getDeletionsKeySet(offset)[1]);
// check RepeatUnitAndLength
final String rurlValM = rurlCovariate.formatKey(rc.getMismatchesKeySet(offset)[2]);
final String rurlValI = rurlCovariate.formatKey(rc.getInsertionsKeySet(offset)[2]);
final String rurlValD = rurlCovariate.formatKey(rc.getDeletionsKeySet(offset)[2]);
// check all 3 values are identical
Assert.assertEquals(rlValD,rlValI);
Assert.assertEquals(rlValM,rlValI);
Assert.assertEquals(ruValD,ruValI);
Assert.assertEquals(ruValM,ruValI);
Assert.assertEquals(rurlValD,rurlValI);
Assert.assertEquals(rurlValM,rurlValI);
int fw = VariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(offset+1,readLength).getBytes(),true);
int bw = VariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(0,offset+1).getBytes(),false);
Assert.assertEquals(Math.min(fw+bw,RepeatCovariate.MAX_REPEAT_LENGTH),(int)Integer.valueOf(rlValM));
}
}