Remove the assumption that ADs (allelic depths) are in the same order whenever the number of alleles matches. The assumption fails, for example, when one sample is C->T and another sample is C->G: the allele counts match, but the alternate alleles differ.

This commit is contained in:
Ryan Poplin 2014-08-13 10:28:12 -04:00
parent c6a96c3958
commit 3a9a78c785
5 changed files with 38 additions and 41 deletions

View File

@ -119,7 +119,7 @@ public abstract class RepeatCovariate implements ExperimentalCovariate {
// get backward repeat unit and # repeats
byte[] backwardRepeatUnit = Arrays.copyOfRange(readBases, offset - str + 1, offset + 1);
maxBW = GATKVariantContextUtils.findNumberofRepetitions(backwardRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false);
maxBW = GATKVariantContextUtils.findNumberOfRepetitions(backwardRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false);
if (maxBW > 1) {
bestBWRepeatUnit = backwardRepeatUnit.clone();
break;
@ -139,7 +139,7 @@ public abstract class RepeatCovariate implements ExperimentalCovariate {
// get forward repeat unit and # repeats
byte[] forwardRepeatUnit = Arrays.copyOfRange(readBases, offset +1, offset+str+1);
maxFW = GATKVariantContextUtils.findNumberofRepetitions(forwardRepeatUnit, Arrays.copyOfRange(readBases, offset + 1, readBases.length), true);
maxFW = GATKVariantContextUtils.findNumberOfRepetitions(forwardRepeatUnit, Arrays.copyOfRange(readBases, offset + 1, readBases.length), true);
if (maxFW > 1) {
bestFWRepeatUnit = forwardRepeatUnit.clone();
break;
@ -157,7 +157,7 @@ public abstract class RepeatCovariate implements ExperimentalCovariate {
// but correct representation at that place might be (C)4.
// Hence, if the FW and BW units don't match, check if BW unit can still be a part of FW unit and add
// representations to total
maxBW = GATKVariantContextUtils.findNumberofRepetitions(bestFWRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false);
maxBW = GATKVariantContextUtils.findNumberOfRepetitions(bestFWRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false);
maxRL = maxFW + maxBW;
bestRepeatUnit = bestFWRepeatUnit;

View File

@ -69,7 +69,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-20,000,000", b37KGReference),
1,
Arrays.asList("5487ad609548c30e79a431115dc772ba"));
Arrays.asList("9d9ddeb831e5512c5b1084ee22e65459"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -94,7 +94,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-20,000,000", b37KGReference),
1,
Arrays.asList("f7650a8a861dec3138848bb972929002"));
Arrays.asList("aa0f9604bb496be143a6dde775e157fe"));
executeTest("combineSingleSamplePipelineGVCFHierarchical", spec);
}
@ -106,7 +106,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference),
1,
Arrays.asList("df5a6a574c48c243fad5b44f34343fe3"));
Arrays.asList("49f8ff728246d08cd20cd1c1521651f9"));
executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec);
}

View File

@ -92,38 +92,38 @@ public class RepeatCovariatesUnitTest {
@Test
public void testFindNumberOfRepetitions() {
// First, test logic to compute number of repetitions of a substring on a given string.
int result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACAC".getBytes(), true);
int result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACAC".getBytes(), true);
Assert.assertEquals(2,result);
result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACAC".getBytes(), true);
result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACAC".getBytes(), true);
Assert.assertEquals(4,result);
result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), true);
result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), true);
Assert.assertEquals(4,result);
result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), true);
result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), true);
Assert.assertEquals(0,result);
result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), true);
result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), true);
Assert.assertEquals(0,result);
result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), true);
result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), true);
Assert.assertEquals(1,result);
result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), true);
result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), true);
Assert.assertEquals(0,result);
result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), true);
result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), true);
Assert.assertEquals(0,result);
// Same tests but looking backward on string
result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACAC".getBytes(), false);
result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACAC".getBytes(), false);
Assert.assertEquals(2,result);
result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACAC".getBytes(), false);
result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACAC".getBytes(), false);
Assert.assertEquals(4,result);
result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), false);
result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), false);
Assert.assertEquals(0,result);
result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), false);
result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), false);
Assert.assertEquals(4,result);
result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), false);
result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), false);
Assert.assertEquals(0,result);
result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), false);
result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), false);
Assert.assertEquals(0,result);
result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), false);
result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), false);
Assert.assertEquals(0,result);
result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), false);
result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), false);
Assert.assertEquals(3,result);
// test logic to get repeat unit and number of repeats from covariate value
@ -211,8 +211,8 @@ public class RepeatCovariatesUnitTest {
Assert.assertEquals(rurlValM,rurlValI);
int fw = GATKVariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(offset+1,readLength).getBytes(),true);
int bw = GATKVariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(0,offset+1).getBytes(),false);
int fw = GATKVariantContextUtils.findNumberOfRepetitions(ruValM.getBytes(), readBases.substring(offset + 1, readLength).getBytes(), true);
int bw = GATKVariantContextUtils.findNumberOfRepetitions(ruValM.getBytes(), readBases.substring(0, offset + 1).getBytes(), false);
Assert.assertEquals(Math.min(fw+bw,RAC.MAX_REPEAT_LENGTH),(int)Integer.valueOf(rlValM));
}

View File

@ -347,9 +347,9 @@ public class GATKVariantContextUtils {
final int[] repetitionCount = new int[2];
// look for repetitions forward on the ref bases (i.e. starting at beginning of ref bases)
int repetitionsInRef = findNumberofRepetitions(repeatUnit,refBases, true);
repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef;
repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef;
int repetitionsInRef = findNumberOfRepetitions(repeatUnit, refBases, true);
repetitionCount[0] = findNumberOfRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef;
repetitionCount[1] = findNumberOfRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef;
return new Pair<>(repetitionCount, repeatUnit);
@ -393,7 +393,7 @@ public class GATKVariantContextUtils {
* @param lookForward Look for repetitions forward (at beginning of string) or backward (at end of string)
* @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's)
*/
public static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString, boolean lookForward) {
public static int findNumberOfRepetitions(byte[] repeatUnit, byte[] testString, boolean lookForward) {
int numRepeats = 0;
if (lookForward) {
// look forward on the test string
@ -891,7 +891,7 @@ public class GATKVariantContextUtils {
final String name = first.getSource();
final Allele refAllele = determineReferenceAllele(VCs);
final Set<Allele> alleles = new LinkedHashSet<>();
final LinkedHashSet<Allele> alleles = new LinkedHashSet<>();
final Set<String> filters = new HashSet<>();
final Map<String, Object> attributes = new LinkedHashMap<>();
final Set<String> inconsistentAttributes = new HashSet<>();
@ -1159,7 +1159,7 @@ public class GATKVariantContextUtils {
final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(allelesList)
.chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(allelesList, loc.getStart(), loc.getStart())
.genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to regenotype later
.genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to re-genotype later
return builder.make();
}
@ -1289,7 +1289,7 @@ public class GATKVariantContextUtils {
return result;
}
public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) {
public static GenotypesContext stripPLsAndAD(final GenotypesContext genotypes) {
final GenotypesContext newGs = GenotypesContext.create(genotypes.size());
for ( final Genotype g : genotypes ) {
@ -1430,7 +1430,7 @@ public class GATKVariantContextUtils {
return loc == null || loc.getStart() == vc.getStart();
}
static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc, final Set<Allele> allAlleles) {
static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc, final LinkedHashSet<Allele> allAlleles) {
if ( refAllele.equals(vc.getReference()) )
return new AlleleMapper(vc);
else {
@ -1606,7 +1606,7 @@ public class GATKVariantContextUtils {
// create the index mapping, using the <ALT> allele whenever such a mapping doesn't exist
for ( int i = 1; i < targetAlleles.size(); i++ ) {
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i));
indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfGenericAlt: indexOfRemappedAllele;
indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfGenericAlt : indexOfRemappedAllele;
}
return indexMapping;
@ -1656,9 +1656,6 @@ public class GATKVariantContextUtils {
if ( originalAD == null || indexesOfRelevantAlleles == null ) throw new IllegalArgumentException("The list of input AD values and alleles must not be null");
final int numADs = indexesOfRelevantAlleles.length;
if ( numADs == originalAD.length )
return originalAD;
final int[] newAD = new int[numADs];
for ( int i = 0; i < numADs; i++ ) {

View File

@ -858,11 +858,11 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
Pair<List<Integer>,byte[]> result;
byte[] refBytes = "TATCATCATCGGA".getBytes();
Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("ATG".getBytes(), "ATGATGATGATG".getBytes(), true),4);
Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("G".getBytes(), "ATGATGATGATG".getBytes(), true),0);
Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("T".getBytes(), "T".getBytes(), true),1);
Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("AT".getBytes(), "ATGATGATCATG".getBytes(), true),1);
Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("CCC".getBytes(), "CCCCCCCC".getBytes(), true),2);
Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("ATG".getBytes(), "ATGATGATGATG".getBytes(), true),4);
Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("G".getBytes(), "ATGATGATGATG".getBytes(), true),0);
Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("T".getBytes(), "T".getBytes(), true),1);
Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("AT".getBytes(), "ATGATGATCATG".getBytes(), true),1);
Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("CCC".getBytes(), "CCCCCCCC".getBytes(), true),2);
Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("ATG".getBytes()),3);
Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("AAA".getBytes()),1);