From 3a9a78c7850f13fc554b1ea14047852ab92e305f Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 13 Aug 2014 10:28:12 -0400 Subject: [PATCH] Removing an assumption that ADs were in the same order if the number of alleles matched. That assumption fails, for example, when one sample is C->T and another sample is C->G. --- .../covariates/RepeatCovariate.java | 6 ++-- .../GenotypeGVCFsIntegrationTest.java | 6 ++-- .../RepeatCovariatesUnitTest.java | 36 +++++++++---------- .../variant/GATKVariantContextUtils.java | 21 +++++------ .../GATKVariantContextUtilsUnitTest.java | 10 +++--- 5 files changed, 38 insertions(+), 41 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatCovariate.java index 1e8a5fbdf..bf82832f6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatCovariate.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatCovariate.java @@ -119,7 +119,7 @@ public abstract class RepeatCovariate implements ExperimentalCovariate { // get backward repeat unit and # repeats byte[] backwardRepeatUnit = Arrays.copyOfRange(readBases, offset - str + 1, offset + 1); - maxBW = GATKVariantContextUtils.findNumberofRepetitions(backwardRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); + maxBW = GATKVariantContextUtils.findNumberOfRepetitions(backwardRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); if (maxBW > 1) { bestBWRepeatUnit = backwardRepeatUnit.clone(); break; @@ -139,7 +139,7 @@ public abstract class RepeatCovariate implements ExperimentalCovariate { // get forward repeat unit and # repeats byte[] forwardRepeatUnit = Arrays.copyOfRange(readBases, offset +1, offset+str+1); - maxFW = 
GATKVariantContextUtils.findNumberofRepetitions(forwardRepeatUnit, Arrays.copyOfRange(readBases, offset + 1, readBases.length), true); + maxFW = GATKVariantContextUtils.findNumberOfRepetitions(forwardRepeatUnit, Arrays.copyOfRange(readBases, offset + 1, readBases.length), true); if (maxFW > 1) { bestFWRepeatUnit = forwardRepeatUnit.clone(); break; @@ -157,7 +157,7 @@ public abstract class RepeatCovariate implements ExperimentalCovariate { // but correct representation at that place might be (C)4. // Hence, if the FW and BW units don't match, check if BW unit can still be a part of FW unit and add // representations to total - maxBW = GATKVariantContextUtils.findNumberofRepetitions(bestFWRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); + maxBW = GATKVariantContextUtils.findNumberOfRepetitions(bestFWRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); maxRL = maxFW + maxBW; bestRepeatUnit = bestFWRepeatUnit; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index 07392ffdd..158533450 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -69,7 +69,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-20,000,000", b37KGReference), 1, - Arrays.asList("5487ad609548c30e79a431115dc772ba")); + Arrays.asList("9d9ddeb831e5512c5b1084ee22e65459")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -94,7 +94,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " 
+ privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-20,000,000", b37KGReference), 1, - Arrays.asList("f7650a8a861dec3138848bb972929002")); + Arrays.asList("aa0f9604bb496be143a6dde775e157fe")); executeTest("combineSingleSamplePipelineGVCFHierarchical", spec); } @@ -106,7 +106,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference), 1, - Arrays.asList("df5a6a574c48c243fad5b44f34343fe3")); + Arrays.asList("49f8ff728246d08cd20cd1c1521651f9")); executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RepeatCovariatesUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RepeatCovariatesUnitTest.java index 8c7348593..62cdfc3a3 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RepeatCovariatesUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RepeatCovariatesUnitTest.java @@ -92,38 +92,38 @@ public class RepeatCovariatesUnitTest { @Test public void testFindNumberOfRepetitions() { // First, test logic to compute number of repetitions of a substring on a given string. 
- int result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACAC".getBytes(), true); + int result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACAC".getBytes(), true); Assert.assertEquals(2,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACAC".getBytes(), true); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACAC".getBytes(), true); Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), true); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), true); Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), true); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), true); Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), true); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), true); Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), true); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), true); Assert.assertEquals(1,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), true); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), true); Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), true); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), 
"GTAGGGTGTACACACACGCAGCAGCA".getBytes(), true); Assert.assertEquals(0,result); // Same tests but looking backward on string - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACAC".getBytes(), false); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACAC".getBytes(), false); Assert.assertEquals(2,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACAC".getBytes(), false); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACAC".getBytes(), false); Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), false); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), false); Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), false); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), false); Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), false); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), false); Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), false); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), false); Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), false); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), false); Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberofRepetitions("GCA".getBytes(), 
"GTAGGGTGTACACACACGCAGCAGCA".getBytes(), false); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), false); Assert.assertEquals(3,result); // test logic to get repeat unit and number of repeats from covariate value @@ -211,8 +211,8 @@ public class RepeatCovariatesUnitTest { Assert.assertEquals(rurlValM,rurlValI); - int fw = GATKVariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(offset+1,readLength).getBytes(),true); - int bw = GATKVariantContextUtils.findNumberofRepetitions(ruValM.getBytes(), readBases.substring(0,offset+1).getBytes(),false); + int fw = GATKVariantContextUtils.findNumberOfRepetitions(ruValM.getBytes(), readBases.substring(offset + 1, readLength).getBytes(), true); + int bw = GATKVariantContextUtils.findNumberOfRepetitions(ruValM.getBytes(), readBases.substring(0, offset + 1).getBytes(), false); Assert.assertEquals(Math.min(fw+bw,RAC.MAX_REPEAT_LENGTH),(int)Integer.valueOf(rlValM)); } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java index e746566ee..97cbebd09 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java @@ -347,9 +347,9 @@ public class GATKVariantContextUtils { final int[] repetitionCount = new int[2]; // look for repetitions forward on the ref bases (i.e. 
starting at beginning of ref bases) - int repetitionsInRef = findNumberofRepetitions(repeatUnit,refBases, true); - repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef; - repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef; + int repetitionsInRef = findNumberOfRepetitions(repeatUnit, refBases, true); + repetitionCount[0] = findNumberOfRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef; + repetitionCount[1] = findNumberOfRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef; return new Pair<>(repetitionCount, repeatUnit); @@ -393,7 +393,7 @@ public class GATKVariantContextUtils { * @oaram lookForward Look for repetitions forward (at beginning of string) or backward (at end of string) * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's */ - public static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString, boolean lookForward) { + public static int findNumberOfRepetitions(byte[] repeatUnit, byte[] testString, boolean lookForward) { int numRepeats = 0; if (lookForward) { // look forward on the test string @@ -891,7 +891,7 @@ public class GATKVariantContextUtils { final String name = first.getSource(); final Allele refAllele = determineReferenceAllele(VCs); - final Set alleles = new LinkedHashSet<>(); + final LinkedHashSet alleles = new LinkedHashSet<>(); final Set filters = new HashSet<>(); final Map attributes = new LinkedHashMap<>(); final Set inconsistentAttributes = new HashSet<>(); @@ -1159,7 +1159,7 @@ public class GATKVariantContextUtils { final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(allelesList) .chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(allelesList, loc.getStart(), loc.getStart()) - 
.genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to regenotype later + .genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to re-genotype later return builder.make(); } @@ -1289,7 +1289,7 @@ public class GATKVariantContextUtils { return result; } - public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) { + public static GenotypesContext stripPLsAndAD(final GenotypesContext genotypes) { final GenotypesContext newGs = GenotypesContext.create(genotypes.size()); for ( final Genotype g : genotypes ) { @@ -1430,7 +1430,7 @@ public class GATKVariantContextUtils { return loc == null || loc.getStart() == vc.getStart(); } - static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc, final Set allAlleles) { + static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc, final LinkedHashSet allAlleles) { if ( refAllele.equals(vc.getReference()) ) return new AlleleMapper(vc); else { @@ -1606,7 +1606,7 @@ public class GATKVariantContextUtils { // create the index mapping, using the allele whenever such a mapping doesn't exist for ( int i = 1; i < targetAlleles.size(); i++ ) { final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i)); - indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfGenericAlt: indexOfRemappedAllele; + indexMapping[i] = indexOfRemappedAllele == -1 ? 
indexOfGenericAlt : indexOfRemappedAllele; } return indexMapping; @@ -1656,9 +1656,6 @@ public class GATKVariantContextUtils { if ( originalAD == null || indexesOfRelevantAlleles == null ) throw new IllegalArgumentException("The list of input AD values and alleles must not be null"); final int numADs = indexesOfRelevantAlleles.length; - if ( numADs == originalAD.length ) - return originalAD; - final int[] newAD = new int[numADs]; for ( int i = 0; i < numADs; i++ ) { diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java index 53ac7b7b1..c67c0350c 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -858,11 +858,11 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { Pair,byte[]> result; byte[] refBytes = "TATCATCATCGGA".getBytes(); - Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("ATG".getBytes(), "ATGATGATGATG".getBytes(), true),4); - Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("G".getBytes(), "ATGATGATGATG".getBytes(), true),0); - Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("T".getBytes(), "T".getBytes(), true),1); - Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("AT".getBytes(), "ATGATGATCATG".getBytes(), true),1); - Assert.assertEquals(GATKVariantContextUtils.findNumberofRepetitions("CCC".getBytes(), "CCCCCCCC".getBytes(), true),2); + Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("ATG".getBytes(), "ATGATGATGATG".getBytes(), true),4); + Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("G".getBytes(), "ATGATGATGATG".getBytes(), true),0); + 
Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("T".getBytes(), "T".getBytes(), true),1); + Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("AT".getBytes(), "ATGATGATCATG".getBytes(), true),1); + Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("CCC".getBytes(), "CCCCCCCC".getBytes(), true),2); Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("ATG".getBytes()),3); Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("AAA".getBytes()),1);