From 556cc691858a482176965bedfd62a3c4f7b5aee9 Mon Sep 17 00:00:00 2001 From: meganshand Date: Fri, 24 Jun 2016 15:31:42 -0400 Subject: [PATCH] Fix for int overflow in RankSum calculation --- ...perGeneralPloidySuite1IntegrationTest.java | 6 +-- ...perGeneralPloidySuite2IntegrationTest.java | 2 +- ...GenotyperNormalCallingIntegrationTest.java | 4 +- .../HaplotypeCallerIntegrationTest.java | 2 +- .../gatk/utils/MannWhitneyU.java | 38 +++++++++++-------- .../broadinstitute/gatk/utils/MWUnitTest.java | 7 ++++ 6 files changed, 36 insertions(+), 23 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java index 6e21e96d1..92fb16387 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -74,12 +74,12 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe @Test(enabled = true) public void testBOTH_GGA_Pools() { - executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "c3826794a250e32b0497353ceb1deb26"); + executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "ee5a2f8954f38d6e5d44fe50b22e43a1"); } @Test(enabled = true) public void testINDEL_GGA_Pools() { - executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "4eb0d8018da6612cd434491f338ed5a4"); + executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "5cb3fe396302f3d4a4a9b7b3cc1877cc"); } @Test(enabled = true) @@ -88,6 +88,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe //TODO the old MD5 is kept for the record. //TODO this should be revisit once we get into addressing inaccuracies by the independent allele approach. // executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "b5ff7530827f4b9039a58bdc8a3560d2"); - executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "c2fb9b05027c2b0ac9e338d9ddda69b1"); + executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "b99416c04ba951577f43fd2d25f46388"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java index 51d7a5a65..4262ab665 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java @@ -63,7 +63,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","e22846de4567f576e08e00edda2931d0"); + executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","1d27eaa3557dc28c95b9024114d50ad1"); } @Test(enabled = true) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index bdd3eb5c0..7348e12b1 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -102,7 +102,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("94ca1e00d4fad9c5279271c2779ff797")); + Arrays.asList("25b710f830749448cd056c9b2e7798ff")); executeTest("test Multiple SNP alleles", spec); } @@ -126,7 +126,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("c5aff2572ce09c413e7f5c9e1b3f92d6")); + Arrays.asList("1759c156bc45528504398a7ef4ce5bf8")); executeTest("test mismatched PLs", spec); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 09722f5ee..747609ace 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -397,7 +397,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testLackSensitivityDueToBadHaplotypeSelectionFix() { final String commandLine = String.format("-T HaplotypeCaller -pairHMMSub %s %s -R %s -I %s -L %s --no_cmdline_in_header --maxNumHaplotypesInPopulation 16", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list"); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("5514cfbcf12954bb12c725b77eaac248")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("22f5a3e9366e611509f03c984f8b4960")); spec.disableShadowBCF(); executeTest("testLackSensitivityDueToBadHaplotypeSelectionFix", spec); } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java index d413f6f79..d475692d4 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java @@ -280,23 +280,9 @@ public class MannWhitneyU { RankedData ranked = calculateRank(series1, series2); Rank[] ranks = ranked.getRank(); ArrayList numOfTies = ranked.getNumOfTies(); + int lengthOfRanks = ranks.length; - //Calculate number of ties transformed for formula for Sigma to calculate Z-score - ArrayList transformedTies = new ArrayList<>(); - for (int count : numOfTies) { - //If every single datapoint is tied then we want to return a p-value of .5 and - //the formula for sigma that includes the number of ties breaks down. Setting - //the number of ties to 0 in this case gets the desired result in the normal - //approximation case. - if (count != ranks.length) { - transformedTies.add((count * count * count) - count); - } - } - - double numOfTiesForSigma = 0.0; - for (int count : transformedTies) { - numOfTiesForSigma += count; - } + double numOfTiesForSigma = transformTies(lengthOfRanks, numOfTies); // Calculate R1 and R2 and U. float r1 = 0, r2 = 0; @@ -314,6 +300,26 @@ public class MannWhitneyU { return result; } + public double transformTies(int numOfRanks, ArrayList numOfTies) { + //Calculate number of ties transformed for formula for Sigma to calculate Z-score + ArrayList transformedTies = new ArrayList<>(); + for (int count : numOfTies) { + //If every single datapoint is tied then we want to return a p-value of .5 and + //the formula for sigma that includes the number of ties breaks down. Setting + //the number of ties to 0 in this case gets the desired result in the normal + //approximation case. + if (count != numOfRanks) { + transformedTies.add((Math.pow(count, 3)) - count); + } + } + + double numOfTiesForSigma = 0.0; + for (double count : transformedTies) { + numOfTiesForSigma += count; + } + + return(numOfTiesForSigma); + } /** * Calculates the rank-sum test statisic U (sometimes W) from two sets of input data for a one-sided test * with an int indicating which group is the dominator. Returns a test statistic object with trueU and number of diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MWUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MWUnitTest.java index a3e781343..d99fd42dd 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MWUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MWUnitTest.java @@ -30,6 +30,7 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import org.testng.Assert; +import java.util.ArrayList; import java.util.Arrays; /** @@ -139,4 +140,10 @@ public class MWUnitTest extends BaseTest { System.out.println("1-side p: " + test.getP()); Assert.assertEquals(test.getZ(), Z, DELTA_PRECISION, name); } + + @Test + public void testTooManyTies(){ + ArrayList listOfNumberOfTies = new ArrayList<>(Arrays.asList(26,3,6,4,13,18,29,36,60,58,87,63,98,125,158,185,193,171,17592,115,100,141,216,298,451,719,1060,1909,3210,5167,7135,10125,11035,3541,732,9)); + Assert.assertEquals(rst.transformTies(64890, listOfNumberOfTies), 8.41378729572e+12); + } }