From c7e0f5b225c2d8fdb378a205038e6632faa059ff Mon Sep 17 00:00:00 2001 From: meganshand Date: Wed, 21 Oct 2015 09:27:52 -0400 Subject: [PATCH] Removes Dithering from Rank Sum Test Fixing empty group case Fixing MD5s First comments addressed Added permutation test Adding new RankSum to AS_RankSum Speeding up permutation algorithm and updating MD5s Missed a few tests Addressing comments Changing md5s --- .../walkers/annotator/AS_RankSumTest.java | 20 +- .../tools/walkers/annotator/RankSumTest.java | 40 +- .../walkers/annotator/RankSumUnitTest.java | 31 +- .../VariantAnnotatorIntegrationTest.java | 13 +- ...perGeneralPloidySuite1IntegrationTest.java | 6 +- ...perGeneralPloidySuite2IntegrationTest.java | 6 +- ...dGenotyperIndelCallingIntegrationTest.java | 16 +- .../UnifiedGenotyperIntegrationTest.java | 32 +- ...GenotyperNormalCallingIntegrationTest.java | 14 +- ...lexAndSymbolicVariantsIntegrationTest.java | 8 +- .../HaplotypeCallerGVCFIntegrationTest.java | 56 +- .../HaplotypeCallerIntegrationTest.java | 56 +- .../GenotypeGVCFsIntegrationTest.java | 12 +- public/gatk-root/pom.xml | 5 + public/gatk-utils/pom.xml | 4 + .../gatk/utils/MannWhitneyU.java | 859 ++++++++++-------- .../broadinstitute/gatk/utils/MWUnitTest.java | 149 ++- 17 files changed, 677 insertions(+), 650 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java index 637f4574f..f145bca4d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java @@ -59,11 +59,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ReducibleAnnotation; -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller; -import org.broadinstitute.gatk.tools.walkers.variantutils.CombineGVCFs; -import org.broadinstitute.gatk.tools.walkers.variantutils.GenotypeGVCFs; import org.broadinstitute.gatk.utils.MannWhitneyU; -import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.exceptions.GATKException; @@ -298,14 +294,16 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn for (final Allele alt : perAlleleValues.keySet()) { if (alt.equals(ref, false)) continue; - final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering); - //load alts + final MannWhitneyU mannWhitneyU = new MannWhitneyU(); + //load alts (series 1) + List alts = new ArrayList<>(); for (final Number qual : perAlleleValues.get(alt)) { - mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); + alts.add((double) qual.intValue()); } - //load refs + //load refs (series 2) + List refs = new ArrayList<>(); for (final Number qual : perAlleleValues.get(ref)) { - mannWhitneyU.add(qual, MannWhitneyU.USet.SET2); + refs.add((double) qual.intValue()); } if (DEBUG) { @@ -320,8 +318,8 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn } // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases) - final Pair testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1); - perAltRankSumResults.put(alt, testResults.first); + final MannWhitneyU.Result result = mannWhitneyU.test(convertToArray(alts), convertToArray(refs), MannWhitneyU.TestType.FIRST_DOMINATES); + perAltRankSumResults.put(alt, result.getZ()); } return perAltRankSumResults; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java index 4ed64464b..775f02739 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java @@ -51,7 +51,6 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.*; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; @@ -61,8 +60,6 @@ import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.MannWhitneyU; import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import htsjdk.variant.vcf.VCFHeaderLine; -import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import htsjdk.variant.variantcontext.Allele; @@ -79,7 +76,6 @@ import java.util.*; //TODO: will eventually implement ReducibleAnnotation in order to preserve accuracy for CombineGVCFs and GenotypeGVCFs -- see RMSAnnotation.java for an example of an abstract ReducibleAnnotation public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { static final boolean DEBUG = false; - protected boolean useDithering = true; public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -122,13 +118,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR if ( refQuals.isEmpty() && altQuals.isEmpty() ) return null; - final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering); - for (final Double qual : altQuals) { - mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); - } - for (final Double qual : refQuals) { - mannWhitneyU.add(qual, MannWhitneyU.USet.SET2); - } + final MannWhitneyU mannWhitneyU = new MannWhitneyU(); if (DEBUG) { System.out.format("%s, REF QUALS:", this.getClass().getName()); @@ -142,14 +132,26 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR } // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases) - final Pair testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1); + final MannWhitneyU.Result result = mannWhitneyU.test(convertToArray(altQuals), convertToArray(refQuals), MannWhitneyU.TestType.FIRST_DOMINATES); + final double zScore = result.getZ(); + final Map map = new HashMap<>(); - if (!Double.isNaN(testResults.first)) - map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); + if (!Double.isNaN(zScore)) + map.put(getKeyNames().get(0), String.format("%.3f", zScore)); return map; } + public static double[] convertToArray(List list){ + double[] ret = new double[list.size()]; + Iterator iterator = list.iterator(); + for (int i = 0; i < ret.length; i++) + { + ret[i] = iterator.next().doubleValue(); + } + return ret; + } + private void fillQualsFromPileup(final List alleles, final ReadBackedPileup pileup, final List refQuals, @@ -255,14 +257,4 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR read.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ); } - /** - * Initialize the rank sum test annotation using walker and engine information. Right now this checks to see if - * engine randomization is turned off, and if so does not dither. - * @param walker the walker - * @param toolkit the GATK engine - * @param headerLines the header lines - */ - public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set headerLines ) { - useDithering = ! toolkit.getArguments().disableDithering; - } } \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumUnitTest.java index 5bacc7450..f66a2e6df 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumUnitTest.java @@ -65,7 +65,7 @@ import java.util.List; public class RankSumUnitTest { - List distribution20, distribution30, distribution20_40; + List distribution20, distribution30, distribution20_40; static final int observations = 100; @BeforeClass @@ -86,11 +86,11 @@ public class RankSumUnitTest { Collections.shuffle(distribution20_40, Utils.getRandomGenerator()); } - private static void makeDistribution(final List result, final int target, final int skew, final int numObservations) { + private static void makeDistribution(final List result, final int target, final int skew, final int numObservations) { final int rangeStart = target - skew; final int rangeEnd = target + skew; - int current = rangeStart; + double current = rangeStart; for ( int i = 0; i < numObservations; i++ ) { result.add(current++); if ( current > rangeEnd ) @@ -118,40 +118,33 @@ public class RankSumUnitTest { } @Test(enabled = true, dataProvider = "DistributionData") - public void testDistribution(final List distribution1, final List distribution2, final int numToReduceIn2, final boolean distributionsShouldBeEqual, final String debugString) { - final MannWhitneyU mannWhitneyU = new MannWhitneyU(true); + public void testDistribution(final List distribution1, final List distribution2, final int numToReduceIn2, final boolean distributionsShouldBeEqual, final String debugString) { + final MannWhitneyU mannWhitneyU = new MannWhitneyU(); - for ( final Integer num : distribution1 ) - mannWhitneyU.add(num, MannWhitneyU.USet.SET1); - - final List dist2 = new ArrayList<>(distribution2); + final List dist2 = new ArrayList<>(distribution2); if ( numToReduceIn2 > 0 ) { - int counts = 0; - int quals = 0; + Double counts = 0.0; + Double quals = 0.0; for ( int i = 0; i < numToReduceIn2; i++ ) { counts++; quals += dist2.remove(0); } - final int qual = quals / counts; + final Double qual = quals / counts; for ( int i = 0; i < numToReduceIn2; i++ ) dist2.add(qual); } - for ( final Integer num : dist2 ) - mannWhitneyU.add(num, MannWhitneyU.USet.SET2); - - final Double result = mannWhitneyU.runTwoSidedTest().second; + final Double result = mannWhitneyU.test(RankSumTest.convertToArray(distribution1),RankSumTest.convertToArray(dist2), MannWhitneyU.TestType.TWO_SIDED).getP(); Assert.assertFalse(Double.isNaN(result)); if ( distributionsShouldBeEqual ) { - // TODO -- THIS IS THE FAILURE POINT OF USING REDUCED READS WITH RANK SUM TESTS if ( numToReduceIn2 >= observations / 2 ) return; - Assert.assertTrue(result > 0.1, String.format("%f %d %d", result, numToReduceIn2, dist2.get(0))); + Assert.assertTrue(result > 0.1, String.format("%f %d %f", result, numToReduceIn2, dist2.get(0))); } else { - Assert.assertTrue(result < 0.01, String.format("%f %d %d", result, numToReduceIn2, dist2.get(0))); + Assert.assertTrue(result < 0.01, String.format("%f %d %f", result, numToReduceIn2, dist2.get(0))); } } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java index 515888a09..93f59ae95 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -100,7 +100,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("d6cd81fc2f483f29d44fbb27d1772841")); + Arrays.asList("832861ecdc6344cfb7097d02903ffd4d")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -108,7 +108,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("300836de4e2c8424734d2ee0ca4261c1")); + Arrays.asList("5120f5f419cb32b37d5c56101f4c5da2")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -134,7 +134,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("192f393da4e28aecf16112562e65083a")); + Arrays.asList("95a4be70c46496fb6791fdd0c3bcf8a3")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -142,7 +142,8 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("52baff55535f7c87545a7818052a2d5c")); + Arrays.asList("8436bf4acb81f2c03bb5ecf8a514d90a")); + executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -150,7 +151,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testExcludeAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + STANDARD_ANNOTATIONS + "-XA FisherStrand -XA ReadPosRankSumTest --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("334915d90fa92ee9fa07d4647912ceac")); + Arrays.asList("4995f77bedf7f476a15d46b5a8a392bd")); executeTest("test exclude annotations", spec); } @@ -183,7 +184,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("ab84654ac412a0aaaec99e86e357f0fd")); + Arrays.asList("de96b1e62f414b107430d179a154534d")); executeTest("test overwriting header", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java index a1de74780..ae1cf8721 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -74,12 +74,12 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe @Test(enabled = true) public void testBOTH_GGA_Pools() { - executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "217d9108c3014261dbe8befa383a2226"); + executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "1b834608171f67e52a4e7617458a3ba6"); } @Test(enabled = true) public void testINDEL_GGA_Pools() { - executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "6a7f00e7f26cbc1891f40c9ed8b6579b"); + executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "5b9e08bb141c48f826dc513066cb8a13"); } @Test(enabled = true) @@ -88,6 +88,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe //TODO the old MD5 is kept for the record. //TODO this should be revisit once we get into addressing inaccuracies by the independent allele approach. // executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "b5ff7530827f4b9039a58bdc8a3560d2"); - executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "c0271a4c281991a86490c1955456af26"); + executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "8449ea4fa85eac8834b049a0e3daee5a"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java index 2fb2d11c5..b109c3973 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java @@ -63,16 +63,16 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","c445bcd828c540c009bc8fc9f48916bc"); + executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","2a0dcd92f3a07f45b7927e66404bc37e"); } @Test(enabled = true) public void testMT_SNP_DISCOVERY_sp4() { - executor.PC_MT_Test(CEUTRIO_BAM, "-A AlleleCountBySample -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","8d8c7926ca9c68251c41eb03b8efaba2"); + executor.PC_MT_Test(CEUTRIO_BAM, "-A AlleleCountBySample -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","b9f5d1db0d2f5eb00eeb72ea29130de6"); } @Test(enabled = true) public void testMT_SNP_GGA_sp10() { - executor.PC_MT_Test(CEUTRIO_BAM, String.format("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "34242c0ae59b0645fb6df5c322e92f01"); + executor.PC_MT_Test(CEUTRIO_BAM, String.format("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "d90e41081b4a910a50116ff18c311245"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index f4e03043b..e9811c17b 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -78,7 +78,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("b132da42e1017c8825d84a06cf79a1e9")); + Arrays.asList("f78975d22ac27a841d807765242d3c3f")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -92,7 +92,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("23e32822d81f198ee52676f24ef74343")); + Arrays.asList("75381df9490adf4206ebbfbe0f49226e")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -105,7 +105,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("93580986b871890a4fc86ebf98877efa")); + Arrays.asList("e89c2a0d30a409087af25fc06fd46404")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -115,7 +115,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("4da9878ee36d0fb387d5a58631383ffa")); + Arrays.asList("af5475e45b6ffca990dd035448e6b0fe")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -125,7 +125,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("9ab4f5306438b5a64cb2cb90020b7b82")); + Arrays.asList("f9b4ed435a53198d30199e8a5c98252d")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -140,7 +140,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("6de8d740908e22b46785d5eba6278eb2")); + Arrays.asList("feef70172d1edf2f9782143c8bbbf1d0")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -181,7 +181,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("56673c166ba25f625d026e593d5cc667")); + Arrays.asList("0369f9c03df86486709a35a5a2f23192")); executeTest("test minIndelFraction 0.0", spec); } @@ -189,7 +189,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("2ce1754313561b9cb134152b8f98bf43")); + Arrays.asList("8bd1f903445255f24bd9ef677e1ba9a8")); executeTest("test minIndelFraction 0.25", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 44a5ab268..ee4fb7897 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("21369e50334d2b77b0e638e47e1b8c64")); + Arrays.asList("b4201a4d30b6ed5c6fc80248676be5ff")); executeTest("test min_base_quality_score 26", spec); } @@ -94,7 +94,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("e58d9a5758c5b11f86558608260d93d5")); + Arrays.asList("45f867c73dc3008e874eb3c009cd9674")); executeTest("test SLOD", spec); } @@ -102,7 +102,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNDA() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("fc8cdf9eeb475773303809c077f83c65")); + Arrays.asList("905e9a2ae183fc152101314774707372")); executeTest("test NDA", spec); } @@ -110,7 +110,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testCompTrack() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("c98294d321bde3e1e3c4fcee3e88d6d9")); + Arrays.asList("ac14f19fa1e9cecc8246814eac393662")); executeTest("test using comp track", spec); } @@ -124,17 +124,17 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterSitesOnly() { - testOutputParameters("-sites_only", "5b8938ed55a2b7ae8a52056c9130367b"); + testOutputParameters("-sites_only", "be100d3f04592d0865d907177ee8c7e4"); } @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "f353b36db7305f47963446220e39debe"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "9024aa3ccf7339b873444b792ff5cb91"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "364aec53db79d20698fe0d088828736f"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "0d1e1fcc70fb4c159eac016f3085e5e9"); } private void testOutputParameters(final String args, final String md5) { @@ -148,7 +148,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("ec59b34bedf40d70850ab5ffe42bbddd")); + Arrays.asList("13d2cc1e0871f85a97fb1f73c052ea3e")); executeTest("test confidence 1", spec1); } @@ -156,7 +156,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNoPrior() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.33333 -inputPrior 0.33333", 1, - Arrays.asList("7e8a51e658debdaadbcf17761ed011da")); + Arrays.asList("d94ab69be622fb23c97f524234eb05b3")); executeTest("test no prior 1", spec1); } @@ -165,7 +165,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testUserPrior() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.001 -inputPrior 0.495", 1, - Arrays.asList("b3514f5b3510b6667fd2c85ecc529de7")); + Arrays.asList("2f3c5fa06537c8a3f440fb70837702f6")); executeTest("test user prior 1", spec1); } @@ -174,7 +174,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void emitPLsAtAllSites() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --output_mode EMIT_ALL_SITES -allSitePLs", 1, - Arrays.asList("6fd14930acb08f0dd9749a8c4d7df831")); + Arrays.asList("76f7a8508ad40cf35b68b07bc7e29a72")); // GDA: TODO: BCF encoder/decoder doesn't seem to support non-standard values in genotype fields. IE even if there is a field defined in FORMAT and in the header the BCF2 encoder will still fail spec1.disableShadowBCF(); @@ -190,12 +190,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "815f01ef28bf576beb5528ac6fdd5248" ); + testHeterozosity( 0.01, "e3c708bfeea99835512c1fac477d02e4" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "8dd14ba8ef6314a99921849b2544b8c6" ); + testHeterozosity( 1.0 / 1850, "1226aaa771b8f057646474d306a84866" ); } private void testHeterozosity(final double arg, final String md5) { @@ -274,7 +274,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("34980dbce4fcd2aa21c46ea0e1897422")); + Arrays.asList("589f9dd567bb86ce74e5d48fd7cae0b2")); executeTest(String.format("test multiple technologies"), spec); } @@ -293,7 +293,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("5f02a7449305b26aab3ff994dfb53fda")); + Arrays.asList("be978fc8807fcbd514b2678adcdc2a28")); executeTest(String.format("test calling with BAQ"), spec); } @@ -310,7 +310,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000 " + "-A SnpEff", 1, - Arrays.asList("b61c0dece2d77544f9313c24191e0089")); + Arrays.asList("52bac28a85b85d9596d668d3f934f792")); executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index b652f1047..0334ff863 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -70,7 +70,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("0f9eff0ad2f8cb8e277922bd037825f7")); + Arrays.asList("46dd44e69ff078a48057ff08d278a293")); executeTest("test MultiSample Pilot1", spec); } @@ -78,7 +78,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testWithAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("196979c91f84c01fb5d89e73339c09fa")); + Arrays.asList("a19e5f74606cd980c3083d41d23b4acb")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); } @@ -86,7 +86,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("391c4e1437a3fa5c99584b466a56d7bb")); + Arrays.asList("fe3aec195ceb9180b6ffc474a4bb37b3")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -94,7 +94,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("41b1d92cdea4581f7843b714345547d6")); + Arrays.asList("21b346710ccdbbfc904eafda2b0c7443")); executeTest("test SingleSample Pilot2", spec); } @@ -102,7 +102,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("4ae7c090dfb85f6286f1187a746def58")); + Arrays.asList("03e6cb2c81775b94c81af45560062705")); executeTest("test Multiple SNP alleles", spec); } @@ -118,7 +118,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("5c2c49bb5e276df933b2d264e9d4a327")); + Arrays.asList("710d308a32012911fc275042b612e3d0")); executeTest("test reverse trim", spec); } @@ -126,7 +126,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("ed5f90897c9a348e4f861eff8992b4e2")); + Arrays.asList("55f9bd98155a7a4b768fba97f90de282")); executeTest("test mismatched PLs", spec); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index f14fda293..740b8e59c 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -72,7 +72,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "066f1ce9e9826bcfedf6cd80bc560ab8"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6be5c8cbedea0a571c98fb1f4718042f"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -96,13 +96,13 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "2a5de432f04198737732064206c7d63d"); + "af3a490e5f41e890c59927426ac0fe9a"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "afaa41101f492d37f57bb18cc638c6bc"); + "12b1826ee23243c64a002c3cbdfa569a"); } private void HCTestComplexConsensusMode(String bam, String args, String md5) { @@ -114,7 +114,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleConsensusModeComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337", - "d42ba795fe346d8372cae3c00c9c2f23"); + "9765065daf3a008bd92c755d882c5f07"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index 476f11317..478edbac4 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -79,12 +79,12 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { //TODO this might need to be addressed at some point. //TODO the following test is commented out for the record //tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "7f09c261950bf86e435edfa69ed2ec71"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "02300a1f64e085cc0f4420d8160743c1"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "f71ea433b1334a2d146cc6ad76b46d98"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "daddb5349c34e9190f0563e220894748"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "03e5815cc351f1ec2feed89d2aed8268"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "5d5cca382bdf6987b2aef87918ed374c"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "532888aff96c356397a21ef790636818"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "1c1ca2d76bc5d7dd45a5e7aef756ad8f"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "7ddfd33f2efcd2617d896826434fb43c"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "30accd7a171e9a13969fa543fde3fed1"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "20d896bbf750739f937ccd2fb152d902"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "99dd66a5b36ba9e2d1b0f408ecfbb50b"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "72d90fe0f5641b373744f75a21d4d14c"}); return tests.toArray(new Object[][]{}); } @@ -98,13 +98,13 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "cfe629c5a3be3b6524258ad1f9145488"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "12fbfffa4bb2b8d520f8021a40b37d19"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a5ea6d4052bbf9e8bba9011bc6f0d203"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "31da2254620f4a9c34ccf7c311cc133f"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "c95f52fe395392331dc3102902d54408"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "9b2914cf121c411aad3a65cfdc98c1d4"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "259110565155a3a27ab3e98113bedef8"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "8058ce63339701d33425769217bffed1"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "3abdde9d9ddbe9ec7cef28c65844a409"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "f5f0cef38a529e1542a8fd8bcce7ab1e"}); - final String NA12878bandedResolutionMD5 = "fbe6099d138a069a65e4713bcae1e873"; + final String NA12878bandedResolutionMD5 = "0f848ee7240ca48827bdfb85b455c1ad"; tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, NA12878bandedResolutionMD5}); tests.add(new Object[]{NA12878_WEx + " -I " + privateTestDir + "NA20313.highCoverageRegion.bam -sn NA12878", ReferenceConfidenceMode.GVCF, WExIntervals, NA12878bandedResolutionMD5}); @@ -121,12 +121,12 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "5088d6cf80a6da7c769f97b1ab44c745"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "0895f09731d0ef89ec131c9f75aafe70"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "195017f498d3508e6eee492eb00da97b"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "4f8e3e249509a24da21d5dd8e3594f92"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "ef96f5295b048ef31f5ba82d078a44a2"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "cd809158689ddbbfd18a4eaae016f9a0"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "1f99396292974df55ef6497be79b1917"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "eabb44785fd4a84ade3c674a0bda16d9"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "904feadac0e4a2a0c6b2cc7e55718f3b"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "b430a401310b3812346d7496f9c62011"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "6e481747c50734b7fb0c4de39405044f"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "cff5e49456d5ecc0dafd31cd014def4f"}); return tests.toArray(new Object[][]{}); } @@ -139,12 +139,12 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "bb85582fcc2ce45407640fa8a70421ab"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "e01603c5d4c3142978a91b8cd6a98618"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "b9e471935b960a7dba3eb2b13939ccaf"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "c724ecdaea7eab5a6239ff4daaa6e034"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "194d27bd2321ff1a8b895a4e9a8d2938"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "4a3dfcfc2f5d27b75725346d63e0b83a"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "50156c819a0096fa22ed1b9749affecc"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "d91aace2100dc040659d77f366053a2e"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "d86dc51757e69aa3f2608fbbfaa90069"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "27c73b8b1ec384a880bf60daf0b0f80e"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "a6c70203e43d62b42c4b751fe9018410"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "7ebbbcab78d090d70a24819093812748"}); return tests.toArray(new Object[][]{}); } @@ -270,7 +270,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testWrongGVCFNonVariantRecordOrderBugFix() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, WRONG_GVCF_RECORD_ORDER_BUGFIX_BAM, WRONG_GVCF_RECORD_ORDER_BUGFIX_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("9b3b232ceeb109f2624826ea20825a82")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a22bafa2f91c197ea1ab13e02a0c08fb")); spec.disableShadowBCF(); executeTest("testMissingGVCFIndexingStrategyException", spec); } @@ -320,7 +320,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testAlleleSpecificAnnotations() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a53ac6cb2379d8adccfca6495e32e225")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("9db19d4706e32ce883c6e3f71e5f4178")); spec.disableShadowBCF(); executeTest(" testAlleleSpecificAnnotations", spec); } @@ -329,7 +329,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testASMQMateRankSumAnnotation() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A AS_MQMateRankSumTest --disableDithering", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("b3c0eccf035c0c5d3ac09dc93b8ee97a")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("85b26109c4586ba33c44e253297050be")); spec.disableShadowBCF(); executeTest(" testASMQMateRankSumAnnotation", spec); } @@ -338,7 +338,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testASInsertSizeRankSum() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering -A AS_InsertSizeRankSum", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("207e9c996f5c289a33d178c16d854bd0")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("f1531ed03d6f9538d4d6af7630ff2502")); spec.disableShadowBCF(); executeTest(" testASInsertSizeRankSum", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 00d9a1090..aa6d23c15 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -96,82 +96,82 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() throws IOException { - HCTest(CEUTRIO_BAM, "", "bdc5a942a7df2d4c61388ecc74a163cf"); + HCTest(CEUTRIO_BAM, "", "b86311f6a0b4b5f2cfd05662f03fbe6c"); } @Test public void testHaplotypeCallerSingleSample() throws IOException { - HCTest(NA12878_BAM, "", "5941f88fa1372eb598880a4711ac08de"); + HCTest(NA12878_BAM, "", "1ceedd80a36e042a93d60e25b94c0b03"); } @Test public void testHaplotypeCallerMultiSampleHaploid() throws IOException { - HCTest(CEUTRIO_BAM, "-ploidy 1", "4c32640204a11fe46a23d97b1012bca2"); + HCTest(CEUTRIO_BAM, "-ploidy 1", "a7e83df05d8ef3558fa98e8c6a86e12d"); } @Test public void testHaplotypeCallerSingleSampleHaploid() throws IOException { - HCTest(NA12878_BAM, "-ploidy 1", "e09c5fd862f86f69289d847e3b293e19"); + HCTest(NA12878_BAM, "-ploidy 1", "6e9855e22ed78a60eaaf0a06ed967b77"); } @Test public void testHaplotypeCallerSingleSampleTetraploid() throws IOException { - HCTest(NA12878_BAM, "-ploidy 4", "bbae9f77683591200b2034d5461e6425"); + HCTest(NA12878_BAM, "-ploidy 4", "ae463cceb8cb0937440cfe5b56904feb"); } @Test public void testHaplotypeCallerMinBaseQuality() throws IOException { - HCTest(NA12878_BAM, "-mbq 15", "5941f88fa1372eb598880a4711ac08de"); + HCTest(NA12878_BAM, "-mbq 15", "1ceedd80a36e042a93d60e25b94c0b03"); } @Test public void testHaplotypeCallerMinBaseQualityHaploid() throws IOException { - HCTest(NA12878_BAM, "-mbq 15 -ploidy 1", "e09c5fd862f86f69289d847e3b293e19"); + HCTest(NA12878_BAM, "-mbq 15 -ploidy 1", "6e9855e22ed78a60eaaf0a06ed967b77"); } @Test public void testHaplotypeCallerMinBaseQualityTetraploid() throws IOException { - HCTest(NA12878_BAM, "-mbq 15 -ploidy 4", "bbae9f77683591200b2034d5461e6425"); + HCTest(NA12878_BAM, "-mbq 15 -ploidy 4", "ae463cceb8cb0937440cfe5b56904feb"); } @Test public void testHaplotypeCallerGraphBasedSingleSample() throws IOException { - HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "9a4ba54146449914b1fbaed5465b692e"); + HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "58abaabab25b9a75aaf17ab48478f42a"); } @Test public void testHaplotypeCallerGraphBasedMultiSampleHaploid() throws IOException { - HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased -ploidy 1", "5333e3ff4d7fc53624eab801250be4f0"); + HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased -ploidy 1", "d7c7ff3883e8a50be38ec32b1a8fa5b4"); } @Test public void testHaplotypeCallerGraphBasedMultiSample() throws IOException { - HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "e9759bf254b7b4d06e4a68c94a417cf1"); + HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "ccec6b941d41ac2642450964c3ab85d7"); } @Test public void testHaplotypeCallerSingleSampleWithDbsnp() throws IOException { - HCTest(NA12878_BAM, "-D " + b37dbSNP132, "7466c4d9ce6a1d23bcb428fc9f446843"); + HCTest(NA12878_BAM, "-D " + b37dbSNP132, "48a008f41644c8e4bb256141d095a140"); } @Test public void testHaplotypeCallerMultiSampleGGA() throws IOException { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf" + " -isr INTERSECTION -L " + GGA_INTERVALS_FILE, - "c2fe156912e59626c0393b9b47c9419e"); + "b7d2576ac00c175f4136b2bfbffc4dcd"); } @Test public void testHaplotypeCallerMultiSampleGGAHaploid() throws IOException { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -ploidy 1 -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf -isr INTERSECTION -L 20:10080000-10100000", - "49f737983375c740361daa2d48ed0249"); + "07e0a87e762173bcd6bba5e7af704159"); } @Test public void testHaplotypeCallerMultiSampleGGATetraploid() throws IOException { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -ploidy 4 -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf -isr INTERSECTION -L 20:10080000-10100000", - "6ba0fa930899f70fee9a7b1161508f93"); + "be7315e2866eccfb951ed63090f9119c"); } @Test @@ -187,7 +187,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "22028b104dd60b23a399e5e3a877a1fb"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "6f5678f8fb6a4c0b21df1e6b0d1019ee"); } private void HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -224,7 +224,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "6e55de7bf49ecdc71f6d4c9565a19853"); + HCTestNearbySmallIntervals(NA12878_BAM, "", "796f4a44a29fc3c1a1461f90aef45846"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -234,7 +234,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("f8774326a268f7d394b333818d15d05c")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("392ac1966cfa1cae78e8d674065a0d28")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @@ -296,7 +296,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestDBSNPAnnotationWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,090,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("ab816ff6facc08acb19c55bd6e828f02")); + Arrays.asList("5430b91902813cf64f5bb781b25d76c6")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @@ -305,7 +305,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,100,000-11,000,000 -D " + b37dbSNP132 + " -L " + hg19Intervals + " -isr INTERSECTION", 1, - Arrays.asList("fd1a539e14902f6957eb939aac1412f0")); + Arrays.asList("8b775d84ea981f1e8207fa69a92b5e1f")); executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); } @@ -313,7 +313,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestDBSNPAnnotationWGSGraphBased() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,090,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("1810c35e8298dff4aa1b7b04fb5f4962")); + Arrays.asList("f6f5b7e9348fc6dccd63110de0371d78")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @@ -322,7 +322,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + " -L " + hg19Intervals + " -isr INTERSECTION", 1, - Arrays.asList("8a06a53388d73ec667b353379f3b351e")); + Arrays.asList("82cd0142146521fd659905737dc6192c")); executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); } @@ -345,7 +345,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestAggressivePcrIndelModelWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,270,000-10,300,000", 1, - Arrays.asList("be71ab16d55437cbe3005ea3b93cece6")); + Arrays.asList("c43caebd09b69dbc9d1e4b5f5f449712")); executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec); } @@ -353,7 +353,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestConservativePcrIndelModelWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,270,000-10,300,000", 1, - Arrays.asList("8a5185d0a9400c8b3f4b12da65181c4b")); + Arrays.asList("54e7595f6d82a69566f4c0163045688d")); executeTest("HC calling with conservative indel error modeling on WGS intervals", spec); } @@ -382,7 +382,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testLackSensitivityDueToBadHaplotypeSelectionFix() { final String commandLine = String.format("-T HaplotypeCaller -pairHMMSub %s %s -R %s -I %s -L %s --no_cmdline_in_header --maxNumHaplotypesInPopulation 16", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list"); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("02ec3090c4a6359fa10e6d8b30e1d5a2")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("8d44a2e1967e6e844f221960d6ea42bb")); spec.disableShadowBCF(); executeTest("testLackSensitivityDueToBadHaplotypeSelectionFix", spec); } @@ -391,7 +391,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testMissingKeyAlternativeHaplotypesBugFix() { final String commandLine = String.format("-T HaplotypeCaller -pairHMMSub %s %s -R %s -I %s -L %s --no_cmdline_in_header ", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReferenceWithDecoy, privateTestDir + "lost-alt-key-hap.bam", privateTestDir + "lost-alt-key-hap.interval_list"); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("13c9f094b9c54960dc2fd3a1815a2645")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("cf8912f84153b2357a4d1ddb361f786f")); spec.disableShadowBCF(); executeTest("testMissingKeyAlternativeHaplotypesBugFix", spec); } @@ -468,12 +468,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerTandemRepeatAnnotator() throws IOException{ - HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A TandemRepeatAnnotator -XA MappingQualityZero -XA SpanningDeletions", "03738462e7f0b6f149f40b790a3a7261"); + HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A TandemRepeatAnnotator -XA MappingQualityZero -XA SpanningDeletions", "abc2daf88b9c5b3c0989f5d06df84ab3"); } @Test public void testHBaseCountsBySample() throws IOException{ - HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A BaseCountsBySample", "8739d92898113436666f1cff3bc39bc5"); + HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A BaseCountsBySample", "3e0ee74ec384ae38bda8ac15d8d67cca"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index d5a5010fc..5c7468444 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -260,7 +260,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + gVCF.getAbsolutePath(), b37KGReference), 1, - Arrays.asList("eeff965cd79c0b7085c7d4d7ecf82b68")); + Arrays.asList("d8eb4a64a2ae7e7dad1efc4fe8b4b3ed")); spec.disableShadowBCF(); //TODO: Remove when BaseTest.assertAttributesEquals() works with SAC executeTest("testStrandAlleleCountsBySample", spec); } @@ -566,7 +566,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testAlleleSpecificAnnotations() { final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V " + privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.chr20snippet.g.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("35daaea8dea591d35ca99854c8d36e5f")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("7261cde50d4556c6dc190b10222a8538")); spec.disableShadowBCF(); executeTest("testAlleleSpecificAnnotations", spec); } @@ -575,7 +575,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testASMateRankSumAnnotation() { final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_MQMateRankSumTest --disableDithering -V " + privateTestDir + "NA12878.AS.MateRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.MateRankSum.chr20snippet.g.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("f8bf55e16358b35449621f896b084b7a")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("08dc30527c98d78d09f8575efc3b84ae")); spec.disableShadowBCF(); executeTest("testASMateRankSumAnnotation", spec); } @@ -584,7 +584,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testASInsertSizeRankSumAnnotation() { final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V " + privateTestDir + "NA12878.AS.InsertSizeRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.InsertSizeRankSum.chr20snippet.g.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("8a27e5fbcb0d8f91684c2887167eb043")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("723522fe1d05f2d11efda008b65884d2")); spec.disableShadowBCF(); executeTest("testASInsertSizeRankSumAnnotation", spec); } @@ -597,7 +597,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testAlleleSpecificAnnotations_oneSample() { final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V " + privateTestDir + "NA12878.AS.chr20snippet.g.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("b6026b2a2d2da39f181a4905b2225dad")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("cc2fe42eafd9789be49b550a639a7a1b")); spec.disableShadowBCF(); executeTest("testAlleleSpecificAnnotations_oneSample", spec); } @@ -616,7 +616,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testFractionInformativeReads() { final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -G AS_Standard -o %s --no_cmdline_in_header -A FractionInformativeReads --disableDithering -V " + privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.chr20snippet.g.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("794cfec86a8bee1f6955766b5a98b950")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("16dbeec9d640e61453e902918427343a")); spec.disableShadowBCF(); executeTest("testAlleleSpecificAnnotations", spec); } diff --git a/public/gatk-root/pom.xml b/public/gatk-root/pom.xml index 07280ea4d..a6d5a8b37 100644 --- a/public/gatk-root/pom.xml +++ b/public/gatk-root/pom.xml @@ -196,6 +196,11 @@ commons-math 2.2 + + org.apache.commons + commons-math3 + 3.5 + net.java.dev.jna jna diff --git a/public/gatk-utils/pom.xml b/public/gatk-utils/pom.xml index cbb4cfa61..7b7c7f102 100644 --- a/public/gatk-utils/pom.xml +++ b/public/gatk-utils/pom.xml @@ -66,6 +66,10 @@ org.apache.commons commons-math + + org.apache.commons + commons-math3 + net.java.dev.jna jna diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java index a768a1c55..bb8866f34 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java @@ -25,483 +25,534 @@ package org.broadinstitute.gatk.utils; -import cern.jet.math.Arithmetic; -import cern.jet.random.Normal; -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.commons.math.MathException; -import org.apache.commons.math.distribution.NormalDistribution; -import org.apache.commons.math.distribution.NormalDistributionImpl; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.GATKException; +import htsjdk.samtools.util.Histogram; +import org.apache.commons.math3.distribution.NormalDistribution; -import java.io.Serializable; -import java.util.Comparator; -import java.util.TreeSet; + +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; /** - * Created by IntelliJ IDEA. - * User: chartl + * Imported with changes from Picard private. + * + * @author Tim Fennell */ public class MannWhitneyU { - private static Normal STANDARD_NORMAL = new Normal(0.0,1.0,null); - private static NormalDistribution APACHE_NORMAL = new NormalDistributionImpl(0.0,1.0,1e-2); - private static double LNSQRT2PI = Math.log(Math.sqrt(2.0*Math.PI)); + private static final class Rank implements Comparable { + final double value; + float rank; + final int series; - private TreeSet> observations; - private int sizeSet1; - private int sizeSet2; - private ExactMode exactMode; - - public MannWhitneyU(ExactMode mode, boolean dither) { - if ( dither ) - observations = new TreeSet>(new DitheringComparator()); - else - observations = new TreeSet>(new NumberedPairComparator()); - sizeSet1 = 0; - sizeSet2 = 0; - exactMode = mode; - } - - public MannWhitneyU() { - this(ExactMode.POINT,true); - } - - public MannWhitneyU(boolean dither) { - this(ExactMode.POINT,dither); - } - - public MannWhitneyU(ExactMode mode) { - this(mode,true); - } - - /** - * Add an observation into the observation tree - * @param n: the observation (a number) - * @param set: whether the observation comes from set 1 or set 2 - */ - public void add(Number n, USet set) { - observations.add(new Pair(n,set)); - if ( set == USet.SET1 ) { - ++sizeSet1; - } else { - ++sizeSet2; - } - } - - public Pair getR1R2() { - long u1 = calculateOneSidedU(observations,MannWhitneyU.USet.SET1); - long n1 = sizeSet1*(sizeSet1+1)/2; - long r1 = u1 + n1; - long n2 = sizeSet2*(sizeSet2+1)/2; - long u2 = n1*n2-u1; - long r2 = u2 + n2; - - return new Pair(r1,r2); - } - - /** - * Runs the one-sided test under the hypothesis that the data in set "lessThanOther" stochastically - * dominates the other set - * @param lessThanOther - either Set1 or Set2 - * @return - u-based z-approximation, and p-value associated with the test (p-value is exact for small n,m) - */ - @Requires({"lessThanOther != null"}) - @Ensures({"validateObservations(observations) || Double.isNaN(result.getFirst())","result != null", "! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"}) - public Pair runOneSidedTest(USet lessThanOther) { - long u = calculateOneSidedU(observations, lessThanOther); - int n = lessThanOther == USet.SET1 ? sizeSet1 : sizeSet2; - int m = lessThanOther == USet.SET1 ? sizeSet2 : sizeSet1; - if ( n == 0 || m == 0 ) { - // test is uninformative as one or both sets have no observations - return new Pair(Double.NaN,Double.NaN); + private Rank(double value, float rank, int series) { + this.value = value; + this.rank = rank; + this.series = series; } - // the null hypothesis is that {N} is stochastically less than {M}, so U has counted - // occurrences of {M}s before {N}s. We would expect that this should be less than (n*m+1)/2 under - // the null hypothesis, so we want to integrate from K=0 to K=U for cumulative cases. Always. - return calculateP(n, m, u, false, exactMode); - } - - /** - * Runs the standard two-sided test, - * returns the u-based z-approximate and p values. - * @return a pair holding the u and p-value. - */ - @Ensures({"result != null", "! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"}) - //@Requires({"validateObservations(observations)"}) - public Pair runTwoSidedTest() { - Pair uPair = calculateTwoSidedU(observations); - long u = uPair.first; - int n = uPair.second == USet.SET1 ? sizeSet1 : sizeSet2; - int m = uPair.second == USet.SET1 ? sizeSet2 : sizeSet1; - if ( n == 0 || m == 0 ) { - // test is uninformative as one or both sets have no observations - return new Pair(Double.NaN,Double.NaN); - } - return calculateP(n, m, u, true, exactMode); - } - - /** - * Given a u statistic, calculate the p-value associated with it, dispatching to approximations where appropriate - * @param n - The number of entries in the stochastically smaller (dominant) set - * @param m - The number of entries in the stochastically larger (dominated) set - * @param u - the Mann-Whitney U value - * @param twoSided - is the test twosided - * @return the (possibly approximate) p-value associated with the MWU test, and the (possibly approximate) z-value associated with it - * todo -- there must be an approximation for small m and large n - */ - @Requires({"m > 0","n > 0"}) - @Ensures({"result != null", "! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"}) - protected static Pair calculateP(int n, int m, long u, boolean twoSided, ExactMode exactMode) { - Pair zandP; - if ( n > 8 && m > 8 ) { - // large m and n - normal approx - zandP = calculatePNormalApproximation(n,m,u, twoSided); - } else if ( n > 5 && m > 7 ) { - // large m, small n - sum uniform approx - // todo -- find the appropriate regimes where this approximation is actually better enough to merit slowness - // pval = calculatePUniformApproximation(n,m,u); - zandP = calculatePNormalApproximation(n, m, u, twoSided); - } else if ( n > 8 || m > 8 ) { - zandP = calculatePFromTable(n, m, u, twoSided); - } else { - // small m and n - full approx - zandP = calculatePRecursively(n,m,u,twoSided,exactMode); + @Override + public int compareTo(Rank that) { + return (int) (this.value - that.value); } - return zandP; - } - - public static Pair calculatePFromTable(int n, int m, long u, boolean twoSided) { - // todo -- actually use a table for: - // todo - n large, m small - return calculatePNormalApproximation(n,m,u, twoSided); - } - - /** - * Uses a normal approximation to the U statistic in order to return a cdf p-value. See Mann, Whitney [1947] - * @param n - The number of entries in the stochastically smaller (dominant) set - * @param m - The number of entries in the stochastically larger (dominated) set - * @param u - the Mann-Whitney U value - * @param twoSided - whether the test should be two sided - * @return p-value associated with the normal approximation - */ - @Requires({"m > 0","n > 0"}) - @Ensures({"result != null", "! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"}) - public static Pair calculatePNormalApproximation(int n,int m,long u, boolean twoSided) { - double z = getZApprox(n,m,u); - if ( twoSided ) { - return new Pair(z,2.0*(z < 0 ? STANDARD_NORMAL.cdf(z) : 1.0-STANDARD_NORMAL.cdf(z))); - } else { - return new Pair(z,STANDARD_NORMAL.cdf(z)); + @Override + public String toString() { + return "Rank{" + + "value=" + value + + ", rank=" + rank + + ", series=" + series + + '}'; } } /** - * Calculates the Z-score approximation of the u-statistic - * @param n - The number of entries in the stochastically smaller (dominant) set - * @param m - The number of entries in the stochastically larger (dominated) set - * @param u - the Mann-Whitney U value - * @return the asymptotic z-approximation corresponding to the MWU p-value for n < m + * The results of performing a rank sum test. */ - @Requires({"m > 0","n > 0"}) - @Ensures({"! Double.isNaN(result)", "! Double.isInfinite(result)"}) - private static double getZApprox(int n, int m, long u) { - double mean = ( ((long)m)*n+1.0)/2; - double var = (((long) n)*m*(n+m+1.0))/12; - double z = ( u - mean )/Math.sqrt(var); - return z; + public static class Result { + private final double u; + private final double z; + private final double p; + private final double medianShift; + + public Result(double u, double z, double p, double medianShift) { + this.u = u; + this.z = z; + this.p = p; + this.medianShift = medianShift; + } + + public double getU() { + return u; + } + + public double getZ() { + return z; + } + + public double getP() { + return p; + } + + public double getMedianShift() { + return medianShift; + } } /** - * Uses a sum-of-uniform-0-1 random variable approximation to the U statistic in order to return an approximate - * p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens, MA [1966, biometrika] (sum of uniform CDF) - * @param n - The number of entries in the stochastically smaller (dominant) set - * @param m - The number of entries in the stochastically larger (dominated) set - * @param u - mann-whitney u value - * @return p-value according to sum of uniform approx - * todo -- this is currently not called due to not having a good characterization of where it is significantly more accurate than the - * todo -- normal approxmation (e.g. enough to merit the runtime hit) + * The values of U1, U2 and the transformed number of ties needed for the calculation of sigma + * in the normal approximation. */ - public static double calculatePUniformApproximation(int n, int m, long u) { - long R = u + (n*(n+1))/2; - double a = Math.sqrt(m*(n+m+1)); - double b = (n/2.0)*(1-Math.sqrt((n+m+1)/m)); - double z = b + ((double)R)/a; - if ( z < 0 ) { return 1.0; } - else if ( z > n ) { return 0.0; } - else { - if ( z > ((double) n) /2 ) { - return 1.0-1/(Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0); - } else { - return 1/(Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0); + public static class TestStatistic { + private final double u1; + private final double u2; + private final double trueU; + private final double numOfTiesTransformed; + + public TestStatistic(double u1, double u2, double numOfTiesTransformed) { + this.u1 = u1; + this.u2 = u2; + this.numOfTiesTransformed = numOfTiesTransformed; + this.trueU = Double.NaN; + } + + public TestStatistic(double trueU, double numOfTiesTransformed) { + this.trueU = trueU; + this.numOfTiesTransformed = numOfTiesTransformed; + this.u1 = Double.NaN; + this.u2 = Double.NaN; + } + + public double getU1() { + return u1; + } + + public double getU2() { + return u2; + } + + public double getTies() { + return numOfTiesTransformed; + } + + public double getTrueU() { + return trueU; + } + } + + /** + * The ranked data in one list and a list of the number of ties. + */ + public static class RankedData { + private final Rank[] rank; + private final ArrayList numOfTies; + + public RankedData(Rank[] rank, ArrayList numOfTies) { + this.rank = rank; + this.numOfTies = numOfTies; + } + + public Rank[] getRank() { + return rank; + } + + public ArrayList getNumOfTies() { + return numOfTies; + } + } + + /** + * Key for the map from Integer[] to set of all permutations of that array. + */ + private static class Key { + final Integer[] listToPermute; + + private Key(Integer[] listToPermute) { + this.listToPermute = listToPermute; + } + + @Override + public boolean equals(Object o) { + if (o == null || getClass() != o.getClass()) return false; + + Key that = (Key) o; + return (Arrays.deepEquals(this.listToPermute, that.listToPermute)); + } + + @Override + public int hashCode() { + int result = 17; + for (Integer i : listToPermute) { + result = 31 * result + listToPermute[i]; } + return result; } } + // Constructs a normal distribution; actual values of mean and SD don't matter since it's + // just used to convert a z-score into a cumulative probability + private static final double NORMAL_MEAN = 100; + private static final double NORMAL_SD = 15; + private static final NormalDistribution NORMAL = new NormalDistribution(NORMAL_MEAN, NORMAL_SD); + /** - * Helper function for the sum of n uniform random variables - * @param z - value at which to compute the (un-normalized) cdf - * @param m - a cutoff integer (defined by m <= z < m + 1) - * @param n - the number of uniform random variables - * @param k - holder variable for the recursion (alternatively, the index of the term in the sequence) - * @return the (un-normalized) cdf for the sum of n random variables + * A map of an Integer[] of the labels to the set of all possible permutations of those labels. */ - private static double uniformSumHelper(double z, int m, int n, int k) { - if ( k > m ) { return 0; } - int coef = (k % 2 == 0) ? 1 : -1; - return coef*Arithmetic.binomial(n,k)*Math.pow(z-k,n) + uniformSumHelper(z,m,n,k+1); + private static Map>> PERMUTATIONS = new ConcurrentHashMap>>(); + + /** + * The minimum length for both data series in order to use a normal distribution + * to calculate Z and p. If both series are shorter than this value then a permutation test + * will be used. + */ + private int minimumNormalN = 10; + + /** + * If the exact p-value is 1 then the z-score is infinite, so we need a max cutoff instead. + */ + private double maxZScore = 9.0; + + /** + * Sets the minimum number of values in each data series to use the normal distribution approximation. + */ + public void setMinimumSeriesLengthForNormalApproximation(final int n) { + this.minimumNormalN = n; } /** - * Calculates the U-statistic associated with a two-sided test (e.g. the RV from which one set is drawn - * stochastically dominates the RV from which the other set is drawn); two-sidedness is accounted for - * later on simply by multiplying the p-value by 2. - * - * Recall: If X stochastically dominates Y, the test is for occurrences of Y before X, so the lower value of u is chosen - * @param observed - the observed data - * @return the minimum of the U counts (set1 dominates 2, set 2 dominates 1) + * A variable that indicates if the test is one sided or two sided and if it's one sided + * which group is the dominator in the null hypothesis. */ - @Requires({"observed != null", "observed.size() > 0"}) - @Ensures({"result != null","result.first > 0"}) - public static Pair calculateTwoSidedU(TreeSet> observed) { - int set1SeenSoFar = 0; - int set2SeenSoFar = 0; - long uSet1DomSet2 = 0; - long uSet2DomSet1 = 0; - USet previous = null; - for ( Pair dataPoint : observed ) { + public enum TestType { + FIRST_DOMINATES, + SECOND_DOMINATES, + TWO_SIDED + } - if ( dataPoint.second == USet.SET1 ) { - ++set1SeenSoFar; - } else { - ++set2SeenSoFar; - } + public RankedData calculateRank(final double[] series1, final double[] series2) { + Arrays.sort(series1); + Arrays.sort(series2); - if ( previous != null ) { - if ( dataPoint.second == USet.SET1 ) { - uSet2DomSet1 += set2SeenSoFar; + // Make a merged ranks array + final Rank[] ranks = new Rank[series1.length + series2.length]; + { + int i = 0, j = 0, r = 0; + while (r < ranks.length) { + if (i >= series1.length) { + ranks[r++] = new Rank(series2[j++], r, 2); + } else if (j >= series2.length) { + ranks[r++] = new Rank(series1[i++], r, 1); + } else if (series1[i] <= series2[j]) { + ranks[r++] = new Rank(series1[i++], r, 1); } else { - uSet1DomSet2 += set1SeenSoFar; + ranks[r++] = new Rank(series2[j++], r, 2); } } - - previous = dataPoint.second; } - return uSet1DomSet2 < uSet2DomSet1 ? new Pair(uSet1DomSet2,USet.SET1) : new Pair(uSet2DomSet1,USet.SET2); + ArrayList numOfTies = new ArrayList<>(); + + // Now sort out any tie bands + for (int i = 0; i < ranks.length; ) { + float rank = ranks[i].rank; + int count = 1; + + for (int j = i + 1; j < ranks.length && ranks[j].value == ranks[i].value; ++j) { + rank += ranks[j].rank; + ++count; + } + + if (count > 1) { + rank /= count; + for (int j = i; j < i + count; ++j) { + ranks[j].rank = rank; + } + numOfTies.add(count); + } + + // Skip forward the right number of items + i += count; + } + + return new RankedData(ranks, numOfTies); } /** - * Calculates the U-statistic associated with the one-sided hypothesis that "dominator" stochastically dominates - * the other U-set. Note that if S1 dominates S2, we want to count the occurrences of points in S2 coming before points in S1. - * @param observed - the observed data points, tagged by each set - * @param dominator - the set that is hypothesized to be stochastically dominating - * @return the u-statistic associated with the hypothesis that dominator stochastically dominates the other set + * Rank both groups together and return a TestStatistic object that includes U1, U2 and number of ties for sigma */ - @Requires({"observed != null","dominator != null","observed.size() > 0"}) - @Ensures({"result >= 0"}) - public static long calculateOneSidedU(TreeSet> observed,USet dominator) { - long otherBeforeDominator = 0l; - int otherSeenSoFar = 0; - for ( Pair dataPoint : observed ) { - if ( dataPoint.second != dominator ) { - ++otherSeenSoFar; - } else { - otherBeforeDominator += otherSeenSoFar; + public TestStatistic calculateU1andU2(final double[] series1, final double[] series2) { + RankedData ranked = calculateRank(series1, series2); + Rank[] ranks = ranked.getRank(); + ArrayList numOfTies = ranked.getNumOfTies(); + + //Calculate number of ties transformed for formula for Sigma to calculate Z-score + ArrayList transformedTies = new ArrayList<>(); + for (int count : numOfTies) { + //If every single datapoint is tied then we want to return a p-value of .5 and + //the formula for sigma that includes the number of ties breaks down. Setting + //the number of ties to 0 in this case gets the desired result. + if (count != ranks.length) { + transformedTies.add((count * count * count) - count); } } - return otherBeforeDominator; + double numOfTiesForSigma = 0.0; + for (int count : transformedTies) { + numOfTiesForSigma += count; + } + + // Calculate R1 and R2 and U. + float r1 = 0, r2 = 0; + for (Rank rank : ranks) { + if (rank.series == 1) r1 += rank.rank; + else r2 += rank.rank; + } + + double n1 = series1.length; + double n2 = series2.length; + double u1 = r1 - ((n1 * (n1 + 1)) / 2); + double u2 = r2 - ((n2 * (n2 + 1)) / 2); + + TestStatistic result = new TestStatistic(u1, u2, numOfTiesForSigma); + return result; } /** - * The Mann-Whitney U statistic follows a recursive equation (that enumerates the proportion of possible - * binary strings of "n" zeros, and "m" ones, where a one precedes a zero "u" times). This accessor - * calls into that recursive calculation. - * @param n: number of set-one entries (hypothesis: set one is stochastically less than set two) - * @param m: number of set-two entries - * @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 ) - * @param twoSided: whether the test is two sided or not. The recursive formula is symmetric, multiply by two for two-sidedness. - * @param mode: whether the mode is a point probability, or a cumulative distribution - * @return the probability under the hypothesis that all sequences are equally likely of finding a set-two entry preceding a set-one entry "u" times. + * Calculates the rank-sum test statisic U (sometimes W) from two sets of input data for a one-sided test + * with an int indicating which group is the dominator. Returns a test statistic object with trueU and number of + * ties for sigma. */ - @Requires({"m > 0","n > 0","u >= 0"}) - @Ensures({"result != null","! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"}) - public static Pair calculatePRecursively(int n, int m, long u, boolean twoSided, ExactMode mode) { - if ( m > 8 && n > 5 ) { throw new GATKException(String.format("Please use the appropriate (normal or sum of uniform) approximation. Values n: %d, m: %d",n,m)); } - double p = mode == ExactMode.POINT ? cpr(n,m,u) : cumulativeCPR(n,m,u); - //p *= twoSided ? 2.0 : 1.0; + public TestStatistic calculateOneSidedU(final double[] series1, final double[] series2, final TestType whichSeriesDominates) { + TestStatistic stat = calculateU1andU2(series1, series2); + TestStatistic result; + if (whichSeriesDominates == TestType.FIRST_DOMINATES) { + result = new TestStatistic(stat.getU1(), stat.getTies()); + } else { + result = new TestStatistic(stat.getU2(), stat.getTies()); + } + return result; + } + + /** + * Calculates the two-sided rank-sum test statisic U (sometimes W) from two sets of input data. + * Returns a test statistic object with trueU and number of ties for sigma. + */ + public TestStatistic calculateTwoSidedU(final double[] series1, final double[] series2) { + TestStatistic u1AndU2 = calculateU1andU2(series1, series2); + double u = Math.min(u1AndU2.getU1(), u1AndU2.getU2()); + TestStatistic result = new TestStatistic(u, u1AndU2.getTies()); + return result; + } + + /** + * Calculates the Z score (i.e. standard deviations from the mean) of the rank sum + * test statistics given input data of lengths n1 and n2 respectively, as well as the number of ties, for normal + * approximation only. + */ + public double calculateZ(final double u, final int n1, final int n2, final double nties, final TestType whichSide) { + double m = (n1 * n2) / 2d; + + //Adds a continuity correction + double correction; + if (whichSide == TestType.TWO_SIDED) { + correction = (u - m) >= 0 ? .5 : -.5; + } else { + correction = whichSide == TestType.FIRST_DOMINATES ? -.5 : .5; + } + + //If all the data is tied, the number of ties for sigma is set to 0. In order to get a p-value of .5 we need to + //remove the continuity correction. + if (nties == 0) { + correction = 0; + } + + double sigma = Math.sqrt((n1 * n2 / 12d) * ((n1 + n2 + 1) - nties / ((n1 + n2) * (n1 + n2 - 1)))); + return (u - m - correction) / sigma; + } + + /** + * Finds or calculates the median value of a sorted array of double. + */ + public double median(final double[] data) { + final int len = data.length; + final int mid = len / 2; + if (data.length % 2 == 0) { + return (data[mid] + data[mid - 1]) / 2d; + } else { + return data[mid]; + } + } + + + /** + * Constructs a new rank sum test with the given data. + * + * @param series1 group 1 data + * @param series2 group 2 data + * @param whichSide indicator of two sided test, 0 for two sided, 1 for series1 as dominator, 2 for series2 as dominator + * @return Result including U statistic, Z score, p-value, and difference in medians. + */ + public Result test(final double[] series1, final double[] series2, final TestType whichSide) { + final int n1 = series1.length; + final int n2 = series2.length; + + //If one of the groups is empty we return NaN + if (n1 == 0 || n2 == 0) { + return new Result(Float.NaN, Float.NaN, Float.NaN, Float.NaN); + } + + double u; + double nties; + + if (whichSide == TestType.TWO_SIDED) { + TestStatistic result = calculateTwoSidedU(series1, series2); + u = result.getTrueU(); + nties = result.getTies(); + } else { + TestStatistic result = calculateOneSidedU(series1, series2, whichSide); + u = result.getTrueU(); + nties = result.getTies(); + } + double z; - try { + double p; - if ( mode == ExactMode.CUMULATIVE ) { - z = APACHE_NORMAL.inverseCumulativeProbability(p); + if (n1 >= this.minimumNormalN || n2 >= this.minimumNormalN) { + z = calculateZ(u, n1, n2, nties, whichSide); + p = 2 * NORMAL.cumulativeProbability(NORMAL_MEAN + z * NORMAL_SD); + if (whichSide != TestType.TWO_SIDED) { + p = p / 2; + } + } else { + // TODO -- This exact test is only implemented for the one sided test, but we currently don't call the two sided version + Histogram distribution = permutationTest(series1, series2); + p = distribution.getCumulativeProbability(u); + if (p == 1) { + z = maxZScore; + } else if (p == 0) { + z = -1.0 * maxZScore; } else { - double sd = Math.sqrt((1.0+1.0/(1+n+m))*(n*m)*(1.0+n+m)/12); // biased variance empirically better fit to distribution then asymptotic variance - //System.out.printf("SD is %f and Max is %f and prob is %f%n",sd,1.0/Math.sqrt(sd*sd*2.0*Math.PI),p); - if ( p > 1.0/Math.sqrt(sd*sd*2.0*Math.PI) ) { // possible for p-value to be outside the range of the normal. Happens at the mean, so z is 0. - z = 0.0; - } else { - if ( u >= n*m/2 ) { - z = Math.sqrt(-2.0*(Math.log(sd)+Math.log(p)+LNSQRT2PI)); - } else { - z = -Math.sqrt(-2.0*(Math.log(sd)+Math.log(p)+LNSQRT2PI)); - } + z = NORMAL.inverseCumulativeProbability(p); + } + } + + return new Result(u, z, p, Math.abs(median(series1) - median(series2))); + } + + private void swap(Integer[] arr, int i, int j) { + int temp = arr[i]; + arr[i] = arr[j]; + arr[j] = temp; + } + + /** + * Uses a method that generates permutations in lexicographic order. (https://en.wikipedia.org/wiki/Permutation#Generation_in_lexicographic_order) + * @param temp Sorted list of elements to be permuted. + * @param allPermutations Empty set that will hold all possible permutations. + */ + private void calculatePermutations(Integer[] temp, Set> allPermutations) { + allPermutations.add(new ArrayList<>(Arrays.asList(temp))); + while (true) { + int k = -1; + for (int i = temp.length - 2; i >= 0; i--) { + if (temp[i] < temp[i + 1]) { + k = i; + break; } } - } catch (MathException me) { - throw new GATKException("A math exception occurred in inverting the probability",me); + if (k == -1) { + break; + } + + int l = -1; + for (int i = temp.length - 1; i >= k + 1; i--) { + if (temp[k] < temp[i]) { + l = i; + break; + } + } + + swap(temp, k, l); + + int end = temp.length - 1; + for (int begin = k + 1; begin < end; begin++) { + swap(temp, begin, end); + end--; + } + allPermutations.add(new ArrayList<>(Arrays.asList(temp))); } - - return new Pair(z,(twoSided ? 2.0*p : p)); } /** - * Hook into CPR with sufficient warning (for testing purposes) - * calls into that recursive calculation. - * @param n: number of set-one entries (hypothesis: set one is stochastically less than set two) - * @param m: number of set-two entries - * @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 ) - * @return same as cpr + * Checks to see if the permutations have already been computed before creating them from scratch. + * @param listToPermute List of tags in numerical order to be permuted + * @param numOfPermutations The number of permutations this list will have (n1+n2 choose n1) + * @return Set of all possible permutations for the given list. */ - protected static double calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(int n, int m, long u) { - return cpr(n,m,u); + Set> getPermutations(final Integer[] listToPermute, int numOfPermutations) { + Key key = new Key(listToPermute); + Set> permutations = PERMUTATIONS.get(key); + if (permutations == null) { + permutations = new HashSet<>(numOfPermutations); + calculatePermutations(listToPermute, permutations); + PERMUTATIONS.put(key, permutations); + } + return permutations; } /** - * For testing + * Creates histogram of test statistics from a permutation test. * - * @param n: number of set-one entries (hypothesis: set one is stochastically less than set two) - * @param m: number of set-two entries - * @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 ) + * @param series1 Data from group 1 + * @param series2 Data from group 2 + * @return Histogram with u calculated for every possible permutation of group tag. */ - protected static long countSequences(int n, int m, long u) { - if ( u < 0 ) { return 0; } - if ( m == 0 || n == 0 ) { return u == 0 ? 1 : 0; } + public Histogram permutationTest(final double[] series1, final double[] series2) { + final Histogram histo = new Histogram(); + final int n1 = series1.length; + final int n2 = series2.length; - return countSequences(n-1,m,u-m) + countSequences(n,m-1,u); - } + RankedData rankedGroups = calculateRank(series1, series2); + Rank[] ranks = rankedGroups.getRank(); - /** - * : just a shorter name for calculatePRecursively. See Mann, Whitney, [1947] - * @param n: number of set-1 entries - * @param m: number of set-2 entries - * @param u: number of times a set-2 entry as preceded a set-1 entry - * @return recursive p-value - */ - private static double cpr(int n, int m, long u) { - if ( u < 0 ) { - return 0.0; - } - if ( m == 0 || n == 0 ) { - // there are entries in set 1 or set 2, so no set-2 entry can precede a set-1 entry; thus u must be zero. - // note that this exists only for edification, as when we reach this point, the coefficient on this term is zero anyway - return ( u == 0 ) ? 1.0 : 0.0; - } + Integer[] firstPermutation = new Integer[n1 + n2]; - - return (((double)n)/(n+m))*cpr(n-1,m,u-m) + (((double)m)/(n+m))*cpr(n,m-1,u); - } - - private static double cumulativeCPR(int n, int m, long u ) { - // from above: - // the null hypothesis is that {N} is stochastically less than {M}, so U has counted - // occurrences of {M}s before {N}s. We would expect that this should be less than (n*m+1)/2 under - // the null hypothesis, so we want to integrate from K=0 to K=U for cumulative cases. Always. - double p = 0.0; - // optimization using symmetry, use the least amount of sums possible - long uSym = ( u <= n*m/2 ) ? u : ((long)n)*m-u; - for ( long uu = 0; uu < uSym; uu++ ) { - p += cpr(n,m,uu); - } - // correct by 1.0-p if the optimization above was used (e.g. 1-right tail = left tail) - return (u <= n*m/2) ? p : 1.0-p; - } - - /** - * hook into the data tree, for testing purposes only - * @return observations - */ - protected TreeSet> getObservations() { - return observations; - } - - /** - * hook into the set sizes, for testing purposes only - * @return size set 1, size set 2 - */ - protected Pair getSetSizes() { - return new Pair(sizeSet1,sizeSet2); - } - - /** - * Validates that observations are in the correct format for a MWU test -- this is only called by the contracts API during testing - * @param tree - the collection of labeled observations - * @return true iff the tree set is valid (no INFs or NaNs, at least one data point in each set) - */ - protected static boolean validateObservations(TreeSet> tree) { - boolean seen1 = false; - boolean seen2 = false; - boolean seenInvalid = false; - for ( Pair p : tree) { - if ( ! seen1 && p.getSecond() == USet.SET1 ) { - seen1 = true; + for (int i = 0; i < firstPermutation.length; i++) { + if (i < n1) { + firstPermutation[i] = 0; + } else { + firstPermutation[i] = 1; } + } - if ( ! seen2 && p.getSecond() == USet.SET2 ) { - seen2 = true; - } - - if ( Double.isNaN(p.getFirst().doubleValue()) || Double.isInfinite(p.getFirst().doubleValue())) { - seenInvalid = true; + final int numOfPerms = (int) MathUtils.binomialCoefficient(n1 + n2, n2); + Set> allPermutations = getPermutations(firstPermutation, numOfPerms); + + double[] newSeries1 = new double[n1]; + double[] newSeries2 = new double[n2]; + + //iterate over all permutations + for (List currPerm : allPermutations) { + int series1End = 0; + int series2End = 0; + for (int i = 0; i < currPerm.size(); i++) { + int grouping = currPerm.get(i); + if (grouping == 0) { + newSeries1[series1End] = ranks[i].rank; + series1End++; + } else { + newSeries2[series2End] = ranks[i].rank; + series2End++; + } } + assert (series1End == n1); + assert (series2End == n2); + double newU = MathUtils.sum(newSeries1) - ((n1 * (n1 + 1)) / 2.0); + histo.increment(newU); } - return ! seenInvalid && seen1 && seen2; + return histo; } - /** - * A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values - * and to ensure that rank ties are broken at random. - */ - private static class DitheringComparator implements Comparator>, Serializable { - - public DitheringComparator() {} - - @Override - public boolean equals(Object other) { return false; } - - @Override - public int compare(Pair left, Pair right) { - double comp = Double.compare(left.first.doubleValue(),right.first.doubleValue()); - if ( comp > 0 ) { return 1; } - if ( comp < 0 ) { return -1; } - return Utils.getRandomGenerator().nextBoolean() ? -1 : 1; - } - } - - /** - * A comparator that reaches into the pair and compares numbers without tie-braking. - */ - private static class NumberedPairComparator implements Comparator>, Serializable { - - public NumberedPairComparator() {} - - @Override - public boolean equals(Object other) { return false; } - - @Override - public int compare(Pair left, Pair right ) { - return Double.compare(left.first.doubleValue(),right.first.doubleValue()); - } - } - - public enum USet { SET1, SET2 } - public enum ExactMode { POINT, CUMULATIVE } - } diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MWUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MWUnitTest.java index fb0057898..9cbfca59c 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MWUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MWUnitTest.java @@ -25,13 +25,13 @@ package org.broadinstitute.gatk.utils; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.collections.Pair; - import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import org.testng.Assert; +import java.util.Arrays; + /** * Created by IntelliJ IDEA. * User: Ghost @@ -40,92 +40,75 @@ import org.testng.Assert; * To change this template use File | Settings | File Templates. */ public class MWUnitTest extends BaseTest { + private static double DELTA_PRECISION = 0.00001; + @BeforeClass public void init() { } - @Test - private void testMWU() { - logger.warn("Testing MWU"); - MannWhitneyU mwu = new MannWhitneyU(); - mwu.add(0, MannWhitneyU.USet.SET1); - mwu.add(1,MannWhitneyU.USet.SET2); - mwu.add(2,MannWhitneyU.USet.SET2); - mwu.add(3,MannWhitneyU.USet.SET2); - mwu.add(4,MannWhitneyU.USet.SET2); - mwu.add(5,MannWhitneyU.USet.SET2); - mwu.add(6,MannWhitneyU.USet.SET1); - mwu.add(7,MannWhitneyU.USet.SET1); - mwu.add(8,MannWhitneyU.USet.SET1); - mwu.add(9,MannWhitneyU.USet.SET1); - mwu.add(10,MannWhitneyU.USet.SET1); - mwu.add(11,MannWhitneyU.USet.SET2); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),25L); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),11L); + private static final MannWhitneyU rst = new MannWhitneyU(); - MannWhitneyU mwu2 = new MannWhitneyU(); - MannWhitneyU mwuNoDither = new MannWhitneyU(false); - for ( int dp : new int[]{2,4,5,6,8} ) { - mwu2.add(dp,MannWhitneyU.USet.SET1); - mwuNoDither.add(dp,MannWhitneyU.USet.SET1); - } + @Test(dataProvider = "rankSumTestData") + public void testSimpleU(String name, double[] series1, double[] series2, double U) { + MannWhitneyU.Result test = rst.test(series1, series2, MannWhitneyU.TestType.TWO_SIDED); + System.out.println("=========================================================="); + System.out.println("Series 1: " + Arrays.toString(series1)); + System.out.println("Series 2: " + Arrays.toString(series2)); + System.out.println(" U: " + test.getU()); + System.out.println(" Z: " + test.getZ()); + System.out.println("2-side p: " + test.getP()); + Assert.assertEquals(test.getU(), U, name); + } - for ( int dp : new int[]{1,3,7,9,10,11,12,13} ) { - mwu2.add(dp,MannWhitneyU.USet.SET2); - mwuNoDither.add(dp,MannWhitneyU.USet.SET2); - } + @Test(dataProvider = "oneSidedPTestData") + public void testOnesidedP(String name, double[] series1, double[] series2, double P) { + MannWhitneyU.Result test = rst.test(series1, series2, MannWhitneyU.TestType.FIRST_DOMINATES); + System.out.println("=========================================================="); + System.out.println("Series 1: " + Arrays.toString(series1)); + System.out.println("Series 2: " + Arrays.toString(series2)); + System.out.println(" U: " + test.getU()); + System.out.println(" Z: " + test.getZ()); + System.out.println("1-side p: " + test.getP()); + Assert.assertEquals(test.getP(), P, DELTA_PRECISION, name); + } - MannWhitneyU.ExactMode pm = MannWhitneyU.ExactMode.POINT; - MannWhitneyU.ExactMode cm = MannWhitneyU.ExactMode.CUMULATIVE; - - // tests using the hypothesis that set 2 dominates set 1 (U value = 10) - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET1),10L); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET2),30L); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET1),10L); - Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET2),30L); - - Pair sizes = mwu2.getSetSizes(); - - Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.first,sizes.second,10L),0.4180519701814064,1e-14); - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.first,sizes.second,10L,false,pm).second,0.021756021756021756,1e-14); - Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.first,sizes.second,10L,false).second,0.06214143703127617,1e-14); - logger.warn("Testing two-sided"); - Assert.assertEquals((double)mwu2.runTwoSidedTest().second,2*0.021756021756021756,1e-8); - - // tests using the hypothesis that set 1 dominates set 2 (U value = 30) -- empirical should be identical, normall approx close, uniform way off - Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.second,sizes.first,30L,true).second,2.0*0.08216463976903321,1e-14); - Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.second,sizes.first,30L),0.0023473625009559074,1e-14); - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,30L,false,pm).second,0.021756021756021756,1e-14); // note -- exactly same value as above - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,cm).second,1.0-0.08547008547008,1e-14); // r does a correction, subtracting 1 from U - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,cm).second,0.08547008547008,1e-14); // r does a correction, subtracting 1 from U - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,cm).first,-1.36918910442,1e-2); // apache inversion set to be good only to 1e-2 - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,cm).first,1.36918910442,1e-2); // apache inversion set to be good only to 1e-2 - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,pm).first,1.2558754796642067,1e-8); // PDF should be similar - Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,pm).first,-1.2558754796642067,1e-8); // PDF should be similar - Assert.assertEquals(MannWhitneyU.calculatePRecursively(4,5,10L,false,pm).second,0.0952381,1e-5); - Assert.assertEquals(MannWhitneyU.calculatePRecursively(4,5,10L,false,pm).first,0.0,1e-14); - - logger.warn("Set 1"); - Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET1).second,0.021756021756021756,1e-8); - logger.warn("Set 2"); - Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET2).second,0.021756021756021756,1e-8); - - MannWhitneyU mwu3 = new MannWhitneyU(); - for ( int dp : new int[]{0,2,4} ) { - mwu3.add(dp,MannWhitneyU.USet.SET1); - } - for ( int dp : new int[]{1,5,6,7,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34} ) { - mwu3.add(dp,MannWhitneyU.USet.SET2); - } - long u = MannWhitneyU.calculateOneSidedU(mwu3.getObservations(),MannWhitneyU.USet.SET1); - //logger.warn(String.format("U is: %d",u)); - Pair nums = mwu3.getSetSizes(); - //logger.warn(String.format("Corrected p is: %.4e",MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first,nums.second,u))); - //logger.warn(String.format("Counted sequences: %d",MannWhitneyU.countSequences(nums.first, nums.second, u))); - //logger.warn(String.format("Possible sequences: %d", (long) Arithmetic.binomial(nums.first+nums.second,nums.first))); - //logger.warn(String.format("Ratio: %.4e",MannWhitneyU.countSequences(nums.first,nums.second,u)/Arithmetic.binomial(nums.first+nums.second,nums.first))); - Assert.assertEquals(MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first, nums.second, u), 3.665689149560116E-4, 1e-14); - Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(nums.first,nums.second,u,false).second,0.0032240865760884696,1e-14); - Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(nums.first,nums.second,u),0.0026195003025784036,1e-14); + @DataProvider(name="rankSumTestData") + public Object[][] dataProvider() { + return new Object[][] { + new Object[] {"test1", new double[] {20,20,20,20,20}, new double[] {20,20,20,20,21}, 10d}, + new Object[] {"test2", new double[] {20,20,20,20,21}, new double[] {20,20,20,20,21}, 12.5d}, + new Object[] {"test3", new double[] {13,14,15,15,16}, new double[] {16,20,20,21,21}, 0.5d}, + new Object[] {"test4", new double[] {13,14,15,15,16}, new double[] {16,20,20,21,21,21,21,22,23,27}, 0.5d}, + new Object[] {"test5", new double[] {13,14,15,15,16,18,20,22,25,24,25,26,27,28,22,23,19,30,28,22,17}, + new double[] {16,20,20,21,21,21,21,22,23,27,26,28,29,32,31,22,21,19,16,24,29}, + 180.5d}, + new Object[] {"test6", new double[] {13,14,15,15,16,18,13,14,15,15,16,18,13,14,15,15,16,18,13,14,15,15,16,18}, + new double[] {21,22,23,27,26,28,29,21,22,23,27,26,28,29,21,22,23,27,26,28,29,21,22,23,27,26,28,29}, + 0d}, + new Object[] {"test7", new double[] {11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20}, + new double[] {12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21}, + 145d}, + new Object[] {"test7", new double[] {11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20}, + new double[] {12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21}, + 162d}, + new Object[] {"test8", new double[] {20,20,20,20,20}, new double[] {20,20,20,20,20,20,20,20,20,20}, 25d}, + }; + } + @DataProvider(name="oneSidedPTestData") + public Object[][] oneSidedDataProvider() { + return new Object[][] { + new Object[] {"test0", new double[] {0,0}, new double[] {1,1}, .16666666}, + new Object[] {"test1", new double[] {20,20,20,20,20}, new double[] {20,20,20,20,21}, .5}, + new Object[] {"test2", new double[] {20,20,20,20,21}, new double[] {20,20,20,20,21}, .77778}, + new Object[] {"test3", new double[] {13,14,15,15,16}, new double[] {16,20,20,21,21}, 0.0079365}, + new Object[] {"test4", new double[] {13,14,15,15,16}, new double[] {16,20,20,21,21,21,21,22,23,27}, 0.001469192}, + new Object[] {"test5", new double[] {20,20,20,20,20}, new double[] {20,20,20,20,20,20,20,20,20,20}, .5}, + new Object[] {"test6", new double[] {1,2,3,4,5}, new double[] {6,7,8,9,10}, .00396}, + new Object[] {"test7", new double[] {6,7,8,9,10}, new double[] {1,2,3,4,5}, 1}, + new Object[] {"test8", new double[] {16,20,20,21,21,21,21,22,23,27,16,20,20,21,21,21,21,22,23,27}, + new double[] {22,23,27,16,20,20,21,21,21,21,22,23,27,60,60}, .08303102}, + new Object[] {"test8", new double[] {16,20,20,21,21,21,21,21,20}, + new double[] {22,23,27,16,20,20,20,20,21}, .395763}, + }; } }