Removes Dithering from Rank Sum Test

Fixing empty group case

Fixing MD5s

First comments addressed

Added permutation test

Adding new RankSum to AS_RankSum

Speeding up permutation algorithm and updating MD5s

Missed a few tests

Addressing comments

Changing md5s
This commit is contained in:
meganshand 2015-10-21 09:27:52 -04:00
parent 3b327ac0e4
commit c7e0f5b225
17 changed files with 677 additions and 650 deletions

View File

@ -59,11 +59,7 @@ import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ReducibleAnnotation;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller;
import org.broadinstitute.gatk.tools.walkers.variantutils.CombineGVCFs;
import org.broadinstitute.gatk.tools.walkers.variantutils.GenotypeGVCFs;
import org.broadinstitute.gatk.utils.MannWhitneyU;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
@ -298,14 +294,16 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
for (final Allele alt : perAlleleValues.keySet()) {
if (alt.equals(ref, false))
continue;
final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering);
//load alts
final MannWhitneyU mannWhitneyU = new MannWhitneyU();
//load alts (series 1)
List<Double> alts = new ArrayList<>();
for (final Number qual : perAlleleValues.get(alt)) {
mannWhitneyU.add(qual, MannWhitneyU.USet.SET1);
alts.add((double) qual.intValue());
}
//load refs
//load refs (series 2)
List<Double> refs = new ArrayList<>();
for (final Number qual : perAlleleValues.get(ref)) {
mannWhitneyU.add(qual, MannWhitneyU.USet.SET2);
refs.add((double) qual.intValue());
}
if (DEBUG) {
@ -320,8 +318,8 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
}
// we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases)
final Pair<Double, Double> testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1);
perAltRankSumResults.put(alt, testResults.first);
final MannWhitneyU.Result result = mannWhitneyU.test(convertToArray(alts), convertToArray(refs), MannWhitneyU.TestType.FIRST_DOMINATES);
perAltRankSumResults.put(alt, result.getZ());
}
return perAltRankSumResults;
}

View File

@ -51,7 +51,6 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.*;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
@ -61,8 +60,6 @@ import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.MannWhitneyU;
import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import htsjdk.variant.variantcontext.Allele;
@ -79,7 +76,6 @@ import java.util.*;
//TODO: will eventually implement ReducibleAnnotation in order to preserve accuracy for CombineGVCFs and GenotypeGVCFs -- see RMSAnnotation.java for an example of an abstract ReducibleAnnotation
public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
static final boolean DEBUG = false;
protected boolean useDithering = true;
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
@ -122,13 +118,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
if ( refQuals.isEmpty() && altQuals.isEmpty() )
return null;
final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering);
for (final Double qual : altQuals) {
mannWhitneyU.add(qual, MannWhitneyU.USet.SET1);
}
for (final Double qual : refQuals) {
mannWhitneyU.add(qual, MannWhitneyU.USet.SET2);
}
final MannWhitneyU mannWhitneyU = new MannWhitneyU();
if (DEBUG) {
System.out.format("%s, REF QUALS:", this.getClass().getName());
@ -142,14 +132,26 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
}
// we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases)
final Pair<Double, Double> testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1);
final MannWhitneyU.Result result = mannWhitneyU.test(convertToArray(altQuals), convertToArray(refQuals), MannWhitneyU.TestType.FIRST_DOMINATES);
final double zScore = result.getZ();
final Map<String, Object> map = new HashMap<>();
if (!Double.isNaN(testResults.first))
map.put(getKeyNames().get(0), String.format("%.3f", testResults.first));
if (!Double.isNaN(zScore))
map.put(getKeyNames().get(0), String.format("%.3f", zScore));
return map;
}
/**
 * Converts a boxed {@code List<Double>} into a primitive {@code double[]},
 * preserving element order. Used to adapt quality-score lists to the
 * array-based {@code MannWhitneyU.test(...)} API.
 *
 * @param list the boxed values to convert; must not be null and must not
 *             contain null elements (a null element throws NullPointerException,
 *             matching the original unboxing behavior)
 * @return a newly allocated array containing the unboxed values in list order;
 *         empty array for an empty list
 */
public static double[] convertToArray(List<Double> list){
    // Stream-based unboxing replaces the manual Iterator loop; identical
    // result, single pass, and no explicit index bookkeeping.
    return list.stream().mapToDouble(Double::doubleValue).toArray();
}
private void fillQualsFromPileup(final List<Allele> alleles,
final ReadBackedPileup pileup,
final List<Double> refQuals,
@ -255,14 +257,4 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
read.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE );
}
/**
* Initialize the rank sum test annotation using walker and engine information. Right now this checks to see if
* engine randomization is turned off, and if so does not dither.
* @param walker the walker
* @param toolkit the GATK engine
* @param headerLines the header lines
*/
public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ) {
useDithering = ! toolkit.getArguments().disableDithering;
}
}

View File

@ -65,7 +65,7 @@ import java.util.List;
public class RankSumUnitTest {
List<Integer> distribution20, distribution30, distribution20_40;
List<Double> distribution20, distribution30, distribution20_40;
static final int observations = 100;
@BeforeClass
@ -86,11 +86,11 @@ public class RankSumUnitTest {
Collections.shuffle(distribution20_40, Utils.getRandomGenerator());
}
private static void makeDistribution(final List<Integer> result, final int target, final int skew, final int numObservations) {
private static void makeDistribution(final List<Double> result, final int target, final int skew, final int numObservations) {
final int rangeStart = target - skew;
final int rangeEnd = target + skew;
int current = rangeStart;
double current = rangeStart;
for ( int i = 0; i < numObservations; i++ ) {
result.add(current++);
if ( current > rangeEnd )
@ -118,40 +118,33 @@ public class RankSumUnitTest {
}
@Test(enabled = true, dataProvider = "DistributionData")
public void testDistribution(final List<Integer> distribution1, final List<Integer> distribution2, final int numToReduceIn2, final boolean distributionsShouldBeEqual, final String debugString) {
final MannWhitneyU mannWhitneyU = new MannWhitneyU(true);
public void testDistribution(final List<Double> distribution1, final List<Double> distribution2, final int numToReduceIn2, final boolean distributionsShouldBeEqual, final String debugString) {
final MannWhitneyU mannWhitneyU = new MannWhitneyU();
for ( final Integer num : distribution1 )
mannWhitneyU.add(num, MannWhitneyU.USet.SET1);
final List<Integer> dist2 = new ArrayList<>(distribution2);
final List<Double> dist2 = new ArrayList<>(distribution2);
if ( numToReduceIn2 > 0 ) {
int counts = 0;
int quals = 0;
Double counts = 0.0;
Double quals = 0.0;
for ( int i = 0; i < numToReduceIn2; i++ ) {
counts++;
quals += dist2.remove(0);
}
final int qual = quals / counts;
final Double qual = quals / counts;
for ( int i = 0; i < numToReduceIn2; i++ )
dist2.add(qual);
}
for ( final Integer num : dist2 )
mannWhitneyU.add(num, MannWhitneyU.USet.SET2);
final Double result = mannWhitneyU.runTwoSidedTest().second;
final Double result = mannWhitneyU.test(RankSumTest.convertToArray(distribution1),RankSumTest.convertToArray(dist2), MannWhitneyU.TestType.TWO_SIDED).getP();
Assert.assertFalse(Double.isNaN(result));
if ( distributionsShouldBeEqual ) {
// TODO -- THIS IS THE FAILURE POINT OF USING REDUCED READS WITH RANK SUM TESTS
if ( numToReduceIn2 >= observations / 2 )
return;
Assert.assertTrue(result > 0.1, String.format("%f %d %d", result, numToReduceIn2, dist2.get(0)));
Assert.assertTrue(result > 0.1, String.format("%f %d %f", result, numToReduceIn2, dist2.get(0)));
} else {
Assert.assertTrue(result < 0.01, String.format("%f %d %d", result, numToReduceIn2, dist2.get(0)));
Assert.assertTrue(result < 0.01, String.format("%f %d %f", result, numToReduceIn2, dist2.get(0)));
}
}
}

View File

@ -100,7 +100,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testHasAnnotsAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("d6cd81fc2f483f29d44fbb27d1772841"));
Arrays.asList("832861ecdc6344cfb7097d02903ffd4d"));
executeTest("test file has annotations, asking for annotations, #1", spec);
}
@ -108,7 +108,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testHasAnnotsAsking2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
Arrays.asList("300836de4e2c8424734d2ee0ca4261c1"));
Arrays.asList("5120f5f419cb32b37d5c56101f4c5da2"));
executeTest("test file has annotations, asking for annotations, #2", spec);
}
@ -134,7 +134,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoAnnotsAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("192f393da4e28aecf16112562e65083a"));
Arrays.asList("95a4be70c46496fb6791fdd0c3bcf8a3"));
executeTest("test file doesn't have annotations, asking for annotations, #1", spec);
}
@ -142,7 +142,8 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoAnnotsAsking2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
Arrays.asList("52baff55535f7c87545a7818052a2d5c"));
Arrays.asList("8436bf4acb81f2c03bb5ecf8a514d90a"));
executeTest("test file doesn't have annotations, asking for annotations, #2", spec);
}
@ -150,7 +151,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testExcludeAnnotations() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + STANDARD_ANNOTATIONS + "-XA FisherStrand -XA ReadPosRankSumTest --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("334915d90fa92ee9fa07d4647912ceac"));
Arrays.asList("4995f77bedf7f476a15d46b5a8a392bd"));
executeTest("test exclude annotations", spec);
}
@ -183,7 +184,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testOverwritingHeader() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1,
Arrays.asList("ab84654ac412a0aaaec99e86e357f0fd"));
Arrays.asList("de96b1e62f414b107430d179a154534d"));
executeTest("test overwriting header", spec);
}

View File

@ -74,12 +74,12 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe
@Test(enabled = true)
public void testBOTH_GGA_Pools() {
executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "217d9108c3014261dbe8befa383a2226");
executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "1b834608171f67e52a4e7617458a3ba6");
}
@Test(enabled = true)
public void testINDEL_GGA_Pools() {
executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "6a7f00e7f26cbc1891f40c9ed8b6579b");
executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "5b9e08bb141c48f826dc513066cb8a13");
}
@Test(enabled = true)
@ -88,6 +88,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe
//TODO the old MD5 is kept for the record.
//TODO this should be revisit once we get into addressing inaccuracies by the independent allele approach.
// executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "b5ff7530827f4b9039a58bdc8a3560d2");
executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "c0271a4c281991a86490c1955456af26");
executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "8449ea4fa85eac8834b049a0e3daee5a");
}
}

View File

@ -63,16 +63,16 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","c445bcd828c540c009bc8fc9f48916bc");
executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","2a0dcd92f3a07f45b7927e66404bc37e");
}
@Test(enabled = true)
public void testMT_SNP_DISCOVERY_sp4() {
executor.PC_MT_Test(CEUTRIO_BAM, "-A AlleleCountBySample -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","8d8c7926ca9c68251c41eb03b8efaba2");
executor.PC_MT_Test(CEUTRIO_BAM, "-A AlleleCountBySample -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","b9f5d1db0d2f5eb00eeb72ea29130de6");
}
@Test(enabled = true)
public void testMT_SNP_GGA_sp10() {
executor.PC_MT_Test(CEUTRIO_BAM, String.format("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "34242c0ae59b0645fb6df5c322e92f01");
executor.PC_MT_Test(CEUTRIO_BAM, String.format("-A AlleleCountBySample -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "d90e41081b4a910a50116ff18c311245");
}
}

View File

@ -78,7 +78,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("b132da42e1017c8825d84a06cf79a1e9"));
Arrays.asList("f78975d22ac27a841d807765242d3c3f"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@ -92,7 +92,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("23e32822d81f198ee52676f24ef74343"));
Arrays.asList("75381df9490adf4206ebbfbe0f49226e"));
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
}
@ -105,7 +105,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("93580986b871890a4fc86ebf98877efa"));
Arrays.asList("e89c2a0d30a409087af25fc06fd46404"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -115,7 +115,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("4da9878ee36d0fb387d5a58631383ffa"));
Arrays.asList("af5475e45b6ffca990dd035448e6b0fe"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
}
@ -125,7 +125,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("9ab4f5306438b5a64cb2cb90020b7b82"));
Arrays.asList("f9b4ed435a53198d30199e8a5c98252d"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
}
@ -140,7 +140,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("6de8d740908e22b46785d5eba6278eb2"));
Arrays.asList("feef70172d1edf2f9782143c8bbbf1d0"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
@ -181,7 +181,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction0() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
Arrays.asList("56673c166ba25f625d026e593d5cc667"));
Arrays.asList("0369f9c03df86486709a35a5a2f23192"));
executeTest("test minIndelFraction 0.0", spec);
}
@ -189,7 +189,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction25() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
Arrays.asList("2ce1754313561b9cb134152b8f98bf43"));
Arrays.asList("8bd1f903445255f24bd9ef677e1ba9a8"));
executeTest("test minIndelFraction 0.25", spec);
}

View File

@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinBaseQualityScore() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1,
Arrays.asList("21369e50334d2b77b0e638e47e1b8c64"));
Arrays.asList("b4201a4d30b6ed5c6fc80248676be5ff"));
executeTest("test min_base_quality_score 26", spec);
}
@ -94,7 +94,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSLOD() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("e58d9a5758c5b11f86558608260d93d5"));
Arrays.asList("45f867c73dc3008e874eb3c009cd9674"));
executeTest("test SLOD", spec);
}
@ -102,7 +102,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testNDA() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("fc8cdf9eeb475773303809c077f83c65"));
Arrays.asList("905e9a2ae183fc152101314774707372"));
executeTest("test NDA", spec);
}
@ -110,7 +110,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testCompTrack() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("c98294d321bde3e1e3c4fcee3e88d6d9"));
Arrays.asList("ac14f19fa1e9cecc8246814eac393662"));
executeTest("test using comp track", spec);
}
@ -124,17 +124,17 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testOutputParameterSitesOnly() {
testOutputParameters("-sites_only", "5b8938ed55a2b7ae8a52056c9130367b");
testOutputParameters("-sites_only", "be100d3f04592d0865d907177ee8c7e4");
}
@Test
public void testOutputParameterAllConfident() {
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "f353b36db7305f47963446220e39debe");
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "9024aa3ccf7339b873444b792ff5cb91");
}
@Test
public void testOutputParameterAllSites() {
testOutputParameters("--output_mode EMIT_ALL_SITES", "364aec53db79d20698fe0d088828736f");
testOutputParameters("--output_mode EMIT_ALL_SITES", "0d1e1fcc70fb4c159eac016f3085e5e9");
}
private void testOutputParameters(final String args, final String md5) {
@ -148,7 +148,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testConfidence() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
Arrays.asList("ec59b34bedf40d70850ab5ffe42bbddd"));
Arrays.asList("13d2cc1e0871f85a97fb1f73c052ea3e"));
executeTest("test confidence 1", spec1);
}
@ -156,7 +156,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testNoPrior() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.33333 -inputPrior 0.33333", 1,
Arrays.asList("7e8a51e658debdaadbcf17761ed011da"));
Arrays.asList("d94ab69be622fb23c97f524234eb05b3"));
executeTest("test no prior 1", spec1);
}
@ -165,7 +165,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testUserPrior() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.001 -inputPrior 0.495", 1,
Arrays.asList("b3514f5b3510b6667fd2c85ecc529de7"));
Arrays.asList("2f3c5fa06537c8a3f440fb70837702f6"));
executeTest("test user prior 1", spec1);
}
@ -174,7 +174,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void emitPLsAtAllSites() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --output_mode EMIT_ALL_SITES -allSitePLs", 1,
Arrays.asList("6fd14930acb08f0dd9749a8c4d7df831"));
Arrays.asList("76f7a8508ad40cf35b68b07bc7e29a72"));
// GDA: TODO: BCF encoder/decoder doesn't seem to support non-standard values in genotype fields. IE even if there is a field defined in FORMAT and in the header the BCF2 encoder will still fail
spec1.disableShadowBCF();
@ -190,12 +190,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testHeterozyosity1() {
testHeterozosity( 0.01, "815f01ef28bf576beb5528ac6fdd5248" );
testHeterozosity( 0.01, "e3c708bfeea99835512c1fac477d02e4" );
}
@Test
public void testHeterozyosity2() {
testHeterozosity( 1.0 / 1850, "8dd14ba8ef6314a99921849b2544b8c6" );
testHeterozosity( 1.0 / 1850, "1226aaa771b8f057646474d306a84866" );
}
private void testHeterozosity(final double arg, final String md5) {
@ -274,7 +274,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("34980dbce4fcd2aa21c46ea0e1897422"));
Arrays.asList("589f9dd567bb86ce74e5d48fd7cae0b2"));
executeTest(String.format("test multiple technologies"), spec);
}
@ -293,7 +293,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
Arrays.asList("5f02a7449305b26aab3ff994dfb53fda"));
Arrays.asList("be978fc8807fcbd514b2678adcdc2a28"));
executeTest(String.format("test calling with BAQ"), spec);
}
@ -310,7 +310,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000 " +
"-A SnpEff",
1,
Arrays.asList("b61c0dece2d77544f9313c24191e0089"));
Arrays.asList("52bac28a85b85d9596d668d3f934f792"));
executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec);
}

View File

@ -70,7 +70,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("0f9eff0ad2f8cb8e277922bd037825f7"));
Arrays.asList("46dd44e69ff078a48057ff08d278a293"));
executeTest("test MultiSample Pilot1", spec);
}
@ -78,7 +78,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testWithAllelesPassedIn1() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("196979c91f84c01fb5d89e73339c09fa"));
Arrays.asList("a19e5f74606cd980c3083d41d23b4acb"));
executeTest("test MultiSample Pilot2 with alleles passed in", spec1);
}
@ -86,7 +86,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testWithAllelesPassedIn2() {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("391c4e1437a3fa5c99584b466a56d7bb"));
Arrays.asList("fe3aec195ceb9180b6ffc474a4bb37b3"));
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
}
@ -94,7 +94,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("41b1d92cdea4581f7843b714345547d6"));
Arrays.asList("21b346710ccdbbfc904eafda2b0c7443"));
executeTest("test SingleSample Pilot2", spec);
}
@ -102,7 +102,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("4ae7c090dfb85f6286f1187a746def58"));
Arrays.asList("03e6cb2c81775b94c81af45560062705"));
executeTest("test Multiple SNP alleles", spec);
}
@ -118,7 +118,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("5c2c49bb5e276df933b2d264e9d4a327"));
Arrays.asList("710d308a32012911fc275042b612e3d0"));
executeTest("test reverse trim", spec);
}
@ -126,7 +126,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMismatchedPLs() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
Arrays.asList("ed5f90897c9a348e4f861eff8992b4e2"));
Arrays.asList("55f9bd98155a7a4b768fba97f90de282"));
executeTest("test mismatched PLs", spec);
}
}

View File

@ -72,7 +72,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleComplex1() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "066f1ce9e9826bcfedf6cd80bc560ab8");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6be5c8cbedea0a571c98fb1f4718042f");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -96,13 +96,13 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleGGAComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
"2a5de432f04198737732064206c7d63d");
"af3a490e5f41e890c59927426ac0fe9a");
}
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"afaa41101f492d37f57bb18cc638c6bc");
"12b1826ee23243c64a002c3cbdfa569a");
}
private void HCTestComplexConsensusMode(String bam, String args, String md5) {
@ -114,7 +114,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleConsensusModeComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337",
"d42ba795fe346d8372cae3c00c9c2f23");
"9765065daf3a008bd92c755d882c5f07");
}
}

View File

@ -79,12 +79,12 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
//TODO this might need to be addressed at some point.
//TODO the following test is commented out for the record
//tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "7f09c261950bf86e435edfa69ed2ec71"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "02300a1f64e085cc0f4420d8160743c1"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "f71ea433b1334a2d146cc6ad76b46d98"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "daddb5349c34e9190f0563e220894748"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "03e5815cc351f1ec2feed89d2aed8268"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "5d5cca382bdf6987b2aef87918ed374c"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "532888aff96c356397a21ef790636818"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "1c1ca2d76bc5d7dd45a5e7aef756ad8f"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "7ddfd33f2efcd2617d896826434fb43c"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "30accd7a171e9a13969fa543fde3fed1"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "20d896bbf750739f937ccd2fb152d902"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "99dd66a5b36ba9e2d1b0f408ecfbb50b"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "72d90fe0f5641b373744f75a21d4d14c"});
return tests.toArray(new Object[][]{});
}
@ -98,13 +98,13 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "cfe629c5a3be3b6524258ad1f9145488"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "12fbfffa4bb2b8d520f8021a40b37d19"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a5ea6d4052bbf9e8bba9011bc6f0d203"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "31da2254620f4a9c34ccf7c311cc133f"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "c95f52fe395392331dc3102902d54408"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "9b2914cf121c411aad3a65cfdc98c1d4"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "259110565155a3a27ab3e98113bedef8"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "8058ce63339701d33425769217bffed1"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "3abdde9d9ddbe9ec7cef28c65844a409"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "f5f0cef38a529e1542a8fd8bcce7ab1e"});
final String NA12878bandedResolutionMD5 = "fbe6099d138a069a65e4713bcae1e873";
final String NA12878bandedResolutionMD5 = "0f848ee7240ca48827bdfb85b455c1ad";
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, NA12878bandedResolutionMD5});
tests.add(new Object[]{NA12878_WEx + " -I " + privateTestDir + "NA20313.highCoverageRegion.bam -sn NA12878",
ReferenceConfidenceMode.GVCF, WExIntervals, NA12878bandedResolutionMD5});
@ -121,12 +121,12 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "5088d6cf80a6da7c769f97b1ab44c745"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "0895f09731d0ef89ec131c9f75aafe70"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "195017f498d3508e6eee492eb00da97b"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "4f8e3e249509a24da21d5dd8e3594f92"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "ef96f5295b048ef31f5ba82d078a44a2"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "cd809158689ddbbfd18a4eaae016f9a0"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "1f99396292974df55ef6497be79b1917"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "eabb44785fd4a84ade3c674a0bda16d9"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "904feadac0e4a2a0c6b2cc7e55718f3b"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "b430a401310b3812346d7496f9c62011"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "6e481747c50734b7fb0c4de39405044f"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "cff5e49456d5ecc0dafd31cd014def4f"});
return tests.toArray(new Object[][]{});
}
@ -139,12 +139,12 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "bb85582fcc2ce45407640fa8a70421ab"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "e01603c5d4c3142978a91b8cd6a98618"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "b9e471935b960a7dba3eb2b13939ccaf"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "c724ecdaea7eab5a6239ff4daaa6e034"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "194d27bd2321ff1a8b895a4e9a8d2938"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "4a3dfcfc2f5d27b75725346d63e0b83a"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "50156c819a0096fa22ed1b9749affecc"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "d91aace2100dc040659d77f366053a2e"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "d86dc51757e69aa3f2608fbbfaa90069"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "27c73b8b1ec384a880bf60daf0b0f80e"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "a6c70203e43d62b42c4b751fe9018410"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "7ebbbcab78d090d70a24819093812748"});
return tests.toArray(new Object[][]{});
}
@ -270,7 +270,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testWrongGVCFNonVariantRecordOrderBugFix() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, WRONG_GVCF_RECORD_ORDER_BUGFIX_BAM, WRONG_GVCF_RECORD_ORDER_BUGFIX_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("9b3b232ceeb109f2624826ea20825a82"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a22bafa2f91c197ea1ab13e02a0c08fb"));
spec.disableShadowBCF();
executeTest("testMissingGVCFIndexingStrategyException", spec);
}
@ -320,7 +320,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a53ac6cb2379d8adccfca6495e32e225"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("9db19d4706e32ce883c6e3f71e5f4178"));
spec.disableShadowBCF();
executeTest(" testAlleleSpecificAnnotations", spec);
}
@ -329,7 +329,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testASMQMateRankSumAnnotation() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A AS_MQMateRankSumTest --disableDithering",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("b3c0eccf035c0c5d3ac09dc93b8ee97a"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("85b26109c4586ba33c44e253297050be"));
spec.disableShadowBCF();
executeTest(" testASMQMateRankSumAnnotation", spec);
}
@ -338,7 +338,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testASInsertSizeRankSum() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering -A AS_InsertSizeRankSum",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("207e9c996f5c289a33d178c16d854bd0"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("f1531ed03d6f9538d4d6af7630ff2502"));
spec.disableShadowBCF();
executeTest(" testASInsertSizeRankSum", spec);
}

View File

@ -96,82 +96,82 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() throws IOException {
HCTest(CEUTRIO_BAM, "", "bdc5a942a7df2d4c61388ecc74a163cf");
HCTest(CEUTRIO_BAM, "", "b86311f6a0b4b5f2cfd05662f03fbe6c");
}
@Test
public void testHaplotypeCallerSingleSample() throws IOException {
HCTest(NA12878_BAM, "", "5941f88fa1372eb598880a4711ac08de");
HCTest(NA12878_BAM, "", "1ceedd80a36e042a93d60e25b94c0b03");
}
@Test
public void testHaplotypeCallerMultiSampleHaploid() throws IOException {
HCTest(CEUTRIO_BAM, "-ploidy 1", "4c32640204a11fe46a23d97b1012bca2");
HCTest(CEUTRIO_BAM, "-ploidy 1", "a7e83df05d8ef3558fa98e8c6a86e12d");
}
@Test
public void testHaplotypeCallerSingleSampleHaploid() throws IOException {
HCTest(NA12878_BAM, "-ploidy 1", "e09c5fd862f86f69289d847e3b293e19");
HCTest(NA12878_BAM, "-ploidy 1", "6e9855e22ed78a60eaaf0a06ed967b77");
}
@Test
public void testHaplotypeCallerSingleSampleTetraploid() throws IOException {
HCTest(NA12878_BAM, "-ploidy 4", "bbae9f77683591200b2034d5461e6425");
HCTest(NA12878_BAM, "-ploidy 4", "ae463cceb8cb0937440cfe5b56904feb");
}
@Test
public void testHaplotypeCallerMinBaseQuality() throws IOException {
HCTest(NA12878_BAM, "-mbq 15", "5941f88fa1372eb598880a4711ac08de");
HCTest(NA12878_BAM, "-mbq 15", "1ceedd80a36e042a93d60e25b94c0b03");
}
@Test
public void testHaplotypeCallerMinBaseQualityHaploid() throws IOException {
HCTest(NA12878_BAM, "-mbq 15 -ploidy 1", "e09c5fd862f86f69289d847e3b293e19");
HCTest(NA12878_BAM, "-mbq 15 -ploidy 1", "6e9855e22ed78a60eaaf0a06ed967b77");
}
@Test
public void testHaplotypeCallerMinBaseQualityTetraploid() throws IOException {
HCTest(NA12878_BAM, "-mbq 15 -ploidy 4", "bbae9f77683591200b2034d5461e6425");
HCTest(NA12878_BAM, "-mbq 15 -ploidy 4", "ae463cceb8cb0937440cfe5b56904feb");
}
@Test
public void testHaplotypeCallerGraphBasedSingleSample() throws IOException {
HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "9a4ba54146449914b1fbaed5465b692e");
HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "58abaabab25b9a75aaf17ab48478f42a");
}
@Test
public void testHaplotypeCallerGraphBasedMultiSampleHaploid() throws IOException {
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased -ploidy 1", "5333e3ff4d7fc53624eab801250be4f0");
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased -ploidy 1", "d7c7ff3883e8a50be38ec32b1a8fa5b4");
}
@Test
public void testHaplotypeCallerGraphBasedMultiSample() throws IOException {
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "e9759bf254b7b4d06e4a68c94a417cf1");
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "ccec6b941d41ac2642450964c3ab85d7");
}
@Test
public void testHaplotypeCallerSingleSampleWithDbsnp() throws IOException {
HCTest(NA12878_BAM, "-D " + b37dbSNP132, "7466c4d9ce6a1d23bcb428fc9f446843");
HCTest(NA12878_BAM, "-D " + b37dbSNP132, "48a008f41644c8e4bb256141d095a140");
}
@Test
public void testHaplotypeCallerMultiSampleGGA() throws IOException {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf" +
" -isr INTERSECTION -L " + GGA_INTERVALS_FILE,
"c2fe156912e59626c0393b9b47c9419e");
"b7d2576ac00c175f4136b2bfbffc4dcd");
}
@Test
public void testHaplotypeCallerMultiSampleGGAHaploid() throws IOException {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -ploidy 1 -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf -isr INTERSECTION -L 20:10080000-10100000",
"49f737983375c740361daa2d48ed0249");
"07e0a87e762173bcd6bba5e7af704159");
}
@Test
public void testHaplotypeCallerMultiSampleGGATetraploid() throws IOException {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -ploidy 4 -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf -isr INTERSECTION -L 20:10080000-10100000",
"6ba0fa930899f70fee9a7b1161508f93");
"be7315e2866eccfb951ed63090f9119c");
}
@Test
@ -187,7 +187,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "22028b104dd60b23a399e5e3a877a1fb");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "6f5678f8fb6a4c0b21df1e6b0d1019ee");
}
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
@ -224,7 +224,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerNearbySmallIntervals() {
HCTestNearbySmallIntervals(NA12878_BAM, "", "6e55de7bf49ecdc71f6d4c9565a19853");
HCTestNearbySmallIntervals(NA12878_BAM, "", "796f4a44a29fc3c1a1461f90aef45846");
}
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
@ -234,7 +234,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("f8774326a268f7d394b333818d15d05c"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("392ac1966cfa1cae78e8d674065a0d28"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@ -296,7 +296,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,090,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("ab816ff6facc08acb19c55bd6e828f02"));
Arrays.asList("5430b91902813cf64f5bb781b25d76c6"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -305,7 +305,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,100,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("fd1a539e14902f6957eb939aac1412f0"));
Arrays.asList("8b775d84ea981f1e8207fa69a92b5e1f"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
@ -313,7 +313,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGSGraphBased() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,090,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("1810c35e8298dff4aa1b7b04fb5f4962"));
Arrays.asList("f6f5b7e9348fc6dccd63110de0371d78"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -322,7 +322,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("8a06a53388d73ec667b353379f3b351e"));
Arrays.asList("82cd0142146521fd659905737dc6192c"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
@ -345,7 +345,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestAggressivePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,270,000-10,300,000", 1,
Arrays.asList("be71ab16d55437cbe3005ea3b93cece6"));
Arrays.asList("c43caebd09b69dbc9d1e4b5f5f449712"));
executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec);
}
@ -353,7 +353,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestConservativePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,270,000-10,300,000", 1,
Arrays.asList("8a5185d0a9400c8b3f4b12da65181c4b"));
Arrays.asList("54e7595f6d82a69566f4c0163045688d"));
executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
}
@ -382,7 +382,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testLackSensitivityDueToBadHaplotypeSelectionFix() {
final String commandLine = String.format("-T HaplotypeCaller -pairHMMSub %s %s -R %s -I %s -L %s --no_cmdline_in_header --maxNumHaplotypesInPopulation 16",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list");
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("02ec3090c4a6359fa10e6d8b30e1d5a2"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("8d44a2e1967e6e844f221960d6ea42bb"));
spec.disableShadowBCF();
executeTest("testLackSensitivityDueToBadHaplotypeSelectionFix", spec);
}
@ -391,7 +391,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testMissingKeyAlternativeHaplotypesBugFix() {
final String commandLine = String.format("-T HaplotypeCaller -pairHMMSub %s %s -R %s -I %s -L %s --no_cmdline_in_header ",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReferenceWithDecoy, privateTestDir + "lost-alt-key-hap.bam", privateTestDir + "lost-alt-key-hap.interval_list");
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("13c9f094b9c54960dc2fd3a1815a2645"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("cf8912f84153b2357a4d1ddb361f786f"));
spec.disableShadowBCF();
executeTest("testMissingKeyAlternativeHaplotypesBugFix", spec);
}
@ -468,12 +468,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerTandemRepeatAnnotator() throws IOException{
HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A TandemRepeatAnnotator -XA MappingQualityZero -XA SpanningDeletions", "03738462e7f0b6f149f40b790a3a7261");
HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A TandemRepeatAnnotator -XA MappingQualityZero -XA SpanningDeletions", "abc2daf88b9c5b3c0989f5d06df84ab3");
}
@Test
public void testHBaseCountsBySample() throws IOException{
HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A BaseCountsBySample", "8739d92898113436666f1cff3bc39bc5");
HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A BaseCountsBySample", "3e0ee74ec384ae38bda8ac15d8d67cca");
}
}

View File

@ -260,7 +260,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + gVCF.getAbsolutePath(), b37KGReference),
1,
Arrays.asList("eeff965cd79c0b7085c7d4d7ecf82b68"));
Arrays.asList("d8eb4a64a2ae7e7dad1efc4fe8b4b3ed"));
spec.disableShadowBCF(); //TODO: Remove when BaseTest.assertAttributesEquals() works with SAC
executeTest("testStrandAlleleCountsBySample", spec);
}
@ -566,7 +566,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("35daaea8dea591d35ca99854c8d36e5f"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("7261cde50d4556c6dc190b10222a8538"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations", spec);
}
@ -575,7 +575,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testASMateRankSumAnnotation() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_MQMateRankSumTest --disableDithering -V "
+ privateTestDir + "NA12878.AS.MateRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.MateRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("f8bf55e16358b35449621f896b084b7a"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("08dc30527c98d78d09f8575efc3b84ae"));
spec.disableShadowBCF();
executeTest("testASMateRankSumAnnotation", spec);
}
@ -584,7 +584,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testASInsertSizeRankSumAnnotation() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
+ privateTestDir + "NA12878.AS.InsertSizeRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.InsertSizeRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("8a27e5fbcb0d8f91684c2887167eb043"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("723522fe1d05f2d11efda008b65884d2"));
spec.disableShadowBCF();
executeTest("testASInsertSizeRankSumAnnotation", spec);
}
@ -597,7 +597,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations_oneSample() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("b6026b2a2d2da39f181a4905b2225dad"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("cc2fe42eafd9789be49b550a639a7a1b"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations_oneSample", spec);
}
@ -616,7 +616,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testFractionInformativeReads() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -G AS_Standard -o %s --no_cmdline_in_header -A FractionInformativeReads --disableDithering -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("794cfec86a8bee1f6955766b5a98b950"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("16dbeec9d640e61453e902918427343a"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations", spec);
}

View File

@ -196,6 +196,11 @@
<artifactId>commons-math</artifactId>
<version>2.2</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
<version>3.5</version>
</dependency>
<dependency>
<groupId>net.java.dev.jna</groupId>
<artifactId>jna</artifactId>

View File

@ -66,6 +66,10 @@
<groupId>org.apache.commons</groupId>
<artifactId>commons-math</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
</dependency>
<dependency>
<groupId>net.java.dev.jna</groupId>
<artifactId>jna</artifactId>

View File

@ -25,483 +25,534 @@
package org.broadinstitute.gatk.utils;
import cern.jet.math.Arithmetic;
import cern.jet.random.Normal;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.commons.math.MathException;
import org.apache.commons.math.distribution.NormalDistribution;
import org.apache.commons.math.distribution.NormalDistributionImpl;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import htsjdk.samtools.util.Histogram;
import org.apache.commons.math3.distribution.NormalDistribution;
import java.io.Serializable;
import java.util.Comparator;
import java.util.TreeSet;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Imported with changes from Picard private.
*
* @author Tim Fennell
*/
public class MannWhitneyU {
private static Normal STANDARD_NORMAL = new Normal(0.0,1.0,null);
private static NormalDistribution APACHE_NORMAL = new NormalDistributionImpl(0.0,1.0,1e-2);
private static double LNSQRT2PI = Math.log(Math.sqrt(2.0*Math.PI));
private static final class Rank implements Comparable<Rank> {
final double value;
float rank;
final int series;
private TreeSet<Pair<Number,USet>> observations;
private int sizeSet1;
private int sizeSet2;
private ExactMode exactMode;
public MannWhitneyU(ExactMode mode, boolean dither) {
if ( dither )
observations = new TreeSet<Pair<Number,USet>>(new DitheringComparator());
else
observations = new TreeSet<Pair<Number,USet>>(new NumberedPairComparator());
sizeSet1 = 0;
sizeSet2 = 0;
exactMode = mode;
}
public MannWhitneyU() {
this(ExactMode.POINT,true);
}
public MannWhitneyU(boolean dither) {
this(ExactMode.POINT,dither);
}
public MannWhitneyU(ExactMode mode) {
this(mode,true);
}
/**
* Add an observation into the observation tree
* @param n: the observation (a number)
* @param set: whether the observation comes from set 1 or set 2
*/
public void add(Number n, USet set) {
observations.add(new Pair<Number,USet>(n,set));
if ( set == USet.SET1 ) {
++sizeSet1;
} else {
++sizeSet2;
}
}
public Pair<Long,Long> getR1R2() {
long u1 = calculateOneSidedU(observations,MannWhitneyU.USet.SET1);
long n1 = sizeSet1*(sizeSet1+1)/2;
long r1 = u1 + n1;
long n2 = sizeSet2*(sizeSet2+1)/2;
long u2 = n1*n2-u1;
long r2 = u2 + n2;
return new Pair<Long,Long>(r1,r2);
}
/**
* Runs the one-sided test under the hypothesis that the data in set "lessThanOther" stochastically
* dominates the other set
* @param lessThanOther - either Set1 or Set2
* @return - u-based z-approximation, and p-value associated with the test (p-value is exact for small n,m)
*/
@Requires({"lessThanOther != null"})
@Ensures({"validateObservations(observations) || Double.isNaN(result.getFirst())","result != null", "! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"})
public Pair<Double,Double> runOneSidedTest(USet lessThanOther) {
long u = calculateOneSidedU(observations, lessThanOther);
int n = lessThanOther == USet.SET1 ? sizeSet1 : sizeSet2;
int m = lessThanOther == USet.SET1 ? sizeSet2 : sizeSet1;
if ( n == 0 || m == 0 ) {
// test is uninformative as one or both sets have no observations
return new Pair<Double,Double>(Double.NaN,Double.NaN);
private Rank(double value, float rank, int series) {
this.value = value;
this.rank = rank;
this.series = series;
}
// the null hypothesis is that {N} is stochastically less than {M}, so U has counted
// occurrences of {M}s before {N}s. We would expect that this should be less than (n*m+1)/2 under
// the null hypothesis, so we want to integrate from K=0 to K=U for cumulative cases. Always.
return calculateP(n, m, u, false, exactMode);
}
/**
 * Runs the standard two-sided Mann-Whitney U test.
 * @return a pair of (u-based z-approximation, p-value); (NaN, NaN) when either set is empty.
 */
@Ensures({"result != null", "! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"})
//@Requires({"validateObservations(observations)"})
public Pair<Double,Double> runTwoSidedTest() {
    final Pair<Long,USet> uAndDominated = calculateTwoSidedU(observations);
    final long u = uAndDominated.first;
    final boolean set1IsSmaller = uAndDominated.second == USet.SET1;
    final int n = set1IsSmaller ? sizeSet1 : sizeSet2;
    final int m = set1IsSmaller ? sizeSet2 : sizeSet1;
    if (n == 0 || m == 0) {
        // one or both sets have no observations, so the test is uninformative
        return new Pair<Double,Double>(Double.NaN, Double.NaN);
    }
    return calculateP(n, m, u, true, exactMode);
}
/**
* Given a u statistic, calculate the p-value associated with it, dispatching to approximations where appropriate
* @param n - The number of entries in the stochastically smaller (dominant) set
* @param m - The number of entries in the stochastically larger (dominated) set
* @param u - the Mann-Whitney U value
* @param twoSided - is the test twosided
* @return the (possibly approximate) p-value associated with the MWU test, and the (possibly approximate) z-value associated with it
* todo -- there must be an approximation for small m and large n
*/
@Requires({"m > 0","n > 0"})
@Ensures({"result != null", "! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"})
protected static Pair<Double,Double> calculateP(int n, int m, long u, boolean twoSided, ExactMode exactMode) {
Pair<Double,Double> zandP;
if ( n > 8 && m > 8 ) {
// large m and n - normal approx
zandP = calculatePNormalApproximation(n,m,u, twoSided);
} else if ( n > 5 && m > 7 ) {
// large m, small n - sum uniform approx
// todo -- find the appropriate regimes where this approximation is actually better enough to merit slowness
// pval = calculatePUniformApproximation(n,m,u);
zandP = calculatePNormalApproximation(n, m, u, twoSided);
} else if ( n > 8 || m > 8 ) {
zandP = calculatePFromTable(n, m, u, twoSided);
} else {
// small m and n - full approx
zandP = calculatePRecursively(n,m,u,twoSided,exactMode);
/**
 * Orders ranks by their underlying data value.
 * BUG FIX: the previous implementation returned {@code (int)(this.value - that.value)},
 * which truncates any fractional difference to 0 (e.g. 1.2 vs 1.7 compared as equal)
 * and can overflow for large magnitudes — both violate the Comparable contract.
 * Double.compare is exact and overflow-free.
 */
@Override
public int compareTo(Rank that) {
    return Double.compare(this.value, that.value);
}
return zandP;
}
// Placeholder for a table-based p-value lookup (n large / m small); currently
// simply delegates to the normal approximation.
public static Pair<Double,Double> calculatePFromTable(int n, int m, long u, boolean twoSided) {
    // todo -- actually use a table for:
    // todo - n large, m small
    return calculatePNormalApproximation(n,m,u, twoSided);
}
/**
* Uses a normal approximation to the U statistic in order to return a cdf p-value. See Mann, Whitney [1947]
* @param n - The number of entries in the stochastically smaller (dominant) set
* @param m - The number of entries in the stochastically larger (dominated) set
* @param u - the Mann-Whitney U value
* @param twoSided - whether the test should be two sided
* @return p-value associated with the normal approximation
*/
@Requires({"m > 0","n > 0"})
@Ensures({"result != null", "! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"})
public static Pair<Double,Double> calculatePNormalApproximation(int n,int m,long u, boolean twoSided) {
double z = getZApprox(n,m,u);
if ( twoSided ) {
return new Pair<Double,Double>(z,2.0*(z < 0 ? STANDARD_NORMAL.cdf(z) : 1.0-STANDARD_NORMAL.cdf(z)));
} else {
return new Pair<Double,Double>(z,STANDARD_NORMAL.cdf(z));
/** Human-readable rendering of this rank entry (value, rank, series). */
@Override
public String toString() {
    final StringBuilder text = new StringBuilder("Rank{");
    text.append("value=").append(value);
    text.append(", rank=").append(rank);
    text.append(", series=").append(series);
    text.append('}');
    return text.toString();
}
}
/**
* Calculates the Z-score approximation of the u-statistic
* @param n - The number of entries in the stochastically smaller (dominant) set
* @param m - The number of entries in the stochastically larger (dominated) set
* @param u - the Mann-Whitney U value
* @return the asymptotic z-approximation corresponding to the MWU p-value for n < m
* The results of performing a rank sum test.
*/
@Requires({"m > 0","n > 0"})
@Ensures({"! Double.isNaN(result)", "! Double.isInfinite(result)"})
private static double getZApprox(int n, int m, long u) {
double mean = ( ((long)m)*n+1.0)/2;
double var = (((long) n)*m*(n+m+1.0))/12;
double z = ( u - mean )/Math.sqrt(var);
return z;
/**
 * Immutable holder for the results of a rank sum test: the U statistic, its
 * z-score, the p-value, and the absolute difference between the group medians.
 */
public static class Result {
    // the rank-sum statistic U
    private final double u;
    // standard deviations of U from its mean (normal approximation)
    private final double z;
    // p-value of the test (one- or two-sided depending on how it was run)
    private final double p;
    // absolute difference between the medians of the two groups
    private final double medianShift;
    public Result(double u, double z, double p, double medianShift) {
        this.u = u;
        this.z = z;
        this.p = p;
        this.medianShift = medianShift;
    }
    /** @return the rank-sum statistic U */
    public double getU() {
        return u;
    }
    /** @return the z-score of U */
    public double getZ() {
        return z;
    }
    /** @return the p-value of the test */
    public double getP() {
        return p;
    }
    /** @return the absolute difference between the group medians */
    public double getMedianShift() {
        return medianShift;
    }
}
/**
* Uses a sum-of-uniform-0-1 random variable approximation to the U statistic in order to return an approximate
* p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens, MA [1966, biometrika] (sum of uniform CDF)
* @param n - The number of entries in the stochastically smaller (dominant) set
* @param m - The number of entries in the stochastically larger (dominated) set
* @param u - mann-whitney u value
* @return p-value according to sum of uniform approx
* todo -- this is currently not called due to not having a good characterization of where it is significantly more accurate than the
* todo -- normal approxmation (e.g. enough to merit the runtime hit)
* The values of U1, U2 and the transformed number of ties needed for the calculation of sigma
* in the normal approximation.
*/
public static double calculatePUniformApproximation(int n, int m, long u) {
long R = u + (n*(n+1))/2;
double a = Math.sqrt(m*(n+m+1));
double b = (n/2.0)*(1-Math.sqrt((n+m+1)/m));
double z = b + ((double)R)/a;
if ( z < 0 ) { return 1.0; }
else if ( z > n ) { return 0.0; }
else {
if ( z > ((double) n) /2 ) {
return 1.0-1/(Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0);
} else {
return 1/(Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0);
/**
 * Immutable holder for rank-sum test statistics: either the pair (U1, U2) or a
 * single "true" U, plus the transformed tie count needed for the sigma formula
 * in the normal approximation. NaN marks whichever fields were not supplied.
 */
public static class TestStatistic {
    private final double u1;
    private final double u2;
    private final double trueU;
    // sum over tie groups of (t^3 - t); used in the sigma calculation
    private final double numOfTiesTransformed;
    /** Constructor for the case where both U1 and U2 are known; trueU is left NaN. */
    public TestStatistic(double u1, double u2, double numOfTiesTransformed) {
        this.u1 = u1;
        this.u2 = u2;
        this.numOfTiesTransformed = numOfTiesTransformed;
        this.trueU = Double.NaN;
    }
    /** Constructor for the case where only the chosen U is known; u1/u2 are left NaN. */
    public TestStatistic(double trueU, double numOfTiesTransformed) {
        this.trueU = trueU;
        this.numOfTiesTransformed = numOfTiesTransformed;
        this.u1 = Double.NaN;
        this.u2 = Double.NaN;
    }
    public double getU1() {
        return u1;
    }
    public double getU2() {
        return u2;
    }
    /** @return the transformed tie count, sum of (t^3 - t) over tie groups */
    public double getTies() {
        return numOfTiesTransformed;
    }
    public double getTrueU() {
        return trueU;
    }
}
/**
* The ranked data in one list and a list of the number of ties.
*/
/**
 * The merged, tie-adjusted ranking of both data series in one array, together
 * with the size of each group of tied values.
 * IMPROVEMENT: the tie list was a raw {@code ArrayList}; it is now parameterized
 * as {@code ArrayList<Integer>}, matching how callers already use it and
 * removing unchecked-assignment warnings.
 */
public static class RankedData {
    private final Rank[] rank;
    private final ArrayList<Integer> numOfTies;
    public RankedData(Rank[] rank, ArrayList<Integer> numOfTies) {
        this.rank = rank;
        this.numOfTies = numOfTies;
    }
    /** @return the merged ranks of both series, in ascending value order */
    public Rank[] getRank() {
        return rank;
    }
    /** @return the sizes of the tie groups encountered while ranking */
    public ArrayList<Integer> getNumOfTies() {
        return numOfTies;
    }
}
/**
* Key for the map from Integer[] to set of all permutations of that array.
*/
/**
 * Map key wrapping the Integer[] of group labels whose permutation set is cached.
 * equals and hashCode are both based on the array *contents*, so equal label
 * arrays hit the same cache entry.
 */
private static class Key {
    final Integer[] listToPermute;
    private Key(Integer[] listToPermute) {
        this.listToPermute = listToPermute;
    }
    @Override
    public boolean equals(Object o) {
        if (o == null || getClass() != o.getClass()) return false;
        Key that = (Key) o;
        return (Arrays.deepEquals(this.listToPermute, that.listToPermute));
    }
    @Override
    public int hashCode() {
        // BUG FIX: the previous loop used each *element* as an index into the
        // array (listToPermute[i] with i being the element value), which throws
        // ArrayIndexOutOfBoundsException for element values >= length and makes
        // the hash independent of element order, causing needless collisions.
        // Arrays.deepHashCode is content-based and consistent with deepEquals above.
        return Arrays.deepHashCode(listToPermute);
    }
}
// Constructs a normal distribution; actual values of mean and SD don't matter since it's
// just used to convert a z-score into a cumulative probability
// (z is mapped through x = NORMAL_MEAN + z * NORMAL_SD before querying the CDF).
private static final double NORMAL_MEAN = 100;
private static final double NORMAL_SD = 15;
private static final NormalDistribution NORMAL = new NormalDistribution(NORMAL_MEAN, NORMAL_SD);
/**
* Helper function for the sum of n uniform random variables
* @param z - value at which to compute the (un-normalized) cdf
* @param m - a cutoff integer (defined by m <= z < m + 1)
* @param n - the number of uniform random variables
* @param k - holder variable for the recursion (alternatively, the index of the term in the sequence)
* @return the (un-normalized) cdf for the sum of n random variables
* A map of an Integer[] of the labels to the set of all possible permutations of those labels.
*/
private static double uniformSumHelper(double z, int m, int n, int k) {
if ( k > m ) { return 0; }
int coef = (k % 2 == 0) ? 1 : -1;
return coef*Arithmetic.binomial(n,k)*Math.pow(z-k,n) + uniformSumHelper(z,m,n,k+1);
// Cache mapping a label array (e.g. [0,0,1,1,1]) to the set of all of its
// permutations; shared across instances, hence the concurrent map.
private static Map<Key, Set<List<Integer>>> PERMUTATIONS = new ConcurrentHashMap<Key, Set<List<Integer>>>();
/**
 * The minimum length for both data series in order to use a normal distribution
 * to calculate Z and p. If both series are shorter than this value then a permutation test
 * will be used.
 */
private int minimumNormalN = 10;
/**
 * If the exact p-value is 1 then the z-score is infinite, so we need a max cutoff instead.
 */
private double maxZScore = 9.0;
/**
 * Sets the minimum number of values in each data series to use the normal distribution approximation.
 * A series of at least this length switches the test to the normal approximation.
 */
public void setMinimumSeriesLengthForNormalApproximation(final int n) {
    this.minimumNormalN = n;
}
/**
* Calculates the U-statistic associated with a two-sided test (e.g. the RV from which one set is drawn
* stochastically dominates the RV from which the other set is drawn); two-sidedness is accounted for
* later on simply by multiplying the p-value by 2.
*
* Recall: If X stochastically dominates Y, the test is for occurrences of Y before X, so the lower value of u is chosen
* @param observed - the observed data
* @return the minimum of the U counts (set1 dominates 2, set 2 dominates 1)
* A variable that indicates if the test is one sided or two sided and if it's one sided
* which group is the dominator in the null hypothesis.
*/
@Requires({"observed != null", "observed.size() > 0"})
@Ensures({"result != null","result.first > 0"})
public static Pair<Long,USet> calculateTwoSidedU(TreeSet<Pair<Number,USet>> observed) {
int set1SeenSoFar = 0;
int set2SeenSoFar = 0;
long uSet1DomSet2 = 0;
long uSet2DomSet1 = 0;
USet previous = null;
for ( Pair<Number,USet> dataPoint : observed ) {
/**
 * Indicates whether the test is one- or two-sided and, if one-sided, which
 * group is the dominator in the null hypothesis.
 */
public enum TestType {
    FIRST_DOMINATES,   // one-sided: series 1 is the hypothesized dominator
    SECOND_DOMINATES,  // one-sided: series 2 is the hypothesized dominator
    TWO_SIDED          // two-sided test
}
if ( dataPoint.second == USet.SET1 ) {
++set1SeenSoFar;
} else {
++set2SeenSoFar;
}
public RankedData calculateRank(final double[] series1, final double[] series2) {
Arrays.sort(series1);
Arrays.sort(series2);
if ( previous != null ) {
if ( dataPoint.second == USet.SET1 ) {
uSet2DomSet1 += set2SeenSoFar;
// Make a merged ranks array
final Rank[] ranks = new Rank[series1.length + series2.length];
{
int i = 0, j = 0, r = 0;
while (r < ranks.length) {
if (i >= series1.length) {
ranks[r++] = new Rank(series2[j++], r, 2);
} else if (j >= series2.length) {
ranks[r++] = new Rank(series1[i++], r, 1);
} else if (series1[i] <= series2[j]) {
ranks[r++] = new Rank(series1[i++], r, 1);
} else {
uSet1DomSet2 += set1SeenSoFar;
ranks[r++] = new Rank(series2[j++], r, 2);
}
}
previous = dataPoint.second;
}
return uSet1DomSet2 < uSet2DomSet1 ? new Pair<Long,USet>(uSet1DomSet2,USet.SET1) : new Pair<Long,USet>(uSet2DomSet1,USet.SET2);
ArrayList<Integer> numOfTies = new ArrayList<>();
// Now sort out any tie bands
for (int i = 0; i < ranks.length; ) {
float rank = ranks[i].rank;
int count = 1;
for (int j = i + 1; j < ranks.length && ranks[j].value == ranks[i].value; ++j) {
rank += ranks[j].rank;
++count;
}
if (count > 1) {
rank /= count;
for (int j = i; j < i + count; ++j) {
ranks[j].rank = rank;
}
numOfTies.add(count);
}
// Skip forward the right number of items
i += count;
}
return new RankedData(ranks, numOfTies);
}
/**
* Calculates the U-statistic associated with the one-sided hypothesis that "dominator" stochastically dominates
* the other U-set. Note that if S1 dominates S2, we want to count the occurrences of points in S2 coming before points in S1.
* @param observed - the observed data points, tagged by each set
* @param dominator - the set that is hypothesized to be stochastically dominating
* @return the u-statistic associated with the hypothesis that dominator stochastically dominates the other set
* Rank both groups together and return a TestStatistic object that includes U1, U2 and number of ties for sigma
*/
@Requires({"observed != null","dominator != null","observed.size() > 0"})
@Ensures({"result >= 0"})
public static long calculateOneSidedU(TreeSet<Pair<Number,USet>> observed,USet dominator) {
long otherBeforeDominator = 0l;
int otherSeenSoFar = 0;
for ( Pair<Number,USet> dataPoint : observed ) {
if ( dataPoint.second != dominator ) {
++otherSeenSoFar;
} else {
otherBeforeDominator += otherSeenSoFar;
/**
 * Ranks both groups together and computes U1, U2 and the transformed tie count
 * (sum of t^3 - t over tie groups) needed for the sigma formula of the normal
 * approximation.
 * BUG FIX: removed a stray "return otherBeforeDominator;" statement left over
 * from the deleted TreeSet-based implementation that made this body invalid.
 * @param series1 data from group 1 (sorted in place by calculateRank)
 * @param series2 data from group 2 (sorted in place by calculateRank)
 * @return a TestStatistic carrying U1, U2 and the tie term
 */
public TestStatistic calculateU1andU2(final double[] series1, final double[] series2) {
    RankedData ranked = calculateRank(series1, series2);
    Rank[] ranks = ranked.getRank();
    ArrayList<Integer> numOfTies = ranked.getNumOfTies();
    //Calculate number of ties transformed for formula for Sigma to calculate Z-score
    ArrayList<Integer> transformedTies = new ArrayList<>();
    for (int count : numOfTies) {
        //If every single datapoint is tied then we want to return a p-value of .5 and
        //the formula for sigma that includes the number of ties breaks down. Setting
        //the number of ties to 0 in this case gets the desired result.
        if (count != ranks.length) {
            transformedTies.add((count * count * count) - count);
        }
    }
    double numOfTiesForSigma = 0.0;
    for (int count : transformedTies) {
        numOfTiesForSigma += count;
    }
    // Calculate the rank sums R1 and R2, and from them U1 and U2.
    float r1 = 0, r2 = 0;
    for (Rank rank : ranks) {
        if (rank.series == 1) r1 += rank.rank;
        else r2 += rank.rank;
    }
    double n1 = series1.length;
    double n2 = series2.length;
    double u1 = r1 - ((n1 * (n1 + 1)) / 2);
    double u2 = r2 - ((n2 * (n2 + 1)) / 2);
    TestStatistic result = new TestStatistic(u1, u2, numOfTiesForSigma);
    return result;
}
/**
* The Mann-Whitney U statistic follows a recursive equation (that enumerates the proportion of possible
* binary strings of "n" zeros, and "m" ones, where a one precedes a zero "u" times). This accessor
* calls into that recursive calculation.
* @param n: number of set-one entries (hypothesis: set one is stochastically less than set two)
* @param m: number of set-two entries
* @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 )
* @param twoSided: whether the test is two sided or not. The recursive formula is symmetric, multiply by two for two-sidedness.
* @param mode: whether the mode is a point probability, or a cumulative distribution
* @return the probability under the hypothesis that all sequences are equally likely of finding a set-two entry preceding a set-one entry "u" times.
* Calculates the rank-sum test statisic U (sometimes W) from two sets of input data for a one-sided test
* with an int indicating which group is the dominator. Returns a test statistic object with trueU and number of
* ties for sigma.
*/
@Requires({"m > 0","n > 0","u >= 0"})
@Ensures({"result != null","! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"})
public static Pair<Double,Double> calculatePRecursively(int n, int m, long u, boolean twoSided, ExactMode mode) {
if ( m > 8 && n > 5 ) { throw new GATKException(String.format("Please use the appropriate (normal or sum of uniform) approximation. Values n: %d, m: %d",n,m)); }
double p = mode == ExactMode.POINT ? cpr(n,m,u) : cumulativeCPR(n,m,u);
//p *= twoSided ? 2.0 : 1.0;
/**
 * Computes the one-sided rank-sum statistic: U1 when series 1 is the hypothesized
 * dominator, U2 otherwise. The tie term is carried through for the sigma formula.
 */
public TestStatistic calculateOneSidedU(final double[] series1, final double[] series2, final TestType whichSeriesDominates) {
    final TestStatistic both = calculateU1andU2(series1, series2);
    final double chosenU = (whichSeriesDominates == TestType.FIRST_DOMINATES) ? both.getU1() : both.getU2();
    return new TestStatistic(chosenU, both.getTies());
}
/**
 * Computes the two-sided rank-sum statistic U as the smaller of U1 and U2,
 * carrying the tie term through for the sigma formula.
 */
public TestStatistic calculateTwoSidedU(final double[] series1, final double[] series2) {
    final TestStatistic both = calculateU1andU2(series1, series2);
    final double smallerU = Math.min(both.getU1(), both.getU2());
    return new TestStatistic(smallerU, both.getTies());
}
/**
 * Converts the rank-sum statistic into a z-score (standard deviations from its
 * mean) under the normal approximation, applying a continuity correction and
 * a tie adjustment to sigma.
 * @param u the test statistic
 * @param n1 length of series 1
 * @param n2 length of series 2
 * @param nties transformed tie count, sum of (t^3 - t); 0 encodes "all data tied"
 * @param whichSide one- or two-sided test indicator
 * @return the z-score of u
 */
public double calculateZ(final double u, final int n1, final int n2, final double nties, final TestType whichSide) {
    final double mean = (n1 * n2) / 2d;
    // Continuity correction. When every datapoint is tied (nties == 0) the
    // correction is dropped so that the test yields p = .5.
    final double correction;
    if (nties == 0) {
        correction = 0;
    } else if (whichSide == TestType.TWO_SIDED) {
        correction = (u - mean) >= 0 ? .5 : -.5;
    } else {
        correction = (whichSide == TestType.FIRST_DOMINATES) ? -.5 : .5;
    }
    final double sigma = Math.sqrt((n1 * n2 / 12d) * ((n1 + n2 + 1) - nties / ((n1 + n2) * (n1 + n2 - 1))));
    return (u - mean - correction) / sigma;
}
/**
 * Returns the median of an array already sorted in ascending order: the middle
 * element for odd lengths, the mean of the two middle elements for even lengths.
 * Assumes data.length > 0 (callers guard the empty case).
 */
public double median(final double[] data) {
    final int mid = data.length / 2;
    return (data.length % 2 == 0)
            ? (data[mid] + data[mid - 1]) / 2d
            : data[mid];
}
/**
* Constructs a new rank sum test with the given data.
*
* @param series1 group 1 data
* @param series2 group 2 data
* @param whichSide indicator of two sided test, 0 for two sided, 1 for series1 as dominator, 2 for series2 as dominator
* @return Result including U statistic, Z score, p-value, and difference in medians.
*/
public Result test(final double[] series1, final double[] series2, final TestType whichSide) {
final int n1 = series1.length;
final int n2 = series2.length;
//If one of the groups is empty we return NaN
if (n1 == 0 || n2 == 0) {
return new Result(Float.NaN, Float.NaN, Float.NaN, Float.NaN);
}
double u;
double nties;
if (whichSide == TestType.TWO_SIDED) {
TestStatistic result = calculateTwoSidedU(series1, series2);
u = result.getTrueU();
nties = result.getTies();
} else {
TestStatistic result = calculateOneSidedU(series1, series2, whichSide);
u = result.getTrueU();
nties = result.getTies();
}
double z;
try {
double p;
if ( mode == ExactMode.CUMULATIVE ) {
z = APACHE_NORMAL.inverseCumulativeProbability(p);
if (n1 >= this.minimumNormalN || n2 >= this.minimumNormalN) {
z = calculateZ(u, n1, n2, nties, whichSide);
p = 2 * NORMAL.cumulativeProbability(NORMAL_MEAN + z * NORMAL_SD);
if (whichSide != TestType.TWO_SIDED) {
p = p / 2;
}
} else {
// TODO -- This exact test is only implemented for the one sided test, but we currently don't call the two sided version
Histogram<Double> distribution = permutationTest(series1, series2);
p = distribution.getCumulativeProbability(u);
if (p == 1) {
z = maxZScore;
} else if (p == 0) {
z = -1.0 * maxZScore;
} else {
double sd = Math.sqrt((1.0+1.0/(1+n+m))*(n*m)*(1.0+n+m)/12); // biased variance empirically better fit to distribution then asymptotic variance
//System.out.printf("SD is %f and Max is %f and prob is %f%n",sd,1.0/Math.sqrt(sd*sd*2.0*Math.PI),p);
if ( p > 1.0/Math.sqrt(sd*sd*2.0*Math.PI) ) { // possible for p-value to be outside the range of the normal. Happens at the mean, so z is 0.
z = 0.0;
} else {
if ( u >= n*m/2 ) {
z = Math.sqrt(-2.0*(Math.log(sd)+Math.log(p)+LNSQRT2PI));
} else {
z = -Math.sqrt(-2.0*(Math.log(sd)+Math.log(p)+LNSQRT2PI));
}
z = NORMAL.inverseCumulativeProbability(p);
}
}
return new Result(u, z, p, Math.abs(median(series1) - median(series2)));
}
/** Exchanges the elements at positions i and j of arr, in place. */
private void swap(Integer[] arr, int i, int j) {
    final int held = arr[i];
    arr[i] = arr[j];
    arr[j] = held;
}
/**
* Uses a method that generates permutations in lexicographic order. (https://en.wikipedia.org/wiki/Permutation#Generation_in_lexicographic_order)
* @param temp Sorted list of elements to be permuted.
* @param allPermutations Empty set that will hold all possible permutations.
*/
private void calculatePermutations(Integer[] temp, Set<List<Integer>> allPermutations) {
allPermutations.add(new ArrayList<>(Arrays.asList(temp)));
while (true) {
int k = -1;
for (int i = temp.length - 2; i >= 0; i--) {
if (temp[i] < temp[i + 1]) {
k = i;
break;
}
}
} catch (MathException me) {
throw new GATKException("A math exception occurred in inverting the probability",me);
if (k == -1) {
break;
}
int l = -1;
for (int i = temp.length - 1; i >= k + 1; i--) {
if (temp[k] < temp[i]) {
l = i;
break;
}
}
swap(temp, k, l);
int end = temp.length - 1;
for (int begin = k + 1; begin < end; begin++) {
swap(temp, begin, end);
end--;
}
allPermutations.add(new ArrayList<>(Arrays.asList(temp)));
}
return new Pair<Double,Double>(z,(twoSided ? 2.0*p : p));
}
/**
* Hook into CPR with sufficient warning (for testing purposes)
* calls into that recursive calculation.
* @param n: number of set-one entries (hypothesis: set one is stochastically less than set two)
* @param m: number of set-two entries
* @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 )
* @return same as cpr
* Checks to see if the permutations have already been computed before creating them from scratch.
* @param listToPermute List of tags in numerical order to be permuted
* @param numOfPermutations The number of permutations this list will have (n1+n2 choose n1)
* @return Set of all possible permutations for the given list.
*/
protected static double calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(int n, int m, long u) {
return cpr(n,m,u);
/**
 * Returns the cached set of permutations for the given label list, computing
 * and caching it on first use.
 * @param listToPermute labels in sorted order to be permuted (mutated in place on a cache miss)
 * @param numOfPermutations expected number of permutations (n1+n2 choose n1), used to size the set
 * @return the set of all possible permutations of the given list
 */
Set<List<Integer>> getPermutations(final Integer[] listToPermute, int numOfPermutations) {
    // BUG FIX: snapshot the array for the cache key. calculatePermutations
    // permutes listToPermute in place and leaves it at its final (descending)
    // permutation, so a Key aliasing the live array would never deepEquals a
    // later lookup's freshly sorted array — every call would miss the cache.
    Key key = new Key(listToPermute.clone());
    Set<List<Integer>> permutations = PERMUTATIONS.get(key);
    if (permutations == null) {
        permutations = new HashSet<>(numOfPermutations);
        calculatePermutations(listToPermute, permutations);
        PERMUTATIONS.put(key, permutations);
    }
    return permutations;
}
/**
* For testing
* Creates histogram of test statistics from a permutation test.
*
* @param n: number of set-one entries (hypothesis: set one is stochastically less than set two)
* @param m: number of set-two entries
* @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 )
* @param series1 Data from group 1
* @param series2 Data from group 2
* @return Histogram with u calculated for every possible permutation of group tag.
*/
protected static long countSequences(int n, int m, long u) {
if ( u < 0 ) { return 0; }
if ( m == 0 || n == 0 ) { return u == 0 ? 1 : 0; }
public Histogram permutationTest(final double[] series1, final double[] series2) {
final Histogram<Double> histo = new Histogram<Double>();
final int n1 = series1.length;
final int n2 = series2.length;
return countSequences(n-1,m,u-m) + countSequences(n,m-1,u);
}
RankedData rankedGroups = calculateRank(series1, series2);
Rank[] ranks = rankedGroups.getRank();
/**
 * Shorter name for calculatePRecursively. See Mann, Whitney [1947].
 * Point probability that a set-2 entry precedes a set-1 entry exactly u times
 * under the null that all interleavings are equally likely.
 * BUG FIX: removed a stray "Integer[] firstPermutation = ..." statement from
 * the new permutation-test code that had been interleaved into this body.
 * @param n: number of set-1 entries
 * @param m: number of set-2 entries
 * @param u: number of times a set-2 entry has preceded a set-1 entry
 * @return recursive p-value
 */
private static double cpr(int n, int m, long u) {
    if ( u < 0 ) {
        return 0.0;
    }
    if ( m == 0 || n == 0 ) {
        // no entries remain in set 1 or set 2, so no set-2 entry can precede a set-1 entry; u must be zero.
        // note that this exists only for edification: when we reach this point the coefficient on this term is zero anyway
        return ( u == 0 ) ? 1.0 : 0.0;
    }
    return (((double)n)/(n+m))*cpr(n-1,m,u-m) + (((double)m)/(n+m))*cpr(n,m-1,u);
}
/**
 * Cumulative version of cpr: the null hypothesis is that {N} is stochastically
 * less than {M}, so we integrate the point probabilities from K=0 to K=U.
 * Uses the symmetry of the distribution to sum the shorter tail.
 */
private static double cumulativeCPR(int n, int m, long u ) {
    final boolean leftTail = u <= n*m/2;
    // sum the smaller tail, then reflect if needed (1 - right tail = left tail)
    final long limit = leftTail ? u : ((long)n)*m - u;
    double tail = 0.0;
    for ( long k = 0; k < limit; k++ ) {
        tail += cpr(n, m, k);
    }
    return leftTail ? tail : 1.0 - tail;
}
/**
* hook into the data tree, for testing purposes only
* @return observations
*/
// Exposes the internal observation tree for tests; note this is the live
// collection, not a defensive copy.
protected TreeSet<Pair<Number,USet>> getObservations() {
    return observations;
}
/**
* hook into the set sizes, for testing purposes only
* @return size set 1, size set 2
*/
// Pairs up the running counts of set-1 and set-2 observations, for tests.
protected Pair<Integer,Integer> getSetSizes() {
    return new Pair<Integer,Integer>(sizeSet1,sizeSet2);
}
/**
* Validates that observations are in the correct format for a MWU test -- this is only called by the contracts API during testing
* @param tree - the collection of labeled observations
* @return true iff the tree set is valid (no INFs or NaNs, at least one data point in each set)
*/
protected static boolean validateObservations(TreeSet<Pair<Number,USet>> tree) {
boolean seen1 = false;
boolean seen2 = false;
boolean seenInvalid = false;
for ( Pair<Number,USet> p : tree) {
if ( ! seen1 && p.getSecond() == USet.SET1 ) {
seen1 = true;
for (int i = 0; i < firstPermutation.length; i++) {
if (i < n1) {
firstPermutation[i] = 0;
} else {
firstPermutation[i] = 1;
}
}
if ( ! seen2 && p.getSecond() == USet.SET2 ) {
seen2 = true;
}
if ( Double.isNaN(p.getFirst().doubleValue()) || Double.isInfinite(p.getFirst().doubleValue())) {
seenInvalid = true;
final int numOfPerms = (int) MathUtils.binomialCoefficient(n1 + n2, n2);
Set<List<Integer>> allPermutations = getPermutations(firstPermutation, numOfPerms);
double[] newSeries1 = new double[n1];
double[] newSeries2 = new double[n2];
//iterate over all permutations
for (List<Integer> currPerm : allPermutations) {
int series1End = 0;
int series2End = 0;
for (int i = 0; i < currPerm.size(); i++) {
int grouping = currPerm.get(i);
if (grouping == 0) {
newSeries1[series1End] = ranks[i].rank;
series1End++;
} else {
newSeries2[series2End] = ranks[i].rank;
series2End++;
}
}
assert (series1End == n1);
assert (series2End == n2);
double newU = MathUtils.sum(newSeries1) - ((n1 * (n1 + 1)) / 2.0);
histo.increment(newU);
}
return ! seenInvalid && seen1 && seen2;
return histo;
}
/**
 * Comparator that breaks value ties at random ("dithering") so the backing
 * TreeSet never drops an observation and rank ties are ordered randomly.
 * Note: equals always returns false by design, so no two comparator instances
 * are ever considered interchangeable.
 */
private static class DitheringComparator implements Comparator<Pair<Number,USet>>, Serializable {
    public DitheringComparator() {}
    @Override
    public boolean equals(Object other) { return false; }
    @Override
    public int compare(Pair<Number,USet> left, Pair<Number,USet> right) {
        final int byValue = Double.compare(left.first.doubleValue(), right.first.doubleValue());
        if (byValue != 0) {
            return byValue > 0 ? 1 : -1;
        }
        // equal values: break the tie at random
        return Utils.getRandomGenerator().nextBoolean() ? -1 : 1;
    }
}
/**
 * Comparator ordering pairs by their numeric value alone, with no tie-breaking.
 */
private static class NumberedPairComparator implements Comparator<Pair<Number,USet>>, Serializable {
    public NumberedPairComparator() {}
    @Override
    public boolean equals(Object other) { return false; }
    @Override
    public int compare(Pair<Number,USet> left, Pair<Number,USet> right ) {
        final double lhs = left.first.doubleValue();
        final double rhs = right.first.doubleValue();
        return Double.compare(lhs, rhs);
    }
}
// Labels tagging which of the two observation sets a data point belongs to.
public enum USet { SET1, SET2 }
// Exact-test mode: point probability versus cumulative distribution.
public enum ExactMode { POINT, CUMULATIVE }
}

View File

@ -25,13 +25,13 @@
package org.broadinstitute.gatk.utils;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import org.testng.Assert;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: Ghost
@ -40,92 +40,75 @@ import org.testng.Assert;
* To change this template use File | Settings | File Templates.
*/
public class MWUnitTest extends BaseTest {
// Absolute tolerance for floating-point p-value comparisons in the tests below.
private static double DELTA_PRECISION = 0.00001;
@BeforeClass
public void init() { }
@Test
private void testMWU() {
logger.warn("Testing MWU");
MannWhitneyU mwu = new MannWhitneyU();
mwu.add(0, MannWhitneyU.USet.SET1);
mwu.add(1,MannWhitneyU.USet.SET2);
mwu.add(2,MannWhitneyU.USet.SET2);
mwu.add(3,MannWhitneyU.USet.SET2);
mwu.add(4,MannWhitneyU.USet.SET2);
mwu.add(5,MannWhitneyU.USet.SET2);
mwu.add(6,MannWhitneyU.USet.SET1);
mwu.add(7,MannWhitneyU.USet.SET1);
mwu.add(8,MannWhitneyU.USet.SET1);
mwu.add(9,MannWhitneyU.USet.SET1);
mwu.add(10,MannWhitneyU.USet.SET1);
mwu.add(11,MannWhitneyU.USet.SET2);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(), MannWhitneyU.USet.SET1),25L);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),11L);
private static final MannWhitneyU rst = new MannWhitneyU();
MannWhitneyU mwu2 = new MannWhitneyU();
MannWhitneyU mwuNoDither = new MannWhitneyU(false);
for ( int dp : new int[]{2,4,5,6,8} ) {
mwu2.add(dp,MannWhitneyU.USet.SET1);
mwuNoDither.add(dp,MannWhitneyU.USet.SET1);
}
/** Verifies the two-sided U statistic against the expected value from the data provider. */
@Test(dataProvider = "rankSumTestData")
public void testSimpleU(String name, double[] series1, double[] series2, double U) {
    final MannWhitneyU.Result result = rst.test(series1, series2, MannWhitneyU.TestType.TWO_SIDED);
    System.out.println("==========================================================");
    System.out.println("Series 1: " + Arrays.toString(series1));
    System.out.println("Series 2: " + Arrays.toString(series2));
    System.out.println(" U: " + result.getU());
    System.out.println(" Z: " + result.getZ());
    System.out.println("2-side p: " + result.getP());
    Assert.assertEquals(result.getU(), U, name);
}
for ( int dp : new int[]{1,3,7,9,10,11,12,13} ) {
mwu2.add(dp,MannWhitneyU.USet.SET2);
mwuNoDither.add(dp,MannWhitneyU.USet.SET2);
}
/** Verifies the one-sided (series 1 dominates) p-value against the data provider. */
@Test(dataProvider = "oneSidedPTestData")
public void testOnesidedP(String name, double[] series1, double[] series2, double P) {
    final MannWhitneyU.Result result = rst.test(series1, series2, MannWhitneyU.TestType.FIRST_DOMINATES);
    System.out.println("==========================================================");
    System.out.println("Series 1: " + Arrays.toString(series1));
    System.out.println("Series 2: " + Arrays.toString(series2));
    System.out.println(" U: " + result.getU());
    System.out.println(" Z: " + result.getZ());
    System.out.println("1-side p: " + result.getP());
    Assert.assertEquals(result.getP(), P, DELTA_PRECISION, name);
}
// NOTE(review): orphan statement run — these lines sit between two complete methods and
// reference locals (mwu2, mwuNoDither, logger) declared elsewhere; almost certainly
// residue from the removed dithering-based test that this commit deletes. Left untouched.
MannWhitneyU.ExactMode pm = MannWhitneyU.ExactMode.POINT;
MannWhitneyU.ExactMode cm = MannWhitneyU.ExactMode.CUMULATIVE;
// tests using the hypothesis that set 2 dominates set 1 (U value = 10)
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET1),10L);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET2),30L);
// dithered and non-dithered accumulators are expected to yield identical U statistics here
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET1),10L);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET2),30L);
Pair<Integer,Integer> sizes = mwu2.getSetSizes();
// compare the three p-value strategies (uniform / exact-recursive / normal approximation)
Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.first,sizes.second,10L),0.4180519701814064,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.first,sizes.second,10L,false,pm).second,0.021756021756021756,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.first,sizes.second,10L,false).second,0.06214143703127617,1e-14);
logger.warn("Testing two-sided");
// two-sided p is exactly twice the one-sided exact point probability
Assert.assertEquals((double)mwu2.runTwoSidedTest().second,2*0.021756021756021756,1e-8);
// tests using the hypothesis that set 1 dominates set 2 (U value = 30) -- empirical should be identical, normall approx close, uniform way off
Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(sizes.second,sizes.first,30L,true).second,2.0*0.08216463976903321,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(sizes.second,sizes.first,30L),0.0023473625009559074,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,30L,false,pm).second,0.021756021756021756,1e-14); // note -- exactly same value as above
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,cm).second,1.0-0.08547008547008,1e-14); // r does a correction, subtracting 1 from U
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,cm).second,0.08547008547008,1e-14); // r does a correction, subtracting 1 from U
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,cm).first,-1.36918910442,1e-2); // apache inversion set to be good only to 1e-2
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,cm).first,1.36918910442,1e-2); // apache inversion set to be good only to 1e-2
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,29L,false,pm).first,1.2558754796642067,1e-8); // PDF should be similar
Assert.assertEquals(MannWhitneyU.calculatePRecursively(sizes.second,sizes.first,11L,false,pm).first,-1.2558754796642067,1e-8); // PDF should be similar
Assert.assertEquals(MannWhitneyU.calculatePRecursively(4,5,10L,false,pm).second,0.0952381,1e-5);
Assert.assertEquals(MannWhitneyU.calculatePRecursively(4,5,10L,false,pm).first,0.0,1e-14);
logger.warn("Set 1");
// one-sided tests from either direction should agree on the exact p-value
Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET1).second,0.021756021756021756,1e-8);
logger.warn("Set 2");
Assert.assertEquals((double)mwu2.runOneSidedTest(MannWhitneyU.USet.SET2).second,0.021756021756021756,1e-8);
// second scenario: heavily imbalanced set sizes (3 vs 30) to stress the approximations
MannWhitneyU mwu3 = new MannWhitneyU();
for ( int dp : new int[]{0,2,4} ) {
mwu3.add(dp,MannWhitneyU.USet.SET1);
}
for ( int dp : new int[]{1,5,6,7,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34} ) {
mwu3.add(dp,MannWhitneyU.USet.SET2);
}
long u = MannWhitneyU.calculateOneSidedU(mwu3.getObservations(),MannWhitneyU.USet.SET1);
//logger.warn(String.format("U is: %d",u));
Pair<Integer,Integer> nums = mwu3.getSetSizes();
//logger.warn(String.format("Corrected p is: %.4e",MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first,nums.second,u)));
//logger.warn(String.format("Counted sequences: %d",MannWhitneyU.countSequences(nums.first, nums.second, u)));
//logger.warn(String.format("Possible sequences: %d", (long) Arithmetic.binomial(nums.first+nums.second,nums.first)));
//logger.warn(String.format("Ratio: %.4e",MannWhitneyU.countSequences(nums.first,nums.second,u)/Arithmetic.binomial(nums.first+nums.second,nums.first)));
Assert.assertEquals(MannWhitneyU.calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(nums.first, nums.second, u), 3.665689149560116E-4, 1e-14);
Assert.assertEquals(MannWhitneyU.calculatePNormalApproximation(nums.first,nums.second,u,false).second,0.0032240865760884696,1e-14);
Assert.assertEquals(MannWhitneyU.calculatePUniformApproximation(nums.first,nums.second,u),0.0026195003025784036,1e-14);
/**
 * Two-sided rank-sum cases: {case name, series1, series2, expected U statistic}.
 * Covers all-ties, single-difference, disjoint, overlapping, and unequal-length
 * series, including a zero-U case (test6) and a pure-ties case (test8).
 *
 * Fix: the two distinct cases previously both named "test7" are now "test7" and
 * "test7b" so a failure message identifies the exact case.
 */
@DataProvider(name="rankSumTestData")
public Object[][] dataProvider() {
    return new Object[][] {
            new Object[] {"test1", new double[] {20,20,20,20,20}, new double[] {20,20,20,20,21}, 10d},
            new Object[] {"test2", new double[] {20,20,20,20,21}, new double[] {20,20,20,20,21}, 12.5d},
            new Object[] {"test3", new double[] {13,14,15,15,16}, new double[] {16,20,20,21,21}, 0.5d},
            new Object[] {"test4", new double[] {13,14,15,15,16}, new double[] {16,20,20,21,21,21,21,22,23,27}, 0.5d},
            new Object[] {"test5", new double[] {13,14,15,15,16,18,20,22,25,24,25,26,27,28,22,23,19,30,28,22,17},
                    new double[] {16,20,20,21,21,21,21,22,23,27,26,28,29,32,31,22,21,19,16,24,29},
                    180.5d},
            new Object[] {"test6", new double[] {13,14,15,15,16,18,13,14,15,15,16,18,13,14,15,15,16,18,13,14,15,15,16,18},
                    new double[] {21,22,23,27,26,28,29,21,22,23,27,26,28,29,21,22,23,27,26,28,29,21,22,23,27,26,28,29},
                    0d},
            new Object[] {"test7", new double[] {11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20},
                    new double[] {12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21},
                    145d},
            // was a second "test7" — renamed so failures are attributable to one case
            new Object[] {"test7b", new double[] {11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20},
                    new double[] {12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21},
                    162d},
            new Object[] {"test8", new double[] {20,20,20,20,20}, new double[] {20,20,20,20,20,20,20,20,20,20}, 25d},
    };
}
/**
 * One-sided p-value cases: {case name, series1, series2, expected one-sided P}.
 * Expected p-values look hand-computed / cross-checked against an external
 * implementation (e.g. R's wilcox.test) — TODO confirm provenance.
 *
 * Fix: the two distinct cases previously both named "test8" are now "test8" and
 * "test9" so a failure message identifies the exact case.
 */
@DataProvider(name="oneSidedPTestData")
public Object[][] oneSidedDataProvider() {
    return new Object[][] {
            new Object[] {"test0", new double[] {0,0}, new double[] {1,1}, .16666666},
            new Object[] {"test1", new double[] {20,20,20,20,20}, new double[] {20,20,20,20,21}, .5},
            new Object[] {"test2", new double[] {20,20,20,20,21}, new double[] {20,20,20,20,21}, .77778},
            new Object[] {"test3", new double[] {13,14,15,15,16}, new double[] {16,20,20,21,21}, 0.0079365},
            new Object[] {"test4", new double[] {13,14,15,15,16}, new double[] {16,20,20,21,21,21,21,22,23,27}, 0.001469192},
            new Object[] {"test5", new double[] {20,20,20,20,20}, new double[] {20,20,20,20,20,20,20,20,20,20}, .5},
            new Object[] {"test6", new double[] {1,2,3,4,5}, new double[] {6,7,8,9,10}, .00396},
            new Object[] {"test7", new double[] {6,7,8,9,10}, new double[] {1,2,3,4,5}, 1},
            new Object[] {"test8", new double[] {16,20,20,21,21,21,21,22,23,27,16,20,20,21,21,21,21,22,23,27},
                    new double[] {22,23,27,16,20,20,21,21,21,21,22,23,27,60,60}, .08303102},
            // was a second "test8" — renamed so failures are attributable to one case
            new Object[] {"test9", new double[] {16,20,20,21,21,21,21,21,20},
                    new double[] {22,23,27,16,20,20,20,20,21}, .395763},
    };
}
}