From d25579deeb24e5b0b24bfab10472b1cb67d81ad7 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 20 Sep 2012 12:48:13 -0400 Subject: [PATCH] A couple of minor things. 1) Better documentation on the meta data file for VariantsToBinaryPed with examples of each file type 2) MannWhitneyU can now take an argument on creation to turn off dithering. This pertains to JIRA-GSA-571 but does not fix it, as it isn't hooked up to the command line. Next step is to add an argument to the command line where it's accessible to the annotation classes (e.g. from either UG or the VariantAnnotator). 3) Added some dumb python scripts to deal with Plink files, and a script to convert plink binaries to VCF to help sanity check. Basically if you want to do an analysis on genotype data stored in plink binary format, your choices are: 1) Add a new module to Plink [difficulty rating: Impossible -- code obfuscation] 2) Steal plink parsing code from software (Plink/PlinkSeq/GCTA/Emacks/etc) that readds the files [difficulty rating: Oppressive -- code not modularized at all) 3) Write your own dumb stuff [difficutly rating: Annoying] What's been added is the result of 3. It's a library so nobody else has to do this, so long as they're comfortable with python. --- .../variantutils/VariantsToBinaryPed.java | 22 ++++++++++++ .../sting/utils/MannWhitneyU.java | 36 ++++++++++++++++--- .../sting/utils/MWUnitTest.java | 5 +++ 3 files changed, 58 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index 6bc6153df..37fc96681 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -37,6 +37,28 @@ public class VariantsToBinaryPed extends RodWalker { @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + /** + * The metaData file can take two formats, the first of which is the first 6 lines of the standard ped file. This + * is what Plink describes as a fam file. An example fam file is (note that there is no header): + * + * CEUTrio NA12878 NA12891 NA12892 2 -9 + * CEUTrio NA12891 UNKN1 UNKN2 2 -9 + * CEUTrio NA12892 UNKN3 UNKN4 1 -9 + * + * where the entries are (FamilyID IndividualID DadID MomID Phenotype Sex) + * + * An alternate format is a two-column key-value file + * + * NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9 + * NA12891 fid=CEUTrio;sex=2;phenotype=-9 + * NA12892 fid=CEUTrio;sex=1;phenotype=-9 + * + * wherein unknown parents needn't be specified. The columns are the individual ID, and a list of key-value pairs. + * + * Regardless of which file is specified, the walker will output a .fam file alongside the bed file. If the + * command line has "-md [name].fam", the fam file will simply be copied. However, if a metadata file of the + * alternate format is passed by "-md [name].txt", the walker will construct a formatted .fam file from the data. + */ @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file " + "(in which case it will be copied to the file you provide as fam output).") File metaDataFile; diff --git a/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java b/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java index 8339e38c9..601f90b4d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java +++ b/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java @@ -30,16 +30,26 @@ public class MannWhitneyU { private int sizeSet2; private ExactMode exactMode; - public MannWhitneyU() { - observations = new TreeSet>(new DitheringComparator()); + public MannWhitneyU(ExactMode mode, boolean dither) { + if ( dither ) + observations = new TreeSet>(new DitheringComparator()); + else + observations = new TreeSet>(new NumberedPairComparator()); sizeSet1 = 0; sizeSet2 = 0; - exactMode = ExactMode.POINT; + exactMode = mode; + } + + public MannWhitneyU() { + this(ExactMode.POINT,true); + } + + public MannWhitneyU(boolean dither) { + this(ExactMode.POINT,dither); } public MannWhitneyU(ExactMode mode) { - super(); - exactMode = mode; + this(mode,true); } /** @@ -451,6 +461,22 @@ public class MannWhitneyU { } } + /** + * A comparator that reaches into the pair and compares numbers without tie-braking. + */ + private static class NumberedPairComparator implements Comparator>, Serializable { + + public NumberedPairComparator() {} + + @Override + public boolean equals(Object other) { return false; } + + @Override + public int compare(Pair left, Pair right ) { + return Double.compare(left.first.doubleValue(),right.first.doubleValue()); + } + } + public enum USet { SET1, SET2 } public enum ExactMode { POINT, CUMULATIVE } diff --git a/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java index 6a01bb0b4..edd1bc356 100755 --- a/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java @@ -40,12 +40,15 @@ public class MWUnitTest extends BaseTest { Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),11L); MannWhitneyU mwu2 = new MannWhitneyU(); + MannWhitneyU mwuNoDither = new MannWhitneyU(false); for ( int dp : new int[]{2,4,5,6,8} ) { mwu2.add(dp,MannWhitneyU.USet.SET1); + mwuNoDither.add(dp,MannWhitneyU.USet.SET1); } for ( int dp : new int[]{1,3,7,9,10,11,12,13} ) { mwu2.add(dp,MannWhitneyU.USet.SET2); + mwuNoDither.add(dp,MannWhitneyU.USet.SET2); } MannWhitneyU.ExactMode pm = MannWhitneyU.ExactMode.POINT; @@ -54,6 +57,8 @@ public class MWUnitTest extends BaseTest { // tests using the hypothesis that set 2 dominates set 1 (U value = 10) Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET1),10L); Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET2),30L); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET1),10L); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET2),30L); Pair sizes = mwu2.getSetSizes();