A couple of minor things.

1) Better documentation on the meta data file for VariantsToBinaryPed with examples of each file type

2) MannWhitneyU can now take an argument on creation to turn off dithering. This pertains to JIRA-GSA-571 but does not fix it,
   as it isn't hooked up to the command line. Next step is to add an argument to the command line where it's accessible to the
   annotation classes (e.g. from either UG or the VariantAnnotator).

3) Added some dumb python scripts to deal with Plink files, and a script to convert plink binaries to VCF to help sanity check. Basically if you want to do an analysis on genotype data stored in plink binary format, your choices are:
  1) Add a new module to Plink [difficulty rating: Impossible -- code obfuscation]
  2) Steal plink parsing code from software (Plink/PlinkSeq/GCTA/Emacks/etc) that readds the files [difficulty rating: Oppressive -- code not modularized at all)
  3) Write your own dumb stuff [difficutly rating: Annoying]
What's been added is the result of 3. It's a library so nobody else has to do this, so long as they're comfortable with python.
This commit is contained in:
Christopher Hartl 2012-09-20 12:48:13 -04:00
parent 1ef6fa7eed
commit d25579deeb
3 changed files with 58 additions and 5 deletions

View File

@ -37,6 +37,28 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
/**
* The metaData file can take two formats, the first of which is the first 6 lines of the standard ped file. This
* is what Plink describes as a fam file. An example fam file is (note that there is no header):
*
* CEUTrio NA12878 NA12891 NA12892 2 -9
* CEUTrio NA12891 UNKN1 UNKN2 2 -9
* CEUTrio NA12892 UNKN3 UNKN4 1 -9
*
* where the entries are (FamilyID IndividualID DadID MomID Phenotype Sex)
*
* An alternate format is a two-column key-value file
*
* NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9
* NA12891 fid=CEUTrio;sex=2;phenotype=-9
* NA12892 fid=CEUTrio;sex=1;phenotype=-9
*
* wherein unknown parents needn't be specified. The columns are the individual ID, and a list of key-value pairs.
*
* Regardless of which file is specified, the walker will output a .fam file alongside the bed file. If the
* command line has "-md [name].fam", the fam file will simply be copied. However, if a metadata file of the
* alternate format is passed by "-md [name].txt", the walker will construct a formatted .fam file from the data.
*/
@Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file " +
"(in which case it will be copied to the file you provide as fam output).")
File metaDataFile;

View File

@ -30,16 +30,26 @@ public class MannWhitneyU {
private int sizeSet2;
private ExactMode exactMode;
public MannWhitneyU() {
observations = new TreeSet<Pair<Number,USet>>(new DitheringComparator());
public MannWhitneyU(ExactMode mode, boolean dither) {
if ( dither )
observations = new TreeSet<Pair<Number,USet>>(new DitheringComparator());
else
observations = new TreeSet<Pair<Number,USet>>(new NumberedPairComparator());
sizeSet1 = 0;
sizeSet2 = 0;
exactMode = ExactMode.POINT;
exactMode = mode;
}
public MannWhitneyU() {
this(ExactMode.POINT,true);
}
public MannWhitneyU(boolean dither) {
this(ExactMode.POINT,dither);
}
public MannWhitneyU(ExactMode mode) {
super();
exactMode = mode;
this(mode,true);
}
/**
@ -451,6 +461,22 @@ public class MannWhitneyU {
}
}
/**
* A comparator that reaches into the pair and compares numbers without tie-braking.
*/
private static class NumberedPairComparator implements Comparator<Pair<Number,USet>>, Serializable {
public NumberedPairComparator() {}
@Override
public boolean equals(Object other) { return false; }
@Override
public int compare(Pair<Number,USet> left, Pair<Number,USet> right ) {
return Double.compare(left.first.doubleValue(),right.first.doubleValue());
}
}
public enum USet { SET1, SET2 }
public enum ExactMode { POINT, CUMULATIVE }

View File

@ -40,12 +40,15 @@ public class MWUnitTest extends BaseTest {
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),11L);
MannWhitneyU mwu2 = new MannWhitneyU();
MannWhitneyU mwuNoDither = new MannWhitneyU(false);
for ( int dp : new int[]{2,4,5,6,8} ) {
mwu2.add(dp,MannWhitneyU.USet.SET1);
mwuNoDither.add(dp,MannWhitneyU.USet.SET1);
}
for ( int dp : new int[]{1,3,7,9,10,11,12,13} ) {
mwu2.add(dp,MannWhitneyU.USet.SET2);
mwuNoDither.add(dp,MannWhitneyU.USet.SET2);
}
MannWhitneyU.ExactMode pm = MannWhitneyU.ExactMode.POINT;
@ -54,6 +57,8 @@ public class MWUnitTest extends BaseTest {
// tests using the hypothesis that set 2 dominates set 1 (U value = 10)
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET1),10L);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET2),30L);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET1),10L);
Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET2),30L);
Pair<Integer,Integer> sizes = mwu2.getSetSizes();