100% version of PedReader

-- Passes all unit tests
-- Added unit tests for missing fields
This commit is contained in:
Mark DePristo 2011-10-03 06:12:58 -07:00
parent dd75ad9f49
commit 52f670c8b8
2 changed files with 114 additions and 70 deletions

View File

@ -117,13 +117,17 @@ public class PedReader {
final static private Set<String> CATAGORICAL_TRAIT_VALUES = new HashSet<String>(Arrays.asList("-9", "0", "1", "2")); final static private Set<String> CATAGORICAL_TRAIT_VALUES = new HashSet<String>(Arrays.asList("-9", "0", "1", "2"));
final static private String commentMarker = "#"; final static private String commentMarker = "#";
public enum MissingPedFields { public enum MissingPedField {
NO_FAMILY_ID, NO_FAMILY_ID,
NO_PARENTS, NO_PARENTS,
NO_SEX, NO_SEX,
NO_PHENOTYPE NO_PHENOTYPE
} }
/**
 * The columns of a PED record, declared in the order they appear on a line
 * (family, individual, father, mother, sex, phenotype).
 *
 * The declaration order is significant: PedReaderUnitTest.sliceContents uses
 * ordinal() as the zero-based column index when dropping fields from a test
 * record, so these constants must not be reordered.
 */
protected enum Field {
FAMILY_ID, INDIVIDUAL_ID, PATERNAL_ID, MATERNAL_ID, GENDER, PHENOTYPE
}
// phenotype // phenotype
private final static String MISSING_VALUE1 = "-9"; private final static String MISSING_VALUE1 = "-9";
private final static String MISSING_VALUE2 = "0"; private final static String MISSING_VALUE2 = "0";
@ -137,21 +141,21 @@ public class PedReader {
public PedReader() { } public PedReader() { }
public final List<Sample> parse(File source, EnumSet<MissingPedFields> missingFields, SampleDataSource sampleDB) throws FileNotFoundException { public final List<Sample> parse(File source, EnumSet<MissingPedField> missingFields, SampleDataSource sampleDB) throws FileNotFoundException {
logger.info("Reading PED file " + source + " with missing fields: " + missingFields); logger.info("Reading PED file " + source + " with missing fields: " + missingFields);
return parse(new FileReader(source), missingFields, sampleDB); return parse(new FileReader(source), missingFields, sampleDB);
} }
public final List<Sample> parse(Reader reader, EnumSet<MissingPedFields> missingFields, SampleDataSource sampleDB) { public final List<Sample> parse(Reader reader, EnumSet<MissingPedField> missingFields, SampleDataSource sampleDB) {
final List<String> lines = new XReadLines(reader).readLines(); final List<String> lines = new XReadLines(reader).readLines();
// What are the record offsets? // What are the record offsets?
final int familyPos = missingFields.contains(MissingPedFields.NO_FAMILY_ID) ? -1 : 0; final int familyPos = missingFields.contains(MissingPedField.NO_FAMILY_ID) ? -1 : 0;
final int samplePos = familyPos + 1; final int samplePos = familyPos + 1;
final int paternalPos = missingFields.contains(MissingPedFields.NO_PARENTS) ? -1 : samplePos + 1; final int paternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : samplePos + 1;
final int maternalPos = missingFields.contains(MissingPedFields.NO_PARENTS) ? -1 : paternalPos + 1; final int maternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : paternalPos + 1;
final int sexPos = missingFields.contains(MissingPedFields.NO_SEX) ? -1 : Math.max(maternalPos, samplePos) + 1; final int sexPos = missingFields.contains(MissingPedField.NO_SEX) ? -1 : Math.max(maternalPos, samplePos) + 1;
final int phenotypePos = missingFields.contains(MissingPedFields.NO_PHENOTYPE) ? -1 : Math.max(sexPos, Math.max(maternalPos, samplePos)) + 1; final int phenotypePos = missingFields.contains(MissingPedField.NO_PHENOTYPE) ? -1 : Math.max(sexPos, Math.max(maternalPos, samplePos)) + 1;
final int nExpectedFields = MathUtils.arrayMaxInt(Arrays.asList(samplePos, paternalPos, maternalPos, sexPos, phenotypePos)) + 1; final int nExpectedFields = MathUtils.arrayMaxInt(Arrays.asList(samplePos, paternalPos, maternalPos, sexPos, phenotypePos)) + 1;
// go through once and determine properties // go through once and determine properties

View File

@ -26,15 +26,14 @@ package org.broadinstitute.sting.gatk.samples;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Utils;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.DataProvider; import org.testng.annotations.DataProvider;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import java.io.StringReader; import java.io.StringReader;
import java.util.Arrays; import java.lang.reflect.Array;
import java.util.EnumSet; import java.util.*;
import java.util.HashSet;
import java.util.List;
/** /**
* UnitTest for PedReader * UnitTest for PedReader
@ -48,7 +47,7 @@ public class PedReaderUnitTest extends BaseTest {
private class PedReaderTest extends TestDataProvider { private class PedReaderTest extends TestDataProvider {
public String fileContents; public String fileContents;
public List<Sample> expectedSamples; public List<Sample> expectedSamples;
EnumSet<PedReader.MissingPedFields> missing; EnumSet<PedReader.MissingPedField> missing;
private PedReaderTest(final String name, final List<Sample> expectedSamples, final String fileContents) { private PedReaderTest(final String name, final List<Sample> expectedSamples, final String fileContents) {
super(PedReaderTest.class, name); super(PedReaderTest.class, name);
@ -57,19 +56,6 @@ public class PedReaderUnitTest extends BaseTest {
} }
} }
private class PedReaderTestMissing extends TestDataProvider {
public String fileContents;
public List<Sample> expectedSamples;
EnumSet<PedReader.MissingPedFields> missing;
private PedReaderTestMissing(final String name, EnumSet<PedReader.MissingPedFields> missing, final List<Sample> expectedSamples, final String fileContents) {
super(PedReaderTest.class, name);
this.fileContents = fileContents;
this.expectedSamples = expectedSamples;
this.missing = missing;
}
}
// Family ID // Family ID
// Individual ID // Individual ID
// Paternal ID // Paternal ID
@ -115,17 +101,17 @@ public class PedReaderUnitTest extends BaseTest {
new PedReaderTest("multipleUnrelated", new PedReaderTest("multipleUnrelated",
Arrays.asList( Arrays.asList(
new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED),
new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.AFFECTED)), new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.AFFECTED)),
String.format("%s%n%s", String.format("%s%n%s",
"fam1 s1 0 0 1 1", "fam1 s1 0 0 1 1",
"fam2 s2 0 0 2 2")); "fam2 s2 0 0 2 2"));
new PedReaderTest("explicitTrio", new PedReaderTest("explicitTrio",
Arrays.asList( Arrays.asList(
new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED),
new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED),
new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.AFFECTED)), new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.AFFECTED)),
String.format("%s%n%s%n%s", String.format("%s%n%s%n%s",
"fam1 kid dad mom 1 2", "fam1 kid dad mom 1 2",
"fam1 dad 0 0 1 1", "fam1 dad 0 0 1 1",
@ -133,29 +119,29 @@ public class PedReaderUnitTest extends BaseTest {
new PedReaderTest("implicitTrio", new PedReaderTest("implicitTrio",
Arrays.asList( Arrays.asList(
new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED),
new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNKNOWN),
new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)),
"fam1 kid dad mom 1 2"); "fam1 kid dad mom 1 2");
new PedReaderTest("partialTrio", new PedReaderTest("partialTrio",
Arrays.asList( Arrays.asList(
new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED),
new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED),
new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)),
String.format("%s%n%s", String.format("%s%n%s",
"fam1 kid dad mom 1 2", "fam1 kid dad mom 1 2",
"fam1 dad 0 0 1 1")); "fam1 dad 0 0 1 1"));
new PedReaderTest("bigPedigree", new PedReaderTest("bigPedigree",
Arrays.asList( Arrays.asList(
new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED),
new Sample("dad", "fam1", "granddad1", "grandma1", Gender.MALE, Affection.UNAFFECTED), new Sample("dad", "fam1", "granddad1", "grandma1", Gender.MALE, Affection.UNAFFECTED),
new Sample("granddad1", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), new Sample("granddad1", "fam1", null, null, Gender.MALE, Affection.UNKNOWN),
new Sample("grandma1", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN), new Sample("grandma1", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN),
new Sample("mom", "fam1", "granddad2", "grandma2", Gender.FEMALE, Affection.AFFECTED), new Sample("mom", "fam1", "granddad2", "grandma2", Gender.FEMALE, Affection.AFFECTED),
new Sample("granddad2", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), new Sample("granddad2", "fam1", null, null, Gender.MALE, Affection.UNKNOWN),
new Sample("grandma2", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), new Sample("grandma2", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)),
String.format("%s%n%s%n%s", String.format("%s%n%s%n%s",
"fam1 kid dad mom 1 2", "fam1 kid dad mom 1 2",
"fam1 dad granddad1 grandma1 1 1", "fam1 dad granddad1 grandma1 1 1",
@ -164,24 +150,24 @@ public class PedReaderUnitTest extends BaseTest {
// Quantitative trait // Quantitative trait
new PedReaderTest("QuantitativeTrait", new PedReaderTest("QuantitativeTrait",
Arrays.asList( Arrays.asList(
new Sample("s1", "fam1", null, null, Gender.MALE, Affection.QUANTITATIVE, 1.0), new Sample("s1", "fam1", null, null, Gender.MALE, Affection.QUANTITATIVE, 1.0),
new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)),
String.format("%s%n%s", String.format("%s%n%s",
"fam1 s1 0 0 1 1", "fam1 s1 0 0 1 1",
"fam2 s2 0 0 2 10.0")); "fam2 s2 0 0 2 10.0"));
new PedReaderTest("QuantitativeTraitWithMissing", new PedReaderTest("QuantitativeTraitWithMissing",
Arrays.asList( Arrays.asList(
new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNKNOWN, Sample.UNSET_QT), new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNKNOWN, Sample.UNSET_QT),
new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)),
String.format("%s%n%s", String.format("%s%n%s",
"fam1 s1 0 0 1 -9", "fam1 s1 0 0 1 -9",
"fam2 s2 0 0 2 10.0")); "fam2 s2 0 0 2 10.0"));
new PedReaderTest("QuantitativeTraitOnlyInts", new PedReaderTest("QuantitativeTraitOnlyInts",
Arrays.asList( Arrays.asList(
new Sample("s1", "fam1", null, null, Gender.MALE, Affection.QUANTITATIVE, 1.0), new Sample("s1", "fam1", null, null, Gender.MALE, Affection.QUANTITATIVE, 1.0),
new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)), new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.QUANTITATIVE, 10.0)),
String.format("%s%n%s", String.format("%s%n%s",
"fam1 s1 0 0 1 1", "fam1 s1 0 0 1 1",
"fam2 s2 0 0 2 10")); "fam2 s2 0 0 2 10"));
@ -189,7 +175,7 @@ public class PedReaderUnitTest extends BaseTest {
return PedReaderTest.getTests(PedReaderTest.class); return PedReaderTest.getTests(PedReaderTest.class);
} }
private static final void runTest(PedReaderTest test, String myFileContents, EnumSet<PedReader.MissingPedFields> missing) { private static final void runTest(PedReaderTest test, String myFileContents, EnumSet<PedReader.MissingPedField> missing) {
logger.warn("Test " + test); logger.warn("Test " + test);
PedReader reader = new PedReader(); PedReader reader = new PedReader();
SampleDataSource sampleDB = new SampleDataSource(); SampleDataSource sampleDB = new SampleDataSource();
@ -199,37 +185,91 @@ public class PedReaderUnitTest extends BaseTest {
@Test(enabled = true, dataProvider = "readerTest") @Test(enabled = true, dataProvider = "readerTest")
public void testPedReader(PedReaderTest test) { public void testPedReader(PedReaderTest test) {
runTest(test, test.fileContents, EnumSet.noneOf(PedReader.MissingPedFields.class)); runTest(test, test.fileContents, EnumSet.noneOf(PedReader.MissingPedField.class));
} }
@Test(enabled = true, dataProvider = "readerTest", dependsOnMethods = "testPedReader") @Test(enabled = true, dataProvider = "readerTest", dependsOnMethods = "testPedReader")
public void testPedReaderWithComments(PedReaderTest test) { public void testPedReaderWithComments(PedReaderTest test) {
runTest(test, String.format("#comment%n%s", test.fileContents), EnumSet.noneOf(PedReader.MissingPedFields.class)); runTest(test, String.format("#comment%n%s", test.fileContents), EnumSet.noneOf(PedReader.MissingPedField.class));
}
// -----------------------------------------------------------------
// missing format field tests
// -----------------------------------------------------------------
/**
 * One test case for parsing a PED record from which some columns are absent.
 *
 * Each case carries a complete record plus two parallel descriptions of what
 * is missing: the flags handed to the reader and the columns to physically
 * strip from the record before parsing.
 */
private class PedReaderTestMissing extends TestDataProvider {
// Flags passed to PedReader.parse() describing which fields the input lacks.
public EnumSet<PedReader.MissingPedField> missingDesc;
// Columns that sliceContents() removes from fileContents before parsing;
// expected to describe the same omissions as missingDesc.
public EnumSet<PedReader.Field> missingFields;
// The full, unsliced PED record for this case.
public final String fileContents;
// The sample the reader should produce for individual "kid" after slicing.
public Sample expected;

/**
 * @param name          label for this test case
 * @param fileContents  complete PED record, before any columns are removed
 * @param missingDesc   missing-field flags to pass to the reader
 * @param missingFields columns to strip from fileContents
 * @param expected      sample expected back from the sample database
 */
private PedReaderTestMissing(final String name, final String fileContents,
EnumSet<PedReader.MissingPedField> missingDesc,
EnumSet<PedReader.Field> missingFields,
final Sample expected) {
// Pass this class (not PedReaderTest.class) so the cases can be fetched with
// getTests(PedReaderTestMissing.class) -- presumably the key TestDataProvider
// files them under; TestDataProvider is not shown here.
super(PedReaderTestMissing.class, name);
this.fileContents = fileContents;
this.missingDesc = missingDesc;
this.missingFields = missingFields;
this.expected = expected;
}
} }
@DataProvider(name = "readerTestMissing") @DataProvider(name = "readerTestMissing")
public Object[][] createPEDFilesWithMissing() { public Object[][] createPEDFilesWithMissing() {
new PedReaderTestMissing("trioMissingFam", EnumSet.of(PedReader.MissingPedFields.NO_FAMILY_ID),
Arrays.asList( new PedReaderTestMissing("missingFam",
new Sample("kid", null, "dad", "mom", Gender.MALE, Affection.AFFECTED), "fam1 kid dad mom 1 2",
new Sample("dad", null, null, null, Gender.MALE, Affection.UNAFFECTED), EnumSet.of(PedReader.MissingPedField.NO_FAMILY_ID),
new Sample("mom", null, null, null, Gender.FEMALE, Affection.AFFECTED)), EnumSet.of(PedReader.Field.FAMILY_ID),
String.format("%s%n%s%n%s", new Sample("kid", null, "dad", "mom", Gender.MALE, Affection.AFFECTED));
"kid dad mom 1 2",
"dad 0 0 1 1", new PedReaderTestMissing("missingParents",
"mom 0 0 2 2")); "fam1 kid dad mom 1 2",
EnumSet.of(PedReader.MissingPedField.NO_PARENTS),
EnumSet.of(PedReader.Field.PATERNAL_ID, PedReader.Field.MATERNAL_ID),
new Sample("kid", "fam1", null, null, Gender.MALE, Affection.AFFECTED));
new PedReaderTestMissing("missingSex",
"fam1 kid dad mom 1 2",
EnumSet.of(PedReader.MissingPedField.NO_SEX),
EnumSet.of(PedReader.Field.GENDER),
new Sample("kid", "fam1", "dad", "mom", Gender.UNKNOWN, Affection.AFFECTED));
new PedReaderTestMissing("missingPhenotype",
"fam1 kid dad mom 1 2",
EnumSet.of(PedReader.MissingPedField.NO_PHENOTYPE),
EnumSet.of(PedReader.Field.PHENOTYPE),
new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.UNKNOWN));
new PedReaderTestMissing("missingEverythingButGender",
"fam1 kid dad mom 1 2",
EnumSet.of(PedReader.MissingPedField.NO_PHENOTYPE, PedReader.MissingPedField.NO_PARENTS, PedReader.MissingPedField.NO_FAMILY_ID),
EnumSet.of(PedReader.Field.FAMILY_ID, PedReader.Field.PATERNAL_ID, PedReader.Field.MATERNAL_ID, PedReader.Field.PHENOTYPE),
new Sample("kid", null, null, null, Gender.MALE, Affection.UNKNOWN));
return PedReaderTestMissing.getTests(PedReaderTestMissing.class); return PedReaderTestMissing.getTests(PedReaderTestMissing.class);
} }
@Test(enabled = true, dataProvider = "readerTestMissing", dependsOnMethods = "testPedReader") @Test(enabled = true, dataProvider = "readerTestMissing", dependsOnMethods = "testPedReader")
public void testPedReaderWithMissing(PedReaderTest test) { public void testPedReaderWithMissing(PedReaderTestMissing test) {
// public enum MissingPedFields { final String contents = sliceContents(test.missingFields, test.fileContents);
// NO_FAMILY_ID, logger.warn("Test " + test);
// NO_PARENTS, PedReader reader = new PedReader();
// NO_SEX, SampleDataSource sampleDB = new SampleDataSource();
// NO_PHENOTYPE reader.parse(new StringReader(contents), test.missingDesc, sampleDB);
// } final Sample missingSample = sampleDB.getSample("kid");
// runTest(test, sliceContents(0, test.fileContents), EnumSet.of(PedReader.MissingPedFields.NO_FAMILY_ID)); Assert.assertEquals(test.expected, missingSample, "Missing field value not expected value for " + test);
}
/**
 * Builds a PED record with the given columns removed.
 *
 * The record is split on runs of whitespace, the column at each missing
 * field's ordinal() position is deleted, and the remaining columns are
 * re-joined with tabs.
 *
 * NOTE(review): because the split pattern also matches line breaks, this only
 * behaves as intended for a single-record input; a multi-line string would be
 * flattened into one list of tokens.
 *
 * @param missingFieldsSet columns to drop from the record
 * @param full             a complete PED record containing every column
 * @return the record without the missing columns, tab-separated
 */
private final static String sliceContents(EnumSet<PedReader.Field> missingFieldsSet, String full) {
// Copy into an ArrayList: Arrays.asList alone is fixed-size and cannot remove.
List<String> parts = new ArrayList<String>(Arrays.asList(full.split("\\s+")));
final List<PedReader.Field> missingFields = new ArrayList<PedReader.Field>(missingFieldsSet);
// EnumSet iterates in declaration (ordinal) order; reversing it removes the
// highest column first so earlier indices are not shifted by later removals.
Collections.reverse(missingFields);
for ( PedReader.Field field : missingFields )
// ordinal() is an int, so this is remove(int index), not remove(Object).
parts.remove(field.ordinal());
return Utils.join("\t", parts);
} }
} }