Code improvements re: JIRA GSA-510. Trio class migrated into the Samples package - because the trio structure is so ubiquitously used, it makes sense, I think, to have a class which imposes the structure on the samples. Existing functions which slightly duplicated the getTrios() method look like they have bugs. These functions are now deprecated.

A number of functions int he sampleDB looked to be assuming that samples could not share IDs (e.g. sample IDs are unique, so a sample present in two families could not be represented by multiple Sample objects). Added an assertion in the SampleDBBuilder to document/test this assumption.

MVLikelihoodRatio now uses the trio methods from SampleDB.
This commit is contained in:
Christopher Hartl 2012-08-25 08:48:27 -07:00
parent 0996bbd548
commit b59948709f
4 changed files with 124 additions and 66 deletions

View File

@ -168,13 +168,70 @@ public class SampleDB {
return families;
}
/**
* Returns all the trios present in the sample database. The strictOneChild parameter determines
* whether multiple children of the same parents resolve to multiple trios, or are excluded
* @param strictOneChild - exclude pedigrees with >1 child for parental pair
* @return - all of the mother+father=child triplets, subject to strictOneChild
*/
public final Set<Trio> getTrios(boolean strictOneChild) {
Set<Trio> trioSet = new HashSet<Trio>();
for ( String familyString : getFamilyIDs() ) {
Set<Sample> family = getFamily(familyString);
for ( Sample sample : family) {
if ( sample.getParents().size() == 2 ) {
Trio trio = new Trio(sample.getMother(),sample.getFather(),sample);
trioSet.add(trio);
}
}
}
if ( strictOneChild )
trioSet = removeTriosWithSameParents(trioSet);
return trioSet;
}
/**
* Returns all the trios present in the db. See getTrios(boolean strictOneChild)
* @return all the trios present in the samples db.
*/
public final Set<Trio> getTrios() {
return getTrios(false);
}
/**
* Subsets a set of trios to only those with nonmatching founders. If two (or more) trio objects have
* the same mother and father, then both (all) are removed from the returned set.
* @param trios - a set of Trio objects
* @return those subset of Trio objects in the input set with nonmatching founders
*/
private Set<Trio> removeTriosWithSameParents(final Set<Trio> trios) {
Set<Trio> filteredTrios = new HashSet<Trio>();
filteredTrios.addAll(trios);
Set<Trio> triosWithSameParents = new HashSet<Trio>();
for ( Trio referenceTrio : filteredTrios ) {
for ( Trio compareTrio : filteredTrios ) {
if ( referenceTrio != compareTrio &&
referenceTrio.getFather().equals(compareTrio.getFather()) &&
referenceTrio.getMother().equals(compareTrio.getMother()) ) {
triosWithSameParents.add(referenceTrio);
triosWithSameParents.add(compareTrio);
}
}
}
filteredTrios.removeAll(triosWithSameParents);
return filteredTrios;
}
/**
* Returns the set of all children that have both of their parents.
* Note that if a family is composed of more than 1 child, each child is
* returned.
* @return - all the children that have both of their parents
* @deprecated - getTrios() replaces this function
*/
@Deprecated
public final Set<Sample> getChildrenWithParents(){
return getChildrenWithParents(false);
}
@ -188,7 +245,15 @@ public class SampleDB {
*
* @param triosOnly - if set to true, only strict trios are returned
* @return - all the children that have both of their parents
* @deprecated - getTrios(boolean strict) replaces this function
* @bug -- does not work for extracting multiple generations of trios, e.g.
* ..........Mom1------Dad1
* ................|
* ..............Child1--------Mom2
* .......................|
* .....................Child2
*/
@Deprecated
public final Set<Sample> getChildrenWithParents(boolean triosOnly) {
Map<String, Set<Sample>> families = getFamilies();

View File

@ -135,9 +135,8 @@ public class SampleDBBuilder {
// --------------------------------------------------------------------------------
protected final void validate() {
if ( validationStrictness == PedigreeValidationType.SILENT )
return;
else {
validatePedigreeIDUniqueness();
if ( validationStrictness != PedigreeValidationType.SILENT ) {
// check that samples in data sources are all annotated, if anything is annotated
if ( ! samplesFromPedigrees.isEmpty() && ! samplesFromDataSources.isEmpty() ) {
final Set<String> sampleNamesFromPedigrees = new HashSet<String>();
@ -150,4 +149,12 @@ public class SampleDBBuilder {
}
}
}
private void validatePedigreeIDUniqueness() {
Set<String> pedigreeIDs = new HashSet<String>();
for ( Sample sample : samplesFromPedigrees ) {
pedigreeIDs.add(sample.getID());
}
assert pedigreeIDs.size() == samplesFromPedigrees.size() : "The number of sample IDs extracted from the pedigree does not equal the number of samples in the pedigree. Is a sample associated with multiple families?";
}
}

View File

@ -0,0 +1,45 @@
package org.broadinstitute.sting.gatk.samples;
/**
* A class for imposing a trio structure on three samples; a common paradigm
*
* todo -- there should probably be an interface or abstract class "Pedigree" that generalizes the notion of
* -- imposing structure on samples. But given how complex pedigrees can quickly become, it's not
* -- clear the best way to do this.
*/
public class Trio {
private Sample mother;
private Sample father;
private Sample child;
public Trio(Sample mom, Sample dad, Sample spawn) {
assert mom.getID().equals(spawn.getMaternalID()) && dad.getID().equals(spawn.getPaternalID()) : "Samples passed to trio constructor do not form a trio";
mother = mom;
father = dad;
child = spawn;
}
public Sample getMother() {
return mother;
}
public String getMaternalID() {
return mother.getID();
}
public Sample getFather() {
return father;
}
public String getPaternalID() {
return father.getID();
}
public Sample getChild() {
return child;
}
public String getChildID() {
return child.getID();
}
}

View File

@ -3,8 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.samples.Trio;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
@ -39,7 +38,7 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
if ( mendelianViolation == null ) {
trios = checkAndSetSamples(((Walker) walker).getSampleDB());
trios = ((Walker) walker).getSampleDB().getTrios();
if ( trios.size() > 0 ) {
mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP );
}
@ -53,7 +52,7 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment
double maxMVLR = Double.MIN_VALUE;
for ( Trio trio : trios ) {
if ( contextHasTrioLikelihoods(vc,trio) ) {
Double likR = mendelianViolation.violationLikelihoodRatio(vc,trio.getMaternalID(),trio.getPaternalID(),trio.childId);
Double likR = mendelianViolation.violationLikelihoodRatio(vc,trio.getMaternalID(),trio.getPaternalID(),trio.getChildID());
maxMVLR = likR > maxMVLR ? likR : maxMVLR;
//pNoMV *= (1.0-Math.pow(10.0,likR)/(1+Math.pow(10.0,likR)));
}
@ -71,24 +70,9 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(MVLR_KEY, 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); }
// todo - this entire function should be in samples DB
private Set<Trio> checkAndSetSamples(SampleDB db){
Set<Trio> trioSet = new HashSet<Trio>();
for ( String familyString : db.getFamilyIDs() ) {
Set<Sample> family = db.getFamily(familyString);
for ( Sample sample : family) {
if ( sample.getParents().size() == 2 ) {
Trio trio = new Trio(sample.getMaternalID(),sample.getPaternalID(),sample.getID());
trioSet.add(trio);
}
}
}
return trioSet;
}
private boolean contextHasTrioLikelihoods(VariantContext context, Trio trio) {
for ( String sample : trio ) {
for ( String sample : Arrays.asList(trio.getMaternalID(),trio.getPaternalID(),trio.getChildID()) ) {
if ( ! context.hasGenotype(sample) )
return false;
if ( ! context.getGenotype(sample).hasLikelihoods() )
@ -98,47 +82,4 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment
return true;
}
// TODO -- this class is too much.
// TODO -- Why iterable?
// TODO -- shuoldn't this be in samplesDB() so you can just called samplesDB().getTrios()
// TODO -- should just have final string IDs, and getters, no setters
private class Trio implements Iterable<String> {
private String maternalID;
private String paternalID;
private String childId;
public Trio(String mom, String dad, String child) {
this.maternalID = mom;
this.paternalID = dad;
this.childId = child;
}
public String getMaternalID() {
return this.maternalID;
}
public String getPaternalID() {
return this.paternalID;
}
public String getChildId() {
return this.childId;
}
public void setMaternalID(String id) {
this.maternalID = id;
}
public void setPaternalID(String id) {
this.paternalID = id;
}
public void setChildId(String id) {
this.childId = id;
}
public Iterator<String> iterator() {
return Arrays.asList(maternalID,paternalID,childId).iterator();
}
}
}