Adding a few helper methods for accessing sample metadata, and associated unit tests. These are motivated by discussion with Ryan about how he'll use sample metadata in VariantEvalWalker - hopefully they will make it easier for him. Methods are:

-- getToolkit().subContextFromSampleProperty(): filters a VariantContext to genotypes that come from samples that have a given property value
-- getToolkit().getSamplesWithProperty(): gets all samples with a given property
-- getToolkit().getSamplesByVariantContext(): gets the sample objects that are referenced by name in a VariantContext



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4361 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
bthomas 2010-09-28 02:16:25 +00:00
parent 51fdf9d701
commit 96cccafb0d
6 changed files with 251 additions and 19 deletions

View File

@ -29,6 +29,7 @@ import net.sf.picard.filter.SamRecordFilter;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.*;
import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.commandline.ArgumentSource;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
@ -799,4 +800,61 @@ public abstract class AbstractGenomeAnalysisEngine {
return sampleDataSource.getSamples(sampleNameList);
}
/**
 * Looks up every sample that carries the given property, whatever its value
 * (a null value still counts as having the property).
 *
 * @param key name of the property to look for
 * @return the samples on which the property is present
 */
public Set<Sample> getSamplesWithProperty(String key) {
    final Set<Sample> matches = sampleDataSource.getSamplesWithProperty(key);
    return matches;
}
/**
 * Looks up the samples whose property {@code key} is set to {@code value}.
 * Only string values are supported for now; matching on arbitrary objects
 * could be added later through a similar method.
 *
 * @param key   name of the property to look for
 * @param value string value the property must have
 * @return the samples matching both key and value
 */
public Set<Sample> getSamplesWithProperty(String key, String value) {
    final Set<Sample> matches = sampleDataSource.getSamplesWithProperty(key, value);
    return matches;
}
/**
 * Returns a set of sample objects for the sample names in a variant context.
 * Delegates to the sample data source, like the other sample accessors on this
 * class, so the lookup logic lives in one place. The data source resolves each
 * name with getOrCreateSample, so names it does not know yet get a new sample.
 *
 * @param context Any variant context
 * @return a set of the sample objects
 */
public Set<Sample> getSamplesByVariantContext(VariantContext context) {
    return sampleDataSource.getSamplesByVariantContext(context);
}
/**
 * Fetches, from the sample data source, all samples that were referenced in the SAM file.
 *
 * @return the samples referenced in the SAM file
 */
public Set<Sample> getSAMFileSamples() {
    final Set<Sample> fromSamFile = sampleDataSource.getSAMFileSamples();
    return fromSamFile;
}
/**
 * Builds a subcontext limited to the samples whose property {@code key} has the
 * given {@code value}. The work is delegated to the sample data source.
 *
 * @param context VariantContext to filter
 * @param key     property key
 * @param value   property value (must be a string)
 * @return the filtered subcontext
 */
public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) {
    final VariantContext filtered = sampleDataSource.subContextFromSampleProperty(context, key, value);
    return filtered;
}
}

View File

@ -31,6 +31,7 @@ import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.commandline.ArgumentException;
import org.broadinstitute.sting.commandline.ArgumentSource;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShardStrategy;
@ -368,4 +369,5 @@ public class GenomeAnalysisEngine extends AbstractGenomeAnalysisEngine {
outputTracker.prepareWalker(walker, getArguments().strictnessLevel);
}
}

View File

@ -31,9 +31,9 @@ public class Sample implements java.io.Serializable {
}
/**
 * Creates a sample with the given identifier.
 * The identifier is stored as-is: the former null check was disabled in this
 * revision, so a null id is not rejected here.
 *
 * @param id sample identifier
 */
public Sample(String id) {
    this.id = id;
}
@ -166,6 +166,15 @@ public class Sample implements java.io.Serializable {
return properties.get("gender") == Gender.MALE;
}
/**
 * Tells whether a property has been recorded for this sample.
 *
 * @param key property key
 * @return true when the key is present among this sample's properties, even if its value is null
 */
public boolean hasProperty(String key) {
    final boolean present = properties.containsKey(key);
    return present;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;

View File

@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.datasources.sample;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.StingException;
@ -14,12 +16,7 @@ import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.ArrayList;
import java.util.Collection;
import java.util.*;
/**
* Created by IntelliJ IDEA.
@ -488,5 +485,91 @@ public class SampleDataSource {
return samples;
}
/**
 * Collects every known sample that carries the given property, whatever its
 * value (a null value still counts as having the property).
 *
 * @param key name of the property to look for
 * @return a new set holding the samples on which the property is present
 */
public Set<Sample> getSamplesWithProperty(String key) {
    final HashSet<Sample> matching = new HashSet<Sample>();
    for (final Sample candidate : samples.values()) {
        if (!candidate.hasProperty(key)) {
            continue;
        }
        matching.add(candidate);
    }
    return matching;
}
/**
 * Returns a set of samples that have a property with a certain value.
 * Value must be a string for now - could add a similar method for matching any objects in the future.
 * Samples that have the property with a null value never match.
 *
 * @param key Property key
 * @param value String property value
 * @return Set of samples that match key and value (empty if none match)
 */
public Set<Sample> getSamplesWithProperty(String key, String value) {
    // Build a fresh result set instead of removing from the set being iterated:
    // removing inside a for-each over a HashSet throws ConcurrentModificationException.
    Set<Sample> toReturn = new HashSet<Sample>();
    for (Sample s : getSamplesWithProperty(key)) {
        Object property = s.getProperty(key);
        // hasProperty() is true even for null values, so guard before calling equals()
        if (property != null && property.equals(value))
            toReturn.add(s);
    }
    return toReturn;
}
/**
 * Returns the sample registered under the given id; when getSampleById finds
 * none, a new sample is created, added via addSample, and returned.
 *
 * @param id sample identifier
 * @return the existing or newly created sample
 */
public Sample getOrCreateSample(String id) {
    final Sample existing = getSampleById(id);
    if (existing != null) {
        return existing;
    }
    final Sample created = new Sample(id);
    addSample(created);
    return created;
}
/**
 * Collects the known samples that report having a SAM file entry.
 *
 * @return a new set of all samples that were referenced in the SAM file
 */
public Set<Sample> getSAMFileSamples() {
    final Set<Sample> fromSamFile = new HashSet<Sample>();
    for (final Sample candidate : samples.values()) {
        if (!candidate.hasSAMFileEntry()) {
            continue;
        }
        fromSamFile.add(candidate);
    }
    return fromSamFile;
}
/**
 * Resolves the sample names of a variant context to sample objects.
 * Each name goes through getOrCreateSample, so a name without a registered
 * sample gets a new one.
 *
 * @param context Any variant context
 * @return a new set holding one sample object per sample name in the context
 */
public Set<Sample> getSamplesByVariantContext(VariantContext context) {
    final Set<Sample> resolved = new HashSet<Sample>();
    for (final String name : context.getSampleNames()) {
        final Sample sample = getOrCreateSample(name);
        resolved.add(sample);
    }
    return resolved;
}
/**
 * Return a subcontext restricted to samples with a given property key/value.
 * Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering.
 * Sample names in the context that are unknown to this data source, or whose
 * property is missing or null, are left out of the subcontext.
 *
 * @param context VariantContext to filter
 * @param key property key
 * @param value property value (must be string)
 * @return subcontext
 */
public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) {
    Set<String> samplesWithProperty = new HashSet<String>();
    for (String sampleName : context.getSampleNames()) {
        Sample s = samples.get(sampleName);
        if (s == null || !s.hasProperty(key))
            continue;
        // hasProperty() is true even for null values, so guard before calling equals()
        Object property = s.getProperty(key);
        if (property != null && property.equals(value))
            samplesWithProperty.add(sampleName);
    }
    Map<String, Genotype> genotypes = context.getGenotypes(samplesWithProperty);
    return context.subContextFromGenotypes(genotypes.values());
}
}

View File

@ -25,9 +25,11 @@
package org.broadinstitute.sting.utils.vcf;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.apache.log4j.Logger;

View File

@ -1,15 +1,16 @@
package org.broadinstitute.sting.gatk.datasources.sample;
import net.sf.samtools.SAMFileHeader;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.junit.Assert;
import org.junit.Test;
import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import java.util.*;
/**
* Created by IntelliJ IDEA.
@ -46,13 +47,6 @@ public class SampleDataSourceUnitTest extends BaseTest {
Assert.assertTrue(family.size() == 2);
Assert.assertTrue(family.contains(sampleA));
Assert.assertTrue(family.contains(sampleB));
// make sure getSamples(List names) works
ArrayList<String> names = new ArrayList<String>();
names.add("sampleA");
names.add("sampleB");
Set<Sample> testList = s.getSamples(names);
Assert.assertTrue(testList.size() == 2);
}
// but that file should fail if it has an extra character in it...
@ -137,7 +131,27 @@ public class SampleDataSourceUnitTest extends BaseTest {
Assert.assertTrue(s.sampleCount() == 5);
Assert.assertTrue(s.getSampleById("sampleE").getProperty("propC").equals("valC"));
Assert.assertTrue(s.getSampleById("sampleA").getProperty("propA").equals("valA"));
}
}
/**
 * Exercises getSamplesWithProperty(key, value).
 * The fixture file holds 4 samples - 2 with population "CEU", 1 with population "ABC", 1 with no population.
 */
@Test()
public void getSamplesWithPropertyTest() {
    final File propertiesFile = new File(sampleFilesDir + "sampleFileWithProperties.yaml");
    final SampleDataSource dataSource = new SampleDataSource(header, makeFileList(propertiesFile));
    Assert.assertTrue(dataSource.sampleCount() == 4);

    final Set<Sample> ceu = dataSource.getSamplesWithProperty("population", "CEU");
    Assert.assertTrue(ceu.size() == 2);

    // gather the ids of the two CEU samples and check both expected names are present
    final ArrayList<String> ids = new ArrayList<String>();
    for (final Sample sample : ceu) {
        ids.add(sample.getId());
    }
    Assert.assertTrue(ids.contains("sampleA"));
    Assert.assertTrue(ids.contains("sampleB"));
}
// make sure we can import data types other than Strings
@Test()
@ -150,6 +164,70 @@ public class SampleDataSourceUnitTest extends BaseTest {
Assert.assertTrue(sample.getProperty("c").getClass() == Double.class);
Assert.assertTrue(sample.getProperty("b").getClass() == String.class);
}
/**
 * check that getSamplesByVariantContext works
 * create a variant context with two sample names, and make sure the right samples are there
 */
@Test()
public void variantContextTest() {
    SampleDataSource s = new SampleDataSource(header, null);
    List<Allele> alleleCollection = new ArrayList<Allele>();
    Allele a1 = Allele.create("A", true);
    alleleCollection.add(a1);

    Set<Genotype> genotypeCollection = new HashSet<Genotype>();
    genotypeCollection.add(new Genotype("NA123", alleleCollection));
    genotypeCollection.add(new Genotype("NA456", alleleCollection));

    VariantContext v = new VariantContext("contextName", "chr1", 1, 1, alleleCollection, genotypeCollection);

    // make sure the set that's returned is the right size
    // (use the declared Set<Sample> return type: no raw cast, no dependence on the concrete set class)
    Set<Sample> set = s.getSamplesByVariantContext(v);
    Assert.assertTrue(set.size() == 2);

    // make sure both samples are included
    Iterator<Sample> i = set.iterator();
    ArrayList<String> sampleNames = new ArrayList<String>();
    sampleNames.add(i.next().getId());
    sampleNames.add(i.next().getId());
    Assert.assertTrue(sampleNames.contains("NA123"));
    Assert.assertTrue(sampleNames.contains("NA456"));
}
/**
 * check that subContextFromSampleProperty works
 * create a variant context with four sample names, make sure that it filters correctly to 2
 */
@Test()
public void subContextFromSamplePropertyTest() {
    File sampleFile = new File(sampleFilesDir + "sampleFileWithProperties.yaml");
    SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile));
    Assert.assertTrue(s.sampleCount() == 4);

    List<Allele> alleleCollection = new ArrayList<Allele>();
    Allele a1 = Allele.create("A", true);
    alleleCollection.add(a1);

    // NA123 is not in the sample file; sampleA, sampleB and sampleC are looked up by name
    Set<Genotype> genotypeCollection = new HashSet<Genotype>();
    genotypeCollection.add(new Genotype("NA123", alleleCollection));
    genotypeCollection.add(new Genotype("sampleA", alleleCollection));
    genotypeCollection.add(new Genotype("sampleB", alleleCollection));
    genotypeCollection.add(new Genotype("sampleC", alleleCollection));

    VariantContext v = new VariantContext("contextName", "chr1", 1, 1, alleleCollection, genotypeCollection);
    VariantContext subContext = s.subContextFromSampleProperty(v, "population", "CEU");

    // both CEU samples must survive the filter, and nothing else
    Assert.assertTrue(subContext.getSampleNames().contains("sampleA"));
    Assert.assertTrue(subContext.getSampleNames().contains("sampleB"));
    Assert.assertTrue(subContext.getSampleNames().size() == 2);
}
// we create lots of single item lists...