Adding a few helper methods for accessing sample metadata, and associated unit tests. These are motivated by discussion with Ryan about how he'll use sample metadata in VariantEvalWalker - hopefully they will make it easier for him. Methods are:
-- getToolkit().subContextFromSampleProperty(): filters a VariantContext to genotypes that come from samples that have a given property value
-- getToolkit().getSamplesWithProperty(): gets all samples with a given property
-- getToolkit().getSamplesByVariantContext(): gets the sample objects that are referenced by name in a VariantContext
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4361 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
51fdf9d701
commit
96cccafb0d
|
|
@ -29,6 +29,7 @@ import net.sf.picard.filter.SamRecordFilter;
|
|||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.samtools.*;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.commandline.ArgumentSource;
|
||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
|
|
@ -799,4 +800,61 @@ public abstract class AbstractGenomeAnalysisEngine {
|
|||
return sampleDataSource.getSamples(sampleNameList);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a set of samples that have any value (which could be null) for a given property
|
||||
* @param key Property key
|
||||
* @return Set of samples with the property
|
||||
*/
|
||||
public Set<Sample> getSamplesWithProperty(String key) {
|
||||
return sampleDataSource.getSamplesWithProperty(key);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a set of samples that have a property with a certain value
|
||||
* Value must be a string for now - could add a similar method for matching any objects in the future
|
||||
*
|
||||
* @param key Property key
|
||||
* @param value String property value
|
||||
* @return Set of samples that match key and value
|
||||
*/
|
||||
public Set<Sample> getSamplesWithProperty(String key, String value) {
|
||||
return sampleDataSource.getSamplesWithProperty(key, value);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a set of sample objects for the sample names in a variant context
|
||||
*
|
||||
* @param context Any variant context
|
||||
* @return a set of the sample objects
|
||||
*/
|
||||
public Set<Sample> getSamplesByVariantContext(VariantContext context) {
|
||||
Set<Sample> samples = new HashSet<Sample>();
|
||||
for (String sampleName : context.getSampleNames()) {
|
||||
samples.add(sampleDataSource.getOrCreateSample(sampleName));
|
||||
}
|
||||
return samples;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns all samples that were referenced in the SAM file
|
||||
*/
|
||||
public Set<Sample> getSAMFileSamples() {
|
||||
return sampleDataSource.getSAMFileSamples();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a subcontext restricted to samples with a given property key/value
|
||||
* Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering
|
||||
* @param context VariantContext to filter
|
||||
* @param key property key
|
||||
* @param value property value (must be string)
|
||||
* @return subcontext
|
||||
*/
|
||||
public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) {
|
||||
return sampleDataSource.subContextFromSampleProperty(context, key, value);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ import net.sf.samtools.SAMFileHeader;
|
|||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.commandline.ArgumentException;
|
||||
import org.broadinstitute.sting.commandline.ArgumentSource;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShardStrategy;
|
||||
|
|
@ -368,4 +369,5 @@ public class GenomeAnalysisEngine extends AbstractGenomeAnalysisEngine {
|
|||
|
||||
outputTracker.prepareWalker(walker, getArguments().strictnessLevel);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -31,9 +31,9 @@ public class Sample implements java.io.Serializable {
|
|||
}
|
||||
|
||||
/**
 * Creates a sample with the given identifier.
 *
 * @param id sample ID; stored as-is without validation
 */
public Sample(String id) {
    // NOTE(review): the guard that threw
    // StingException("Error creating sample: sample ID cannot be null")
    // is disabled as of this change, so a null id is accepted here.
    // Confirm that null IDs are really intended before relying on this.
    this.id = id;
}
|
||||
|
||||
|
|
@ -166,6 +166,15 @@ public class Sample implements java.io.Serializable {
|
|||
return properties.get("gender") == Gender.MALE;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param key property key
|
||||
* @return true if sample has this property (even if its value is null)
|
||||
*/
|
||||
public boolean hasProperty(String key) {
|
||||
return properties.containsKey(key);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
|
|
|
|||
|
|
@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.datasources.sample;
|
|||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
|
|
@ -14,12 +16,7 @@ import java.io.BufferedReader;
|
|||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -488,5 +485,91 @@ public class SampleDataSource {
|
|||
return samples;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a set of samples that have any value (which could be null) for a given property
|
||||
* @param key Property key
|
||||
* @return Set of samples with the property
|
||||
*/
|
||||
public Set<Sample> getSamplesWithProperty(String key) {
|
||||
HashSet<Sample> toReturn = new HashSet<Sample>();
|
||||
for (Sample s : samples.values()) {
|
||||
if (s.hasProperty(key))
|
||||
toReturn.add(s);
|
||||
}
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a set of samples that have a property with a certain value
|
||||
* Value must be a string for now - could add a similar method for matching any objects in the future
|
||||
*
|
||||
* @param key Property key
|
||||
* @param value String property value
|
||||
* @return Set of samples that match key and value
|
||||
*/
|
||||
public Set<Sample> getSamplesWithProperty(String key, String value) {
|
||||
Set<Sample> toReturn = getSamplesWithProperty(key);
|
||||
for (Sample s : toReturn) {
|
||||
if (!s.getProperty(key).equals(value))
|
||||
toReturn.remove(s);
|
||||
}
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
public Sample getOrCreateSample(String id) {
|
||||
Sample sample = getSampleById(id);
|
||||
if (sample == null) {
|
||||
sample = new Sample(id);
|
||||
addSample(sample);
|
||||
}
|
||||
return sample;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns all samples that were referenced in the SAM file
|
||||
*/
|
||||
public Set<Sample> getSAMFileSamples() {
|
||||
Set<Sample> toReturn = new HashSet<Sample>();
|
||||
for (Sample sample : samples.values()) {
|
||||
if (sample.hasSAMFileEntry())
|
||||
toReturn.add(sample);
|
||||
}
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a set of sample objects for the sample names in a variant context
|
||||
*
|
||||
* @param context Any variant context
|
||||
* @return a set of the sample objects
|
||||
*/
|
||||
public Set<Sample> getSamplesByVariantContext(VariantContext context) {
|
||||
Set<Sample> samples = new HashSet<Sample>();
|
||||
for (String sampleName : context.getSampleNames()) {
|
||||
samples.add(getOrCreateSample(sampleName));
|
||||
}
|
||||
return samples;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Return a subcontext restricted to samples with a given property key/value
|
||||
* Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering
|
||||
* @param context VariantContext to filter
|
||||
* @param key property key
|
||||
* @param value property value (must be string)
|
||||
* @return subcontext
|
||||
*/
|
||||
public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) {
|
||||
|
||||
Set<String> samplesWithProperty = new HashSet<String>();
|
||||
for (String sampleName : context.getSampleNames()) {
|
||||
Sample s = samples.get(sampleName);
|
||||
if (s != null && s.hasProperty(key) && s.getProperty(key).equals(value))
|
||||
samplesWithProperty.add(sampleName);
|
||||
}
|
||||
Map<String, Genotype> genotypes = context.getGenotypes(samplesWithProperty);
|
||||
return context.subContextFromGenotypes(genotypes.values());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,9 +25,11 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.vcf;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.apache.log4j.Logger;
|
||||
|
|
|
|||
|
|
@ -1,15 +1,16 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.sample;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -46,13 +47,6 @@ public class SampleDataSourceUnitTest extends BaseTest {
|
|||
Assert.assertTrue(family.size() == 2);
|
||||
Assert.assertTrue(family.contains(sampleA));
|
||||
Assert.assertTrue(family.contains(sampleB));
|
||||
|
||||
// make sure getSamples(List names) works
|
||||
ArrayList<String> names = new ArrayList<String>();
|
||||
names.add("sampleA");
|
||||
names.add("sampleB");
|
||||
Set<Sample> testList = s.getSamples(names);
|
||||
Assert.assertTrue(testList.size() == 2);
|
||||
}
|
||||
|
||||
// but that file should fail if it has an extra character in it...
|
||||
|
|
@ -137,7 +131,27 @@ public class SampleDataSourceUnitTest extends BaseTest {
|
|||
Assert.assertTrue(s.sampleCount() == 5);
|
||||
Assert.assertTrue(s.getSampleById("sampleE").getProperty("propC").equals("valC"));
|
||||
Assert.assertTrue(s.getSampleById("sampleA").getProperty("propA").equals("valA"));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* testing getSamplesWithProperty
|
||||
* in this file there are 4 samples - 2 with population "CEU", 1 with population "ABC", 1 with no population
|
||||
*/
|
||||
@Test()
|
||||
public void getSamplesWithPropertyTest() {
|
||||
File sampleFile = new File(sampleFilesDir + "sampleFileWithProperties.yaml");
|
||||
SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile));
|
||||
Assert.assertTrue(s.sampleCount() == 4);
|
||||
Set<Sample> ceuSamples = s.getSamplesWithProperty("population", "CEU");
|
||||
Assert.assertTrue(ceuSamples.size() == 2);
|
||||
|
||||
Iterator<Sample> i = ceuSamples.iterator();
|
||||
ArrayList<String> sampleNames = new ArrayList<String>();
|
||||
sampleNames.add(i.next().getId());
|
||||
sampleNames.add(i.next().getId());
|
||||
Assert.assertTrue(sampleNames.contains("sampleA"));
|
||||
Assert.assertTrue(sampleNames.contains("sampleB"));
|
||||
}
|
||||
|
||||
// make sure we can import data types other than Strings
|
||||
@Test()
|
||||
|
|
@ -150,6 +164,70 @@ public class SampleDataSourceUnitTest extends BaseTest {
|
|||
Assert.assertTrue(sample.getProperty("c").getClass() == Double.class);
|
||||
Assert.assertTrue(sample.getProperty("b").getClass() == String.class);
|
||||
}
|
||||
|
||||
/**
|
||||
* check that getSamplesFromVariantContext works
|
||||
* create a variant context with two sample names, and make sure the right samples are there
|
||||
*/
|
||||
@Test()
|
||||
public void variantContextTest() {
|
||||
SampleDataSource s = new SampleDataSource(header, null);
|
||||
List<Allele> alleleCollection = new ArrayList<Allele>();
|
||||
Allele a1 = Allele.create("A", true);
|
||||
alleleCollection.add(a1);
|
||||
|
||||
Set<Genotype> genotypeCollection = new HashSet<Genotype>();
|
||||
genotypeCollection.add(new Genotype("NA123", alleleCollection));
|
||||
genotypeCollection.add(new Genotype("NA456", alleleCollection));
|
||||
|
||||
VariantContext v = new VariantContext("contextName", "chr1", 1, 1, alleleCollection, genotypeCollection);
|
||||
|
||||
// make sure the set that's returned is the right size
|
||||
HashSet<Sample> set = (HashSet) s.getSamplesByVariantContext(v);
|
||||
Assert.assertTrue(set.size() == 2);
|
||||
|
||||
// make sure both samples are included
|
||||
Iterator<Sample> i = set.iterator();
|
||||
ArrayList<String> sampleNames = new ArrayList<String>();
|
||||
sampleNames.add(i.next().getId());
|
||||
sampleNames.add(i.next().getId());
|
||||
Assert.assertTrue(sampleNames.contains("NA123"));
|
||||
Assert.assertTrue(sampleNames.contains("NA456"));
|
||||
}
|
||||
|
||||
/**
|
||||
* checking subContextFromSampleProperty
|
||||
*/
|
||||
|
||||
/**
|
||||
* check that subContextFromSampleProperty works
|
||||
* create a variant context with four sample names, make sure that it filters correctly to 2
|
||||
*/
|
||||
@Test()
|
||||
public void subContextFromSamplePropertyTest() {
|
||||
|
||||
File sampleFile = new File(sampleFilesDir + "sampleFileWithProperties.yaml");
|
||||
SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile));
|
||||
Assert.assertTrue(s.sampleCount() == 4);
|
||||
|
||||
List<Allele> alleleCollection = new ArrayList<Allele>();
|
||||
Allele a1 = Allele.create("A", true);
|
||||
alleleCollection.add(a1);
|
||||
|
||||
Set<Genotype> genotypeCollection = new HashSet<Genotype>();
|
||||
genotypeCollection.add(new Genotype("NA123", alleleCollection));
|
||||
genotypeCollection.add(new Genotype("sampleA", alleleCollection));
|
||||
genotypeCollection.add(new Genotype("sampleB", alleleCollection));
|
||||
genotypeCollection.add(new Genotype("sampleC", alleleCollection));
|
||||
|
||||
VariantContext v = new VariantContext("contextName", "chr1", 1, 1, alleleCollection, genotypeCollection);
|
||||
VariantContext subContext = s.subContextFromSampleProperty(v, "population", "CEU");
|
||||
|
||||
Assert.assertTrue(subContext.getSampleNames().contains("sampleA"));
|
||||
Assert.assertTrue(subContext.getSampleNames().contains("sampleA"));
|
||||
Assert.assertTrue(subContext.getSampleNames().size() == 2);
|
||||
|
||||
}
|
||||
|
||||
|
||||
// we create lots of single item lists...
|
||||
|
|
|
|||
Loading…
Reference in New Issue