Adding a few helper methods for accessing sample metadata, along with their unit tests. These were motivated by a discussion with Ryan about how he'll use sample metadata in VariantEvalWalker - hopefully they will make it easier for him. The methods are:
-- getToolkit().subContextFromSampleProperty(): filters a VariantContext down to the genotypes that come from samples with a given property value
-- getToolkit().getSamplesWithProperty(): gets all samples with a given property
-- getToolkit().getSamplesByVariantContext(): gets the Sample objects that are referenced by name in a VariantContext
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4361 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
51fdf9d701
commit
96cccafb0d
|
|
@ -29,6 +29,7 @@ import net.sf.picard.filter.SamRecordFilter;
|
||||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.commandline.ArgumentSource;
|
import org.broadinstitute.sting.commandline.ArgumentSource;
|
||||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||||
|
|
@ -799,4 +800,61 @@ public abstract class AbstractGenomeAnalysisEngine {
|
||||||
return sampleDataSource.getSamples(sampleNameList);
|
return sampleDataSource.getSamples(sampleNameList);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a set of samples that have any value (which could be null) for a given property
|
||||||
|
* @param key Property key
|
||||||
|
* @return Set of samples with the property
|
||||||
|
*/
|
||||||
|
public Set<Sample> getSamplesWithProperty(String key) {
|
||||||
|
return sampleDataSource.getSamplesWithProperty(key);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a set of samples that have a property with a certain value
|
||||||
|
* Value must be a string for now - could add a similar method for matching any objects in the future
|
||||||
|
*
|
||||||
|
* @param key Property key
|
||||||
|
* @param value String property value
|
||||||
|
* @return Set of samples that match key and value
|
||||||
|
*/
|
||||||
|
public Set<Sample> getSamplesWithProperty(String key, String value) {
|
||||||
|
return sampleDataSource.getSamplesWithProperty(key, value);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a set of sample objects for the sample names in a variant context
|
||||||
|
*
|
||||||
|
* @param context Any variant context
|
||||||
|
* @return a set of the sample objects
|
||||||
|
*/
|
||||||
|
public Set<Sample> getSamplesByVariantContext(VariantContext context) {
|
||||||
|
Set<Sample> samples = new HashSet<Sample>();
|
||||||
|
for (String sampleName : context.getSampleNames()) {
|
||||||
|
samples.add(sampleDataSource.getOrCreateSample(sampleName));
|
||||||
|
}
|
||||||
|
return samples;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns all samples that were referenced in the SAM file
|
||||||
|
*/
|
||||||
|
public Set<Sample> getSAMFileSamples() {
|
||||||
|
return sampleDataSource.getSAMFileSamples();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return a subcontext restricted to samples with a given property key/value
|
||||||
|
* Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering
|
||||||
|
* @param context VariantContext to filter
|
||||||
|
* @param key property key
|
||||||
|
* @param value property value (must be string)
|
||||||
|
* @return subcontext
|
||||||
|
*/
|
||||||
|
public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) {
|
||||||
|
return sampleDataSource.subContextFromSampleProperty(context, key, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -31,6 +31,7 @@ import net.sf.samtools.SAMFileHeader;
|
||||||
import net.sf.samtools.SAMSequenceRecord;
|
import net.sf.samtools.SAMSequenceRecord;
|
||||||
import org.broadinstitute.sting.commandline.ArgumentException;
|
import org.broadinstitute.sting.commandline.ArgumentException;
|
||||||
import org.broadinstitute.sting.commandline.ArgumentSource;
|
import org.broadinstitute.sting.commandline.ArgumentSource;
|
||||||
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||||
import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShardStrategy;
|
import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShardStrategy;
|
||||||
|
|
@ -368,4 +369,5 @@ public class GenomeAnalysisEngine extends AbstractGenomeAnalysisEngine {
|
||||||
|
|
||||||
outputTracker.prepareWalker(walker, getArguments().strictnessLevel);
|
outputTracker.prepareWalker(walker, getArguments().strictnessLevel);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -31,9 +31,9 @@ public class Sample implements java.io.Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public Sample(String id) {
|
public Sample(String id) {
|
||||||
if (id == null) {
|
/* if (id == null) {
|
||||||
throw new StingException("Error creating sample: sample ID cannot be null");
|
throw new StingException("Error creating sample: sample ID cannot be null");
|
||||||
}
|
}*/
|
||||||
this.id = id;
|
this.id = id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -166,6 +166,15 @@ public class Sample implements java.io.Serializable {
|
||||||
return properties.get("gender") == Gender.MALE;
|
return properties.get("gender") == Gender.MALE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param key property key
|
||||||
|
* @return true if sample has this property (even if its value is null)
|
||||||
|
*/
|
||||||
|
public boolean hasProperty(String key) {
|
||||||
|
return properties.containsKey(key);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object o) {
|
public boolean equals(Object o) {
|
||||||
if (this == o) return true;
|
if (this == o) return true;
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.datasources.sample;
|
||||||
import net.sf.samtools.SAMFileHeader;
|
import net.sf.samtools.SAMFileHeader;
|
||||||
import net.sf.samtools.SAMReadGroupRecord;
|
import net.sf.samtools.SAMReadGroupRecord;
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broad.tribble.util.variantcontext.Genotype;
|
||||||
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.utils.SampleUtils;
|
import org.broadinstitute.sting.utils.SampleUtils;
|
||||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
|
@ -14,12 +16,7 @@ import java.io.BufferedReader;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileReader;
|
import java.io.FileReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashMap;
|
import java.util.*;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collection;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by IntelliJ IDEA.
|
* Created by IntelliJ IDEA.
|
||||||
|
|
@ -488,5 +485,91 @@ public class SampleDataSource {
|
||||||
return samples;
|
return samples;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a set of samples that have any value (which could be null) for a given property
|
||||||
|
* @param key Property key
|
||||||
|
* @return Set of samples with the property
|
||||||
|
*/
|
||||||
|
public Set<Sample> getSamplesWithProperty(String key) {
|
||||||
|
HashSet<Sample> toReturn = new HashSet<Sample>();
|
||||||
|
for (Sample s : samples.values()) {
|
||||||
|
if (s.hasProperty(key))
|
||||||
|
toReturn.add(s);
|
||||||
|
}
|
||||||
|
return toReturn;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a set of samples that have a property with a certain value
|
||||||
|
* Value must be a string for now - could add a similar method for matching any objects in the future
|
||||||
|
*
|
||||||
|
* @param key Property key
|
||||||
|
* @param value String property value
|
||||||
|
* @return Set of samples that match key and value
|
||||||
|
*/
|
||||||
|
public Set<Sample> getSamplesWithProperty(String key, String value) {
|
||||||
|
Set<Sample> toReturn = getSamplesWithProperty(key);
|
||||||
|
for (Sample s : toReturn) {
|
||||||
|
if (!s.getProperty(key).equals(value))
|
||||||
|
toReturn.remove(s);
|
||||||
|
}
|
||||||
|
return toReturn;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Sample getOrCreateSample(String id) {
|
||||||
|
Sample sample = getSampleById(id);
|
||||||
|
if (sample == null) {
|
||||||
|
sample = new Sample(id);
|
||||||
|
addSample(sample);
|
||||||
|
}
|
||||||
|
return sample;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns all samples that were referenced in the SAM file
|
||||||
|
*/
|
||||||
|
public Set<Sample> getSAMFileSamples() {
|
||||||
|
Set<Sample> toReturn = new HashSet<Sample>();
|
||||||
|
for (Sample sample : samples.values()) {
|
||||||
|
if (sample.hasSAMFileEntry())
|
||||||
|
toReturn.add(sample);
|
||||||
|
}
|
||||||
|
return toReturn;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a set of sample objects for the sample names in a variant context
|
||||||
|
*
|
||||||
|
* @param context Any variant context
|
||||||
|
* @return a set of the sample objects
|
||||||
|
*/
|
||||||
|
public Set<Sample> getSamplesByVariantContext(VariantContext context) {
|
||||||
|
Set<Sample> samples = new HashSet<Sample>();
|
||||||
|
for (String sampleName : context.getSampleNames()) {
|
||||||
|
samples.add(getOrCreateSample(sampleName));
|
||||||
|
}
|
||||||
|
return samples;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return a subcontext restricted to samples with a given property key/value
|
||||||
|
* Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering
|
||||||
|
* @param context VariantContext to filter
|
||||||
|
* @param key property key
|
||||||
|
* @param value property value (must be string)
|
||||||
|
* @return subcontext
|
||||||
|
*/
|
||||||
|
public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) {
|
||||||
|
|
||||||
|
Set<String> samplesWithProperty = new HashSet<String>();
|
||||||
|
for (String sampleName : context.getSampleNames()) {
|
||||||
|
Sample s = samples.get(sampleName);
|
||||||
|
if (s != null && s.hasProperty(key) && s.getProperty(key).equals(value))
|
||||||
|
samplesWithProperty.add(sampleName);
|
||||||
|
}
|
||||||
|
Map<String, Genotype> genotypes = context.getGenotypes(samplesWithProperty);
|
||||||
|
return context.subContextFromGenotypes(genotypes.values());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -25,9 +25,11 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.utils.vcf;
|
package org.broadinstitute.sting.utils.vcf;
|
||||||
|
|
||||||
|
import org.broad.tribble.util.variantcontext.Genotype;
|
||||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
import org.broad.tribble.vcf.*;
|
import org.broad.tribble.vcf.*;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
|
||||||
|
|
@ -1,15 +1,16 @@
|
||||||
package org.broadinstitute.sting.gatk.datasources.sample;
|
package org.broadinstitute.sting.gatk.datasources.sample;
|
||||||
|
|
||||||
import net.sf.samtools.SAMFileHeader;
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import org.broad.tribble.util.variantcontext.Allele;
|
||||||
|
import org.broad.tribble.util.variantcontext.Genotype;
|
||||||
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.BaseTest;
|
import org.broadinstitute.sting.BaseTest;
|
||||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by IntelliJ IDEA.
|
* Created by IntelliJ IDEA.
|
||||||
|
|
@ -46,13 +47,6 @@ public class SampleDataSourceUnitTest extends BaseTest {
|
||||||
Assert.assertTrue(family.size() == 2);
|
Assert.assertTrue(family.size() == 2);
|
||||||
Assert.assertTrue(family.contains(sampleA));
|
Assert.assertTrue(family.contains(sampleA));
|
||||||
Assert.assertTrue(family.contains(sampleB));
|
Assert.assertTrue(family.contains(sampleB));
|
||||||
|
|
||||||
// make sure getSamples(List names) works
|
|
||||||
ArrayList<String> names = new ArrayList<String>();
|
|
||||||
names.add("sampleA");
|
|
||||||
names.add("sampleB");
|
|
||||||
Set<Sample> testList = s.getSamples(names);
|
|
||||||
Assert.assertTrue(testList.size() == 2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// but that file should fail if it has an extra character in it...
|
// but that file should fail if it has an extra character in it...
|
||||||
|
|
@ -137,7 +131,27 @@ public class SampleDataSourceUnitTest extends BaseTest {
|
||||||
Assert.assertTrue(s.sampleCount() == 5);
|
Assert.assertTrue(s.sampleCount() == 5);
|
||||||
Assert.assertTrue(s.getSampleById("sampleE").getProperty("propC").equals("valC"));
|
Assert.assertTrue(s.getSampleById("sampleE").getProperty("propC").equals("valC"));
|
||||||
Assert.assertTrue(s.getSampleById("sampleA").getProperty("propA").equals("valA"));
|
Assert.assertTrue(s.getSampleById("sampleA").getProperty("propA").equals("valA"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* testing getSamplesWithProperty
|
||||||
|
* in this file there are 4 samples - 2 with population "CEU", 1 with population "ABC", 1 with no population
|
||||||
|
*/
|
||||||
|
@Test()
|
||||||
|
public void getSamplesWithPropertyTest() {
|
||||||
|
File sampleFile = new File(sampleFilesDir + "sampleFileWithProperties.yaml");
|
||||||
|
SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile));
|
||||||
|
Assert.assertTrue(s.sampleCount() == 4);
|
||||||
|
Set<Sample> ceuSamples = s.getSamplesWithProperty("population", "CEU");
|
||||||
|
Assert.assertTrue(ceuSamples.size() == 2);
|
||||||
|
|
||||||
|
Iterator<Sample> i = ceuSamples.iterator();
|
||||||
|
ArrayList<String> sampleNames = new ArrayList<String>();
|
||||||
|
sampleNames.add(i.next().getId());
|
||||||
|
sampleNames.add(i.next().getId());
|
||||||
|
Assert.assertTrue(sampleNames.contains("sampleA"));
|
||||||
|
Assert.assertTrue(sampleNames.contains("sampleB"));
|
||||||
|
}
|
||||||
|
|
||||||
// make sure we can import data types other than Strings
|
// make sure we can import data types other than Strings
|
||||||
@Test()
|
@Test()
|
||||||
|
|
@ -150,6 +164,70 @@ public class SampleDataSourceUnitTest extends BaseTest {
|
||||||
Assert.assertTrue(sample.getProperty("c").getClass() == Double.class);
|
Assert.assertTrue(sample.getProperty("c").getClass() == Double.class);
|
||||||
Assert.assertTrue(sample.getProperty("b").getClass() == String.class);
|
Assert.assertTrue(sample.getProperty("b").getClass() == String.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* check that getSamplesFromVariantContext works
|
||||||
|
* create a variant context with two sample names, and make sure the right samples are there
|
||||||
|
*/
|
||||||
|
@Test()
|
||||||
|
public void variantContextTest() {
|
||||||
|
SampleDataSource s = new SampleDataSource(header, null);
|
||||||
|
List<Allele> alleleCollection = new ArrayList<Allele>();
|
||||||
|
Allele a1 = Allele.create("A", true);
|
||||||
|
alleleCollection.add(a1);
|
||||||
|
|
||||||
|
Set<Genotype> genotypeCollection = new HashSet<Genotype>();
|
||||||
|
genotypeCollection.add(new Genotype("NA123", alleleCollection));
|
||||||
|
genotypeCollection.add(new Genotype("NA456", alleleCollection));
|
||||||
|
|
||||||
|
VariantContext v = new VariantContext("contextName", "chr1", 1, 1, alleleCollection, genotypeCollection);
|
||||||
|
|
||||||
|
// make sure the set that's returned is the right size
|
||||||
|
HashSet<Sample> set = (HashSet) s.getSamplesByVariantContext(v);
|
||||||
|
Assert.assertTrue(set.size() == 2);
|
||||||
|
|
||||||
|
// make sure both samples are included
|
||||||
|
Iterator<Sample> i = set.iterator();
|
||||||
|
ArrayList<String> sampleNames = new ArrayList<String>();
|
||||||
|
sampleNames.add(i.next().getId());
|
||||||
|
sampleNames.add(i.next().getId());
|
||||||
|
Assert.assertTrue(sampleNames.contains("NA123"));
|
||||||
|
Assert.assertTrue(sampleNames.contains("NA456"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* checking subContextFromSampleProperty
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* check that subContextFromSampleProperty works
|
||||||
|
* create a variant context with four sample names, make sure that it filters correctly to 2
|
||||||
|
*/
|
||||||
|
@Test()
|
||||||
|
public void subContextFromSamplePropertyTest() {
|
||||||
|
|
||||||
|
File sampleFile = new File(sampleFilesDir + "sampleFileWithProperties.yaml");
|
||||||
|
SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile));
|
||||||
|
Assert.assertTrue(s.sampleCount() == 4);
|
||||||
|
|
||||||
|
List<Allele> alleleCollection = new ArrayList<Allele>();
|
||||||
|
Allele a1 = Allele.create("A", true);
|
||||||
|
alleleCollection.add(a1);
|
||||||
|
|
||||||
|
Set<Genotype> genotypeCollection = new HashSet<Genotype>();
|
||||||
|
genotypeCollection.add(new Genotype("NA123", alleleCollection));
|
||||||
|
genotypeCollection.add(new Genotype("sampleA", alleleCollection));
|
||||||
|
genotypeCollection.add(new Genotype("sampleB", alleleCollection));
|
||||||
|
genotypeCollection.add(new Genotype("sampleC", alleleCollection));
|
||||||
|
|
||||||
|
VariantContext v = new VariantContext("contextName", "chr1", 1, 1, alleleCollection, genotypeCollection);
|
||||||
|
VariantContext subContext = s.subContextFromSampleProperty(v, "population", "CEU");
|
||||||
|
|
||||||
|
Assert.assertTrue(subContext.getSampleNames().contains("sampleA"));
|
||||||
|
Assert.assertTrue(subContext.getSampleNames().contains("sampleA"));
|
||||||
|
Assert.assertTrue(subContext.getSampleNames().size() == 2);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// we create lots of single item lists...
|
// we create lots of single item lists...
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue