From 96cccafb0d9c7bf01d0eecfe358e6da050004cde Mon Sep 17 00:00:00 2001 From: bthomas Date: Tue, 28 Sep 2010 02:16:25 +0000 Subject: [PATCH] Adding a few helper methods for accessing sample metadata, and associated unit tests. These are motivated by discussion with Ryan about how he'll use sample metadata in VariantEvalWalker - hopefully will make it easier for him. Methods are: -- getToolkit().subContextFromSampleProperty(): filters a VariantContext to genotypes that come from samples that have a given property value -- getToolkit().getSamplesWithProperty(): gets all samples with a given property -- getToolkit().getSamplesByVariantContext(): sample objects that are referenced by name in a VariantContext git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4361 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/AbstractGenomeAnalysisEngine.java | 58 ++++++++++ .../sting/gatk/GenomeAnalysisEngine.java | 2 + .../sting/gatk/datasources/sample/Sample.java | 13 ++- .../datasources/sample/SampleDataSource.java | 95 +++++++++++++++-- .../sting/utils/vcf/VCFUtils.java | 2 + .../sample/SampleDataSourceUnitTest.java | 100 ++++++++++++++++-- 6 files changed, 251 insertions(+), 19 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java index 4a39678e8..bc9903380 100755 --- a/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java @@ -29,6 +29,7 @@ import net.sf.picard.filter.SamRecordFilter; import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.samtools.*; import org.apache.log4j.Logger; +import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.commandline.ArgumentSource; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import 
org.broadinstitute.sting.gatk.arguments.ValidationExclusion; @@ -799,4 +800,61 @@ public abstract class AbstractGenomeAnalysisEngine { return sampleDataSource.getSamples(sampleNameList); } + + /** + * Returns a set of samples that have any value (which could be null) for a given property + * @param key Property key + * @return Set of samples with the property + */ + public Set<Sample> getSamplesWithProperty(String key) { + return sampleDataSource.getSamplesWithProperty(key); + } + + /** + * Returns a set of samples that have a property with a certain value + * Value must be a string for now - could add a similar method for matching any objects in the future + * + * @param key Property key + * @param value String property value + * @return Set of samples that match key and value + */ + public Set<Sample> getSamplesWithProperty(String key, String value) { + return sampleDataSource.getSamplesWithProperty(key, value); + + } + + /** + * Returns a set of sample objects for the sample names in a variant context; names the data source does not know yet get a new Sample via getOrCreateSample + * + * @param context Any variant context + * @return a set of the sample objects + */ + public Set<Sample> getSamplesByVariantContext(VariantContext context) { + Set<Sample> samples = new HashSet<Sample>(); + for (String sampleName : context.getSampleNames()) { + samples.add(sampleDataSource.getOrCreateSample(sampleName)); + } + return samples; + } + + /** + * Returns all samples that were referenced in the SAM file + */ + public Set<Sample> getSAMFileSamples() { + return sampleDataSource.getSAMFileSamples(); + } + + /** + * Return a subcontext restricted to samples with a given property key/value + * Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering + * @param context VariantContext to filter + * @param key property key + * @param value property value (must be string) + * @return subcontext + */ + public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) { + return 
sampleDataSource.subContextFromSampleProperty(context, key, value); + } + + } diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 0710fd9d8..2a1de19c1 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -31,6 +31,7 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.commandline.ArgumentException; import org.broadinstitute.sting.commandline.ArgumentSource; +import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShardStrategy; @@ -368,4 +369,5 @@ public class GenomeAnalysisEngine extends AbstractGenomeAnalysisEngine { outputTracker.prepareWalker(walker, getArguments().strictnessLevel); } + } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java index c509df317..09f1912c6 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java @@ -31,9 +31,9 @@ public class Sample implements java.io.Serializable { } public Sample(String id) { - if (id == null) { +/* if (id == null) { throw new StingException("Error creating sample: sample ID cannot be null"); - } + }*/ this.id = id; } @@ -166,6 +166,15 @@ public class Sample implements java.io.Serializable { return properties.get("gender") == Gender.MALE; } + /** + * Tests whether a property with the given key is present on this sample + * @param key property key + * @return true if sample has this property (even if its value is null) + */ + public boolean hasProperty(String key) { + return properties.containsKey(key); + } + @Override 
public boolean equals(Object o) { if (this == o) return true; diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java index f9f5ad4f1..2b7efcba4 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java @@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.datasources.sample; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.exceptions.StingException; @@ -14,12 +16,7 @@ import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.ArrayList; -import java.util.Collection; +import java.util.*; /** * Created by IntelliJ IDEA. 
@@ -488,5 +485,91 @@ public class SampleDataSource { return samples; } + /** + * Returns a set of samples that have any value (which could be null) for a given property + * @param key Property key + * @return Set of samples with the property + */ + public Set<Sample> getSamplesWithProperty(String key) { + Set<Sample> toReturn = new HashSet<Sample>(); + for (Sample s : samples.values()) { + if (s.hasProperty(key)) + toReturn.add(s); + } + return toReturn; + } + + /** + * Returns a set of samples that have a property with a certain value + * Value must be a string for now - could add a similar method for matching any objects in the future + * + * @param key Property key + * @param value String property value; a null value matches nothing + * @return Set of samples that match key and value (empty if none do; a property whose stored value is null never matches) + */ + public Set<Sample> getSamplesWithProperty(String key, String value) { + Set<Sample> toReturn = new HashSet<Sample>(); + for (Sample s : getSamplesWithProperty(key)) { + if (value != null && value.equals(s.getProperty(key))) + toReturn.add(s); + } + return toReturn; + } + + public Sample getOrCreateSample(String id) { + Sample sample = getSampleById(id); + if (sample == null) { + sample = new Sample(id); + addSample(sample); + } + return sample; + } + + /** + * Returns all samples that were referenced in the SAM file + */ + public Set<Sample> getSAMFileSamples() { + Set<Sample> toReturn = new HashSet<Sample>(); + for (Sample sample : samples.values()) { + if (sample.hasSAMFileEntry()) + toReturn.add(sample); + } + return toReturn; + } + + /** + * Returns a set of sample objects for the sample names in a variant context; unknown names are registered through getOrCreateSample + * + * @param context Any variant context + * @return a set of the sample objects + */ + public Set<Sample> getSamplesByVariantContext(VariantContext context) { + Set<Sample> contextSamples = new HashSet<Sample>(); + for (String sampleName : context.getSampleNames()) { + contextSamples.add(getOrCreateSample(sampleName)); + } + return contextSamples; + } + + + /** + * Return a subcontext restricted to samples with a given property key/value + * Gets the sample names from key/value and relies on 
VariantContext.subContextFromGenotypes for the filtering + * @param context VariantContext to filter + * @param key property key + * @param value property value (must be string); a null value matches nothing + * @return subcontext + */ + public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) { + + Set<String> samplesWithProperty = new HashSet<String>(); + for (String sampleName : context.getSampleNames()) { + Sample s = samples.get(sampleName); + if (s != null && s.hasProperty(key) && value != null && value.equals(s.getProperty(key))) + samplesWithProperty.add(sampleName); + } + Map<String, Genotype> genotypes = context.getGenotypes(samplesWithProperty); + return context.subContextFromGenotypes(genotypes.values()); + } } diff --git a/java/src/org/broadinstitute/sting/utils/vcf/VCFUtils.java b/java/src/org/broadinstitute/sting/utils/vcf/VCFUtils.java index 2d27a9b1b..4689c462d 100755 --- a/java/src/org/broadinstitute/sting/utils/vcf/VCFUtils.java +++ b/java/src/org/broadinstitute/sting/utils/vcf/VCFUtils.java @@ -25,9 +25,11 @@ package org.broadinstitute.sting.utils.vcf; +import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.apache.log4j.Logger; diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java index 8b4046e5c..befbce602 100644 --- a/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java @@ -1,15 +1,16 @@ package 
org.broadinstitute.sting.gatk.datasources.sample; import net.sf.samtools.SAMFileHeader; +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.exceptions.StingException; import org.junit.Assert; import org.junit.Test; import java.io.File; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.Set; +import java.util.*; /** * Created by IntelliJ IDEA. @@ -46,13 +47,6 @@ public class SampleDataSourceUnitTest extends BaseTest { Assert.assertTrue(family.size() == 2); Assert.assertTrue(family.contains(sampleA)); Assert.assertTrue(family.contains(sampleB)); - - // make sure getSamples(List names) works - ArrayList names = new ArrayList(); - names.add("sampleA"); - names.add("sampleB"); - Set testList = s.getSamples(names); - Assert.assertTrue(testList.size() == 2); } // but that file should fail if it has an extra character in it... 
@@ -137,7 +131,27 @@ public class SampleDataSourceUnitTest extends BaseTest { Assert.assertTrue(s.sampleCount() == 5); Assert.assertTrue(s.getSampleById("sampleE").getProperty("propC").equals("valC")); Assert.assertTrue(s.getSampleById("sampleA").getProperty("propA").equals("valA")); - } + } + + /** + * testing getSamplesWithProperty + * in this file there are 4 samples - 2 with population "CEU", 1 with population "ABC", 1 with no population + */ + @Test() + public void getSamplesWithPropertyTest() { + File sampleFile = new File(sampleFilesDir + "sampleFileWithProperties.yaml"); + SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); + Assert.assertTrue(s.sampleCount() == 4); + Set<Sample> ceuSamples = s.getSamplesWithProperty("population", "CEU"); + Assert.assertTrue(ceuSamples.size() == 2); + + Iterator<Sample> i = ceuSamples.iterator(); + List<String> sampleNames = new ArrayList<String>(); + sampleNames.add(i.next().getId()); + sampleNames.add(i.next().getId()); + Assert.assertTrue(sampleNames.contains("sampleA")); + Assert.assertTrue(sampleNames.contains("sampleB")); + } // make sure we can import data types other than Strings @Test() @@ -150,6 +164,70 @@ public class SampleDataSourceUnitTest extends BaseTest { Assert.assertTrue(sample.getProperty("c").getClass() == Double.class); Assert.assertTrue(sample.getProperty("b").getClass() == String.class); } + + /** + * check that getSamplesByVariantContext works + * create a variant context with two sample names, and make sure the right samples are there + */ + @Test() + public void variantContextTest() { + SampleDataSource s = new SampleDataSource(header, null); + List<Allele> alleleCollection = new ArrayList<Allele>(); + Allele a1 = Allele.create("A", true); + alleleCollection.add(a1); + + Set<Genotype> genotypeCollection = new HashSet<Genotype>(); + genotypeCollection.add(new Genotype("NA123", alleleCollection)); + genotypeCollection.add(new Genotype("NA456", alleleCollection)); + + VariantContext v = new VariantContext("contextName", "chr1", 1, 1, 
alleleCollection, genotypeCollection); + + // make sure the set that's returned is the right size + Set<Sample> set = s.getSamplesByVariantContext(v); + Assert.assertTrue(set.size() == 2); + + // make sure both samples are included + Iterator<Sample> i = set.iterator(); + List<String> sampleNames = new ArrayList<String>(); + sampleNames.add(i.next().getId()); + sampleNames.add(i.next().getId()); + Assert.assertTrue(sampleNames.contains("NA123")); + Assert.assertTrue(sampleNames.contains("NA456")); + } + + /** + * checking subContextFromSampleProperty + */ + + /** + * check that subContextFromSampleProperty works + * create a variant context with four sample names, make sure that it filters correctly to 2 + */ + @Test() + public void subContextFromSamplePropertyTest() { + + File sampleFile = new File(sampleFilesDir + "sampleFileWithProperties.yaml"); + SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); + Assert.assertTrue(s.sampleCount() == 4); + + List<Allele> alleleCollection = new ArrayList<Allele>(); + Allele a1 = Allele.create("A", true); + alleleCollection.add(a1); + + Set<Genotype> genotypeCollection = new HashSet<Genotype>(); + genotypeCollection.add(new Genotype("NA123", alleleCollection)); + genotypeCollection.add(new Genotype("sampleA", alleleCollection)); + genotypeCollection.add(new Genotype("sampleB", alleleCollection)); + genotypeCollection.add(new Genotype("sampleC", alleleCollection)); + + VariantContext v = new VariantContext("contextName", "chr1", 1, 1, alleleCollection, genotypeCollection); + VariantContext subContext = s.subContextFromSampleProperty(v, "population", "CEU"); + + Assert.assertTrue(subContext.getSampleNames().contains("sampleA")); + Assert.assertTrue(subContext.getSampleNames().contains("sampleB")); + Assert.assertTrue(subContext.getSampleNames().size() == 2); + + } // we create lots of single item lists...