From 3b1ab86d11aefcd969e5e012155839a694e63842 Mon Sep 17 00:00:00 2001 From: depristo Date: Sat, 6 Feb 2010 16:26:06 +0000 Subject: [PATCH] Added generic interfaces to RefMetaDataTracker to obtain VariantContext objects. More docs. Integration tests for VariantContexts using dbSNP and VCF. At this stage if you use dbSNP or VCF files only in your walkers, please move them over to the VariantContext, it's just nicer. If you've got RODs that implemented the old variation/genotype interfaces, and you want them to work in new walkers, please add an adaptor to VariantContextAdaptors in refdata package. It should be easy and will reduce burden in the long term when those interfaces are retired. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2803 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/contexts/variantcontext/Allele.java | 5 +- .../contexts/variantcontext/Genotype.java | 6 +- .../gatk/refdata/RefMetaDataTracker.java | 120 ++++++++++++++---- .../gatk/refdata/VariantContextAdaptors.java | 17 ++- .../walkers/TestVariantContextWalker.java | 2 +- .../VariantContextIntegrationTest.java | 53 ++++++++ 6 files changed, 170 insertions(+), 33 deletions(-) create mode 100755 java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java index 0d48a4105..60e84ae6d 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Allele.java @@ -280,7 +280,10 @@ public class Allele implements Comparable { } } - return null; // couldn't find anything + if ( wouldBeNoCallAllele(alleleBases) ) + return NO_CALL; + else + return null; // couldn't find anything } public static List resolveAlleles(List possibleAlleles, List alleleStrings) { diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Genotype.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Genotype.java index ea1200bfe..b9bc95298 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Genotype.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Genotype.java @@ -114,7 +114,11 @@ public class Genotype { if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0 in setAlleles"); int nNoCalls = 0; - for ( Allele allele : alleles ) { nNoCalls += allele.isNoCall() ? 1 : 0; } + for ( Allele allele : alleles ) { + if ( allele == null ) + throw new IllegalArgumentException("BUG: allele cannot be null in Genotype"); + nNoCalls += allele.isNoCall() ? 1 : 0; + } if ( nNoCalls > 0 && nNoCalls != alleles.size() ) throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this); } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 570ecf79f..d67f8b23b 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.refdata; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; import java.util.*; @@ -13,7 +14,7 @@ import java.util.*; * The standard interaction model is: * * Traversal system arrives at a site, which has a bunch of rods covering it - * Traversal calls tracker.bind(name, rod) for each rod in rods +Genotype * Traversal calls tracker.bind(name, rod) for each rod in rods * Traversal passes tracker to the walker * walker calls lookup(name, default) to obtain the rod values at this site, or default if none was * bound at this site. @@ -197,37 +198,106 @@ public class RefMetaDataTracker { } - public Collection getAllVariantContexts(GenomeLoc curLocation) { - return getAllVariantContexts(curLocation, null, false, false); + /** + * Converts all possible ROD tracks to VariantContexts objects, of all types, allowing any start and any number + * of entries per ROD. + */ + public Collection getAllVariantContexts() { + return getAllVariantContexts(null, null, false, false); } - public Collection getAllVariantContexts(GenomeLoc curLocation, EnumSet allowedTypes, boolean requireStartHere, boolean takeFirstOnly ) { + + /** + * Converts all possible ROD tracks to VariantContexts objects. If allowedTypes != null, then only + * VariantContexts in the allow set of types will be returned. If requireStartsHere is true, then curLocation + * must not be null, and only records whose start position is == to curLocation.getStart() will be returned. + * If takeFirstOnly is true, then only a single VariantContext will be converted from any individual ROD. Of course, + * this single object must pass the allowed types and start here options if provided. Note that the result + * may return multiple VariantContexts with the same name if that particular track contained multiple RODs spanning + * the current location. + * + * The name of each VariantContext corresponds to the ROD name. + * + * @param curLocation + * @param allowedTypes + * @param requireStartHere + * @param takeFirstOnly + * @return + */ + public Collection getAllVariantContexts(EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) { List contexts = new ArrayList(); for ( RODRecordList rodList : getBoundRodTracks() ) { - for ( ReferenceOrderedDatum rec : rodList.getRecords() ) { - if ( VariantContextAdaptors.canBeConvertedToVariantContext(rec) ) { - // ok, we might actually be able to turn this record in a variant context - VariantContext vc = VariantContextAdaptors.toVariantContext(rodList.getName(), rec); - - // now, let's decide if we want to keep it - boolean goodType = allowedTypes == null || allowedTypes.contains(vc.getType()); - boolean goodPos = ! requireStartHere || rec.getLocation().getStart() == curLocation.getStart(); - - if ( goodType && goodPos ) { // ok, we are going to keep this thing - contexts.add(vc); - - if ( takeFirstOnly ) - // we only want the first passing instance, so break the loop over records in rodList - break; - } - } - } + addVariantContexts(contexts, rodList, allowedTypes, curLocation, requireStartHere, takeFirstOnly); } return contexts; } + /** + * Gets the variant contexts associated with track name name + * + * see getVariantContexts for more information. + * + * @param name + * @param curLocation + * @param allowedTypes + * @param requireStartHere + * @param takeFirstOnly + * @return + */ + public Collection getVariantContexts(String name, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) { + RODRecordList rodList = getTrackData(name, null); + Collection contexts = new ArrayList(); + + if ( rodList != null ) + addVariantContexts(contexts, rodList, allowedTypes, curLocation, requireStartHere, takeFirstOnly ); + + return contexts; + } + + /** + * Gets the variant context associated with name, and assumes the system only has a single bound track at this location. Throws an exception if not. + * see getVariantContexts for more information. + * + * @param name + * @param curLocation + * @param allowedTypes + * @param requireStartHere + * @return + */ + public VariantContext getVariantContext(String name, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere ) { + Collection contexts = getVariantContexts(name, allowedTypes, curLocation, requireStartHere, false ); + + if ( contexts.size() > 1 ) + throw new StingException("Requested a single VariantContext object for track " + name + " but multiple variants were present at position " + curLocation); + + return contexts.iterator().next(); + } + + private void addVariantContexts(Collection contexts, RODRecordList rodList, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) { + for ( ReferenceOrderedDatum rec : rodList.getRecords() ) { + if ( VariantContextAdaptors.canBeConvertedToVariantContext(rec) ) { + // ok, we might actually be able to turn this record in a variant context + VariantContext vc = VariantContextAdaptors.toVariantContext(rodList.getName(), rec); + + if ( vc == null ) // sometimes the track has odd stuff in it that can't be converted + continue; + + // now, let's decide if we want to keep it + boolean goodType = allowedTypes == null || allowedTypes.contains(vc.getType()); + boolean goodPos = ! requireStartHere || rec.getLocation().getStart() == curLocation.getStart(); + + if ( goodType && goodPos ) { // ok, we are going to keep this thing + contexts.add(vc); + + if ( takeFirstOnly ) + // we only want the first passing instance, so break the loop over records in rodList + break; + } + } + } + } /** @@ -241,10 +311,4 @@ public class RefMetaDataTracker { //logger.debug(String.format("Binding %s to %s", name, rod)); map.put(canonicalName(name), rod); } -/* - public void bind(final String name, ReferenceOrderedDatum rod) { - //logger.debug(String.format("Binding %s to %s", name, rod)); - map.put(canonicalName(name), rod); - } - */ } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index 381eff78a..ede1f4740 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.refdata; import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord; import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeEncoding; import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord; +import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; @@ -14,6 +15,13 @@ import java.util.*; * you need to create a adaptor object here and register a converter from your class to this object. When tribble arrives, * we'll use a better approach. * + * To add a new converter: + * + * create a subclass of VCAdaptor, overloading the convert operator + * add it to the static map from input type -> converter where the input type is the object.class you want to convert + * + * That's it + * * @author depristo@broadinstitute.org */ public class VariantContextAdaptors { @@ -141,8 +149,13 @@ public class VariantContextAdaptors { Map genotypes = new HashMap(); for ( VCFGenotypeRecord vcfG : vcf.getVCFGenotypeRecords() ) { List genotypeAlleles = new ArrayList(); - for ( VCFGenotypeEncoding s : vcfG.getAlleles() ) - genotypeAlleles.add(Allele.getMatchingAllele(alleles, s.getBases())); + for ( VCFGenotypeEncoding s : vcfG.getAlleles() ) { + Allele a = Allele.getMatchingAllele(alleles, s.getBases()); + if ( a == null ) + throw new StingException("Invalid VCF genotype allele " + s + " in VCF " + vcf); + + genotypeAlleles.add(a); + } double pError = vcfG.getNegLog10PError() == VCFGenotypeRecord.MISSING_GENOTYPE_QUALITY ? Genotype.NO_NEG_LOG_10PERROR : vcfG.getNegLog10PError(); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestVariantContextWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestVariantContextWalker.java index 2d2d87fdb..665367ee4 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestVariantContextWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestVariantContextWalker.java @@ -33,7 +33,7 @@ public class TestVariantContextWalker extends RodWalker { EnumSet allowedTypes = onlyOfThisType == null ? null : EnumSet.of(onlyOfThisType); int n = 0; - for (VariantContext vc : tracker.getAllVariantContexts(context.getLocation(), allowedTypes, onlyContextsStartinAtCurrentPosition, takeFirstOnly) ) { + for (VariantContext vc : tracker.getAllVariantContexts(allowedTypes, context.getLocation(), onlyContextsStartinAtCurrentPosition, takeFirstOnly) ) { n++; if ( printContexts ) out.printf(" %s%n", vc); } diff --git a/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java new file mode 100755 index 000000000..5437494b8 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java @@ -0,0 +1,53 @@ +package org.broadinstitute.sting.gatk.contexts.variantcontext; + +import org.broadinstitute.sting.WalkerTest; +import org.junit.Test; + +import java.util.HashMap; +import java.util.Map; +import java.util.Arrays; +import java.util.List; +import java.io.File; + +public class VariantContextIntegrationTest extends WalkerTest { + private static String root = "-T TestVariantContext" + + " -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" + + " -B vcf,VCF,/humgen/gsa-hpprojects/GATK/data/Validation_Data/yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + + " -R " + oneKGLocation + "reference/human_b36_both.fasta"; + + static HashMap expectations = new HashMap(); + static { + expectations.put("-L 1:1-10000 --printPerLocus", "39c035ae756eb176a7baffcd0f0fe0af"); + expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly", "33ab797ac3de900e75fc0d9b01efe482"); + expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsStartinAtCurrentPosition", "38619d704068072a4ccfd354652957a2"); + expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly --onlyContextsStartinAtCurrentPosition", "bf64ab634126382813a6a6b29a5f47d8"); + expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType SNP", "be087f53429974b4e505cd59f9363bfe"); + expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL", "d758adbab9011e42c77d502fe4d62c27"); + expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL --onlyContextsStartinAtCurrentPosition", "933ec8327192c5ed58a1952c73fb4f73"); + expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType MIXED", "7d5d0283d92220ee78db7465d675b37f"); + expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType NO_VARIATION", "39335acdb34c8a2af433dc50d619bcbc"); + } + + @Test + public void testConversionSelection() { + + for ( Map.Entry entry : expectations.entrySet() ) { + String extraArgs = entry.getKey(); + String md5 = entry.getValue(); + + WalkerTestSpec spec = new WalkerTestSpec( root + " " + extraArgs + " -o %s", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testDbSNPAndVCFConversions", spec); + } + } + + @Test + public void testLargeScaleConversion() { + // this really just tests that we are seeing the same number of objects over all of chr1 + WalkerTestSpec spec = new WalkerTestSpec( root + " -L 1" + " -o %s", + 1, // just one output file + Arrays.asList("3fcdd982df080e6abc0afaba6abdf386")); + executeTest("testLargeScaleConversion", spec); + } +} \ No newline at end of file