Added generic interfaces to RefMetaDataTracker to obtain VariantContext objects. More docs. Integration tests for VariantContexts using dbSNP and VCF. At this stage if you use dbSNP or VCF files only in your walkers, please move them over to the VariantContext, it's just nicer. If you've got RODs that implemented the old variation/genotype interfaces, and you want them to work in new walkers, please add an adaptor to VariantContextAdaptors in refdata package. It should be easy and will reduce burden in the long term when those interfaces are retired.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2803 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-02-06 16:26:06 +00:00
parent 995d55da81
commit 3b1ab86d11
6 changed files with 170 additions and 33 deletions

View File

@ -280,7 +280,10 @@ public class Allele implements Comparable<Allele> {
}
}
return null; // couldn't find anything
if ( wouldBeNoCallAllele(alleleBases) )
return NO_CALL;
else
return null; // couldn't find anything
}
public static List<Allele> resolveAlleles(List<Allele> possibleAlleles, List<String> alleleStrings) {

View File

@ -114,7 +114,11 @@ public class Genotype {
if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0 in setAlleles");
int nNoCalls = 0;
for ( Allele allele : alleles ) { nNoCalls += allele.isNoCall() ? 1 : 0; }
for ( Allele allele : alleles ) {
if ( allele == null )
throw new IllegalArgumentException("BUG: allele cannot be null in Genotype");
nNoCalls += allele.isNoCall() ? 1 : 0;
}
if ( nNoCalls > 0 && nNoCalls != alleles.size() )
throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this);
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.refdata;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import java.util.*;
@ -13,7 +14,7 @@ import java.util.*;
* The standard interaction model is:
*
* Traversal system arrives at a site, which has a bunch of rods covering it
* Traversal calls tracker.bind(name, rod) for each rod in rods
Genotype * Traversal calls tracker.bind(name, rod) for each rod in rods
* Traversal passes tracker to the walker
* walker calls lookup(name, default) to obtain the rod values at this site, or default if none was
* bound at this site.
@ -197,37 +198,106 @@ public class RefMetaDataTracker {
}
public Collection<VariantContext> getAllVariantContexts(GenomeLoc curLocation) {
return getAllVariantContexts(curLocation, null, false, false);
/**
* Converts all possible ROD tracks to VariantContexts objects, of all types, allowing any start and any number
* of entries per ROD.
*/
public Collection<VariantContext> getAllVariantContexts() {
return getAllVariantContexts(null, null, false, false);
}
public Collection<VariantContext> getAllVariantContexts(GenomeLoc curLocation, EnumSet<VariantContext.Type> allowedTypes, boolean requireStartHere, boolean takeFirstOnly ) {
/**
* Converts all possible ROD tracks to VariantContexts objects. If allowedTypes != null, then only
* VariantContexts in the allow set of types will be returned. If requireStartsHere is true, then curLocation
* must not be null, and only records whose start position is == to curLocation.getStart() will be returned.
* If takeFirstOnly is true, then only a single VariantContext will be converted from any individual ROD. Of course,
* this single object must pass the allowed types and start here options if provided. Note that the result
* may return multiple VariantContexts with the same name if that particular track contained multiple RODs spanning
* the current location.
*
* The name of each VariantContext corresponds to the ROD name.
*
* @param curLocation
* @param allowedTypes
* @param requireStartHere
* @param takeFirstOnly
* @return
*/
public Collection<VariantContext> getAllVariantContexts(EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
List<VariantContext> contexts = new ArrayList<VariantContext>();
for ( RODRecordList<ReferenceOrderedDatum> rodList : getBoundRodTracks() ) {
for ( ReferenceOrderedDatum rec : rodList.getRecords() ) {
if ( VariantContextAdaptors.canBeConvertedToVariantContext(rec) ) {
// ok, we might actually be able to turn this record in a variant context
VariantContext vc = VariantContextAdaptors.toVariantContext(rodList.getName(), rec);
// now, let's decide if we want to keep it
boolean goodType = allowedTypes == null || allowedTypes.contains(vc.getType());
boolean goodPos = ! requireStartHere || rec.getLocation().getStart() == curLocation.getStart();
if ( goodType && goodPos ) { // ok, we are going to keep this thing
contexts.add(vc);
if ( takeFirstOnly )
// we only want the first passing instance, so break the loop over records in rodList
break;
}
}
}
addVariantContexts(contexts, rodList, allowedTypes, curLocation, requireStartHere, takeFirstOnly);
}
return contexts;
}
/**
* Gets the variant contexts associated with track name name
*
* see getVariantContexts for more information.
*
* @param name
* @param curLocation
* @param allowedTypes
* @param requireStartHere
* @param takeFirstOnly
* @return
*/
public Collection<VariantContext> getVariantContexts(String name, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
RODRecordList<ReferenceOrderedDatum> rodList = getTrackData(name, null);
Collection<VariantContext> contexts = new ArrayList<VariantContext>();
if ( rodList != null )
addVariantContexts(contexts, rodList, allowedTypes, curLocation, requireStartHere, takeFirstOnly );
return contexts;
}
/**
* Gets the variant context associated with name, and assumes the system only has a single bound track at this location. Throws an exception if not.
* see getVariantContexts for more information.
*
* @param name
* @param curLocation
* @param allowedTypes
* @param requireStartHere
* @return
*/
public VariantContext getVariantContext(String name, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere ) {
Collection<VariantContext> contexts = getVariantContexts(name, allowedTypes, curLocation, requireStartHere, false );
if ( contexts.size() > 1 )
throw new StingException("Requested a single VariantContext object for track " + name + " but multiple variants were present at position " + curLocation);
return contexts.iterator().next();
}
private void addVariantContexts(Collection<VariantContext> contexts, RODRecordList<ReferenceOrderedDatum> rodList, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
for ( ReferenceOrderedDatum rec : rodList.getRecords() ) {
if ( VariantContextAdaptors.canBeConvertedToVariantContext(rec) ) {
// ok, we might actually be able to turn this record in a variant context
VariantContext vc = VariantContextAdaptors.toVariantContext(rodList.getName(), rec);
if ( vc == null ) // sometimes the track has odd stuff in it that can't be converted
continue;
// now, let's decide if we want to keep it
boolean goodType = allowedTypes == null || allowedTypes.contains(vc.getType());
boolean goodPos = ! requireStartHere || rec.getLocation().getStart() == curLocation.getStart();
if ( goodType && goodPos ) { // ok, we are going to keep this thing
contexts.add(vc);
if ( takeFirstOnly )
// we only want the first passing instance, so break the loop over records in rodList
break;
}
}
}
}
/**
@ -241,10 +311,4 @@ public class RefMetaDataTracker {
//logger.debug(String.format("Binding %s to %s", name, rod));
map.put(canonicalName(name), rod);
}
/*
public void bind(final String name, ReferenceOrderedDatum rod) {
//logger.debug(String.format("Binding %s to %s", name, rod));
map.put(canonicalName(name), rod);
}
*/
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeEncoding;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
@ -14,6 +15,13 @@ import java.util.*;
* you need to create a adaptor object here and register a converter from your class to this object. When tribble arrives,
* we'll use a better approach.
*
* To add a new converter:
*
* create a subclass of VCAdaptor, overloading the convert operator
* add it to the static map from input type -> converter where the input type is the object.class you want to convert
*
* That's it
*
* @author depristo@broadinstitute.org
*/
public class VariantContextAdaptors {
@ -141,8 +149,13 @@ public class VariantContextAdaptors {
Map<String, Genotype> genotypes = new HashMap<String, Genotype>();
for ( VCFGenotypeRecord vcfG : vcf.getVCFGenotypeRecords() ) {
List<Allele> genotypeAlleles = new ArrayList<Allele>();
for ( VCFGenotypeEncoding s : vcfG.getAlleles() )
genotypeAlleles.add(Allele.getMatchingAllele(alleles, s.getBases()));
for ( VCFGenotypeEncoding s : vcfG.getAlleles() ) {
Allele a = Allele.getMatchingAllele(alleles, s.getBases());
if ( a == null )
throw new StingException("Invalid VCF genotype allele " + s + " in VCF " + vcf);
genotypeAlleles.add(a);
}
double pError = vcfG.getNegLog10PError() == VCFGenotypeRecord.MISSING_GENOTYPE_QUALITY ? Genotype.NO_NEG_LOG_10PERROR : vcfG.getNegLog10PError();

View File

@ -33,7 +33,7 @@ public class TestVariantContextWalker extends RodWalker<Integer, Integer> {
EnumSet<VariantContext.Type> allowedTypes = onlyOfThisType == null ? null : EnumSet.of(onlyOfThisType);
int n = 0;
for (VariantContext vc : tracker.getAllVariantContexts(context.getLocation(), allowedTypes, onlyContextsStartinAtCurrentPosition, takeFirstOnly) ) {
for (VariantContext vc : tracker.getAllVariantContexts(allowedTypes, context.getLocation(), onlyContextsStartinAtCurrentPosition, takeFirstOnly) ) {
n++;
if ( printContexts ) out.printf(" %s%n", vc);
}

View File

@ -0,0 +1,53 @@
package org.broadinstitute.sting.gatk.contexts.variantcontext;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.util.HashMap;
import java.util.Map;
import java.util.Arrays;
import java.util.List;
import java.io.File;
public class VariantContextIntegrationTest extends WalkerTest {
private static String root = "-T TestVariantContext" +
" -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
" -B vcf,VCF,/humgen/gsa-hpprojects/GATK/data/Validation_Data/yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" +
" -R " + oneKGLocation + "reference/human_b36_both.fasta";
static HashMap<String, String> expectations = new HashMap<String, String>();
static {
expectations.put("-L 1:1-10000 --printPerLocus", "39c035ae756eb176a7baffcd0f0fe0af");
expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly", "33ab797ac3de900e75fc0d9b01efe482");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsStartinAtCurrentPosition", "38619d704068072a4ccfd354652957a2");
expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly --onlyContextsStartinAtCurrentPosition", "bf64ab634126382813a6a6b29a5f47d8");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType SNP", "be087f53429974b4e505cd59f9363bfe");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL", "d758adbab9011e42c77d502fe4d62c27");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL --onlyContextsStartinAtCurrentPosition", "933ec8327192c5ed58a1952c73fb4f73");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType MIXED", "7d5d0283d92220ee78db7465d675b37f");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType NO_VARIATION", "39335acdb34c8a2af433dc50d619bcbc");
}
@Test
public void testConversionSelection() {
for ( Map.Entry<String, String> entry : expectations.entrySet() ) {
String extraArgs = entry.getKey();
String md5 = entry.getValue();
WalkerTestSpec spec = new WalkerTestSpec( root + " " + extraArgs + " -o %s",
1, // just one output file
Arrays.asList(md5));
executeTest("testDbSNPAndVCFConversions", spec);
}
}
@Test
public void testLargeScaleConversion() {
// this really just tests that we are seeing the same number of objects over all of chr1
WalkerTestSpec spec = new WalkerTestSpec( root + " -L 1" + " -o %s",
1, // just one output file
Arrays.asList("3fcdd982df080e6abc0afaba6abdf386"));
executeTest("testLargeScaleConversion", spec);
}
}