Moved the utility routines that 'modify' a VariantContext into VariantContext itself (as opposed to VariantContextUtils) so that we can pass the genotype data directly into them and are no longer forced to decode the genotypes for no reason. This means that any walker that takes in a VCF and modifies the records without touching the genotypes never has to decode them. I've hooked this into the other two Variant Recalibrator walkers for Ryan. One side effect, though, is that we can no longer sort the sample names in the VCF: if we don't decode the genotypes then we can't re-order them, so when the input VCF doesn't have its samples in alphabetical order, we no longer sort them when writing a new VCF (previously we did). I don't think this is a big concern, given that the Unified Genotyper does emit sorted samples and that's the main source for most of the VCFs we use.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4300 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-09-17 07:09:58 +00:00
parent f66ef4626e
commit a10b2a00a5
15 changed files with 48 additions and 65 deletions

View File

@ -687,30 +687,10 @@ public class VariantContextUtils {
return uniqify ? sampleName + "." + trackName : sampleName;
}
public static VariantContext modifyGenotypes(VariantContext vc, Map<String, Genotype> genotypes) {
return new VariantContext(vc.getName(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes());
}
public static VariantContext modifyLocation(VariantContext vc, GenomeLoc loc) {
return new VariantContext(vc.getName(), loc.getContig(), loc.getStart(), loc.getStop(), vc.getAlleles(), vc.getGenotypes(), vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes());
}
public static VariantContext modifyFilters(VariantContext vc, Set<String> filters) {
return new VariantContext(vc.getName(), vc.getChr(), vc.getStart(), vc.getEnd() , vc.getAlleles(), vc.getGenotypes(), vc.getNegLog10PError(), filters, vc.getAttributes());
}
public static VariantContext modifyAttributes(VariantContext vc, Map<String, Object> attributes) {
return new VariantContext(vc.getName(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.getGenotypes(), vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, attributes);
}
public static Genotype modifyName(Genotype g, String name) {
return new Genotype(name, g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.genotypesArePhased());
}
public static Genotype modifyAttributes(Genotype g, Map<String, Object> attributes) {
return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attributes, g.genotypesArePhased());
}
public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set<String> allowedAttributes) {
if ( allowedAttributes == null )
return vc;
@ -722,10 +702,10 @@ public class VariantContextUtils {
if ( allowedAttributes.contains(attr.getKey()) )
attrs.put(attr.getKey(), attr.getValue());
}
newGenotypes.put(genotype.getKey(), VariantContextUtils.modifyAttributes(genotype.getValue(), attrs));
newGenotypes.put(genotype.getKey(), Genotype.modifyAttributes(genotype.getValue(), attrs));
}
return VariantContextUtils.modifyGenotypes(vc, newGenotypes);
return VariantContext.modifyGenotypes(vc, newGenotypes);
}
public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(VariantContext context) {

View File

@ -77,14 +77,14 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
if ( dbsnp != null )
attrs.put(VariantContext.ID_KEY, dbsnp.getRsID());
vc = VariantContextUtils.modifyAttributes(vc, attrs);
vc = VariantContext.modifyAttributes(vc, attrs);
// set the appropriate sample name if necessary
if ( sampleName != null && vc.hasGenotypes() && vc.hasGenotype(INPUT_ROD_NAME) ) {
Genotype g = VariantContextUtils.modifyName(vc.getGenotype(INPUT_ROD_NAME), sampleName);
Genotype g = Genotype.modifyName(vc.getGenotype(INPUT_ROD_NAME), sampleName);
Map<String, Genotype> genotypes = new HashMap<String, Genotype>();
genotypes.put(sampleName, g);
vc = VariantContextUtils.modifyGenotypes(vc, genotypes);
vc = VariantContext.modifyGenotypes(vc, genotypes);
}
writeRecord(vc, tracker, ref.getBase());
@ -110,7 +110,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
}
}
Set<String> samples = new TreeSet<String>();
Set<String> samples = new LinkedHashSet<String>();
if ( sampleName != null ) {
samples.add(sampleName);
} else {

View File

@ -119,7 +119,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> {
// get the list of all sample names from the variant VCF input rod, if applicable
Set<String> rodName = new HashSet<String>();
rodName.add("variant");
TreeSet<String> samples = new TreeSet<String>(SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName));
Set<String> samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName);
// add the non-VCF sample from the command-line, if applicable
if ( sampleName != null ) {

View File

@ -212,7 +212,7 @@ public class GenomicAnnotator extends RodWalker<Integer, Integer> implements Tre
Set<String> rodName = new HashSet<String>();
rodName.add("variant");
TreeSet<String> samples = new TreeSet<String>(SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName));
Set<String> samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName);
VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);
}

View File

@ -32,7 +32,6 @@ import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.*;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.features.beagle.BeagleFeature;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -326,7 +325,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
vcfWriter.add(VariantContextUtils.modifyAttributes(filteredVC, attributes), ref.getBase());
vcfWriter.add(VariantContext.modifyAttributes(filteredVC, attributes), ref.getBase());
return 1;

View File

@ -117,7 +117,7 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
hInfo.add(new VCFHeaderLine("VariantFiltration", "\"" + CommandLineUtils.createApproximateCommandLineArgumentString(getToolkit(), this) + "\""));
}
writer.writeHeader(new VCFHeader(hInfo, new TreeSet<String>(vc.getSampleNames())));
writer.writeHeader(new VCFHeader(hInfo, vc.getSampleNames()));
}
public void initialize() {
@ -178,7 +178,7 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
// make new Genotypes based on filters
Map<String, Genotype> genotypes;
if ( genotypeFilterExps.size() == 0 ) {
genotypes = vc.getGenotypes();
genotypes = null;
} else {
genotypes = new HashMap<String, Genotype>(vc.getGenotypes().size());
@ -186,14 +186,18 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
for ( Map.Entry<String, Genotype> genotype : vc.getGenotypes().entrySet() ) {
Genotype g = genotype.getValue();
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
if ( VariantContextUtils.match(vc, g, exp) )
filters.add(exp.name);
if ( g.isCalled() ) {
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
if ( VariantContextUtils.match(vc, g, exp) )
filters.add(exp.name);
}
genotypes.put(genotype.getKey(), new Genotype(genotype.getKey(), g.getAlleles(), g.getNegLog10PError(), filters, g.getAttributes(), g.genotypesArePhased()));
} else {
genotypes.put(genotype.getKey(), g);
}
genotypes.put(genotype.getKey(), new Genotype(genotype.getKey(), g.getAlleles(), g.getNegLog10PError(), filters, g.getAttributes(), g.genotypesArePhased()));
}
}
@ -214,7 +218,11 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
filters.add(exp.name);
}
VariantContext filteredVC = new VariantContext(vc.getName(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), filters, vc.getAttributes());
VariantContext filteredVC;
if ( genotypes == null )
filteredVC = VariantContext.modifyFilters(vc, filters);
else
filteredVC = new VariantContext(vc.getName(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), filters, vc.getAttributes());
writeVCF(filteredVC, context.getReferenceContext().getBase());
}

View File

@ -33,7 +33,6 @@ import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
@ -192,7 +191,7 @@ public class UnifiedGenotyperEngine {
Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, call.vc);
call.vc = variantContexts.iterator().next(); //We know the collection will always have exactly 1 element.
}
}
}
if ( call != null && call.vc != null ) {
@ -202,7 +201,7 @@ public class UnifiedGenotyperEngine {
if ( rawContext.hasPileupBeenDownsampled() ) {
Map<String, Object> attrs = new HashMap<String, Object>(call.vc.getAttributes());
attrs.put(VCFConstants.DOWNSAMPLED_KEY, true);
call.vc = VariantContextUtils.modifyAttributes(call.vc, attrs);
call.vc = VariantContext.modifyAttributes(call.vc, attrs);
}
}
return call;

View File

@ -190,7 +190,7 @@ public class SequenomValidationConverter extends RodWalker<Pair<VariantContext,
numHomVarViolations++;
isViolation = true;
}
vContext = VariantContextUtils.modifyFilters(vContext, filters);
vContext = VariantContext.modifyFilters(vContext, filters);
numRecords++;
// add the info fields
@ -207,7 +207,7 @@ public class SequenomValidationConverter extends RodWalker<Pair<VariantContext,
infoMap.put(VCFConstants.ALLELE_COUNT_KEY, String.format("%d", altAlleleCount));
infoMap.put(VCFConstants.ALLELE_NUMBER_KEY, String.format("%d", vContext.getChromosomeCount()));
vContext = VariantContextUtils.modifyAttributes(vContext, infoMap);
vContext = VariantContext.modifyAttributes(vContext, infoMap);
return new Pair<VariantContext, Byte>(vContext, ref.getBase());
}

View File

@ -220,7 +220,7 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
if ( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) {
Set<String> filters = new HashSet<String>();
filters.add(filterString);
vc = new VariantContext(vc.getName(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.getGenotypes(), vc.getNegLog10PError(), filters, vc.getAttributes());
vc = VariantContext.modifyFilters(vc, filters);
}
}
vcfWriter.add( vc, ref.getBase() );

View File

@ -251,9 +251,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
attrs.put("OQ", String.format("%.2f", ((Double)vc.getPhredScaledQual())));
attrs.put("LOD", String.format("%.4f", lod));
Set<String> filters = new HashSet<String>();
filters.add(VCFConstants.PASSES_FILTERS_v4);
VariantContext newVC = new VariantContext(vc.getName(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.getGenotypes(), variantDatum.qual / 10.0, filters, attrs);
VariantContext newVC = VariantContext.modifyPErrorFiltersAndAttributes(vc, variantDatum.qual / 10.0, new HashSet<String>(), attrs);
vcfWriter.add( newVC, ref.getBase() );

View File

@ -270,7 +270,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
}
attributes.put("DP", depth);
sub = VariantContextUtils.modifyAttributes(sub, attributes);
sub = VariantContext.modifyAttributes(sub, attributes);
return sub;
}

View File

@ -7,7 +7,6 @@ import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
@ -187,7 +186,7 @@ public class IndelAnnotator extends RodWalker<Integer,Long> {
Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
attrs.putAll(annotationMap);
vc = VariantContextUtils.modifyAttributes(vc, attrs);
vc = VariantContext.modifyAttributes(vc, attrs);
vcfWriter.add(vc, ref.getBase());
return 1;

View File

@ -84,7 +84,7 @@ public class SampleUtils {
* @return the set of unique samples
*/
public static Set<String> getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
Set<String> samples = new TreeSet<String>();
Set<String> samples = new LinkedHashSet<String>();
for ( VCFHeader header : VCFUtils.getVCFHeadersFromRods(toolkit, rodNames).values() )
samples.addAll(header.getGenotypeSamples());

View File

@ -15,7 +15,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testHasAnnotsNotAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B:variant,VCF " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("89b846266a0c565b9d2a7dbe793546aa"));
Arrays.asList("30ca0a572407f8f8d69ce83800db35ea"));
executeTest("test file has annotations, not asking for annotations, #1", spec);
}
@ -31,7 +31,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testHasAnnotsAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("b48199f9dc6f91c61418536151afa8fd"));
Arrays.asList("bcd5c80e54d90f1ddf65e0c47b0710a5"));
executeTest("test file has annotations, asking for annotations, #1", spec);
}
@ -47,7 +47,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoAnnotsNotAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B:variant,VCF " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("34d2831e7d843093a2cf47c1f4e5f0f0"));
Arrays.asList("63fcabe0198b88a87ea645ffbf25165f"));
executeTest("test file doesn't have annotations, not asking for annotations, #1", spec);
}
@ -63,7 +63,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoAnnotsAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("133275d150a8100ba4dc756d17b23ef1"));
Arrays.asList("10ceed55fd51f104d4b57aa555770253"));
executeTest("test file doesn't have annotations, asking for annotations, #1", spec);
}

View File

@ -16,7 +16,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testNoAction() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B:variant,VCF " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("89b846266a0c565b9d2a7dbe793546aa"));
Arrays.asList("a08a88866aac0ec4a844386bea5c585f"));
executeTest("test no action", spec);
}
@ -24,7 +24,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testClusteredSnps() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -window 10 -B:variant,VCF " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("46877eb3ec258e6b70d8fcacca1acb25"));
Arrays.asList("59f0f365550cc01e0fdef65e98963826"));
executeTest("test clustered SNPs", spec);
}
@ -32,7 +32,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testMask() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -mask foo -B:mask,VCF " + validationDataLocation + "vcfexample2.vcf -B:variant,VCF " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("650504a58af863d9cef699087a7961aa"));
Arrays.asList("cb67d20027e4e0cb45544a69ff49476e"));
executeTest("test mask", spec);
}
@ -40,7 +40,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testFilter1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -filter 'DoC < 20 || FisherStrand > 20.0' -filterName foo -B:variant,VCF " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("b0b9110aeff967dd87cd0ec273d20791"));
Arrays.asList("1fdccdb8ca837d5fc7a619d285e2308a"));
executeTest("test filter #1", spec);
}
@ -48,15 +48,15 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testFilter2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -filter 'AlleleBalance < 70.0 && FisherStrand == 1.4' -filterName bar -B:variant,VCF " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("8405a7ef69792c239d903d13832a1ea7"));
Arrays.asList("40fdd0321091402a669d7e2eaadf072a"));
executeTest("test filter #2", spec);
}
@Test
public void testFilterWithSeparateNames() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --filterName ABF -filter 'AlleleBalance < 70.0' --filterName FSF -filter 'FisherStrand == 1.4' -B:variant,VCF " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("8266b9eb2ba35d77ab1cecb395322f31"));
baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' -B:variant,VCF " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("a017fbbb6d610481b174c53d29b1ae5a"));
executeTest("test filter with separate names #2", spec);
}
@ -64,7 +64,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testGenotypeFilter1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G_filter 'GQ == 0.60' -G_filterName foo -B:variant,VCF " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("99d8a47623cba215ee7d803c514ef116"));
Arrays.asList("1a200b0e47cac16d1dfd8ce44484c667"));
executeTest("test genotype filter #1", spec);
}
@ -72,7 +72,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testGenotypeFilter2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G_filter 'AF == 0.04 && isHomVar == 1' -G_filterName foo -B:variant,VCF " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("c58ea74c32290c9b4e8ae3dd1d0250e1"));
Arrays.asList("78315a09eb3ac8cc47010bb92fad342f"));
executeTest("test genotype filter #2", spec);
}
}