Genotype And Validate walker is now ready to be used by anyone.

given an annotated VCF and a BAM file, it genotypes (using the reads in the BAM) each variant in the VCF (for snp or indel) and validates (or not) the 'known' annotation. Outputs a truth table with the PPV and NPV values, and optionally a vcf file with the variants that had enough coverage to be validated. You can optionally provide a minimum depth of coverage and only do the analysis conditional on that. (will write a wiki for this walker, as it might be useful for future validation essays).


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5409 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
carneiro 2011-03-08 22:10:38 +00:00
parent da88c29b6e
commit fa7284b7a1
1 changed files with 37 additions and 14 deletions

View File

@ -25,11 +25,16 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import com.google.common.collect.ImmutableSet;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broad.tribble.vcf.VCFWriter;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
@ -39,8 +44,8 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.io.PrintStream;
import java.util.*;
/**
@ -61,15 +66,18 @@ import java.util.*;
public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalker.CountedData, GenotypeAndValidateWalker.CountedData> {
@Output(doc="File to which validated variants should be written", required=true)
protected VCFWriter vcfWriter = null;
@Argument(fullName="minimum_base_quality_score", shortName="mbq", doc="Minimum base quality score for calling a genotype", required=false)
private int mbq = -1;
@Argument(fullName="maximum_deletion_fraction", shortName="deletions", doc="Maximum deletion fraction for calling a genotype", required=false)
private double deletions = -1;
@Argument(fullName="condition_on_depth", shortName="depth", doc="Condition validation on a minimum depth of coverage by the reads", required=false)
private int minDepth = -1;
@Output( doc="File to write the two-way truth table", required=false)
private PrintStream printStream = null;
private String compName = "alleles";
private UnifiedGenotyperEngine snpEngine;
@ -115,6 +123,15 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
if ( !rodList.get(0).getName().equals(compName))
throw new UserException.BadInput("The ROD track has to be named \""+ compName +"\". Not " + rodList.get(0).getName());
// Initialize VCF header
Map<String, VCFHeader> header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), compName);
Set<String> samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(header.values(), logger);
headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate"));
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
// Filling in SNP calling arguments for UG
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES;
@ -143,7 +160,12 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
return counter;
VariantContext vcComp = tracker.getVariantContext(ref, compName, null, context.getLocation(), false);
if( vcComp == null ){
if( vcComp == null )
return counter;
// Do not operate on variants that are not covered to the optional minimum depth
if (!context.hasReads() || minDepth > 0 && context.getBasePileup().getBases().length < minDepth) {
counter.numUncovered = 1L;
return counter;
}
@ -153,23 +175,22 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
VariantCallContext call;
if ( vcComp.isSNP() )
call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
else if ( vcComp.isIndel() )
else if ( vcComp.isIndel() ) {
call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
if (call.vc == null) // variant context will be null on an extended indel event and I just want to call it one event.
return counter;
}
else {
logger.info("Not SNP or INDEL " + vcComp.getChr() + ":" + vcComp.getStart() + " " + vcComp.getAlleles());
return counter;
}
if (!call.confidentlyCalled) {
if (context.hasReads()) {
counter.numNotConfidentCalls = 1L;
if (vcComp.getFilters().contains("TP"))
counter.numFN = 1L;
else
counter.numTN = 1L;
}
counter.numNotConfidentCalls = 1L;
if (vcComp.getFilters().contains("TP"))
counter.numFN = 1L;
else
counter.numUncovered = 1L;
counter.numTN = 1L;
}
else {
counter.numConfidentCalls = 1L;
@ -178,7 +199,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
else
counter.numFP = 1L;
}
vcfWriter.add(vcComp, ref.getBase());
return counter;
}
@ -202,6 +223,8 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
logger.info("TN = " + reduceSum.numTN);
logger.info("FP = " + reduceSum.numFP);
logger.info("FN = " + reduceSum.numFN);
logger.info("PPV = " + ((double) reduceSum.numTP /( reduceSum.numTP + reduceSum.numFP)));
logger.info("FPV = " + ((double) reduceSum.numTN /( reduceSum.numTN + reduceSum.numFN)));
logger.info("Uncovered = " + reduceSum.numUncovered);
logger.info("confidently called = " + reduceSum.numConfidentCalls);
logger.info("not confidently called = " + reduceSum.numNotConfidentCalls );