Genotype And Validate walker is now ready to be used by anyone.
given an annotated VCF and a BAM file, it genotypes (using the reads in the BAM) each variant in the VCF (for snp or indel) and validates (or not) the 'known' annotation. Outputs a truth table with the PPV and NPV values, and optionally a vcf file with the variants that had enough coverage to be validated. You can optionally provide a minimum depth of coverage and only do the analysis conditional on that. (will write a wiki for this walker, as it might be useful for future validation essays). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5409 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
da88c29b6e
commit
fa7284b7a1
|
|
@ -25,11 +25,16 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||||
|
|
||||||
|
import com.google.common.collect.ImmutableSet;
|
||||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
|
import org.broad.tribble.vcf.VCFHeader;
|
||||||
|
import org.broad.tribble.vcf.VCFHeaderLine;
|
||||||
|
import org.broad.tribble.vcf.VCFWriter;
|
||||||
import org.broadinstitute.sting.commandline.Argument;
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
import org.broadinstitute.sting.commandline.Output;
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
|
|
@ -39,8 +44,8 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||||
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
|
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
|
||||||
import org.broadinstitute.sting.utils.SampleUtils;
|
import org.broadinstitute.sting.utils.SampleUtils;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||||
|
|
||||||
import java.io.PrintStream;
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -61,15 +66,18 @@ import java.util.*;
|
||||||
|
|
||||||
public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalker.CountedData, GenotypeAndValidateWalker.CountedData> {
|
public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalker.CountedData, GenotypeAndValidateWalker.CountedData> {
|
||||||
|
|
||||||
|
@Output(doc="File to which validated variants should be written", required=true)
|
||||||
|
protected VCFWriter vcfWriter = null;
|
||||||
|
|
||||||
@Argument(fullName="minimum_base_quality_score", shortName="mbq", doc="Minimum base quality score for calling a genotype", required=false)
|
@Argument(fullName="minimum_base_quality_score", shortName="mbq", doc="Minimum base quality score for calling a genotype", required=false)
|
||||||
private int mbq = -1;
|
private int mbq = -1;
|
||||||
|
|
||||||
@Argument(fullName="maximum_deletion_fraction", shortName="deletions", doc="Maximum deletion fraction for calling a genotype", required=false)
|
@Argument(fullName="maximum_deletion_fraction", shortName="deletions", doc="Maximum deletion fraction for calling a genotype", required=false)
|
||||||
private double deletions = -1;
|
private double deletions = -1;
|
||||||
|
|
||||||
|
@Argument(fullName="condition_on_depth", shortName="depth", doc="Condition validation on a minimum depth of coverage by the reads", required=false)
|
||||||
|
private int minDepth = -1;
|
||||||
|
|
||||||
@Output( doc="File to write the two-way truth table", required=false)
|
|
||||||
private PrintStream printStream = null;
|
|
||||||
|
|
||||||
private String compName = "alleles";
|
private String compName = "alleles";
|
||||||
private UnifiedGenotyperEngine snpEngine;
|
private UnifiedGenotyperEngine snpEngine;
|
||||||
|
|
@ -115,6 +123,15 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
|
||||||
if ( !rodList.get(0).getName().equals(compName))
|
if ( !rodList.get(0).getName().equals(compName))
|
||||||
throw new UserException.BadInput("The ROD track has to be named \""+ compName +"\". Not " + rodList.get(0).getName());
|
throw new UserException.BadInput("The ROD track has to be named \""+ compName +"\". Not " + rodList.get(0).getName());
|
||||||
|
|
||||||
|
|
||||||
|
// Initialize VCF header
|
||||||
|
Map<String, VCFHeader> header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), compName);
|
||||||
|
Set<String> samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
|
||||||
|
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(header.values(), logger);
|
||||||
|
headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate"));
|
||||||
|
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
|
||||||
|
|
||||||
|
|
||||||
// Filling in SNP calling arguments for UG
|
// Filling in SNP calling arguments for UG
|
||||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||||
uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES;
|
uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES;
|
||||||
|
|
@ -143,7 +160,12 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
|
||||||
return counter;
|
return counter;
|
||||||
|
|
||||||
VariantContext vcComp = tracker.getVariantContext(ref, compName, null, context.getLocation(), false);
|
VariantContext vcComp = tracker.getVariantContext(ref, compName, null, context.getLocation(), false);
|
||||||
if( vcComp == null ){
|
if( vcComp == null )
|
||||||
|
return counter;
|
||||||
|
|
||||||
|
// Do not operate on variants that are not covered to the optional minimum depth
|
||||||
|
if (!context.hasReads() || minDepth > 0 && context.getBasePileup().getBases().length < minDepth) {
|
||||||
|
counter.numUncovered = 1L;
|
||||||
return counter;
|
return counter;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -153,24 +175,23 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
|
||||||
VariantCallContext call;
|
VariantCallContext call;
|
||||||
if ( vcComp.isSNP() )
|
if ( vcComp.isSNP() )
|
||||||
call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
|
call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
|
||||||
else if ( vcComp.isIndel() )
|
else if ( vcComp.isIndel() ) {
|
||||||
call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
|
call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
|
||||||
|
if (call.vc == null) // variant context will be null on an extended indel event and I just want to call it one event.
|
||||||
|
return counter;
|
||||||
|
}
|
||||||
else {
|
else {
|
||||||
logger.info("Not SNP or INDEL " + vcComp.getChr() + ":" + vcComp.getStart() + " " + vcComp.getAlleles());
|
logger.info("Not SNP or INDEL " + vcComp.getChr() + ":" + vcComp.getStart() + " " + vcComp.getAlleles());
|
||||||
return counter;
|
return counter;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!call.confidentlyCalled) {
|
if (!call.confidentlyCalled) {
|
||||||
if (context.hasReads()) {
|
|
||||||
counter.numNotConfidentCalls = 1L;
|
counter.numNotConfidentCalls = 1L;
|
||||||
if (vcComp.getFilters().contains("TP"))
|
if (vcComp.getFilters().contains("TP"))
|
||||||
counter.numFN = 1L;
|
counter.numFN = 1L;
|
||||||
else
|
else
|
||||||
counter.numTN = 1L;
|
counter.numTN = 1L;
|
||||||
}
|
}
|
||||||
else
|
|
||||||
counter.numUncovered = 1L;
|
|
||||||
}
|
|
||||||
else {
|
else {
|
||||||
counter.numConfidentCalls = 1L;
|
counter.numConfidentCalls = 1L;
|
||||||
if (vcComp.getFilters().contains("TP"))
|
if (vcComp.getFilters().contains("TP"))
|
||||||
|
|
@ -178,7 +199,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
|
||||||
else
|
else
|
||||||
counter.numFP = 1L;
|
counter.numFP = 1L;
|
||||||
}
|
}
|
||||||
|
vcfWriter.add(vcComp, ref.getBase());
|
||||||
return counter;
|
return counter;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -202,6 +223,8 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
|
||||||
logger.info("TN = " + reduceSum.numTN);
|
logger.info("TN = " + reduceSum.numTN);
|
||||||
logger.info("FP = " + reduceSum.numFP);
|
logger.info("FP = " + reduceSum.numFP);
|
||||||
logger.info("FN = " + reduceSum.numFN);
|
logger.info("FN = " + reduceSum.numFN);
|
||||||
|
logger.info("PPV = " + ((double) reduceSum.numTP /( reduceSum.numTP + reduceSum.numFP)));
|
||||||
|
logger.info("FPV = " + ((double) reduceSum.numTN /( reduceSum.numTN + reduceSum.numFN)));
|
||||||
logger.info("Uncovered = " + reduceSum.numUncovered);
|
logger.info("Uncovered = " + reduceSum.numUncovered);
|
||||||
logger.info("confidently called = " + reduceSum.numConfidentCalls);
|
logger.info("confidently called = " + reduceSum.numConfidentCalls);
|
||||||
logger.info("not confidently called = " + reduceSum.numNotConfidentCalls );
|
logger.info("not confidently called = " + reduceSum.numNotConfidentCalls );
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue