give it an annotated VCF and a BAM and it creates a truth table on the validation of the VCF calls. This is just the first version, not ready for primetime.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5371 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
carneiro 2011-03-03 21:45:11 +00:00
parent 0daa65b9ef
commit 8ae42b70ac
1 changed files with 208 additions and 0 deletions

View File

@ -0,0 +1,208 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.PrintStream;
import java.util.*;
/**
* Validates the calls on a ROD track using a BAM dataset.
*
* @author carneiro
* @since Mar 3, 2011
* @help.summary Validates the calls on a ROD track using a BAM dataset.
*/
//@Requires(value={DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_ORDERED_DATA})
@Allows(value={DataSource.READS, DataSource.REFERENCE})
public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalker.CountedData, GenotypeAndValidateWalker.CountedData> {
@Argument(fullName="minimum_base_quality_score", shortName="mbq", doc="Minimum base quality score for calling a genotype", required=false)
private int mbq = -1;
@Argument(fullName="maximum_deletion_fraction", shortName="deletions", doc="Maximum deletion fraction for calling a genotype", required=false)
private double deletions = -1;
@Output( doc="File to write the two-way truth table", required=false)
private PrintStream printStream = null;
private String compName = "alleles";
private UnifiedGenotyperEngine snpEngine;
private UnifiedGenotyperEngine indelEngine;
public static enum VARIANT_TYPE {
TRUE_POSITIVE,
FALSE_POSITIVE,
}
public static class CountedData {
private long numTP = 0L;
private long numTN = 0L;
private long numFP = 0L;
private long numFN = 0L;
private long numConfidentCalls = 0L;
private long numNotConfidentCalls = 0L;
/**
* Adds the values of other to this, returning this
* @param other the other object
*/
public void add(CountedData other) {
numTP += other.numTP;
numTN += other.numTN;
numFP += other.numFP;
numFN += other.numFN;
numNotConfidentCalls += other.numNotConfidentCalls;
numConfidentCalls += other.numConfidentCalls;
}
}
//---------------------------------------------------------------------------------------------------------------
//
// initialize
//
//---------------------------------------------------------------------------------------------------------------
public void initialize() {
List<ReferenceOrderedDataSource> rodList = this.getToolkit().getRodDataSources();
if ( rodList.size() != 1 )
throw new UserException.BadInput("You should provide exactly one genotype VCF");
if ( !rodList.get(0).getName().equals(compName))
throw new UserException.BadInput("The ROD track has to be named \""+ compName +"\". Not " + rodList.get(0).getName());
// Filling in SNP calling arguments for UG
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES;
uac.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
if (mbq > 0) uac.MIN_BASE_QUALTY_SCORE = mbq;
if (deletions > 0) uac.MAX_DELETION_FRACTION = deletions;
snpEngine = new UnifiedGenotyperEngine(getToolkit(), uac);
// Adding the INDEL calling arguments for UG
uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.DINDEL;
indelEngine = new UnifiedGenotyperEngine(getToolkit(), uac);
}
//---------------------------------------------------------------------------------------------------------------
//
// map
//
//---------------------------------------------------------------------------------------------------------------
public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
final CountedData counter = new CountedData();
// For some reason RodWalkers get map calls with null trackers
if( tracker == null )
return counter;
VariantContext vcComp = tracker.getVariantContext(ref, compName, null, context.getLocation(), false);
if( vcComp == null )
return counter;
vcComp = vcComp.subContextFromGenotypes( vcComp.getGenotypes().values() );
if ((vcComp.getFilters().contains("TP") && vcComp.getFilters().contains("FP")) || (!vcComp.getFilters().contains("TP") && !vcComp.getFilters().contains("FP")))
throw new UserException.BadInput("Variant has the wrong filter annotation -- either missing FP/TP or has both. " + vcComp.getChr() + ":" + vcComp.getStart());
VariantCallContext call;
if ( vcComp.isSNP() )
call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
else if ( vcComp.isIndel() )
call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
else
return counter;
VARIANT_TYPE variantType;
if (vcComp.getFilters().contains("TP"))
variantType = VARIANT_TYPE.TRUE_POSITIVE;
else
variantType = VARIANT_TYPE.FALSE_POSITIVE;
if (!call.confidentlyCalled) {
counter.numNotConfidentCalls = 1L;
if (variantType == VARIANT_TYPE.TRUE_POSITIVE)
counter.numFN = 1L;
else
counter.numTN = 1L;
}
else {
counter.numConfidentCalls = 1L;
if (variantType == VARIANT_TYPE.TRUE_POSITIVE)
counter.numTP = 1L;
else
counter.numFP = 1L;
}
return counter;
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce
//
//---------------------------------------------------------------------------------------------------------------
public CountedData reduceInit() {
return new CountedData();
}
public CountedData reduce( final CountedData mapValue, final CountedData reduceSum ) {
reduceSum.add(mapValue);
return reduceSum;
}
public void onTraversalDone( CountedData reduceSum ) {
logger.info("TP = " + reduceSum.numTP);
logger.info("TN = " + reduceSum.numTN);
logger.info("FP = " + reduceSum.numFP);
logger.info("FN = " + (reduceSum.numFN) );
logger.info("confidently called = " + reduceSum.numConfidentCalls);
logger.info("not confidently called = " + reduceSum.numNotConfidentCalls );
}
}