VQSR now only uses the valid polymorphic sites for training and truth sensitivity calculations. Any number of tracks whose ROD binding begins with the name truth can be used as truth sensitivity tracks.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5012 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2011-01-18 20:48:19 +00:00
parent 4bec93e3e4
commit 00453919d2
3 changed files with 27 additions and 16 deletions

View File

@ -196,13 +196,13 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
final VariantContext vcHapMap = ( vcsHapMap.size() != 0 ? vcsHapMap.iterator().next() : null ); final VariantContext vcHapMap = ( vcsHapMap.size() != 0 ? vcsHapMap.iterator().next() : null );
final VariantContext vc1KG = ( vcs1KG.size() != 0 ? vcs1KG.iterator().next() : null ); final VariantContext vc1KG = ( vcs1KG.size() != 0 ? vcs1KG.iterator().next() : null );
variantDatum.isKnown = ( vcDbsnp != null ); variantDatum.isKnown = ( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() );
variantDatum.weight = WEIGHT_NOVELS; variantDatum.weight = WEIGHT_NOVELS;
if( vcHapMap != null ) { if( vcHapMap != null && vcHapMap.isVariant() && !vcHapMap.isFiltered() && (!vcHapMap.hasGenotypes() || vcHapMap.isPolymorphic()) ) {
variantDatum.weight = WEIGHT_HAPMAP; variantDatum.weight = WEIGHT_HAPMAP;
} else if( vc1KG != null ) { } else if( vc1KG != null && vc1KG.isVariant() && !vc1KG.isFiltered() && (!vc1KG.hasGenotypes() || vc1KG.isPolymorphic()) ) {
variantDatum.weight = WEIGHT_1KG; variantDatum.weight = WEIGHT_1KG;
} else if( vcDbsnp != null ) { } else if( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() ) {
variantDatum.weight = WEIGHT_DBSNP; variantDatum.weight = WEIGHT_DBSNP;
} }

View File

@ -59,7 +59,6 @@ import java.util.*;
public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> { public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
public static final String VQS_LOD_KEY = "VQSLOD"; public static final String VQS_LOD_KEY = "VQSLOD";
public static final double SMALLEST_LOG10_PVAR = -1e6;
///////////////////////////// /////////////////////////////
// Inputs // Inputs
@ -78,7 +77,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
///////////////////////////// /////////////////////////////
// Command Line Arguments // Command Line Arguments
///////////////////////////// /////////////////////////////
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (~~2.07 for whole genome experiments)", required=true) @Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (~~2.07 for whole genome experiments)", required=false)
private double TARGET_TITV = 2.07; private double TARGET_TITV = 2.07;
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by enlarging the Gaussians.", required=false) @Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by enlarging the Gaussians.", required=false)
private double BACKOFF_FACTOR = 1.3; private double BACKOFF_FACTOR = 1.3;
@ -139,6 +138,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private VariantGaussianMixtureModel theModel = null; private VariantGaussianMixtureModel theModel = null;
private Set<String> ignoreInputFilterSet = null; private Set<String> ignoreInputFilterSet = null;
private Set<String> inputNames = new HashSet<String>(); private Set<String> inputNames = new HashSet<String>();
private Set<String> truthNames = new HashSet<String>();
private NestedHashMap priorCache = new NestedHashMap(); private NestedHashMap priorCache = new NestedHashMap();
private boolean trustACField = false; private boolean trustACField = false;
private PrintStream tranchesStream = null; private PrintStream tranchesStream = null;
@ -202,6 +202,9 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
if( d.getName().startsWith("input") ) { if( d.getName().startsWith("input") ) {
inputNames.add(d.getName()); inputNames.add(d.getName());
logger.info("Found input variant track with name " + d.getName()); logger.info("Found input variant track with name " + d.getName());
} else if( d.getName().startsWith("truth") ) {
truthNames.add(d.getName());
logger.info("Found truth variant track with name " + d.getName());
} else if ( d.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { } else if ( d.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
logger.info("Found dbSNP track with prior probability = Q" + PRIOR_DBSNP); logger.info("Found dbSNP track with prior probability = Q" + PRIOR_DBSNP);
foundDBSNP = true; foundDBSNP = true;
@ -222,7 +225,6 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." ); throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
} }
// setup the header fields // setup the header fields
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>(); final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
final TreeSet<String> samples = new TreeSet<String>(); final TreeSet<String> samples = new TreeSet<String>();
@ -253,9 +255,18 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
return mapList; return mapList;
} }
final Collection<VariantContext> vcsTruth = tracker.getVariantContexts(ref, "truth", null, context.getLocation(), false, true); final Collection<VariantContext> vcsTruth = tracker.getVariantContexts(ref, truthNames, null, context.getLocation(), false, true);
final VariantContext vcTruth = ( vcsTruth.size() != 0 ? vcsTruth.iterator().next() : null ); boolean isAtTruthSite = false;
nTruthSites += vcTruth != null && vcTruth.isVariant() ? 1 : 0; for( final VariantContext vcTruth : vcsTruth ) {
if( vcTruth != null && vcTruth.isVariant() && !vcTruth.isFiltered() && (!vcTruth.hasGenotypes() || vcTruth.isPolymorphic()) ) {
isAtTruthSite = true;
break;
}
}
nTruthSites += isAtTruthSite ? 1 : 0;
//final VariantContext vcTruth = ( vcsTruth.size() != 0 ? vcsTruth.iterator().next() : null );
//nTruthSites += vcTruth != null && vcTruth.isVariant() ? 1 : 0;
for( final VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), false, false) ) { for( final VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), false, false) ) {
if( vc != null && vc.isSNP() ) { if( vc != null && vc.isSNP() ) {
@ -271,13 +282,13 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
final VariantContext vcHapMap = ( vcsHapMap.size() != 0 ? vcsHapMap.iterator().next() : null ); final VariantContext vcHapMap = ( vcsHapMap.size() != 0 ? vcsHapMap.iterator().next() : null );
final VariantContext vc1KG = ( vcs1KG.size() != 0 ? vcs1KG.iterator().next() : null ); final VariantContext vc1KG = ( vcs1KG.size() != 0 ? vcs1KG.iterator().next() : null );
variantDatum.isKnown = ( vcDbsnp != null ); variantDatum.isKnown = ( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() );
double knownPrior_qScore = PRIOR_NOVELS; double knownPrior_qScore = PRIOR_NOVELS;
if( vcHapMap != null ) { if( vcHapMap != null && vcHapMap.isVariant() && !vcHapMap.isFiltered() && (!vcHapMap.hasGenotypes() || vcHapMap.isPolymorphic()) ) {
knownPrior_qScore = PRIOR_HAPMAP; knownPrior_qScore = PRIOR_HAPMAP;
} else if( vc1KG != null ) { } else if( vc1KG != null && vc1KG.isVariant() && !vc1KG.isFiltered() && (!vc1KG.hasGenotypes() || vc1KG.isPolymorphic()) ) {
knownPrior_qScore = PRIOR_1KG; knownPrior_qScore = PRIOR_1KG;
} else if( vcDbsnp != null ) { } else if( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() ) {
knownPrior_qScore = PRIOR_DBSNP; knownPrior_qScore = PRIOR_DBSNP;
} }
@ -334,7 +345,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
variantDatum.lod = round4(lod); variantDatum.lod = round4(lod);
// deal with the truth calculation // deal with the truth calculation
variantDatum.atTruthSite = vcTruth != null && vcTruth.isVariant(); variantDatum.atTruthSite = isAtTruthSite; //vcTruth != null && vcTruth.isVariant();
mapList.add( variantDatum ); mapList.add( variantDatum );
final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes()); final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());

View File

@ -178,7 +178,7 @@ class VariantCalling(attribs: Pipeline,gatkJar: File) {
vr.analysisName = "VariantQualityRecalibration" vr.analysisName = "VariantQualityRecalibration"
vr.rodBind :+= new RodBind("input","VCF",raw_vcf) vr.rodBind :+= new RodBind("input","VCF",raw_vcf)
vr.cluster_file = cluster vr.cluster_file = cluster
vr.target_titv = target_titv vr.target_titv = Some(target_titv)
vr.out = out_vcf vr.out = out_vcf
vr.tranches_file = out_tranches vr.tranches_file = out_tranches
vr.tranche :+= "0.1" vr.tranche :+= "0.1"