We no longer look at the by-hapmap validation status in the VQSR, because using the HapMap VCF file directly yields higher-quality training data. As a side effect, we now support the dbSNP 132 VCF file. ApplyVariantCuts now requires that the input VCF rod bindings begin with 'input', matching the other VQSR walkers. The wiki has been updated with information about how to obtain the HapMap and 1KG truth sets.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4772 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
220fb0c44a
commit
0adf505b53
|
|
@ -30,8 +30,8 @@ import org.broad.tribble.vcf.*;
|
|||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -73,7 +73,9 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
|
|||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
final List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
final private List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
final private Set<String> inputNames = new HashSet<String>();
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -91,11 +93,24 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
Collections.reverse(tranches); // this algorithm wants the tranches ordered from worst to best
|
||||
|
||||
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
|
||||
if( d.getName().startsWith("input") ) {
|
||||
inputNames.add(d.getName());
|
||||
logger.info("Found input variant track with name " + d.getName());
|
||||
} else {
|
||||
logger.info("Not evaluating ROD binding " + d.getName());
|
||||
}
|
||||
}
|
||||
|
||||
if( inputNames.size() == 0 ) {
|
||||
throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
|
||||
}
|
||||
|
||||
// setup the header fields
|
||||
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
|
||||
final TreeSet<String> samples = new TreeSet<String>();
|
||||
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit()));
|
||||
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));
|
||||
|
||||
if( tranches.size() >= 2 ) {
|
||||
for( int iii = 0; iii < tranches.size() - 1; iii++ ) {
|
||||
|
|
@ -127,8 +142,8 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
|
|||
return 1;
|
||||
}
|
||||
|
||||
for( VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
|
||||
if( vc != null && !vc.getSource().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
|
||||
for( VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), false, false) ) {
|
||||
if( vc != null && vc.isSNP() ) {
|
||||
String filterString = null;
|
||||
if( !vc.isFiltered() ) {
|
||||
try {
|
||||
|
|
@ -174,11 +189,11 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
|
|||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 1;
|
||||
return 1; // This value isn't used for anything
|
||||
}
|
||||
|
||||
public Integer reduce( final Integer mapValue, final Integer reduceSum ) {
|
||||
return 1;
|
||||
return 1; // This value isn't used for anything
|
||||
}
|
||||
|
||||
public void onTraversalDone( Integer reduceSum ) {
|
||||
|
|
|
|||
|
|
@ -25,16 +25,13 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.commandline.CommandLineUtils;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
|
@ -99,9 +96,6 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
@Hidden
|
||||
@Argument(fullName = "NO_HEADER", shortName = "NO_HEADER", doc = "Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", required = false)
|
||||
protected Boolean NO_HEADER_LINE = false;
|
||||
@Hidden
|
||||
@Argument(fullName = "NoByHapMapValidationStatus", shortName = "NoByHapMapValidationStatus", doc = "Don't consider sites in dbsnp rod tagged as by-hapmap validation status as real HapMap sites. FOR DEBUGGING PURPOSES ONLY.", required=false)
|
||||
private Boolean NO_BY_HAPMAP_VALIDATION_STATUS = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
|
|
@ -132,28 +126,39 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
CLUSTER_FILE.println("\"" + getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this) + "\"");
|
||||
}
|
||||
|
||||
boolean foundDBSNP = false;
|
||||
boolean foundTruthSet = false;
|
||||
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
|
||||
if( d.getName().startsWith("input") ) {
|
||||
inputNames.add(d.getName());
|
||||
logger.info("Found input variant track with name " + d.getName());
|
||||
} else if ( d.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
|
||||
} else if ( d.getName().equals("dbsnp") ) {
|
||||
logger.info("Found dbSNP track for use in training with weight = " + WEIGHT_DBSNP);
|
||||
if( !NO_BY_HAPMAP_VALIDATION_STATUS ) {
|
||||
logger.info("\tsites in dbSNP track tagged with by-hapmap validation status will be used in training with weight = " + WEIGHT_HAPMAP);
|
||||
if( WEIGHT_DBSNP > 0.0 ) {
|
||||
foundTruthSet = true;
|
||||
}
|
||||
foundDBSNP = true;
|
||||
} else if ( d.getName().equals("hapmap") ) {
|
||||
logger.info("Found HapMap track for use in training with weight = " + WEIGHT_HAPMAP);
|
||||
if( WEIGHT_HAPMAP > 0.0 ) {
|
||||
foundTruthSet = true;
|
||||
}
|
||||
|
||||
} else if ( d.getName().equals("1kg") ) {
|
||||
logger.info("Found 1KG track for use in training with weight = " + WEIGHT_1KG);
|
||||
if( WEIGHT_1KG > 0.0 ) {
|
||||
foundTruthSet = true;
|
||||
}
|
||||
|
||||
} else {
|
||||
logger.info("Not evaluating ROD binding " + d.getName());
|
||||
}
|
||||
}
|
||||
|
||||
if(!foundDBSNP) {
|
||||
throw new UserException.CommandLineException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites.");
|
||||
if( !foundTruthSet ) {
|
||||
throw new UserException.CommandLineException("No truth set found! Please provide sets of known polymorphic loci to be used as training data using the dbsnp, hapmap, or 1kg rod bindings. Clustering weights can be specified using -weightDBSNP, -weightHapMap, and -weight1KG");
|
||||
}
|
||||
|
||||
if( inputNames.size() == 0 ) {
|
||||
throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -184,19 +189,20 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
final VariantDatum variantDatum = new VariantDatum();
|
||||
variantDatum.annotations = annotationValues;
|
||||
|
||||
final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
|
||||
final Collection<VariantContext> vcsDbsnp = tracker.getVariantContexts(ref, "dbsnp", null, context.getLocation(), false, true);
|
||||
final Collection<VariantContext> vcsHapMap = tracker.getVariantContexts(ref, "hapmap", null, context.getLocation(), false, true);
|
||||
final Collection<VariantContext> vcs1KG = tracker.getVariantContexts(ref, "1kg", null, context.getLocation(), false, true);
|
||||
final VariantContext vcDbsnp = ( vcsDbsnp.size() != 0 ? vcsDbsnp.iterator().next() : null );
|
||||
final VariantContext vcHapMap = ( vcsHapMap.size() != 0 ? vcsHapMap.iterator().next() : null );
|
||||
final VariantContext vc1KG = ( vcs1KG.size() != 0 ? vcs1KG.iterator().next() : null );
|
||||
|
||||
variantDatum.isKnown = ( dbsnp != null );
|
||||
variantDatum.weight = WEIGHT_NOVELS;
|
||||
if( vcHapMap != null || ( !NO_BY_HAPMAP_VALIDATION_STATUS && dbsnp != null && DbSNPHelper.isHapmap(dbsnp) ) ) {
|
||||
variantDatum.isKnown = ( vcDbsnp != null );
|
||||
variantDatum.weight = WEIGHT_NOVELS;
|
||||
if( vcHapMap != null ) {
|
||||
variantDatum.weight = WEIGHT_HAPMAP;
|
||||
} else if( vc1KG != null ) {
|
||||
variantDatum.weight = WEIGHT_1KG;
|
||||
} else if( dbsnp != null ) {
|
||||
} else if( vcDbsnp != null ) {
|
||||
variantDatum.weight = WEIGHT_DBSNP;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -25,7 +25,6 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
|
|
@ -82,8 +81,6 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
private double TARGET_TITV = 2.07;
|
||||
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by enlarging the Gaussians.", required=false)
|
||||
private double BACKOFF_FACTOR = 1.3;
|
||||
@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
|
||||
private int DESIRED_NUM_VARIANTS = 0;
|
||||
@Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||
private boolean IGNORE_ALL_INPUT_FILTERS = false;
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||
|
|
@ -113,11 +110,9 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
// Debug Arguments
|
||||
/////////////////////////////
|
||||
@Hidden
|
||||
@Argument(fullName = "NoByHapMapValidationStatus", shortName = "NoByHapMapValidationStatus", doc = "Don't consider sites in dbsnp rod tagged as by-hapmap validation status as real HapMap sites. FOR DEBUGGING PURPOSES ONLY.", required=false)
|
||||
private Boolean NO_BY_HAPMAP_VALIDATION_STATUS = false;
|
||||
@Hidden
|
||||
@Argument(fullName = "qual", shortName = "qual", doc = "Don't use sites with original quality scores below the qual threshold. FOR DEBUGGING PURPOSES ONLY.", required=false)
|
||||
private double QUAL_THRESHOLD = 0.0;
|
||||
@Hidden
|
||||
@Argument(fullName = "debugFile", shortName = "debugFile", doc = "Print debugging information here", required=false)
|
||||
private File DEBUG_FILE = null;
|
||||
|
||||
|
|
@ -173,9 +168,6 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
logger.info("Found input variant track with name " + d.getName());
|
||||
} else if ( d.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
|
||||
logger.info("Found dbSNP track with prior probability = Q" + PRIOR_DBSNP);
|
||||
if( !NO_BY_HAPMAP_VALIDATION_STATUS ) {
|
||||
logger.info("\tsites in dbSNP track tagged with by-hapmap validation status will be given prior probability = Q" + PRIOR_HAPMAP);
|
||||
}
|
||||
foundDBSNP = true;
|
||||
} else if ( d.getName().equals("hapmap") ) {
|
||||
logger.info("Found HapMap track with prior probability = Q" + PRIOR_HAPMAP);
|
||||
|
|
@ -190,6 +182,11 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
throw new UserException.CommandLineException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites.");
|
||||
}
|
||||
|
||||
if( inputNames.size() == 0 ) {
|
||||
throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
|
||||
}
|
||||
|
||||
|
||||
// setup the header fields
|
||||
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
final TreeSet<String> samples = new TreeSet<String>();
|
||||
|
|
@ -227,19 +224,20 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
final VariantDatum variantDatum = new VariantDatum();
|
||||
variantDatum.isTransition = VariantContextUtils.getSNPSubstitutionType(vc).compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
|
||||
|
||||
final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
|
||||
final Collection<VariantContext> vcsDbsnp = tracker.getVariantContexts(ref, "dbsnp", null, context.getLocation(), false, true);
|
||||
final Collection<VariantContext> vcsHapMap = tracker.getVariantContexts(ref, "hapmap", null, context.getLocation(), false, true);
|
||||
final Collection<VariantContext> vcs1KG = tracker.getVariantContexts(ref, "1kg", null, context.getLocation(), false, true);
|
||||
final VariantContext vcDbsnp = ( vcsDbsnp.size() != 0 ? vcsDbsnp.iterator().next() : null );
|
||||
final VariantContext vcHapMap = ( vcsHapMap.size() != 0 ? vcsHapMap.iterator().next() : null );
|
||||
final VariantContext vc1KG = ( vcs1KG.size() != 0 ? vcs1KG.iterator().next() : null );
|
||||
|
||||
variantDatum.isKnown = ( dbsnp != null );
|
||||
variantDatum.isKnown = ( vcDbsnp != null );
|
||||
double knownPrior_qScore = PRIOR_NOVELS;
|
||||
if( vcHapMap != null || ( !NO_BY_HAPMAP_VALIDATION_STATUS && dbsnp != null && DbSNPHelper.isHapmap(dbsnp) ) ) {
|
||||
if( vcHapMap != null ) {
|
||||
knownPrior_qScore = PRIOR_HAPMAP;
|
||||
} else if( vc1KG != null ) {
|
||||
knownPrior_qScore = PRIOR_1KG;
|
||||
} else if( dbsnp != null ) {
|
||||
} else if( vcDbsnp != null ) {
|
||||
knownPrior_qScore = PRIOR_DBSNP;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -29,16 +29,16 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
}
|
||||
|
||||
VRTest yriTrio = new VRTest("yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf",
|
||||
"ab2629d67e378fd3aceb8318f0fbfe04", // in vcf
|
||||
"d8f0a92aa42989d332d3c65f61352978", // tranches
|
||||
"3370e51f6e5035f2472c66762d13c821", // recalVCF
|
||||
"4801e6cfaeadc81c5c1f9bf5948e2a1c"); // cut VCF
|
||||
"4eeffa7a1965ce0c25c5edd0bae76290", // in vcf
|
||||
"7407987a0148284ed910e1858116dd8d", // tranches
|
||||
"15ab55be5b2f62627aea8546a4728d77", // recalVCF
|
||||
"9435f1aed7313fbfff540a4d6d19d0c4"); // cut VCF
|
||||
|
||||
VRTest lowPass = new VRTest("lowpass.N3.chr1.raw.vcf",
|
||||
"725489156426e4ddd8d623ab3d4b1023", // in vcf
|
||||
"3a7067247146f4a77bb4fd7bc36f94c4", // tranches
|
||||
"9a525c6838bff695321fc7ac0e458f9c", // recalVCF
|
||||
"41e0a16af150244454ee68948ced00fb"); // cut VCF
|
||||
"8937a3ae7f176dacf47b8ee6c0023416", // in vcf
|
||||
"2896657b5c30bfd8e82e62e58d94ef4e", // tranches
|
||||
"a5fe2ee50144ef61121c42daf430381c", // recalVCF
|
||||
"9a35b69bed93894306c87bc9a0bcc116"); // cut VCF
|
||||
|
||||
@DataProvider(name = "VRTest")
|
||||
public Object[][] createData1() {
|
||||
|
|
|
|||
Loading…
Reference in New Issue