CombineVariants now outputs the command line in the VCF header. Added a new hidden argument to VR walkers called --NoByHapMapValidationStatus to turn off the by-hapmap dbsnp rod behavior. Very useful for experimenting with which sets to use as training data.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4307 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6f6d2eb31f
commit
7e58d8ed61
|
|
@ -74,10 +74,6 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
private int MAX_GAUSSIANS = 4;
|
||||
@Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of iterations to be performed when clustering. Clustering will normally end when convergence is detected.", required=false)
|
||||
private int MAX_ITERATIONS = 200;
|
||||
// @Argument(fullName = "path_to_Rscript", shortName="Rscript", doc="The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||
// private String PATH_TO_RSCRIPT = "Rscript";
|
||||
// @Argument(fullName = "path_to_resources", shortName = "resources", doc="Path to resources folder holding the Sting R scripts.", required=false)
|
||||
// private String PATH_TO_RESOURCES = "R/";
|
||||
@Argument(fullName="weightNovel", shortName="weightNovel", doc="The weight for novel variants during clustering", required=false)
|
||||
private double WEIGHT_NOVELS = 0.0;
|
||||
@Argument(fullName="weightDBSNP", shortName="weightDBSNP", doc="The weight for dbSNP variants during clustering", required=false)
|
||||
|
|
@ -97,18 +93,15 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algoirthm.", required=false)
|
||||
private double DIRICHLET_PARAMETER = 1000.0;
|
||||
|
||||
//@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
|
||||
//private int NUM_KNN = 2000;
|
||||
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
|
||||
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Debug Arguments
|
||||
/////////////////////////////
|
||||
@Hidden
|
||||
@Argument(fullName = "NO_HEADER", shortName = "NO_HEADER", doc = "Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", required = false)
|
||||
protected Boolean NO_HEADER_LINE = false;
|
||||
@Hidden
|
||||
@Argument(fullName = "NoByHapMapValidationStatus", shortName = "NoByHapMapValidationStatus", doc = "Don't consider sites in dbsnp rod tagged as by-hapmap validation status as real HapMap sites. FOR DEBUGGING PURPOSES ONLY.", required=false)
|
||||
private Boolean NO_BY_HAPMAP_VALIDATION_STATUS = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
|
|
@ -116,7 +109,8 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
private ExpandingArrayList<String> annotationKeys;
|
||||
private Set<String> ignoreInputFilterSet = null;
|
||||
private Set<String> inputNames = new HashSet<String>();
|
||||
VariantGaussianMixtureModel theModel = new VariantGaussianMixtureModel();
|
||||
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
|
||||
private VariantGaussianMixtureModel theModel = new VariantGaussianMixtureModel();
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -195,7 +189,7 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
|
||||
variantDatum.isKnown = ( dbsnp != null );
|
||||
variantDatum.weight = WEIGHT_NOVELS;
|
||||
if( vcHapMap != null || ( dbsnp != null && DbSNPHelper.isHapmap(dbsnp) ) ) {
|
||||
if( vcHapMap != null || ( !NO_BY_HAPMAP_VALIDATION_STATUS && dbsnp != null && DbSNPHelper.isHapmap(dbsnp) ) ) {
|
||||
variantDatum.weight = WEIGHT_HAPMAP;
|
||||
} else if( vc1KG != null ) {
|
||||
variantDatum.weight = WEIGHT_1KG;
|
||||
|
|
@ -253,23 +247,5 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
}
|
||||
|
||||
theModel.run( CLUSTER_FILE );
|
||||
|
||||
/*
|
||||
theModel.outputClusterReports( CLUSTER_FILENAME );
|
||||
|
||||
for( final String annotation : annotationKeys ) {
|
||||
// Execute Rscript command to plot the optimization curve
|
||||
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
||||
final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_ClusterReport.R" + " " + CLUSTER_FILENAME + "." + annotation + ".dat " + annotation;
|
||||
logger.info( rScriptCommandLine );
|
||||
|
||||
// Execute the RScript command to plot the table of truth values
|
||||
try {
|
||||
Runtime.getRuntime().exec( rScriptCommandLine );
|
||||
} catch ( IOException e ) {
|
||||
Utils.warnUser("Unable to execute the RScript command because of [" + e.getMessage() + "]. While not critical to the calculations themselves, the script outputs a report that is extremely useful for confirming that the clustering proceded as expected. We highly recommend trying to rerun the script manually if possible.");
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -112,7 +112,10 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
/////////////////////////////
|
||||
@Hidden
|
||||
@Argument(fullName = "NO_HEADER", shortName = "NO_HEADER", doc = "Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", required = false)
|
||||
protected Boolean NO_VCF_HEADER_LINE = false;
|
||||
protected Boolean NO_VCF_HEADER_LINE = false;
|
||||
@Hidden
|
||||
@Argument(fullName = "NoByHapMapValidationStatus", shortName = "NoByHapMapValidationStatus", doc = "Don't consider sites in dbsnp rod tagged as by-hapmap validation status as real HapMap sites. FOR DEBUGGING PURPOSES ONLY.", required=false)
|
||||
private Boolean NO_BY_HAPMAP_VALIDATION_STATUS = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
|
|
@ -156,13 +159,12 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
logger.info("Found input variant track with name " + d.getName());
|
||||
} else if ( d.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
|
||||
logger.info("Found dbSNP track with prior probability = Q" + PRIOR_DBSNP);
|
||||
logger.info("\tsites in dbSNP track tagged with by-hapmap validation status will be given prior probability = Q" + PRIOR_HAPMAP);
|
||||
foundDBSNP = true;
|
||||
} else if ( d.getName().equals("hapmap") ) {
|
||||
logger.info("Found HapMap track with prior probability = Q" + PRIOR_HAPMAP);
|
||||
foundDBSNP = true;
|
||||
} else if ( d.getName().equals("1kg") ) {
|
||||
logger.info("Found 1KG track for with prior probability = Q" + PRIOR_1KG);
|
||||
foundDBSNP = true;
|
||||
} else {
|
||||
logger.info("Not evaluating ROD binding " + d.getName());
|
||||
}
|
||||
|
|
@ -223,7 +225,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
|
||||
variantDatum.isKnown = ( dbsnp != null );
|
||||
double knownPrior_qScore = PRIOR_NOVELS;
|
||||
if( vcHapMap != null || ( dbsnp != null && DbSNPHelper.isHapmap(dbsnp) ) ) {
|
||||
if( vcHapMap != null || ( !NO_BY_HAPMAP_VALIDATION_STATUS && dbsnp != null && DbSNPHelper.isHapmap(dbsnp) ) ) {
|
||||
knownPrior_qScore = PRIOR_HAPMAP;
|
||||
} else if( vc1KG != null ) {
|
||||
knownPrior_qScore = PRIOR_1KG;
|
||||
|
|
|
|||
|
|
@ -27,6 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.variantutils;
|
|||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.CommandLineUtils;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
|
|
@ -79,6 +81,10 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
@Argument(fullName="setKey", shortName="setKey", doc="Key, by default set, in the INFO key=value tag emitted describing which set the combined VCF record came from. Set to null if you don't want the set field emitted.", required=false)
|
||||
public String SET_KEY = "set";
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "NO_HEADER", shortName = "NO_HEADER", doc = "Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", required = false)
|
||||
protected Boolean NO_VCF_HEADER_LINE = false;
|
||||
|
||||
private List<String> priority = null;
|
||||
|
||||
private VariantAnnotatorEngine engine;
|
||||
|
|
@ -97,9 +103,11 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
SET_KEY = null;
|
||||
|
||||
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
|
||||
headerLines.add(new VCFHeaderLine("source", "CombineVariants"));
|
||||
if ( SET_KEY != null )
|
||||
headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants"));
|
||||
if ( !NO_VCF_HEADER_LINE ) {
|
||||
headerLines.add(new VCFHeaderLine("CombineVariants", "\"" + CommandLineUtils.createApproximateCommandLineArgumentString(getToolkit(), this) + "\""));
|
||||
}
|
||||
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -361,7 +361,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testCountCovariatesFailWithoutDBSNP() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "3700eaf567e4937f442fc777a226d6ad");
|
||||
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "");
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ import java.util.Arrays;
|
|||
public class CombineVariantsIntegrationTest extends WalkerTest {
|
||||
|
||||
public static String baseTestString(String args) {
|
||||
return "-T CombineVariants -L 1:1-50,000,000 -o %s -R " + b36KGReference + args;
|
||||
return "-T CombineVariants -NO_HEADER -L 1:1-50,000,000 -o %s -R " + b36KGReference + args;
|
||||
}
|
||||
|
||||
public void test1InOut(String file, String md5) {
|
||||
|
|
@ -60,21 +60,21 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
|
|||
}
|
||||
|
||||
|
||||
@Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "f203232c2fa51862e911940ad9d60387"); }
|
||||
@Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "29aa605c3f08f3ad58f5eea64dc709c1", " -setKey foo"); }
|
||||
@Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "445bff7b13b31c900492f1ccaed62a80", " -setKey null"); }
|
||||
@Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "38b7e64b91c726867a604cf95b9cb10a"); } // official project VCF files in tabix format
|
||||
@Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "def6365e318cb506aa69ee8ec6899c4e"); }
|
||||
@Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "6d1d9fffcc10d4f9ab086760603b577e", " -setKey foo"); }
|
||||
@Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "e1d43e0c4df65df5e9761916b12f0b5b", " -setKey null"); }
|
||||
@Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "79e7e474b0a2bb82930201f143328d5d"); } // official project VCF files in tabix format
|
||||
|
||||
@Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "cdf66ad481b7f98204e368b968d6d8ec"); }
|
||||
@Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "17c8468b1b963c9abc49dff17fd811ba"); }
|
||||
@Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "d93ee8b9f36eedc80a3afa548bffc888"); }
|
||||
@Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "4d16ffe9cf3beff94ac05ea49f85049a"); }
|
||||
|
||||
@Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "27b509c5c64c5b7520d348bceaca67f5"); } // official project VCF files in tabix format
|
||||
@Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "7ad58764b855ec7ad61075dda63567b3"); } // official project VCF files in tabix format
|
||||
@Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "a438947372f5e748931d9c5e2ba5fc3a"); }
|
||||
@Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "78f2a2c09df8fa51ccfe5b3706a5e313"); } // official project VCF files in tabix format
|
||||
@Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "460bdbeaf7e9641395ac2ce6e1afc106"); } // official project VCF files in tabix format
|
||||
@Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "d445583e7f3a8bc98004e8e44878fce6"); }
|
||||
|
||||
@Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "19f8c7ed0ed3b59c53ac76164679b7f5"); }
|
||||
@Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e64284f598fc4f473ee5554282d84b0c"); }
|
||||
|
||||
@Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "96383d62d6b9f0e7ee2d3637a985af28"); }
|
||||
@Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "1f3d118dcee9b711714856639930694d"); }
|
||||
|
||||
@Test public void threeWayWithRefs() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
|
|
@ -87,7 +87,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
|
|||
" -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" +
|
||||
" -genotypeMergeOptions UNIQUIFY -L 1"),
|
||||
1,
|
||||
Arrays.asList("daf2c43b629c9fc8d5f064e05bbc51b7"));
|
||||
Arrays.asList("81f53a9f2a6d410c1236266e2516bf6f"));
|
||||
executeTest("threeWayWithRefs", spec);
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue