Adding header lines to output of VR walkers to settle validator warnings. Command lines are added to the VCF header. GATK version numbers will be added to the header lines by Matt.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4288 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-09-15 16:45:03 +00:00
parent 41fa323e63
commit 0a06fbdb94
7 changed files with 65 additions and 31 deletions

View File

@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.filters;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
@ -71,6 +72,7 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="maskName", shortName="mask", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call; [default:'Mask']", required=false)
protected String MASK_NAME = "Mask";
@Hidden
@Argument(fullName = "NO_HEADER", shortName = "NO_HEADER", doc = "Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", required = false)
protected Boolean NO_VCF_HEADER_LINE = false;

View File

@ -102,11 +102,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/////////////////////////////
// Debugging-only Arguments
/////////////////////////////
@Hidden
@Argument(fullName="no_pg_tag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the recalibrated bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
private boolean NO_PG_TAG = false;
@Hidden
@Argument(fullName="fail_with_no_eof_marker", shortName="requireEOF", required=false, doc="If no EOF marker is present in the covariates file, exit the program with an exception.")
private boolean REQUIRE_EOF = false;
@Hidden
@Argument(fullName="skipUQUpdate", shortName="skipUQUpdate", required=false, doc="If true, we will skip the UQ updating step for each read, speeding up the calculations")
private boolean skipUQUpdate = false;

View File

@ -27,8 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -36,7 +35,6 @@ import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import org.broadinstitute.sting.utils.text.XReadLines;
@ -75,6 +73,13 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
@Argument(fullName="fdr_filter_level", shortName="fdr_filter_level", doc="The FDR level at which to start filtering.", required=false)
private double FDR_FILTER_LEVEL = 0.0;
/////////////////////////////
// Debug Arguments
/////////////////////////////
@Hidden
@Argument(fullName = "NO_HEADER", shortName = "NO_HEADER", doc = "Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", required = false)
protected Boolean NO_VCF_HEADER_LINE = false;
/////////////////////////////
// Private Member Variables
/////////////////////////////
@ -162,15 +167,20 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
// setup the header fields
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFHeaderLineType.Float, "The original variant quality score"));
hInfo.add(new VCFHeaderLine("source", "ApplyVariantCuts"));
if( !NO_VCF_HEADER_LINE ) {
hInfo.add(new VCFHeaderLine("ApplyVariantCuts", "\"" + CommandLineUtils.createApproximateCommandLineArgumentString(getToolkit(), this) + "\""));
}
final TreeSet<String> samples = new TreeSet<String>();
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit()));
for( int iii = 1; iii < filterName.size(); iii++ ) {
hInfo.add(new VCFFilterHeaderLine(filterName.get(iii), String.format("FDR tranche level at qual " + qCuts.get(iii))));
if( filterName.size() >= 2 ) {
for( int iii = 0; iii < filterName.size() - 1; iii++ ) {
hInfo.add(new VCFFilterHeaderLine(filterName.get(iii), String.format("FDR tranche level at qual: " + qCuts.get(iii) + " <= x < " + qCuts.get(iii+1))));
}
}
if( filterName.size() >= 1 ) {
hInfo.add(new VCFFilterHeaderLine(filterName.get(0)+"+", String.format("FDR tranche level at qual < " + qCuts.get(0))));
}
hInfo.add(new VCFFilterHeaderLine(filterName.get(0)+"+", String.format("FDR tranche level at qual > " + qCuts.get(filterName.size()-1))));
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);

View File

@ -27,6 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.commandline.CommandLineUtils;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -76,7 +78,7 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
// private String PATH_TO_RSCRIPT = "Rscript";
// @Argument(fullName = "path_to_resources", shortName = "resources", doc="Path to resources folder holding the Sting R scripts.", required=false)
// private String PATH_TO_RESOURCES = "R/";
@Argument(fullName="weightNovels", shortName="weightNovels", doc="The weight for novel variants during clustering", required=false)
@Argument(fullName="weightNovel", shortName="weightNovel", doc="The weight for novel variants during clustering", required=false)
private double WEIGHT_NOVELS = 0.0;
@Argument(fullName="weightDBSNP", shortName="weightDBSNP", doc="The weight for dbSNP variants during clustering", required=false)
private double WEIGHT_DBSNP = 0.0;
@ -100,6 +102,14 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
/////////////////////////////
// Debug Arguments
/////////////////////////////
@Hidden
@Argument(fullName = "NO_HEADER", shortName = "NO_HEADER", doc = "Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", required = false)
protected Boolean NO_HEADER_LINE = false;
/////////////////////////////
// Private Member Variables
/////////////////////////////
@ -115,7 +125,7 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
//---------------------------------------------------------------------------------------------------------------
public void initialize() {
// if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
//if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
annotationKeys = new ExpandingArrayList<String>(Arrays.asList(USE_ANNOTATIONS));
@ -123,6 +133,11 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
}
if( !NO_HEADER_LINE ) {
CLUSTER_FILE.print("##GenerateVariantClusters = ");
CLUSTER_FILE.println("\"" + CommandLineUtils.createApproximateCommandLineArgumentString(getToolkit(), this) + "\"");
}
boolean foundDBSNP = false;
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
if( d.getName().startsWith("input") ) {
@ -235,7 +250,7 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
default:
throw new UserException.BadArgumentValue("OPTIMIZATION_MODEL", "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
}
theModel.run( CLUSTER_FILE );
/*

View File

@ -78,6 +78,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
private final double[] hyperParameter_b;
private final double[] hyperParameter_lambda;
private static final Pattern COMMENT_PATTERN = Pattern.compile("^##.*");
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
@ -130,7 +131,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
annotationLines.add(line);
} else if( CLUSTER_PATTERN.matcher(line).matches() ) {
clusterLines.add(line);
} else {
} else if( !COMMENT_PATTERN.matcher(line).matches() ) {
throw new UserException.MalformedFile(clusterFile, "Could not parse line: " + line);
}
}
@ -417,9 +418,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
public double decodeAnnotation( final String annotationKey, final VariantContext vc, final boolean jitter ) {
double value;
//if( annotationKey.equals("AB") && !vc.getAttributes().containsKey(annotationKey) ) {
// value = (0.5 - 0.005) + (0.01 * rand.nextDouble()); // HomVar calls don't have an allele balance
//}
if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // HRun values must be jittered a bit to work in this GMM
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
value += -0.25 + 0.5 * rand.nextDouble();
@ -448,7 +446,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
double sum = 0.0;
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
sum += pVarInCluster[kkk]; // * clusterTruePositiveRate[kkk];
sum += pVarInCluster[kkk];
}
return sum;

View File

@ -28,8 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
@ -39,7 +38,6 @@ import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
@ -75,7 +73,6 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Output(doc="File to which recalibrated variants should be written",required=true)
private VCFWriter vcfWriter = null;
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@ -89,7 +86,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private boolean IGNORE_ALL_INPUT_FILTERS = false;
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
private String[] IGNORE_INPUT_FILTERS = null;
@Argument(fullName="priorNovels", shortName="priorNovels", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
@Argument(fullName="priorNovel", shortName="priorNovel", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
private double PRIOR_NOVELS = 2.0;
@Argument(fullName="priorDBSNP", shortName="priorDBSNP", doc="A prior on the quality of dbSNP variants, a phred scaled probability of being true.", required=false)
private double PRIOR_DBSNP = 10.0;
@ -110,6 +107,13 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Argument(fullName="quality_scale_factor", shortName="qScale", doc="Multiply all final quality scores by this value. Needed to normalize the quality scores.", required=false)
private double QUALITY_SCALE_FACTOR = 100.0;
/////////////////////////////
// Debug Arguments
/////////////////////////////
@Hidden
@Argument(fullName = "NO_HEADER", shortName = "NO_HEADER", doc = "Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", required = false)
protected Boolean NO_VCF_HEADER_LINE = false;
/////////////////////////////
// Private Member Variables
/////////////////////////////
@ -173,7 +177,10 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
final TreeSet<String> samples = new TreeSet<String>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFHeaderLineType.Float, "The original variant quality score"));
hInfo.add(new VCFHeaderLine("source", "VariantRecalibrator"));
hInfo.add(new VCFInfoHeaderLine("LOD", 1, VCFHeaderLineType.Float, "The log odds ratio calculated by the VR algorithm which was turned into the phred scaled recalibrated quality score"));
if( !NO_VCF_HEADER_LINE ) {
hInfo.add(new VCFHeaderLine("VariantRecalibrator", "\"" + CommandLineUtils.createApproximateCommandLineArgumentString(getToolkit(), this) + "\""));
}
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);

View File

@ -23,6 +23,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" -NO_HEADER" +
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/genotypes_r27_nr.b36_fwd.vcf" +
" -weightDBSNP 0.2 -weightHapMap 1.0" +
@ -44,9 +45,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
public void testVariantRecalibrator() {
HashMap<String, List<String>> e = new HashMap<String, List<String>>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf",
Arrays.asList("9e4509be067284eefc31e91378faba6a", "038c31c5bb46a4df89b8ee69ec740812","7d42bbdfb69fdfb18cbda13a63d92602")); // Each test checks the md5 of three output files
Arrays.asList("0c6b5085a678b6312ab4bc8ce7b4eee4", "038c31c5bb46a4df89b8ee69ec740812","7d42bbdfb69fdfb18cbda13a63d92602")); // Each test checks the md5 of three output files
e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf",
Arrays.asList("80243bd6731f6b0341fa1762c095a72e", "661360e85392af9c97e386399871854a","371e5a70a4006420737c5ab259e0e23e")); // Each test checks the md5 of three output files
Arrays.asList("bbdffb7fa611f4ae80e919cdf86b9bc6", "661360e85392af9c97e386399871854a","371e5a70a4006420737c5ab259e0e23e")); // Each test checks the md5 of three output files
for ( Map.Entry<String, List<String>> entry : e.entrySet() ) {
String vcf = entry.getKey();
@ -56,6 +57,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
if ( clusterFile != null ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" -NO_HEADER" +
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/genotypes_r27_nr.b36_fwd.vcf" +
" -T VariantRecalibrator" +
@ -69,22 +71,20 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
" -o %s" +
" -tranchesFile %s" +
" -reportDatFile %s",
3, // two output file
3, // three output files
md5s);
List<File> result = executeTest("testVariantRecalibrator", spec).getFirst();
inputVCFFiles.put(vcf, result.get(0).getAbsolutePath());
tranchesFiles.put(vcf, result.get(1).getAbsolutePath());
}
}
}
@Test
public void testApplyVariantCuts() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "5fc641e291ad7330985ad94d4a347511" );
e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf", "d9e00e6fe8269b3218880d2c084804c6" );
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "7429ed494131eb1dab5a9169cf65d1f0" );
e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf", "ad8661cba3b04a7977c97a541fd8a668" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();
@ -96,6 +96,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
if ( inputVCFFile != null && tranchesFile != null ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" -NO_HEADER" +
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
" -T ApplyVariantCuts" +
" -L 1:20,000,000-100,000,000" +