VariantFiltration now accepts a VCF rod in addition to an input geli. It will then annotate this VCF file with filtering information in the INFO field too. --onlyAnnotate will annotate the VCF without writing filtering output to the FILTER field.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2008 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-11-10 13:24:58 +00:00
parent f9819d5f13
commit d316cbad4c
16 changed files with 94 additions and 26 deletions

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.contexts;
import org.broadinstitute.sting.gatk.refdata.RodGeliText;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import net.sf.samtools.SAMRecord;
import java.util.*;
@ -37,19 +38,22 @@ public class VariantContext {
private ReferenceContext ref;
private AlignmentContext context;
private RodGeliText variant;
private RodVCF variantVCF;
private AlignmentContext Q0freeContext = null;
public VariantContext(RefMetaDataTracker tracker, ReferenceContext ref,
AlignmentContext context, RodGeliText variant) {
AlignmentContext context, RodGeliText variant, RodVCF variantVCF) {
this.tracker = tracker;
this.ref = ref;
this.context = context;
this.variant = variant;
this.variantVCF = variantVCF;
}
/** @return the reference metadata tracker backing this variant context */
public RefMetaDataTracker getTracker() {
    return tracker;
}
/** @return the reference context at this locus */
public ReferenceContext getReferenceContext() {
    return ref;
}
/** @return the geli-format variant record for this locus */
public RodGeliText getVariant() {
    return variant;
}
/** @return the VCF rod for this locus, or null when no VCF input was provided */
public RodVCF getVariantVCF() {
    return variantVCF;
}
/**
 * Returns the alignment context for this locus.
 *
 * @param useMQ0Reads if true, the full context (including mapping-quality-zero reads)
 *                    is returned; otherwise the lazily built MQ0-free context is used
 */
public AlignmentContext getAlignmentContext(boolean useMQ0Reads) {
    if ( useMQ0Reads ) {
        return context;
    }
    return getQ0freeContext();
}

View File

@ -80,4 +80,7 @@ public class VECAlleleBalance extends RatioFilter {
return "AlleleBalance";
}
/** Formats the allele-balance ratio (two decimals) as the score written to the VCF INFO field. */
public String getScoreString() {
    final String fmt = "%.2f";
    return String.format(fmt, ratio);
}
}

View File

@ -60,5 +60,9 @@ public class VECClusteredSnps implements VariantExclusionCriterion {
return "ClusteredSnp";
}
/** Reports the clustered-SNP distance as the score written to the VCF INFO field. */
public String getScoreString() {
    final String fmt = "%d";
    return String.format(fmt, distance);
}
/** Mapping-quality-zero reads are excluded when computing this criterion. */
public boolean useZeroQualityReads() {
    return false;
}
}

View File

@ -50,5 +50,10 @@ public class VECDepthOfCoverage implements VariantExclusionCriterion {
return "DoC";
}
/** Reports the observed depth of coverage as the score written to the VCF INFO field. */
public String getScoreString() {
    final String fmt = "%d";
    return String.format(fmt, depth);
}
/** Mapping-quality-zero reads are excluded when computing this criterion. */
public boolean useZeroQualityReads() {
    return false;
}
}

View File

@ -51,6 +51,10 @@ public class VECFisherStrand implements VariantExclusionCriterion {
return "strand";
}
/** Formats the Fisher strand-test p-value (four decimals) as the score written to the VCF INFO field. */
public String getScoreString() {
    final String fmt = "%.4f";
    return String.format(fmt, pValue);
}
/** Mapping-quality-zero reads are excluded when computing this criterion. */
public boolean useZeroQualityReads() {
    return false;
}
public boolean strandTest(char ref, AlignmentContext context, int allele1, int allele2, double threshold, StringBuffer out) {

View File

@ -119,4 +119,8 @@ public class VECHomopolymer implements VariantExclusionCriterion {
/** Token written to the VCF FILTER field when this criterion excludes a variant. */
public String getVCFFilterString() {
    final String filterName = "homopolymer";
    return filterName;
}
/** Binary score for the VCF INFO field: "1" when the variant is excluded, "0" otherwise. */
public String getScoreString() {
    if ( exclude ) {
        return "1";
    }
    return "0";
}
}

View File

@ -59,5 +59,9 @@ public class VECIndelArtifact implements VariantExclusionCriterion {
return "InDel";
}
/** Binary score for the VCF INFO field: "1" when the variant is excluded, "0" otherwise. */
public String getScoreString() {
    if ( exclude ) {
        return "1";
    }
    return "0";
}
/** Mapping-quality-zero reads are excluded when computing this criterion. */
public boolean useZeroQualityReads() {
    return false;
}
}

View File

@ -38,4 +38,8 @@ public class VECLodThreshold implements VariantExclusionCriterion {
/** Token written to the VCF FILTER field when this criterion excludes a variant. */
public String getVCFFilterString() {
    final String filterName = "LOD";
    return filterName;
}
/** Formats the LOD score (two decimals) as the score written to the VCF INFO field. */
public String getScoreString() {
    final String fmt = "%.2f";
    return String.format(fmt, lod);
}
}

View File

@ -37,6 +37,10 @@ public class VECLodThresholdByCoverage implements VariantExclusionCriterion {
return "LODbyDoC";
}
/** Binary score for the VCF INFO field: "1" when the variant is excluded, "0" otherwise. */
public String getScoreString() {
    if ( exclude ) {
        return "1";
    }
    return "0";
}
/** Mapping-quality-zero reads are excluded when computing this criterion. */
public boolean useZeroQualityReads() {
    return false;
}

View File

@ -45,5 +45,9 @@ public class VECMappingQuality implements VariantExclusionCriterion {
return "MQ";
}
/** Formats the RMS mapping quality (two decimals) as the score written to the VCF INFO field. */
public String getScoreString() {
    final String fmt = "%.2f";
    return String.format(fmt, rms);
}
/** Mapping-quality-zero reads are included — this criterion is computed over all reads. */
public boolean useZeroQualityReads() {
    return true;
}
}

View File

@ -45,5 +45,9 @@ public class VECMappingQualityZero implements VariantExclusionCriterion {
return "MQzero";
}
/** Reports the count of mapping-quality-zero reads as the score written to the VCF INFO field. */
public String getScoreString() {
    final String fmt = "%d";
    return String.format(fmt, mq0Count);
}
/** Mapping-quality-zero reads are included — this criterion counts them directly. */
public boolean useZeroQualityReads() {
    return true;
}
}

View File

@ -24,6 +24,10 @@ public class VECNull implements VariantExclusionCriterion {
return "";
}
/** Null criterion reports no score: always the empty string. */
public String getScoreString() {
    final String emptyScore = "";
    return emptyScore;
}
/** Mapping-quality-zero reads are excluded (irrelevant for the null criterion). */
public boolean useZeroQualityReads() {
    return false;
}

View File

@ -75,4 +75,8 @@ public class VECOnOffGenotypeRatio extends RatioFilter {
/** Token written to the VCF FILTER field when this criterion excludes a variant. */
public String getVCFFilterString() {
    final String filterName = "onOffGenotype";
    return filterName;
}
/** Formats the on/off genotype ratio (two decimals) as the score written to the VCF INFO field. */
public String getScoreString() {
    final String fmt = "%.2f";
    return String.format(fmt, ratio);
}
}

View File

@ -12,6 +12,7 @@ public interface VariantExclusionCriterion {
/** One-line header describing this criterion for study output. */
public String getStudyHeader();
/** Per-variant details for study output. */
public String getStudyInfo();
/** Token written to the VCF FILTER field when this criterion excludes a variant. */
public String getVCFFilterString();
/** Score value reported for this criterion; paired with getVCFFilterString() in the VCF INFO field. */
public String getScoreString();
/** Whether mapping-quality-zero reads should be included when computing this criterion. */
public boolean useZeroQualityReads();
}

View File

@ -21,6 +21,7 @@ import java.io.*;
* variant outright. At the moment, the variants are expected to be in gelitext format.
*/
@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type= RodGeliText.class))
//@Allows(value={DataSource.READS, DataSource.REFERENCE}, referenceMetaData={@RMD(name="variant",type= RodGeliText.class), @RMD(name="variantvcf",type= RodVCF.class), @RMD(name="dbsnp",type=rodDbSNP.class), @RMD(name="interval",type= IntervalRod.class)})
public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
@Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true) public File VCF_OUT;
@Argument(fullName="sampleName", shortName="sample", doc="Temporary hack to get VCF to work: the sample (NA-ID) corresponding to the variants", required=true) public String sampleName;
@ -31,6 +32,7 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
@Argument(fullName="inclusion_threshold", shortName="IT", doc="The product of the probability to include variants based on these filters must be greater than the value specified here in order to be included", required=false) public Double INCLUSION_THRESHOLD = 0.9;
@Argument(fullName="verbose", shortName="V", doc="Show how the variant likelihoods are changing with the application of each feature") public Boolean VERBOSE = false;
@Argument(fullName="list", shortName="ls", doc="List the available features and exclusion criteria and exit") public Boolean LIST = false;
@Argument(fullName="onlyAnnotate", shortName="oa", doc="If provided, do not write output into the FILTER field, but only annotate the VCF") public boolean onlyAnnotate = false;
private List<Class<? extends IndependentVariantFeature>> featureClasses;
private List<Class<? extends VariantExclusionCriterion>> exclusionClasses;
@ -220,7 +222,8 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
if ( variant == null || BaseUtils.simpleBaseToBaseIndex(ref.getBase()) == -1 )
return 0;
VariantContext varContext = new VariantContext(tracker, ref, context, variant);
RodVCF variantVCF = (RodVCF) tracker.lookup("variantVCF", null);
VariantContext varContext = new VariantContext(tracker, ref, context, variant, variantVCF);
// if we're still initializing the context, do so
if ( windowInitializer != null ) {
@ -242,22 +245,20 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
VariantContext context = variantContextWindow.getContext();
if ( context == null )
return;
RodGeliText variant = context.getVariant();
HashMap<String, Double> exclusionResults = new HashMap<String, Double>();
RodGeliText variant = context.getVariant();
GenomeLoc loc = context.getAlignmentContext(true).getLocation();
if (VERBOSE) { out.println("Original:\n" + variant); }
GenomeLoc loc = context.getAlignmentContext(true).getLocation();
if ( annotatedWriter != null )
annotatedWriter.print(loc.getContig() + "\t" + loc.getStart() + "\t");
// Apply features that modify the likelihoods and LOD scores
for ( IndependentVariantFeature ivf : requestedFeatures ) {
ivf.compute(variantContextWindow);
double[] weights = ivf.getLikelihoods();
variant.adjustLikelihoods(weights);
if (VERBOSE) { out.println(rationalizeClassName(ivf.getClass()) + ":\n " + variant); }
@ -273,6 +274,7 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
// Use the filters to score the variant
String filterFailureString = "";
HashMap<String,String> filterINFOString = new HashMap<String,String>();
double jointInclusionProbability = 1.0;
for ( VariantExclusionCriterion vec : requestedExclusions ) {
vec.compute(variantContextWindow);
@ -286,6 +288,7 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
if (inclusionProbability < INCLUSION_THRESHOLD) {
filterFailureString += vec.getVCFFilterString() + ";";
}
filterINFOString.put(vec.getVCFFilterString(), vec.getScoreString());
if (VERBOSE) {
out.print(exclusionClassName + "=" + inclusionProbability + ";");
@ -298,14 +301,11 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
// Decide whether we should keep the call or not
if (jointInclusionProbability >= INCLUSION_THRESHOLD) {
includedWriter.println(variant);
if (VERBOSE) { out.println("] JointInclusionProbability:" + jointInclusionProbability + " State:included\n"); }
} else {
if (VERBOSE) { out.println("] JointInclusionProbability:" + jointInclusionProbability + " State:excluded\n"); }
}
if (VERBOSE) { out.println("] JointInclusionProbability:" + jointInclusionProbability + " State:included\n"); }
rodDbSNP dbsnp = (rodDbSNP) context.getTracker().lookup("dbSNP", null);
if ( annotatedWriter != null ) {
rodDbSNP dbsnp = (rodDbSNP) context.getTracker().lookup("dbSNP", null);
if ( dbsnp == null )
annotatedWriter.print("false\tfalse\t");
else
@ -313,16 +313,31 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
annotatedWriter.println(GenotypeUtils.isHet(variant));
}
List<VCFGenotypeRecord> gt = new ArrayList<VCFGenotypeRecord>();
Map<VCFHeader.HEADER_FIELDS,String> map = new HashMap<VCFHeader.HEADER_FIELDS,String>();
VCFRecord rec = VariantsToVCF.generateVCFRecord(context.getTracker(), context.getReferenceContext(), context.getAlignmentContext(true), vcfHeader, gt, map, sampleNames, out, false, false);
if ( rec != null) {
if ( !filterFailureString.equals("") )
rec.setFilterString(filterFailureString);
writeVCF(context, filterFailureString, filterINFOString);
}
/**
 * Writes the annotated VCF record for this variant context.
 *
 * Every exclusion criterion's score is added to the INFO field; the FILTER field is
 * only populated when filtering actually failed AND --onlyAnnotate was not requested.
 *
 * @param context             the variant context being emitted
 * @param filterFailureString semicolon-joined names of failed filters ("" when none failed)
 * @param filterINFOString    map of filter name to score string for the INFO field
 */
private void writeVCF(VariantContext context, final String filterFailureString, HashMap<String,String> filterINFOString) {
    final VCFRecord record = getVCFRecord(context);
    if ( record == null ) {
        return; // no record could be produced for this locus
    }
    record.addInfoFields(filterINFOString);
    final boolean hasFailures = !filterFailureString.equals("");
    if ( hasFailures && !onlyAnnotate ) {
        record.setFilterString(filterFailureString);
    }
    vcfWriter.addRecord(record);
}
/**
 * Obtains the VCF record for this context: reuses the record from the input VCF rod
 * when one was supplied, otherwise generates a fresh record from the geli variant.
 *
 * @param context the variant context for the current locus
 * @return the VCF record to annotate, or null if none could be generated
 */
private VCFRecord getVCFRecord(VariantContext context) {
    final RodVCF vcfRod = context.getVariantVCF();
    if ( vcfRod != null ) {
        return vcfRod.mCurrentRecord;
    }
    final List<VCFGenotypeRecord> genotypes = new ArrayList<VCFGenotypeRecord>();
    final Map<VCFHeader.HEADER_FIELDS,String> headerFields = new HashMap<VCFHeader.HEADER_FIELDS,String>();
    return VariantsToVCF.generateVCFRecord(context.getTracker(), context.getReferenceContext(), context.getAlignmentContext(true), vcfHeader, genotypes, headerFields, sampleNames, out, false, false);
}
/**
* Increment the number of loci processed.
*

View File

@ -8,63 +8,63 @@ import java.util.Arrays;
public class VariantFiltrationIntegrationTest extends WalkerTest {
@Test
public void testIntervals() {
String[] md5DoC = {"c0a7e2fc07d565e633b3064f9f3cdaf5", "21c8e1f9dc65fdfb39347547f9b04011"};
String[] md5DoC = {"d11547059193b90e2fd1eb47dac18999", "21c8e1f9dc65fdfb39347547f9b04011"};
WalkerTestSpec spec1 = new WalkerTestSpec(
"-T VariantFiltration -X DepthOfCoverage:max=70 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,
Arrays.asList(md5DoC));
executeTest("testDoCFilter", spec1);
String[] md5AlleleBalance = {"aa0f7800cfd346236620ae0eac220817", "a13e4ce6260bf9f33ca99dc808b8e6ad"};
String[] md5AlleleBalance = {"409f2b7fb193919d28c20be4dc5c4516", "a13e4ce6260bf9f33ca99dc808b8e6ad"};
WalkerTestSpec spec2 = new WalkerTestSpec(
"-T VariantFiltration -X AlleleBalance:low=0.25,high=0.75 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,
Arrays.asList(md5AlleleBalance));
executeTest("testAlleleBalanceFilter", spec2);
String[] md5Strand = {"9f430f251dbeb58a2f80a1306a5dd492", "0f7db0aad764268ee8fa3b857df8d87d"};
String[] md5Strand = {"e5806925867088edeee9f6276ab030f3", "0f7db0aad764268ee8fa3b857df8d87d"};
WalkerTestSpec spec3 = new WalkerTestSpec(
"-T VariantFiltration -X FisherStrand:pvalue=0.0001 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,
Arrays.asList(md5Strand));
executeTest("testStrandFilter", spec3);
String[] md5Lod = {"56177258c0b3944c043f86faee4b42ae", "7e0c4f2b0fda85fd2891eee76c396a55"};
String[] md5Lod = {"2c93f56e07e320ac7a012bd4940b1dbc", "7e0c4f2b0fda85fd2891eee76c396a55"};
WalkerTestSpec spec4 = new WalkerTestSpec(
"-T VariantFiltration -X LodThreshold:lod=10 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,
Arrays.asList(md5Lod));
executeTest("testLodFilter", spec4);
String[] md5MQ0 = {"0e303c32f5c1503f4c875771f28fc46c", "3203de335621851bccf596242b079e23"};
String[] md5MQ0 = {"fde4be5eec12be3405863649170aaaac", "3203de335621851bccf596242b079e23"};
WalkerTestSpec spec5 = new WalkerTestSpec(
"-T VariantFiltration -X MappingQualityZero:max=70 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,
Arrays.asList(md5MQ0));
executeTest("testMappingQuality0Filter", spec5);
String[] md5MQ = {"946462a6199e9453784e0942e18e6830", "ecc777feedea61f7b570d114c2ab89b1"};
String[] md5MQ = {"17c35a5ccd814991dbcd3be6c2f244cd", "ecc777feedea61f7b570d114c2ab89b1"};
WalkerTestSpec spec6 = new WalkerTestSpec(
"-T VariantFiltration -X MappingQuality:min=20 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,
Arrays.asList(md5MQ));
executeTest("testRMSMappingQualityFilter", spec6);
String[] md5OnOff = {"2ff84e104ce73e347e55d272170b4d03", "67f2e1bc025833b0fa31f47195198997"};
String[] md5OnOff = {"fc649a7ebd4a7b7fa739fc4fcb7704e2", "67f2e1bc025833b0fa31f47195198997"};
WalkerTestSpec spec7 = new WalkerTestSpec(
"-T VariantFiltration -X OnOffGenotypeRatio:threshold=0.9 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,
Arrays.asList(md5OnOff));
executeTest("testOnOffGenotypeFilter", spec7);
String[] md5Clusters = {"e6a1c088678b1c31ff340ebd622b476e", "8fa6b6ffc93ee7fb8d6b52a7fb7815ef"};
String[] md5Clusters = {"9b3aede689a25431ed2e1aac9dd73dc6", "8fa6b6ffc93ee7fb8d6b52a7fb7815ef"};
WalkerTestSpec spec8 = new WalkerTestSpec(
"-T VariantFiltration -X ClusteredSnps:window=10,snps=3 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,
Arrays.asList(md5Clusters));
executeTest("testClusteredSnpsFilter", spec8);
String[] md5Indels = {"82e555b76c12474154f8e5e402516d73", "8e0e915a1cb63d7049e0671ed00101fe"};
String[] md5Indels = {"c36cb0021f90b3509e483b57f4f41f6e", "8e0e915a1cb63d7049e0671ed00101fe"};
WalkerTestSpec spec9 = new WalkerTestSpec(
"-T VariantFiltration -X IndelArtifact -B indels,PointIndel,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.indels -B cleaned,CleanedOutSNP,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.realigner_badsnps -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chr1_10mb_11mb.slx.geli.calls -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -vcf %s -included %s -sample NA12878",
2,