Support for automated selects for tranches in variant eval -- use -tf to make tranch-specific ve outputs. ApplyVariantCuts with tranche reading functions for general use, along with todo for ryan. CombineVariants now has --filteredAreUncalled and will treat filtered snps in input VCFs are uncalled, and so won't emit -filteredInOther set features

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3908 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-07-30 14:16:43 +00:00
parent 9231d13252
commit ac8048f17b
6 changed files with 94 additions and 14 deletions

View File

@ -77,6 +77,13 @@ public class VariantContextUtils {
return VariantContextUtils.initializeMatchExps(map);
}
public static List<JexlVCMatchExp> initializeMatchExps(ArrayList<String> names, ArrayList<String> exps) {
String[] nameArray = new String[names.size()];
String[] expArray = new String[exps.size()];
return initializeMatchExps(names.toArray(nameArray), exps.toArray(expArray));
}
/**
* Method for creating JexlVCMatchExp from input walker arguments mapping from names to exps. These two arrays contain
* the name associated with each JEXL expression. initializeMatchExps will parse each expression and return
@ -191,12 +198,13 @@ public class VariantContextUtils {
VariantMergeType variantMergeOptions, GenotypeMergeType genotypeMergeOptions,
boolean annotateOrigin, boolean printMessages, byte inputRefBase ) {
return simpleMerge(unsortedVCs, priorityListOfVCs, variantMergeOptions, genotypeMergeOptions, annotateOrigin, printMessages, inputRefBase, "set");
return simpleMerge(unsortedVCs, priorityListOfVCs, variantMergeOptions, genotypeMergeOptions, annotateOrigin, printMessages, inputRefBase, "set", false);
}
public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs,
VariantMergeType variantMergeOptions, GenotypeMergeType genotypeMergeOptions,
boolean annotateOrigin, boolean printMessages, byte inputRefBase, String setKey ) {
public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs,
VariantMergeType variantMergeOptions, GenotypeMergeType genotypeMergeOptions,
boolean annotateOrigin, boolean printMessages, byte inputRefBase, String setKey,
boolean filteredAreUncalled ) {
if ( unsortedVCs == null || unsortedVCs.size() == 0 )
return null;
@ -211,9 +219,13 @@ public static VariantContext simpleMerge(Collection<VariantContext> unsortedVCs,
List<VariantContext> VCs = new ArrayList<VariantContext>();
for (VariantContext vc : prepaddedVCs) {
VCs.add(createVariantContextWithPaddedAlleles(vc,inputRefBase));
// also a reasonable place to remove filtered calls, if needed
if ( ! filteredAreUncalled || vc.isNotFiltered() )
VCs.add(createVariantContextWithPaddedAlleles(vc,inputRefBase));
}
if ( VCs.size() == 0 ) // everything is filtered out and we're filteredareUncalled
return null;
// establish the baseline info from the first VC
VariantContext first = VCs.get(0);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The Broad Institute
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@ -12,15 +12,14 @@
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;

View File

@ -38,7 +38,7 @@ import java.util.*;
@Analysis(name = "Genotype Concordance", description = "Determine the genotype concordance between the genotypes in difference tracks")
public class GenotypeConcordance extends VariantEvaluator {
private static final boolean PRINT_INTERESTING_SITES = false;
private static final boolean PRINT_INTERESTING_SITES = true;
protected final static Logger logger = Logger.getLogger(GenotypeConcordance.class);
@ -306,7 +306,8 @@ public class GenotypeConcordance extends VariantEvaluator {
final Genotype.Type truth = validation.getGenotype(sample).getType();
sampleStats.incrValue(sample, truth, called);
if ( (truth == Genotype.Type.HOM_VAR || truth == Genotype.Type.HET) && called == Genotype.Type.NO_CALL ) {
if ( PRINT_INTERESTING_SITES ) System.out.printf("%s: HM3 FN => %s%n", group, validation);
if ( PRINT_INTERESTING_SITES && super.getVEWalker().printInterestingSites() )
System.out.printf("%s: HM3 FN => %s%n", group, validation);
}
}
}

View File

@ -38,6 +38,7 @@ import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.gatk.walkers.variantrecalibration.ApplyVariantCuts;
import org.broadinstitute.sting.playground.utils.report.ReportMarshaller;
import org.broadinstitute.sting.playground.utils.report.VE2ReportFactory;
import org.broadinstitute.sting.playground.utils.report.templates.ReportFormat;
@ -114,7 +115,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
// --------------------------------------------------------------------------------------------------------------
@Argument(shortName="select", doc="One or more stratifications to use when evaluating the data", required=false)
protected String[] SELECT_EXPS = {};
protected ArrayList<String> SELECT_EXPS = new ArrayList<String>();
//protected String[] SELECT_EXPS = {"set == \"Intersection\"",
// "set == \"HiSeq.WGS.cleaned.ug.vcf\"",
// "set == \"HiSeq.WGS.cleaned.ug.vcf\" || set == \"Intersection\"",
@ -122,7 +123,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
// "set == \"HiSeq.WGS.raw.OQ.ug.vcf\" || set == \"Intersection\""};
@Argument(shortName="selectName", doc="Names to use for the list of stratifications (must be a 1-to-1 mapping)", required=false)
protected String[] SELECT_NAMES = {};
protected ArrayList<String> SELECT_NAMES = new ArrayList<String>();
//protected String[] SELECT_NAMES = {"Intersection", "x1", "x2", "x3", "x4"};
@Argument(shortName="known", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false)
@ -194,6 +195,10 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
@Argument(shortName="aatUseCodons", fullName="aminoAcidsRepresentedByCodons", doc="for the amino acid table, specifiy that the transitions are represented as codon changes, and not directly amino acid names", required = false)
protected boolean aatUseCodons = false;
@Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=false)
private String TRANCHE_FILENAME = null;
// --------------------------------------------------------------------------------------------------------------
//
// private walker data
@ -279,6 +284,8 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
//
// --------------------------------------------------------------------------------------------------------------
public boolean printInterestingSites() { return writer != null; }
public void initialize() {
if ( dels ) {
ALLOW_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.NO_VARIATION);
@ -293,6 +300,17 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
determineEvalations();
if ( TRANCHE_FILENAME != null ) {
// we are going to build a few select names automatically from the tranches file
for ( ApplyVariantCuts.Tranche t : ApplyVariantCuts.readTraches(new File(TRANCHE_FILENAME)) ) {
logger.info("Adding select for all variant above the pCut of : " + t);
SELECT_EXPS.add(String.format("QUAL >= %.2f", t.pCut));
SELECT_NAMES.add(String.format("FDR-%.2f", t.fdr));
}
}
logger.info("Selects: " + SELECT_NAMES);
logger.info("Selects: " + SELECT_EXPS);
List<VariantContextUtils.JexlVCMatchExp> selectExps = VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS);
for ( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {

View File

@ -72,14 +72,61 @@ public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
final ExpandingArrayList<Double> qCuts = new ExpandingArrayList<Double>();
final ExpandingArrayList<String> filterName = new ExpandingArrayList<String>();
public static class Tranche {
public double fdr, pCut, novelTiTv;
public int numNovel;
public String name;
public Tranche(double fdr, double pCut, double novelTiTv, int numNovel, String name) {
this.fdr = fdr;
this.pCut = pCut;
this.novelTiTv = novelTiTv;
this.numNovel = numNovel;
this.name = name;
}
public Tranche(final String line) {
final String[] vals = line.split(",");
this.fdr = Double.parseDouble(vals[0]);
this.novelTiTv = Double.parseDouble(vals[1]);
this.pCut = Double.parseDouble(vals[2]);
this.numNovel = Integer.parseInt(vals[3]);
this.name = vals[4];
}
public String toString() {
return String.format("[Tranche %s cut = %.3f with %d novels @ %.2f]", name, pCut, numNovel, novelTiTv);
}
}
//---------------------------------------------------------------------------------------------------------------
//
// initialize
//
//---------------------------------------------------------------------------------------------------------------
public static List<Tranche> readTraches(File f) {
boolean firstLine = true;
List<Tranche> tranches = new ArrayList<Tranche>();
try {
for( final String line : new XReadLines(f) ) {
if( ! firstLine ) {
tranches.add(new Tranche(line));
}
firstLine = false;
}
return tranches;
} catch( FileNotFoundException e ) {
throw new StingException("Can not find input file: " + f);
}
}
public void initialize() {
// todo -- ryan, it's always best to use a read data structure, I need to read these in.
// todo -- I would have updated your code but there's no integration test to protect me from unexpected effects
boolean firstLine = true;
try {
for( final String line : new XReadLines(new File( TRANCHE_FILENAME )) ) {

View File

@ -65,6 +65,9 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false)
public boolean printComplexMerges = false;
@Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotation don't appear in the combined VCF", required=false)
public boolean filteredAreUncalled = false;
@Argument(fullName="setKey", shortName="setKey", doc="Key, by default set, in the INFO key=value tag emitted describing which set the combined VCF record came from. Set to null if you don't want the set field emitted.", required=false)
public String SET_KEY = "set";
@ -120,7 +123,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
// Need to provide reference bases to simpleMerge starting at current locus
Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, context.getLocation());
VariantContext mergedVC = VariantContextUtils.simpleMerge(vcs, priority, variantMergeOption,
genotypeMergeOption, true, printComplexMerges, ref.getBase(), SET_KEY);
genotypeMergeOption, true, printComplexMerges, ref.getBase(), SET_KEY, filteredAreUncalled);
//out.printf(" merged => %s%nannotated => %s%n", mergedVC, annotatedMergedVC);