Under guidance from Matt, added the ability to use key-value tags with ROD binding command-line arguments, so now one can say -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmap.vcf and get the tags in a walker. Look at ContrastiveRecalibrator for an example of how to use the new ReferenceOrderedDataSource.getTags(). Removed references to FDR in tranches since we are only using truth sensitivity. Finally, fixed a long-standing bug where tranche filters weren't set appropriately.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5536 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2011-03-29 21:04:09 +00:00
parent 0f4ace0902
commit 5ddc0e464a
18 changed files with 199 additions and 183 deletions

View File

@ -83,6 +83,15 @@ public class Tags {
return keyValueTags.get(key);
}
/**
 * Checks whether a key-value tag with the given key was supplied.
 * @param key The key whose presence should be tested.
 * @return true if a key-value tag with this key exists.
 */
public boolean containsKey(final String key) {
    return keyValueTags.keySet().contains(key);
}
/**
* Adds positional tag(s) to the tag object.
* @param tags The tag strings to add.

View File

@ -238,7 +238,7 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
Collection<RMDTriplet> rodBindings = new ArrayList<RMDTriplet>();
for (String fileName: argCollection.RODBindings) {
Tags tags = parser.getTags(fileName);
final Tags tags = parser.getTags(fileName);
fileName = expandFileName(fileName);
List<String> positionalTags = tags.getPositionalTags();
@ -259,17 +259,18 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
else
storageType = RMDStorageType.FILE;
rodBindings.add(new RMDTriplet(name,type,fileName,storageType));
rodBindings.add(new RMDTriplet(name,type,fileName,storageType,tags));
}
if (argCollection.DBSNPFile != null) {
if(argCollection.DBSNPFile.toLowerCase().contains("vcf"))
throw new UserException("--DBSNP (-D) argument currently does not support VCF. To use dbSNP in VCF format, please use -B:dbsnp,vcf <filename>.");
final Tags tags = parser.getTags(argCollection.DBSNPFile);
String fileName = expandFileName(argCollection.DBSNPFile);
RMDStorageType storageType = fileName.toLowerCase().endsWith("stdin") ? RMDStorageType.STREAM : RMDStorageType.FILE;
rodBindings.add(new RMDTriplet(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME,"dbsnp",fileName,storageType));
rodBindings.add(new RMDTriplet(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME,"dbsnp",fileName,storageType,tags));
}
return rodBindings;

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.datasources.rmd;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
@ -124,6 +125,15 @@ public class ReferenceOrderedDataSource {
return header;
}
/**
 * Gets the key=value tags supplied on the command line for this data source.
 * @return The Tags object held by the underlying file descriptor.
 */
public Tags getTags() {
    return fileDescriptor.getTags();
}

/**
 * Convenience accessor for a single tag value on this data source.
 * @param key The tag key to look up.
 * @return The value associated with the key, as reported by the Tags object.
 */
public String getTagValue( final String key ) {
    final Tags trackTags = fileDescriptor.getTags();
    return trackTags.getValue( key );
}
/**
* Retrieves the sequence dictionary created by this ROD.
* @return

View File

@ -33,6 +33,7 @@ import org.broad.tribble.index.Index;
import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.source.BasicFeatureSource;
import org.broad.tribble.util.LittleEndianOutputStream;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
@ -168,7 +169,7 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
if(typeName == null)
throw new ReviewedStingException("Unable to find type name for class " + targetClass.getName());
return createInstanceOfTrack(new RMDTriplet("anonymous",typeName,inputFile.getAbsolutePath(),RMDStorageType.FILE));
return createInstanceOfTrack(new RMDTriplet("anonymous",typeName,inputFile.getAbsolutePath(),RMDStorageType.FILE,new Tags()));
}
/**

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.gatk.refdata.utils;
import org.broadinstitute.sting.commandline.Tags;
/**
* a helper class to manage our triplets of data for the -B command line option (name, type, file)
* TODO: The presence of four datapoints here suggests that this class' name isn't sufficient to describe its function. Rename.
@ -35,12 +37,14 @@ public class RMDTriplet {
private final String type;
private final String file;
private final RMDStorageType storageType;
private final Tags tags;
public RMDTriplet(final String name, final String type, final String file, final RMDStorageType storageType) {
/**
 * Creates a reference metadata track descriptor.
 * @param name The binding name of the track.
 * @param type The track type (e.g. VCF).
 * @param file The file backing this track.
 * @param storageType Whether the track is file- or stream-backed.
 * @param tags The key=value tags supplied on the command line for this track.
 */
public RMDTriplet(final String name, final String type, final String file, final RMDStorageType storageType, final Tags tags) {
    this.tags = tags;
    this.storageType = storageType;
    this.file = file;
    this.type = type;
    this.name = name;
}
/**
@ -75,4 +79,12 @@ public class RMDTriplet {
public RMDStorageType getStorageType() {
return storageType;
}
/**
 * Gets the key=value tags associated with this track.
 * @return Tags associated with this track.
 */
public Tags getTags() {
    return this.tags;
}
}

View File

@ -54,7 +54,6 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
/////////////////////////////
// Outputs
/////////////////////////////
@Output(fullName="cluster_file", shortName="clusterFile", doc="The output cluster file", required=true)
private PrintStream CLUSTER_FILE;

View File

@ -74,15 +74,18 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@Argument(fullName="fdr_filter_level", shortName="fdr_filter_level", doc="The FDR level at which to start filtering.", required=false)
private double FDR_FILTER_LEVEL = 1.0;
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false)
private double TS_FILTER_LEVEL = 99.0;
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
private String[] IGNORE_INPUT_FILTERS = null;
/////////////////////////////
// Private Member Variables
/////////////////////////////
final private List<Tranche> tranches = new ArrayList<Tranche>();
final private Set<String> inputNames = new HashSet<String>();
final private NestedHashMap lodMap = new NestedHashMap();
final private Set<String> ignoreInputFilterSet = new TreeSet<String>();
//---------------------------------------------------------------------------------------------------------------
//
@ -92,13 +95,12 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
public void initialize() {
for ( final Tranche t : Tranche.readTranches(TRANCHES_FILE) ) {
if ( t.fdr >= FDR_FILTER_LEVEL) {
if ( t.ts >= TS_FILTER_LEVEL ) {
tranches.add(t);
//statusMsg = "Keeping, above FDR threshold";
}
logger.info(String.format("Read tranche " + t));
}
Collections.reverse(tranches); // this algorithm wants the tranches ordered from worst to best
Collections.reverse(tranches); // this algorithm wants the tranches ordered from best (lowest truth sensitivity) to worst (highest truth sensitivity)
for( final ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
if( d.getName().startsWith("input") ) {
@ -113,6 +115,10 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
}
if( IGNORE_INPUT_FILTERS != null ) {
ignoreInputFilterSet.addAll( Arrays.asList(IGNORE_INPUT_FILTERS) );
}
// setup the header fields
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
@ -122,14 +128,14 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
if( tranches.size() >= 2 ) {
for( int iii = 0; iii < tranches.size() - 1; iii++ ) {
Tranche t = tranches.get(iii);
hInfo.add(new VCFFilterHeaderLine(t.name, String.format("FDR tranche level at VSQ Lod: " + t.minVQSLod + " <= x < " + tranches.get(iii+1).minVQSLod)));
final Tranche t = tranches.get(iii);
hInfo.add(new VCFFilterHeaderLine(t.name, String.format("Truth sensitivity tranche level at VSQ Lod: " + t.minVQSLod + " <= x < " + tranches.get(iii+1).minVQSLod)));
}
}
if( tranches.size() >= 1 ) {
hInfo.add(new VCFFilterHeaderLine(tranches.get(0).name + "+", String.format("FDR tranche level at VQS Lod < " + tranches.get(0).minVQSLod)));
hInfo.add(new VCFFilterHeaderLine(tranches.get(0).name + "+", String.format("Truth sensitivity tranche level at VQS Lod < " + tranches.get(0).minVQSLod)));
} else {
throw new UserException("No tranches were found in the file or were above the FDR Filter level " + FDR_FILTER_LEVEL);
throw new UserException("No tranches were found in the file or were above the truth sensitivity filter level " + TS_FILTER_LEVEL);
}
logger.info("Keeping all variants in tranche " + tranches.get(tranches.size()-1));
@ -141,7 +147,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
logger.info("Reading in recalibration table...");
for ( final String line : new XReadLines( RECAL_FILE ) ) {
final String[] vals = line.split(",");
lodMap.put( Double.parseDouble(vals[2]), vals[0], Integer.parseInt(vals[1]));
lodMap.put( Double.parseDouble(vals[3]), vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(RECAL_FILE, e);
@ -164,38 +170,31 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
for( VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), true, false) ) {
if( vc != null ) {
String filterString = null;
final Double lod = (Double) lodMap.get( ref.getLocus().getContig(), ref.getLocus().getStart() );
if( !vc.isFiltered() ) {
try {
for( int i = tranches.size() - 1; i >= 0; i-- ) {
final Tranche tranche = tranches.get(i);
if( lod >= tranche.minVQSLod ) {
if (i == tranches.size() - 1) {
filterString = VCFConstants.PASSES_FILTERS_v4;
} else {
filterString = tranche.name;
}
break;
}
}
if( filterString == null ) {
filterString = tranches.get(0).name+"+";
}
if ( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) {
final Set<String> filters = new HashSet<String>();
filters.add(filterString);
vc = VariantContext.modifyFilters(vc, filters);
}
} catch ( Exception e ) {
throw new UserException.MalformedFile(vc.getSource(), "Invalid value for VQS key " + ContrastiveRecalibrator.VQS_LOD_KEY + " at variant " + vc, e);
}
}
final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
if(lod != null) {
final Double lod = (Double) lodMap.get( ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop() );
if( vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters()) ) {
attrs.put(ContrastiveRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod));
for( int i = tranches.size() - 1; i >= 0; i-- ) {
final Tranche tranche = tranches.get(i);
if( lod >= tranche.minVQSLod ) {
if( i == tranches.size() - 1 ) {
filterString = VCFConstants.PASSES_FILTERS_v4;
} else {
filterString = tranche.name;
}
break;
}
}
if( filterString == null ) {
filterString = tranches.get(0).name+"+";
}
if( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) {
final Set<String> filters = new HashSet<String>();
filters.add(filterString);
vc = VariantContext.modifyFilters(vc, filters);
}
}
vcfWriter.add( VariantContext.modifyPErrorFiltersAndAttributes(vc, vc.getNegLog10PError(), vc.getFilters(), attrs), ref.getBase() );
@ -219,7 +218,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
return 1; // This value isn't used for anything
}
public void onTraversalDone( Integer reduceSum ) {
public void onTraversalDone( final Integer reduceSum ) {
}
}

View File

@ -36,7 +36,6 @@ import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -72,8 +71,10 @@ public class ContrastiveRecalibrator extends RodWalker<ExpandingArrayList<Varian
//BUGBUG: use VariantRecalibrationArgumentCollection
@Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
private String[] USE_ANNOTATIONS = null;
@Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
private double[] FDR_TRANCHES = new double[]{0.1, 1.0, 10.0, 100.0};
@Argument(fullName="TStranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
private double[] TS_TRANCHES = new double[] {100.0, 99.9, 99.0, 90.0};
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
private String[] IGNORE_INPUT_FILTERS = null;
/////////////////////////////
// Debug Arguments
@ -85,13 +86,13 @@ public class ContrastiveRecalibrator extends RodWalker<ExpandingArrayList<Varian
@Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false)
protected Boolean TRUST_ALL_POLYMORPHIC = false;
/////////////////////////////
// Private Member Variables
/////////////////////////////
private VariantDataManager dataManager;
private Set<String> inputNames = new HashSet<String>();
private VariantRecalibratorEngine engine = new VariantRecalibratorEngine(new VariantRecalibratorArgumentCollection()); //BUGBUG: doesn't do anything with the args at the moment
private final Set<String> ignoreInputFilterSet = new TreeSet<String>();
private final Set<String> inputNames = new HashSet<String>();
private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine(new VariantRecalibratorArgumentCollection()); //BUGBUG: doesn't do anything with the args at the moment
//---------------------------------------------------------------------------------------------------------------
//
@ -102,35 +103,27 @@ public class ContrastiveRecalibrator extends RodWalker<ExpandingArrayList<Varian
public void initialize() {
dataManager = new VariantDataManager( new ArrayList<String>(Arrays.asList(USE_ANNOTATIONS)) );
if( IGNORE_INPUT_FILTERS != null ) {
ignoreInputFilterSet.addAll( Arrays.asList(IGNORE_INPUT_FILTERS) );
}
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
if( d.getName().toLowerCase().startsWith("input") ) {
inputNames.add(d.getName());
logger.info("Found input variant track with name " + d.getName());
} else if ( d.getName().equals("dbsnp") ) { //BUGBUG: values are hard coded for now until command line tagging up for Rod bindings
logger.info("Found dbSNP track: \tKnown = true \tTraining = false \tTruth = false \tPrior = Q10.0");
dataManager.addTrainingSet(new TrainingSet("dbsnp", true, false, false, 10.0));
} else if ( d.getName().equals("hapmap") ) {
logger.info("Found HapMap track: \tKnown = false \tTraining = true \tTruth = true \tPrior = Q12.0");
dataManager.addTrainingSet(new TrainingSet("hapmap", false, true, true, 12.0));
} else if ( d.getName().equals("1kg") ) {
logger.info("Found 1KG track: \tKnown = false \tTraining = true \tTruth = false \tPrior = Q6.0");
dataManager.addTrainingSet(new TrainingSet("1kg", false, true, false, 6.0));
} else if ( d.getName().equals("omni") ) {
logger.info("Found Omni track: \tKnown = false \tTraining = true \tTruth = true \tPrior = Q15.0");
dataManager.addTrainingSet( new TrainingSet("omni", false, true, true, 15.0) );
logger.info( "Found input variant track with name " + d.getName() );
} else {
logger.info("WARNING! Not evaluating ROD binding: " + d.getName());
dataManager.addTrainingSet( new TrainingSet(d.getName(), d.getTags()) );
}
}
if( !dataManager.checkHasTrainingSet() ) {
throw new UserException.CommandLineException("No training set found! Please provide sets of known polymorphic loci to be used as training data using the dbsnp, hapmap, 1kg, or omni rod bindings.");
throw new UserException.CommandLineException( "No training set found! Please provide sets of known polymorphic loci marked with the training=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
}
if( !dataManager.checkHasTruthSet() ) {
throw new UserException.CommandLineException("No truth set found! Please provide sets of known polymorphic loci to be used as training data using the dbsnp, hapmap, 1kg, or omni rod bindings.");
throw new UserException.CommandLineException( "No truth set found! Please provide sets of known polymorphic loci marked with the truth=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" );
}
if( !dataManager.checkHasKnownSet() ) {
throw new UserException.CommandLineException("No known set found! Please provide sets of known polymorphic loci to be used as training data using the dbsnp, hapmap, 1kg, or omni rod bindings.");
throw new UserException.CommandLineException( "No known set found! Please provide sets of known polymorphic loci marked with the known=true ROD binding tag. For example, -B:dbsnp,VCF,known=true,training=false,truth=false,prior=8.0 dbsnpFile.vcf" );
}
if( inputNames.size() == 0 ) {
@ -152,7 +145,7 @@ public class ContrastiveRecalibrator extends RodWalker<ExpandingArrayList<Varian
}
for( final VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), true, false) ) {
if( vc != null ) { // BUGBUG: filtered?
if( vc != null && ( vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters()) ) ) {
final VariantDatum datum = new VariantDatum();
datum.annotations = dataManager.decodeAnnotations( ref.getGenomeLocParser(), vc, true ); //BUGBUG: when run with HierarchicalMicroScheduler this is non-deterministic because order of calls depends on load of machine
datum.pos = context.getLocation();
@ -205,7 +198,7 @@ public class ContrastiveRecalibrator extends RodWalker<ExpandingArrayList<Varian
//logger.info(String.format("Truth set size is %d, raw calls at these sites %d, maximum sensitivity of %.2f",
// nTruthSites, nCallsAtTruth, (100.0*nCallsAtTruth / Math.max(nTruthSites, nCallsAtTruth))));
TrancheManager.SelectionMetric metric = new TrancheManager.TruthSensitivityMetric( nCallsAtTruth );
List<Tranche> tranches = TrancheManager.findTranches( dataManager.getData(), FDR_TRANCHES, metric, DEBUG_FILE ); //BUGBUG: recreated here to match the integration tests
List<Tranche> tranches = TrancheManager.findTranches( dataManager.getData(), TS_TRANCHES, metric, DEBUG_FILE ); //BUGBUG: recreated here to match the integration tests
TRANCHES_FILE.print(Tranche.tranchesString( tranches ));
logger.info( "Writing out recalibration table..." );

View File

@ -89,7 +89,7 @@ public class GaussianMixtureModel {
gaussian.initializeRandomSigma( rand );
gaussian.hyperParameter_a = degreesOfFreedom;
gaussian.hyperParameter_b = shrinkage;
gaussian.hyperParameter_lambda = dirichletParameter;
gaussian.hyperParameter_lambda = dirichletParameter;
}
}
@ -116,7 +116,7 @@ public class GaussianMixtureModel {
int numAssigned = 0;
for( final VariantDatum datum : data ) {
if( datum.assignment == gaussian ) {
if( datum.assignment.equals(gaussian) ) {
numAssigned++;
gaussian.incrementMu( datum );
}

View File

@ -55,10 +55,10 @@ public class MultivariateGaussian {
for( int iii = 0; iii < mu.length; iii++ ) {
for( int jjj = iii; jjj < mu.length; jjj++ ) {
randSigma[jjj][iii] = 0.55 + 1.25 * rand.nextDouble();
if(rand.nextBoolean()) {
if( rand.nextBoolean() ) {
randSigma[jjj][iii] *= -1.0;
}
if(iii != jjj) { randSigma[iii][jjj] = 0.0; } // Sigma is a symmetric, positive-definite matrix created by taking a lower diagonal matrix and multiplying it by its transpose
if( iii != jjj ) { randSigma[iii][jjj] = 0.0; } // Sigma is a symmetric, positive-definite matrix created by taking a lower diagonal matrix and multiplying it by its transpose
}
}
Matrix tmp = new Matrix( randSigma );

View File

@ -1,28 +1,32 @@
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.Tags;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 3/12/11
*/
public class TrainingSet implements Comparable<TrainingSet> {
public class TrainingSet {
public String name;
public boolean isKnown;
public boolean isTraining;
public boolean isTruth;
public double prior;
public boolean isKnown = false;
public boolean isTraining = false;
public boolean isTruth = false;
public double prior = 3.0;
public TrainingSet( final String name, final boolean isKnown, final boolean isTraining, final boolean isTruth, final double prior ) {
protected final static Logger logger = Logger.getLogger(TrainingSet.class);
public TrainingSet( final String name, final Tags tags ) {
this.name = name;
this.isKnown = isKnown;
this.isTraining = isTraining;
this.isTruth = isTruth;
this.prior = prior;
}
public int compareTo( final TrainingSet other ) {
return Double.compare(this.prior, other.prior);
if( tags != null ) {
isKnown = tags.containsKey("known") && tags.getValue("known").equals("true");
isTraining = tags.containsKey("training") && tags.getValue("training").equals("true");
isTruth = tags.containsKey("truth") && tags.getValue("truth").equals("true");
prior = ( tags.containsKey("known") ? Double.parseDouble(tags.getValue("prior")) : prior );
}
logger.info(String.format( "Found %s track: \tKnown = %s \tTraining = %s \tTruth = %s \tPrior = Q%.1f", this.name, isKnown, isTraining, isTruth, prior) );
}
}

View File

@ -39,22 +39,21 @@ import java.util.*;
*/
public class Tranche implements Comparable<Tranche> {
private static final int CURRENT_VERSION = 3;
private static final int CURRENT_VERSION = 4;
public double fdr, minVQSLod, targetTiTv, knownTiTv, novelTiTv;
public double ts, minVQSLod, knownTiTv, novelTiTv;
public int numKnown,numNovel;
public String name;
int accessibleTruthSites = 0;
int callsAtTruthSites = 0;
/**
 * Creates an anonymous tranche; delegates to the full constructor with the name "anonymous".
 * @param ts target truth sensitivity level (in percent).
 */
public Tranche(double ts, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites) {
    this(ts, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, "anonymous");
}
public Tranche(double fdr, double targetTiTv, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites, String name ) {
this.fdr = fdr;
this.targetTiTv = targetTiTv;
public Tranche(double ts, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites, String name ) {
this.ts = ts;
this.minVQSLod = minVQSLod;
this.novelTiTv = novelTiTv;
this.numNovel = numNovel;
@ -65,11 +64,8 @@ public class Tranche implements Comparable<Tranche> {
this.accessibleTruthSites = accessibleTruthSites;
this.callsAtTruthSites = callsAtTruthSites;
if ( fdr < 0.0 )
throw new UserException("Target FDR is unreasonable " + fdr);
if ( targetTiTv < 0.5 || targetTiTv > 10 )
throw new UserException("Target Ti/Tv ratio is unreasonable " + targetTiTv);
if ( ts < 0.0 || ts > 100.0)
throw new UserException("Target FDR is unreasonable " + ts);
if ( numKnown < 0 || numNovel < 0)
throw new ReviewedStingException("Invalid tranche - no. variants is < 0 : known " + numKnown + " novel " + numNovel);
@ -83,37 +79,35 @@ public class Tranche implements Comparable<Tranche> {
}
/**
 * Orders tranches by target truth sensitivity, ascending.
 * @param other the tranche to compare against.
 * @return negative, zero, or positive per the Comparable contract on ts.
 */
public int compareTo(Tranche other) {
    return Double.compare(this.ts, other.ts);
}
/**
 * Renders this tranche's summary statistics for logging.
 * NOTE(review): the trailing ']' in the format has no matching '[' -- kept
 * byte-identical for output compatibility.
 */
public String toString() {
    return String.format("Tranche ts=%.2f minVQSLod=%.4f known=(%d @ %.2f) novel=(%d @ %.2f) truthSites(%d accessible, %d called), name=%s]",
            ts, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, name);
}
/**
* Returns an appropriately formatted string representing the raw tranches file on disk.
*
* @param rawTranches
* @param tranches
* @return
*/
public static String tranchesString(List<Tranche> rawTranches) {
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
PrintStream stream = new PrintStream(bytes);
public static String tranchesString( final List<Tranche> tranches ) {
final ByteArrayOutputStream bytes = new ByteArrayOutputStream();
final PrintStream stream = new PrintStream(bytes);
List<Tranche> tranches = new ArrayList<Tranche>();
tranches.addAll(rawTranches);
Collections.sort(tranches);
stream.println("# Variant quality score tranches file");
stream.println("# Version number " + CURRENT_VERSION);
stream.println("FDRtranche,targetTiTv,numKnown,numNovel,knownTiTv,novelTiTv,minVQSLod,filterName,accessibleTruthSites,callsAtTruthSites,truthSensitivity");
stream.println("targetTruthSensitivity,numKnown,numNovel,knownTiTv,novelTiTv,minVQSLod,filterName,accessibleTruthSites,callsAtTruthSites,truthSensitivity");
Tranche prev = null;
for ( Tranche t : tranches ) {
stream.printf("%.2f,%.2f,%d,%d,%.4f,%.4f,%.4f,FDRtranche%.2fto%.2f,%d,%d,%.4f%n",
t.fdr,t.targetTiTv,t.numKnown,t.numNovel,t.knownTiTv,t.novelTiTv, t.minVQSLod,
(prev == null ? 0.0 : prev.fdr), t.fdr, t.accessibleTruthSites, t.callsAtTruthSites, t.getTruthSensitivity());
stream.printf("%.2f,%d,%d,%.4f,%.4f,%.4f,TruthSensitivityTranche%.2fto%.2f,%d,%d,%.4f%n",
t.ts, t.numKnown, t.numNovel, t.knownTiTv, t.novelTiTv, t.minVQSLod,
(prev == null ? 0.0 : prev.ts), t.ts, t.accessibleTruthSites, t.callsAtTruthSites, t.getTruthSensitivity());
prev = t;
}
@ -160,19 +154,18 @@ public class Tranche implements Comparable<Tranche> {
final String[] vals = line.split(",");
if( header == null ) {
header = vals;
if ( header.length == 5 )
if ( header.length == 5 || header.length == 8 || header.length == 11 )
// old style tranches file, throw an error
throw new UserException.MalformedFile(f, "Unfortunately, your tranches file is from a previous version of this tool and cannot be used with the latest code. Please rerun VariantRecalibrator");
if ( header.length != 8 && header.length != 11 )
throw new UserException.MalformedFile(f, "Expected 8 elements in header line " + line);
if ( header.length != 10 )
throw new UserException.MalformedFile(f, "Expected 10 elements in header line " + line);
} else {
if ( header.length != vals.length )
throw new UserException.MalformedFile(f, "Line had too few/many fields. Header = " + header.length + " vals " + vals.length + " line " + line);
Map<String,String> bindings = new HashMap<String, String>();
for ( int i = 0; i < vals.length; i++ ) bindings.put(header[i], vals[i]);
tranches.add(new Tranche(getDouble(bindings,"FDRtranche", true),
getDouble(bindings,"targetTiTv", false),
tranches.add(new Tranche(getDouble(bindings,"targetTruthSensitivity", true),
getDouble(bindings,"minVQSLod", true),
getInteger(bindings,"numKnown", false),
getDouble(bindings,"knownTiTv", false),
@ -184,7 +177,7 @@ public class Tranche implements Comparable<Tranche> {
}
}
Collections.sort(tranches); // sort this in the standard order
Collections.sort(tranches);
return tranches;
} catch( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(f, e);

View File

@ -91,7 +91,7 @@ public class TrancheManager {
}
/**
 * Converts a tranche level (in percent) into this selection metric's threshold.
 * @param tranche the tranche level, in percent.
 * @return the metric threshold fraction for this tranche.
 */
public double getThreshold(double tranche) {
    return 1.0 - tranche/100.0; // tranche of 1 => 99% sensitivity target
}
// NOTE(review): appears to be the maximum value of this selection metric
// (1.0 == 100% sensitivity) -- confirm against TrancheManager usage.
public double getTarget() { return 1.0; }
@ -123,14 +123,14 @@ public class TrancheManager {
public static List<Tranche> findTranches( ArrayList<VariantDatum> data, final double[] trancheThresholds, SelectionMetric metric, File debugFile ) {
logger.info(String.format("Finding %d tranches for %d variants", trancheThresholds.length, data.size()));
List<VariantDatum> tranchesData = sortVariantsbyLod(data);
metric.calculateRunningMetric(tranchesData);
Collections.sort(data);
metric.calculateRunningMetric(data);
if ( debugFile != null) { writeTranchesDebuggingInfo(debugFile, tranchesData, metric); }
if ( debugFile != null) { writeTranchesDebuggingInfo(debugFile, data, metric); }
List<Tranche> tranches = new ArrayList<Tranche>();
for ( double trancheThreshold : trancheThresholds ) {
Tranche t = findTranche(tranchesData, metric, trancheThreshold);
Tranche t = findTranche(data, metric, trancheThreshold);
if ( t == null ) {
if ( tranches.size() == 0 )
@ -159,11 +159,6 @@ public class TrancheManager {
}
}
/**
 * Sorts the supplied variant data in place by its natural ordering.
 * Note: mutates the argument and returns the same list instance, not a copy.
 * @param data the variant data to sort.
 * @return the same list, now sorted.
 */
private static List<VariantDatum> sortVariantsbyLod(final ArrayList<VariantDatum> data) {
    Collections.sort(data);
    return data;
}
public static Tranche findTranche( final List<VariantDatum> data, final SelectionMetric metric, final double trancheThreshold ) {
logger.info(String.format(" Tranche threshold %.2f => selection metric threshold %.3f", trancheThreshold, metric.getThreshold(trancheThreshold)));
@ -171,8 +166,8 @@ public class TrancheManager {
int n = data.size();
for ( int i = 0; i < n; i++ ) {
if ( metric.getRunningMetric(i) >= metricThreshold ) {
// we've found the largest group of variants with Ti/Tv >= our target titv
Tranche t = trancheOfVariants(data, i, trancheThreshold, metric.getTarget());
// we've found the largest group of variants with sensitivity >= our target truth sensitivity
Tranche t = trancheOfVariants(data, i, trancheThreshold);
logger.info(String.format(" Found tranche for %.3f: %.3f threshold starting with variant %d; running score is %.3f ",
trancheThreshold, metricThreshold, i, metric.getRunningMetric(i)));
logger.info(String.format(" Tranche is %s", t));
@ -183,7 +178,7 @@ public class TrancheManager {
return null;
}
public static Tranche trancheOfVariants( final List<VariantDatum> data, int minI, double fdr, double target ) {
public static Tranche trancheOfVariants( final List<VariantDatum> data, int minI, double ts ) {
int numKnown = 0, numNovel = 0, knownTi = 0, knownTv = 0, novelTi = 0, novelTv = 0;
double minLod = data.get(minI).lod;
@ -207,7 +202,7 @@ public class TrancheManager {
int accessibleTruthSites = countCallsAtTruth(data, Double.NEGATIVE_INFINITY);
int nCallsAtTruth = countCallsAtTruth(data, minLod);
return new Tranche(fdr, target, minLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, nCallsAtTruth);
return new Tranche(ts, minLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, nCallsAtTruth);
}
public static double fdrToTiTv(double desiredFDR, double targetTiTv) {
@ -216,7 +211,7 @@ public class TrancheManager {
public static int countCallsAtTruth(final List<VariantDatum> data, double minLOD ) {
int n = 0;
for ( VariantDatum d : data) { n += d.atTruthSite && d.lod >= minLOD ? 1 : 0; }
for ( VariantDatum d : data) { n += (d.atTruthSite && d.lod >= minLOD ? 1 : 0); }
return n;
}

View File

@ -27,7 +27,7 @@ public class VariantDataManager {
private final ExpandingArrayList<TrainingSet> trainingSets;
private final static long RANDOM_SEED = 83409701;
private final static Random rand = new Random( RANDOM_SEED ); // this is going to cause problems if it is ever used in an integration test
private final static Random rand = new Random( RANDOM_SEED ); // this is going to cause problems if it is ever used in an integration test, planning to get rid of HRun anyway
protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
@ -53,11 +53,7 @@ public class VariantDataManager {
final double theMean = mean(jjj); //BUGBUG: to clean up
final double theSTD = standardDeviation(theMean, jjj); //BUGBUG: to clean up
logger.info( annotationKeys.get(jjj) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) );
if( theSTD < 1E-8 ) { //BUGBUG: output recreated here to match the integration tests
foundZeroVarianceAnnotation = true;
} else if( theSTD < 1E-2 ) {
logger.warn("Warning! Tiny variance. It is strongly recommended that you -exclude " + annotationKeys.get(jjj));
}
foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-8);
meanVector[jjj] = theMean;
varianceVector[jjj] = theSTD;
for( final VariantDatum datum : data ) {
@ -65,7 +61,7 @@ public class VariantDataManager {
}
}
if( foundZeroVarianceAnnotation ) {
throw new UserException.BadInput("Found annotations with zero variance. They must be excluded before proceeding.");
throw new UserException.BadInput( "Found annotations with zero variance. They must be excluded before proceeding." );
}
}
@ -136,7 +132,7 @@ public class VariantDataManager {
final ExpandingArrayList<VariantDatum> dataToRemove = new ExpandingArrayList<VariantDatum>();
for( final VariantDatum datum : listData ) {
boolean remove = false;
for( double val : datum.annotations ) {
for( final double val : datum.annotations ) {
remove = remove || (Math.abs(val) > STD_THRESHOLD);
}
if( remove ) { dataToRemove.add( datum ); }
@ -163,8 +159,8 @@ public class VariantDataManager {
} else {
try {
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
} catch( Exception e ) {
throw new UserException.MalformedFile(vc.getSource(), "No double value detected for annotation = " + annotationKey + " in variant at " + VariantContextUtils.getLocation(genomeLocParser,vc) + ", reported annotation value = " + vc.getAttribute( annotationKey ), e );
} catch( final Exception e ) {
throw new UserException.MalformedFile( vc.getSource(), "No double value detected for annotation = " + annotationKey + " in variant at " + VariantContextUtils.getLocation(genomeLocParser,vc) + ", reported annotation value = " + vc.getAttribute( annotationKey ), e );
}
}
return value;
@ -188,8 +184,8 @@ public class VariantDataManager {
}
public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) {
for( final VariantDatum datum : data ) { //BUGBUG: quick and dirty for now, only works for SNPs.
RECAL_FILE.println(String.format("%s,%d,%.4f", datum.pos.getContig(), datum.pos.getStart(), datum.lod));
for( final VariantDatum datum : data ) {
RECAL_FILE.println(String.format("%s,%d,%d,%.4f", datum.pos.getContig(), datum.pos.getStart(), datum.pos.getStop(), datum.lod));
}
}
}

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.datasources.providers;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
@ -78,7 +79,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
@Test
public void testSingleBinding() {
String fileName = testDir + "TabularDataTest.dat";
RMDTriplet triplet = new RMDTriplet("tableTest","Table",fileName,RMDStorageType.FILE);
RMDTriplet triplet = new RMDTriplet("tableTest","Table",fileName,RMDStorageType.FILE,new Tags());
ReferenceOrderedDataSource dataSource = new ReferenceOrderedDataSource(triplet,builder,seq.getSequenceDictionary(),genomeLocParser,false);
Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chrM",1,30)));
@ -101,10 +102,10 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
public void testMultipleBinding() {
File file = new File(testDir + "TabularDataTest.dat");
RMDTriplet testTriplet1 = new RMDTriplet("tableTest1","Table",file.getAbsolutePath(),RMDStorageType.FILE);
RMDTriplet testTriplet1 = new RMDTriplet("tableTest1","Table",file.getAbsolutePath(),RMDStorageType.FILE,new Tags());
ReferenceOrderedDataSource dataSource1 = new ReferenceOrderedDataSource(testTriplet1,builder,seq.getSequenceDictionary(),genomeLocParser,false);
RMDTriplet testTriplet2 = new RMDTriplet("tableTest2","Table",file.getAbsolutePath(),RMDStorageType.FILE);
RMDTriplet testTriplet2 = new RMDTriplet("tableTest2","Table",file.getAbsolutePath(),RMDStorageType.FILE,new Tags());
ReferenceOrderedDataSource dataSource2 = new ReferenceOrderedDataSource(testTriplet2,builder,seq.getSequenceDictionary(),genomeLocParser,false);
Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chrM",1,30)));

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.datasources.rmd;
import org.broadinstitute.sting.commandline.Tags;
import org.testng.Assert;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
@ -73,7 +74,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
File indexFileName = new File(testDir + "TabularDataTest.dat.idx");
if (indexFileName.exists()) indexFileName.delete();
triplet = new RMDTriplet("tableTest","Table",fileName,RMDStorageType.FILE);
triplet = new RMDTriplet("tableTest","Table",fileName,RMDStorageType.FILE,new Tags());
builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null);
}

View File

@ -25,14 +25,14 @@ public class VariantRecalibrationWalkersV2IntegrationTest extends WalkerTest {
}
VRTest yriTrio = new VRTest("yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf",
"aa211561e6e9b1e323dec4eeaa318088", // tranches
"226adf939e90ad20599108e77ad25dae", // recal file
"06c2335e860e19f23bcacff8d4ab906d"); // cut VCF
"644e37fe6edaee13ddf9f3c4f0e6f500", // tranches
"e467bfd4ab74c7a7ff303cf810c514b3", // recal file
"05f66c959d1f8e78bcb4298092524921"); // cut VCF
VRTest lowPass = new VRTest("lowpass.N3.chr1.raw.vcf",
"f4f2057a8c010206f0f56deff0602452", // tranches
"dd36d252a6dc6e3207addddae731dd88", // recal file
"af4fe8bd0c2eca92d1990b1a49fae810"); // cut VCF
"e0a3845619d2138717df39ef1e9ee75f", // tranches
"c3fa524d0bdb40d6bd7aeda9e8dd882c", // recal file
"9f21a668c3fff13f382367851a11303c"); // cut VCF
@DataProvider(name = "VRTest")
public Object[][] createData1() {
@ -44,14 +44,14 @@ public class VariantRecalibrationWalkersV2IntegrationTest extends WalkerTest {
//System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile);
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
" -B:omni,VCF " + comparisonDataLocation + "Validated/Omni2.5_chip/1212samples.b36.vcf" +
" -B:dbsnp,DBSNP,known=true,training=false,truth=false,prior=10.0 " + GATKDataLocation + "dbsnp_129_b36.rod" +
" -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
" -B:omni,VCF,known=false,training=true,truth=true,prior=15.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/1212samples.b36.vcf" +
" -T ContrastiveRecalibrator" +
" -B:input,VCF " + params.inVCF +
" -L 1:50,000,000-120,000,000" +
" -an QD -an MQ -an SB -an AF" +
" --trustAllPolymorphic" +
" --trustAllPolymorphic" + // for speed
" -recalFile %s" +
" -tranchesFile %s",
Arrays.asList(params.recalMD5, params.tranchesMD5));

View File

@ -112,40 +112,40 @@ class MethodsDevelopmentCallingPipeline extends QScript {
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask",
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"),
new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.07, 1.0, !lowPass),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.07, 99.0, !lowPass),
"HiSeq19" -> new Target("NA12878.HiSeq19", hg19, dbSNP_b37_129, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.bam"),
new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.HiSeq19.filtered.vcf"),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 3.0, !lowPass),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 97.0, !lowPass),
"GA2hg19" -> new Target("NA12878.GA2.hg19", hg19, dbSNP_b37_129, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.GA2.WGS.bwa.cleaned.hg19.bam"),
new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.GA2.hg19.filtered.vcf"),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 1.0, !lowPass),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass),
"WEx" -> new Target("NA12878.WEx", hg18, dbSNP_hg18_129, hapmap_hg18,
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask",
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"),
new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.vcf"),
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 2.6, 3.0, !lowPass),
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 2.6, 97.0, !lowPass),
"WExTrio" -> new Target("CEUTrio.WEx", hg19, dbSNP_b37_129, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.bwa.cleaned.recal.bam"),
new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"),
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, 3.0, !lowPass),
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, 97.0, !lowPass),
"FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"),
new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 1.0, lowPass),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 99.0, lowPass),
"TGPWExGdA" -> new Target("1000G.WEx.GdA", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/1kg/processing/pipeline_test_bams/Barcoded_1000G_WEx_Reduced_Plate_1.cleaned.list"), // BUGBUG: reduce from 60 to 20 people
new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, 1.0, !lowPass),
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, 99.0, !lowPass),
"LowPassN60" -> new Target("lowpass.N60", b36, dbSNP_b36, hapmap_b36, indelMask_b36,
new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/lowpass.chr20.cleaned.matefixed.bam"), // the bam list to call from
new File("/home/radon01/depristo/work/oneOffProjects/VQSRCutByNRS/lowpass.N60.chr20.filtered.vcf"), // the gold standard VCF file to run through the VQSR
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, 1.0, lowPass), // chunked interval list to use with Queue's scatter/gather functionality
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, 99.0, lowPass), // chunked interval list to use with Queue's scatter/gather functionality
"LowPassEUR363Nov" -> new Target("EUR.nov2010", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/1kg/processing/pipeline_test_bams/EUR.363sample.Nov2010.chr20.bam"),
new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 1.0, lowPass)
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 99.0, lowPass)
)
@ -189,6 +189,7 @@ class MethodsDevelopmentCallingPipeline extends QScript {
// 1.) Unified Genotyper Base
class GenotyperBase (t: Target) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS {
this.memoryLimit = 3
this.reference_sequence = t.reference
this.intervalsString ++= List(t.intervals)
this.scatterCount = 63 // the smallest interval list has 63 intervals, one for each Mb on chr20
@ -225,6 +226,7 @@ class MethodsDevelopmentCallingPipeline extends QScript {
// 2.) Hard Filtering for indels
class indelFilter (t: Target) extends VariantFiltration with UNIVERSAL_GATK_ARGS {
this.memoryLimit = 2
this.reference_sequence = t.reference
this.intervalsString ++= List(t.intervals)
this.scatterCount = 10
@ -243,24 +245,24 @@ class MethodsDevelopmentCallingPipeline extends QScript {
// 3.) Variant Quality Score Recalibration - Generate Recalibration table
class VQSR(t: Target, goldStandard: Boolean) extends ContrastiveRecalibrator with UNIVERSAL_GATK_ARGS {
this.memoryLimit = 6
this.memoryLimit = 4
this.reference_sequence = t.reference
this.intervalsString ++= List(t.intervals)
this.rodBind :+= RodBind("input", "VCF", if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } )
this.rodBind :+= RodBind("hapmap", "VCF", t.hapmapFile)
this.rodBind :+= RodBind("hapmap", "VCF", t.hapmapFile, "known=false,training=true,truth=true,prior=12.0")
if( t.hapmapFile.contains("b37") )
this.rodBind :+= RodBind("1kg", "VCF", omni_b37)
this.rodBind :+= RodBind("omni", "VCF", omni_b37, "known=false,training=true,truth=true,prior=15.0")
else if( t.hapmapFile.contains("b36") )
this.rodBind :+= RodBind("1kg", "VCF", omni_b36)
this.rodBind :+= RodBind("omni", "VCF", omni_b36, "known=false,training=true,truth=true,prior=15.0")
if (t.dbsnpFile.endsWith(".rod"))
this.DBSNP = new File(t.dbsnpFile)
this.rodBind :+= RodBind("dbsnp", "DBSNP", t.dbsnpFile, "known=true,training=false,truth=false,prior=10.0")
else if (t.dbsnpFile.endsWith(".vcf"))
this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile)
this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile, "known=true,training=false,truth=false,prior=10.0")
this.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun")
this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesFile }
this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalFile }
this.allPoly = true
this.tranche ++= List("0.1", "0.5", "0.7", "1.0", "1.1", "1.2", "1.5", "1.6", "1.7", "1.8", "1.9", "2.0", "2.1", "2.2", "2.5","3.0", "5.0", "10.0")
this.tranche ++= List("100.0", "99.9", "99.5", "99.3", "99.0", "98.9", "98.8", "98.5", "98.4", "98.3", "98.2", "98.1", "98.0", "97.9", "97.8", "97.5", "97.0", "95.0", "90.0")
this.analysisName = t.name + "_VQSR"
this.jobName = queueLogDir + t.name + ".VQSR"
}
@ -279,9 +281,9 @@ class MethodsDevelopmentCallingPipeline extends QScript {
this.jobName = queueLogDir + t.name + ".applyVQSR"
}
// 6.) Variant Evaluation Base(OPTIONAL)
// 5.) Variant Evaluation Base(OPTIONAL)
class EvalBase(t: Target) extends VariantEval with UNIVERSAL_GATK_ARGS {
this.memoryLimit = 3
this.reference_sequence = t.reference
this.rodBind :+= RodBind("comphapmap", "VCF", t.hapmapFile)
this.intervalsString ++= List(t.intervals)
@ -292,7 +294,7 @@ class MethodsDevelopmentCallingPipeline extends QScript {
this.sample = samples
}
// 6a.) SNP Evaluation (OPTIONAL) based on the cut vcf
// 5a.) SNP Evaluation (OPTIONAL) based on the cut vcf
class snpEvaluation(t: Target) extends EvalBase(t) {
if (t.reference == b37 || t.reference == hg19) this.rodBind :+= RodBind("compomni", "VCF", omni_b37)
this.rodBind :+= RodBind("eval", "VCF", t.recalibratedVCF )
@ -301,7 +303,7 @@ class MethodsDevelopmentCallingPipeline extends QScript {
this.jobName = queueLogDir + t.name + ".snp.eval"
}
// 6b.) Indel Evaluation (OPTIONAL)
// 5b.) Indel Evaluation (OPTIONAL)
class indelEvaluation(t: Target) extends EvalBase(t) {
this.rodBind :+= RodBind("eval", "VCF", t.filteredIndelVCF)
this.evalModule :+= "IndelStatistics"