diff --git a/java/src/org/broadinstitute/sting/commandline/Tags.java b/java/src/org/broadinstitute/sting/commandline/Tags.java index ec4538281..5b1e48ce9 100644 --- a/java/src/org/broadinstitute/sting/commandline/Tags.java +++ b/java/src/org/broadinstitute/sting/commandline/Tags.java @@ -83,6 +83,15 @@ public class Tags { return keyValueTags.get(key); } + /** + * Returns true if tags contains given key + * @param key The key for which to check existence. + * @return true if tags contains given key + */ + public boolean containsKey(final String key) { + return keyValueTags.containsKey(key); + } + /** * Adds positional tag(s) to the tag object. * @param tags The tag strings to add. diff --git a/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java index 8d4fdd978..5a1f3378e 100644 --- a/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java +++ b/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java @@ -238,7 +238,7 @@ public abstract class CommandLineExecutable extends CommandLineProgram { Collection rodBindings = new ArrayList(); for (String fileName: argCollection.RODBindings) { - Tags tags = parser.getTags(fileName); + final Tags tags = parser.getTags(fileName); fileName = expandFileName(fileName); List positionalTags = tags.getPositionalTags(); @@ -259,17 +259,18 @@ public abstract class CommandLineExecutable extends CommandLineProgram { else storageType = RMDStorageType.FILE; - rodBindings.add(new RMDTriplet(name,type,fileName,storageType)); + rodBindings.add(new RMDTriplet(name,type,fileName,storageType,tags)); } if (argCollection.DBSNPFile != null) { if(argCollection.DBSNPFile.toLowerCase().contains("vcf")) throw new UserException("--DBSNP (-D) argument currently does not support VCF. 
To use dbSNP in VCF format, please use -B:dbsnp,vcf ."); + final Tags tags = parser.getTags(argCollection.DBSNPFile); String fileName = expandFileName(argCollection.DBSNPFile); RMDStorageType storageType = fileName.toLowerCase().endsWith("stdin") ? RMDStorageType.STREAM : RMDStorageType.FILE; - rodBindings.add(new RMDTriplet(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME,"dbsnp",fileName,storageType)); + rodBindings.add(new RMDTriplet(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME,"dbsnp",fileName,storageType,tags)); } return rodBindings; diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java index 4680c003a..60b68bda5 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.datasources.rmd; import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; @@ -124,6 +125,15 @@ public class ReferenceOrderedDataSource { return header; } + public Tags getTags() { + return fileDescriptor.getTags(); + } + + public String getTagValue( final String key ) { + return fileDescriptor.getTags().getValue( key ); + } + + /** * Retrieves the sequence dictionary created by this ROD. 
* @return diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java index 23063ee16..c2057ad5e 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java @@ -33,6 +33,7 @@ import org.broad.tribble.index.Index; import org.broad.tribble.index.IndexFactory; import org.broad.tribble.source.BasicFeatureSource; import org.broad.tribble.util.LittleEndianOutputStream; +import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; @@ -168,7 +169,7 @@ public class RMDTrackBuilder extends PluginManager { if(typeName == null) throw new ReviewedStingException("Unable to find type name for class " + targetClass.getName()); - return createInstanceOfTrack(new RMDTriplet("anonymous",typeName,inputFile.getAbsolutePath(),RMDStorageType.FILE)); + return createInstanceOfTrack(new RMDTriplet("anonymous",typeName,inputFile.getAbsolutePath(),RMDStorageType.FILE,new Tags())); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDTriplet.java b/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDTriplet.java index 3f23d94c8..72c345d0f 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDTriplet.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDTriplet.java @@ -24,6 +24,8 @@ package org.broadinstitute.sting.gatk.refdata.utils; +import org.broadinstitute.sting.commandline.Tags; + /** * a helper class to manage our triplets of data for the -B command line option (name, type, file) * TODO: The presence of four datapoints here suggests that this class' name isn't sufficient to describe its function. 
Rename. @@ -35,12 +37,14 @@ public class RMDTriplet { private final String type; private final String file; private final RMDStorageType storageType; + private final Tags tags; - public RMDTriplet(final String name, final String type, final String file, final RMDStorageType storageType) { + public RMDTriplet(final String name, final String type, final String file, final RMDStorageType storageType, final Tags tags) { this.name = name; this.type = type; this.file = file; this.storageType = storageType; + this.tags = tags; } /** @@ -75,4 +79,12 @@ public class RMDTriplet { public RMDStorageType getStorageType() { return storageType; } + + /** + * Gets the key=value tags associated with this track + * @return Tags associated with this track. + */ + public Tags getTags() { + return tags; + } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java index e74f0732b..b20fbbe59 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java @@ -54,7 +54,6 @@ public class GenerateVariantClustersWalker extends RodWalker { ///////////////////////////// // Command Line Arguments ///////////////////////////// - @Argument(fullName="fdr_filter_level", shortName="fdr_filter_level", doc="The FDR level at which to start filtering.", required=false) - private double FDR_FILTER_LEVEL = 1.0; - + @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false) + private double TS_FILTER_LEVEL = 99.0; + @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", 
required=false) + private String[] IGNORE_INPUT_FILTERS = null; + ///////////////////////////// // Private Member Variables ///////////////////////////// final private List tranches = new ArrayList(); final private Set inputNames = new HashSet(); final private NestedHashMap lodMap = new NestedHashMap(); + final private Set ignoreInputFilterSet = new TreeSet(); //--------------------------------------------------------------------------------------------------------------- // @@ -92,13 +95,12 @@ public class ApplyRecalibration extends RodWalker { public void initialize() { for ( final Tranche t : Tranche.readTranches(TRANCHES_FILE) ) { - if ( t.fdr >= FDR_FILTER_LEVEL) { + if ( t.ts >= TS_FILTER_LEVEL ) { tranches.add(t); - //statusMsg = "Keeping, above FDR threshold"; } logger.info(String.format("Read tranche " + t)); } - Collections.reverse(tranches); // this algorithm wants the tranches ordered from worst to best + Collections.reverse(tranches); // this algorithm wants the tranches ordered from best (lowest truth sensitivity) to worst (highest truth sensitivity) for( final ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) { if( d.getName().startsWith("input") ) { @@ -113,6 +115,10 @@ public class ApplyRecalibration extends RodWalker { throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." 
); } + if( IGNORE_INPUT_FILTERS != null ) { + ignoreInputFilterSet.addAll( Arrays.asList(IGNORE_INPUT_FILTERS) ); + } + // setup the header fields final Set hInfo = new HashSet(); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames)); @@ -122,14 +128,14 @@ public class ApplyRecalibration extends RodWalker { if( tranches.size() >= 2 ) { for( int iii = 0; iii < tranches.size() - 1; iii++ ) { - Tranche t = tranches.get(iii); - hInfo.add(new VCFFilterHeaderLine(t.name, String.format("FDR tranche level at VSQ Lod: " + t.minVQSLod + " <= x < " + tranches.get(iii+1).minVQSLod))); + final Tranche t = tranches.get(iii); + hInfo.add(new VCFFilterHeaderLine(t.name, String.format("Truth sensitivity tranche level at VSQ Lod: " + t.minVQSLod + " <= x < " + tranches.get(iii+1).minVQSLod))); } } if( tranches.size() >= 1 ) { - hInfo.add(new VCFFilterHeaderLine(tranches.get(0).name + "+", String.format("FDR tranche level at VQS Lod < " + tranches.get(0).minVQSLod))); + hInfo.add(new VCFFilterHeaderLine(tranches.get(0).name + "+", String.format("Truth sensitivity tranche level at VQS Lod < " + tranches.get(0).minVQSLod))); } else { - throw new UserException("No tranches were found in the file or were above the FDR Filter level " + FDR_FILTER_LEVEL); + throw new UserException("No tranches were found in the file or were above the truth sensitivity filter level " + TS_FILTER_LEVEL); } logger.info("Keeping all variants in tranche " + tranches.get(tranches.size()-1)); @@ -141,7 +147,7 @@ public class ApplyRecalibration extends RodWalker { logger.info("Reading in recalibration table..."); for ( final String line : new XReadLines( RECAL_FILE ) ) { final String[] vals = line.split(","); - lodMap.put( Double.parseDouble(vals[2]), vals[0], Integer.parseInt(vals[1])); + lodMap.put( Double.parseDouble(vals[3]), vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys } } catch ( FileNotFoundException e ) { throw new 
UserException.CouldNotReadInputFile(RECAL_FILE, e); @@ -164,38 +170,31 @@ public class ApplyRecalibration extends RodWalker { for( VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), true, false) ) { if( vc != null ) { String filterString = null; - final Double lod = (Double) lodMap.get( ref.getLocus().getContig(), ref.getLocus().getStart() ); - if( !vc.isFiltered() ) { - try { - for( int i = tranches.size() - 1; i >= 0; i-- ) { - final Tranche tranche = tranches.get(i); - if( lod >= tranche.minVQSLod ) { - if (i == tranches.size() - 1) { - filterString = VCFConstants.PASSES_FILTERS_v4; - } else { - filterString = tranche.name; - } - break; - } - } - - if( filterString == null ) { - filterString = tranches.get(0).name+"+"; - } - - if ( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) { - final Set filters = new HashSet(); - filters.add(filterString); - vc = VariantContext.modifyFilters(vc, filters); - } - } catch ( Exception e ) { - throw new UserException.MalformedFile(vc.getSource(), "Invalid value for VQS key " + ContrastiveRecalibrator.VQS_LOD_KEY + " at variant " + vc, e); - } - } - final Map attrs = new HashMap(vc.getAttributes()); - if(lod != null) { + final Double lod = (Double) lodMap.get( ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop() ); + if( vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters()) ) { attrs.put(ContrastiveRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod)); + for( int i = tranches.size() - 1; i >= 0; i-- ) { + final Tranche tranche = tranches.get(i); + if( lod >= tranche.minVQSLod ) { + if( i == tranches.size() - 1 ) { + filterString = VCFConstants.PASSES_FILTERS_v4; + } else { + filterString = tranche.name; + } + break; + } + } + + if( filterString == null ) { + filterString = tranches.get(0).name+"+"; + } + + if( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) { + final Set filters = new HashSet(); + filters.add(filterString); + vc 
= VariantContext.modifyFilters(vc, filters); + } } vcfWriter.add( VariantContext.modifyPErrorFiltersAndAttributes(vc, vc.getNegLog10PError(), vc.getFilters(), attrs), ref.getBase() ); @@ -219,7 +218,7 @@ public class ApplyRecalibration extends RodWalker { return 1; // This value isn't used for anything } - public void onTraversalDone( Integer reduceSum ) { + public void onTraversalDone( final Integer reduceSum ) { } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/ContrastiveRecalibrator.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/ContrastiveRecalibrator.java index cca347385..98f7ad473 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/ContrastiveRecalibrator.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/ContrastiveRecalibrator.java @@ -36,7 +36,6 @@ import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -72,8 +71,10 @@ public class ContrastiveRecalibrator extends RodWalker inputNames = new HashSet(); - private VariantRecalibratorEngine engine = new VariantRecalibratorEngine(new VariantRecalibratorArgumentCollection()); //BUGBUG: doesn't do anything with the args at the moment + private final Set ignoreInputFilterSet = new TreeSet(); + private final Set inputNames = new HashSet(); + private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine(new VariantRecalibratorArgumentCollection()); //BUGBUG: doesn't do anything with the args at the 
moment //--------------------------------------------------------------------------------------------------------------- // @@ -102,35 +103,27 @@ public class ContrastiveRecalibrator extends RodWalker(Arrays.asList(USE_ANNOTATIONS)) ); + if( IGNORE_INPUT_FILTERS != null ) { + ignoreInputFilterSet.addAll( Arrays.asList(IGNORE_INPUT_FILTERS) ); + } + for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) { if( d.getName().toLowerCase().startsWith("input") ) { inputNames.add(d.getName()); - logger.info("Found input variant track with name " + d.getName()); - } else if ( d.getName().equals("dbsnp") ) { //BUGBUG: values are hard coded for now until command line tagging up for Rod bindings - logger.info("Found dbSNP track: \tKnown = true \tTraining = false \tTruth = false \tPrior = Q10.0"); - dataManager.addTrainingSet(new TrainingSet("dbsnp", true, false, false, 10.0)); - } else if ( d.getName().equals("hapmap") ) { - logger.info("Found HapMap track: \tKnown = false \tTraining = true \tTruth = true \tPrior = Q12.0"); - dataManager.addTrainingSet(new TrainingSet("hapmap", false, true, true, 12.0)); - } else if ( d.getName().equals("1kg") ) { - logger.info("Found 1KG track: \tKnown = false \tTraining = true \tTruth = false \tPrior = Q6.0"); - dataManager.addTrainingSet(new TrainingSet("1kg", false, true, false, 6.0)); - } else if ( d.getName().equals("omni") ) { - logger.info("Found Omni track: \tKnown = false \tTraining = true \tTruth = true \tPrior = Q15.0"); - dataManager.addTrainingSet( new TrainingSet("omni", false, true, true, 15.0) ); + logger.info( "Found input variant track with name " + d.getName() ); } else { - logger.info("WARNING! Not evaluating ROD binding: " + d.getName()); + dataManager.addTrainingSet( new TrainingSet(d.getName(), d.getTags()) ); } } if( !dataManager.checkHasTrainingSet() ) { - throw new UserException.CommandLineException("No training set found! 
Please provide sets of known polymorphic loci to be used as training data using the dbsnp, hapmap, 1kg, or omni rod bindings."); + throw new UserException.CommandLineException( "No training set found! Please provide sets of known polymorphic loci marked with the training=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" ); } if( !dataManager.checkHasTruthSet() ) { - throw new UserException.CommandLineException("No truth set found! Please provide sets of known polymorphic loci to be used as training data using the dbsnp, hapmap, 1kg, or omni rod bindings."); + throw new UserException.CommandLineException( "No truth set found! Please provide sets of known polymorphic loci marked with the truth=true ROD binding tag. For example, -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 hapmapFile.vcf" ); } if( !dataManager.checkHasKnownSet() ) { - throw new UserException.CommandLineException("No known set found! Please provide sets of known polymorphic loci to be used as training data using the dbsnp, hapmap, 1kg, or omni rod bindings."); + throw new UserException.CommandLineException( "No known set found! Please provide sets of known polymorphic loci marked with the known=true ROD binding tag. For example, -B:dbsnp,VCF,known=true,training=false,truth=false,prior=8.0 dbsnpFile.vcf" ); } if( inputNames.size() == 0 ) { @@ -152,7 +145,7 @@ public class ContrastiveRecalibrator extends RodWalker tranches = TrancheManager.findTranches( dataManager.getData(), FDR_TRANCHES, metric, DEBUG_FILE ); //BUGBUG: recreated here to match the integration tests + List tranches = TrancheManager.findTranches( dataManager.getData(), TS_TRANCHES, metric, DEBUG_FILE ); //BUGBUG: recreated here to match the integration tests TRANCHES_FILE.print(Tranche.tranchesString( tranches )); logger.info( "Writing out recalibration table..." 
); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/GaussianMixtureModel.java index a8d00c186..ddc12269f 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/GaussianMixtureModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/GaussianMixtureModel.java @@ -89,7 +89,7 @@ public class GaussianMixtureModel { gaussian.initializeRandomSigma( rand ); gaussian.hyperParameter_a = degreesOfFreedom; gaussian.hyperParameter_b = shrinkage; - gaussian.hyperParameter_lambda = dirichletParameter; + gaussian.hyperParameter_lambda = dirichletParameter; } } @@ -116,7 +116,7 @@ public class GaussianMixtureModel { int numAssigned = 0; for( final VariantDatum datum : data ) { - if( datum.assignment == gaussian ) { + if( datum.assignment.equals(gaussian) ) { numAssigned++; gaussian.incrementMu( datum ); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/MultivariateGaussian.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/MultivariateGaussian.java index dae7b17b1..a3c909bf6 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/MultivariateGaussian.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/MultivariateGaussian.java @@ -55,10 +55,10 @@ public class MultivariateGaussian { for( int iii = 0; iii < mu.length; iii++ ) { for( int jjj = iii; jjj < mu.length; jjj++ ) { randSigma[jjj][iii] = 0.55 + 1.25 * rand.nextDouble(); - if(rand.nextBoolean()) { + if( rand.nextBoolean() ) { randSigma[jjj][iii] *= -1.0; } - if(iii != jjj) { randSigma[iii][jjj] = 0.0; } // Sigma is a symmetric, positive-definite matrix created by taking a lower diagonal matrix and multiplying it by its transpose + if( iii != jjj ) { 
randSigma[iii][jjj] = 0.0; } // Sigma is a symmetric, positive-definite matrix created by taking a lower diagonal matrix and multiplying it by its transpose } } Matrix tmp = new Matrix( randSigma ); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/TrainingSet.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/TrainingSet.java index 49bc1e923..8e12cf544 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/TrainingSet.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/TrainingSet.java @@ -1,28 +1,32 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.Tags; + /** * Created by IntelliJ IDEA. * User: rpoplin * Date: 3/12/11 */ -public class TrainingSet implements Comparable { +public class TrainingSet { public String name; - public boolean isKnown; - public boolean isTraining; - public boolean isTruth; - public double prior; + public boolean isKnown = false; + public boolean isTraining = false; + public boolean isTruth = false; + public double prior = 3.0; - public TrainingSet( final String name, final boolean isKnown, final boolean isTraining, final boolean isTruth, final double prior ) { + protected final static Logger logger = Logger.getLogger(TrainingSet.class); + + public TrainingSet( final String name, final Tags tags ) { this.name = name; - this.isKnown = isKnown; - this.isTraining = isTraining; - this.isTruth = isTruth; - this.prior = prior; - } - - public int compareTo( final TrainingSet other ) { - return Double.compare(this.prior, other.prior); + if( tags != null ) { + isKnown = tags.containsKey("known") && tags.getValue("known").equals("true"); + isTraining = tags.containsKey("training") && tags.getValue("training").equals("true"); + isTruth = tags.containsKey("truth") && 
tags.getValue("truth").equals("true"); + prior = ( tags.containsKey("prior") ? Double.parseDouble(tags.getValue("prior")) : prior ); + } + logger.info(String.format( "Found %s track: \tKnown = %s \tTraining = %s \tTruth = %s \tPrior = Q%.1f", this.name, isKnown, isTraining, isTruth, prior) ); } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/Tranche.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/Tranche.java index f3ea7ad92..b685c9e68 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/Tranche.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/Tranche.java @@ -39,22 +39,21 @@ import java.util.*; */ public class Tranche implements Comparable { - private static final int CURRENT_VERSION = 3; + private static final int CURRENT_VERSION = 4; - public double fdr, minVQSLod, targetTiTv, knownTiTv, novelTiTv; + public double ts, minVQSLod, knownTiTv, novelTiTv; public int numKnown,numNovel; public String name; int accessibleTruthSites = 0; int callsAtTruthSites = 0; - public Tranche(double fdr, double targetTiTv, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites) { - this(fdr, targetTiTv, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, "anonymous"); + public Tranche(double ts, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites) { + this(ts, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, "anonymous"); } - public Tranche(double fdr, double targetTiTv, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites, String name ) { - this.fdr = fdr; - this.targetTiTv = targetTiTv; + public 
Tranche(double ts, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites, String name ) { + this.ts = ts; this.minVQSLod = minVQSLod; this.novelTiTv = novelTiTv; this.numNovel = numNovel; @@ -65,11 +64,8 @@ public class Tranche implements Comparable { this.accessibleTruthSites = accessibleTruthSites; this.callsAtTruthSites = callsAtTruthSites; - if ( fdr < 0.0 ) - throw new UserException("Target FDR is unreasonable " + fdr); - - if ( targetTiTv < 0.5 || targetTiTv > 10 ) - throw new UserException("Target Ti/Tv ratio is unreasonable " + targetTiTv); + if ( ts < 0.0 || ts > 100.0) + throw new UserException("Target FDR is unreasonable " + ts); if ( numKnown < 0 || numNovel < 0) throw new ReviewedStingException("Invalid tranche - no. variants is < 0 : known " + numKnown + " novel " + numNovel); @@ -83,37 +79,35 @@ public class Tranche implements Comparable { } public int compareTo(Tranche other) { - return Double.compare(this.fdr, other.fdr); + return Double.compare(this.ts, other.ts); } public String toString() { - return String.format("Tranche fdr=%.2f minVQSLod=%.4f known=(%d @ %.2f) novel=(%d @ %.2f) truthSites(%d accessible, %d called), name=%s]", - fdr, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, name); + return String.format("Tranche ts=%.2f minVQSLod=%.4f known=(%d @ %.2f) novel=(%d @ %.2f) truthSites(%d accessible, %d called), name=%s]", + ts, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, name); } /** * Returns an appropriately formatted string representing the raw tranches file on disk. 
* - * @param rawTranches + * @param tranches * @return */ - public static String tranchesString(List rawTranches) { - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - PrintStream stream = new PrintStream(bytes); + public static String tranchesString( final List tranches ) { + final ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + final PrintStream stream = new PrintStream(bytes); - List tranches = new ArrayList(); - tranches.addAll(rawTranches); Collections.sort(tranches); stream.println("# Variant quality score tranches file"); stream.println("# Version number " + CURRENT_VERSION); - stream.println("FDRtranche,targetTiTv,numKnown,numNovel,knownTiTv,novelTiTv,minVQSLod,filterName,accessibleTruthSites,callsAtTruthSites,truthSensitivity"); + stream.println("targetTruthSensitivity,numKnown,numNovel,knownTiTv,novelTiTv,minVQSLod,filterName,accessibleTruthSites,callsAtTruthSites,truthSensitivity"); Tranche prev = null; for ( Tranche t : tranches ) { - stream.printf("%.2f,%.2f,%d,%d,%.4f,%.4f,%.4f,FDRtranche%.2fto%.2f,%d,%d,%.4f%n", - t.fdr,t.targetTiTv,t.numKnown,t.numNovel,t.knownTiTv,t.novelTiTv, t.minVQSLod, - (prev == null ? 0.0 : prev.fdr), t.fdr, t.accessibleTruthSites, t.callsAtTruthSites, t.getTruthSensitivity()); + stream.printf("%.2f,%d,%d,%.4f,%.4f,%.4f,TruthSensitivityTranche%.2fto%.2f,%d,%d,%.4f%n", + t.ts, t.numKnown, t.numNovel, t.knownTiTv, t.novelTiTv, t.minVQSLod, + (prev == null ? 0.0 : prev.ts), t.ts, t.accessibleTruthSites, t.callsAtTruthSites, t.getTruthSensitivity()); prev = t; } @@ -160,19 +154,18 @@ public class Tranche implements Comparable { final String[] vals = line.split(","); if( header == null ) { header = vals; - if ( header.length == 5 ) + if ( header.length == 5 || header.length == 8 || header.length == 11 ) // old style tranches file, throw an error throw new UserException.MalformedFile(f, "Unfortunately, your tranches file is from a previous version of this tool and cannot be used with the latest code. 
Please rerun VariantRecalibrator"); - if ( header.length != 8 && header.length != 11 ) - throw new UserException.MalformedFile(f, "Expected 8 elements in header line " + line); + if ( header.length != 10 ) + throw new UserException.MalformedFile(f, "Expected 10 elements in header line " + line); } else { if ( header.length != vals.length ) throw new UserException.MalformedFile(f, "Line had too few/many fields. Header = " + header.length + " vals " + vals.length + " line " + line); Map bindings = new HashMap(); for ( int i = 0; i < vals.length; i++ ) bindings.put(header[i], vals[i]); - tranches.add(new Tranche(getDouble(bindings,"FDRtranche", true), - getDouble(bindings,"targetTiTv", false), + tranches.add(new Tranche(getDouble(bindings,"targetTruthSensitivity", true), getDouble(bindings,"minVQSLod", true), getInteger(bindings,"numKnown", false), getDouble(bindings,"knownTiTv", false), @@ -184,7 +177,7 @@ public class Tranche implements Comparable { } } - Collections.sort(tranches); // sort this in the standard order + Collections.sort(tranches); return tranches; } catch( FileNotFoundException e ) { throw new UserException.CouldNotReadInputFile(f, e); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/TrancheManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/TrancheManager.java index cd8c60cce..49b464eda 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/TrancheManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/TrancheManager.java @@ -91,7 +91,7 @@ public class TrancheManager { } public double getThreshold(double tranche) { - return tranche/100; // tranche of 1 => 99% sensitivity target + return 1.0 - tranche/100.0; // tranche of 1 => 99% sensitivity target } public double getTarget() { return 1.0; } @@ -123,14 +123,14 @@ public class TrancheManager { public static List findTranches( ArrayList 
data, final double[] trancheThresholds, SelectionMetric metric, File debugFile ) { logger.info(String.format("Finding %d tranches for %d variants", trancheThresholds.length, data.size())); - List tranchesData = sortVariantsbyLod(data); - metric.calculateRunningMetric(tranchesData); + Collections.sort(data); + metric.calculateRunningMetric(data); - if ( debugFile != null) { writeTranchesDebuggingInfo(debugFile, tranchesData, metric); } + if ( debugFile != null) { writeTranchesDebuggingInfo(debugFile, data, metric); } List tranches = new ArrayList(); for ( double trancheThreshold : trancheThresholds ) { - Tranche t = findTranche(tranchesData, metric, trancheThreshold); + Tranche t = findTranche(data, metric, trancheThreshold); if ( t == null ) { if ( tranches.size() == 0 ) @@ -159,11 +159,6 @@ public class TrancheManager { } } - private static List sortVariantsbyLod(final ArrayList data) { - Collections.sort(data); - return data; - } - public static Tranche findTranche( final List data, final SelectionMetric metric, final double trancheThreshold ) { logger.info(String.format(" Tranche threshold %.2f => selection metric threshold %.3f", trancheThreshold, metric.getThreshold(trancheThreshold))); @@ -171,8 +166,8 @@ public class TrancheManager { int n = data.size(); for ( int i = 0; i < n; i++ ) { if ( metric.getRunningMetric(i) >= metricThreshold ) { - // we've found the largest group of variants with Ti/Tv >= our target titv - Tranche t = trancheOfVariants(data, i, trancheThreshold, metric.getTarget()); + // we've found the largest group of variants with sensitivity >= our target truth sensitivity + Tranche t = trancheOfVariants(data, i, trancheThreshold); logger.info(String.format(" Found tranche for %.3f: %.3f threshold starting with variant %d; running score is %.3f ", trancheThreshold, metricThreshold, i, metric.getRunningMetric(i))); logger.info(String.format(" Tranche is %s", t)); @@ -183,7 +178,7 @@ public class TrancheManager { return null; } - public static 
Tranche trancheOfVariants( final List data, int minI, double fdr, double target ) { + public static Tranche trancheOfVariants( final List data, int minI, double ts ) { int numKnown = 0, numNovel = 0, knownTi = 0, knownTv = 0, novelTi = 0, novelTv = 0; double minLod = data.get(minI).lod; @@ -207,7 +202,7 @@ public class TrancheManager { int accessibleTruthSites = countCallsAtTruth(data, Double.NEGATIVE_INFINITY); int nCallsAtTruth = countCallsAtTruth(data, minLod); - return new Tranche(fdr, target, minLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, nCallsAtTruth); + return new Tranche(ts, minLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, nCallsAtTruth); } public static double fdrToTiTv(double desiredFDR, double targetTiTv) { @@ -216,7 +211,7 @@ public class TrancheManager { public static int countCallsAtTruth(final List data, double minLOD ) { int n = 0; - for ( VariantDatum d : data) { n += d.atTruthSite && d.lod >= minLOD ? 1 : 0; } + for ( VariantDatum d : data) { n += (d.atTruthSite && d.lod >= minLOD ? 
1 : 0); } return n; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VariantDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VariantDataManager.java index 4cfd4e325..507cda88a 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -27,7 +27,7 @@ public class VariantDataManager { private final ExpandingArrayList trainingSets; private final static long RANDOM_SEED = 83409701; - private final static Random rand = new Random( RANDOM_SEED ); // this is going to cause problems if it is ever used in an integration test + private final static Random rand = new Random( RANDOM_SEED ); // this is going to cause problems if it is ever used in an integration test, planning to get rid of HRun anyway protected final static Logger logger = Logger.getLogger(VariantDataManager.class); @@ -53,11 +53,7 @@ public class VariantDataManager { final double theMean = mean(jjj); //BUGBUG: to clean up final double theSTD = standardDeviation(theMean, jjj); //BUGBUG: to clean up logger.info( annotationKeys.get(jjj) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) ); - if( theSTD < 1E-8 ) { //BUGBUG: output recreated here to match the integration tests - foundZeroVarianceAnnotation = true; - } else if( theSTD < 1E-2 ) { - logger.warn("Warning! Tiny variance. It is strongly recommended that you -exclude " + annotationKeys.get(jjj)); - } + foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-8); meanVector[jjj] = theMean; varianceVector[jjj] = theSTD; for( final VariantDatum datum : data ) { @@ -65,7 +61,7 @@ public class VariantDataManager { } } if( foundZeroVarianceAnnotation ) { - throw new UserException.BadInput("Found annotations with zero variance. 
They must be excluded before proceeding."); + throw new UserException.BadInput( "Found annotations with zero variance. They must be excluded before proceeding." ); } } @@ -136,7 +132,7 @@ public class VariantDataManager { final ExpandingArrayList dataToRemove = new ExpandingArrayList(); for( final VariantDatum datum : listData ) { boolean remove = false; - for( double val : datum.annotations ) { + for( final double val : datum.annotations ) { remove = remove || (Math.abs(val) > STD_THRESHOLD); } if( remove ) { dataToRemove.add( datum ); } @@ -163,8 +159,8 @@ public class VariantDataManager { } else { try { value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) ); - } catch( Exception e ) { - throw new UserException.MalformedFile(vc.getSource(), "No double value detected for annotation = " + annotationKey + " in variant at " + VariantContextUtils.getLocation(genomeLocParser,vc) + ", reported annotation value = " + vc.getAttribute( annotationKey ), e ); + } catch( final Exception e ) { + throw new UserException.MalformedFile( vc.getSource(), "No double value detected for annotation = " + annotationKey + " in variant at " + VariantContextUtils.getLocation(genomeLocParser,vc) + ", reported annotation value = " + vc.getAttribute( annotationKey ), e ); } } return value; @@ -188,8 +184,8 @@ public class VariantDataManager { } public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) { - for( final VariantDatum datum : data ) { //BUGBUG: quick and dirty for now, only works for SNPs. 
- RECAL_FILE.println(String.format("%s,%d,%.4f", datum.pos.getContig(), datum.pos.getStart(), datum.lod)); + for( final VariantDatum datum : data ) { + RECAL_FILE.println(String.format("%s,%d,%d,%.4f", datum.pos.getContig(), datum.pos.getStart(), datum.pos.getStop(), datum.lod)); } } } diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java index ccf4ae17b..b32473b9d 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.datasources.providers; +import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; @@ -78,7 +79,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { @Test public void testSingleBinding() { String fileName = testDir + "TabularDataTest.dat"; - RMDTriplet triplet = new RMDTriplet("tableTest","Table",fileName,RMDStorageType.FILE); + RMDTriplet triplet = new RMDTriplet("tableTest","Table",fileName,RMDStorageType.FILE,new Tags()); ReferenceOrderedDataSource dataSource = new ReferenceOrderedDataSource(triplet,builder,seq.getSequenceDictionary(),genomeLocParser,false); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chrM",1,30))); @@ -101,10 +102,10 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { public void testMultipleBinding() { File file = new File(testDir + "TabularDataTest.dat"); - RMDTriplet testTriplet1 = new RMDTriplet("tableTest1","Table",file.getAbsolutePath(),RMDStorageType.FILE); + RMDTriplet 
testTriplet1 = new RMDTriplet("tableTest1","Table",file.getAbsolutePath(),RMDStorageType.FILE,new Tags()); ReferenceOrderedDataSource dataSource1 = new ReferenceOrderedDataSource(testTriplet1,builder,seq.getSequenceDictionary(),genomeLocParser,false); - RMDTriplet testTriplet2 = new RMDTriplet("tableTest2","Table",file.getAbsolutePath(),RMDStorageType.FILE); + RMDTriplet testTriplet2 = new RMDTriplet("tableTest2","Table",file.getAbsolutePath(),RMDStorageType.FILE,new Tags()); ReferenceOrderedDataSource dataSource2 = new ReferenceOrderedDataSource(testTriplet2,builder,seq.getSequenceDictionary(),genomeLocParser,false); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chrM",1,30))); diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java index 13f0b3ac1..5b0d67e88 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.datasources.rmd; +import org.broadinstitute.sting.commandline.Tags; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; @@ -73,7 +74,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { File indexFileName = new File(testDir + "TabularDataTest.dat.idx"); if (indexFileName.exists()) indexFileName.delete(); - triplet = new RMDTriplet("tableTest","Table",fileName,RMDStorageType.FILE); + triplet = new RMDTriplet("tableTest","Table",fileName,RMDStorageType.FILE,new Tags()); builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null); } diff --git 
a/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersV2IntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersV2IntegrationTest.java index 816ba8056..edae5bf9d 100755 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersV2IntegrationTest.java +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersV2IntegrationTest.java @@ -25,14 +25,14 @@ public class VariantRecalibrationWalkersV2IntegrationTest extends WalkerTest { } VRTest yriTrio = new VRTest("yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", - "aa211561e6e9b1e323dec4eeaa318088", // tranches - "226adf939e90ad20599108e77ad25dae", // recal file - "06c2335e860e19f23bcacff8d4ab906d"); // cut VCF + "644e37fe6edaee13ddf9f3c4f0e6f500", // tranches + "e467bfd4ab74c7a7ff303cf810c514b3", // recal file + "05f66c959d1f8e78bcb4298092524921"); // cut VCF VRTest lowPass = new VRTest("lowpass.N3.chr1.raw.vcf", - "f4f2057a8c010206f0f56deff0602452", // tranches - "dd36d252a6dc6e3207addddae731dd88", // recal file - "af4fe8bd0c2eca92d1990b1a49fae810"); // cut VCF + "e0a3845619d2138717df39ef1e9ee75f", // tranches + "c3fa524d0bdb40d6bd7aeda9e8dd882c", // recal file + "9f21a668c3fff13f382367851a11303c"); // cut VCF @DataProvider(name = "VRTest") public Object[][] createData1() { @@ -44,14 +44,14 @@ public class VariantRecalibrationWalkersV2IntegrationTest extends WalkerTest { //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + - " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + - " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" + - " -B:omni,VCF " + comparisonDataLocation + "Validated/Omni2.5_chip/1212samples.b36.vcf" + + " 
-B:dbsnp,DBSNP,known=true,training=false,truth=false,prior=10.0 " + GATKDataLocation + "dbsnp_129_b36.rod" + + " -B:hapmap,VCF,known=false,training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" + + " -B:omni,VCF,known=false,training=true,truth=true,prior=15.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/1212samples.b36.vcf" + " -T ContrastiveRecalibrator" + " -B:input,VCF " + params.inVCF + " -L 1:50,000,000-120,000,000" + " -an QD -an MQ -an SB -an AF" + - " --trustAllPolymorphic" + + " --trustAllPolymorphic" + // for speed " -recalFile %s" + " -tranchesFile %s", Arrays.asList(params.recalMD5, params.tranchesMD5)); diff --git a/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala b/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala index aa044aa28..785e0d42a 100755 --- a/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala +++ b/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala @@ -112,40 +112,40 @@ class MethodsDevelopmentCallingPipeline extends QScript { "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask", new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"), new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.07, 1.0, !lowPass), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.07, 99.0, !lowPass), "HiSeq19" -> new Target("NA12878.HiSeq19", hg19, dbSNP_b37_129, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.HiSeq19.filtered.vcf"), - 
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 3.0, !lowPass), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 97.0, !lowPass), "GA2hg19" -> new Target("NA12878.GA2.hg19", hg19, dbSNP_b37_129, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.GA2.WGS.bwa.cleaned.hg19.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.GA2.hg19.filtered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 1.0, !lowPass), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass), "WEx" -> new Target("NA12878.WEx", hg18, dbSNP_hg18_129, hapmap_hg18, "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask", new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"), new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 2.6, 3.0, !lowPass), + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 2.6, 97.0, !lowPass), "WExTrio" -> new Target("CEUTrio.WEx", hg19, dbSNP_b37_129, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.bwa.cleaned.recal.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, 3.0, !lowPass), + 
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, 97.0, !lowPass), "FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"), new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 1.0, lowPass), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 99.0, lowPass), "TGPWExGdA" -> new Target("1000G.WEx.GdA", b37, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/1kg/processing/pipeline_test_bams/Barcoded_1000G_WEx_Reduced_Plate_1.cleaned.list"), // BUGBUG: reduce from 60 to 20 people new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, 1.0, !lowPass), + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, 99.0, !lowPass), "LowPassN60" -> new Target("lowpass.N60", b36, dbSNP_b36, hapmap_b36, indelMask_b36, new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/lowpass.chr20.cleaned.matefixed.bam"), // the bam list to call from new File("/home/radon01/depristo/work/oneOffProjects/VQSRCutByNRS/lowpass.N60.chr20.filtered.vcf"), // the gold standard VCF file to run through the VQSR - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, 1.0, 
lowPass), // chunked interval list to use with Queue's scatter/gather functionality + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, 99.0, lowPass), // chunked interval list to use with Queue's scatter/gather functionality "LowPassEUR363Nov" -> new Target("EUR.nov2010", b37, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/1kg/processing/pipeline_test_bams/EUR.363sample.Nov2010.chr20.bam"), new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 1.0, lowPass) + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 99.0, lowPass) ) @@ -189,6 +189,7 @@ class MethodsDevelopmentCallingPipeline extends QScript { // 1.) Unified Genotyper Base class GenotyperBase (t: Target) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS { + this.memoryLimit = 3 this.reference_sequence = t.reference this.intervalsString ++= List(t.intervals) this.scatterCount = 63 // the smallest interval list has 63 intervals, one for each Mb on chr20 @@ -225,6 +226,7 @@ class MethodsDevelopmentCallingPipeline extends QScript { // 2.) Hard Filtering for indels class indelFilter (t: Target) extends VariantFiltration with UNIVERSAL_GATK_ARGS { + this.memoryLimit = 2 this.reference_sequence = t.reference this.intervalsString ++= List(t.intervals) this.scatterCount = 10 @@ -243,24 +245,24 @@ class MethodsDevelopmentCallingPipeline extends QScript { // 3.) 
Variant Quality Score Recalibration - Generate Recalibration table class VQSR(t: Target, goldStandard: Boolean) extends ContrastiveRecalibrator with UNIVERSAL_GATK_ARGS { - this.memoryLimit = 6 + this.memoryLimit = 4 this.reference_sequence = t.reference this.intervalsString ++= List(t.intervals) this.rodBind :+= RodBind("input", "VCF", if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } ) - this.rodBind :+= RodBind("hapmap", "VCF", t.hapmapFile) + this.rodBind :+= RodBind("hapmap", "VCF", t.hapmapFile, "known=false,training=true,truth=true,prior=12.0") if( t.hapmapFile.contains("b37") ) - this.rodBind :+= RodBind("1kg", "VCF", omni_b37) + this.rodBind :+= RodBind("omni", "VCF", omni_b37, "known=false,training=true,truth=true,prior=15.0") else if( t.hapmapFile.contains("b36") ) - this.rodBind :+= RodBind("1kg", "VCF", omni_b36) + this.rodBind :+= RodBind("omni", "VCF", omni_b36, "known=false,training=true,truth=true,prior=15.0") if (t.dbsnpFile.endsWith(".rod")) - this.DBSNP = new File(t.dbsnpFile) + this.rodBind :+= RodBind("dbsnp", "DBSNP", t.dbsnpFile, "known=true,training=false,truth=false,prior=10.0") else if (t.dbsnpFile.endsWith(".vcf")) - this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile) + this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile, "known=true,training=false,truth=false,prior=10.0") this.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun") this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesFile } this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalFile } this.allPoly = true - this.tranche ++= List("0.1", "0.5", "0.7", "1.0", "1.1", "1.2", "1.5", "1.6", "1.7", "1.8", "1.9", "2.0", "2.1", "2.2", "2.5","3.0", "5.0", "10.0") + this.tranche ++= List("100.0", "99.9", "99.5", "99.3", "99.0", "98.9", "98.8", "98.5", "98.4", "98.3", "98.2", "98.1", "98.0", "97.9", "97.8", "97.5", "97.0", "95.0", "90.0") this.analysisName = t.name + "_VQSR" this.jobName = queueLogDir + 
t.name + ".VQSR" } @@ -279,9 +281,9 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.jobName = queueLogDir + t.name + ".applyVQSR" } - - // 6.) Variant Evaluation Base(OPTIONAL) + // 5.) Variant Evaluation Base(OPTIONAL) class EvalBase(t: Target) extends VariantEval with UNIVERSAL_GATK_ARGS { + this.memoryLimit = 3 this.reference_sequence = t.reference this.rodBind :+= RodBind("comphapmap", "VCF", t.hapmapFile) this.intervalsString ++= List(t.intervals) @@ -292,7 +294,7 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.sample = samples } - // 6a.) SNP Evaluation (OPTIONAL) based on the cut vcf + // 5a.) SNP Evaluation (OPTIONAL) based on the cut vcf class snpEvaluation(t: Target) extends EvalBase(t) { if (t.reference == b37 || t.reference == hg19) this.rodBind :+= RodBind("compomni", "VCF", omni_b37) this.rodBind :+= RodBind("eval", "VCF", t.recalibratedVCF ) @@ -301,7 +303,7 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.jobName = queueLogDir + t.name + ".snp.eval" } - // 6b.) Indel Evaluation (OPTIONAL) + // 5b.) Indel Evaluation (OPTIONAL) class indelEvaluation(t: Target) extends EvalBase(t) { this.rodBind :+= RodBind("eval", "VCF", t.filteredIndelVCF) this.evalModule :+= "IndelStatistics"