diff --git a/build.xml b/build.xml index 85955d774..ef53f6aa4 100644 --- a/build.xml +++ b/build.xml @@ -49,7 +49,7 @@ - + @@ -489,7 +489,7 @@ docletpathref="doclet.classpath" classpathref="external.dependencies" classpath="${java.classes}" - additionalparam="-private -build-timestamp "${build.timestamp}" -absolute-version ${build.version} -quiet -J-Xdebug -J-Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=5005"> + additionalparam="-private -build-timestamp "${build.timestamp}" -absolute-version ${build.version} -quiet -J-Xdebug -J-Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=5005"> diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java index b9e380295..2ff8aa979 100755 --- a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java +++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum; import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.text.XReadLines; import java.io.*; @@ -42,19 +43,71 @@ import java.util.Map; import java.util.regex.Pattern; /** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Dec 1, 2009 + * Call R scripts to plot residual error versus the various covariates. * - * Create collapsed versions of the recal csv file and call R scripts to plot residual error versus the various covariates. + *

+ * After counting covariates in either the initial BAM File or again in the recalibrated BAM File, an analysis tool is available which + * reads the .csv file and outputs several PDF (and .dat) files for each read group in the given BAM. These PDF files graphically + * show the various metrics and characteristics of the reported quality scores (often in relation to the empirical qualities). + * In order to show that any biases in the reported quality scores have been generally fixed through recalibration one should run + * CountCovariates again on a bam file produced by TableRecalibration. In this way users can compare the analysis plots generated + * by pre-recalibration and post-recalibration .csv files. Our usual chain of commands that we use to generate plots of residual + * error is: CountCovariates, TableRecalibrate, samtools index on the recalibrated bam file, CountCovariates again on the recalibrated + * bam file, and then AnalyzeCovariates on both the before and after recal_data.csv files to see the improvement in recalibration. + * + *

+ * The color coding along with the RMSE is included in the plots to give some indication of the number of observations that went into + * each of the quality score estimates. It is defined as follows for N, the number of observations: + * + *

    + *
  • light blue means N < 1,000
  • + *
  • cornflower blue means 1,000 <= N < 10,000
  • + *
  • dark blue means N >= 10,000
  • + *
  • The pink dots indicate points whose quality scores are special codes used by the aligner and which are mathematically + * meaningless and so aren't included in any of the numerical calculations.
  • + *
+ * + *

+ * NOTE: For those running this tool externally from the Broad, it is crucial to note that both the -Rscript and -resources options + * must be changed from the default. -Rscript needs to point to your installation of Rscript (this is the scripting version of R, + * not the interactive version) while -resources needs to point to the folder holding the R scripts that are used. For those using + * this tool as part of the Binary Distribution the -resources should point to the resources folder that is part of the tarball. + * For those using this tool by building from the git repository the -resources should point to the R/ subdirectory of the Sting checkout. + * + *

+ * See the GATK wiki for a tutorial and example recalibration accuracy plots. + * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration + * + *

Input

+ *

+ * The recalibration table file in CSV format that was generated by the CountCovariates walker. + *

+ * + *

Examples

+ *
+ * java -Xmx4g -jar AnalyzeCovariates.jar \
+ *   -recalFile /path/to/recal.table.csv  \
+ *   -outputDir /path/to/output_dir/  \
+ *   -resources resources/  \
+ *   -ignoreQ 5
+ * 
+ * */ +@DocumentedGATKFeature( + groupName = "AnalyzeCovariates", + summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator") public class AnalyzeCovariates extends CommandLineProgram { ///////////////////////////// // Command Line Arguments ///////////////////////////// - + /** + * After the header, data records occur one per line until the end of the file. The first several items on a line are the + * values of the individual covariates and will change depending on which covariates were specified at runtime. The last + * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, + * and the raw empirical quality score calculated by phred-scaling the mismatch rate. + */ @Input(fullName = "recal_file", shortName = "recalFile", doc = "The input recal csv file to analyze", required = false) private String RECAL_FILE = "output.recal_data.csv"; @Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false) @@ -67,11 +120,20 @@ public class AnalyzeCovariates extends CommandLineProgram { private int IGNORE_QSCORES_LESS_THAN = 5; @Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false) private int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups + + /** + * Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation + * by capping at the specified value. We've found that Q40 is too low when using a more complete database of known variation like dbSNP build 132 or later. 
+ */ @Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default is 50") private int MAX_QUALITY_SCORE = 50; + + /** + * This argument is useful for comparing before/after plots and you want the axes to match each other. + */ @Argument(fullName="max_histogram_value", shortName="maxHist", required = false, doc="If supplied, this value will be the max value of the histogram plots") private int MAX_HISTOGRAM_VALUE = 0; - @Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, this value will be the max value of the histogram plots") + @Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, do indel quality plotting") private boolean DO_INDEL_QUALITY = false; diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java new file mode 100644 index 000000000..9350e4a66 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java @@ -0,0 +1,4 @@ +/** + * Package to plot residual accuracy versus error covariates for the base quality score recalibrator. 
+ */ +package org.broadinstitute.sting.analyzecovariates; \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/commandline/Advanced.java b/public/java/src/org/broadinstitute/sting/commandline/Advanced.java new file mode 100644 index 000000000..7aeefe261 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/commandline/Advanced.java @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.commandline; + +import java.lang.annotation.*; + +/** + * Indicates that a walker argument is considered an advanced option. 
+ * + * @author Mark DePristo + * @version 0.1 + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target({ElementType.TYPE,ElementType.FIELD}) +public @interface Advanced { +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java index e0e2ac378..8ec0d650a 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java @@ -151,6 +151,14 @@ public class ArgumentSource { return field.isAnnotationPresent(Hidden.class) || field.isAnnotationPresent(Deprecated.class); } + /** + * Is the given argument considered an advanced option when displaying on the command-line argument system. + * @return True if so. False otherwise. + */ + public boolean isAdvanced() { + return field.isAnnotationPresent(Advanced.class); + } + /** * Is this command-line argument dependent on some primitive argument types? * @return True if this command-line argument depends on other arguments; false otherwise. 
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java index d1d4ff914..ff992d77d 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java @@ -325,7 +325,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { @Override public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) { - Class parameterType = getParameterizedTypeClass(type); + Class parameterType = JVMUtils.getParameterizedTypeClass(type); return RodBinding.makeUnbound((Class)parameterType); } @@ -338,6 +338,8 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); String value = getArgumentValue( defaultDefinition, matches ); + Class parameterType = JVMUtils.getParameterizedTypeClass(type); + try { String name = defaultDefinition.fullName; String tribbleType = null; @@ -372,19 +374,19 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file); if ( featureDescriptor != null ) { tribbleType = featureDescriptor.getName(); - logger.warn("Dynamically determined type of " + file + " to be " + tribbleType); + logger.info("Dynamically determined type of " + file + " to be " + tribbleType); } } + + if ( tribbleType == null ) + throw new UserException.CommandLineException( + String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. 
" + + "Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s", + manager.userFriendlyListOfAvailableFeatures(parameterType))); } } - if ( tribbleType == null ) // error handling - throw new UserException.CommandLineException( - String.format("Could not parse argument %s with value %s", - defaultDefinition.fullName, value)); - Constructor ctor = (makeRawTypeIfNecessary(type)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class); - Class parameterType = getParameterizedTypeClass(type); RodBinding result = (RodBinding)ctor.newInstance(parameterType, name, value, tribbleType, tags); parsingEngine.addTags(result,tags); parsingEngine.addRodBinding(result); @@ -395,20 +397,10 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { value, source.field.getName())); } catch (Exception e) { throw new UserException.CommandLineException( - String.format("Failed to parse value %s for argument %s.", - value, source.field.getName())); + String.format("Failed to parse value %s for argument %s. 
Message: %s", + value, source.field.getName(), e.getMessage())); } } - - private Class getParameterizedTypeClass(Type t) { - if ( t instanceof ParameterizedType ) { - ParameterizedType parameterizedType = (ParameterizedType)t; - if ( parameterizedType.getActualTypeArguments().length != 1 ) - throw new ReviewedStingException("BUG: more than 1 generic type found on class" + t); - return (Class)parameterizedType.getActualTypeArguments()[0]; - } else - throw new ReviewedStingException("BUG: could not find generic type on class " + t); - } } /** diff --git a/public/java/src/org/broadinstitute/sting/commandline/Output.java b/public/java/src/org/broadinstitute/sting/commandline/Output.java index 22565dbf5..f8aef0355 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/Output.java +++ b/public/java/src/org/broadinstitute/sting/commandline/Output.java @@ -55,7 +55,7 @@ public @interface Output { * --help argument is specified. * @return Doc string associated with this command-line argument. */ - String doc() default "An output file presented to the walker. Will overwrite contents if file exists."; + String doc() default "An output file created by the walker. Will overwrite contents if file exists"; /** * Is this argument required. 
If true, the command-line argument system will diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java index 32132c7ca..32002e093 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java @@ -96,24 +96,23 @@ public abstract class CommandLineExecutable extends CommandLineProgram { loadArgumentsIntoObject(walker); argumentSources.add(walker); - Collection newStyle = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser); + Collection rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser); // todo: remove me when the old style system is removed if ( getArgumentCollection().RODBindings.size() > 0 ) { logger.warn("################################################################################"); logger.warn("################################################################################"); - logger.warn("Deprecated -B rod binding syntax detected. This syntax will be retired in GATK 1.2."); + logger.warn("Deprecated -B rod binding syntax detected. 
This syntax has been eliminated in GATK 1.2."); logger.warn("Please use arguments defined by each specific walker instead."); for ( String oldStyleRodBinding : getArgumentCollection().RODBindings ) { logger.warn(" -B rod binding with value " + oldStyleRodBinding + " tags: " + parser.getTags(oldStyleRodBinding).getPositionalTags()); } logger.warn("################################################################################"); logger.warn("################################################################################"); + System.exit(1); } - Collection oldStyle = ListFileUtils.unpackRODBindingsOldStyle(getArgumentCollection().RODBindings, parser); - oldStyle.addAll(newStyle); - engine.setReferenceMetaDataFiles(oldStyle); + engine.setReferenceMetaDataFiles(rodBindings); for (ReadFilter filter: filters) { loadArgumentsIntoObject(filter); diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index 7e96b609e..b8488dc9a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -31,13 +31,11 @@ import org.broadinstitute.sting.commandline.ArgumentCollection; import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; import org.broadinstitute.sting.gatk.walkers.Attribution; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.ApplicationDetails; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.GATKDocUtils; -import org.broadinstitute.sting.utils.help.GATKDoclet; +import org.broadinstitute.sting.utils.help.*; import 
org.broadinstitute.sting.utils.text.TextFormattingUtils; import java.util.*; @@ -52,7 +50,7 @@ import java.util.*; @DocumentedGATKFeature( groupName = "GATK Engine", summary = "Features and arguments for the GATK engine itself, available to all walkers.", - extraDocs = { ReadFilter.class, UserException.class }) + extraDocs = { UserException.class }) public class CommandLineGATK extends CommandLineExecutable { @Argument(fullName = "analysis_type", shortName = "T", doc = "Type of analysis to run") private String analysisName = null; @@ -177,6 +175,10 @@ public class CommandLineGATK extends CommandLineExecutable { StringBuilder additionalHelp = new StringBuilder(); Formatter formatter = new Formatter(additionalHelp); + formatter.format("Available Reference Ordered Data types:%n"); + formatter.format(new FeatureManager().userFriendlyListOfAvailableFeatures()); + formatter.format("%n"); + formatter.format("For a full description of this walker, see its GATKdocs at:%n"); formatter.format("%s%n", GATKDocUtils.helpLinksToGATKDocs(walkerType)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index b0c4e203b..5b9ebd99b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -689,8 +689,6 @@ public class GenomeAnalysisEngine { validateSuppliedReads(); readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference()); - sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles); - for (ReadFilter filter : filters) filter.initialize(this); @@ -963,7 +961,7 @@ public class GenomeAnalysisEngine { /** * Get the list of intervals passed to the engine. - * @return List of intervals. 
+ * @return List of intervals, or null if no intervals are in use */ public GenomeLocSortedSet getIntervals() { return this.intervals; diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java index ce638ff2b..2f4dd06e2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java @@ -39,8 +39,7 @@ import org.simpleframework.xml.*; public class DbsnpArgumentCollection { /** - * A dbSNP VCF file. Variants in this track will be treated as "known" variants - * in tools using this track. + * A dbSNP VCF file. */ @Input(fullName="dbsnp", shortName = "D", doc="dbSNP file", required=false) public RodBinding dbsnp; diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 62135f21b..fd39d46b0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -101,6 +101,8 @@ public class GATKArgumentCollection { @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) public File referenceFile = null; + @Deprecated + @Hidden @ElementList(required = false) @Input(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form :, ", required = false) public ArrayList RODBindings = new ArrayList(); @@ -340,14 +342,6 @@ public class GATKArgumentCollection { return false; } } - if (other.RODBindings.size() != RODBindings.size()) { - return false; - } - for (int x = 0; x < RODBindings.size(); x++) { - if (!RODBindings.get(x).equals(other.RODBindings.get(x))) { - return false; - } - } if 
(!other.samFiles.equals(this.samFiles)) { return false; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java similarity index 95% rename from public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java index cd77a9e7e..4ec451567 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java @@ -34,7 +34,7 @@ import net.sf.samtools.SAMRecord; * Filter out FailsVendorQualityCheck reads. */ -public class FailsVendorQualityCheckReadFilter extends ReadFilter { +public class FailsVendorQualityCheckFilter extends ReadFilter { public boolean filterOut( final SAMRecord read ) { return read.getReadFailsVendorQualityCheckFlag(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java similarity index 96% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java index 75369b306..ed9c37dca 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java @@ -35,7 +35,7 @@ import org.broadinstitute.sting.commandline.Argument; * @version 0.1 */ -public class MappingQualityReadFilter extends ReadFilter { +public class MappingQualityFilter extends ReadFilter { @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", 
required = false) public int MIN_MAPPING_QUALTY_SCORE = 10; diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java similarity index 95% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java index 1afec36d1..ccdb40d31 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java @@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.QualityUtils; * @version 0.1 */ -public class MappingQualityUnavailableReadFilter extends ReadFilter { +public class MappingQualityUnavailableFilter extends ReadFilter { public boolean filterOut(SAMRecord rec) { return (rec.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java similarity index 95% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java index e49d4117c..57db8419c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java @@ -33,7 +33,7 @@ import net.sf.samtools.SAMRecord; * @version 0.1 */ -public class MappingQualityZeroReadFilter extends ReadFilter { +public class MappingQualityZeroFilter extends ReadFilter { public boolean filterOut(SAMRecord rec) { return (rec.getMappingQuality() == 0); } diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java similarity index 95% rename from public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java index 31c2144ce..50cd30f71 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java @@ -34,7 +34,7 @@ import net.sf.samtools.SAMRecord; * Filter out duplicate reads. */ -public class NotPrimaryAlignmentReadFilter extends ReadFilter { +public class NotPrimaryAlignmentFilter extends ReadFilter { public boolean filterOut( final SAMRecord read ) { return read.getNotPrimaryAlignmentFlag(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index b9aaf47de..286e22369 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -333,10 +333,6 @@ public class RefMetaDataTracker { return addValues(name, type, new ArrayList(), getTrackDataByName(name), onlyAtThisLoc, true, false); } @Deprecated - public List getValues(final Class type, final Collection names, final GenomeLoc onlyAtThisLoc) { - return addValues(names, type, new ArrayList(), onlyAtThisLoc, true, false); - } - @Deprecated public T getFirstValue(final Class type, final String name) { return safeGetFirst(getValues(type, name)); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index 902f9d308..bf490e28c 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -1,10 +1,11 @@ package org.broadinstitute.sting.gatk.refdata; +import net.sf.samtools.util.SequenceUtil; import org.broad.tribble.Feature; +import org.broad.tribble.annotation.Strand; import org.broad.tribble.dbsnp.DbSNPFeature; import org.broad.tribble.gelitext.GeliTextFeature; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.codecs.hapmap.RawHapMapFeature; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; @@ -92,6 +93,67 @@ public class VariantContextAdaptors { // -------------------------------------------------------------------------------------------------------------- private static class DBSnpAdaptor implements VCAdaptor { + private static boolean isSNP(DbSNPFeature feature) { + return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact"); + } + + private static boolean isMNP(DbSNPFeature feature) { + return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range"); + } + + private static boolean isInsertion(DbSNPFeature feature) { + return feature.getVariantType().contains("insertion"); + } + + private static boolean isDeletion(DbSNPFeature feature) { + return feature.getVariantType().contains("deletion"); + } + + private static boolean isIndel(DbSNPFeature feature) { + return isInsertion(feature) || isDeletion(feature) || isComplexIndel(feature); + } + + public static boolean isComplexIndel(DbSNPFeature feature) { + return feature.getVariantType().contains("in-del"); + } + + /** + * gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference base. 
This is returned as a string list with no guarantee ordering + * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest + * frequency). + * + * @return an alternate allele list + */ + public static List getAlternateAlleleList(DbSNPFeature feature) { + List ret = new ArrayList(); + for (String allele : getAlleleList(feature)) + if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele); + return ret; + } + + /** + * gets the alleles. This method should return all the alleles present at the location, + * including the reference base. The first allele should always be the reference allele, followed + * by an unordered list of alternate alleles. + * + * @return an alternate allele list + */ + public static List getAlleleList(DbSNPFeature feature) { + List alleleList = new ArrayList(); + // add ref first + if ( feature.getStrand() == Strand.POSITIVE ) + alleleList = Arrays.asList(feature.getObserved()); + else + for (String str : feature.getObserved()) + alleleList.add(SequenceUtil.reverseComplement(str)); + if ( alleleList.size() > 0 && alleleList.contains(feature.getNCBIRefBase()) + && !alleleList.get(0).equals(feature.getNCBIRefBase()) ) + Collections.swap(alleleList, alleleList.indexOf(feature.getNCBIRefBase()), 0); + + return alleleList; + } + /** * Converts non-VCF formatted dbSNP records to VariantContext. * @return DbSNPFeature. @@ -102,18 +164,18 @@ public class VariantContextAdaptors { @Override public VariantContext convert(String name, Object input, ReferenceContext ref) { DbSNPFeature dbsnp = (DbSNPFeature)input; - if ( ! Allele.acceptableAlleleBases(DbSNPHelper.getReference(dbsnp)) ) + if ( ! 
Allele.acceptableAlleleBases(dbsnp.getNCBIRefBase()) ) return null; - Allele refAllele = Allele.create(DbSNPHelper.getReference(dbsnp), true); + Allele refAllele = Allele.create(dbsnp.getNCBIRefBase(), true); - if ( DbSNPHelper.isSNP(dbsnp) || DbSNPHelper.isIndel(dbsnp) || DbSNPHelper.isMNP(dbsnp) || dbsnp.getVariantType().contains("mixed") ) { + if ( isSNP(dbsnp) || isIndel(dbsnp) || isMNP(dbsnp) || dbsnp.getVariantType().contains("mixed") ) { // add the reference allele List alleles = new ArrayList(); alleles.add(refAllele); // add all of the alt alleles boolean sawNullAllele = refAllele.isNull(); - for ( String alt : DbSNPHelper.getAlternateAlleleList(dbsnp) ) { + for ( String alt : getAlternateAlleleList(dbsnp) ) { if ( ! Allele.acceptableAlleleBases(alt) ) { //System.out.printf("Excluding dbsnp record %s%n", dbsnp); return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/DbSNPHelper.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/DbSNPHelper.java deleted file mode 100644 index a2132cee5..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/DbSNPHelper.java +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata.features; - -import net.sf.samtools.util.SequenceUtil; -import org.broad.tribble.annotation.Strand; -import org.broad.tribble.dbsnp.DbSNPFeature; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -/** - * this class contains static helper methods for DbSNP - */ -public class DbSNPHelper { - - private DbSNPHelper() {} // don't make a DbSNPHelper - - public static String rsIDOfFirstRealVariant(List VCs, VariantContext.Type type) { - if ( VCs == null ) - return null; - - String rsID = null; - for ( VariantContext vc : VCs ) { - if ( vc.getType() == type ) { - rsID = vc.getID(); - break; - } - } - - return rsID; - } - - /** - * get the -1 * (log 10 of the error value) - * - * @return the log based error estimate - */ - public static double getNegLog10PError(DbSNPFeature feature) { - return 4; // -log10(0.0001) - } - - // - // What kind of variant are we? 
- // - // ---------------------------------------------------------------------- - public static boolean isSNP(DbSNPFeature feature) { - return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact"); - } - - public static boolean isMNP(DbSNPFeature feature) { - return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range"); - } - - public static String toMediumString(DbSNPFeature feature) { - String s = String.format("%s:%d:%s:%s", feature.getChr(), feature.getStart(), feature.getRsID(), Utils.join("",feature.getObserved())); - if (isSNP(feature)) s += ":SNP"; - if (isIndel(feature)) s += ":Indel"; - if (isHapmap(feature)) s += ":Hapmap"; - if (is2Hit2Allele(feature)) s += ":2Hit"; - return s; - } - - public static boolean isInsertion(DbSNPFeature feature) { - return feature.getVariantType().contains("insertion"); - } - - public static boolean isDeletion(DbSNPFeature feature) { - return feature.getVariantType().contains("deletion"); - } - - public static boolean isIndel(DbSNPFeature feature) { - return DbSNPHelper.isInsertion(feature) || DbSNPHelper.isDeletion(feature) || DbSNPHelper.isComplexIndel(feature); - } - - public static boolean isComplexIndel(DbSNPFeature feature) { - return feature.getVariantType().contains("in-del"); - } - - public static boolean isHapmap(DbSNPFeature feature) { - return feature.getValidationStatus().contains("by-hapmap"); - } - - public static boolean is2Hit2Allele(DbSNPFeature feature) { - return feature.getValidationStatus().contains("by-2hit-2allele"); - } - - public static boolean is1000genomes(DbSNPFeature feature) { - return feature.getValidationStatus().contains("by-1000genomes"); - } - - public static boolean isMQ1(DbSNPFeature feature) { - return feature.getWeight() == 1; - } - - /** - * gets the alternate alleles. This method should return all the alleles present at the location, - * NOT including the reference base. 
This is returned as a string list with no guarantee ordering - * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest - * frequency). - * - * @return an alternate allele list - */ - public static List getAlternateAlleleList(DbSNPFeature feature) { - List ret = new ArrayList(); - for (String allele : getAlleleList(feature)) - if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele); - return ret; - } - - public static boolean onFwdStrand(DbSNPFeature feature) { - return feature.getStrand() == Strand.POSITIVE; - } - - public static String getReference(DbSNPFeature feature) { - return feature.getNCBIRefBase(); - } - - public static String toSimpleString(DbSNPFeature feature) { - return String.format("%s:%s:%s", feature.getRsID(), feature.getObserved(), (feature.getStrand() == Strand.POSITIVE) ? "+" : "-"); - } - - /** - * gets the alleles. This method should return all the alleles present at the location, - * including the reference base. The first allele should always be the reference allele, followed - * by an unordered list of alternate alleles. 
- * - * @return an alternate allele list - */ - public static List getAlleleList(DbSNPFeature feature) { - List alleleList = new ArrayList(); - // add ref first - if ( onFwdStrand(feature) ) - alleleList = Arrays.asList(feature.getObserved()); - else - for (String str : feature.getObserved()) - alleleList.add(SequenceUtil.reverseComplement(str)); - if ( alleleList.size() > 0 && alleleList.contains(getReference(feature)) && !alleleList.get(0).equals(getReference(feature)) ) - Collections.swap(alleleList, alleleList.indexOf(getReference(feature)), 0); - - return alleleList; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java index 26a400071..c99aea254 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java @@ -36,7 +36,10 @@ import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.help.GATKDocUtils; +import org.broadinstitute.sting.utils.help.HelpUtils; +import javax.mail.Header; import java.io.File; import java.util.*; @@ -50,7 +53,7 @@ import java.util.*; * @author depristo */ public class FeatureManager { - public static class FeatureDescriptor { + public static class FeatureDescriptor implements Comparable { final String name; final FeatureCodec codec; @@ -62,6 +65,7 @@ public class FeatureManager { public String getName() { return name; } + public String getSimpleFeatureName() { return getFeatureClass().getSimpleName(); } public FeatureCodec getCodec() { return codec; } @@ -70,13 +74,18 @@ public class FeatureManager { @Override public String toString() { - return 
String.format("FeatureDescriptor name=%s codec=%s feature=%s", getName(), getCodecClass().getName(), getFeatureClass().getName()); + return String.format("FeatureDescriptor name=%s codec=%s feature=%s", + getName(), getCodecClass().getName(), getFeatureClass().getName()); + } + + @Override + public int compareTo(FeatureDescriptor o) { + return getName().compareTo(o.getName()); } } private final PluginManager pluginManager; - private final Collection featureDescriptors = new HashSet(); - + private final Collection featureDescriptors = new TreeSet(); /** * Construct a FeatureManager @@ -114,7 +123,7 @@ public class FeatureManager { */ @Requires("featureClass != null") public Collection getByFeature(Class featureClass) { - Set consistentDescriptors = new HashSet(); + Set consistentDescriptors = new TreeSet(); if (featureClass == null) throw new IllegalArgumentException("trackRecordType value is null, please pass in an actual class object"); @@ -189,10 +198,40 @@ public class FeatureManager { */ @Ensures("result != null") public String userFriendlyListOfAvailableFeatures() { - List names = new ArrayList(); - for ( final FeatureDescriptor descriptor : featureDescriptors ) - names.add(descriptor.getName()); - return Utils.join(",", names); + return userFriendlyListOfAvailableFeatures(Feature.class); + } + + /** + * Returns a list of the available tribble track names (vcf,dbsnp,etc) that we can load + * restricted to only Codecs producting Features consistent with the requiredFeatureType + * @return + */ + @Ensures("result != null") + public String userFriendlyListOfAvailableFeatures(Class requiredFeatureType) { + final String nameHeader="Name", featureHeader = "FeatureType", docHeader="Documentation"; + + int maxNameLen = nameHeader.length(), maxFeatureNameLen = featureHeader.length(); + for ( final FeatureDescriptor descriptor : featureDescriptors ) { + if ( requiredFeatureType.isAssignableFrom(descriptor.getFeatureClass()) ) { + maxNameLen = Math.max(maxNameLen, 
descriptor.getName().length()); + maxFeatureNameLen = Math.max(maxFeatureNameLen, descriptor.getSimpleFeatureName().length()); + } + } + + StringBuilder docs = new StringBuilder(); + String format = "%" + maxNameLen + "s %" + maxFeatureNameLen + "s %s%n"; + docs.append(String.format(format, nameHeader, featureHeader, docHeader)); + for ( final FeatureDescriptor descriptor : featureDescriptors ) { + if ( requiredFeatureType.isAssignableFrom(descriptor.getFeatureClass()) ) { + String oneDoc = String.format(format, + descriptor.getName(), + descriptor.getSimpleFeatureName(), + GATKDocUtils.helpLinksToGATKDocs(descriptor.getCodecClass())); + docs.append(oneDoc); + } + } + + return docs.toString(); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java index 76b0276cd..bb65d9b09 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java @@ -30,7 +30,9 @@ import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.picard.reference.ReferenceSequenceFileFactory; import net.sf.samtools.SAMRecord; import net.sf.samtools.util.StringUtil; +import org.broadinstitute.sting.commandline.Advanced; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; @@ -42,7 +44,6 @@ import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation; import org.broadinstitute.sting.utils.clipreads.ReadClipper; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.yaml.snakeyaml.events.SequenceStartEvent; import java.io.File; import java.io.PrintStream; @@ -51,44 +52,158 @@ 
import java.util.regex.Matcher; import java.util.regex.Pattern; /** - * This ReadWalker provides simple, yet powerful read clipping capabilities. It allows the user to clip bases in reads - * with poor quality scores, that match particular sequences, or that were generated by particular machine cycles. + * This tool provides simple, powerful read clipping capabilities to remove low quality strings of bases, sections of reads, and reads containing user-provided sequences. + * + * + *

+ * It allows the user to clip bases in reads with poor quality scores, that match particular + * sequences, or that were generated by particular machine cycles. + * + *

+ *
Quality score based clipping
+ *
+ * Clip bases from the read in clipper from + *
argmax_x{ \sum_{i = x + 1}^l (qTrimmingThreshold - qual_i) }
+ * to the end of the read. This is blatantly stolen from BWA. + * + * Walk through the read from the end (in machine cycle order) to the beginning, calculating the + * running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this + * sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the + * clipping index in the read (from the end). + *
+ *
Cycle based clipping
+ *
Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc. + * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based values (positions). + * For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, and 12. + *
+ *
Sequence matching
+ *
Clips bases from reads that exactly match one of a number of base sequences. This employs an exact match algorithm, + * filtering only bases whose sequence exactly matches SEQ.
+ *
+ * + *

+ * + *

Input

+ *

+ * Any number of BAM files. + *

+ * + *

Output

+ *

+ * A new BAM file containing all of the reads from the input BAMs with the user-specified clipping + * operation applied to each read. + *

+ *

+ *

Summary output

+ *
+ *     Number of examined reads              13
+ *     Number of clipped reads               13
+ *     Percent of clipped reads              100.00
+ *     Number of examined bases              988
+ *     Number of clipped bases               126
+ *     Percent of clipped bases              12.75
+ *     Number of quality-score clipped bases 126
+ *     Number of range clipped bases         0
+ *     Number of sequence clipped bases      0
+ *     
+ *

+ * + *

+ *

Example clipping

+ * Suppose we are given this read: + *
+ *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
+ *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *     
+ * + * If we are clipping reads with -QT 10 and -CR WRITE_NS, we get: + * + *
+ *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
+ *          NNNNNNNNNNNNNNNNNTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *     
+ * + * Whereas with -CR WRITE_Q0S: + *
+ *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
+ *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ *          !!!!!!!!!!!!!!!!!4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *     
+ * + * Or -CR SOFTCLIP_BASES: + *
+ *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3133    29      17S59M  *       *       *
+ *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *     
+ *

+ * + *

Examples

+ *
+ *     -T ClipReads -I my.bam -I your.bam -o my_and_your.clipped.bam -R Homo_sapiens_assembly18.fasta \
+ *     -XF seqsToClip.fasta -X CCCCC -CT "1-5,11-15" -QT 10
+ * 
+ + * @author Mark DePristo + * @since 2010 */ @Requires({DataSource.READS}) public class ClipReadsWalker extends ReadWalker { - @Output - PrintStream out; + /** + * If provided, ClipReads will write summary statistics about the clipping operations applied + * to the reads to this file. + */ + @Output(fullName = "outputStatistics", shortName = "os", doc = "Write output statistics to this file", required = false) + PrintStream out = null; /** - * an optional argument to dump the reads out to a BAM file + * The output SAM/BAM file will be written here */ - @Argument(fullName = "outputBam", shortName = "ob", doc = "Write output to this BAM filename instead of STDOUT", required = false) - StingSAMFileWriter outputBam = null; + @Output(doc = "Write BAM output here", required = true) + StingSAMFileWriter outputBam; - @Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "", required = false) + /** + * If a value > 0 is provided, then the quality score based read clipper will be applied to the reads using this + * quality score threshold. + */ + @Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "If provided, the Q-score clipper will be applied", required = false) int qTrimmingThreshold = -1; - @Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String of the form 1-10,20-30 indicating machine cycles to clip from the reads", required = false) + /** + * Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc. + * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based + * values (positions). For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, + * and 12. 
+ */ + @Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String indicating machine cycles to clip from the reads", required = false) String cyclesToClipArg = null; - @Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching these sequences", required = false) + /** + * Reads the sequences in the provided FASTA file, and clip any bases that exactly match any of the + * sequences in the file. + */ + @Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching the sequences in this FASTA file", required = false) String clipSequenceFile = null; + /** + * Clips bases from the reads matching the provided SEQ. Can be provided any number of times on the command line + */ @Argument(fullName = "clipSequence", shortName = "X", doc = "Remove sequences within reads matching this sequence", required = false) String[] clipSequencesArgs = null; - @Argument(fullName="read", doc="", required=false) - String onlyDoRead = null; - - //@Argument(fullName = "keepCompletelyClipped", shortName = "KCC", doc = "Unfortunately, sometimes a read is completely clipped away but with SOFTCLIP_BASES this results in an invalid CIGAR string. ", required = false) - //boolean keepCompletelyClippedReads = false; - -// @Argument(fullName = "onlyClipFirstSeqMatch", shortName = "ESC", doc="Only clip the first occurrence of a clipping sequence, rather than all subsequences within a read that match", required = false) -// boolean onlyClipFirstSeqMatch = false; - + /** + * The different values for this argument determines how ClipReads applies clips to the reads. This can range + * from writing Ns over the clipped bases to hard clipping away the bases from the BAM. 
+ */ @Argument(fullName = "clipRepresentation", shortName = "CR", doc = "How should we actually clip the bases?", required = false) ClippingRepresentation clippingRepresentation = ClippingRepresentation.WRITE_NS; + @Hidden + @Advanced + @Argument(fullName="read", doc="", required=false) + String onlyDoRead = null; /** * List of sequence that should be clipped from the reads diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java index 4bfedb672..e2db1dc52 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java @@ -2,7 +2,7 @@ package org.broadinstitute.sting.gatk.walkers; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentReadFilter; +import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.utils.GenomeLoc; @@ -17,7 +17,7 @@ import java.util.Set; * To change this template use File | Settings | File Templates. */ @Requires({DataSource.READS,DataSource.REFERENCE}) -@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentReadFilter.class}) +@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class}) public abstract class DuplicateWalker extends Walker { // Do we actually want to operate on the context? 
public boolean filter(GenomeLoc loc, AlignmentContext context, Set> readSets ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java index b0b2687f4..8152f74c2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java @@ -3,8 +3,8 @@ package org.broadinstitute.sting.gatk.walkers; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; -import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckReadFilter; -import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentReadFilter; +import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter; +import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -18,7 +18,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @By(DataSource.READS) @Requires({DataSource.READS,DataSource.REFERENCE, DataSource.REFERENCE_BASES}) @PartitionBy(PartitionType.INTERVAL) -@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentReadFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckReadFilter.class}) +@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class}) public abstract class LocusWalker extends Walker { // Do we actually want to operate on the context? 
public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java index 6243a6cc0..4d8be4800 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers; import org.broad.tribble.Feature; -import org.broad.tribble.dbsnp.DbSNPFeature; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; @@ -34,9 +33,6 @@ import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java index 7e1dcd707..fdfac6bf7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java @@ -40,26 +40,65 @@ import java.util.TreeSet; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; + /** - * Renders, in SAM/BAM format, all reads from the 
input data set in the order in which they appear - * in the input file. It can dynamically merge the contents of multiple input BAM files, resulting - * in merged output sorted in coordinate order. Can also optionally filter reads based on the --read-filter - * command line argument. + * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file. + * + *

+ * PrintReads can dynamically merge the contents of multiple input BAM files, resulting + * in merged output sorted in coordinate order. Can also optionally filter reads based on the + * --read_filter command line argument. + * + *

Input

+ *

+ * One or more bam files. + *

+ * + *

Output

+ *

+ * A single processed bam file. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T PrintReads \
+ *   -o output.bam \
+ *   -I input1.bam \
+ *   -I input2.bam \
+ *   --read_filter MappingQualityZero
+ * 
+ * */ @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) @Requires({DataSource.READS, DataSource.REFERENCE}) public class PrintReadsWalker extends ReadWalker { - /** an optional argument to dump the reads out to a BAM file */ + @Output(doc="Write output to this BAM filename instead of STDOUT") SAMFileWriter out; + @Argument(fullName = "readGroup", shortName = "readGroup", doc="Exclude all reads with this read group from the output", required = false) String readGroup = null; + + /** + * For example, --platform ILLUMINA or --platform 454. + */ @Argument(fullName = "platform", shortName = "platform", doc="Exclude all reads with this platform from the output", required = false) - String platform = null; // E.g. ILLUMINA, 454 + String platform = null; + @Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false) int nReadsToPrint = -1; + + /** + * Only reads from samples listed in the provided file(s) will be included in the output. + */ @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false) public Set sampleFile = new TreeSet(); + + /** + * Only reads from the sample(s) will be included in the output. + */ @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. 
Can be specified multiple times.", required=false) public Set sampleNames = new TreeSet(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java index 6a2ffe189..cf68a9121 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java @@ -90,7 +90,7 @@ public class AlleleBalance extends InfoFieldAnnotation { } // todo -- actually care about indel length from the pileup (agnostic at the moment) int refCount = indelPileup.size(); - int altCount = vc.isInsertion() ? indelPileup.getNumberOfInsertions() : indelPileup.getNumberOfDeletions(); + int altCount = vc.isSimpleInsertion() ? indelPileup.getNumberOfInsertions() : indelPileup.getNumberOfDeletions(); if ( refCount + altCount == 0 ) { continue; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java index 4102d811c..463f7a645 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java @@ -79,7 +79,7 @@ public class HomopolymerRun extends InfoFieldAnnotation implements StandardAnnot GenomeLoc locus = ref.getLocus(); GenomeLoc window = ref.getWindow(); int refBasePos = (int) (locus.getStart() - window.getStart())+1; - if ( vc.isDeletion() ) { + if ( vc.isSimpleDeletion() ) { // check that deleted bases are the same byte dBase = bases[refBasePos]; for ( int i = 0; i < vc.getReference().length(); i ++ ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java index ff7f9a8f6..bfede40d2 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java @@ -36,9 +36,9 @@ public class IndelType extends InfoFieldAnnotation implements ExperimentalAnnota if (!vc.isBiallelic()) type = "MULTIALLELIC_INDEL"; else { - if (vc.isInsertion()) + if (vc.isSimpleInsertion()) type = "INS."; - else if (vc.isDeletion()) + else if (vc.isSimpleDeletion()) type = "DEL."; else type = "OTHER."; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index fc5014885..350c683c2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -161,19 +161,19 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio public List getDescriptions() { return Arrays.asList( - new VCFInfoHeaderLine(GENE_ID_KEY, 1, VCFHeaderLineType.String, "Gene ID"), - new VCFInfoHeaderLine(GENE_NAME_KEY, 1, VCFHeaderLineType.String, "Gene name"), - new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY, 1, VCFHeaderLineType.String, "Transcript ID"), - new VCFInfoHeaderLine(EXON_ID_KEY, 1, VCFHeaderLineType.String, "Exon ID"), - new VCFInfoHeaderLine(EXON_RANK_KEY, 1, VCFHeaderLineType.Integer, "Exon rank"), - new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY, 0, VCFHeaderLineType.Flag, "If present, gene is non-coding"), - new VCFInfoHeaderLine(EFFECT_KEY, 1, VCFHeaderLineType.String, "One of the most high-impact effects across all transcripts at this site"), - new VCFInfoHeaderLine(EFFECT_IMPACT_KEY, 1, VCFHeaderLineType.String, "Impact of the effect " + Arrays.toString(SnpEffConstants.EffectImpact.values())), - new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String, "Additional information about the effect"), - new 
VCFInfoHeaderLine(OLD_NEW_AA_KEY, 1, VCFHeaderLineType.String, "Old/New amino acid"), - new VCFInfoHeaderLine(OLD_NEW_CODON_KEY, 1, VCFHeaderLineType.String, "Old/New codon"), - new VCFInfoHeaderLine(CODON_NUM_KEY, 1, VCFHeaderLineType.Integer, "Codon number"), - new VCFInfoHeaderLine(CDS_SIZE_KEY, 1, VCFHeaderLineType.Integer, "CDS size") + new VCFInfoHeaderLine(GENE_ID_KEY, 1, VCFHeaderLineType.String, "Gene ID for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(GENE_NAME_KEY, 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY, 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(EXON_ID_KEY, 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(EXON_RANK_KEY, 1, VCFHeaderLineType.Integer, "Exon rank for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY, 0, VCFHeaderLineType.Flag, "If this flag is present, the highest-impact effect resulting from the current variant is within a non-coding gene"), + new VCFInfoHeaderLine(EFFECT_KEY, 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"), + new VCFInfoHeaderLine(EFFECT_IMPACT_KEY, 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(SnpEffConstants.EffectImpact.values())), + new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String, "Additional information about the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(OLD_NEW_AA_KEY, 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current 
variant"), + new VCFInfoHeaderLine(OLD_NEW_CODON_KEY, 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(CODON_NUM_KEY, 1, VCFHeaderLineType.Integer, "Codon number for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(CDS_SIZE_KEY, 1, VCFHeaderLineType.Integer, "CDS size for the highest-impact effect resulting from the current variant") ); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 8c8bd19d0..96a400c68 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -49,7 +49,34 @@ import java.util.*; /** - * Annotates variant calls with context information. Users can specify which of the available annotations to use. + * Annotates variant calls with context information. + * + *

+ * VariantAnnotator is a GATK tool for annotating variant calls based on their context. + * The tool is modular; new annotations can be written easily without modifying VariantAnnotator itself. + * + *

Input

+ *

+ * A variant set to annotate and optionally one or more BAM files. + *

+ * + *

Output

+ *

+ * An annotated VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T VariantAnnotator \
+ *   -I input.bam \
+ *   -o output.vcf \
+ *   -A DepthOfCoverage
+ *   --variant input.vcf \
+ *   --dbsnp dbsnp.vcf
+ * 
+ * */ @Requires(value={}) @Allows(value={DataSource.READS, DataSource.REFERENCE}) @@ -69,8 +96,6 @@ public class VariantAnnotator extends RodWalker implements Ann public RodBinding getSnpEffRodBinding() { return snpEffFile; } /** - * A dbSNP VCF file from which to annotate. - * * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. */ @ArgumentCollection @@ -101,15 +126,25 @@ public class VariantAnnotator extends RodWalker implements Ann @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; - @Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false) - protected String sampleName = null; - + /** + * See the -list argument to view available annotations. + */ @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) protected List annotationsToUse = new ArrayList(); + /** + * See the -list argument to view available groups. + */ @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) protected List annotationGroupsToUse = new ArrayList(); + /** + * This option enables you to add annotations from one VCF to another. + * + * For example, if you want to annotate your 'variant' VCF with the AC field value from the rod bound to 'resource', + * you can specify '-E resource.AC' and records in the output VCF will be annotated with 'resource.AC=N' when a record exists in that rod at the given position. + * If multiple records in the rod overlap the given position, one is chosen arbitrarily. 
+ */ @Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls; see documentation for more details", required=false) protected List expressionsToUse = new ArrayList(); @@ -127,8 +162,6 @@ public class VariantAnnotator extends RodWalker implements Ann @Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false) protected boolean indelsOnly = false; - private HashMap nonVCFsampleName = new HashMap(); - private VariantAnnotatorEngine engine; private Collection indelBufferContext; @@ -164,12 +197,6 @@ public class VariantAnnotator extends RodWalker implements Ann List rodName = Arrays.asList(variantCollection.variants.getName()); Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName); - // add the non-VCF sample from the command-line, if applicable - if ( sampleName != null ) { - nonVCFsampleName.put(sampleName.toUpperCase(), "variant"); - samples.add(sampleName.toUpperCase()); - } - // if there are no valid samples, warn the user if ( samples.size() == 0 ) { logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired."); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index a7837813a..01926a7f3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -29,15 +29,11 @@ import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import 
org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; -import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -158,7 +154,7 @@ public class VariantAnnotatorEngine { private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) { if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { - String rsID = DbSNPHelper.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType()); + String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType()); infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null); // annotate dbsnp id if available and not already there if ( rsID != null && (!vc.hasID() || vc.getID().equals(VCFConstants.EMPTY_ID_FIELD)) ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index 40e6748ed..60f0fcb0a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -31,7 +31,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.beagle.BeagleFeature; +import org.broadinstitute.sting.utils.codecs.beagle.BeagleFeature; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; @@ -48,6 +48,31 @@ import static java.lang.Math.log10; /** * Takes files produced by Beagle imputation engine and creates a vcf with modified annotations. + * + *

This walker is intended to be run after Beagle has successfully executed. The full calling sequence for using Beagle along with the GATK is:

+ * + *

1. Run ProduceBeagleInputWalker.

+ *

2. Run Beagle

+ *

3. Uncompress output files

+ *

4. Run BeagleOutputToVCFWalker.

+ * + * + * Note that this walker requires all input files produced by Beagle. + * + * + *

Example

+ *
+ *     java -Xmx4000m -jar dist/GenomeAnalysisTK.jar \
+ *      -R reffile.fasta -T BeagleOutputToVCF \
+ *      -V input_vcf.vcf \
+ *      -beagleR2:BEAGLE /myrun.beagle_output.r2 \
+ *      -beaglePhased:BEAGLE /myrun.beagle_output.phased \
+ *      -beagleProbs:BEAGLE /myrun.beagle_output.gprobs \
+ *      -o output_vcf.vcf
+ *      
+ +

Note that Beagle produces some of these files compressed as .gz, so gunzip must be run on them before the walker is run in order to decompress them

+ */ public class BeagleOutputToVCFWalker extends RodWalker { @@ -57,22 +82,18 @@ public class BeagleOutputToVCFWalker extends RodWalker { @Input(fullName="comp", shortName = "comp", doc="Comparison VCF file", required=false) public RodBinding comp; - @Input(fullName="beagleR2", shortName = "beagleR2", doc="VCF file", required=true) + @Input(fullName="beagleR2", shortName = "beagleR2", doc="Beagle-produced .r2 file containing R^2 values for all markers", required=true) public RodBinding beagleR2; - @Input(fullName="beagleProbs", shortName = "beagleProbs", doc="VCF file", required=true) + @Input(fullName="beagleProbs", shortName = "beagleProbs", doc="Beagle-produced .probs file containing posterior genotype probabilities", required=true) public RodBinding beagleProbs; - @Input(fullName="beaglePhased", shortName = "beaglePhased", doc="VCF file", required=true) + @Input(fullName="beaglePhased", shortName = "beaglePhased", doc="Beagle-produced .phased file containing phased genotypes", required=true) public RodBinding beaglePhased; - @Output(doc="File to which variants should be written",required=true) + @Output(doc="VCF File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; - @Argument(fullName="output_file", shortName="output", doc="Please use --out instead" ,required=false) - @Deprecated - protected String oldOutputArg; - @Argument(fullName="dont_mark_monomorphic_sites_as_filtered", shortName="keep_monomorphic", doc="If provided, we won't filter sites that beagle tags as monomorphic. 
Useful for imputing a sample's genotypes from a reference panel" ,required=false) public boolean DONT_FILTER_MONOMORPHIC_SITES = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index c1508cf83..07793fd7b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -48,19 +48,45 @@ import java.io.PrintStream; import java.util.*; /** - * Produces an input file to Beagle imputation engine, listing genotype likelihoods for each sample in input variant file + * Converts the input VCF into a format accepted by the Beagle imputation/analysis program. + *

+ * + *

Input

+ *

+ * A VCF with variants to convert to Beagle format + *

+ * + *

Outputs

+ *

+ * A single text file which can be fed to Beagle + *

+ *

+ * Optional: A file with a list of markers + *

+ * + *

Examples

+ *
+ *     java -Xmx2g -jar dist/GenomeAnalysisTK.jar -L 20 \
+ *      -R reffile.fasta -T ProduceBeagleInput \
+ *      -V path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
+ * 
+ * */ + public class ProduceBeagleInputWalker extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Input(fullName="validation", shortName = "validation", doc="Input VCF file", required=false) + @Hidden + @Input(fullName="validation", shortName = "validation", doc="Validation VCF file", required=false) public RodBinding validation; + @Output(doc="File to which BEAGLE input should be written",required=true) protected PrintStream beagleWriter = null; - @Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false) + @Hidden + @Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false) protected PrintStream markers = null; int markerCounter = 1; @@ -73,14 +99,19 @@ public class ProduceBeagleInputWalker extends RodWalker { @Argument(doc="VQSqual key", shortName = "vqskey", required=false) protected String VQSLOD_KEY = "VQSqual"; - @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false) + @Hidden + @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false) public double insertedNoCallRate = 0; - @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false) + @Hidden + @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. 
Will override GL field.", required = false) public double validationPrior = -1.0; - @Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false) + @Hidden + @Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false) public double bootstrap = 0.0; - @Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false) + @Hidden + @Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false) VCFWriter bootstrapVCFOutput = null; + @Argument(fullName = "checkIsMaleOnChrX", shortName = "checkIsMaleOnChrX", doc = "Set to true when Beagle-ing chrX and want to ensure male samples don't have heterozygous calls.", required = false) public boolean CHECK_IS_MALE_ON_CHR_X = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java index 90e6fcd77..32875a098 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java @@ -22,6 +22,7 @@ package org.broadinstitute.sting.gatk.walkers.coverage; +import org.broadinstitute.sting.commandline.Advanced; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -42,50 +43,195 @@ import java.io.PrintStream; /** * Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome * - * @Author depristo - * @Date May 7, 2010 + *

+ * A very common question about a NGS set of reads is what areas of the genome are considered callable. The system + * considers the coverage at each locus and emits either a per base state or a summary interval BED file that + * partitions the genomic intervals into the following callable states: + *

+ *
REF_N
+ *
the reference base was an N, which is not considered callable by the GATK
+ *
CALLABLE
+ *
the base satisfied the minimum depth for calling but had less than maxDepth coverage, so it avoids being flagged as EXCESSIVE_COVERAGE
+ *
NO_COVERAGE
+ *
absolutely no reads were seen at this locus, regardless of the filtering parameters
+ *
LOW_COVERAGE
+ *
after applying filters, fewer bases remained at the locus than the minimum depth required for calling
+ *
EXCESSIVE_COVERAGE
+ *
more than -maxDepth reads at the locus, indicating some sort of mapping problem
+ *
POOR_MAPPING_QUALITY
+ *
more than --maxFractionOfReadsWithLowMAPQ of the reads at the locus had low mapping quality, indicating poor mapping quality of the reads
+ *
+ *

+ * + *

Input

+ *

+ * A BAM file containing exactly one sample. + *

+ * + *

Output

+ *

+ *

    + *
  • -o: an output file in the chosen format (BED recommended) with the callable status covering each base
  • + *
  • -summary: a table of callable status x count of all examined bases
  • + *
+ *

+ * + *

Examples

+ *
+ *     -T CallableLoci \
+ *     -I my.bam \
+ *     -summary my.summary \
+ *     -o my.bed
+ * 
+ * + * would produce a BED file (my.bed) that looks like: + * + *
+ *     20 10000000 10000864 CALLABLE
+ *     20 10000865 10000985 POOR_MAPPING_QUALITY
+ *     20 10000986 10001138 CALLABLE
+ *     20 10001139 10001254 POOR_MAPPING_QUALITY
+ *     20 10001255 10012255 CALLABLE
+ *     20 10012256 10012259 POOR_MAPPING_QUALITY
+ *     20 10012260 10012263 CALLABLE
+ *     20 10012264 10012328 POOR_MAPPING_QUALITY
+ *     20 10012329 10012550 CALLABLE
+ *     20 10012551 10012551 LOW_COVERAGE
+ *     20 10012552 10012554 CALLABLE
+ *     20 10012555 10012557 LOW_COVERAGE
+ *     20 10012558 10012558 CALLABLE
+ *     et cetera...
+ * 
+ * as well as a summary table that looks like: + * + *
+ *                        state nBases
+ *                        REF_N 0
+ *                     CALLABLE 996046
+ *                  NO_COVERAGE 121
+ *                 LOW_COVERAGE 928
+ *           EXCESSIVE_COVERAGE 0
+ *         POOR_MAPPING_QUALITY 2906
+ * 
+ * + * @author Mark DePristo + * @since May 7, 2010 */ @By(DataSource.REFERENCE) public class CallableLociWalker extends LocusWalker { @Output PrintStream out; - @Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read. The gap between this value and mmq are reads that are not sufficiently well mapped for calling but aren't indicative of mapping problems.", required = false) + /** + * Callable loci summary counts (see outputs) will be written to this file. + */ + @Output(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true) + File summaryFile; + + /** + * The gap between this value and mmq are reads that are not sufficiently well mapped for calling but + * aren't indicative of mapping problems. For example, if maxLowMAPQ = 1 and mmq = 20, then reads with + * MAPQ == 0 are poorly mapped, MAPQ >= 20 are considered as contributing to calling, where + * reads with MAPQ >= 1 and < 20 are not bad in and of themselves but aren't sufficiently good to contribute to + * calling. In effect this reads are invisible, driving the base to the NO_ or LOW_COVERAGE states + */ + @Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read.", required = false) byte maxLowMAPQ = 1; - @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50.", required = false) + /** + * Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the CALLABLE + * state. + */ + @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false) byte minMappingQuality = 10; - @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. 
Defaults to 20.", required = false) + /** + * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the CALLABLE state + */ + @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false) byte minBaseQuality = 20; + /** + * If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this + * value and is less than maxDepth the site is considered CALLABLE. + */ + @Advanced @Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false) int minDepth = 4; + /** + * If the QC+ depth exceeds this value the site is considered to have EXCESSIVE_DEPTH + */ @Argument(fullName = "maxDepth", shortName = "maxDepth", doc = "Maximum read depth before a locus is considered poorly mapped", required = false) int maxDepth = -1; + /** + * We don't want to consider a site as POOR_MAPPING_QUALITY just because it has two reads, and one is MAPQ. We + * won't assign a site to the POOR_MAPPING_QUALITY state unless there are at least minDepthForLowMAPQ reads + * covering the site. + */ + @Advanced @Argument(fullName = "minDepthForLowMAPQ", shortName = "mdflmq", doc = "Minimum read depth before a locus is considered a potential candidate for poorly mapped", required = false) int minDepthLowMAPQ = 10; - @Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "Maximum read depth before a locus is considered poorly mapped", required = false) + /** + * If the number of reads at this site is greater than minDepthForLowMAPQ and the fraction of reads with low mapping quality + * exceeds this fraction then the site has POOR_MAPPING_QUALITY. 
+ */ + @Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "If the fraction of reads at a base with low mapping quality exceeds this value, the site may be poorly mapped", required = false) double maxLowMAPQFraction = 0.1; - @Argument(fullName = "format", shortName = "format", doc = "Output format for the system: either BED or STATE_PER_BASE", required = false) + /** + * The output of this walker will be written in this format. The recommended option is BED. + */ + @Advanced + @Argument(fullName = "format", shortName = "format", doc = "Output format", required = false) OutputFormat outputFormat; - @Argument(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true) - File summaryFile; + public enum OutputFormat { + /** + * The output will be written as a BED file. There's a BED element for each + * continuous run of callable states (i.e., CALLABLE, REF_N, etc). This is the recommended + * format + */ + BED, - public enum OutputFormat { BED, STATE_PER_BASE } + /** + * Emit chr start stop state quads for each base. Produces a potentially disasterously + * large amount of output. + */ + STATE_PER_BASE + } + + public enum CalledState { + /** the reference base was an N, which is not considered callable the GATK */ + REF_N, + /** the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE */ + CALLABLE, + /** absolutely no reads were seen at this locus, regardless of the filtering parameters */ + NO_COVERAGE, + /** there were less than min. 
depth bases at the locus, after applying filters */ + LOW_COVERAGE, + /** more than -maxDepth read at the locus, indicating some sort of mapping problem */ + EXCESSIVE_COVERAGE, + /** more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads */ + POOR_MAPPING_QUALITY + } //////////////////////////////////////////////////////////////////////////////////// // STANDARD WALKER METHODS //////////////////////////////////////////////////////////////////////////////////// + @Override public boolean includeReadsWithDeletionAtLoci() { return true; } + @Override public void initialize() { + if ( getToolkit().getSamples().size() != 2 ) { + // unbelievably there are actually two samples even when there's just one in the header. God I hate this Samples system + throw new UserException.BadArgumentValue("-I", "CallableLoci only works for a single sample, but multiple samples were found in the provided BAM files: " + getToolkit().getSamples()); + } + try { PrintStream summaryOut = new PrintStream(summaryFile); summaryOut.close(); @@ -94,15 +240,15 @@ public class CallableLociWalker extends LocusWalker + * DepthOfCoverage processes a set of bam files to determine coverage at different levels of partitioning and + * aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by + * sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles, + * and/or percentage of bases covered to or beyond a threshold. + * Additionally, reads and bases can be filtered by mapping or base quality score. + * + *

Input

+ *

+ * One or more bam files (with proper headers) to be analyzed for coverage statistics + * (Optional) A REFSEQ Rod to aggregate coverage to the gene level + *

+ * + *

Output

+ *

+ * Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: + * - no suffix: per locus coverage + * - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases + * - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases + * - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval + * - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples + * - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene + * - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples + * - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases + * - _cumulative_coverage_proportions: proportions of loci with >= X coverage, aggregated over all bases + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T DepthOfCoverage \
+ *   -o file_name_base \
+ *   -I input_bams.list
+ *   [-geneList refSeq.sorted.txt] \
+ *   [-pt readgroup] \
+ *   [-ct 4 -ct 6 -ct 10] \
+ *   [-L my_capture_genes.interval_list]
+ * 
* - * @Author chartl - * @Date Feb 22, 2010 */ // todo -- cache the map from sample names to means in the print functions, rather than regenerating each time // todo -- support for granular histograms for total depth; maybe n*[start,stop], bins*sqrt(n) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java index a4944e939..5c2a967b9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java @@ -38,12 +38,32 @@ import java.util.List; /** * Walks along reference and calculates the GC content for each interval. + * + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * GC content calculations per interval. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T GCContentByInterval \
+ *   -o output.txt \
+ *   -I input.bam \
+ *   -L input.intervals
+ * 
+ * */ @Allows(value = {DataSource.REFERENCE}) @Requires(value = {DataSource.REFERENCE}) - @By(DataSource.REFERENCE) - public class GCContentByIntervalWalker extends LocusWalker { @Output protected PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java index 60f9724e8..fd912334f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java @@ -35,22 +35,53 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.util.Collections; import java.util.List; /** - * Generates an alternative reference sequence over the specified interval. Given variant ROD tracks, - * it replaces the reference bases at variation sites with the bases supplied by the ROD(s). Additionally, - * allows for a "snpmask" ROD to set overlapping bases to 'N'. + * Generates an alternative reference sequence over the specified interval. + * + *

+ * Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). + * Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'. + * + *

Input

+ *

+ * The reference, requested intervals, and any number of variant rod files. + *

+ * + *

Output

+ *

+ * A fasta file representing the requested intervals. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T FastaAlternateReferenceMaker \
+ *   -o output.fasta \
+ *   -L input.intervals \
+ *   --variant input.vcf \
+ *   [--snpmask mask.vcf]
+ * 
+ * */ @WalkerName("FastaAlternateReferenceMaker") @Reference(window=@Window(start=-1,stop=50)) @Requires(value={DataSource.REFERENCE}) public class FastaAlternateReferenceWalker extends FastaReferenceWalker { + /** + * Variants from these input files are used by this tool to construct an alternate reference. + */ @Input(fullName = "variant", shortName = "V", doc="variants to model", required=false) - public List> variants; + public List> variants = Collections.emptyList(); + /** + * Snps from this file are used as a mask when constructing the alternate reference. + */ @Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false) public RodBinding snpmask; @@ -66,17 +97,18 @@ public class FastaAlternateReferenceWalker extends FastaReferenceWalker { String refBase = String.valueOf((char)ref.getBase()); // Check to see if we have a called snp - for ( VariantContext vc : tracker.getValues(VariantContext.class) ) { - if ( ! vc.getSource().equals(snpmask.getName())) { - if ( vc.isDeletion()) { - deletionBasesRemaining = vc.getReference().length(); - // delete the next n bases, not this one - return new Pair(context.getLocation(), refBase); - } else if ( vc.isInsertion()) { - return new Pair(context.getLocation(), refBase.concat(vc.getAlternateAllele(0).toString())); - } else if (vc.isSNP()) { - return new Pair(context.getLocation(), vc.getAlternateAllele(0).toString()); - } + for ( VariantContext vc : tracker.getValues(variants) ) { + if ( vc.isFiltered() ) + continue; + + if ( vc.isSimpleDeletion()) { + deletionBasesRemaining = vc.getReference().length(); + // delete the next n bases, not this one + return new Pair(context.getLocation(), refBase); + } else if ( vc.isSimpleInsertion()) { + return new Pair(context.getLocation(), refBase.concat(vc.getAlternateAllele(0).toString())); + } else if (vc.isSNP()) { + return new Pair(context.getLocation(), vc.getAlternateAllele(0).toString()); } } diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java index 2dbfc76ff..5f3b37cc8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java @@ -38,14 +38,44 @@ import org.broadinstitute.sting.utils.collections.Pair; import java.io.PrintStream; /** - * Renders a new reference in FASTA format consisting of only those loci provided in the input data set. Has optional - * features to control the output format. + * Renders a new reference in FASTA format consisting of only those loci provided in the input data set. + * + *

+ * The output format can be partially controlled using the provided command-line arguments. + * + *

Input

+ *

+ * The reference and requested intervals. + *

+ * + *

Output

+ *

+ * A fasta file representing the requested intervals. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T FastaReference \
+ *   -o output.fasta \
+ *   -L input.intervals
+ * 
+ * */ @WalkerName("FastaReferenceMaker") public class FastaReferenceWalker extends RefWalker, GenomeLoc> { + @Output PrintStream out; - @Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false) public int fastaLineWidth=60; - @Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity) - CAUTION: adjacent intervals will automatically be merged", required=false) public boolean fastaRawSeqs=false; + + @Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false) + public int fastaLineWidth=60; + + /** + * Please note that when using this argument adjacent intervals will automatically be merged. + */ + @Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity)", required=false) + public boolean fastaRawSeqs=false; protected FastaSequence fasta; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index c555e88cd..bf3606b54 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -45,6 +45,34 @@ import java.util.*; /** * Filters variant calls using a number of user-selectable, parameterizable criteria. + * + *

+ * VariantFiltration is a GATK tool for hard-filtering variant calls based on certain criteria. + * Records are hard-filtered by changing the value in the FILTER field to something other than PASS. + * + *

Input

+ *

+ * A variant set to filter. + *

+ * + *

Output

+ *

+ * A filtered VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T VariantFiltration \
+ *   -o output.vcf \
+ *   --variant input.vcf \
+ *   --filterExpression "AB < 0.2 || MQ0 > 50" \
+ *   --filterName "Nov09filters" \
+ *   --mask mask.vcf \
+ *   --maskName InDel
+ * 
+ * */ @Reference(window=@Window(start=-50,stop=50)) public class VariantFiltrationWalker extends RodWalker { @@ -52,33 +80,65 @@ public class VariantFiltrationWalker extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + /** + * Any variant which overlaps entries from the provided mask rod will be filtered. + */ @Input(fullName="mask", doc="Input ROD mask", required=false) public RodBinding mask; @Output(doc="File to which variants should be written", required=true) protected VCFWriter writer = null; - @Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter (see wiki docs for more info)", required=false) + /** + * VariantFiltration accepts any number of JEXL expressions (so you can have two named filters by using + * --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2"). + */ + @Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false) protected ArrayList FILTER_EXPS = new ArrayList(); - @Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false) + + /** + * This name is put in the FILTER field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names. + */ + @Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false) protected ArrayList FILTER_NAMES = new ArrayList(); + /** + * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead. + * VariantFiltration will add the sample-level FT tag to the FORMAT field of filtered samples (this does not affect the record's FILTER tag). 
+ * One can filter normally based on most fields (e.g. "GQ < 5.0"), but the GT (genotype) field is an exception. We have put in convenience + * methods so that one can now filter out hets ("isHet == 1"), refs ("isHomRef == 1"), or homs ("isHomVar == 1"). + */ @Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see wiki docs for more info)", required=false) protected ArrayList GENOTYPE_FILTER_EXPS = new ArrayList(); + + /** + * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead. + */ @Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false) protected ArrayList GENOTYPE_FILTER_NAMES = new ArrayList(); - @Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster (see also --clusterWindowSize); [default:3]", required=false) + /** + * Works together with the --clusterWindowSize argument. + */ + @Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster", required=false) protected Integer clusterSize = 3; - @Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs (to disable the clustered SNP filter, set this value to less than 1); [default:0]", required=false) + + /** + * Works together with the --clusterSize argument. To disable the clustered SNP filter, set this value to less than 1. 
+ */ + @Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs", required=false) protected Integer clusterWindow = 0; - @Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered; [default:0]", required=false) + @Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered", required=false) protected Integer MASK_EXTEND = 0; - @Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call; [default:'Mask']", required=false) + @Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false) protected String MASK_NAME = "Mask"; - @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, should missing values be considered failing the expression (by default they are considered passing)?", required=false) + /** + * By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as PASSing. + * Use this argument to have it evaluate as failing filters instead for these cases. 
+ */ + @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false) protected Boolean FAIL_MISSING_VALUES = false; // JEXL expressions for the filters diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 83a8ce7d7..70f3c6a1a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -44,7 +44,9 @@ import java.util.Set; public abstract class AlleleFrequencyCalculationModel implements Cloneable { public enum Model { + /** The default model with the best performance in all cases */ EXACT, + /** For posterity we have kept around the older GRID_SEARCH model, but this gives inferior results and shouldn't be used. 
*/ GRID_SEARCH } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 594c1dd28..60dfe4fe7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -53,7 +53,9 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { } public enum GENOTYPING_MODE { + /** the default; the Unified Genotyper will choose the most likely alternate allele */ DISCOVERY, + /** only the alleles passed in from a VCF rod bound to the -alleles argument will be used for genotyping */ GENOTYPE_GIVEN_ALLELES } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 1a76bfd07..e7f89bf08 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -36,31 +36,54 @@ import java.io.File; public class UnifiedArgumentCollection { - // control the various models to be used @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false) public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; + /** + * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. 
+ */ @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false) public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT; + /** + * The expected heterozygosity value used to compute prior likelihoods for any locus. The default priors are: + * het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2 + */ @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY; @Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false) public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE; + /** + * Specifies how to determine the alternate allele to use for genotyping + */ @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false) public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false) public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; + /** + * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with + * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this + * is the default). 
Note that the confidence (QUAL) values for multi-sample low-pass (e.g. 4x per sample) calling might + * be significantly smaller with the new EXACT model than with our older GRID_SEARCH model, as the latter tended to + * over-estimate the confidence; for low-pass calling we tend to use much smaller thresholds (e.g. 4). + */ @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false) public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; + /** + * the minimum phred-scaled Qscore threshold to emit low confidence calls. Genotypes with confidence >= this but less + * than the calling threshold are emitted but marked as filtered. + */ @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; + /** + * This argument is not enabled by default because it increases the runtime by an appreciable amount. 
+ */ @Argument(fullName = "computeSLOD", shortName = "sl", doc = "If provided, we will calculate the SLOD", required = false) public boolean COMPUTE_SLOD = false; @@ -80,7 +103,6 @@ public class UnifiedArgumentCollection { @Argument(fullName = "abort_at_too_much_coverage", doc = "Don't call a site if the downsampled coverage is greater than this value", required = false) public int COVERAGE_AT_WHICH_TO_ABORT = -1; - // control the various parameters to be used @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) public int MIN_BASE_QUALTY_SCORE = 17; @@ -91,11 +113,17 @@ public class UnifiedArgumentCollection { @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; - // indel-related arguments + /** + * A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site. + * Decreasing this value will increase sensitivity but at the cost of larger calling time and a larger number of false positives. + */ @Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false) public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5; + /** + * This argument informs the prior probability of having an indel at a site. 
+ */ @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false) public double INDEL_HETEROZYGOSITY = 1.0/8000; @@ -126,22 +154,23 @@ public class UnifiedArgumentCollection { @Hidden @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) public boolean OUTPUT_DEBUG_INDEL_INFO = false; + @Hidden @Argument(fullName = "dovit", shortName = "dovit", doc = "Output indel debug info", required = false) public boolean dovit = false; + @Hidden @Argument(fullName = "GSA_PRODUCTION_ONLY", shortName = "GSA_PRODUCTION_ONLY", doc = "don't ever use me", required = false) public boolean GSA_PRODUCTION_ONLY = false; + @Hidden - @Argument(fullName = "exactCalculation", shortName = "exactCalculation", doc = "expt", required = false) public ExactAFCalculationModel.ExactCalculation EXACT_CALCULATION_TYPE = ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL; @Hidden - @Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false) + @Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false) public boolean IGNORE_SNP_ALLELES = false; - @Deprecated @Argument(fullName="output_all_callable_bases", shortName="all_bases", doc="Please use --output_mode EMIT_ALL_SITES instead" ,required=false) private Boolean ALL_BASES_DEPRECATED = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index d31bb6fb9..d5dbdedd6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -31,7 +31,7 @@ import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import 
org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.BadMateFilter; -import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableReadFilter; +import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; @@ -45,13 +45,73 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.PrintStream; import java.util.*; - /** - * A variant caller which unifies the approaches of several disparate callers. Works for single-sample and - * multi-sample data. The user can choose from several different incorporated calculation models. + * A variant caller which unifies the approaches of several disparate callers -- Works for single-sample and multi-sample data. + * + *

+ * The GATK Unified Genotyper is a multiple-sample, technology-aware SNP and indel caller. It uses a Bayesian genotype + * likelihood model to estimate simultaneously the most likely genotypes and allele frequency in a population of N samples, + * emitting an accurate posterior probability of there being a segregating variant allele at each locus as well as for the + * genotype of each sample. The system can either emit just the variant sites or complete genotypes (which includes + * homozygous reference calls) satisfying some phred-scaled confidence value. The genotyper can make accurate calls on + * both single sample data and multi-sample data. + * + *

Input

+ *

+ * The read data from which to make variant calls. + *

+ * + *

Output

+ *

+ * A raw, unfiltered, highly specific callset in VCF format. + *

+ * + *

Example generic command for multi-sample SNP calling

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -R resources/Homo_sapiens_assembly18.fasta \
+ *   -T UnifiedGenotyper \
+ *   -I sample1.bam [-I sample2.bam ...] \
+ *   --dbsnp dbSNP.vcf \
+ *   -o snps.raw.vcf \
+ *   -stand_call_conf [50.0] \
+ *   -stand_emit_conf 10.0 \
+ *   -dcov [50] \
+ *   [-L targets.interval_list]
+ * 
+ * + *

+ * The above command will call all of the samples in your provided BAM files [-I arguments] together and produce a VCF file + * with sites and genotypes for all samples. The easiest way to get the dbSNP file is from the GATK resource bundle. Several + * arguments have parameters that should be chosen based on the average coverage per sample in your data. See the detailed + * argument descriptions below. + *

+ * + *

Example command for generating calls at all sites

+ *
+ * java -jar /path/to/GenomeAnalysisTK.jar \
+ *   -l INFO \
+ *   -R resources/Homo_sapiens_assembly18.fasta \
+ *   -T UnifiedGenotyper \
+ *   -I /DCC/ftp/pilot_data/data/NA12878/alignment/NA12878.SLX.maq.SRP000031.2009_08.bam \
+ *   -o my.vcf \
+ *   --output_mode EMIT_ALL_SITES
+ * 
+ * + *

Caveats

+ *
    + *
  • The system is under active and continuous development. All outputs, the underlying likelihood model, arguments, and + * file formats are likely to change.
  • + *
  • The system can be very aggressive in calling variants. In the 1000 genomes project for pilot 2 (deep coverage of ~35x) + * we expect the raw Qscore > 50 variants to contain at least ~10% FP calls. We use extensive post-calling filters to eliminate + * most of these FPs. Variant Quality Score Recalibration is a tool to perform this filtering.
  • + *
  • We only handle diploid genotypes
  • + *
+ * */ + @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT) -@ReadFilters( {BadMateFilter.class, MappingQualityUnavailableReadFilter.class} ) +@ReadFilters( {BadMateFilter.class, MappingQualityUnavailableFilter.class} ) @Reference(window=@Window(start=-200,stop=200)) @By(DataSource.REFERENCE) @Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) @@ -61,10 +121,9 @@ public class UnifiedGenotyper extends LocusWalker getDbsnpRodBinding() { return dbsnp.dbsnp; } @@ -72,7 +131,9 @@ public class UnifiedGenotyper extends LocusWalker> getCompRodBindings() { return Collections.emptyList(); } public List> getResourceRodBindings() { return Collections.emptyList(); } - // control the output + /** + * A raw, unfiltered, highly specific callset in VCF format. + */ @Output(doc="File to which variants should be written",required=true) protected VCFWriter writer = null; @@ -82,9 +143,15 @@ public class UnifiedGenotyper extends LocusWalker annotationsToUse = new ArrayList(); + /** + * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. 
+ */ @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) protected String[] annotationClassesToUse = { "Standard" }; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index dc728ff6b..06455df6d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -51,8 +51,11 @@ public class UnifiedGenotyperEngine { public static final String LOW_QUAL_FILTER_NAME = "LowQual"; public enum OUTPUT_MODE { + /** the default */ EMIT_VARIANTS_ONLY, + /** include confident reference sites */ EMIT_ALL_CONFIDENT_SITES, + /** any callable site regardless of confidence */ EMIT_ALL_SITES } @@ -484,6 +487,9 @@ public class UnifiedGenotyperEngine { Map stratifiedContexts = null; + if ( !BaseUtils.isRegularBase( refContext.getBase() ) ) + return null; + if ( model == GenotypeLikelihoodsCalculationModel.Model.INDEL ) { if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { @@ -498,6 +504,7 @@ public class UnifiedGenotyperEngine { stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); } else { + // todo - tmp will get rid of extended events so this wont be needed if (!rawContext.hasExtendedEventPileup()) return null; @@ -515,9 +522,6 @@ public class UnifiedGenotyperEngine { } } else if ( model == GenotypeLikelihoodsCalculationModel.Model.SNP ) { - if ( !BaseUtils.isRegularBase( refContext.getBase() ) ) - return null; - // stratify the AlignmentContext and cut by sample stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE); diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index fa3991694..8680f3537 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -65,10 +65,53 @@ import java.util.*; /** * Performs local realignment of reads based on misalignments due to the presence of indels. - * Unlike most mappers, this walker uses the full alignment context to determine whether an - * appropriate alternate reference (i.e. indel) exists and updates SAMRecords accordingly. + * + *

+ * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases + * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion + * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching + * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, + * it is impossible to place reads on the reference genome such that mismatches are minimized across all reads. Consequently, even when some reads are + * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect to the true indel, + * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus + * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an + * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and + * specifically identify indels. + *

+ *

    There are 2 steps to the realignment process: + *
  1. Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool)
  2. + *
  3. Running the realigner over those intervals (IndelRealigner)
  4. + *
+ *

+ * An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step. + *

+ * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them + * (or with reads from similar technologies). + * + *

Input

+ *

+ * One or more aligned BAM files and optionally one or more lists of known indels. + *

+ * + *

Output

+ *

+ * A realigned version of your input BAM file(s). + *

+ * + *

Examples

+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -I input.bam \
+ *   -R ref.fasta \
+ *   -T IndelRealigner \
+ *   -targetIntervals intervalListFromRTC.intervals \
+ *   -o realignedBam.bam \
+ *   [--known /path/to/indels.vcf] \
+ *   [-compress 0]    (this argument recommended to speed up the process *if* this is only a temporary file; otherwise, use the default value)
+ * 
+ * + * @author ebanks */ -//Reference(window=@Window(start=-30,stop=30)) @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) public class IndelRealigner extends ReadWalker { @@ -77,88 +120,145 @@ public class IndelRealigner extends ReadWalker { public static final String PROGRAM_RECORD_NAME = "GATK IndelRealigner"; public enum ConsensusDeterminationModel { + /** + * Uses only indels from a provided ROD of known indels. + */ KNOWNS_ONLY, + /** + * Additionally uses indels already present in the original alignments of the reads. + */ USE_READS, + /** + * Additionally uses 'Smith-Waterman' to generate alternate consenses. + */ USE_SW } - @Input(fullName="known", shortName = "known", doc="Input VCF file with known indels", required=false) + /** + * Any number of VCF files representing known indels to be used for constructing alternate consenses. + * Could be e.g. dbSNP and/or official 1000 Genomes indel calls. Non-indel variants in these files will be ignored. + */ + @Input(fullName="known", shortName = "known", doc="Input VCF file(s) with known indels", required=false) public List> known = Collections.emptyList(); + /** + * The interval list output from the RealignerTargetCreator tool using the same bam(s), reference, and known indel file(s). + */ @Input(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) protected String intervalsFile = null; + /** + * This term is equivalent to "significance" - i.e. is the improvement significant enough to merit realignment? Note that this number + * should be adjusted based on your particular data set. For low coverage and/or when looking for indels with low allele frequency, + * this number should be smaller. 
+ */ @Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false) protected double LOD_THRESHOLD = 5.0; - @Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) - protected double MISMATCH_THRESHOLD = 0.15; - + /** + * The realigned bam file. + */ @Output(required=false, doc="Output bam") protected StingSAMFileWriter writer = null; protected ConstrainedMateFixingManager manager = null; protected SAMFileWriter writerToUse = null; - @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "How should we determine the possible alternate consenses? -- in the order of least permissive to most permissive there is KNOWNS_ONLY (use only indels from known indels provided in RODs), USE_READS (additionally use indels already present in the original alignments of the reads), and USE_SW (additionally use 'Smith-Waterman' to generate alternate consenses). The default is USE_READS", required = false) + /** + * We recommend that users run with USE_READS when trying to realign high quality longer read data mapped with a gapped aligner; + * Smith-Waterman is really only necessary when using an ungapped aligner (e.g. MAQ in the case of single-end read data). + */ + @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "Determines how to compute the possible alternate consenses", required = false) public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_READS; // ADVANCED OPTIONS FOLLOW - @Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+ - "Keep it low to minimize memory consumption (but the tool may skip realignment on regions with too much coverage. 
If it is too low, it may generate errors during realignment); keep it high to maximize realignment (but make sure to give Java enough memory).", required=false) + /** + * For expert users only! This is similar to the argument in the RealignerTargetCreator walker. The point here is that the realigner + * will only proceed with the realignment (even above the given threshold) if it minimizes entropy among the reads (and doesn't simply + * push the mismatch column to another position). This parameter is just a heuristic and should be adjusted based on your particular data set. + */ + @Advanced + @Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) + protected double MISMATCH_THRESHOLD = 0.15; + + /** + * For expert users only! To minimize memory consumption you can lower this number (but then the tool may skip realignment on regions with too much coverage; + * and if the number is too low, it may generate errors during realignment). Just make sure to give Java enough memory! 4Gb should be enough with the default value. + */ + @Advanced + @Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter", required=false) protected int MAX_RECORDS_IN_MEMORY = 150000; + /** + * For expert users only! + */ + @Advanced @Argument(fullName="maxIsizeForMovement", shortName="maxIsize", doc="maximum insert size of read pairs that we attempt to realign", required=false) protected int MAX_ISIZE_FOR_MOVEMENT = 3000; + /** + * For expert users only! + */ + @Advanced @Argument(fullName="maxPositionalMoveAllowed", shortName="maxPosMove", doc="maximum positional move in basepairs that a read can be adjusted during realignment", required=false) protected int MAX_POS_MOVE_ALLOWED = 200; + /** + * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. 
+ */ + @Advanced @Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false) protected int MAX_CONSENSUSES = 30; + /** + * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. + */ + @Advanced @Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false) protected int MAX_READS_FOR_CONSENSUSES = 120; - @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment; "+ - "if this value is exceeded, realignment is not attempted and the reads are passed to the output file(s) as-is", required=false) + /** + * For expert users only! If this value is exceeded at a given interval, realignment is not attempted and the reads are passed to the output file(s) as-is. + * If you need to allow more reads (e.g. with very deep coverage) regardless of memory, use a higher number. + */ + @Advanced + @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment", required=false) protected int MAX_READS = 20000; - @Argument(fullName="noPGTag", shortName="noPG", required=false, - doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. 
"+ - "This option is required in order to pass integration tests.") - protected boolean NO_PG_TAG = false; - - @Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, - doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam.") + @Advanced + @Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam") protected boolean NO_ORIGINAL_ALIGNMENT_TAGS = false; - @Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false, - doc="This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, "+ - "it will throw an exception. Use this argument when your interval list is not sorted to instruct "+"" + - "the Realigner to first sort it in memory.") + /** + * For expert users only! This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, it will throw an exception. + * Use this argument when your interval list is not sorted to instruct the Realigner to first sort it in memory. + */ + @Advanced + @Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false, doc="The target intervals are not sorted") protected boolean TARGET_NOT_SORTED = false; - //NWay output: testing, not ready for the prime time, hence hidden: - - @Hidden - @Argument(fullName="nWayOut", shortName="nWayOut", required=false, - doc="Generate one output file for each input (-I) bam file. Reads from all input files "+ - "will be realigned together, but then each read will be saved in the output file corresponding to "+ - "the input file the read came from. There are two ways to generate output bam file names: 1) if the "+ - "value of this argument is a general string (e.g. 
'.cleaned.bam'), then "+ - "extensions (\".bam\" or \".sam\") will be stripped from the input file names and the provided string value "+ - "will be pasted on instead; 2) if the value ends with a '.map' (e.g. input_output.map), then " + - "the two-column tab-separated file with the specified name must exist and list unique output file name (2nd column)" + - "for each input file name (1st column).") + /** + * Reads from all input files will be realigned together, but then each read will be saved in the output file corresponding to the input file that + * the read came from. There are two ways to generate output bam file names: 1) if the value of this argument is a general string (e.g. '.cleaned.bam'), + * then extensions (".bam" or ".sam") will be stripped from the input file names and the provided string value will be pasted on instead; 2) if the + * value ends with a '.map' (e.g. input_output.map), then the two-column tab-separated file with the specified name must exist and list unique output + * file name (2nd column) for each input file name (1st column). + */ + @Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file") protected String N_WAY_OUT = null; + + + // DEBUGGING OPTIONS FOLLOW + @Hidden @Argument(fullName="check_early",shortName="check_early",required=false,doc="Do early check of reads against existing consensuses") protected boolean CHECKEARLY = false; - - // DEBUGGING OPTIONS FOLLOW + @Hidden + @Argument(fullName="noPGTag", shortName="noPG", required=false, + doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. 
This option is required in order to pass integration tests.") + protected boolean NO_PG_TAG = false; @Hidden @Output(fullName="indelsFileForDebugging", shortName="indels", required=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY") @@ -786,7 +886,7 @@ public class IndelRealigner extends ReadWalker { for ( VariantContext knownIndel : knownIndelsToTry ) { if ( knownIndel == null || !knownIndel.isIndel() || knownIndel.isComplexIndel() ) continue; - byte[] indelStr = knownIndel.isInsertion() ? knownIndel.getAlternateAllele(0).getBases() : Utils.dupBytes((byte)'-', knownIndel.getReference().length()); + byte[] indelStr = knownIndel.isSimpleInsertion() ? knownIndel.getAlternateAllele(0).getBases() : Utils.dupBytes((byte)'-', knownIndel.getReference().length()); int start = knownIndel.getStart() - leftmostIndex + 1; Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel); if ( c != null ) @@ -988,11 +1088,11 @@ public class IndelRealigner extends ReadWalker { if ( indexOnRef > 0 ) cigar.add(new CigarElement(indexOnRef, CigarOperator.M)); - if ( indel.isDeletion() ) { + if ( indel.isSimpleDeletion() ) { refIdx += indelStr.length; cigar.add(new CigarElement(indelStr.length, CigarOperator.D)); } - else if ( indel.isInsertion() ) { + else if ( indel.isSimpleInsertion() ) { for ( byte b : indelStr ) sb.append((char)b); cigar.add(new CigarElement(indelStr.length, CigarOperator.I)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java index af8051334..17d5a8e9b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java @@ -35,16 +35,46 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import 
org.broadinstitute.sting.utils.sam.AlignmentUtils; + /** - * Left aligns indels in reads. + * Left-aligns indels from reads in a bam file. + * + *

+ * LeftAlignIndels is a tool that takes a bam file and left-aligns any indels inside it. The same indel can often be + * placed at multiple positions and still represent the same haplotype. While a standard convention is to place an + * indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. + * + *

Input

+ *

+ * A bam file to left-align. + *

+ * + *

Output

+ *

+ * A left-aligned bam. + *

+ * + *

Examples

+ *
+ * java -Xmx3g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T LeftAlignIndels \
+ *   -I input.bam \
+ *   -o output.bam
+ * 
+ * */ public class LeftAlignIndels extends ReadWalker { @Output(required=false, doc="Output bam") protected StingSAMFileWriter writer = null; - @Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+ - "If too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool may run out of memory.", required=false) + /** + * If set too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool + * may run out of memory. We recommend that you additionally tell Java to use a temp directory with plenty of available + * space (by setting java.io.tempdir on the command-line). + */ + @Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the output writer", required=false) protected int MAX_RECORDS_IN_RAM = 500000; public void initialize() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index fbb62f17e..bede50a0b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -33,7 +33,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.BadCigarFilter; import org.broadinstitute.sting.gatk.filters.BadMateFilter; -import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter; +import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; import org.broadinstitute.sting.gatk.filters.Platform454Filter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; @@ -52,38 +52,94 @@ 
import java.util.Collections; import java.util.List; /** - * Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 reads, MQ0 reads, and reads with consecutive indel operators in the CIGAR string. + * Emits intervals for the Local Indel Realigner to target for realignment. + * + *

+ * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases + * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion + * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching + * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, + * it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are + * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, + * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus + * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an + * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and + * specifically identify indels. + *

+ *

    There are 2 steps to the realignment process: + *
  1. Determining (small) suspicious intervals which are likely in need of realignment (RealignerTargetCreator)
  2. + *
  3. Running the realigner over those intervals (see the IndelRealigner tool)
  4. + *
+ *

+ * An important note: the input bam(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. + *

+ * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them + * (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string. + * + *

Input

+ *

+ * One or more aligned BAM files and optionally one or more lists of known indels. + *

+ * + *

Output

+ *

+ * A list of target intervals to pass to the Indel Realigner. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -I input.bam \
+ *   -R ref.fasta \
+ *   -T RealignerTargetCreator \
+ *   -o forIndelRealigner.intervals \
+ *   [--known /path/to/indels.vcf]
+ * 
+ * + * @author ebanks */ -@ReadFilters({Platform454Filter.class, MappingQualityZeroReadFilter.class, BadCigarFilter.class}) +@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, BadCigarFilter.class}) @Reference(window=@Window(start=-1,stop=50)) @Allows(value={DataSource.READS, DataSource.REFERENCE}) @By(DataSource.REFERENCE) @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) public class RealignerTargetCreator extends RodWalker { + /** + * The target intervals for realignment. + */ @Output protected PrintStream out; + /** + * Any number of VCF files representing known SNPs and/or indels. Could be e.g. dbSNP and/or official 1000 Genomes indel calls. + * SNPs in these files will be ignored unless the --mismatchFraction argument is used. + */ @Input(fullName="known", shortName = "known", doc="Input VCF file with known indels", required=false) public List> known = Collections.emptyList(); - // mismatch/entropy/SNP arguments + /** + * Any two SNP calls and/or high entropy positions are considered clustered when they occur no more than this many basepairs apart. + */ @Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy or SNP clusters", required=false) protected int windowSize = 10; - @Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1", required=false) + /** + * To disable this behavior, set this value to <= 0 or > 1. This feature is really only necessary when using an ungapped aligner + * (e.g. MAQ in the case of single-end read data) and should be used in conjunction with '--model USE_SW' in the IndelRealigner. 
+ */ + @Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy", required=false) protected double mismatchThreshold = 0.0; @Argument(fullName="minReadsAtLocus", shortName="minReads", doc="minimum reads at a locus to enable using the entropy calculation", required=false) protected int minReadsAtLocus = 4; - // interval merging arguments + /** + * Because the realignment algorithm is N^2, allowing too large an interval might take too long to completely realign. + */ @Argument(fullName="maxIntervalSize", shortName="maxInterval", doc="maximum interval size", required=false) protected int maxIntervalSize = 500; - @Deprecated - @Argument(fullName="realignReadsWithBadMates", doc="This argument is no longer used.", required=false) - protected boolean DEPRECATED_REALIGN_MATES = false; @Override public boolean generateExtendedEvents() { return true; } @@ -122,7 +178,7 @@ public class RealignerTargetCreator extends RodWalker { // @Output // PrintStream out; @@ -469,10 +469,20 @@ public class SomaticIndelDetectorWalker extends ReadWalker { // let's double check now that the read fits after the shift if ( read.getAlignmentEnd() > normal_context.getStop()) { // ooops, looks like the read does not fit into the window even after the latter was shifted!! - throw new UserException.BadArgumentValue("window_size", "Read "+read.getReadName()+": out of coverage window bounds. Probably window is too small, so increase the value of the window_size argument.\n"+ - "Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+ + // we used to die over such reads and require user to run with larger window size. Now we + // just print a warning and discard the read (this means that our counts can be slightly off in + // th epresence of such reads) + //throw new UserException.BadArgumentValue("window_size", "Read "+read.getReadName()+": out of coverage window bounds. 
Probably window is too small, so increase the value of the window_size argument.\n"+ + // "Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+ + // read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+ + // "; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop()); + System.out.println("WARNING: Read "+read.getReadName()+ + " is out of coverage window bounds. Probably window is too small and the window_size value must be increased.\n"+ + " The read is ignored in this run (so all the counts/statistics reported will not include it).\n"+ + " Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+ read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+ "; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop()); + return 1; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index ac4fba4b4..17a6e20f1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -23,12 +23,15 @@ */ package org.broadinstitute.sting.gatk.walkers.phasing; -import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import 
org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter; +import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.BaseUtils; @@ -49,16 +52,46 @@ import java.util.*; import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods; - /** * Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads). + * + *

+ * Performs physical phasing of SNP calls, based on sequencing reads. + *

+ * + *

Input

+ *

+ * VCF file of SNP calls, BAM file of sequence reads. + *

+ * + *

Output

+ *

+ * Phased VCF file. + *

+ * + *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T ReadBackedPhasing
+ *      -R reference.fasta
+ *      -I reads.bam
+ *      --variant:vcf SNPs.vcf
+ *      -BTI variant
+ *      -BTIMR INTERSECTION
+ *      -o phased_SNPs.vcf
+ *      --phaseQualityThresh 20.0
+ * 
+ * + * @author Menachem Fromer + * @since July 2010 */ @Allows(value = {DataSource.READS, DataSource.REFERENCE}) @Requires(value = {DataSource.READS, DataSource.REFERENCE}) @By(DataSource.READS) -@ReadFilters({MappingQualityZeroReadFilter.class}) // Filter out all reads with zero mapping quality +@ReadFilters({MappingQualityZeroFilter.class}) public class ReadBackedPhasingWalker extends RodWalker { private static final boolean DEBUG = false; @@ -73,13 +106,13 @@ public class ReadBackedPhasingWalker extends RodWalker P(error) = 10^(-10/10) = 0.1, P(correct) = 0.9 @Hidden @@ -87,10 +120,10 @@ public class ReadBackedPhasingWalker extends RodWalker { @Output PrintStream out; + @Input(fullName="check", shortName = "check", doc="Any number of RODs", required=false) + public List> features = Collections.emptyList(); + @Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false) int numOverlaps = 2; @@ -37,7 +42,7 @@ public class CountIntervals extends RefWalker { return null; } - List checkIntervals = tracker.getValues(Feature.class, "check"); + List checkIntervals = tracker.getValues(features); return (long) checkIntervals.size(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java index 0d68c8493..09113704a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java @@ -11,7 +11,31 @@ import java.io.PrintStream; /** * Walks over the input data set, calculating the total number of covered loci for diagnostic purposes. + * + *

* Simplest example of a locus walker. + * + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * Number of loci traversed. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountLoci \
+ *   -o output.txt \
+ *   -I input.bam \
+ *   [-L input.intervals]
+ * 
+ * */ public class CountLociWalker extends LocusWalker implements TreeReducible { @Output(doc="Write count to this file instead of STDOUT") diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java index 26fa9a258..e770418c1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java @@ -39,6 +39,26 @@ import java.util.List; * query name order. Breaks counts down by total pairs and number * of paired reads. * + * + *

Input

+ *

+ * One or more bam files. + *

+ * + *

Output

+ *

+ * Number of pairs seen. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountPairs \
+ *   -o output.txt \
+ *   -I input.bam
+ * 
+ * * @author mhanna */ public class CountPairsWalker extends ReadPairWalker { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRefWalker.java similarity index 62% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRefWalker.java index d1545f159..7c7d6417a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRefWalker.java @@ -25,7 +25,10 @@ package org.broadinstitute.sting.gatk.walkers.qc; +import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -33,25 +36,55 @@ import org.broadinstitute.sting.gatk.walkers.RefWalker; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.collections.Pair; +import java.util.Collections; +import java.util.List; + /** - * Prints out counts of the number of reference ordered data objects are - * each locus for debugging RefWalkers. + * Prints out counts of the number of reference ordered data objects encountered. + * + * + *

Input

+ *

+ * One or more rod files. + *

+ * + *

Output

+ *

+ * Number of rods seen. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountRODsByRef \
+ *   -o output.txt \
+ *   --rod input.vcf
+ * 
+ * */ -public class CountRodByRefWalker extends RefWalker, Long>> { - @Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false) +public class CountRODsByRefWalker extends RefWalker, Long>> { + + /** + * One or more input rod files + */ + @Input(fullName="rod", shortName = "rod", doc="Input VCF file(s)", required=false) + public List> rods = Collections.emptyList(); + + @Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false) public boolean verbose = false; - @Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false) + @Argument(fullName = "showSkipped", shortName = "s", doc="If true, this tool will print out the skipped locations", required = false) public boolean showSkipped = false; - CountRodWalker crw = new CountRodWalker(); + CountRODsWalker crw = new CountRODsWalker(); public void initialize() { crw.verbose = verbose; crw.showSkipped = showSkipped; } - public CountRodWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public CountRODsWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return crw.map(tracker, ref, context); } @@ -59,7 +92,7 @@ public class CountRodByRefWalker extends RefWalker, Long> reduce(CountRodWalker.Datum point, Pair, Long> sum) { + public Pair, Long> reduce(CountRODsWalker.Datum point, Pair, Long> sum) { return crw.reduce(point, sum); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsWalker.java similarity index 87% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java rename to 
public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsWalker.java index 8a03dea44..edbd5ff75 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsWalker.java @@ -27,8 +27,11 @@ package org.broadinstitute.sting.gatk.walkers.qc; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; +import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -41,23 +44,46 @@ import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.collections.Pair; import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Collection; -import java.util.LinkedList; -import java.util.List; +import java.util.*; /** - * Prints out counts of the number of reference ordered data objects are - * each locus for debugging RodWalkers. + * Prints out counts of the number of reference ordered data objects encountered. + * + * + *

Input

+ *

+ * One or more rod files. + *

+ * + *

Output

+ *

+ * Number of rods seen. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountRODs \
+ *   -o output.txt \
+ *   --rod input.vcf
+ * 
+ * */ -public class CountRodWalker extends RodWalker, Long>> implements TreeReducible, Long>> { +public class CountRODsWalker extends RodWalker, Long>> implements TreeReducible, Long>> { @Output public PrintStream out; - @Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false) + /** + * One or more input rod files + */ + @Input(fullName="rod", shortName = "rod", doc="Input VCF file(s)", required=false) + public List> rods = Collections.emptyList(); + + @Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false) public boolean verbose = false; - @Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false) + @Argument(fullName = "showSkipped", shortName = "s", doc="If true, this tool will print out the skipped locations", required = false) public boolean showSkipped = false; @Override diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java index 87c0409b9..9ce9c4eec 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java @@ -9,8 +9,32 @@ import org.broadinstitute.sting.gatk.walkers.Requires; /** * Walks over the input data set, calculating the number of reads seen for diagnostic purposes. + * + *

* Can also count the number of reads matching a given criterion using read filters (see the * --read-filter command line argument). Simplest example of a read-backed analysis. + * + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * Number of reads seen. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountReads \
+ *   -o output.txt \
+ *   -I input.bam \
+ *   [-L input.intervals]
+ * 
+ * */ @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountReadsWalker extends ReadWalker { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java new file mode 100644 index 000000000..933e24784 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.*; + +/** + * Summary test + * + *

Body test

+ */ +public class DocumentationTest extends RodWalker { + // the docs for the arguments are in the collection + @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + /** + * dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known" variants. + * Other sets can be specified with the -knownName (--known_names) argument. + */ + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + + /** + * detailed documentation about the argument goes here. + */ + @Input(fullName="listofRodBinding", shortName = "disc", doc="Output variants that were not called in this Feature comparison track", required=false) + private List> listOfRodBinding = Collections.emptyList(); + + @Input(fullName="optionalRodBinding", shortName = "conc", doc="Output variants that were also called in this Feature comparison track", required=false) + private RodBinding concordanceTrack; + + @Input(fullName="optionalRodBindingWithoutDefault", shortName = "optionalRodBindingWithoutDefault", doc="Output variants that were also called in this Feature comparison track", required=false) + private RodBinding noDefaultOptionalRodBinding; + + @Input(fullName="optionalRodBindingWithoutDefaultNull", shortName = "shortTest", doc="Output variants that were also called in this Feature comparison track", required=false) + private RodBinding noDefaultOptionalRodBindingNull = null; + + @Input(fullName="featureArg", shortName = "featureArg", doc="A RodBinding of feature", required=false) + private RodBinding featureArg = null; + + @Output(doc="VCFWriter",required=true) + protected VCFWriter vcfWriter = null; + + @Advanced + @Argument(fullName="setString", shortName="sn", doc="Sample name to be included in the analysis. 
Can be specified multiple times.", required=false) + public Set sampleNames; + + @Argument(fullName="setStringInitialized", shortName="setStringInitialized", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false) + public Set setStringInitialized = new HashSet(); + + @Argument(shortName="optionalArgWithMissinglessDefault", doc="One or more criteria to use when selecting the data. Evaluated *after* the specified samples are extracted and the INFO-field annotations are updated.", required=false) + public ArrayList SELECT_EXPRESSIONS = new ArrayList(); + + @Argument(shortName="AAAAA", fullName = "AAAAA", doc="Should be the first argument", required=false) + public boolean FIRST_ARG = false; + + @Advanced + @Argument(fullName="booleanArg", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false) + private boolean EXCLUDE_NON_VARIANTS = false; + + @Advanced + @Argument(fullName="booleanArray", shortName="booleanArray", doc="x", required=false) + private boolean[] boolArray = null; + + @Argument(fullName="enumTest", shortName="enumTest", doc="Test enum", required=false) + private TestEnum TestEnumArg = TestEnum.ENUM2; + public enum TestEnum { + /** Docs for enum1 */ + ENUM1, + /** Docs for enum2 */ + ENUM2 + } + + @Hidden + @Argument(fullName="hiddenArg", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false) + private boolean KEEP_AF_SPECTRUM = false; + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return 0; } + public Integer reduceInit() { return 0; } + public Integer reduce(Integer value, Integer sum) { return value + sum; } + public void onTraversalDone(Integer result) { } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java index bd25a73e0..ca30d875b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java @@ -32,7 +32,7 @@ import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileupFeature; +import org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java index fc6b3daee..9b0824ed0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial 
portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.commandline.Gatherer; @@ -12,11 +37,8 @@ import java.util.List; import java.util.regex.Pattern; /** - * Created by IntelliJ IDEA. * User: carneiro * Date: 3/29/11 - * Time: 3:54 PM - * To change this template use File | Settings | File Templates. */ diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java index b4739f366..98c8950e3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java @@ -29,8 +29,8 @@ import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableReadFilter; -import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter; +import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter; +import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import 
org.broadinstitute.sting.utils.BaseUtils; @@ -50,27 +50,54 @@ import java.util.List; import java.util.Map; /** - * First pass of the recalibration. Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide). + * First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide). * - * This walker is designed to work as the first pass in a two-pass processing step. - * It does a by-locus traversal operating only at sites that are not in dbSNP. - * We assume that all reference mismatches we see are therefore errors and indicative of poor base quality. - * This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinucleotide) - * Since there is a large amount of data one can then calculate an empirical probability of error - * given the particular covariates seen at this site, where p(error) = num mismatches / num observations - * The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score) - * The first non-comment line of the output file gives the name of the covariates that were used for this calculation. + *

+ * This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating + * only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative + * of poor base quality. This walker generates tables based on various user-specified covariates (such as read group, + * reported quality score, cycle, and dinucleotide). Since there is a large amount of data one can then calculate an empirical + * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations. + * The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score). + *

+ * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified. * - * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified - * Note: This walker is designed to be used in conjunction with TableRecalibrationWalker. + *

+ * See the GATK wiki for a tutorial and example recalibration accuracy plots. + * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration + * + *

Input

+ *

+ * The input read data whose base quality scores need to be assessed. + *

+ * A database of known polymorphic sites to skip over. + *

+ * + *

Output

+ *

+ * A recalibration table file in CSV format that is used by the TableRecalibration walker. + *

+ * + *

Examples

+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -R resources/Homo_sapiens_assembly18.fasta \
+ *   -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
+ *   -knownSites another/optional/setOfSitesToMask.vcf \
+ *   -I my_reads.bam \
+ *   -T CountCovariates \
+ *   -cov ReadGroupCovariate \
+ *   -cov QualityScoreCovariate \
+ *   -cov CycleCovariate \
+ *   -cov DinucCovariate \
+ *   -recalFile my_reads.recal_data.csv
+ * 
* - * @author rpoplin - * @since Nov 3, 2009 */ @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) @By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file -@ReadFilters( {MappingQualityZeroReadFilter.class, MappingQualityUnavailableReadFilter.class} ) // Filter out all reads with zero or unavailable mapping quality +@ReadFilters( {MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class} ) // Filter out all reads with zero or unavailable mapping quality @Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta @PartitionBy(PartitionType.LOCUS) public class CountCovariatesWalker extends LocusWalker implements TreeReducible { @@ -96,14 +123,23 @@ public class CountCovariatesWalker extends LocusWalker> knownSites = Collections.emptyList(); - @Output - PrintStream out; + + /** + * After the header, data records occur one per line until the end of the file. The first several items on a line are the + * values of the individual covariates and will change depending on which covariates were specified at runtime. The last + * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, + * and the raw empirical quality score calculated by phred-scaling the mismatch rate. + */ @Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the output covariates table recalibration file") @Gather(CountCovariatesGatherer.class) public PrintStream RECAL_FILE; @Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false) private boolean LIST_ONLY = false; + + /** + * See the -list argument to view available covariates. + */ @Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. 
ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false) private String[] COVARIATES = null; @Argument(fullName="standard_covs", shortName="standard", doc="Use the standard set of covariates in addition to the ones listed using the -cov argument", required=false) @@ -114,6 +150,10 @@ public class CountCovariatesWalker extends LocusWalker covClass : covariateClasses ) { - out.println( covClass.getSimpleName() ); + logger.info( covClass.getSimpleName() ); } - out.println(); + logger.info(""); System.exit( 0 ); // Early exit here because user requested it } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index e6d0b306c..ac25d4f13 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -66,15 +66,22 @@ public class RecalDataManager { private static boolean warnUserNullPlatform = false; public enum SOLID_RECAL_MODE { + /** Treat reference inserted bases as reference matching bases. Very unsafe! */ DO_NOTHING, + /** Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. */ SET_Q_ZERO, + /** In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. */ SET_Q_ZERO_BASE_N, + /** Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. */ REMOVE_REF_BIAS } public enum SOLID_NOCALL_STRATEGY { + /** When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. 
*/ THROW_EXCEPTION, + /** Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. */ LEAVE_READ_UNRECALIBRATED, + /** Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. */ PURGE_READ } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java index 0e7f7d111..f31e2fc5b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java @@ -51,12 +51,27 @@ public class RecalibrationArgumentCollection { public String FORCE_PLATFORM = null; @Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false) public int WINDOW_SIZE = 5; + + /** + * This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score. + */ @Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false) public int HOMOPOLYMER_NBACK = 7; @Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false) public boolean EXCEPTION_IF_NO_TILE = false; + + /** + * CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the + * reads which have had the reference inserted because of color space inconsistencies. 
+ */ @Argument(fullName="solid_recal_mode", shortName="sMode", required = false, doc="How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS") public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO; + + /** + * CountCovariates and TableRecalibration accept a --solid_nocall_strategy flag which governs how the recalibrator handles + * no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in + * their color space tag can not be recalibrated. + */ @Argument(fullName = "solid_nocall_strategy", shortName="solid_nocall_strategy", doc="Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required=false) public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index a044abecb..174e810c2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -52,19 +52,40 @@ import java.util.ResourceBundle; import java.util.regex.Pattern; /** - * Second pass of the recalibration. Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate. 
+ * Second pass of the base quality score recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate. * - * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. + *

+ * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each + * base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, + * cycle, and dinuc). Using these values as a key in a large hashmap the walker calculates an empirical base quality score + * and overwrites the quality score currently in the read. This walker then outputs a new bam file with these updated (recalibrated) reads. * - * For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc) - * Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read. - * This walker then outputs a new bam file with these updated (recalibrated) reads. + *

+ * See the GATK wiki for a tutorial and example recalibration accuracy plots. + * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration * - * Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker. - * Note: This walker is designed to be used in conjunction with CovariateCounterWalker. + *

Input

+ *

+ * The input read data whose base quality scores need to be recalibrated. + *

+ * The recalibration table file in CSV format that was generated by the CountCovariates walker. + *

+ * + *

Output

+ *

+ * A bam file in which the quality scores in each read have been recalibrated. + *

+ * + *

Examples

+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -R resources/Homo_sapiens_assembly18.fasta \
+ *   -I my_reads.bam \
+ *   -T TableRecalibration \
+ *   -o my_reads.recal.bam \
+ *   -recalFile my_reads.recal_data.csv
+ * 
* - * @author rpoplin - * @since Nov 3, 2009 */ @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) @@ -79,24 +100,54 @@ public class TableRecalibrationWalker extends ReadWalker flag that instructs TableRecalibration to not modify + * quality scores less than but rather just write them out unmodified in the recalibrated BAM file. This is useful + * because Solexa writes Q2 and Q3 bases when the machine has really gone wrong. This would be fine in and of itself, + * but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect, + * your Q2 and Q3 bins can be elevated to Q8 or Q10, leading to issues downstream. With the default value of 5, all Q0-Q4 bases + * are unmodified during recalibration, so they don't get inappropriately evaluated. + */ + @Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false) private int PRESERVE_QSCORES_LESS_THAN = 5; - @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1") + + /** + * By default TableRecalibration applies a Yates' correction to account for overfitting when it calculates the empirical + * quality score, in particular, ( # mismatches + 1 ) / ( # observations + 1 ). TableRecalibration accepts a --smoothing / -sm + * argument which sets how many unobserved counts to add to every bin. Use --smoothing 0 to turn off all smoothing or, for example, + * --smoothing 15 for a large amount of smoothing. 
+ */ + @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points") private int SMOOTHING = 1; - @Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default=50") + + /** + * Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation + * by capping at the specified value. We've found that Q40 is too low when using a more complete database of known variation like dbSNP build 132 or later. + */ + @Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores") private int MAX_QUALITY_SCORE = 50; + + /** + * By default TableRecalibration emits the OQ field -- so you can go back and look at the original quality scores, rerun + * the system using the OQ flags, etc, on the output BAM files; to turn off emission of the OQ field use this flag. 
+ */ @Argument(fullName="doNotWriteOriginalQuals", shortName="noOQs", required=false, doc="If true, we will not write the original quality (OQ) tag for each read") private boolean DO_NOT_WRITE_OQ = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java new file mode 100755 index 000000000..2b38afaf6 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java @@ -0,0 +1,463 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.validation; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.MutableVariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.Map; +import java.util.Set; + +import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel; + +/** + * Genotypes a dataset and validates the calls of another dataset using the Unified Genotyper. + * + *

+ * Genotype and Validate is a tool to evaluate the quality of a dataset for calling SNPs + * and Indels given a secondary (validation) data source. The data sources are BAM or VCF + * files. You can use them interchangeably (i.e. a BAM to validate calls in a VCF or a VCF + * to validate calls on a BAM). + *

+ * + *

+ * The simplest scenario is when you have a VCF of hand-annotated SNPs and Indels, and you + * want to know how well a particular technology performs calling these SNPs. With a + * dataset (BAM file) generated by the technology in test, and the hand-annotated VCF, you + * can run GenotypeAndValidate to assess the accuracy of the calls with the new technology's + * dataset. + *

+ * + *

+ * Another option is to validate the calls on a VCF file, using a deep coverage BAM file + * that you trust the calls on. The GenotypeAndValidate walker will make calls using the + * reads in the BAM file and take them as truth, then compare to the calls in the VCF file + * and produce a truth table. + *

+ * + * + *

Input

+ *

+ * A BAM file to make calls on and a VCF file to use as truth validation dataset. + * + * You also have the option to invert the roles of the files using the command line options listed below. + *

+ * + *

Output

+ *

+ * GenotypeAndValidate has two outputs. The truth table and the optional VCF file. The truth table is a + * 2x2 table correlating what was called in the dataset with the truth of the call (whether it's a true + * positive or a false positive). The table should look like this: + *

+ *
+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
ALTREFPredictive Value
called altTrue Positive (TP)False Positive (FP)Positive PV
called refFalse Negative (FN)True Negative (TN)Negative PV
+ *
+ * + *

+ * The positive predictive value (PPV) is the proportion of subjects with positive test results + * who are correctly diagnosed. + *

+ *

+ * The negative predictive value (NPV) is the proportion of subjects with a negative test result + * who are correctly diagnosed. + *

+ *

+ * The VCF file will contain only the variants that were called or not called, excluding the ones that + * were uncovered or didn't pass the filters. This file is useful if you are trying to compare + * the PPV and NPV of two different technologies on the exact same sites (so you can compare apples to + * apples). + *

+ * + *

+ * Here is an example of an annotated VCF file (info field clipped for clarity) + * + *

+ * #CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  NA12878
+ * 1   20568807    .   C   T   0    HapMapHet        AC=1;AF=0.50;AN=2;DP=0;GV=T  GT  0/1
+ * 1   22359922    .   T   C   282  WG-CG-HiSeq      AC=2;AF=0.50;GV=T;AN=4;DP=42 GT:AD:DP:GL:GQ  1/0 ./. 0/1:20,22:39:-72.79,-11.75,-67.94:99    ./.
+ * 13  102391461   .   G   A   341  Indel;SnpCluster AC=1;GV=F;AF=0.50;AN=2;DP=45 GT:AD:DP:GL:GQ  ./. ./. 0/1:32,13:45:-50.99,-13.56,-112.17:99   ./.
+ * 1   175516757   .   C   G   655  SnpCluster,WG    AC=1;AF=0.50;AN=2;GV=F;DP=74 GT:AD:DP:GL:GQ  ./. ./. 0/1:52,22:67:-89.02,-20.20,-191.27:99   ./.
+ * 
+ * + *

+ * + *

Additional Details

+ *
    + *
  • + * You should always use -BTI on your VCF track, so that the GATK only looks at the sites on the VCF file. + * This speeds up the process a lot. + *
  • + *
  • + * The total number of visited bases may be greater than the number of variants in the original + * VCF file because of extended indels, as they trigger one call per new insertion or deletion. + * (i.e. ACTG/- will count as 4 genotyper calls, but it's only one line in the VCF). + *
  • + *
+ * + *

Examples

+ *
    + *
  1. + * Genotypes BAM file from new technology using the VCF as a truth dataset: + *
  2. + * + *
    + *  java
    + *      -jar /GenomeAnalysisTK.jar
    + *      -T  GenotypeAndValidate
    + *      -R human_g1k_v37.fasta
    + *      -I myNewTechReads.bam
    + *      -alleles handAnnotatedVCF.vcf
    + *      -BTI alleles
    + * 
    + * + *
  3. + * Using a BAM file as the truth dataset: + *
  4. + * + *
    + *  java
    + *      -jar /GenomeAnalysisTK.jar
    + *      -T  GenotypeAndValidate
    + *      -R human_g1k_v37.fasta
    + *      -I myTruthDataset.bam
    + *      -alleles callsToValidate.vcf
    + *      -BTI alleles
    + *      -bt
    + *      -o gav.vcf
    + * 
    + * + * + * @author Mauricio Carneiro + * @since ${DATE} + */ + +@Requires(value={DataSource.READS, DataSource.REFERENCE}) +@Allows(value={DataSource.READS, DataSource.REFERENCE}) + +@By(DataSource.REFERENCE) +@Reference(window=@Window(start=-200,stop=200)) + + +public class GenotypeAndValidateWalker extends RodWalker implements TreeReducible { + + /** + * The optional output file that will have all the variants used in the Genotype and Validation assay. + */ + @Output(doc="Generate a VCF file with the variants considered by the walker, with a new annotation \"callStatus\" which will carry the value called in the validation VCF or BAM file", required=false) + protected VCFWriter vcfWriter = null; + + /** + * The callset to be used as truth (default) or validated (if BAM file is set to truth). + */ + @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype", required=true) + public RodBinding alleles; + + /** + * Makes the Unified Genotyper calls to the BAM file the truth dataset and validates the alleles ROD binding callset. + */ + @Argument(fullName ="set_bam_truth", shortName ="bt", doc="Use the calls on the reads (bam file) as the truth dataset and validate the calls on the VCF", required=false) + private boolean bamIsTruth = false; + + /** + * The minimum base quality score necessary for a base to be considered when calling a genotype. This argument is passed to the Unified Genotyper. + */ + @Argument(fullName="minimum_base_quality_score", shortName="mbq", doc="Minimum base quality score for calling a genotype", required=false) + private int mbq = -1; + + /** + * The maximum deletion fraction allowed in a site for calling a genotype. This argument is passed to the Unified Genotyper. 
+ */ + @Argument(fullName="maximum_deletion_fraction", shortName="deletions", doc="Maximum deletion fraction for calling a genotype", required=false) + private double deletions = -1; + + /** + * the minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. This argument is passed to the Unified Genotyper. + */ + @Argument(fullName="standard_min_confidence_threshold_for_calling", shortName="stand_call_conf", doc="the minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls", required=false) + private double callConf = -1; + + /** + * the minimum phred-scaled Qscore threshold to emit low confidence calls. This argument is passed to the Unified Genotyper. + */ + @Argument(fullName="standard_min_confidence_threshold_for_emitting", shortName="stand_emit_conf", doc="the minimum phred-scaled Qscore threshold to emit low confidence calls", required=false) + private double emitConf = -1; + + /** + * Only validate sites that have at least a given depth + */ + @Argument(fullName="condition_on_depth", shortName="depth", doc="Condition validation on a minimum depth of coverage by the reads", required=false) + private int minDepth = -1; + + /** + * If your VCF or BAM file has more than one sample and you only want to validate one, use this parameter to choose it. 
+ */ + @Hidden + @Argument(fullName ="sample", shortName ="sn", doc="Name of the sample to validate (in case your VCF/BAM has more than one sample)", required=false) + private String sample = ""; + + private UnifiedGenotyperEngine snpEngine; + private UnifiedGenotyperEngine indelEngine; + + public static class CountedData { + private long nAltCalledAlt = 0L; + private long nAltCalledRef = 0L; + private long nRefCalledAlt = 0L; + private long nRefCalledRef = 0L; + private long nNotConfidentCalls = 0L; + private long nUncovered = 0L; + + /** + * Adds the values of other to this, returning this + * @param other the other object + */ + public void add(CountedData other) { + nAltCalledAlt += other.nAltCalledAlt; + nAltCalledRef += other.nAltCalledRef; + nRefCalledAlt += other.nRefCalledAlt; + nRefCalledRef += other.nRefCalledRef; + nUncovered += other.nUncovered; + nNotConfidentCalls += other.nNotConfidentCalls; + } + } + + + + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- + + public void initialize() { + + // Initialize VCF header + if (vcfWriter != null) { + Map header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), alleles.getName()); + Set samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + Set headerLines = VCFUtils.smartMergeHeaders(header.values(), logger); + headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate")); + vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); + } + + // Filling in SNP calling arguments for UG + UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); + uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES; + uac.alleles = alleles; + if (!bamIsTruth) uac.GenotypingMode = 
GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + if (mbq >= 0) uac.MIN_BASE_QUALTY_SCORE = mbq; + if (deletions >= 0) uac.MAX_DELETION_FRACTION = deletions; + if (emitConf >= 0) uac.STANDARD_CONFIDENCE_FOR_EMITTING = emitConf; + if (callConf >= 0) uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf; + + uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; + snpEngine = new UnifiedGenotyperEngine(getToolkit(), uac); + + // Adding the INDEL calling arguments for UG + uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.INDEL; + indelEngine = new UnifiedGenotyperEngine(getToolkit(), uac); + + // make sure we have callConf set to the threshold set by the UAC so we can use it later. + callConf = uac.STANDARD_CONFIDENCE_FOR_CALLING; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // map + // + //--------------------------------------------------------------------------------------------------------------- + + public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + + final CountedData counter = new CountedData(); + + // For some reason RodWalkers get map calls with null trackers + if( tracker == null ) + return counter; + + VariantContext vcComp = tracker.getFirstValue(alleles); + if( vcComp == null ) + return counter; + + //todo - not sure I want this, may be misleading to filter extended indel events. 
+ if (isInsideExtendedIndel(vcComp, ref)) + return counter; + + // Do not operate on variants that are not covered to the optional minimum depth + if (!context.hasReads() || (minDepth > 0 && context.getBasePileup().getBases().length < minDepth)) { + counter.nUncovered = 1L; + return counter; + } + + VariantCallContext call; + if ( vcComp.isSNP() ) + call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context); + else if ( vcComp.isIndel() ) { + call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context); + } + else { + logger.info("Not SNP or INDEL " + vcComp.getChr() + ":" + vcComp.getStart() + " " + vcComp.getAlleles()); + return counter; + } + + + boolean writeVariant = true; + + if (bamIsTruth) { + if (call.confidentlyCalled) { + // If truth is a confident REF call + if (call.isVariant()) { + if (vcComp.isVariant()) + counter.nAltCalledAlt = 1L; // todo -- may wanna check if the alts called are the same? + else + counter.nAltCalledRef = 1L; + } + // If truth is a confident ALT call + else { + if (vcComp.isVariant()) + counter.nRefCalledAlt = 1L; + else + counter.nRefCalledRef = 1L; + } + } + else { + counter.nNotConfidentCalls = 1L; + writeVariant = false; + } + } + else { + if (!vcComp.hasAttribute("GV")) + throw new UserException.BadInput("Variant has no GV annotation in the INFO field. " + vcComp.getChr() + ":" + vcComp.getStart()); + + + + if (call.isCalledAlt(callConf)) { + if (vcComp.getAttribute("GV").equals("T")) + counter.nAltCalledAlt = 1L; + else + counter.nRefCalledAlt = 1L; + } + else if (call.isCalledRef(callConf)) { + if (vcComp.getAttribute("GV").equals("T")) + counter.nAltCalledRef = 1L; + else + counter.nRefCalledRef = 1L; + } + else { + counter.nNotConfidentCalls = 1L; + writeVariant = false; + } + } + + if (vcfWriter != null && writeVariant) { + if (!vcComp.hasAttribute("callStatus")) { + MutableVariantContext mvc = new MutableVariantContext(vcComp); + mvc.putAttribute("callStatus", call.isCalledAlt(callConf) ? 
"ALT" : "REF" ); + vcfWriter.add(mvc); + } + else + vcfWriter.add(vcComp); + } + return counter; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // reduce + // + //--------------------------------------------------------------------------------------------------------------- + + public CountedData reduceInit() { + return new CountedData(); + } + + public CountedData treeReduce( final CountedData sum1, final CountedData sum2) { + sum2.add(sum1); + return sum2; + } + + public CountedData reduce( final CountedData mapValue, final CountedData reduceSum ) { + reduceSum.add(mapValue); + return reduceSum; + } + + public void onTraversalDone( CountedData reduceSum ) { + double ppv = 100 * ((double) reduceSum.nAltCalledAlt /( reduceSum.nAltCalledAlt + reduceSum.nRefCalledAlt)); + double npv = 100 * ((double) reduceSum.nRefCalledRef /( reduceSum.nRefCalledRef + reduceSum.nAltCalledRef)); + double sensitivity = 100 * ((double) reduceSum.nAltCalledAlt /( reduceSum.nAltCalledAlt + reduceSum.nAltCalledRef)); + double specificity = (reduceSum.nRefCalledRef + reduceSum.nRefCalledAlt > 0) ? 
100 * ((double) reduceSum.nRefCalledRef /( reduceSum.nRefCalledRef + reduceSum.nRefCalledAlt)) : 100; + logger.info(String.format("Resulting Truth Table Output\n\n" + + "---------------------------------------------------\n" + + "\t\t|\tALT\t|\tREF\t\n" + + "---------------------------------------------------\n" + + "called alt\t|\t%d\t|\t%d\n" + + "called ref\t|\t%d\t|\t%d\n" + + "---------------------------------------------------\n" + + "positive predictive value: %f%%\n" + + "negative predictive value: %f%%\n" + + "---------------------------------------------------\n" + + "sensitivity: %f%%\n" + + "specificity: %f%%\n" + + "---------------------------------------------------\n" + + "not confident: %d\n" + + "not covered: %d\n" + + "---------------------------------------------------\n", reduceSum.nAltCalledAlt, reduceSum.nRefCalledAlt, reduceSum.nAltCalledRef, reduceSum.nRefCalledRef, ppv, npv, sensitivity, specificity, reduceSum.nNotConfidentCalls, reduceSum.nUncovered)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index 7653f511f..f9bd019ea 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -14,9 +14,8 @@ import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; +import org.broadinstitute.sting.utils.codecs.table.TableFeature; import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.RMD; import org.broadinstitute.sting.gatk.walkers.Requires; import 
org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.BaseUtils; @@ -31,21 +30,77 @@ import java.util.LinkedList; import java.util.List; /** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 6/13/11 - * Time: 2:12 PM - * To change this template use File | Settings | File Templates. + * Creates FASTA sequences for use in Sequenom or PCR utilities for site amplification and subsequent validation + * + *

+ * ValidationAmplicons consumes a VCF and an Interval list and produces FASTA sequences from which PCR primers or probe
+ * sequences can be designed. In addition, ValidationAmplicons uses BWA to check for specificity of tracts of bases within
+ * the output amplicon, lower-cases non-specific tracts, allows users to provide sites to mask out, and specifies
+ * reasons why the site may fail validation (nearby variation, for example).
+ *

    + * + *

    Input

    + *

+ * Requires a VCF containing the alleles for which amplicons should be designed, a VCF of variants to mask out of the amplicons, and an
+ * interval list defining the size of the amplicons around the sites to be validated.
+ *

    + * + *

    Output

    + *

    + * Output is a FASTA-formatted file with some modifications at probe sites. For instance: + *

    + * >20:207414 INSERTION=1,VARIANT_TOO_NEAR_PROBE=1, 20_207414
    + * CCAACGTTAAGAAAGAGACATGCGACTGGGTgcggtggctcatgcctggaaccccagcactttgggaggccaaggtgggc[A/G*]gNNcacttgaggtcaggagtttgagaccagcctggccaacatggtgaaaccccgtctctactgaaaatacaaaagttagC
    + * >20:792122 Valid 20_792122
    + * TTTTTTTTTagatggagtctcgctcttatcgcccaggcNggagtgggtggtgtgatcttggctNactgcaacttctgcct[-/CCC*]cccaggttcaagtgattNtcctgcctcagccacctgagtagctgggattacaggcatccgccaccatgcctggctaatTT
    + * >20:994145 Valid 20_994145
    + * TCCATGGCCTCCCCCTGGCCCACGAAGTCCTCAGCCACCTCCTTCCTGGAGGGCTCAGCCAAAATCAGACTGAGGAAGAAG[AAG/-*]TGGTGGGCACCCACCTTCTGGCCTTCCTCAGCCCCTTATTCCTAGGACCAGTCCCCATCTAGGGGTCCTCACTGCCTCCC
    + * >20:1074230 SITE_IS_FILTERED=1, 20_1074230
    + * ACCTGATTACCATCAATCAGAACTCATTTCTGTTCCTATCTTCCACCCACAATTGTAATGCCTTTTCCATTTTAACCAAG[T/C*]ACTTATTATAtactatggccataacttttgcagtttgaggtatgacagcaaaaTTAGCATACATTTCATTTTCCTTCTTC
    + * >20:1084330 DELETION=1, 20_1084330
    + * CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC
    + *
+ * are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be:
+ *
+ * Valid // amplicon is valid
+ * SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant")
+ * VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak
+ * MULTIPLE_PROBES=1, // multiple variants to be validated found inside the same amplicon
+ * DELETION=6,INSERTION=5, // 6 deletions and 5 insertions found inside the amplicon region (from the "mask" VCF), will be potentially difficult to validate
+ * DELETION=1, // deletion found inside the amplicon region, could shift mass-spec peak
+ * START_TOO_CLOSE, // variant is too close to the start of the amplicon region to give Sequenom a good chance to find a suitable primer
+ * END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give Sequenom a good chance to find a suitable primer
+ * NO_VARIANTS_FOUND, // no variants found within the amplicon region
+ * INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. an insertion immediately preceding or following the site, or a deletion that spans the site itself)
+ *

    + * + *

    Examples

    + *
    
    + *    java
    + *      -jar GenomeAnalysisTK.jar
    + *      -T ValidationAmplicons
    + *      -R /humgen/1kg/reference/human_g1k_v37.fasta
    + *      -BTI ProbeIntervals
    + *      -ProbeIntervals:table interval_table.table
    + *      -ValidateAlleles:vcf sites_to_validate.vcf
    + *      -MaskAlleles:vcf mask_sites.vcf
    + *      --virtualPrimerSize 30
    + *      -o probes.fasta
    + * 
    + *
    + * @author chartl
    + * @since July 2011
      */
     @Requires(value={DataSource.REFERENCE})
     public class ValidationAmplicons extends RodWalker {
    -    @Input(fullName = "ProbeIntervals", doc="Chris document me", required=true)
+    @Input(fullName = "ProbeIntervals", doc="A collection of intervals in table format with optional names that represent the "+
+                                            "intervals surrounding the probe sites for which amplicons should be designed", required=true)
         RodBinding probeIntervals;
     
    -    @Input(fullName = "ValidateAlleles", doc="Chris document me", required=true)
+    @Input(fullName = "ValidateAlleles", doc="A VCF containing the sites and alleles you want to validate. Restricted to *bi-allelic* sites", required=true)
         RodBinding validateAlleles;
     
    -    @Input(fullName = "MaskAlleles", doc="Chris document me", required=true)
    +    @Input(fullName = "MaskAlleles", doc="A VCF containing the sites you want to MASK from the designed amplicon (e.g. by Ns or lower-cased bases)", required=true)
         RodBinding maskAlleles;
     
     
    @@ -195,17 +250,17 @@ public class ValidationAmplicons extends RodWalker {
             } else /* (mask != null && validate == null ) */ {
                 if ( ! mask.isSNP() && ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )) {
                     logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed.");
    -                logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles())));
    +                logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isSimpleInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles())));
                     sequenceInvalid = true;
    -                invReason.add(mask.isInsertion() ? "INSERTION" : "DELETION");
    +                invReason.add(mask.isSimpleInsertion() ? "INSERTION" : "DELETION");
                     // note: indelCounter could be > 0 (could have small deletion within larger one). This always selects
                     // the larger event.
    -                int indelCounterNew = mask.isInsertion() ? 2 : mask.getEnd()-mask.getStart();
    +                int indelCounterNew = mask.isSimpleInsertion() ? 2 : mask.getEnd()-mask.getStart();
                     if ( indelCounterNew > indelCounter ) {
                         indelCounter = indelCounterNew;
                     }
                     //sequence.append((char) ref.getBase());
    -                //sequence.append(mask.isInsertion() ? 'I' : 'D');
    +                //sequence.append(mask.isSimpleInsertion() ? 'I' : 'D');
                     sequence.append("N");
                     indelCounter--;
                     rawSequence.append(Character.toUpperCase((char) ref.getBase()));
    diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
    index 253c6e6d0..613a31ed3 100755
    --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
    +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
    @@ -36,25 +36,66 @@ import java.util.*;
     
     /**
      * General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and a lot more)
    + *
    + * 

    + * Given a variant callset, it is common to calculate various quality control metrics. These metrics include the number of + * raw or filtered SNP counts; ratio of transition mutations to transversions; concordance of a particular sample's calls + * to a genotyping chip; number of singletons per sample; etc. Furthermore, it is often useful to stratify these metrics + * by various criteria like functional class (missense, nonsense, silent), whether the site is CpG site, the amino acid + * degeneracy of the site, etc. VariantEval facilitates these calculations in two ways: by providing several built-in + * evaluation and stratification modules, and by providing a framework that permits the easy development of new evaluation + * and stratification modules. + * + *

    Input

    + *

    + * One or more variant sets to evaluate plus any number of comparison sets. + *

    + * + *

    Output

    + *

    + * Evaluation tables. + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T VariantEval \
    + *   -o output.eval.gatkreport \
    + *   --eval:set1 set1.vcf \
    + *   --eval:set2 set2.vcf \
    + *   [--comp comp.vcf]
    + * 
    + * */ @Reference(window=@Window(start=-50, stop=50)) public class VariantEvalWalker extends RodWalker implements TreeReducible { - // Output arguments + @Output protected PrintStream out; + /** + * The variant file(s) to evaluate. + */ @Input(fullName="eval", shortName = "eval", doc="Input evaluation file(s)", required=true) public List> evals; + /** + * The variant file(s) to compare against. + */ @Input(fullName="comp", shortName = "comp", doc="Input comparison file(s)", required=false) public List> compsProvided = Collections.emptyList(); private List> comps = new ArrayList>(); + /** + * dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known" variants. + * Other sets can be specified with the -knownName (--known_names) argument. + */ @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); // Help arguments - @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit") + @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit", required=false) protected Boolean LIST = false; // Partitioning the data arguments @@ -67,8 +108,12 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="sample", shortName="sn", doc="Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context", required=false) protected Set SAMPLE_EXPRESSIONS; + /** + * List of rod tracks to be used for specifying "known" variants other than dbSNP. 
+ */ @Argument(shortName="knownName", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false) - protected String[] KNOWN_NAMES = {}; + protected HashSet KNOWN_NAMES = new HashSet(); + List> knowns = new ArrayList>(); // Stratification arguments @Argument(fullName="stratificationModule", shortName="ST", doc="One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noS is specified)", required=false) @@ -80,7 +125,9 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="onlyVariantsOfType", shortName="VT", doc="If provided, only variants of these types will be considered during the evaluation, in ", required=false) protected Set typesToUse = null; - // Evaluator arguments + /** + * See the -list argument to view available modules. + */ @Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false) protected String[] MODULES_TO_USE = {}; @@ -94,7 +141,10 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="minPhaseQuality", shortName="mpq", doc="Minimum phasing quality", required=false) protected double MIN_PHASE_QUALITY = 10.0; - @Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) + /** + * This argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined. 
+ */ + @Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations", required=false) protected String FAMILY_STRUCTURE; @Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) @@ -108,9 +158,6 @@ public class VariantEvalWalker extends RodWalker implements Tr // Variables private Set jexlExpressions = new TreeSet(); - private Set compNames = new TreeSet(); - private Set knownNames = new TreeSet(); - private Set evalNames = new TreeSet(); private Set sampleNamesForEvaluation = new TreeSet(); private Set sampleNamesForStratification = new TreeSet(); @@ -149,23 +196,24 @@ public class VariantEvalWalker extends RodWalker implements Tr comps.addAll(compsProvided); if ( dbsnp.dbsnp.isBound() ) { comps.add(dbsnp.dbsnp); - knownNames.add(dbsnp.dbsnp.getName()); + knowns.add(dbsnp.dbsnp); } // Add a dummy comp track if none exists if ( comps.size() == 0 ) comps.add(new RodBinding(VariantContext.class, "none", "UNBOUND", "", new Tags())); - // Cache the rod names - for ( RodBinding compRod : comps ) - compNames.add(compRod.getName()); + // Set up set of additional knowns + for ( RodBinding compRod : comps ) { + if ( KNOWN_NAMES.contains(compRod.getName()) ) + knowns.add(compRod); + } + // Collect the eval rod names + Set evalNames = new TreeSet(); for ( RodBinding evalRod : evals ) evalNames.add(evalRod.getName()); - // Set up set of additional known names - knownNames.addAll(Arrays.asList(KNOWN_NAMES)); - // Now that we have all the rods categorized, determine the sample list from the eval rods. 
Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evalNames); Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); @@ -263,7 +311,8 @@ public class VariantEvalWalker extends RodWalker implements Tr // for each comp track for ( final RodBinding compRod : comps ) { // no sample stratification for comps - final Set compSet = compVCs.get(compRod) == null ? new HashSet(0) : compVCs.get(compRod).values().iterator().next(); + final HashMap> compSetHash = compVCs.get(compRod); + final Set compSet = (compSetHash == null || compSetHash.size() == 0) ? new HashSet(0) : compVCs.get(compRod).values().iterator().next(); // find the comp final VariantContext comp = findMatchingComp(eval, compSet); @@ -462,15 +511,15 @@ public class VariantEvalWalker extends RodWalker implements Tr public static String getAllSampleName() { return ALL_SAMPLE_NAME; } - public Set getKnownNames() { return knownNames; } + public List> getKnowns() { return knowns; } - public Set getEvalNames() { return evalNames; } + public List> getEvals() { return evals; } public Set getSampleNamesForEvaluation() { return sampleNamesForEvaluation; } public Set getSampleNamesForStratification() { return sampleNamesForStratification; } - public Set getCompNames() { return compNames; } + public List> getComps() { return comps; } public Set getJexlExpressions() { return jexlExpressions; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index 87b8bac1d..59ef3d992 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -39,8 +39,10 @@ public class CountVariants extends VariantEvaluator implements StandardEval { public long nInsertions = 0; 
@DataPoint(description = "Number of deletions") public long nDeletions = 0; - @DataPoint(description = "Number of complex loci") + @DataPoint(description = "Number of complex indels") public long nComplex = 0; + @DataPoint(description = "Number of mixed loci (loci that can't be classified as a SNP, Indel or MNP)") + public long nMixed = 0; @DataPoint(description = "Number of no calls loci") @@ -97,27 +99,35 @@ public class CountVariants extends VariantEvaluator implements StandardEval { // This is really not correct. What we really want here is a polymorphic vs. monomorphic count (i.e. on the Genotypes). // So in order to maintain consistency with the previous implementation (and the intention of the original author), I've // added in a proxy check for monomorphic status here. - if ( !vc1.isVariant() || (vc1.hasGenotypes() && vc1.getHomRefCount() == vc1.getNSamples()) ) { + // Protect against case when vc only as no-calls too - can happen if we strafity by sample and sample as a single no-call. 
+ if ( !vc1.isVariant() || (vc1.hasGenotypes() && vc1.getHomRefCount() + vc1.getNoCallCount() == vc1.getNSamples()) ) { nRefLoci++; } else { - nVariantLoci++; - switch (vc1.getType()) { + switch (vc1.getType()) { case NO_VARIATION: break; case SNP: + nVariantLoci++; nSNPs++; if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++; break; case MNP: + nVariantLoci++; nMNPs++; if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++; break; case INDEL: - if (vc1.isInsertion()) nInsertions++; - else nDeletions++; + nVariantLoci++; + if (vc1.isSimpleInsertion()) + nInsertions++; + else if (vc1.isSimpleDeletion()) + nDeletions++; + else + nComplex++; break; case MIXED: - nComplex++; + nVariantLoci++; + nMixed++; break; default: throw new ReviewedStingException("Unexpected VariantContext type " + vc1.getType()); @@ -180,8 +190,8 @@ public class CountVariants extends VariantEvaluator implements StandardEval { heterozygosity = perLocusRate(nHets); heterozygosityPerBp = perLocusRInverseRate(nHets); hetHomRatio = ratio(nHets, nHomVar); - indelRate = perLocusRate(nDeletions + nInsertions); - indelRatePerBp = perLocusRInverseRate(nDeletions + nInsertions); + indelRate = perLocusRate(nDeletions + nInsertions + nComplex); + indelRatePerBp = perLocusRInverseRate(nDeletions + nInsertions + nComplex); deletionInsertionRatio = ratio(nDeletions, nInsertions); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java index 77def0f30..35fffd815 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java @@ -96,9 +96,9 @@ public class IndelLengthHistogram extends VariantEvaluator { } if ( vc1.isIndel() ) { - if ( 
vc1.isInsertion() ) { + if ( vc1.isSimpleInsertion() ) { indelHistogram.update(vc1.getAlternateAllele(0).length()); - } else if ( vc1.isDeletion() ) { + } else if ( vc1.isSimpleDeletion() ) { indelHistogram.update(-vc1.getReference().length()); } else { throw new ReviewedStingException("Indel type that is not insertion or deletion."); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java deleted file mode 100755 index 6e1b76acd..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java +++ /dev/null @@ -1,221 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.ArrayList; - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: 
- * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author delangel - * @since Apr 11, 2010 - */ - -@Analysis(name = "Indel Metrics by allele count", description = "Shows various stats binned by allele count") -public class IndelMetricsByAC extends VariantEvaluator { - // a mapping from quality score histogram bin to Ti/Tv ratio - @DataPoint(description = "Indel Metrics by allele count") - IndelMetricsByAc metrics = null; - - int numSamples = 0; - - public void initialize(VariantEvalWalker walker) { - numSamples = walker.getNumSamples(); - } - - //@DataPoint(name="Quality by Allele Count", description = "average variant quality for each allele count") - //AlleleCountStats alleleCountStats = null; - private static final int INDEL_SIZE_LIMIT = 100; - private static final int NUM_SCALAR_COLUMNS = 6; - static int len2Index(int ind) { - return ind+INDEL_SIZE_LIMIT; - } - - static int index2len(int ind) { - return ind-INDEL_SIZE_LIMIT-NUM_SCALAR_COLUMNS; - } - - protected final static String[] METRIC_COLUMNS; - static { - METRIC_COLUMNS= new String[NUM_SCALAR_COLUMNS+2*INDEL_SIZE_LIMIT+1]; - METRIC_COLUMNS[0] = "AC"; - METRIC_COLUMNS[1] = "nIns"; - METRIC_COLUMNS[2] = "nDels"; - METRIC_COLUMNS[3] = "n"; - METRIC_COLUMNS[4] = "nComplex"; - METRIC_COLUMNS[5] = "nLong"; - - for (int k=NUM_SCALAR_COLUMNS; k < NUM_SCALAR_COLUMNS+ 2*INDEL_SIZE_LIMIT+1; k++) - METRIC_COLUMNS[k] = 
"indel_size_len"+Integer.valueOf(index2len(k)); - } - - class IndelMetricsAtAC { - public int ac = -1, nIns =0, nDel = 0, nComplex = 0, nLong; - public int sizeCount[] = new int[2*INDEL_SIZE_LIMIT+1]; - - public IndelMetricsAtAC(int ac) { this.ac = ac; } - - public void update(VariantContext eval) { - int eventLength = 0; - if ( eval.isInsertion() ) { - eventLength = eval.getAlternateAllele(0).length(); - nIns++; - } else if ( eval.isDeletion() ) { - eventLength = -eval.getReference().length(); - nDel++; - } - else { - nComplex++; - } - if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) - sizeCount[len2Index(eventLength)]++; - else - nLong++; - - - - } - - // corresponding to METRIC_COLUMNS - public String getColumn(int i) { - if (i >= NUM_SCALAR_COLUMNS && i <=NUM_SCALAR_COLUMNS+ 2*INDEL_SIZE_LIMIT) - return String.valueOf(sizeCount[i-NUM_SCALAR_COLUMNS]); - - switch (i) { - case 0: return String.valueOf(ac); - case 1: return String.valueOf(nIns); - case 2: return String.valueOf(nDel); - case 3: return String.valueOf(nIns + nDel); - case 4: return String.valueOf(nComplex); - case 5: return String.valueOf(nLong); - - default: - throw new ReviewedStingException("Unexpected column " + i); - } - } - } - - class IndelMetricsByAc implements TableType { - ArrayList metrics = new ArrayList(); - Object[] rows = null; - - public IndelMetricsByAc( int nchromosomes ) { - rows = new Object[nchromosomes+1]; - metrics = new ArrayList(nchromosomes+1); - for ( int i = 0; i < nchromosomes + 1; i++ ) { - metrics.add(new IndelMetricsAtAC(i)); - rows[i] = "ac" + i; - } - } - - public Object[] getRowKeys() { - return rows; - } - - public Object[] getColumnKeys() { - return METRIC_COLUMNS; - } - - public String getName() { - return "IndelMetricsByAc"; - } - - // - public String getCell(int ac, int y) { - return metrics.get(ac).getColumn(y); - } - - public String toString() { - return ""; - } - - public void incrValue( VariantContext eval ) { - int ac = -1; - - if ( eval.hasGenotypes() ) - ac 
= eval.getChromosomeCount(eval.getAlternateAllele(0)); - else if ( eval.hasAttribute("AC") ) { - ac = Integer.valueOf(eval.getAttributeAsString("AC")); - } - - if ( ac != -1 ) - metrics.get(ac).update(eval); - } - } - - //public IndelMetricsByAC(VariantEvalWalker parent) { - //super(parent); - // don't do anything - //} - - public String getName() { - return "IndelMetricsByAC"; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public boolean enabled() { - return true; - } - - public String toString() { - return getName(); - } - - public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - final String interesting = null; - - if (eval != null ) { - if ( metrics == null ) { - int nSamples = numSamples; - //int nSamples = 2; - if ( nSamples != -1 ) - metrics = new IndelMetricsByAc(2 * nSamples); - } - - if ( eval.isIndel() && eval.isBiallelic() && - metrics != null ) { - metrics.incrValue(eval); - } - } - - return interesting; // This module doesn't capture any interesting sites, so return null - } - - //public void finalizeEvaluation() { - // - //} -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java index d99196ecf..fc347339d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java @@ -44,7 +44,7 @@ public class IndelStatistics extends VariantEvaluator { @DataPoint(description = "Indel Statistics") IndelStats indelStats = null; - @DataPoint(description = "Indel Classification") + // @DataPoint(description = "Indel Classification") IndelClasses indelClasses = null; int numSamples = 0; @@ -57,13 +57,13 @@ public class IndelStatistics extends 
VariantEvaluator { private static final int IND_HET = 0; private static final int IND_INS = 1; private static final int IND_DEL = 2; - private static final int IND_AT_CG_RATIO = 3; + private static final int IND_COMPLEX = 3; private static final int IND_HET_INS = 4; private static final int IND_HOM_INS = 5; private static final int IND_HET_DEL = 6; private static final int IND_HOM_DEL = 7; private static final int IND_HOM_REF = 8; - private static final int IND_COMPLEX = 9; + private static final int IND_MIXED = 9; private static final int IND_LONG = 10; private static final int IND_AT_EXP = 11; private static final int IND_CG_EXP = 12; @@ -79,15 +79,14 @@ public class IndelStatistics extends VariantEvaluator { } static class IndelStats implements TableType { - protected final static String ALL_SAMPLES_KEY = "allSamples"; - protected final static String[] COLUMN_KEYS; + protected final static String[] COLUMN_KEYS; static { COLUMN_KEYS= new String[NUM_SCALAR_COLUMNS+2*INDEL_SIZE_LIMIT+1]; COLUMN_KEYS[0] = "heterozygosity"; COLUMN_KEYS[1] = "insertions"; COLUMN_KEYS[2] = "deletions"; - COLUMN_KEYS[3] = "AT_CG_expansion_ratio"; + COLUMN_KEYS[3] = "complex"; COLUMN_KEYS[4] = "het_insertions"; COLUMN_KEYS[5] = "homozygous_insertions"; COLUMN_KEYS[6] = "het_deletions"; @@ -104,13 +103,10 @@ public class IndelStatistics extends VariantEvaluator { } // map of sample to statistics - protected final HashMap indelSummary = new HashMap(); + protected final int[] indelSummary; public IndelStats(final VariantContext vc) { - indelSummary.put(ALL_SAMPLES_KEY, new int[COLUMN_KEYS.length]); - for( final String sample : vc.getGenotypes().keySet() ) { - indelSummary.put(sample, new int[COLUMN_KEYS.length]); - } + indelSummary = new int[COLUMN_KEYS.length]; } /** @@ -118,19 +114,10 @@ public class IndelStatistics extends VariantEvaluator { * @return one row per sample */ public Object[] getRowKeys() { - return indelSummary.keySet().toArray(new String[indelSummary.size()]); + return new 
String[]{"all"}; } public Object getCell(int x, int y) { - final Object[] rowKeys = getRowKeys(); - if (y == IND_AT_CG_RATIO) { - - int at = indelSummary.get(rowKeys[x])[IND_AT_EXP]; - int cg = indelSummary.get(rowKeys[x])[IND_CG_EXP]; - return String.format("%4.2f",((double)at) / (Math.max(cg, 1))); - } - else - return String.format("%d",indelSummary.get(rowKeys[x])[y]); - + return String.format("%d",indelSummary[y]); } /** @@ -160,96 +147,49 @@ public class IndelStatistics extends VariantEvaluator { int eventLength = 0; boolean isInsertion = false, isDeletion = false; - if ( vc.isInsertion() ) { + if ( vc.isSimpleInsertion() ) { eventLength = vc.getAlternateAllele(0).length(); - indelSummary.get(ALL_SAMPLES_KEY)[IND_INS]++; + indelSummary[IND_INS]++; isInsertion = true; - } else if ( vc.isDeletion() ) { - indelSummary.get(ALL_SAMPLES_KEY)[IND_DEL]++; + } else if ( vc.isSimpleDeletion() ) { + indelSummary[IND_DEL]++; eventLength = -vc.getReference().length(); isDeletion = true; } - else { - indelSummary.get(ALL_SAMPLES_KEY)[IND_COMPLEX]++; + else if (vc.isComplexIndel()) { + indelSummary[IND_COMPLEX]++; } + else if (vc.isMixed()) + indelSummary[IND_MIXED]++; + if (IndelUtils.isATExpansion(vc,ref)) - indelSummary.get(ALL_SAMPLES_KEY)[IND_AT_EXP]++; + indelSummary[IND_AT_EXP]++; if (IndelUtils.isCGExpansion(vc,ref)) - indelSummary.get(ALL_SAMPLES_KEY)[IND_CG_EXP]++; + indelSummary[IND_CG_EXP]++; // make sure event doesn't overstep array boundaries - if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) { - indelSummary.get(ALL_SAMPLES_KEY)[len2Index(eventLength)]++; - if (eventLength % 3 != 0) - indelSummary.get(ALL_SAMPLES_KEY)[IND_FRAMESHIFT]++; - } - else - indelSummary.get(ALL_SAMPLES_KEY)[IND_LONG]++; - - - for( final String sample : vc.getGenotypes().keySet() ) { - if ( indelSummary.containsKey(sample) ) { - Genotype g = vc.getGenotype(sample); - boolean isVariant = (g.isCalled() && !g.isHomRef()); - if (isVariant) { - // update ins/del count - if (isInsertion) { - 
indelSummary.get(sample)[IND_INS]++; - } - else if (isDeletion) - indelSummary.get(sample)[IND_DEL]++; - else - indelSummary.get(sample)[IND_COMPLEX]++; - - // update histogram - if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) { - indelSummary.get(sample)[len2Index(eventLength)]++; - if (eventLength % 3 != 0) - indelSummary.get(sample)[IND_FRAMESHIFT]++; - } - else - indelSummary.get(sample)[IND_LONG]++; - - if (g.isHet()) - if (isInsertion) - indelSummary.get(sample)[IND_HET_INS]++; - else if (isDeletion) - indelSummary.get(sample)[IND_HET_DEL]++; - else - if (isInsertion) - indelSummary.get(sample)[IND_HOM_INS]++; - else if (isDeletion) - indelSummary.get(sample)[IND_HOM_DEL]++; - - if (IndelUtils.isATExpansion(vc,ref)) - indelSummary.get(sample)[IND_AT_EXP]++; - if (IndelUtils.isCGExpansion(vc,ref)) - indelSummary.get(sample)[IND_CG_EXP]++; - - - } - else - indelSummary.get(sample)[IND_HOM_REF]++; + if (vc.isSimpleDeletion() || vc.isSimpleInsertion()) { + if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) { + indelSummary[len2Index(eventLength)]++; + if (eventLength % 3 != 0) + indelSummary[IND_FRAMESHIFT]++; } + else + indelSummary[IND_LONG]++; } - } } static class IndelClasses implements TableType { - protected final static String ALL_SAMPLES_KEY = "allSamples"; protected final static String[] columnNames = IndelUtils.getIndelClassificationNames(); // map of sample to statistics - protected final HashMap indelClassSummary = new HashMap(); + protected final int[] indelClassSummary; public IndelClasses(final VariantContext vc) { - indelClassSummary.put(ALL_SAMPLES_KEY, new int[columnNames.length]); - for( final String sample : vc.getGenotypes().keySet() ) { - indelClassSummary.put(sample, new int[columnNames.length]); - } + indelClassSummary = new int[columnNames.length]; } /** @@ -257,11 +197,10 @@ public class IndelStatistics extends VariantEvaluator { * @return one row per sample */ public Object[] getRowKeys() { - return indelClassSummary.keySet().toArray(new 
String[indelClassSummary.size()]); + return new String[]{"all"}; } public Object getCell(int x, int y) { - final Object[] rowKeys = getRowKeys(); - return String.format("%d",indelClassSummary.get(rowKeys[x])[y]); + return String.format("%d",indelClassSummary[y]); } /** @@ -285,18 +224,7 @@ public class IndelStatistics extends VariantEvaluator { } private void incrementSampleStat(VariantContext vc, int index) { - indelClassSummary.get(ALL_SAMPLES_KEY)[index]++; - for( final String sample : vc.getGenotypes().keySet() ) { - if ( indelClassSummary.containsKey(sample) ) { - Genotype g = vc.getGenotype(sample); - boolean isVariant = (g.isCalled() && !g.isHomRef()); - if (isVariant) - // update count - indelClassSummary.get(sample)[index]++; - - } - } - + indelClassSummary[index]++; } /* * increment the specified value @@ -344,16 +272,13 @@ public class IndelStatistics extends VariantEvaluator { if (eval != null ) { if ( indelStats == null ) { - int nSamples = numSamples; - - if ( nSamples != -1 ) - indelStats = new IndelStats(eval); + indelStats = new IndelStats(eval); } if ( indelClasses == null ) { indelClasses = new IndelClasses(eval); } - if ( eval.isIndel() && eval.isBiallelic() ) { + if ( eval.isIndel() || eval.isMixed() ) { if (indelStats != null ) indelStats.incrValue(eval, ref); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index 411493d4f..5cdea4e00 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -1,23 +1,25 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import 
org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; +import java.util.List; public class AlleleCount extends VariantStratifier { // needs to know the variant context private ArrayList states = new ArrayList(); @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { + public void initialize() { + List> evals = getVariantEvalWalker().getEvals(); + // we can only work with a single eval VCF, and it must have genotypes - if ( evalNames.size() != 1 ) + if ( evals.size() != 1 ) throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification only works with a single eval vcf"); // There are 2 x n sample chromosomes for diploids diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java index 2ffc7716c..96d9f30ec 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java @@ -2,19 +2,17 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; 
-import java.util.Set; public class AlleleFrequency extends VariantStratifier { // needs to know the variant context private ArrayList states; @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { + public void initialize() { states = new ArrayList(); for( double a = 0.000; a <= 1.005; a += 0.005 ) { states.add(String.format("%.3f", a)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java index c6975808f..9f4123589 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java @@ -1,24 +1,20 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; public class CompRod extends VariantStratifier implements RequiredStratification { - // Needs to know the comp rods - private Set compNames; private ArrayList states; @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { - this.compNames = compNames; - + public void initialize() { states = new ArrayList(); - states.addAll(compNames); + for ( RodBinding rod : getVariantEvalWalker().getComps() ) + states.add(rod.getName()); } public ArrayList getAllStates() { diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java index c14355035..e12a1ba97 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java @@ -2,20 +2,18 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; public class Contig extends VariantStratifier { // needs to know the variant context private ArrayList states; @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { + public void initialize() { states = new ArrayList(); - states.addAll(contigNames); + states.addAll(getVariantEvalWalker().getContigNames()); states.add("all"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java index e1f2ae983..ff49c8ba9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java @@ -2,11 +2,9 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import 
org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; /** * CpG is a stratification module for VariantEval that divides the input data by within/not within a CpG site @@ -24,7 +22,7 @@ public class CpG extends VariantStratifier { private ArrayList states; @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { + public void initialize() { states = new ArrayList(); states.add("all"); states.add("CpG"); @@ -40,7 +38,7 @@ public class CpG extends VariantStratifier { if (ref != null && ref.getBases() != null) { String fwRefBases = new String(ref.getBases()); - String leftFlank = fwRefBases.substring((fwRefBases.length()/2) - 1, (fwRefBases.length()/2) + 1); + //String leftFlank = fwRefBases.substring((fwRefBases.length()/2) - 1, (fwRefBases.length()/2) + 1); String rightFlank = fwRefBases.substring((fwRefBases.length()/2), (fwRefBases.length()/2) + 2); //if (leftFlank.equalsIgnoreCase("CG") || leftFlank.equalsIgnoreCase("GC") || rightFlank.equalsIgnoreCase("CG") || rightFlank.equalsIgnoreCase("GC")) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java index 155a66186..cc878e975 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java @@ -2,13 +2,11 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import 
org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; -import java.util.Set; public class Degeneracy extends VariantStratifier { private ArrayList states; @@ -16,7 +14,7 @@ public class Degeneracy extends VariantStratifier { private HashMap> degeneracies; @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { + public void initialize() { states = new ArrayList(); states.add("1-fold"); states.add("2-fold"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java index 40f952fd2..0bfecee25 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java @@ -1,24 +1,20 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; public class EvalRod extends VariantStratifier implements RequiredStratification { - // needs to know the eval rods - private Set evalNames; private ArrayList states; @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { - this.evalNames = evalNames; - + public void initialize() { states = new ArrayList(); - 
states.addAll(evalNames); + for ( RodBinding rod : getVariantEvalWalker().getEvals() ) + states.add(rod.getName()); } public ArrayList getAllStates() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java index 3b7a419f2..3e3cbc224 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java @@ -2,18 +2,16 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; public class Filter extends VariantStratifier { // needs to know the variant context private ArrayList states; @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { + public void initialize() { states = new ArrayList(); states.add("called"); states.add("filtered"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java index c6c094f8e..0de871fe6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java @@ -2,18 +2,16 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; 
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; public class FunctionalClass extends VariantStratifier { // needs to know the variant context private ArrayList states; @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { + public void initialize() { states = new ArrayList(); states.add("all"); states.add("silent"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java index 76efedbf4..59b991c4d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java @@ -15,8 +15,8 @@ public class JexlExpression extends VariantStratifier implements StandardStratif private ArrayList states; @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { - this.jexlExpressions = jexlExpressions; + public void initialize() { + jexlExpressions = getVariantEvalWalker().getJexlExpressions(); states = new ArrayList(); states.add("none"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java index d2e4392a5..a3810a4c0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java @@ -1,21 +1,21 @@ 
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.*; public class Novelty extends VariantStratifier implements StandardStratification { // needs the variant contexts and known names - private Set knownNames; + private List> knowns; final private ArrayList states = new ArrayList(Arrays.asList("all", "known", "novel")); @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { - this.knownNames = knownNames; + public void initialize() { + knowns = getVariantEvalWalker().getKnowns(); } public ArrayList getAllStates() { @@ -24,13 +24,11 @@ public class Novelty extends VariantStratifier implements StandardStratification public ArrayList getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { if (tracker != null && eval != null) { - for (final String knownName : knownNames) { - final Collection knownComps = tracker.getValues(VariantContext.class, knownName, ref.getLocus()); - for ( final VariantContext c : knownComps ) { - // loop over sites, looking for something that matches the type eval - if ( eval.getType() == c.getType() ) { - return new ArrayList(Arrays.asList("all", "known")); - } + final Collection knownComps = tracker.getValues(knowns, ref.getLocus()); + for ( final VariantContext c : knownComps ) { + // loop over sites, looking for something that matches the type eval + if ( eval.getType() == c.getType() ) { + return new ArrayList(Arrays.asList("all", "known")); } } } diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java index a2a3eb3fb..b714fa291 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java @@ -2,20 +2,18 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; public class Sample extends VariantStratifier { // needs the sample names private ArrayList samples; @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { + public void initialize() { samples = new ArrayList(); - samples.addAll(sampleNames); + samples.addAll(getVariantEvalWalker().getSampleNamesForStratification()); } public ArrayList getAllStates() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java index 2c4b8bc46..df6523207 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java @@ -3,11 +3,9 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; 
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; public abstract class VariantStratifier implements Comparable { private VariantEvalWalker variantEvalWalker; @@ -27,7 +25,7 @@ public abstract class VariantStratifier implements Comparable { this.variantEvalWalker = variantEvalWalker; } - public abstract void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames); + public abstract void initialize(); public ArrayList getAllStates() { return new ArrayList(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 33fb008ca..f31dd9f9f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -103,7 +103,7 @@ public class VariantEvalUtils { try { VariantStratifier vs = c.newInstance(); vs.setVariantEvalWalker(variantEvalWalker); - vs.initialize(variantEvalWalker.getJexlExpressions(), variantEvalWalker.getCompNames(), variantEvalWalker.getKnownNames(), variantEvalWalker.getEvalNames(), variantEvalWalker.getSampleNamesForStratification(), variantEvalWalker.getContigNames()); + vs.initialize(); strats.add(vs); } catch (InstantiationException e) { @@ -347,9 +347,9 @@ public class VariantEvalUtils { } } } - - bindings.put(track, mapping); } + + bindings.put(track, mapping); } return bindings; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index abe27e483..16f1abf1b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -45,10 +45,43 @@ import java.io.FileNotFoundException; import java.util.*; /** - * Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration + * Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel truth sensitivity levels which were specified during VariantRecalibration + * + *

    + * Using the tranche file generated by the previous step, the ApplyRecalibration walker looks at each variant's VQSLOD value + * and decides which tranche it falls in. Variants in tranches that fall below the specified truth sensitivity filter level + * have their filter field annotated with their tranche level. This will result in a call set that simultaneously is filtered + * to the desired level but also has the information necessary to pull out more variants for a higher sensitivity but a + * slightly lower quality level. + * + *

    + * See the GATK wiki for a tutorial and example recalibration accuracy plots. + * http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration + * + *

    Input

    + *

    + * The input raw variants to be recalibrated. + *

    + * The recalibration table file in CSV format that was generated by the VariantRecalibrator walker. + *

    + * The tranches file that was generated by the VariantRecalibrator walker. + * + *

    Output

    + *

    + * A recalibrated VCF file in which each variant is annotated with its VQSLOD and filtered if the score is below the desired quality level. + * + *

    Examples

    + *
    + * java -Xmx3g -jar GenomeAnalysisTK.jar \
    + *   -T ApplyRecalibration \
    + *   -R reference/human_g1k_v37.fasta \
    + *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
    + *   --ts_filter_level 99.0 \
    + *   -tranchesFile path/to/output.tranches \
    + *   -recalFile path/to/output.recal \
    + *   -o path/to/output.recalibrated.filtered.vcf
    + * 
    * - * @author rpoplin - * @since Mar 14, 2011 */ public class ApplyRecalibration extends RodWalker { @@ -57,11 +90,11 @@ public class ApplyRecalibration extends RodWalker { // Inputs ///////////////////////////// /** - * The raw input variants to be recalibrated. + * These calls should be unfiltered and annotated with the error covariates that are intended to use for modeling. */ @Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true) public List> input; - @Input(fullName="recal_file", shortName="recalFile", doc="The output recal file used by ApplyRecalibration", required=true) + @Input(fullName="recal_file", shortName="recalFile", doc="The input recal file used by ApplyRecalibration", required=true) private File RECAL_FILE; @Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true) private File TRANCHES_FILE; @@ -69,7 +102,7 @@ public class ApplyRecalibration extends RodWalker { ///////////////////////////// // Outputs ///////////////////////////// - @Output( doc="The output filtered, recalibrated VCF file", required=true) + @Output( doc="The output filtered and recalibrated VCF file in which each variant is annotated with its VQSLOD value", required=true) private VCFWriter vcfWriter = null; ///////////////////////////// @@ -77,7 +110,7 @@ public class ApplyRecalibration extends RodWalker { ///////////////////////////// @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false) private double TS_FILTER_LEVEL = 99.0; - @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false) + @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the 
specified filter name is marked in the input VCF file", required=false) private String[] IGNORE_INPUT_FILTERS = null; @Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both SNPs and indels simultaneously.", required = false) public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java index 17461de2f..3fa9c3883 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java @@ -207,6 +207,7 @@ public class GaussianMixtureModel { for( final boolean isNull : datum.isNull ) { if( isNull ) { return evaluateDatumMarginalized( datum ); } } + // Fill an array with the log10 probability coming from each Gaussian and then use MathUtils to sum them up correctly final double[] pVarInGaussianLog10 = new double[gaussians.size()]; int gaussianIndex = 0; for( final MultivariateGaussian gaussian : gaussians ) { @@ -215,6 +216,7 @@ public class GaussianMixtureModel { return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k)) } + // Used only to decide which covariate dimension is most divergent in order to report in the culprit info field annotation public Double evaluateDatumInOneDimension( final VariantDatum datum, final int iii ) { if(datum.isNull[iii]) { return null; } @@ -229,7 +231,7 @@ public class GaussianMixtureModel { } public double evaluateDatumMarginalized( final VariantDatum datum ) { - int numSamples = 0; + int numRandomDraws = 0; double sumPVarInGaussian = 0.0; final 
int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization final double[] pVarInGaussianLog10 = new double[gaussians.size()]; @@ -248,10 +250,10 @@ public class GaussianMixtureModel { // add this sample's probability to the pile in order to take an average in the end sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); // p = 10 ^ Sum(pi_k * p(v|n,k)) - numSamples++; + numRandomDraws++; } } } - return Math.log10( sumPVarInGaussian / ((double) numSamples) ); + return Math.log10( sumPVarInGaussian / ((double) numRandomDraws) ); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index cb4d94332..429becfc7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -233,13 +233,15 @@ public class VariantDataManager { } public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC, final HashMap rodToPriorMap, - final List> training, final List> truth, final List> known, final List> badSites) { + final List> training, final List> truth, final List> known, final List> badSites, final List> resource) { datum.isKnown = false; datum.atTruthSite = false; datum.atTrainingSite = false; datum.atAntiTrainingSite = false; datum.prior = 2.0; + //BUGBUG: need to clean this up + for( final RodBinding rod : training ) { for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { @@ -264,6 +266,13 @@ public class VariantDataManager { } } } + for( 
final RodBinding rod : resource ) { + for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { + if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { + datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) ); + } + } + } for( final RodBinding rod : badSites ) { for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) { if( trainVC != null ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index da9da936b..df4faebd1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -45,10 +45,54 @@ import java.io.PrintStream; import java.util.*; /** - * Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations and evaluates the variant -- assigning an informative lod score + * Create a Gaussian mixture model by looking at the annotations values over a high quality subset of the input call set and then evaluate all input variants. + * + *

    + * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with ApplyRecalibration walker. + * + *

    + * The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set. + * One can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call. + * The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship + * between SNP call annotations (QD, SB, HaplotypeScore, HRun, for example) and the probability that a SNP is a true genetic + * variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided + * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive + * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the + * probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is + * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. + * + *

    + * See the GATK wiki for a tutorial and example recalibration accuracy plots. + * http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration + * + *

    Input

    + *

    + * The input raw variants to be recalibrated. + *

    + * Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below. + * + *

    Output

    + *

    + * A recalibration table file in CSV format that is used by the ApplyRecalibration walker. + *

    + * A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. + * + *

    Examples

    + *
    + * java -Xmx4g -jar GenomeAnalysisTK.jar \
    + *   -T VariantRecalibrator \
    + *   -R reference/human_g1k_v37.fasta \
    + *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
    + *   -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \
    + *   -training:prior=15.0 hapmap_3.3.b37.sites.vcf \
    + *   -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \
    + *   -known:prior=8.0 dbsnp_132.b37.vcf \
    + *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
    + *   -recalFile path/to/output.recal \
    + *   -tranchesFile path/to/output.tranches \
    + *   -rscriptFile path/to/output.plots.R
    + * 
    * - * User: rpoplin - * Date: 3/12/11 */ public class VariantRecalibrator extends RodWalker, ExpandingArrayList> implements TreeReducible> { @@ -62,42 +106,44 @@ public class VariantRecalibrator extends RodWalker> input; + /** - * A list of training variants used to train the Gaussian mixture model. - * * Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model. */ @Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true) public List> training; + /** - * A list of true variants to be used when deciding the truth sensitivity cut of the final callset. - * * When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used. * Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example. */ @Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true) public List> truth; + /** - * A list of known variants to be used for metric comparison purposes. - * * The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes. * The output metrics are stratified by known status in order to aid in comparisons with other call sets. */ @Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false) public List> known = Collections.emptyList(); + /** - * A list of known bad variants used to supplement training the negative model. - * * In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list * with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example). 
*/ @Input(fullName="badSites", shortName = "badSites", doc="A list of known bad variants used to supplement training the negative model", required=false) public List> badSites = Collections.emptyList(); + /** + * Any set of sites for which you would like to apply a prior probability but for which you don't want to use as training, truth, or known sites. + */ + @Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm", required=false) + public List> resource = Collections.emptyList(); + ///////////////////////////// // Outputs ///////////////////////////// @@ -109,13 +155,29 @@ public class VariantRecalibrator extends RodWalker rod : allInputBindings ) { try { rodToPriorMap.put(rod.getName(), (rod.getTags().containsKey("prior") ? Double.parseDouble(rod.getTags().getValue("prior")) : 0.0) ); @@ -207,9 +270,9 @@ public class VariantRecalibrator extends RodWalker + * CombineVariants combines VCF records from different sources. Any (unique) name can be used to bind your rod data + * and any number of sources can be input. This tool currently supports two different combination types for each of + * variants (the first 8 fields of the VCF) and genotypes (the rest). * Merge: combines multiple records into a single one; if sample names overlap then they are uniquified. * Union: assumes each rod represents the same set of samples (although this is not enforced); using the - * priority list (if provided), emits a single record instance at every position represented in the rods. + * priority list (if provided), it emits a single record instance at every position represented in the rods. + * + * CombineVariants will include a record at every site in all of your input VCF files, and annotate which input ROD + * bindings the record is present, pass, or filtered in in the set attribute in the INFO field. 
In effect, + * CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the N merged VCFs + * can be extracted using JEXL expressions on the set attribute using SelectVariants. If you want to extract just + * the records in common between two VCFs, you would first run CombineVariants on the two files to generate a single + * VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out + * in the detailed example on the wiki. + * + *

    Input

    + *

    + * One or more variant sets to combine. + *

    + * + *

    Output

    + *

    + * A combined VCF. + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T CombineVariants \
    + *   --variant input1.vcf \
    + *   --variant input2.vcf \
    + *   -o output.vcf \
    + *   -genotypeMergeOptions UNIQUIFY
    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T CombineVariants \
    + *   --variant:foo input1.vcf \
    + *   --variant:bar input2.vcf \
    + *   -o output.vcf \
    + *   -genotypeMergeOptions PRIORITIZE
    + *   -priority foo,bar
    + * 
    + * */ @Reference(window=@Window(start=-50,stop=50)) public class CombineVariants extends RodWalker { @@ -69,32 +113,43 @@ public class CombineVariants extends RodWalker { @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; - // the types of combinations we currently allow - @Argument(shortName="genotypeMergeOptions", doc="How should we merge genotype records for samples shared across the ROD files?", required=false) + @Argument(shortName="genotypeMergeOptions", doc="Determines how we should merge genotype records for samples shared across the ROD files", required=false) public VariantContextUtils.GenotypeMergeType genotypeMergeOption = VariantContextUtils.GenotypeMergeType.PRIORITIZE; - @Argument(shortName="filteredRecordsMergeType", doc="How should we deal with records seen at the same site in the VCF, but with different FILTER fields? KEEP_IF_ANY_UNFILTERED PASSes the record if any record is unfiltered, KEEP_IF_ALL_UNFILTERED requires all records to be unfiltered", required=false) + @Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false) public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED; - @Argument(fullName="rod_priority_list", shortName="priority", doc="When taking the union of variants containing genotypes: a comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted; a complete priority list MUST be provided", required=false) + /** + * Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided. 
+ */ + @Argument(fullName="rod_priority_list", shortName="priority", doc="A comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted", required=false) public String PRIORITY_STRING = null; @Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false) public boolean printComplexMerges = false; - @Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotation don't appear in the combined VCF", required=false) + @Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF", required=false) public boolean filteredAreUncalled = false; - @Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype INFO field", required=false) + /** + * Used to generate a sites-only file. + */ + @Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype FORMAT fields", required=false) public boolean minimalVCF = false; - @Argument(fullName="setKey", shortName="setKey", doc="Key, by default set, in the INFO key=value tag emitted describing which set the combined VCF record came from. Set to null if you don't want the set field emitted.", required=false) + /** + * Set to 'null' if you don't want the set field emitted. 
+ */ + @Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false) public String SET_KEY = "set"; - @Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls so that one can simply perform a merge sort to combine the VCFs into one, drastically reducing the runtime.", required=false) + /** + * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime.. + */ + @Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls", required=false) public boolean ASSUME_IDENTICAL_SAMPLES = false; - @Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if variant is present in at least N input files.", required=false) + @Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if the variant is present in at least N input files.", required=false) public int minimumN = 1; @Hidden diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java index c47a015c6..c9f330db5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java @@ -46,6 +46,31 @@ import java.util.*; /** * Left-aligns indels from a variants file. + * + *

    + * LeftAlignVariants is a tool that takes a VCF file and left-aligns any indels inside it. The same indel can often be + * placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to + * place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. + * + *

    Input

    + *

    + * A variant set to left-align. + *

    + * + *

    Output

    + *

    + * A left-aligned VCF. + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T LeftAlignVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf
    + * 
    + * */ @Reference(window=@Window(start=-200,stop=200)) public class LeftAlignVariants extends RodWalker { @@ -108,7 +133,7 @@ public class LeftAlignVariants extends RodWalker { // get the indel length int indelLength; - if ( vc.isDeletion() ) + if ( vc.isSimpleDeletion() ) indelLength = vc.getReference().length(); else indelLength = vc.getAlternateAllele(0).length(); @@ -125,7 +150,7 @@ public class LeftAlignVariants extends RodWalker { // create a CIGAR string to represent the event ArrayList elements = new ArrayList(); elements.add(new CigarElement(originalIndex, CigarOperator.M)); - elements.add(new CigarElement(indelLength, vc.isDeletion() ? CigarOperator.D : CigarOperator.I)); + elements.add(new CigarElement(indelLength, vc.isSimpleDeletion() ? CigarOperator.D : CigarOperator.I)); elements.add(new CigarElement(refSeq.length - originalIndex, CigarOperator.M)); Cigar originalCigar = new Cigar(elements); @@ -140,8 +165,8 @@ public class LeftAlignVariants extends RodWalker { int indelIndex = originalIndex-difference; byte[] newBases = new byte[indelLength]; - System.arraycopy((vc.isDeletion() ? refSeq : originalIndel), indelIndex, newBases, 0, indelLength); - Allele newAllele = Allele.create(newBases, vc.isDeletion()); + System.arraycopy((vc.isSimpleDeletion() ? refSeq : originalIndel), indelIndex, newBases, 0, indelLength); + Allele newAllele = Allele.create(newBases, vc.isSimpleDeletion()); newVC = updateAllele(newVC, newAllele, refSeq[indelIndex-1]); writer.add(newVC); @@ -153,14 +178,14 @@ public class LeftAlignVariants extends RodWalker { } private static byte[] makeHaplotype(VariantContext vc, byte[] ref, int indexOfRef, int indelLength) { - byte[] hap = new byte[ref.length + (indelLength * (vc.isDeletion() ? -1 : 1))]; + byte[] hap = new byte[ref.length + (indelLength * (vc.isSimpleDeletion() ? 
-1 : 1))]; // add the bases before the indel System.arraycopy(ref, 0, hap, 0, indexOfRef); int currentPos = indexOfRef; // take care of the indel - if ( vc.isDeletion() ) { + if ( vc.isSimpleDeletion() ) { indexOfRef += indelLength; } else { System.arraycopy(vc.getAlternateAllele(0).getBases(), 0, hap, currentPos, indelLength); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 16733bc44..bfe7198cf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -50,54 +50,184 @@ import java.io.PrintStream; import java.util.*; /** - * Takes a VCF file, selects variants based on sample(s) in which it was found and/or on various annotation criteria, - * recompute the value of certain annotations based on the new sample set, and output a new VCF with the results. + * Selects variants from a VCF source. + * + *

    + * Often, a VCF containing many samples and/or variants will need to be subset in order to facilitate certain analyses + * (e.g. comparing and contrasting cases vs. controls; extracting variant or non-variant loci that meet certain + * requirements, displaying just a few samples in a browser like IGV, etc.). SelectVariants can be used for this purpose. + * Given a single VCF file, one or more samples can be extracted from the file (based on a complete sample name or a + * pattern match). Variants can be further selected by specifying criteria for inclusion, i.e. "DP > 1000" (depth of + * coverage greater than 1000x), "AF < 0.25" (sites with allele frequency less than 0.25). These JEXL expressions are + * documented in the Using JEXL expressions section (http://www.broadinstitute.org/gsa/wiki/index.php/Using_JEXL_expressions). + * One can optionally include concordance or discordance tracks for use in selecting overlapping variants. + * + *

    Input

    + *

    + * A variant set to select from. + *

    + * + *

    Output

    + *

    + * A selected VCF. + *

    + * + *

    Examples

    + *
    + * Select two samples out of a VCF with many samples:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -sn SAMPLE_A_PARC \
    + *   -sn SAMPLE_B_ACTG
    + *
    + * Select two samples and any sample that matches a regular expression:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -sn SAMPLE_1_PARC \
    + *   -sn SAMPLE_1_ACTG \
    + *   -sn 'SAMPLE.+PARC'
    + *
    + * Select any sample that matches a regular expression and sites where the QD annotation is more than 10:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -sn 'SAMPLE.+PARC' \
    + *   -select "QD > 10.0"
    + *
    + * Select a sample and exclude non-variant loci and filtered loci:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -sn SAMPLE_1_ACTG \
    + *   -env \
    + *   -ef
    + *
    + * Select a sample and restrict the output vcf to a set of intervals:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -L /path/to/my.interval_list \
    + *   -sn SAMPLE_1_ACTG
    + *
    + * Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called by this dataset):
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant hapmap.vcf \
    + *   --discordance myCalls.vcf \
    + *   -o output.vcf \
    + *   -sn mySample
    + *
    + * Select all calls made by both myCalls and hisCalls (useful to take a look at what is consistent between the two callers):
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant myCalls.vcf \
    + *   --concordance hisCalls.vcf \
    + *   -o output.vcf \
    + *   -sn mySample
    + *
    + * Generating a VCF of all the variants that are mendelian violations:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -SM family.yaml \
    + *   -family NA12891+NA12892=NA12878 \
    + *   -mvq 50
    + *
    + * Creating a sample of exactly 1000 variants randomly chosen with equal probability from the variant VCF:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -number 1000
    + *
    + * Creating a set with 50% of the total number of variants in the variant VCF:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -fraction 0.5
    + *
    + * 
    + * */ public class SelectVariants extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); /** - * A site is considered discordant if there exists some sample in eval that has a non-reference genotype + * A site is considered discordant if there exists some sample in the variant track that has a non-reference genotype * and either the site isn't present in this track, the sample isn't present in this track, * or the sample is called reference in this track. */ - @Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this Feature comparison track", required=false) + @Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this comparison track", required=false) private RodBinding discordanceTrack; /** * A site is considered concordant if (1) we are not looking for specific samples and there is a variant called - * in both variants and concordance tracks or (2) every sample present in eval is present in the concordance - * track and they have the sample genotype call. + * in both the variant and concordance tracks or (2) every sample present in the variant track is present in the + * concordance track and they have the sample genotype call. */ - @Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this Feature comparison track", required=false) + @Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this comparison track", required=false) private RodBinding concordanceTrack; @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; - @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. 
Can be specified multiple times.", required=false) - public Set sampleNames; + @Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false) + public Set sampleNames = new HashSet(0); - @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times.", required=false) - public Set sampleExpressions; + @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false) + public Set sampleExpressions ; - @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false) + @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false) public Set sampleFiles; - @Argument(shortName="select", doc="One or more criteria to use when selecting the data. Evaluated *after* the specified samples are extracted and the INFO-field annotations are updated.", required=false) + /** + * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded. + */ + @Argument(fullName="exclude_sample_name", shortName="xl_sn", doc="Exclude genotypes from this sample. Can be specified multiple times", required=false) + public Set XLsampleNames = new HashSet(0); + + /** + * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded. + */ + @Argument(fullName="exclude_sample_file", shortName="xl_sf", doc="File containing a list of samples (one per line) to exclude. 
Can be specified multiple times", required=false) + public Set XLsampleFiles = new HashSet(0); + + /** + * Note that these expressions are evaluated *after* the specified samples are extracted and the INFO field annotations are updated. + */ + @Argument(shortName="select", doc="One or more criteria to use when selecting the data", required=false) public ArrayList SELECT_EXPRESSIONS = new ArrayList(); - @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false) + @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false) private boolean EXCLUDE_NON_VARIANTS = false; - @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis.", required=false) + @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false) private boolean EXCLUDE_FILTERED = false; - @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't include filtered loci.", required=false) + @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false) private boolean KEEP_ORIGINAL_CHR_COUNTS = false; @Hidden - @Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false) + @Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false) private boolean KEEP_AF_SPECTRUM = false; @Hidden @@ -108,30 +238,43 @@ public class SelectVariants extends RodWalker { @Argument(fullName="family_structure_file", shortName="familyFile", doc="USE YAML FILE INSTEAD (-SM) !!! 
string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) private File FAMILY_STRUCTURE_FILE = null; - @Argument(fullName="family_structure", shortName="family", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) + /** + * String formatted as dad+mom=child where these parameters determine which sample names are examined. + */ + @Argument(fullName="family_structure", shortName="family", doc="Deprecated; use the -SM argument instead", required=false) private String FAMILY_STRUCTURE = ""; - @Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only. Sample metadata information will be taken from YAML file (passed with -SM)", required=false) + /** + * Sample metadata information will be taken from a YAML file (see the -SM argument). + */ + @Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only", required=false) private Boolean MENDELIAN_VIOLATIONS = false; @Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; - @Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track. Variants are kept in memory to guarantee that n variants will be output, so use it only for a reasonable number of variants. Use select_random_fraction for larger numbers of variants", required=false) + /** + * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so use it only for a reasonable + * number of variants. Use --select_random_fraction for larger numbers of variants. 
+ */ + @Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false) private int numRandom = 0; - @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track. Routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions", required=false) + /** + * This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions. + */ + @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false) private double fractionRandom = 0; - @Argument(fullName="selectSNPs", shortName="snps", doc="Select only SNPs.", required=false) + @Argument(fullName="selectSNPs", shortName="snps", doc="Select only SNPs from the input file", required=false) private boolean SELECT_SNPS = false; - @Argument(fullName="selectIndels", shortName="indels", doc="Select only Indels.", required=false) + @Argument(fullName="selectIndels", shortName="indels", doc="Select only indels from the input file", required=false) private boolean SELECT_INDELS = false; @Hidden - @Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) - private String outMVFile = null; + @Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! 
string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) + private String outMVFile = null; /* Private class used to store the intermediate variants in the integer random selection process */ private class RandomVariantStructure { @@ -173,8 +316,7 @@ public class SelectVariants extends RodWalker { private ArrayList afBoosts = null; double bkDelta = 0.0; - - private PrintStream outMVFileStream = null; + private PrintStream outMVFileStream = null; /** @@ -190,19 +332,27 @@ public class SelectVariants extends RodWalker { Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); Collection samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions); + // first, add any requested samples samples.addAll(samplesFromFile); samples.addAll(samplesFromExpressions); - if (sampleNames != null) - samples.addAll(sampleNames); + samples.addAll(sampleNames); - if(samples.isEmpty()) { + // if none were requested, we want all of them + if ( samples.isEmpty() ) { samples.addAll(vcfSamples); NO_SAMPLES_SPECIFIED = true; } - for (String sample : samples) { + // now, exclude any requested samples + Collection XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles); + samples.removeAll(XLsamplesFromFile); + samples.removeAll(XLsampleNames); + + if ( samples.size() == 0 && !NO_SAMPLES_SPECIFIED ) + throw new UserException("All samples requested to be included were also requested to be excluded."); + + for ( String sample : samples ) logger.info("Including sample '" + sample + "'"); - } // Initialize VCF header Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java index 5c7fb268c..c0f695966 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import org.broad.tribble.Feature; import org.broad.tribble.TribbleException; import org.broad.tribble.dbsnp.DbSNPFeature; import org.broadinstitute.sting.commandline.*; @@ -34,7 +33,6 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -48,7 +46,32 @@ import java.util.Set; /** - * Validates a variants file. + * Strictly validates a variants file. + * + *

    + * ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it. + * Checks include the correctness of the reference base(s), accuracy of AC & AN values, tests against rsIDs + * when a dbSNP file is provided, and that all alternate alleles are present in at least one sample. + * + *

    Input

    + *

    + * A variant set to validate. + *

    + * + *

    Output

    + *

    + * A validation report. + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T ValidateVariants \
    + *   --variant input.vcf \
    + *   --dbsnp dbsnp.vcf
    + * 
    + * */ @Reference(window=@Window(start=0,stop=100)) public class ValidateVariants extends RodWalker { @@ -67,10 +90,13 @@ public class ValidateVariants extends RodWalker { @Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false) protected ValidationType type = ValidationType.ALL; - @Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "should we skip validation on filtered records?", required = false) + /** + * By default, even filtered records are validated. + */ + @Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "skip validation on filtered records", required = false) protected Boolean DO_NOT_VALIDATE_FILTERED = false; - @Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "should we just emit warnings on errors instead of terminating the run?", required = false) + @Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "just emit warnings on errors instead of terminating the run at the first instance", required = false) protected Boolean WARN_ON_ERROR = false; private long numErrors = 0; @@ -111,11 +137,11 @@ public class ValidateVariants extends RodWalker { Allele reportedRefAllele = vc.getReference(); Allele observedRefAllele; // insertions - if ( vc.isInsertion() ) { + if ( vc.isSimpleInsertion() ) { observedRefAllele = Allele.create(Allele.NULL_ALLELE_STRING); } // deletions - else if ( vc.isDeletion() || vc.isMixed() || vc.isMNP() ) { + else if ( vc.isSimpleDeletion() || vc.isMixed() || vc.isMNP() ) { // we can't validate arbitrarily long deletions if ( reportedRefAllele.length() > 100 ) { logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", reportedRefAllele.length(), vc.getChr(), vc.getStart())); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java index 6ed0bbd16..b98646270 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java @@ -25,10 +25,8 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -43,21 +41,57 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.util.*; /** - * Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes) + * Annotates a validation (from e.g. Sequenom) VCF with QC metrics (HW-equilibrium, % failed probes) + * + *

    + * The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes). + * The tool produces a VCF that is annotated with information pertaining to plate quality control and by + * default is soft-filtered by high no-call rate or low Hardy-Weinberg probability. + * If you have .ped files, please first convert them to VCF format + * (see http://www.broadinstitute.org/gsa/wiki/index.php/Converting_ped_to_vcf). + * + *

    Input

    + *

    + * A validation VCF to annotate. + *

    + * + *

    Output

    + *

    + * An annotated VCF. + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T VariantValidationAssessor \
    + *   --variant input.vcf \
    + *   -o output.vcf
    + * 
    + * */ @Reference(window=@Window(start=0,stop=40)) public class VariantValidationAssessor extends RodWalker { - @Input(fullName="variants", shortName = "V", doc="Input VCF file", required=true) - public RodBinding variants; + + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfwriter = null; - @Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid [default:20]", required=false) + @Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid", required=false) protected double maxHardy = 20.0; - @Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a fraction) to consider an assay valid [default:0.05]", required=false) + + /** + * To disable, set to a value greater than 1. + */ + @Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a fraction) to consider an assay valid", required=false) protected double maxNoCall = 0.05; - @Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid [default:1.1, disabled]", required=false) + + /** + * To disable, set to a value greater than 1. 
+ */ + @Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid", required=false) protected double maxHomNonref = 1.1; //@Argument(fullName="populationFile", shortName="populations", doc="A tab-delimited file relating individuals to populations,"+ @@ -93,7 +127,7 @@ public class VariantValidationAssessor extends RodWalker if ( tracker == null ) return null; - VariantContext vc = tracker.getFirstValue(variants, ref.getLocus()); + VariantContext vc = tracker.getFirstValue(variantCollection.variants, ref.getLocus()); // ignore places where we don't have a variant if ( vc == null ) return null; @@ -101,7 +135,7 @@ public class VariantValidationAssessor extends RodWalker if ( sampleNames == null ) sampleNames = new TreeSet(vc.getSampleNames()); - return addVariantInformationToCall(ref, vc); + return addVariantInformationToCall(vc); } public Integer reduce(VariantContext call, Integer numVariants) { @@ -113,7 +147,7 @@ public class VariantValidationAssessor extends RodWalker } public void onTraversalDone(Integer finalReduce) { - final List inputNames = Arrays.asList(variants.getName()); + final List inputNames = Arrays.asList(variantCollection.variants.getName()); // setup the header fields Set hInfo = new HashSet(); @@ -159,7 +193,7 @@ public class VariantValidationAssessor extends RodWalker } - private VariantContext addVariantInformationToCall(ReferenceContext ref, VariantContext vContext) { + private VariantContext addVariantInformationToCall(VariantContext vContext) { // check possible filters double hwPvalue = hardyWeinbergCalculation(vContext); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index af3593ce4..19db58e0c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -40,95 +40,109 @@ import java.io.PrintStream; import java.util.*; /** - * Emits specific fields as dictated by the user from one or more VCF files. + * Emits specific fields from a VCF file to a tab-delimited table + + *

    + * This walker accepts a single VCF file and writes out user-selected fields from the + * VCF as a header-containing, tab-delimited file. The user specifies one or more + * fields to print with the -F NAME, each of which appears as a single column in + * the output file, with a header named NAME, and the value of this field in the VCF + * one per line. NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding + * in the INFO field (AC=10). Note that this tool does not support capturing any + * GENOTYPE field values. If a VCF record is missing a value, then the tool by + * default throws an error, but the special value NA can be emitted instead with + * appropriate tool arguments. + * + *

    + * + *

    Input

    + *

    + *

      + *
    • A VCF file
    • + *
    • A list of -F fields to write
    • + *
    + *

    + * + *

    Output

    + *

    + * A tab-delimited file containing the values of the requested fields in the VCF file + *

    + * + *

    Examples

    + *
    + *     -T $WalkerName \
    + *     -V file.vcf \
    + *     -F CHROM -F POS -F ID -F QUAL -F AC \
    + *     -o results.table
    + *
    + *     would produce a file that looks like:
    + *
    + *     CHROM    POS ID      QUAL    AC
    + *     1        10  .       50      1
    + *     1        20  rs10    99      10
    + *     et cetera...
    + * 
    + * + * @author Mark DePristo + * @since 2010 */ public class VariantsToTable extends RodWalker { - @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @Output(doc="File to which results should be written",required=true) protected PrintStream out; - @Argument(fullName="fields", shortName="F", doc="Fields to emit from the VCF, allows any VCF field, any info field, and some meta fields like nHets", required=true) - public ArrayList fieldsToTake = new ArrayList(); + /** + * -F NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding in the INFO field (e.g., AC=10). + * Note that this tool does not support capturing any GENOTYPE field values. Note this argument + * accepts any number of inputs. So -F CHROM -F POS is allowed. + */ + @Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true) + public List fieldsToTake = new ArrayList(); - @Argument(fullName="showFiltered", shortName="raw", doc="Include filtered records") + /** + * By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered). + * Throwing this flag will cause $WalkerName to emit values regardless of the FILTER field value. + */ + @Advanced + @Argument(fullName="showFiltered", shortName="raw", doc="If provided, field values from filtered records will be included in the output", required=false) public boolean showFiltered = false; - @Argument(fullName="maxRecords", shortName="M", doc="Maximum number of records to emit, if provided", required=false) + /** + * If provided, then this tool will exit with success after this number of records have been emitted to the file. 
+ */ + @Advanced + @Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false) public int MAX_RECORDS = -1; int nRecords = 0; + /** + * By default, only biallelic (REF=A, ALT=B) sites are including in the output. If this flag is provided, then + * VariantsToTable will emit field values for records with multiple ALT alleles. Note that in general this + * can make your resulting file unreadable and malformated according to tools like R, as the representation of + * multi-allelic INFO field values can be lists of values. + */ + @Advanced @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false) public boolean keepMultiAllelic = false; + /** + * By default, this tool throws a UserException when it encounters a field without a value in some record. This + * is generally useful when you mistype -F CHRMO, so that you get a friendly warning about CHRMO not being + * found before the tool runs through 40M 1000G records. However, in some cases you genuinely want to allow such + * fields (e.g., AC not being calculated for filtered records, if included). When provided, this argument + * will cause VariantsToTable to write out NA values for missing fields instead of throwing an error. 
+ */ + @Advanced @Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false) public boolean ALLOW_MISSING_DATA = false; public void initialize() { + // print out the header out.println(Utils.join("\t", fieldsToTake)); } - public static abstract class Getter { public abstract String get(VariantContext vc); } - public static Map getters = new HashMap(); - - static { - // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT - getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } }); - getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } }); - getters.put("REF", new Getter() { - public String get(VariantContext vc) { - String x = ""; - if ( vc.hasReferenceBaseForIndel() ) { - Byte refByte = vc.getReferenceBaseForIndel(); - x=x+new String(new byte[]{refByte}); - } - return x+vc.getReference().getDisplayString(); - } - }); - getters.put("ALT", new Getter() { - public String get(VariantContext vc) { - StringBuilder x = new StringBuilder(); - int n = vc.getAlternateAlleles().size(); - if ( n == 0 ) return "."; - if ( vc.hasReferenceBaseForIndel() ) { - Byte refByte = vc.getReferenceBaseForIndel(); - x.append(new String(new byte[]{refByte})); - } - - for ( int i = 0; i < n; i++ ) { - if ( i != 0 ) x.append(","); - x.append(vc.getAlternateAllele(i).getDisplayString()); - } - return x.toString(); - } - }); - getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } }); - getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) { - if ( vc.isSNP() && vc.isBiallelic() ) - return VariantContextUtils.isTransition(vc) ? "1" : "0"; - else - return "-1"; - }}); - getters.put("FILTER", new Getter() { public String get(VariantContext vc) { - return vc.isNotFiltered() ? 
"PASS" : Utils.join(",", vc.getFilters()); } - }); - - getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } }); - getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } }); - getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } }); - getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } }); - getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } }); - getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); - getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); - getters.put("GQ", new Getter() { public String get(VariantContext vc) { - if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF"); - return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError()); - }}); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) // RodWalkers can make funky map calls return 0; @@ -155,6 +169,15 @@ public class VariantsToTable extends RodWalker { return s.endsWith("*"); } + /** + * Utility function that returns the list of values for each field in fields from vc. + * + * @param vc the VariantContext whose field values we can to capture + * @param fields a non-null list of fields to capture from VC + * @param allowMissingData if false, then throws a UserException if any field isn't found in vc. 
Otherwise + * provides a value of NA + * @return + */ public static List extractFields(VariantContext vc, List fields, boolean allowMissingData) { List vals = new ArrayList(); @@ -183,28 +206,12 @@ public class VariantsToTable extends RodWalker { } if (field.equals("AF") || field.equals("AC")) { - String afo = val; - - double af=0; - if (afo.contains(",")) { - String[] afs = afo.split(","); - afs[0] = afs[0].substring(1,afs[0].length()); - afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1); - - double[] afd = new double[afs.length]; - - for (int k=0; k < afd.length; k++) - afd[k] = Double.valueOf(afs[k]); - - af = MathUtils.arrayMax(afd); - //af = Double.valueOf(afs[0]); - - } - else - if (!afo.equals("NA")) - af = Double.valueOf(afo); - - val = Double.toString(af); + if (val.contains(",")) { + // strip [,] and spaces + val = val.replace("[",""); + val = val.replace("]",""); + val = val.replace(" ",""); + } } vals.add(val); @@ -213,13 +220,75 @@ public class VariantsToTable extends RodWalker { return vals; } - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - + // + // default reduce -- doesn't do anything at all + // + public Integer reduceInit() { return 0; } + public Integer reduce(Integer counter, Integer sum) { return counter + sum; } public void onTraversalDone(Integer sum) {} + + // ---------------------------------------------------------------------------------------------------- + // + // static system for getting values from VC by name. 
+ // + // ---------------------------------------------------------------------------------------------------- + + public static abstract class Getter { public abstract String get(VariantContext vc); } + public static Map getters = new HashMap(); + + static { + // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT + getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } }); + getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } }); + getters.put("REF", new Getter() { + public String get(VariantContext vc) { + String x = ""; + if ( vc.hasReferenceBaseForIndel() ) { + Byte refByte = vc.getReferenceBaseForIndel(); + x=x+new String(new byte[]{refByte}); + } + return x+vc.getReference().getDisplayString(); + } + }); + getters.put("ALT", new Getter() { + public String get(VariantContext vc) { + StringBuilder x = new StringBuilder(); + int n = vc.getAlternateAlleles().size(); + if ( n == 0 ) return "."; + if ( vc.hasReferenceBaseForIndel() ) { + Byte refByte = vc.getReferenceBaseForIndel(); + x.append(new String(new byte[]{refByte})); + } + + for ( int i = 0; i < n; i++ ) { + if ( i != 0 ) x.append(","); + x.append(vc.getAlternateAllele(i).getDisplayString()); + } + return x.toString(); + } + }); + getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } }); + getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) { + if ( vc.isSNP() && vc.isBiallelic() ) + return VariantContextUtils.isTransition(vc) ? "1" : "0"; + else + return "-1"; + }}); + getters.put("FILTER", new Getter() { public String get(VariantContext vc) { + return vc.isNotFiltered() ? 
"PASS" : Utils.join(",", vc.getFilters()); } + }); + + getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } }); + getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } }); + getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } }); + getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } }); + getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } }); + getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); + getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); + getters.put("GQ", new Getter() { public String get(VariantContext vc) { + if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF"); + return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError()); + }}); + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index 1684dccfb..9b33f8537 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -33,7 +33,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; -import 
org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.walkers.*; @@ -53,6 +52,30 @@ import java.util.*; /** * Converts variants from other file formats to VCF format. + * + *

    + * Note that there must be a Tribble feature/codec for the file format as well as an adaptor. + * + *

    Input

    + *

    + * A variant file to convert. + *

    + * + *

    Output

    + *

    + * A VCF file. + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T VariantsToVCF \
    + *   -o output.vcf \
    + *   --variant:RawHapMap input.hapmap \
    + *   --dbsnp dbsnp.vcf
    + * 
    + * */ @Reference(window=@Window(start=-40,stop=40)) public class VariantsToVCF extends RodWalker { @@ -61,15 +84,24 @@ public class VariantsToVCF extends RodWalker { protected VCFWriter baseWriter = null; private SortingVCFWriter vcfwriter; // needed because hapmap/dbsnp indel records move + /** + * Variants from this input file are used by this tool as input. + */ @Input(fullName="variant", shortName = "V", doc="Input variant file", required=true) public RodBinding variants; @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); - @Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod (for data like GELI with genotypes)", required=false) + /** + * This argument is used for data (like GELI) with genotypes but no sample names encoded within. + */ + @Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod", required=false) protected String sampleName = null; + /** + * This argument is useful for fixing input VCFs with bad reference bases (the output will be a fixed version of the VCF). + */ @Argument(fullName="fixRef", shortName="fixRef", doc="Fix common reference base in case there's an indel without padding", required=false) protected boolean fixReferenceBase = false; @@ -87,7 +119,7 @@ public class VariantsToVCF extends RodWalker { if ( tracker == null || !BaseUtils.isRegularBase(ref.getBase()) ) return 0; - String rsID = dbsnp == null ? null : DbSNPHelper.rsIDOfFirstRealVariant(tracker.getValues(dbsnp.dbsnp, context.getLocation()), VariantContext.Type.SNP); + String rsID = dbsnp == null ? 
null : VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbsnp.dbsnp, context.getLocation()), VariantContext.Type.SNP); Collection contexts = getVariantContexts(tracker, ref); @@ -135,8 +167,8 @@ public class VariantsToVCF extends RodWalker { continue; Map alleleMap = new HashMap(2); - alleleMap.put(RawHapMapFeature.DELETION, Allele.create(Allele.NULL_ALLELE_STRING, dbsnpVC.isInsertion())); - alleleMap.put(RawHapMapFeature.INSERTION, Allele.create(((RawHapMapFeature)record).getAlleles()[1], !dbsnpVC.isInsertion())); + alleleMap.put(RawHapMapFeature.DELETION, Allele.create(Allele.NULL_ALLELE_STRING, dbsnpVC.isSimpleInsertion())); + alleleMap.put(RawHapMapFeature.INSERTION, Allele.create(((RawHapMapFeature)record).getAlleles()[1], !dbsnpVC.isSimpleInsertion())); hapmap.setActualAlleles(alleleMap); // also, use the correct positioning for insertions diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobInfo.java b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobInfo.java new file mode 100644 index 000000000..3716d3110 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobInfo.java @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.jna.drmaa.v1_0; + +import org.ggf.drmaa.DrmaaException; +import org.ggf.drmaa.JobInfo; + +import java.util.Map; + +/** + * JNA mapping from Java to C DRMAA binding. + */ +public class JnaJobInfo implements JobInfo { + + private final String jobId; + private final Map rusage; + private final boolean hasExited; + private final int exitStatus; + private final boolean hasSignaled; + private final String terminatingSignal; + private final boolean hasCoreDump; + private final boolean wasAborted; + + public JnaJobInfo(String jobId, Map rusage, boolean hasExited, int exitStatus, boolean hasSignaled, String terminatingSignal, boolean hasCoreDump, boolean wasAborted) { + this.jobId = jobId; + this.rusage = rusage; + this.hasExited = hasExited; + this.exitStatus = exitStatus; + this.hasSignaled = hasSignaled; + this.terminatingSignal = terminatingSignal; + this.hasCoreDump = hasCoreDump; + this.wasAborted = wasAborted; + } + + @Override + public String getJobId() throws DrmaaException { + return this.jobId; + } + + @Override + public Map getResourceUsage() throws DrmaaException { + return rusage; + } + + @Override + public boolean hasExited() throws DrmaaException { + return hasExited; + } + + @Override + public int getExitStatus() throws DrmaaException { + if (!hasExited) + throw new IllegalStateException("job has not exited"); + return exitStatus; + } + + @Override + public boolean hasSignaled() throws DrmaaException { + return hasSignaled; + } + + 
@Override + public String getTerminatingSignal() throws DrmaaException { + if (!hasSignaled) + throw new IllegalStateException("job has not signaled"); + return terminatingSignal; + } + + @Override + public boolean hasCoreDump() throws DrmaaException { + return hasCoreDump; + } + + @Override + public boolean wasAborted() throws DrmaaException { + return wasAborted; + } +} diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobTemplate.java b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobTemplate.java new file mode 100644 index 000000000..58cd19926 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobTemplate.java @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.jna.drmaa.v1_0; + +import com.sun.jna.Pointer; +import org.ggf.drmaa.*; + +import java.util.*; + +/** + * JNA mapping from Java to C DRMAA binding. + */ +public class JnaJobTemplate implements JobTemplate { + private final JnaSession session; + private final Pointer jt; + + public JnaJobTemplate(JnaSession session, Pointer jt) { + this.session = session; + this.jt = jt; + } + + public Pointer getPointer() { + return jt; + } + + @Override + public void setRemoteCommand(String s) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_REMOTE_COMMAND, s); + } + + @Override + public String getRemoteCommand() throws DrmaaException { + return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_REMOTE_COMMAND); + } + + @SuppressWarnings("unchecked") + @Override + public void setArgs(List list) throws DrmaaException { + JnaSession.setVectorAttribute(jt, LibDrmaa.DRMAA_V_ARGV, list); + } + + @Override + public List getArgs() throws DrmaaException { + return JnaSession.getVectorAttribute(jt, LibDrmaa.DRMAA_V_ARGV); + } + + @Override + public void setJobSubmissionState(int state) throws DrmaaException { + String stateString; + if (state == JobTemplate.HOLD_STATE) + stateString = LibDrmaa.DRMAA_SUBMISSION_STATE_HOLD; + else if (state == JobTemplate.ACTIVE_STATE) + stateString = LibDrmaa.DRMAA_SUBMISSION_STATE_ACTIVE; + else + throw new InvalidAttributeValueException("jobSubmissionState attribute is invalid"); + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_JS_STATE, stateString); + } + + @Override + public int getJobSubmissionState() throws DrmaaException { + int state; + String stateString = JnaSession.getAttribute(jt, LibDrmaa.DRMAA_JS_STATE); + if (LibDrmaa.DRMAA_SUBMISSION_STATE_HOLD.equals(stateString)) + state = JobTemplate.HOLD_STATE; + else if (LibDrmaa.DRMAA_SUBMISSION_STATE_ACTIVE.equals(stateString)) + state = JobTemplate.ACTIVE_STATE; + else + throw new InvalidAttributeValueException("jobSubmissionState attribute is 
invalid"); + return state; + } + + @SuppressWarnings("unchecked") + @Override + public void setJobEnvironment(Map env) throws DrmaaException { + JnaSession.setVectorAttribute(jt, LibDrmaa.DRMAA_V_ENV, JnaSession.mapToCollection(env)); + } + + @SuppressWarnings("unchecked") + @Override + public Map getJobEnvironment() throws DrmaaException { + return JnaSession.collectionToMap(JnaSession.getVectorAttribute(jt, LibDrmaa.DRMAA_V_ENV)); + } + + @Override + public void setWorkingDirectory(String s) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_WD, s); + } + + @Override + public String getWorkingDirectory() throws DrmaaException { + return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_WD); + } + + @Override + public void setJobCategory(String s) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_JOB_CATEGORY, s); + } + + @Override + public String getJobCategory() throws DrmaaException { + return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_JOB_CATEGORY); + } + + @Override + public void setNativeSpecification(String s) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_NATIVE_SPECIFICATION, s); + } + + @Override + public String getNativeSpecification() throws DrmaaException { + return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_NATIVE_SPECIFICATION); + } + + @SuppressWarnings("unchecked") + @Override + public void setEmail(Set set) throws DrmaaException { + JnaSession.setVectorAttribute(jt, LibDrmaa.DRMAA_V_EMAIL, set); + } + + @SuppressWarnings("unchecked") + @Override + public Set getEmail() throws DrmaaException { + return new LinkedHashSet(JnaSession.getVectorAttribute(jt, LibDrmaa.DRMAA_V_EMAIL)); + } + + @Override + public void setBlockEmail(boolean b) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_BLOCK_EMAIL, b ? 
"1" : "0"); + } + + @Override + public boolean getBlockEmail() throws DrmaaException { + return "1".equals(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_BLOCK_EMAIL)); + } + + @Override + public void setStartTime(PartialTimestamp partialTimestamp) throws DrmaaException { + JnaSession.setPartialTime(jt, LibDrmaa.DRMAA_START_TIME, partialTimestamp); + } + + @Override + public PartialTimestamp getStartTime() throws DrmaaException { + return JnaSession.getPartialTime(jt, LibDrmaa.DRMAA_START_TIME); + } + + @Override + public void setJobName(String s) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_JOB_NAME, s); + } + + @Override + public String getJobName() throws DrmaaException { + return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_JOB_NAME); + } + + @Override + public void setInputPath(String s) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_INPUT_PATH, s); + } + + @Override + public String getInputPath() throws DrmaaException { + return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_INPUT_PATH); + } + + @Override + public void setOutputPath(String s) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_OUTPUT_PATH, s); + } + + @Override + public String getOutputPath() throws DrmaaException { + return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_OUTPUT_PATH); + } + + @Override + public void setErrorPath(String s) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_ERROR_PATH, s); + } + + @Override + public String getErrorPath() throws DrmaaException { + return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_ERROR_PATH); + } + + @Override + public void setJoinFiles(boolean b) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_JOIN_FILES, b ? 
"y" : "n"); + } + + @Override + public boolean getJoinFiles() throws DrmaaException { + return "y".equals(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_JOIN_FILES)); + } + + @Override + public void setTransferFiles(FileTransferMode fileTransferMode) throws DrmaaException { + StringBuilder buf = new StringBuilder(); + + if (fileTransferMode.getInputStream()) + buf.append('i'); + + if (fileTransferMode.getOutputStream()) + buf.append('o'); + + if (fileTransferMode.getErrorStream()) + buf.append('e'); + + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_TRANSFER_FILES, buf.toString()); + } + + @Override + public FileTransferMode getTransferFiles() throws DrmaaException { + String mode = JnaSession.getAttribute(jt, LibDrmaa.DRMAA_TRANSFER_FILES); + + if (mode == null) + return null; + + FileTransferMode fileTransferMode = new FileTransferMode(); + fileTransferMode.setInputStream(mode.indexOf('i') >= 0); + fileTransferMode.setOutputStream(mode.indexOf('o') >= 0); + fileTransferMode.setErrorStream(mode.indexOf('e') >= 0); + return fileTransferMode; + } + + @Override + public void setDeadlineTime(PartialTimestamp partialTimestamp) throws DrmaaException { + JnaSession.setPartialTime(jt, LibDrmaa.DRMAA_DEADLINE_TIME, partialTimestamp); + } + + @Override + public PartialTimestamp getDeadlineTime() throws DrmaaException { + return JnaSession.getPartialTime(jt, LibDrmaa.DRMAA_DEADLINE_TIME); + } + + @Override + public void setHardWallclockTimeLimit(long l) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_WCT_HLIMIT, JnaSession.formatLimit(l)); + } + + @Override + public long getHardWallclockTimeLimit() throws DrmaaException { + return JnaSession.parseLimit(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_WCT_HLIMIT)); + } + + @Override + public void setSoftWallclockTimeLimit(long l) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_WCT_SLIMIT, JnaSession.formatLimit(l)); + } + + @Override + public long getSoftWallclockTimeLimit() throws DrmaaException { 
+ return JnaSession.parseLimit(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_WCT_SLIMIT)); + } + + @Override + public void setHardRunDurationLimit(long l) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_DURATION_HLIMIT, JnaSession.formatLimit(l)); + } + + @Override + public long getHardRunDurationLimit() throws DrmaaException { + return JnaSession.parseLimit(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_DURATION_HLIMIT)); + } + + @Override + public void setSoftRunDurationLimit(long l) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_DURATION_SLIMIT, JnaSession.formatLimit(l)); + } + + @Override + public long getSoftRunDurationLimit() throws DrmaaException { + return JnaSession.parseLimit(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_DURATION_SLIMIT)); + } + + @Override + public Set getAttributeNames() throws DrmaaException { + return JnaSession.getAttrNames(); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof JnaJobTemplate)) + return false; + JnaJobTemplate other = (JnaJobTemplate) obj; + return this.jt.equals(other.jt) && this.session.equals(other.session); + } + + @Override + public int hashCode() { + return jt.hashCode(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java new file mode 100644 index 000000000..480113e1e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java @@ -0,0 +1,450 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to 
do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.jna.drmaa.v1_0; + +import com.sun.jna.Memory; +import com.sun.jna.NativeLong; +import com.sun.jna.Pointer; +import com.sun.jna.StringArray; +import com.sun.jna.ptr.IntByReference; +import com.sun.jna.ptr.PointerByReference; +import org.ggf.drmaa.*; + +import java.text.ParseException; +import java.util.*; + +/** + * JNA mapping from Java to C DRMAA binding. 
+ * See: Java and C Binding Documents on http://drmaa.org + */ +public class JnaSession implements Session { + private static final PartialTimestampFormat PARTIAL_TIMESTAMP_FORMAT = new PartialTimestampFormat(); + private static final ThreadLocal threadError = new ThreadLocal() { + @Override + protected Memory initialValue() { + return new Memory(LibDrmaa.DRMAA_ERROR_STRING_BUFFER); + } + }; + + @Override + public void init(String contact) throws DrmaaException { + checkError(LibDrmaa.drmaa_init(contact, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + @Override + public void exit() throws DrmaaException { + checkError(LibDrmaa.drmaa_exit(getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + @Override + public JobTemplate createJobTemplate() throws DrmaaException { + PointerByReference jtRef = new PointerByReference(); + checkError(LibDrmaa.drmaa_allocate_job_template(jtRef, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + return new JnaJobTemplate(this, jtRef.getValue()); + } + + @Override + public void deleteJobTemplate(JobTemplate jobTemplate) throws DrmaaException { + JnaJobTemplate jnaJobTemplate = (JnaJobTemplate) jobTemplate; + checkError(LibDrmaa.drmaa_delete_job_template(jnaJobTemplate.getPointer(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + @Override + public String runJob(JobTemplate jobTemplate) throws DrmaaException { + Memory jobId = new Memory(LibDrmaa.DRMAA_JOBNAME_BUFFER); + JnaJobTemplate jnaJobTemplate = (JnaJobTemplate) jobTemplate; + checkError(LibDrmaa.drmaa_run_job(jobId, LibDrmaa.DRMAA_JOBNAME_BUFFER_LEN, jnaJobTemplate.getPointer(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + return jobId.getString(0); + } + + @Override + public List runBulkJobs(JobTemplate jobTemplate, int start, int end, int incr) throws DrmaaException { + PointerByReference jobIds = new PointerByReference(); + JnaJobTemplate jnaJobTemplate = (JnaJobTemplate) jobTemplate; + 
checkError(LibDrmaa.drmaa_run_bulk_jobs(jobIds, jnaJobTemplate.getPointer(), start, end, incr, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + try { + return getJobIds(jobIds); + } finally { + releaseJobIds(jobIds); + } + } + + @Override + public void control(String jobId, int action) throws DrmaaException { + checkError(LibDrmaa.drmaa_control(jobId, action, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + @SuppressWarnings("unchecked") + @Override + public void synchronize(List list, long timeout, boolean dispose) throws DrmaaException { + StringArray jobIds = new StringArray((String[]) list.toArray(new String[list.size()])); + checkError(LibDrmaa.drmaa_synchronize(jobIds, new NativeLong(timeout), dispose ? 1 : 0, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + @Override + public JobInfo wait(String jobId, long timeout) throws DrmaaException { + Memory jobIdOut = new Memory(LibDrmaa.DRMAA_JOBNAME_BUFFER); + IntByReference stat = new IntByReference(); + PointerByReference rusage = new PointerByReference(); + IntByReference exited = new IntByReference(); + IntByReference exitStatus = new IntByReference(); + IntByReference signaled = new IntByReference(); + Memory signal = new Memory(LibDrmaa.DRMAA_SIGNAL_BUFFER); + IntByReference coreDumped = new IntByReference(); + IntByReference aborted = new IntByReference(); + + int errnum; + + errnum = LibDrmaa.drmaa_wait(jobId, jobIdOut, LibDrmaa.DRMAA_JOBNAME_BUFFER_LEN, stat, new NativeLong(timeout), rusage, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + Map rusageMap; + if (errnum == LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_RUSAGE) { + rusageMap = null; + } else { + try { + rusageMap = collectionToMap(getAttrValues(rusage)); + } finally { + releaseAttrValues(rusage); + } + } + + checkError(LibDrmaa.drmaa_wifexited(exited, stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + + if (exited.getValue() != 0) { + checkError(LibDrmaa.drmaa_wexitstatus(exitStatus, 
stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + checkError(LibDrmaa.drmaa_wifsignaled(signaled, stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + + if (signaled.getValue() != 0) { + checkError(LibDrmaa.drmaa_wtermsig(signal, LibDrmaa.DRMAA_SIGNAL_BUFFER_LEN, stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + checkError(LibDrmaa.drmaa_wcoredump(coreDumped, stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + checkError(LibDrmaa.drmaa_wifaborted(aborted, stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + + return new JnaJobInfo(jobIdOut.getString(0), rusageMap, exited.getValue() != 0, exitStatus.getValue(), + signaled.getValue() != 0, signal.getString(0), coreDumped.getValue() != 0, aborted.getValue() != 0); + } + + @Override + public int getJobProgramStatus(String jobId) throws DrmaaException { + IntByReference remotePs = new IntByReference(); + checkError(LibDrmaa.drmaa_job_ps(jobId, remotePs, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + return remotePs.getValue(); + } + + @Override + public String getContact() { + Memory contact = new Memory(LibDrmaa.DRMAA_CONTACT_BUFFER); + try { + checkError(LibDrmaa.drmaa_get_contact(contact, LibDrmaa.DRMAA_CONTACT_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } catch (DrmaaException e) { + // DRMAA spec says this method should throw DrmaaException. + // Why doesn't interface implement this? + throw new RuntimeException(e); + } + return contact.getString(0); + } + + @Override + public Version getVersion() { + IntByReference major = new IntByReference(); + IntByReference minor = new IntByReference(); + try { + checkError(LibDrmaa.drmaa_version(major, minor, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } catch (DrmaaException e) { + // DRMAA spec says this method should throw DrmaaException. + // Why doesn't interface implement this? 
+ throw new RuntimeException(e); + } + return new Version(major.getValue(), minor.getValue()); + } + + @Override + public String getDrmSystem() { + Memory drmSystem = new Memory(LibDrmaa.DRMAA_DRM_SYSTEM_BUFFER); + try { + checkError(LibDrmaa.drmaa_get_DRM_system(drmSystem, LibDrmaa.DRMAA_DRM_SYSTEM_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } catch (DrmaaException e) { + // DRMAA spec says this method should throw DrmaaException. + // Why doesn't interface implement this? + throw new RuntimeException(e); + } + return drmSystem.getString(0); + } + + @Override + public String getDrmaaImplementation() { + Memory drmaaImplementation = new Memory(LibDrmaa.DRMAA_DRMAA_IMPLEMENTATION_BUFFER); + try { + checkError(LibDrmaa.drmaa_get_DRMAA_implementation(drmaaImplementation, LibDrmaa.DRMAA_DRMAA_IMPLEMENTATION_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } catch (DrmaaException e) { + // DRMAA spec says this method should throw DrmaaException. + // Why doesn't interface implement this? 
+ throw new RuntimeException(e); + } + return drmaaImplementation.getString(0); + } + + public static void setAttribute(Pointer jt, String name, String value) throws DrmaaException { + checkError(LibDrmaa.drmaa_set_attribute(jt, name, value, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + public static String getAttribute(Pointer jt, String name) throws DrmaaException { + Memory attrBuffer = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER); + checkError(LibDrmaa.drmaa_get_attribute(jt, name, attrBuffer, LibDrmaa.DRMAA_ATTR_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + return attrBuffer.getString(0); + } + + public static void setVectorAttribute(Pointer jt, String name, Collection values) throws DrmaaException { + StringArray valuesArray = new StringArray(values.toArray(new String[values.size()])); + checkError(LibDrmaa.drmaa_set_vector_attribute(jt, name, valuesArray, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + public static List getVectorAttribute(Pointer jt, String name) throws DrmaaException { + PointerByReference values = new PointerByReference(); + checkError(LibDrmaa.drmaa_get_vector_attribute(jt, name, values, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + try { + return getAttrValues(values); + } finally { + releaseAttrValues(values); + } + } + + public static void setPartialTime(Pointer jt, String name, PartialTimestamp partialTimestamp) throws DrmaaException { + setAttribute(jt, name, PARTIAL_TIMESTAMP_FORMAT.format(partialTimestamp)); + } + + public static PartialTimestamp getPartialTime(Pointer jt, String name) throws DrmaaException { + String time = getAttribute(jt, name); + if (time == null) + return null; + try { + return PARTIAL_TIMESTAMP_FORMAT.parse(time); + } catch (ParseException e) { + throw new InternalException(name + " property is unparsable"); + } + } + + public static Set getAttrNames() throws DrmaaException { + PointerByReference values = new PointerByReference(); + 
checkError(LibDrmaa.drmaa_get_attribute_names(values, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + try { + return new LinkedHashSet(getAttrNames(values)); + } finally { + releaseAttrNames(values); + } + } + + public static Collection mapToCollection(Map map) { + Collection collection = new LinkedHashSet(); + for (Map.Entry entry: map.entrySet()) + collection.add(entry.getKey() + "=" + entry.getValue()); + return collection; + } + + public static Map collectionToMap(Collection list) { + Map map = new LinkedHashMap(); + for (String entry: list) { + if (entry == null) + continue; + int equals = entry.indexOf('='); + if (equals < 0) + continue; + map.put(entry.substring(0, equals), entry.substring(equals + 1)); + } + return map; + } + + public static String formatLimit(long secs) { + long seconds = (secs % 60); + long minutes = (secs / 60) % 60; + long hours = (secs / 3600); + return String.format("%d:%02d:%02d", hours, minutes, seconds); + } + + public static long parseLimit(String limit) { + long seconds = 0; + if (limit != null) { + for (String token: limit.split(":")) { + seconds *= 60; + seconds += Long.parseLong(token); + } + } + return seconds; + } + + private static List getAttrNames(PointerByReference names) throws DrmaaException { + List namesList = new ArrayList(); + IntByReference size = new IntByReference(); + int errnum; + + errnum = LibDrmaa.drmaa_get_num_attr_names(names.getValue(), size); + checkError(errnum, "unable to get attribute names"); + int num = size.getValue(); + + Memory value = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER); + for (int i = 1; i <= num; i++) { + errnum = LibDrmaa.drmaa_get_next_attr_name(names.getValue(), value, LibDrmaa.DRMAA_ATTR_BUFFER_LEN); + checkError(errnum, "unable to get attribute name " + i); + if (errnum == LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_MORE_ELEMENTS) + break; + namesList.add(value.getString(0)); + } + + return namesList; + } + + private static List getAttrValues(PointerByReference values) throws 
DrmaaException { + List valuesList = new ArrayList(); + IntByReference size = new IntByReference(); + int errnum; + + errnum = LibDrmaa.drmaa_get_num_attr_values(values.getValue(), size); + checkError(errnum, "unable to get attribute values"); + int num = size.getValue(); + + Memory value = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER); + for (int i = 1; i <= num; i++) { + errnum = LibDrmaa.drmaa_get_next_attr_value(values.getValue(), value, LibDrmaa.DRMAA_ATTR_BUFFER_LEN); + checkError(errnum, "unable to get attribute value " + i); + if (errnum == LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_MORE_ELEMENTS) + break; + valuesList.add(value.getString(0)); + } + + return valuesList; + } + + private static List getJobIds(PointerByReference jobIds) throws DrmaaException { + List jobIdsList = new ArrayList(); + IntByReference size = new IntByReference(); + int errnum; + + errnum = LibDrmaa.drmaa_get_num_job_ids(jobIds.getValue(), size); + checkError(errnum, "unable to get jobIds"); + int num = size.getValue(); + + Memory value = new Memory(LibDrmaa.DRMAA_JOBNAME_BUFFER); + for (int i = 1; i <= num; i++) { + errnum = LibDrmaa.drmaa_get_next_job_id(jobIds.getValue(), value, LibDrmaa.DRMAA_JOBNAME_BUFFER_LEN); + checkError(errnum, "unable to get jobId " + i); + if (errnum == LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_MORE_ELEMENTS) + break; + jobIdsList.add(value.getString(0)); + } + + return jobIdsList; + } + + private static void releaseAttrNames(PointerByReference names) throws DrmaaException { + LibDrmaa.drmaa_release_attr_names(names.getValue()); + } + + private static void releaseAttrValues(PointerByReference values) throws DrmaaException { + LibDrmaa.drmaa_release_attr_values(values.getValue()); + } + + private static void releaseJobIds(PointerByReference jobIds) throws DrmaaException { + LibDrmaa.drmaa_release_job_ids(jobIds.getValue()); + } + + private static Memory getError() { + return threadError.get(); + } + + private static void checkError(int errnum) throws DrmaaException { + if 
(errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + checkError(errnum, getError().getString(0)); + } + + private static void checkError(int errnum, String error) throws DrmaaException { + switch (errnum) { + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS: + break; + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INTERNAL_ERROR: + throw new InternalException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE: + throw new DrmCommunicationException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_AUTH_FAILURE: + throw new AuthorizationException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INVALID_ARGUMENT: + throw new IllegalArgumentException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_ACTIVE_SESSION: + throw new NoActiveSessionException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_MEMORY: + throw new OutOfMemoryError(error); + + /* -------------- init and exit specific --------------- */ + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INVALID_CONTACT_STRING: + throw new InvalidContactStringException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_DEFAULT_CONTACT_STRING_ERROR: + throw new DefaultContactStringException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_DEFAULT_CONTACT_STRING_SELECTED: + throw new NoDefaultContactStringException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_DRMS_INIT_FAILED: + throw new DrmsInitException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_ALREADY_ACTIVE_SESSION: + throw new AlreadyActiveSessionException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_DRMS_EXIT_ERROR: + throw new DrmsExitException(error); + + /* ---------------- job attributes specific -------------- */ + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INVALID_ATTRIBUTE_FORMAT: + throw new InvalidAttributeFormatException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE: + throw new InvalidAttributeValueException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES: + throw new 
ConflictingAttributeValuesException(error); + + /* --------------------- job submission specific -------------- */ + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_TRY_LATER: + throw new TryLaterException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_DENIED_BY_DRM: + throw new DeniedByDrmException(error); + + /* ------------------------------- job control specific ---------------- */ + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INVALID_JOB: + throw new InvalidJobException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_RESUME_INCONSISTENT_STATE: + throw new ResumeInconsistentStateException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUSPEND_INCONSISTENT_STATE: + throw new SuspendInconsistentStateException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_HOLD_INCONSISTENT_STATE: + throw new HoldInconsistentStateException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_RELEASE_INCONSISTENT_STATE: + throw new ReleaseInconsistentStateException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_EXIT_TIMEOUT: + throw new ExitTimeoutException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_RUSAGE: + break; + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_MORE_ELEMENTS: + break; + default: + throw new IllegalArgumentException(String.format("Unknown error code %d: %s", errnum, error)); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionFactory.java b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionFactory.java new file mode 100644 index 000000000..a1460b7f4 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionFactory.java @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, 
and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.jna.drmaa.v1_0; + +import org.ggf.drmaa.Session; +import org.ggf.drmaa.SessionFactory; + +/** + * JNA mapping from Java to C DRMAA binding. + */ +@SuppressWarnings("unused") +public class JnaSessionFactory extends SessionFactory { + @Override + public Session getSession() { + return new JnaSession(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaa.java b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaa.java new file mode 100644 index 000000000..1244d3023 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaa.java @@ -0,0 +1,754 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and 
this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/*___INFO__MARK_BEGIN__*/ +/************************************************************************* + * + * The Contents of this file are made available subject to the terms of + * the Sun Industry Standards Source License Version 1.2 + * + * Sun Microsystems Inc., March, 2001 + * + * + * Sun Industry Standards Source License Version 1.2 + * ================================================= + * The contents of this file are subject to the Sun Industry Standards + * Source License Version 1.2 (the "License"); You may not use this file + * except in compliance with the License. You may obtain a copy of the + * License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html + * + * Software provided under this License is provided on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, + * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. + * See the License for the specific provisions governing your rights and + * obligations concerning the Software. + * + * The Initial Developer of the Original Code is: Sun Microsystems, Inc. + * + * Copyright: 2001 by Sun Microsystems, Inc. + * + * All Rights Reserved. 
+ * + ************************************************************************/ +/*___INFO__MARK_END__*/ + +package org.broadinstitute.sting.jna.drmaa.v1_0; + +import com.sun.jna.*; +import com.sun.jna.ptr.IntByReference; +import com.sun.jna.ptr.PointerByReference; + +@SuppressWarnings("unused") +public class LibDrmaa { + static { + Native.register("drmaa"); + } + +/* see www.drmaa.org for more details on the DRMAA specification */ +/****** DRMAA/-DRMAA_Interface ************************************************* +* NAME +* DRMAA_Interface -- DRMAA interface +* +* FUNCTION +* The enlisted functions specify the C/C++ binding of the DRMAA interface +* specification. +* +* SEE ALSO +* DRMAA/drmaa_get_next_attr_name() +* DRMAA/drmaa_get_next_attr_value() +* DRMAA/drmaa_get_next_job_id() +* DRMAA/drmaa_release_attr_names() +* DRMAA/drmaa_release_attr_values() +* DRMAA/drmaa_release_job_ids() +* DRMAA/drmaa_init() +* DRMAA/drmaa_exit() +* DRMAA/drmaa_allocate_job_template() +* DRMAA/drmaa_delete_job_template() +* DRMAA/drmaa_set_attribute() +* DRMAA/drmaa_get_attribute() +* DRMAA/drmaa_set_vector_attribute() +* DRMAA/drmaa_get_vector_attribute() +* DRMAA/drmaa_get_attribute_names() +* DRMAA/drmaa_get_vector_attribute_names() +* DRMAA/drmaa_run_job() +* DRMAA/drmaa_run_bulk_jobs() +* DRMAA/drmaa_control() +* DRMAA/drmaa_synchronize() +* DRMAA/drmaa_wait() +* DRMAA/drmaa_wifexited() +* DRMAA/drmaa_wexitstatus() +* DRMAA/drmaa_wifsignaled() +* DRMAA/drmaa_wtermsig() +* DRMAA/drmaa_wcoredump() +* DRMAA/drmaa_wifaborted() +* DRMAA/drmaa_job_ps() +* DRMAA/drmaa_strerror() +* DRMAA/drmaa_get_contact() +* DRMAA/drmaa_version() +* DRMAA/drmaa_get_DRM_system() +*******************************************************************************/ + +/* ------------------- Constants ------------------- */ +/* + * some not yet agreed buffer length constants + * these are recommended minimum values + */ + +/* drmaa_get_attribute() */ +public static final long DRMAA_ATTR_BUFFER = 1024; 
+public static final NativeLong DRMAA_ATTR_BUFFER_LEN = new NativeLong(DRMAA_ATTR_BUFFER - 1); + +/* drmaa_get_contact() */ +public static final long DRMAA_CONTACT_BUFFER = 1024; +public static final NativeLong DRMAA_CONTACT_BUFFER_LEN = new NativeLong(DRMAA_CONTACT_BUFFER - 1); + +/* drmaa_get_DRM_system() */ +public static final long DRMAA_DRM_SYSTEM_BUFFER = 1024; +public static final NativeLong DRMAA_DRM_SYSTEM_BUFFER_LEN = new NativeLong(DRMAA_DRM_SYSTEM_BUFFER - 1); + +/* drmaa_get_DRM_system() */ +public static final long DRMAA_DRMAA_IMPLEMENTATION_BUFFER = 1024; +public static final NativeLong DRMAA_DRMAA_IMPLEMENTATION_BUFFER_LEN = new NativeLong(DRMAA_DRMAA_IMPLEMENTATION_BUFFER - 1); + +/* + * Agreed buffer length constants + * these are recommended minimum values + */ +public static final long DRMAA_ERROR_STRING_BUFFER = 1024; +public static final long DRMAA_JOBNAME_BUFFER = 1024; +public static final long DRMAA_SIGNAL_BUFFER = 32; + +public static final NativeLong DRMAA_ERROR_STRING_BUFFER_LEN = new NativeLong(DRMAA_ERROR_STRING_BUFFER - 1); +public static final NativeLong DRMAA_JOBNAME_BUFFER_LEN = new NativeLong(DRMAA_JOBNAME_BUFFER - 1); +public static final NativeLong DRMAA_SIGNAL_BUFFER_LEN = new NativeLong(DRMAA_SIGNAL_BUFFER - 1); + +/* + * Agreed constants + */ +public static final NativeLong DRMAA_TIMEOUT_WAIT_FOREVER = new NativeLong(-1); +public static final NativeLong DRMAA_TIMEOUT_NO_WAIT = new NativeLong(0); + +public static final String DRMAA_JOB_IDS_SESSION_ANY = "DRMAA_JOB_IDS_SESSION_ANY"; +public static final String DRMAA_JOB_IDS_SESSION_ALL = "DRMAA_JOB_IDS_SESSION_ALL"; + +public static final String DRMAA_SUBMISSION_STATE_ACTIVE = "drmaa_active"; +public static final String DRMAA_SUBMISSION_STATE_HOLD = "drmaa_hold"; + +/* + * Agreed placeholder names + */ +public static final String DRMAA_PLACEHOLDER_INCR = "$drmaa_incr_ph$"; +public static final String DRMAA_PLACEHOLDER_HD = "$drmaa_hd_ph$"; +public static final String 
DRMAA_PLACEHOLDER_WD = "$drmaa_wd_ph$"; + +/* + * Agreed names of job template attributes + */ +public static final String DRMAA_REMOTE_COMMAND = "drmaa_remote_command"; +public static final String DRMAA_JS_STATE = "drmaa_js_state"; +public static final String DRMAA_WD = "drmaa_wd"; +public static final String DRMAA_JOB_CATEGORY = "drmaa_job_category"; +public static final String DRMAA_NATIVE_SPECIFICATION = "drmaa_native_specification"; +public static final String DRMAA_BLOCK_EMAIL = "drmaa_block_email"; +public static final String DRMAA_START_TIME = "drmaa_start_time"; +public static final String DRMAA_JOB_NAME = "drmaa_job_name"; +public static final String DRMAA_INPUT_PATH = "drmaa_input_path"; +public static final String DRMAA_OUTPUT_PATH = "drmaa_output_path"; +public static final String DRMAA_ERROR_PATH = "drmaa_error_path"; +public static final String DRMAA_JOIN_FILES = "drmaa_join_files"; +public static final String DRMAA_TRANSFER_FILES = "drmaa_transfer_files"; +public static final String DRMAA_DEADLINE_TIME = "drmaa_deadline_time"; +public static final String DRMAA_WCT_HLIMIT = "drmaa_wct_hlimit"; +public static final String DRMAA_WCT_SLIMIT = "drmaa_wct_slimit"; +public static final String DRMAA_DURATION_HLIMIT = "drmaa_duration_hlimit"; +public static final String DRMAA_DURATION_SLIMIT = "drmaa_duration_slimit"; + +/* names of job template vector attributes */ +public static final String DRMAA_V_ARGV = "drmaa_v_argv"; +public static final String DRMAA_V_ENV = "drmaa_v_env"; +public static final String DRMAA_V_EMAIL = "drmaa_v_email"; + +/* + * DRMAA errno values + * + * do not touch these values are agreed !!! + */ +public static interface DRMAA_ERRNO { + /* -------------- these are relevant to all sections ---------------- */ + public static final int DRMAA_ERRNO_SUCCESS = 0; /* Routine returned normally with success. 
*/ + public static final int DRMAA_ERRNO_INTERNAL_ERROR = 1; /* Unexpected or internal DRMAA error like memory allocation, system call failure, etc. */ + public static final int DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE = 2; /* Could not contact DRM system for this request. */ + public static final int DRMAA_ERRNO_AUTH_FAILURE = 3; /* The specified request is not processed successfully due to authorization failure. */ + public static final int DRMAA_ERRNO_INVALID_ARGUMENT = 4; /* The input value for an argument is invalid. */ + public static final int DRMAA_ERRNO_NO_ACTIVE_SESSION = 5; /* Exit routine failed because there is no active session */ + public static final int DRMAA_ERRNO_NO_MEMORY = 6; /* failed allocating memory */ + + /* -------------- init and exit specific --------------- */ + public static final int DRMAA_ERRNO_INVALID_CONTACT_STRING = 7; /* Initialization failed due to invalid contact string. */ + public static final int DRMAA_ERRNO_DEFAULT_CONTACT_STRING_ERROR = 8; /* DRMAA could not use the default contact string to connect to DRM system. */ + public static final int DRMAA_ERRNO_NO_DEFAULT_CONTACT_STRING_SELECTED = 9; /* No default contact string was provided or selected. DRMAA requires that the default contact string is selected when there is more than one default contact string due to multiple DRMAA implementation contained in the binary module. */ + public static final int DRMAA_ERRNO_DRMS_INIT_FAILED = 10; /* Initialization failed due to failure to init DRM system. */ + public static final int DRMAA_ERRNO_ALREADY_ACTIVE_SESSION = 11; /* Initialization failed due to existing DRMAA session. */ + public static final int DRMAA_ERRNO_DRMS_EXIT_ERROR = 12; /* DRM system disengagement failed. */ + + /* ---------------- job attributes specific -------------- */ + public static final int DRMAA_ERRNO_INVALID_ATTRIBUTE_FORMAT = 13; /* The format for the job attribute value is invalid. 
*/ + public static final int DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE = 14; /* The value for the job attribute is invalid. */ + public static final int DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES = 15; /* The value of this attribute is conflicting with a previously set attributes. */ + + /* --------------------- job submission specific -------------- */ + public static final int DRMAA_ERRNO_TRY_LATER = 16; /* Could not pass job now to DRM system. A retry may succeed however (saturation). */ + public static final int DRMAA_ERRNO_DENIED_BY_DRM = 17; /* The DRM system rejected the job. The job will never be accepted due to DRM configuration or job template settings. */ + + /* ------------------------------- job control specific ---------------- */ + public static final int DRMAA_ERRNO_INVALID_JOB = 18; /* The job specified by the 'jobid' does not exist. */ + public static final int DRMAA_ERRNO_RESUME_INCONSISTENT_STATE = 19; /* The job has not been suspended. The RESUME request will not be processed. */ + public static final int DRMAA_ERRNO_SUSPEND_INCONSISTENT_STATE = 20; /* The job has not been running, and it cannot be suspended. */ + public static final int DRMAA_ERRNO_HOLD_INCONSISTENT_STATE = 21; /* The job cannot be moved to a HOLD state. */ + public static final int DRMAA_ERRNO_RELEASE_INCONSISTENT_STATE = 22; /* The job is not in a HOLD state. */ + public static final int DRMAA_ERRNO_EXIT_TIMEOUT = 23; /* We have encountered a time-out condition for drmaa_synchronize or drmaa_wait. */ + public static final int DRMAA_ERRNO_NO_RUSAGE = 24; /* This error code is returned by drmaa_wait() when a job has finished but no rusage and stat data could be provided. */ + public static final int DRMAA_ERRNO_NO_MORE_ELEMENTS = 25; /* There are no more elements in the opaque string vector. 
*/ + + public static final int DRMAA_NO_ERRNO = 26; +} + +/* + * Agreed DRMAA job states as returned by drmaa_job_ps() + */ +public static interface DRMAA_PS { + public static final int DRMAA_PS_UNDETERMINED = 0x00; /* process status cannot be determined */ + public static final int DRMAA_PS_QUEUED_ACTIVE = 0x10; /* job is queued and active */ + public static final int DRMAA_PS_SYSTEM_ON_HOLD = 0x11; /* job is queued and in system hold */ + public static final int DRMAA_PS_USER_ON_HOLD = 0x12; /* job is queued and in user hold */ + public static final int DRMAA_PS_USER_SYSTEM_ON_HOLD = 0x13; /* job is queued and in user and system hold */ + public static final int DRMAA_PS_RUNNING = 0x20; /* job is running */ + public static final int DRMAA_PS_SYSTEM_SUSPENDED = 0x21; /* job is system suspended */ + public static final int DRMAA_PS_USER_SUSPENDED = 0x22; /* job is user suspended */ + public static final int DRMAA_PS_USER_SYSTEM_SUSPENDED = 0x23; /* job is user and system suspended */ + public static final int DRMAA_PS_DONE = 0x30; /* job finished normally */ + public static final int DRMAA_PS_FAILED = 0x40; /* job finished, but failed */ +} + +/* + * Agreed DRMAA actions for drmaa_control() + */ +public static interface DRMAA_CONTROL { + public static final int DRMAA_CONTROL_SUSPEND = 0; + public static final int DRMAA_CONTROL_RESUME = 1; + public static final int DRMAA_CONTROL_HOLD = 2; + public static final int DRMAA_CONTROL_RELEASE = 3; + public static final int DRMAA_CONTROL_TERMINATE = 4; +} + +/* ------------------- Data types ------------------- */ +/* + * Agreed opaque DRMAA job template + * struct drmaa_job_template_s is in japiP.h + */ +//typedef struct drmaa_job_template_s drmaa_job_template_t; + +/* ---------- C/C++ language binding specific interfaces -------- */ + +//typedef struct drmaa_attr_names_s drmaa_attr_names_t; +//typedef struct drmaa_attr_values_s drmaa_attr_values_t; +//typedef struct drmaa_job_ids_s drmaa_job_ids_t; + +/* + * get next 
string attribute from iterator + * + * returns DRMAA_ERRNO_SUCCESS or DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE + * if no such exists + */ + +public static native int drmaa_get_next_attr_name(/* drmaa_attr_names_t* */ Pointer values, Pointer value, + NativeLong value_len); +public static native int drmaa_get_next_attr_value(/* drmaa_attr_names_t* */ Pointer values, Pointer value, + NativeLong value_len); +public static native int drmaa_get_next_job_id(/* drmaa_job_ids_t* */ Pointer values, Pointer value, + NativeLong value_len); + +/* + * get element count of opaque string vector + * + * Gives the number of elements in the opaque string vector. Useful for + * copying the contents into an array. + */ +public static native int drmaa_get_num_attr_names(/* drmaa_attr_names_t* */ Pointer values, IntByReference size); +public static native int drmaa_get_num_attr_values(/* drmaa_attr_values_t* */ Pointer values, IntByReference size); +public static native int drmaa_get_num_job_ids(/* drmaa_job_ids_t* */ Pointer values, IntByReference size); + +/* + * release opaque string vector + * + * Opaque string vectors can be used without any constraint + * until the release function has been called. + */ +public static native void drmaa_release_attr_names(/* drmaa_attr_names_t* */ Pointer values); +public static native void drmaa_release_attr_values(/* drmaa_attr_values_t* */ Pointer values); +public static native void drmaa_release_job_ids(/* drmaa_job_ids_t* */ Pointer values); + +/* ------------------- init/exit routines ------------------- */ +/* + * Initialize DRMAA API library and create a new DRMAA Session. 'Contact' + * is an implementation dependent string which MAY be used to specify + * which DRM system to use. This routine MUST be called before any + * other DRMAA calls, except for drmaa_version(). + * If 'contact' is NULL, the default DRM system SHALL be used provided there is + * only one DRMAA implementation in the provided binary module. 
When there is
+ * more than one DRMAA implementation in the binary module, drmaa_init() SHALL
+ * return the DRMAA_ERRNO_NO_DEFAULT_CONTACT_STRING_SELECTED error. drmaa_init()
+ * SHOULD be called by only one of the threads. The main thread is RECOMMENDED.
+ * A call by another thread SHALL return DRMAA_ERRNO_ALREADY_ACTIVE_SESSION.
+ * When 'contact' is a semi-colon separated list of name=value strings, the
+ * strings will be parsed and interpreted. The current list of accepted names
+ * is:
+ * session -- the id of the session to which to reconnect
+#if 0
+ * sge_root -- the SGE_ROOT to use
+ * sge_cell -- the SGE_CELL to use
+#endif
+ *
+ * drmaa_init() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise:
+ * DRMAA_ERRNO_INVALID_CONTACT_STRING,
+ * DRMAA_ERRNO_NO_MEMORY,
+ * DRMAA_ERRNO_ALREADY_ACTIVE_SESSION,
+ * DRMAA_ERRNO_NO_DEFAULT_CONTACT_STRING_SELECTED, or
+ * DRMAA_ERRNO_DEFAULT_CONTACT_STRING_ERROR.
+ */
+public static native int drmaa_init(String contact, Pointer error_diagnosis, NativeLong error_diag_len);
+
+
+/*
+ * Disengage from DRMAA library and allow the DRMAA library to perform
+ * any necessary internal clean up.
+ * This routine SHALL end the current DRMAA Session, but SHALL NOT affect any
+ * jobs (e.g., queued and running jobs SHALL remain queued and running).
+ * drmaa_exit() SHOULD be called by only one of the threads. Other thread calls
+ * to drmaa_exit() MAY fail since there is no active session.
+ *
+ * drmaa_exit() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise:
+ * DRMAA_ERRNO_DRMS_EXIT_ERROR or
+ * DRMAA_ERRNO_NO_ACTIVE_SESSION.
+ */
+public static native int drmaa_exit(Pointer error_diagnosis, NativeLong error_diag_len);
+
+/* ------------------- job template routines ------------------- */
+
+/*
+ * Allocate a new job template. 
+ * + * drmaa_allocate_job_template() SHALL return DRMAA_ERRNO_SUCCESS on success, + * otherwise: + * DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE, + * DRMAA_ERRNO_INTERNAL_ERROR or + * DRMAA_ERRNO_NO_MEMORY. + */ +public static native int drmaa_allocate_job_template(/* drmaa_job_template_t** */ PointerByReference jt, Pointer error_diagnosis, NativeLong error_diag_len); + +/* + * Deallocate a job template. This routine has no effect on jobs. + * + * drmaa_delete_job_template() SHALL return DRMAA_ERRNO_SUCCESS on success, + * otherwise: + * DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE or + * DRMAA_ERRNO_INTERNAL_ERROR. + */ +public static native int drmaa_delete_job_template(/* drmaa_job_template_t* */ Pointer jt, Pointer error_diagnosis, + NativeLong error_diag_len); + + +/* + * Adds ('name', 'value') pair to list of attributes in job template 'jt'. + * Only non-vector attributes SHALL be passed. + * + * drmaa_set_attribute() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_INVALID_ATTRIBUTE_FORMAT, + * DRMAA_ERRNO_INVALID_ARGUMENT, + * DRMAA_ERRNO_NO_MEMORY, + * DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE or + * DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES. + */ +public static native int drmaa_set_attribute(/* drmaa_job_template_t* */ Pointer jt, String name, + String value, Pointer error_diagnosis, + NativeLong error_diag_len); + + +/* + * If 'name' is an existing non-vector attribute name in the job + * template 'jt', then the value of 'name' SHALL be returned; otherwise, + * NULL is returned. + * + * drmaa_get_attribute() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE. + */ +public static native int drmaa_get_attribute(/* drmaa_job_template_t* */ Pointer jt, String name, Pointer value, + NativeLong value_len, Pointer error_diagnosis, + NativeLong error_diag_len); + +/* Adds ('name', 'values') pair to list of vector attributes in job template + * 'jt'. Only vector attributes SHALL be passed. 
+ * A 'value' string vector containing n elements must be n+1 elements long, with
+ * the nth value, i.e. value[n], being set to NULL as a delimiter.
+ *
+ * drmaa_set_vector_attribute() SHALL return DRMAA_ERRNO_SUCCESS on success,
+ * otherwise:
+ * DRMAA_ERRNO_INVALID_ATTRIBUTE_FORMAT,
+ * DRMAA_ERRNO_INVALID_ARGUMENT,
+ * DRMAA_ERRNO_NO_MEMORY,
+ * DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES.
+ */
+public static native int drmaa_set_vector_attribute(/* drmaa_job_template_t* */ Pointer jt, String name,
+                                                    Pointer value, Pointer error_diagnosis,
+                                                    NativeLong error_diag_len);
+
+
+/*
+ * If 'name' is an existing vector attribute name in the job template 'jt',
+ * then the values of 'name' are returned; otherwise, NULL is returned.
+ *
+ * drmaa_get_vector_attribute() SHALL return DRMAA_ERRNO_SUCCESS on success,
+ * otherwise:
+ * DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE.
+ */
+public static native int drmaa_get_vector_attribute(/* drmaa_job_template_t* */ Pointer jt, String name,
+                                                    /* drmaa_attr_values_t ** */ PointerByReference values,
+                                                    Pointer error_diagnosis, NativeLong error_diag_len);
+
+
+/*
+ * SHALL return the set of supported attribute names whose associated
+ * value type is String. This set SHALL include supported DRMAA reserved
+ * attribute names and native attribute names.
+ *
+ * drmaa_get_attribute_names() SHALL return DRMAA_ERRNO_SUCCESS on success,
+ * otherwise:
+ * DRMAA_ERRNO_NO_MEMORY.
+ */
+public static native int drmaa_get_attribute_names(/* drmaa_attr_names_t ** */ PointerByReference values,
+                                                   Pointer error_diagnosis, NativeLong error_diag_len);
+
+/*
+ * SHALL return the set of supported attribute names whose associated
+ * value type is String Vector. This set SHALL include supported DRMAA reserved
+ * attribute names and native attribute names.
+ *
+ * drmaa_get_vector_attribute_names() SHALL return DRMAA_ERRNO_SUCCESS on
+ * success, otherwise:
+ * DRMAA_ERRNO_NO_MEMORY. 
+ */ +public static native int drmaa_get_vector_attribute_names(/* drmaa_attr_names_t ** */ PointerByReference values, + Pointer error_diagnosis, + NativeLong error_diag_len); + +/* ------------------- job submission routines ------------------- */ + +/* + * Submit a job with attributes defined in the job template 'jt'. + * The job identifier 'job_id' is a printable, NULL terminated string, + * identical to that returned by the underlying DRM system. + * + * drmaa_run_job() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_TRY_LATER, + * DRMAA_ERRNO_DENIED_BY_DRM, + * DRMAA_ERRNO_NO_MEMORY, + * DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE or + * DRMAA_ERRNO_AUTH_FAILURE. + */ +public static native int drmaa_run_job(Pointer job_id, NativeLong job_id_len, + /* drmaa_job_template_t * */ Pointer jt, Pointer error_diagnosis, + NativeLong error_diag_len); + +/* + * Submit a set of parametric jobs, dependent on the implied loop index, each + * with attributes defined in the job template 'jt'. + * The job identifiers 'job_ids' SHALL all be printable, + * NULL terminated strings, identical to those returned by the underlying + * DRM system. Nonnegative loop bounds SHALL NOT use file names + * that start with minus sign like command line options. + * DRMAA defines a special index placeholder, drmaa_incr_ph, (which has the + * value "$incr_pl$") that is used to construct parametric job templates. + * For example: + * //C++ string syntax used + * drmaa_set_attribute(pjt, "stderr", drmaa_incr_ph + ".err" ); + * + * drmaa_run_bulk_jobs() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_TRY_LATER, + * DRMAA_ERRNO_DENIED_BY_DRM, + * DRMAA_ERRNO_NO_MEMORY, + * DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE or + * DRMAA_ERRNO_AUTH_FAILURE. 
+ */ +public static native int drmaa_run_bulk_jobs(/* drmaa_job_ids_t ** */ PointerByReference jobids, + /* drmaa_job_template_t * */ Pointer jt, int start, int end, + int incr, Pointer error_diagnosis, NativeLong error_diag_len); + +/* ------------------- job control routines ------------------- */ + +/* + * Start, stop, restart, or kill the job identified by 'job_id'. + * If 'job_id' is DRMAA_JOB_IDS_SESSION_ALL then this routine + * acts on all jobs *submitted* during this DRMAA session. + * The legal values for 'action' and their meanings SHALL be: + * DRMAA_CONTROL_SUSPEND: stop the job, + * DRMAA_CONTROL_RESUME: (re)start the job, + * DRMAA_CONTROL_HOLD: put the job on-hold, + * DRMAA_CONTROL_RELEASE: release the hold on the job, and + * DRMAA_CONTROL_TERMINATE: kill the job. + * + * This routine SHALL return once the action has been acknowledged by + * the DRM system, but does not necessarily wait until the action + * has been completed. + * + * drmaa_control() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE, + * DRMAA_ERRNO_AUTH_FAILURE, + * DRMAA_ERRNO_NO_MEMORY, + * DRMAA_ERRNO_RESUME_INCONSISTENT_STATE, + * DRMAA_ERRNO_SUSPEND_INCONSISTENT_STATE, + * DRMAA_ERRNO_HOLD_INCONSISTENT_STATE, + * DRMAA_ERRNO_RELEASE_INCONSISTENT_STATE or + * DRMAA_ERRNO_INVALID_JOB. + */ +public static native int drmaa_control(String jobid, int action, Pointer error_diagnosis, + NativeLong error_diag_len); + + +/* + * Wait until all jobs specified by 'job_ids' have finished + * execution. If 'job_ids' is DRMAA_JOB_IDS_SESSION_ALL then this routine + * waits for all jobs *submitted* during this DRMAA session. The timeout value + * is used to specify the number of seconds to wait for the job to fail finish + * before returning if a result is not immediately available. The value + * DRMAA_TIMEOUT_WAIT_FOREVER can be used to specify that routine should wait + * indefinitely for a result. 
The value DRMAA_TIMEOUT_NO_WAIT can be used to
+ * specify that the routine should return immediately if no result is available.
+ * If the call exits before timeout, all the jobs have
+ * been waited on or there was an interrupt.
+ * If the invocation exits on timeout, the return code is
+ * DRMAA_ERRNO_EXIT_TIMEOUT. The caller SHOULD check system time before and
+ * after this call in order to check how much time has passed.
+ *
+ * The dispose parameter specifies how to treat reaping information:
+ * True=1 "fake reap", i.e. dispose of the rusage data
+ * False=0 do not reap
+ *
+ * A 'job_ids' string vector containing n elements must be n+1 elements long,
+ * with the nth value, i.e. job_ids[n], being set to NULL as a delimiter.
+ *
+ * drmaa_synchronize() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise:
+ * DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE,
+ * DRMAA_ERRNO_AUTH_FAILURE,
+ * DRMAA_ERRNO_NO_MEMORY,
+ * DRMAA_ERRNO_EXIT_TIMEOUT or
+ * DRMAA_ERRNO_INVALID_JOB.
+ */
+public static native int drmaa_synchronize(Pointer job_ids, NativeLong timeout, int dispose,
+                                           Pointer error_diagnosis, NativeLong error_diag_len);
+
+
+/*
+ * This routine SHALL wait for a job with job_id to fail or finish execution. If
+ * the special string, DRMAA_JOB_IDS_SESSION_ANY is provided as the job_id,
+ * this routine SHALL wait for any job from the session. This routine is modeled
+ * on the wait3 POSIX routine. The timeout value is used to specify the number
+ * of seconds to wait for the job to fail or finish before returning if a result is
+ * not immediately available. The value DRMAA_TIMEOUT_WAIT_FOREVER can be
+ * used to specify that routine should wait indefinitely for a result. The value
+ * DRMAA_TIMEOUT_NO_WAIT may be used to specify that the routine should return
+ * immediately if no result is available.
+ * If the call exits before timeout, the job has been waited on
+ * successfully or there was an interrupt. 
+ * If the invocation exits on timeout, the return code is
+ * DRMAA_ERRNO_EXIT_TIMEOUT. The caller SHOULD check system time before and
+ * after this call in order to check how much time has passed.
+ * The routine reaps jobs on a successful call, so any subsequent calls
+ * to drmaa_wait SHOULD fail returning an error DRMAA_ERRNO_INVALID_JOB meaning
+ * that the job has been already reaped. This error is the same as if the job
+ * was unknown. Failing due to an elapsed timeout has an effect that it is
+ * possible to issue drmaa_wait multiple times for the same job_id. When
+ * successful, the rusage information SHALL be provided as an array of strings,
+ * where each string complies with the format <name>=<value>. The <value> string
+ * portion contains the amount of resources consumed by the job and is opaque.
+ * The 'stat' drmaa_wait parameter is used in the drmaa_w* functions for
+ * providing more detailed information about job termination if available. An
+ * analogous set of macros is defined in POSIX for analyzing the wait3(2) OUT
+ * parameter 'stat'.
+ *
+ * drmaa_wait() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise:
+ * DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE,
+ * DRMAA_ERRNO_AUTH_FAILURE,
+ * DRMAA_ERRNO_NO_RUSAGE,
+ * DRMAA_ERRNO_NO_MEMORY,
+ * DRMAA_ERRNO_EXIT_TIMEOUT or
+ * DRMAA_ERRNO_INVALID_JOB.
+ */
+public static native int drmaa_wait(String job_id, Pointer job_id_out, NativeLong job_id_out_len,
+                                    IntByReference stat, NativeLong timeout, /* drmaa_attr_values_t ** */ PointerByReference rusage,
+                                    Pointer error_diagnosis, NativeLong error_diag_len);
+
+/*
+ * Evaluates into 'exited' a non-zero value if stat was returned for a
+ * job that terminated normally. A zero value can also indicate that
+ * although the job has terminated normally an exit status is not available
+ * or that it is not known whether the job terminated normally. In both
+ * cases drmaa_wexitstatus() SHALL NOT provide exit status information. 
+ * A non-zero 'exited' value indicates more detailed diagnosis can be provided
+ * by means of drmaa_wifsignaled(), drmaa_wtermsig() and drmaa_wcoredump().
+ */
+public static native int drmaa_wifexited(IntByReference exited, int stat, Pointer error_diagnosis,
+                                         NativeLong error_diag_len);
+
+/*
+ * If the OUT parameter 'exited' of drmaa_wifexited() is non-zero,
+ * this function evaluates into 'exit_code' the exit code that the
+ * job passed to _exit() (see exit(2)) or exit(3C), or the value that
+ * the child process returned from main.
+ */
+public static native int drmaa_wexitstatus(IntByReference exit_status, int stat, Pointer error_diagnosis,
+                                           NativeLong error_diag_len);
+
+/*
+ * Evaluates into 'signaled' a non-zero value if status was returned
+ * for a job that terminated due to the receipt of a signal. A zero value
+ * can also indicate that although the job has terminated due to the receipt
+ * of a signal the signal is not available or that it is not known whether
+ * the job terminated due to the receipt of a signal. In both cases
+ * drmaa_wtermsig() SHALL NOT provide signal information.
+ */
+public static native int drmaa_wifsignaled(IntByReference signaled, int stat, Pointer error_diagnosis,
+                                           NativeLong error_diag_len);
+
+/*
+ * If the OUT parameter 'signaled' of drmaa_wifsignaled(stat) is
+ * non-zero, this function evaluates into signal a string representation of the
+ * signal that caused the termination of the job. For signals declared by POSIX,
+ * the symbolic names SHALL be returned (e.g., SIGABRT, SIGALRM).
+ * For signals not declared by POSIX, any other string MAY be returned.
+ */
+public static native int drmaa_wtermsig(Pointer signal, NativeLong signal_len, int stat,
+                                        Pointer error_diagnosis, NativeLong error_diag_len);
+
+/*
+ * If the OUT parameter 'signaled' of drmaa_wifsignaled(stat) is
+ * non-zero, this function evaluates into 'core_dumped' a non-zero value
+ * if a core image of the terminated job was created. 
+ */ +public static native int drmaa_wcoredump(IntByReference core_dumped, int stat, Pointer error_diagnosis, + NativeLong error_diag_len); + +/* + * Evaluates into 'aborted' a non-zero value if 'stat' + * was returned for a job that ended before entering the running state. + */ +public static native int drmaa_wifaborted(IntByReference aborted, int stat, Pointer error_diagnosis, + NativeLong error_diag_len); + + + +/* + * Get the program status of the job identified by 'job_id'. + * The possible values returned in 'remote_ps' and their meanings SHALL be: + * + * DRMAA_PS_UNDETERMINED = 0x00: process status cannot be determined + * DRMAA_PS_QUEUED_ACTIVE = 0x10: job is queued and active + * DRMAA_PS_SYSTEM_ON_HOLD = 0x11: job is queued and in system hold + * DRMAA_PS_USER_ON_HOLD = 0x12: job is queued and in user hold + * DRMAA_PS_USER_SYSTEM_ON_HOLD = 0x13: job is queued and in user and system + * hold + * DRMAA_PS_RUNNING = 0x20: job is running + * DRMAA_PS_SYSTEM_SUSPENDED = 0x21: job is system suspended + * DRMAA_PS_USER_SUSPENDED = 0x22: job is user suspended + * DRMAA_PS_USER_SYSTEM_SUSPENDED = 0x23: job is user and system suspended + * DRMAA_PS_DONE = 0x30: job finished normally + * DRMAA_PS_FAILED = 0x40: job finished, but failed + * + * DRMAA SHOULD always get the status of job_id from DRM system, unless the + * previous status has been DRMAA_PS_FAILED or DRMAA_PS_DONE and the status has + * been successfully cached. Terminated jobs get DRMAA_PS_FAILED status. + * + * drmaa_synchronize() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE, + * DRMAA_ERRNO_AUTH_FAILURE, + * DRMAA_ERRNO_NO_MEMORY or + * DRMAA_ERRNO_INVALID_JOB. + */ +public static native int drmaa_job_ps(String job_id, IntByReference remote_ps, Pointer error_diagnosis, + NativeLong error_diag_len); + +/* ------------------- auxiliary routines ------------------- */ + +/* + * SHALL return the error message text associated with the errno number. 
The + * routine SHALL return null string if called with invalid ERRNO number. + */ +public static native String drmaa_strerror(int drmaa_errno); + +/* + * If called before drmaa_init(), it SHALL return a comma delimited default + * DRMAA implementation contacts string, one per each DRM system provided + * implementation. If called after drmaa_init(), it SHALL return the selected + * contact string. The output string is Implementation dependent. + * drmaa_get_contact() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_INTERNAL_ERROR. + */ +public static native int drmaa_get_contact(Pointer contact, NativeLong contact_len, + Pointer error_diagnosis, NativeLong error_diag_len); + +/* + * OUT major - major version number (non-negative integer) + * OUT minor - minor version number (non-negative integer) + * SHALL return the major and minor version numbers of the DRMAA library; + * for DRMAA 1.0, 'major' is 1 and 'minor' is 0. + */ +public static native int drmaa_version(IntByReference major, IntByReference minor, + Pointer error_diagnosis, NativeLong error_diag_len); + + +/* + * If called before drmaa_init(), it SHALL return a comma delimited DRM systems + * string, one per each DRM system provided implementation. If called after + * drmaa_init(), it SHALL return the selected DRM system. The output string is + * implementation dependent. + * + * drmaa_get_DRM_system() SHALL return DRMAA_ERRNO_SUCCESS on success, + * otherwise: + * DRMAA_ERRNO_INTERNAL_ERROR. + */ +public static native int drmaa_get_DRM_system(Pointer drm_system, NativeLong drm_system_len, + Pointer error_diagnosis, NativeLong error_diag_len); + + +/* + * If called before drmaa_init(), it SHALL return a comma delimited DRMAA + * implementations string, one per each DRM system provided implementation. If + * called after drmaa_init(), it SHALL return the selected DRMAA implementation. + * The output (string) is implementation dependent. 
drmaa_get_DRM_implementation + * routine SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_INTERNAL_ERROR. + */ +public static native int drmaa_get_DRMAA_implementation(Pointer drmaa_impl, NativeLong drmaa_impl_len, + Pointer error_diagnosis, NativeLong error_diag_len); +} diff --git a/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java b/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java index 2446383ff..d7b34a253 100644 --- a/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java +++ b/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java @@ -91,6 +91,54 @@ public class LibBat { Native.register("bat"); } + // Via support@platform.com: + // For equivalent api of bsub -a "xxx aaa qqq", option -a is not in struct submit, we + // have to use setOption_ to set it. setOption_ can be used in user program by including + // cmd.h or opensource.h of LSF opensource. You can refer to cmd.sub.c in opensource. + // + // Here is a demonstration on the api for bsub -a + // ========================================================================= + // /*define external setOption_ function*/ + // extern int setOption_(int argc, char **argv, char *template, + // struct submit *req, int mask, int mask2, char **errMsg); + // + // int setEsub(char *esub, struct submit *req) { + // int x; + // char *template, *arg[3]; + // /*set esub with the following strings and set array length*/ + // arg[0] = "blah"; + // arg[1] = "-a"; + // arg[2] = test; + // /* -a "test", You can add additional esubs in here. Just make sure they're space delimited. ie. 
"test mpich lammpi" */ + // x=3; + // /*set template*/ + // template = "a:" + // /*run setOption_()*/ + // if (setOption_(x, arg, template, req, ~0, ~0, ~0, NULL) == -1) { + // return(-1); + // } + // else { + // return(0); + // } + // } + // ========================================================================= + + /** + * Used for setting esub and other options not in struct submit. + * Via support@platform.com + * + * @param argc number of args + * @param argv arguments including a first argument that will not be used + * @param template a colon delimited list of arguments in getopt format + * @param jobSubReq the lsf submit + * @param mask unknown + * @param mask2 unknown + * @param mask3 unknown + * @param errMsg unknown + * @return -1 if the option setting failed + */ + public static native int setOption_(int argc, Pointer argv, String template, submit jobSubReq, int mask, int mask2, int mask3, Pointer errMsg); + /** Max job name length as defined by 'man bsub'. */ public static final int MAX_JOB_NAME_LEN = 4094; @@ -9690,8 +9738,10 @@ public class LibBat { * for a service class. 
*/ - public enum objectives { - GOAL_DEADLINE, GOAL_VELOCITY, GOAL_THROUGHPUT + public static interface objectives { + public static int GOAL_DEADLINE = 0; + public static int GOAL_VELOCITY = 1; + public static int GOAL_THROUGHPUT = 2; } @@ -15109,52 +15159,46 @@ public static class ByValue extends jobArrayElementLog implements Structure.ByVa * \addtogroup _consumertype _consumertype * consumer types */ - public static enum consumerType { + public static interface consumerType { /** * < Queues */ - LIMIT_QUEUES(1), + public static final int LIMIT_QUEUES = 1; /** * < Per-queue */ - LIMIT_PER_QUEUE(2), + public static final int LIMIT_PER_QUEUE = 2; /** * < Users */ - LIMIT_USERS(3), + public static final int LIMIT_USERS = 3; /** * < Per-users */ - LIMIT_PER_USER(4), + public static final int LIMIT_PER_USER = 4; /** * < Hosts */ - LIMIT_HOSTS(5), + public static final int LIMIT_HOSTS = 5; /** * < Per-host */ - LIMIT_PER_HOST(6), + public static final int LIMIT_PER_HOST = 6; /** * < Projects */ - LIMIT_PROJECTS(7), + public static final int LIMIT_PROJECTS = 7; /** * < Per-project */ - LIMIT_PER_PROJECT(8); - - private int value; - - private consumerType(int value) { - this.value = value; - } + public static final int LIMIT_PER_PROJECT = 8; } @@ -19011,20 +19055,27 @@ public static class ByValue extends jobArrayElementLog implements Structure.ByVa /* [] mis-matched in RMS[] */ public static final int RMS_BRACKETS_MISMATCH_ERR = (-22); - public static enum rmsAllocType_t { - RMS_ALLOC_TYPE_UNKNOWN, RMS_ALLOC_TYPE_SLOAD, RMS_ALLOC_TYPE_SNODE, RMS_ALLOC_TYPE_MCONT + public static interface rmsAllocType_t { + public static final int RMS_ALLOC_TYPE_UNKNOWN = 0; + public static final int RMS_ALLOC_TYPE_SLOAD = 1; + public static final int RMS_ALLOC_TYPE_SNODE = 2; + public static final int RMS_ALLOC_TYPE_MCONT = 3; } - public static enum rmsTopology_t { - RMS_TOPOLOGY_UNKNOWN, RMS_TOPOLOGY_PTILE, RMS_TOPOLOGY_NODES + public static interface rmsTopology_t { + public static 
final int RMS_TOPOLOGY_UNKNOWN = 0; + public static final int RMS_TOPOLOGY_PTILE = 1; + public static final int RMS_TOPOLOGY_NODES = 2; } - public static enum rmsFlags_t { - RMS_FLAGS_UNKNOWN, RMS_FLAGS_RAILS, RMS_FLAGS_RAILMASK + public static interface rmsFlags_t { + public static final int RMS_FLAGS_UNKNOWN = 0; + public static final int RMS_FLAGS_RAILS = 1; + public static final int RMS_FLAGS_RAILMASK = 2; } diff --git a/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java b/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java index c7b3de6cf..30b83abc2 100644 --- a/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java +++ b/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java @@ -495,14 +495,19 @@ public class LibLsf { - public enum valueType { - LS_BOOLEAN, LS_NUMERIC, LS_STRING, LS_EXTERNAL + public static interface valueType { + public static final int LS_BOOLEAN = 0; + public static final int LS_NUMERIC = 1; + public static final int LS_STRING = 2; + public static final int LS_EXTERNAL = 3; } - public enum orderType { - INCR, DECR, NA + public static interface orderType { + public static final int INCR = 0; + public static final int DECR = 1; + public static final int NA = 2; } @@ -1567,8 +1572,13 @@ public class LibLsf { public static final int NIO_TASK_ALL = 0x03; public static final int NIO_TASK_CONNECTED = 0x04; - public static enum nioType { - NIO_STATUS, NIO_STDOUT, NIO_EOF, NIO_IOERR, NIO_REQUEUE, NIO_STDERR + public static interface nioType { + public static final int NIO_STATUS = 0; + public static final int NIO_STDOUT = 1; + public static final int NIO_EOF = 2; + public static final int NIO_IOERR = 3; + public static final int NIO_REQUEUE = 4; + public static final int NIO_STDERR = 5; } diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java index 
c09c4037e..cb5bad4ae 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java +++ b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java @@ -144,6 +144,9 @@ public abstract class ArgumentDefinitionField extends ArgumentField { } else if ("input_file".equals(argumentDefinition.fullName) && argumentDefinition.ioType == ArgumentIOType.INPUT) { return Arrays.asList(new InputTaggedFileDefinitionField(argumentDefinition), new InputIndexesArgumentField(argumentDefinition, BAMIndex.BAMIndexSuffix, ".bam")); + } else if ((RodBinding.class.equals(argumentDefinition.argumentType) || RodBinding.class.equals(argumentDefinition.componentType)) && argumentDefinition.ioType == ArgumentIOType.INPUT) { + return Arrays.asList(new InputTaggedFileDefinitionField(argumentDefinition), new InputIndexesArgumentField(argumentDefinition, Tribble.STANDARD_INDEX_EXTENSION)); + } else if (argumentDefinition.ioType == ArgumentIOType.INPUT) { return Collections.singletonList(new InputArgumentField(argumentDefinition)); @@ -196,7 +199,7 @@ public abstract class ArgumentDefinitionField extends ArgumentField { } // if (intervalFields.contains(argumentDefinition.fullName) && argumentDefinition.ioType == ArgumentIOType.INPUT) - // Change intervals exclusize of intervalsString. + // Change intervals exclusive of intervalsString. private static class IntervalFileArgumentField extends InputArgumentField { public IntervalFileArgumentField(ArgumentDefinition argumentDefinition) { super(argumentDefinition); @@ -332,9 +335,7 @@ public abstract class ArgumentDefinitionField extends ArgumentField { } } - /** - * The other extreme of a NamedRodBindingField, allows the user to specify the track name, track type, and the file. - */ + // Allows the user to specify the track name, track type, and the file. 
public static class RodBindArgumentField extends ArgumentDefinitionField { public RodBindArgumentField(ArgumentDefinition argumentDefinition) { super(argumentDefinition); @@ -347,25 +348,28 @@ public abstract class ArgumentDefinitionField extends ArgumentField { } } - /** - * Named input_files. - */ + // Tagged input_files or other rods. public static class InputTaggedFileDefinitionField extends ArgumentDefinitionField { public InputTaggedFileDefinitionField(ArgumentDefinition argumentDefinition) { super(argumentDefinition); } @Override protected Class getInnerType() { return null; } // TaggedFile does not need to be imported. - @Override protected String getFieldType() { return "List[File]"; } - @Override protected String getDefaultValue() { return "Nil"; } + @Override protected String getFieldType() { return argumentDefinition.isMultiValued ? "List[File]" : "File"; } + @Override protected String getDefaultValue() { return argumentDefinition.isMultiValued ? "Nil" : "_"; } @Override protected String getCommandLineTemplate() { - return " + repeat(\"\", %3$s, format=TaggedFile.formatCommandLine(\"%1$s\"))"; + if (argumentDefinition.isMultiValued) { + return " + repeat(\"\", %3$s, format=TaggedFile.formatCommandLine(\"%1$s\"))"; + } else if (!argumentDefinition.required) { + return " + optional(\"\", %3$s, format=TaggedFile.formatCommandLine(\"%1$s\"))"; + } else { + return " + TaggedFile.formatCommandLine(\"%1$s\")(\"\", %3$s, \"\")"; + } } } - /** - * Adds optional inputs for the indexes of any bams or sams added to this function. - */ + // Adds optional inputs for the indexes of any rods added to this function. 
private static class InputIndexesArgumentField extends ArgumentField { + private final boolean originalIsMultiValued; private final String indexFieldName; private final String originalFieldName; private final String indexSuffix; @@ -374,14 +378,19 @@ public abstract class ArgumentDefinitionField extends ArgumentField { this(originalArgumentDefinition, indexSuffix, null); } public InputIndexesArgumentField(ArgumentDefinition originalArgumentDefinition, String indexSuffix, String originalSuffix) { - this.indexFieldName = originalArgumentDefinition.fullName + "Indexes"; + this.originalIsMultiValued = originalArgumentDefinition.isMultiValued; + this.indexFieldName = originalArgumentDefinition.fullName + "Index" + (originalIsMultiValued ? "es" : ""); this.originalFieldName = originalArgumentDefinition.fullName; this.indexSuffix = indexSuffix; this.originalSuffix = originalSuffix; } @Override protected Class getAnnotationIOClass() { return Input.class; } @Override public String getCommandLineAddition() { return ""; } - @Override protected String getDoc() { return "Dependencies on any indexes of " + this.originalFieldName; } + @Override protected String getDoc() { + return originalIsMultiValued + ? 
"Dependencies on any indexes of " + this.originalFieldName + : "Dependencies on the index of " + this.originalFieldName; + } @Override protected String getFullName() { return this.indexFieldName; } @Override protected boolean isRequired() { return false; } @Override protected String getFieldType() { return "List[File]"; } @@ -389,24 +398,41 @@ public abstract class ArgumentDefinitionField extends ArgumentField { @Override protected Class getInnerType() { return File.class; } @Override protected String getRawFieldName() { return this.indexFieldName; } @Override protected String getFreezeFields() { - if (originalSuffix == null) { - return String.format( - ("%1$s ++= %2$s" + - ".filter(orig => orig != null)" + - ".map(orig => new File(orig.getPath + \"%3$s\"))%n"), - indexFieldName, originalFieldName, indexSuffix); + if (originalIsMultiValued) { + if (originalSuffix == null) { + return String.format( + ("%1$s ++= %2$s" + + ".filter(orig => orig != null)" + + ".map(orig => new File(orig.getPath + \"%3$s\"))%n"), + indexFieldName, originalFieldName, indexSuffix); + } else { + return String.format( + ("%1$s ++= %2$s" + + ".filter(orig => orig != null && orig.getName.endsWith(\"%4$s\"))" + + ".flatMap(orig => Array(" + + " new File(orig.getPath + \"%3$s\")," + + " new File(orig.getPath.stripSuffix(\"%4$s\") + \"%3$s\") ))%n"), + indexFieldName, originalFieldName, indexSuffix, originalSuffix); + } } else { - return String.format( - ("%1$s ++= %2$s" + - ".filter(orig => orig != null && orig.getName.endsWith(\"%4$s\"))" + - ".flatMap(orig => Array(" + - " new File(orig.getPath + \"%3$s\")," + - " new File(orig.getPath.stripSuffix(\"%4$s\") + \"%3$s\") ))%n"), - indexFieldName, originalFieldName, indexSuffix, originalSuffix); + if (originalSuffix == null) { + return String.format( + ("if (%2$s != null)%n " + + "%1$s :+= new File(%2$s.getPath + \"%3$s\")%n"), + indexFieldName, originalFieldName, indexSuffix); + } else { + return String.format( + ("if (%2$s != null && 
%2$s.getName.endsWith(\"%4$s\"))%n " + + "%1$s ++= Array(" + + " new File(%2$s.getPath + \"%3$s\")," + + " new File(%2$s.getPath.stripSuffix(\"%4$s\") + \"%3$s\") )%n"), + indexFieldName, originalFieldName, indexSuffix, originalSuffix); + } } } } + // Tracks an automatically generated index private static abstract class OutputIndexArgumentField extends ArgumentField { protected final String indexFieldName; protected final String originalFieldName; @@ -456,6 +482,7 @@ public abstract class ArgumentDefinitionField extends ArgumentField { } } + // Allows setting the format for floats and doubles private static class FormatterArgumentField extends ArgumentField { private final ArgumentField argumentField; public FormatterArgumentField(ArgumentField argumentField) { diff --git a/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java b/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java index af69ebca6..74f147127 100755 --- a/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java @@ -121,9 +121,9 @@ public class IndelUtils { boolean done = false; ArrayList inds = new ArrayList(); - if ( vc.isInsertion() ) { + if ( vc.isSimpleInsertion() ) { indelAlleleString = vc.getAlternateAllele(0).getDisplayString(); - } else if ( vc.isDeletion() ) { + } else if ( vc.isSimpleDeletion() ) { indelAlleleString = vc.getReference().getDisplayString(); } else { diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java b/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java index e65b8f921..fa154fca3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java @@ -224,4 +224,14 @@ public class JVMUtils { throw new StingException("Unknown type: " + type + " (" + type.getClass().getName() + ")"); } } + + public static Class 
getParameterizedTypeClass(Type t) { + if ( t instanceof ParameterizedType ) { + ParameterizedType parameterizedType = (ParameterizedType)t; + if ( parameterizedType.getActualTypeArguments().length != 1 ) + throw new ReviewedStingException("BUG: more than 1 generic type found on class" + t); + return (Class)parameterizedType.getActualTypeArguments()[0]; + } else + throw new ReviewedStingException("BUG: could not find generic type on class " + t); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java index 14c04b5c4..0dbe55726 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java @@ -4,9 +4,28 @@ package org.broadinstitute.sting.utils.clipreads; * How should we represent a clipped bases in a read? */ public enum ClippingRepresentation { - WRITE_NS, // change the bases to Ns - WRITE_Q0S, // change the quality scores to Q0 - WRITE_NS_Q0S, // change the quality scores to Q0 and write Ns - SOFTCLIP_BASES, // change cigar string to S, but keep bases - HARDCLIP_BASES // remove the bases from the read + /** Clipped bases are changed to Ns */ + WRITE_NS, + + /** Clipped bases are changed to have Q0 quality score */ + WRITE_Q0S, + + /** Clipped bases are change to have both an N base and a Q0 quality score */ + WRITE_NS_Q0S, + + /** + * Change the read's cigar string to soft clip (S, see sam-spec) away the bases. + * Note that this can only be applied to cases where the clipped bases occur + * at the start or end of a read. + */ + SOFTCLIP_BASES, + + /** + * Change the read's cigar string to hard clip (H, see sam-spec) away the bases. + * Hard clipping, unlike soft clipping, actually removes bases from the read, + * reducing the resulting file's size but introducing an irrevesible (i.e., + * lossy) operation. 
Note that this can only be applied to cases where the clipped + * bases occur at the start or end of a read. + */ + HARDCLIP_BASES } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java similarity index 95% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java index 5e536d4c1..413848543 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.refdata.features.beagle; +package org.broadinstitute.sting.utils.codecs.beagle; /* * Copyright (c) 2010 The Broad Institute * @@ -40,6 +40,29 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.regex.Pattern; +/** + * TODO GUILLERMO DEL ANGEL + * + *

    + * Codec Description + *

    + * + *

    + * See also: @see VCF specification
    + *

    + + *

    + * + *

    File format example

    + *
    + *     line 1
    + *     line 2
    + *     line 3
    + * 
    + * + * @author Mark DePristo + * @since 2010 + */ public class BeagleCodec implements ReferenceDependentFeatureCodec { private String[] header; public enum BeagleReaderType {PROBLIKELIHOOD, GENOTYPES, R2}; diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java similarity index 97% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleFeature.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java index e6832754d..0aa9ecba2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleFeature.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java @@ -22,7 +22,7 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.refdata.features.beagle; +package org.broadinstitute.sting.utils.codecs.beagle; import org.broad.tribble.Feature; import org.broadinstitute.sting.utils.variantcontext.Allele; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java deleted file mode 100755 index fef6c4ea0..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, 
subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.codecs.completegenomics; - -import org.broad.tribble.Feature; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.readers.LineReader; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * a codec for the VAR file types produced by the Complete Genomics Institute - */ -public class CGVarCodec implements FeatureCodec { - - private static final String REF_TYPE = "ref"; - private static final String SNP_TYPE = "snp"; - private static final String DELETION_TYPE = "del"; - private static final String INSERTION_TYPE = "ins"; - private static final String SUBSTITUTION_TYPE = "sub"; - - // the minimum number of features in the CG file line - private static final int minimumFeatureCount = 8; - - /** - * decode the location only - * @param line the input line to decode - * @return a HapMapFeature - */ - public Feature decodeLoc(String line) { - return decode(line); - } - - /** - * decode the CG record - * @param line the input line to decode - * @return a VariantContext - */ - public Feature decode(String line) { - String[] array = 
line.split("\\s+"); - - // make sure the split was successful - that we got an appropriate number of fields - if ( array.length < minimumFeatureCount ) - return null; - - String type = array[6]; - - long start = Long.valueOf(array[4]); - long end; - Allele ref, alt = null; - - //System.out.println(line); - - if ( type.equals(SNP_TYPE) ) { - ref = Allele.create(array[7], true); - alt = Allele.create(array[8], false); - end = start; - } else if ( type.equals(INSERTION_TYPE) ) { - ref = Allele.create(Allele.NULL_ALLELE_STRING, true); - alt = Allele.create(array[7], false); - end = start; - } else if ( type.equals(DELETION_TYPE) ) { - ref = Allele.create(array[7], true); - alt = Allele.create(Allele.NULL_ALLELE_STRING, false); - end = start + ref.length(); - //} else if ( type.equals(REF_TYPE) ) { - // ref = Allele.create("N", true); // ref bases aren't accurate - // start++; - // end = start; - //} else if ( type.equals(SUBSTITUTION_TYPE) ) { - // ref = Allele.create(array[7], true); - // alt = Allele.create(array[8], false); - // end = start + Math.max(ref.length(), alt.length()); - } else { - return null; // we don't handle other types - } - - HashSet alleles = new HashSet(); - alleles.add(ref); - if ( alt != null ) - alleles.add(alt); - - HashMap attrs = new HashMap(); - String id = array[array.length - 1]; - if ( id.indexOf("dbsnp") != -1 ) { - attrs.put(VariantContext.ID_KEY, parseID(id)); - } - - // create a new feature given the array - return new VariantContext("CGI", array[3], start, end, alleles, VariantContext.NO_NEG_LOG_10PERROR, null, attrs); - } - - public Class getFeatureType() { - return VariantContext.class; - } - - // There's no spec and no character to distinguish header lines... 
- private final static int NUM_HEADER_LINES = 12; - public Object readHeader(LineReader reader) { - return null; - - //String headerLine = null; - //try { - // for (int i = 0; i < NUM_HEADER_LINES; i++) - // headerLine = reader.readLine(); - //} catch (IOException e) { - // throw new IllegalArgumentException("Unable to read a line from the line reader"); - //} - //return headerLine; - } - - private static final Pattern DBSNP_PATTERN = Pattern.compile("^dbsnp\\.\\d+:(.*)"); - private String parseID(String raw) { - StringBuilder sb = null; - - String[] ids = raw.split(";"); - for ( String id : ids ) { - Matcher matcher = DBSNP_PATTERN.matcher(id); - if ( matcher.matches() ) { - String rsID = matcher.group(1); - if ( sb == null ) { - sb = new StringBuilder(rsID); - } else { - sb.append(";"); - sb.append(rsID); - } - } - } - - return sb == null ? null : sb.toString(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java index 90878dee7..a80e05d59 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java @@ -33,12 +33,43 @@ import java.io.IOException; import java.util.Arrays; /** - * a codec for the file types produced by the HapMap consortium, available on their website: - * http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/ + * A codec for the file types produced by the HapMap consortium * - * The format includes eleven standard fields, plus genotypes for each of the samples included - * in the file - * + *

    + * The format includes eleven standard fields, plus genotypes for each of the samples included + * in the file: + * + *

    + *     Col1: refSNP rs# identifier at the time of release (NB might merge with another rs# in the future)
    + *     Col2: SNP alleles according to dbSNP
    + *     Col3: chromosome that SNP maps to
    + *     Col4: chromosome position of SNP, in basepairs on reference sequence
    + *     Col5: strand of reference sequence that SNP maps to
    + *     Col6: version of reference sequence assembly
    + *     Col7: HapMap genotype center that produced the genotypes
    + *     Col8: LSID for HapMap protocol used for genotyping
    + *     Col9: LSID for HapMap assay used for genotyping
    + *     Col10: LSID for panel of individuals genotyped
    + *     Col11: QC-code, currently 'QC+' for all entries (for future use)
    + *     Col12 and on: observed genotypes of samples, one per column, sample identifiers in column headers (Coriell catalog numbers, example: NA10847). Duplicate samples have .dup suffix.
    + * 
    + *

    + * + *

    + * See also: @See HapMap genotypes download + *

    + * + *

    File format example

    + * From genotypes_chr1_ASW_r27_nr.b36_fwd.txt.gz: + *
    + *     rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode NA19625 NA19700 NA19701 NA19702 NA19703 NA19704 NA19705 NA19708 NA19712 NA19711 NA19818 NA19819 NA19828 NA19835 NA19834 NA19836 NA19902 NA19901 NA19900 NA19904 NA19919 NA19908 NA19909 NA19914 NA19915 NA19916 NA19917 NA19918 NA19921 NA20129 NA19713 NA19982 NA19983 NA19714 NA19985 NA20128 NA20126 NA20127 NA20277 NA20276 NA20279 NA20282 NA20281 NA20284 NA20287 NA20288 NA20290 NA20289 NA20291 NA20292 NA20295 NA20294 NA20297 NA20300 NA20301 NA20302 NA20317 NA20319 NA20322 NA20333 NA20332 NA20335 NA20334 NA20337 NA20336 NA20340 NA20341 NA20343 NA20342 NA20344 NA20345 NA20346 NA20347 NA20348 NA20349 NA20350 NA20357 NA20356 NA20358 NA20359 NA20360 NA20363 NA20364
    + *     rs9629043 C/T chr1 554636 + ncbi_b36 broad urn:LSID:affymetrix.hapmap.org:Protocol:GenomeWideSNP_6.0:3 urn:LSID:broad.hapmap.org:Assay:SNP_A-8575115:3 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:3 QC+ CC CC CC CC CC CC CC CC CC CC CC CC NN CC CC CC CT CT CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CT CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC
    + *     rs28446478 G/T chr1 576058 + ncbi_b36 sanger urn:LSID:illumina.hapmap.org:Protocol:Human_1M_BeadChip:3 urn:LSID:sanger.hapmap.org:Assay:H1Mrs28446478:3 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:3 QC+ GT TT GT TT TT TT TT GT GT TT TT TT TT GT GT GT GT TT GT TT GT GT TT GT GT TT TT TT GT GT TT TT TT GT TT GT TT GT GT GT GT GT TT GT TT TT GT GT TT TT TT TT TT TT GT GT GT GT TT TT TT TT GT TT GT TT TT GT TT TT TT GT TT TT TT GT GT TT GT TT GT TT TT
    + *     rs12565286 C/G chr1 711153 + ncbi_b36 broad urn:LSID:affymetrix.hapmap.org:Protocol:GenomeWideSNP_6.0:3 urn:LSID:broad.hapmap.org:Assay:SNP_A-8709646:3 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:3 QC+ GG GG GG GG GG GG GG GG CG GG GG GG GG GG GG GG GG GG GG CG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG CG GG GG GG GG GG GG GG CG CG GG GG GG GG GG GG GG GG GG CG CG GG GG GG GG GG GG GG GG GG GG CG NN GG GG GG GG GG GG NN GG NN NN
    + * 
    + * + * @author Mark DePristo + * @since 2010 */ public class RawHapMapCodec implements FeatureCodec { // the minimum number of features in the HapMap file line @@ -82,7 +113,7 @@ public class RawHapMapCodec implements FeatureCodec { headerLine); } - public Class getFeatureType() { + public Class getFeatureType() { return RawHapMapFeature.class; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java similarity index 80% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java index 461aab9a5..d94d9ff84 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java @@ -1,98 +1,125 @@ -package org.broadinstitute.sting.gatk.refdata.features.refseq; - -import org.broad.tribble.Feature; -import org.broad.tribble.TribbleException; -import org.broad.tribble.readers.LineReader; -import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.util.ArrayList; - -/** - * the ref seq codec - */ -public class RefSeqCodec implements ReferenceDependentFeatureCodec { - - /** - * The parser to use when resolving genome-wide locations. - */ - private GenomeLocParser genomeLocParser; - - /** - * Set the parser to use when resolving genetic data. - * @param genomeLocParser The supplied parser. 
- */ - @Override - public void setGenomeLocParser(GenomeLocParser genomeLocParser) { - this.genomeLocParser = genomeLocParser; - } - - @Override - public Feature decodeLoc(String line) { - if (line.startsWith("#")) return null; - String fields[] = line.split("\t"); - if (fields.length < 3) throw new TribbleException("RefSeq (decodeLoc) : Unable to parse line -> " + line + ", we expected at least 3 columns, we saw " + fields.length); - String contig_name = fields[2]; - try { - return new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); - } catch ( UserException.MalformedGenomeLoc e ) { - Utils.warnUser("RefSeq file is potentially incorrect, as some transcripts or exons have a negative length ("+fields[2]+")"); - return null; - } - } - - /** Fills this object from a text line in RefSeq (UCSC) text dump file */ - @Override - public RefSeqFeature decode(String line) { - if (line.startsWith("#")) return null; - String fields[] = line.split("\t"); - - // we reference postion 15 in the split array below, make sure we have at least that many columns - if (fields.length < 16) throw new TribbleException("RefSeq (decode) : Unable to parse line -> " + line + ", we expected at least 16 columns, we saw " + fields.length); - String contig_name = fields[2]; - RefSeqFeature feature = new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); - - feature.setTranscript_id(fields[1]); - if ( fields[3].length()==1 && fields[3].charAt(0)=='+') feature.setStrand(1); - else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') feature.setStrand(-1); - else throw new UserException.MalformedFile("Expected strand symbol (+/-), found: "+fields[3] + " for line=" + line); - - - feature.setTranscript_interval(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); - 
feature.setTranscript_coding_interval(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7]))); - feature.setGene_name(fields[12]); - String[] exon_starts = fields[9].split(","); - String[] exon_stops = fields[10].split(","); - String[] eframes = fields[15].split(","); - - if ( exon_starts.length != exon_stops.length ) - throw new UserException.MalformedFile("Data format error: numbers of exon start and stop positions differ for line=" + line); - if ( exon_starts.length != eframes.length ) - throw new UserException.MalformedFile("Data format error: numbers of exons and exon frameshifts differ for line=" + line); - - ArrayList exons = new ArrayList(exon_starts.length); - ArrayList exon_frames = new ArrayList(eframes.length); - - for ( int i = 0 ; i < exon_starts.length ; i++ ) { - exons.add(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); - exon_frames.add(Integer.decode(eframes[i])); - } - - feature.setExons(exons); - feature.setExon_frames(exon_frames); - return feature; - } - - @Override - public Object readHeader(LineReader reader) { - return null; - } - - @Override - public Class getFeatureType() { - return RefSeqCodec.class; - } -} +package org.broadinstitute.sting.utils.codecs.refseq; + +import org.broad.tribble.Feature; +import org.broad.tribble.TribbleException; +import org.broad.tribble.readers.LineReader; +import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.util.ArrayList; + +/** + * TODO FOR CHRIS HARTL + * + *

    + * Codec Description + *

    + * + *

    + * See also: link to file specification + *

    + * + *

    File format example

    + *

    + * A BAM file containing exactly one sample. + *

    + * + * @author Mark DePristo + * @since 2010 + */ +public class RefSeqCodec implements ReferenceDependentFeatureCodec { + + /** + * The parser to use when resolving genome-wide locations. + */ + private GenomeLocParser genomeLocParser; + private boolean zero_coding_length_user_warned = false; + /** + * Set the parser to use when resolving genetic data. + * @param genomeLocParser The supplied parser. + */ + @Override + public void setGenomeLocParser(GenomeLocParser genomeLocParser) { + this.genomeLocParser = genomeLocParser; + } + + @Override + public Feature decodeLoc(String line) { + if (line.startsWith("#")) return null; + String fields[] = line.split("\t"); + if (fields.length < 3) throw new TribbleException("RefSeq (decodeLoc) : Unable to parse line -> " + line + ", we expected at least 3 columns, we saw " + fields.length); + String contig_name = fields[2]; + try { + return new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); + } catch ( UserException.MalformedGenomeLoc e ) { + Utils.warnUser("RefSeq file is potentially incorrect, as some transcripts or exons have a negative length ("+fields[2]+")"); + return null; + } + } + + /** Fills this object from a text line in RefSeq (UCSC) text dump file */ + @Override + public RefSeqFeature decode(String line) { + if (line.startsWith("#")) return null; + String fields[] = line.split("\t"); + + // we reference postion 15 in the split array below, make sure we have at least that many columns + if (fields.length < 16) throw new TribbleException("RefSeq (decode) : Unable to parse line -> " + line + ", we expected at least 16 columns, we saw " + fields.length); + String contig_name = fields[2]; + RefSeqFeature feature = new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); + + feature.setTranscript_id(fields[1]); + if ( fields[3].length()==1 && fields[3].charAt(0)=='+') 
feature.setStrand(1); + else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') feature.setStrand(-1); + else throw new UserException.MalformedFile("Expected strand symbol (+/-), found: "+fields[3] + " for line=" + line); + + int coding_start = Integer.parseInt(fields[6])+1; + int coding_stop = Integer.parseInt(fields[7]); + + if ( coding_start > coding_stop ) { + if ( ! zero_coding_length_user_warned ) { + Utils.warnUser("RefSeq file contains transcripts with zero coding length. "+ + "Such transcripts will be ignored (this warning is printed only once)"); + zero_coding_length_user_warned = true; + } + return null; + } + + feature.setTranscript_interval(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); + feature.setTranscript_coding_interval(genomeLocParser.createGenomeLoc(contig_name, coding_start, coding_stop)); + feature.setGene_name(fields[12]); + String[] exon_starts = fields[9].split(","); + String[] exon_stops = fields[10].split(","); + String[] eframes = fields[15].split(","); + + if ( exon_starts.length != exon_stops.length ) + throw new UserException.MalformedFile("Data format error: numbers of exon start and stop positions differ for line=" + line); + if ( exon_starts.length != eframes.length ) + throw new UserException.MalformedFile("Data format error: numbers of exons and exon frameshifts differ for line=" + line); + + ArrayList exons = new ArrayList(exon_starts.length); + ArrayList exon_frames = new ArrayList(eframes.length); + + for ( int i = 0 ; i < exon_starts.length ; i++ ) { + exons.add(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); + exon_frames.add(Integer.decode(eframes[i])); + } + + feature.setExons(exons); + feature.setExon_frames(exon_frames); + return feature; + } + + @Override + public Object readHeader(LineReader reader) { + return null; + } + + @Override + public Class getFeatureType() { + return 
RefSeqFeature.class; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java similarity index 99% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqFeature.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java index a38d45428..c04ca8592 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqFeature.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.refdata.features.refseq; +package org.broadinstitute.sting.utils.codecs.refseq; import org.broad.tribble.Feature; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/Transcript.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java similarity index 95% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/Transcript.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java index d8bf12810..3e8a4fb34 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/Transcript.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java @@ -1,53 +1,53 @@ -package org.broadinstitute.sting.gatk.refdata.features.refseq; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.HasGenomeLocation; - -import java.util.List; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Sep 22, 2009 - * Time: 5:22:30 PM - * To change this template use File | Settings | File Templates. 
- */ -public interface Transcript extends HasGenomeLocation { - - /** Returns id of the transcript (RefSeq NM_* id) */ - public String getTranscriptId(); - /** Returns coding strand of the transcript, 1 or -1 for positive or negative strand, respectively */ - public int getStrand(); - /** Returns transcript's full genomic interval (includes all exons with UTRs) */ - public GenomeLoc getLocation(); - /** Returns genomic interval of the coding sequence (does not include - * UTRs, but still includes introns, since it's a single interval on the DNA) - */ - public GenomeLoc getCodingLocation(); - /** Name of the gene this transcript corresponds to (typically NOT gene id such as Entrez etc, - * but the implementation can decide otherwise) - */ - public String getGeneName(); - /** Number of exons in this transcript */ - public int getNumExons(); - /** Genomic location of the n-th exon; expected to throw an exception (runtime) if n is out of bounds */ - public GenomeLoc getExonLocation(int n); - - /** Returns the list of all exons in this transcript, as genomic intervals */ - public List getExons(); - - /** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */ - public boolean overlapsP (GenomeLoc that); - - /** Returns true if the specified interval 'that' overlaps with the coding genomic interval of this transcript. - * NOTE: since "coding interval" is still a single genomic interval, it will not contain UTRs of the outermost exons, - * but it will still contain introns and/or exons internal to this genomic locus that are not spliced into this transcript. 
- * @see #overlapsExonP - */ - public boolean overlapsCodingP (GenomeLoc that); - - /** Returns true if the specified interval 'that' overlaps with any of the exons actually spliced into this transcript */ - public boolean overlapsExonP (GenomeLoc that); - - -} +package org.broadinstitute.sting.utils.codecs.refseq; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.HasGenomeLocation; + +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Sep 22, 2009 + * Time: 5:22:30 PM + * To change this template use File | Settings | File Templates. + */ +public interface Transcript extends HasGenomeLocation { + + /** Returns id of the transcript (RefSeq NM_* id) */ + public String getTranscriptId(); + /** Returns coding strand of the transcript, 1 or -1 for positive or negative strand, respectively */ + public int getStrand(); + /** Returns transcript's full genomic interval (includes all exons with UTRs) */ + public GenomeLoc getLocation(); + /** Returns genomic interval of the coding sequence (does not include + * UTRs, but still includes introns, since it's a single interval on the DNA) + */ + public GenomeLoc getCodingLocation(); + /** Name of the gene this transcript corresponds to (typically NOT gene id such as Entrez etc, + * but the implementation can decide otherwise) + */ + public String getGeneName(); + /** Number of exons in this transcript */ + public int getNumExons(); + /** Genomic location of the n-th exon; expected to throw an exception (runtime) if n is out of bounds */ + public GenomeLoc getExonLocation(int n); + + /** Returns the list of all exons in this transcript, as genomic intervals */ + public List getExons(); + + /** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */ + public boolean overlapsP (GenomeLoc that); + + /** Returns true if the specified interval 'that' overlaps with the coding genomic interval of this 
transcript. + * NOTE: since "coding interval" is still a single genomic interval, it will not contain UTRs of the outermost exons, + * but it will still contain introns and/or exons internal to this genomic locus that are not spliced into this transcript. + * @see #overlapsExonP + */ + public boolean overlapsCodingP (GenomeLoc that); + + /** Returns true if the specified interval 'that' overlaps with any of the exons actually spliced into this transcript */ + public boolean overlapsExonP (GenomeLoc that); + + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java similarity index 86% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java index 43e2c3ff5..f4633b2ce 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.refdata.features.sampileup; +package org.broadinstitute.sting.utils.codecs.sampileup; import org.broad.tribble.Feature; import org.broad.tribble.FeatureCodec; @@ -35,13 +35,46 @@ import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; -import static org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileupFeature.VariantType; +import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.VariantType; /** - * A Tribble encoder / decoder for SAM pileup data. + * Decoder for SAM pileup data. For GATK validation purposes only * - * @author mhanna - * @version 0.1 + *

    + * Pileup format is first used by Tony Cox and Zemin Ning at the Sanger Institute. + * It describes the base-pair information at each chromosomal position. This format + * facilitates SNP/indel calling and brief alignment viewing by eyes. + *

    + *

    + * Each line consists of chromosome, 1-based coordinate, reference base, the + * number of reads covering the site, read bases and base qualities. At the + * read base column, a dot stands for a match to the reference base on the + * forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch + * on the forward strand and `acgtn' for a mismatch on the reverse strand. + * A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between + * this reference position and the next reference position. The length of the + * insertion is given by the integer in the pattern, followed by the inserted sequence. + *

    + * + *

    + *
    See also: @see SAMTools project
    + *
    See also: @see Pileup format
    + *

    + * + *

    File format example

    + *
    + *     seq1 272 T 24  ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
    + *     seq1 273 T 23  ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
    + *     seq1 274 T 23  ,.$....,,.,.,...,,,.,...    7<7;<;<<<<<<<<<=<;<;<<6
    + *     seq1 275 A 23  ,$....,,.,.,...,,,.,...^l.  <+;9*<<<<<<<<<=<<:;<<<<
    + *     seq1 276 G 22  ...T,,.,.,...,,,.,....  33;+<<7=7<<7<&<<1;<<6<
    + *     seq1 277 T 22  ....,,.,.,.C.,,,.,..G.  +7<;<<<<<<<&<=<<:;<<&<
    + *     seq1 278 G 23  ....,,.,.,...,,,.,....^k.   %38*<<;<7<<7<=<<<;<<<<<
    + *     seq1 279 C 23  A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
    + * 
    + * + * @author Matt Hanna + * @since 2009 */ public class SAMPileupCodec implements FeatureCodec { // the number of tokens we expect to parse from a pileup line diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java similarity index 99% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java index 378f26934..eb33243e3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.refdata.features.sampileup; +package org.broadinstitute.sting.utils.codecs.sampileup; import net.sf.samtools.util.StringUtil; import org.broad.tribble.Feature; diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java similarity index 89% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java index 039b8adde..d4bdb5aa9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java @@ -22,7 +22,7 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -package org.broadinstitute.sting.gatk.refdata.features.samread; +package org.broadinstitute.sting.utils.codecs.samread; import net.sf.samtools.Cigar; import net.sf.samtools.TextCigarCodec; @@ -36,8 +36,21 @@ import org.broad.tribble.util.ParsingUtils; /** * Decodes a simple SAM text string. * - * @author mhanna - * @version 0.1 + *

    + * Reads in the SAM text version of a BAM file as a ROD. For testing only + *

    + * + *

    + * See also: @see SAMTools for format specification + *

    + * + *

    File format example

    + *
    + *     SL-XBC:1:10:628:923#0	16	Escherichia_coli_K12	1	37	76M	=	1	0	AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA	B@>87<;A@?@957:>>@AA@B>@A9AB@B>@A@@@@@A;=AAB@BBBBBCBBBB@>A>:ABB@BAABCB=CA@CB
    + * 
    + * + * @author Matt Hanna + * @since 2009 */ public class SAMReadCodec implements FeatureCodec { /* SL-XBC:1:10:628:923#0 16 Escherichia_coli_K12 1 37 76M = 1 0 AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA B@>87<;A@?@957:>>@AA@B>@A9AB@B>@A@@@@@A;=AAB@BBBBBCBBBB@>A>:ABB@BAABCB=CA@CB */ diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java similarity index 98% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadFeature.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java index 7f12b2b2f..fc1bf89af 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadFeature.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java @@ -22,7 +22,7 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.refdata.features.samread; +package org.broadinstitute.sting.utils.codecs.samread; import org.broad.tribble.Feature; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java index eada8521f..7f3d9e17d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/snpEff/SnpEffCodec.java @@ -41,10 +41,11 @@ import java.io.IOException; /** * Codec for decoding the output format of the SnpEff variant effect predictor tool - * (http://snpeff.sourceforge.net/). * + *

    * This format has 23 tab-delimited fields: * + *

      * Chromosome
      * Position
      * Reference
    @@ -68,10 +69,16 @@ import java.io.IOException;
      * Codons Around
      * Amino Acids Around
      * Custom Interval ID
    + * 
    + * Note that we treat all except the Chromosome, Position, and Effect fields as optional. + *

    * - * We treat all except the Chromosome, Position, and Effect fields as optional. + *

    + * See also: @see SNPEff project page + *

    * * @author David Roazen + * @since 2011 */ public class SnpEffCodec implements FeatureCodec, SelfScopingFeatureCodec { @@ -222,7 +229,7 @@ public class SnpEffCodec implements FeatureCodec, SelfScopingFeatureCodec { return null; } - public Class getFeatureType() { + public Class getFeatureType() { return SnpEffFeature.class; } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java deleted file mode 100755 index e169dbdfc..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java +++ /dev/null @@ -1,209 +0,0 @@ -package org.broadinstitute.sting.utils.codecs.soapsnp; - -import org.broad.tribble.Feature; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.NameAwareCodec; -import org.broad.tribble.TribbleException; -import org.broad.tribble.exception.CodecLineParsingException; -import org.broad.tribble.readers.LineReader; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.*; - -/** - * @author depristo - *

    - * a codec for parsing soapsnp files (see http://soap.genomics.org.cn/soapsnp.html#usage2) - *

    - * - * A simple text file format with the following whitespace separated fields: - * -1) Chromosome ID -2) Coordinate on chromosome, start from 1 -3) Reference genotype -4) Consensus genotype -5) Quality score of consensus genotype -6) Best base -7) Average quality score of best base -8) Count of uniquely mapped best base -9) Count of all mapped best base -10) Second best bases -11) Average quality score of second best base -12) Count of uniquely mapped second best base -13) Count of all mapped second best base -14) Sequencing depth of the site -15) Rank sum test p_value -16) Average copy number of nearby region -17) Whether the site is a dbSNP. - */ -public class SoapSNPCodec implements FeatureCodec, NameAwareCodec { - private String[] parts; - - // we store a name to give to each of the variant contexts we emit - private String name = "Unknown"; - - public Feature decodeLoc(String line) { - return decode(line); - } - - /** - * Decode a line as a Feature. - * - * @param line - * - * @return Return the Feature encoded by the line, or null if the line does not represent a feature (e.g. is - * a comment) - */ - public Feature decode(String line) { - try { - // parse into lines - parts = line.trim().split("\\s+"); - - // check that we got the correct number of tokens in the split - if (parts.length != 18) - throw new CodecLineParsingException("Invalid SoapSNP row found -- incorrect element count. 
Expected 18, got " + parts.length + " line = " + line); - - String contig = parts[0]; - long start = Long.valueOf(parts[1]); - AlleleAndGenotype allelesAndGenotype = parseAlleles(parts[2], parts[3], line); - - double negLog10PError = Integer.valueOf(parts[4]) / 10.0; - - Map attributes = new HashMap(); - attributes.put("BestBaseQ", parts[6]); - attributes.put("SecondBestBaseQ", parts[10]); - attributes.put("RankSumP", parts[15]); - // add info to keys - - //System.out.printf("Alleles = " + allelesAndGenotype.alleles); - //System.out.printf("genotype = " + allelesAndGenotype.genotype); - - VariantContext vc = new VariantContext(name, contig, start, start, allelesAndGenotype.alleles, allelesAndGenotype.genotype, negLog10PError, VariantContext.PASSES_FILTERS, attributes); - - //System.out.printf("line = %s%n", line); - //System.out.printf("vc = %s%n", vc); - - return vc; - } catch (CodecLineParsingException e) { - throw new TribbleException("Unable to parse line " + line,e); - } catch (NumberFormatException e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - throw new TribbleException("Unable to parse line " + line,e); - } - } - - private static class AlleleAndGenotype { - Collection alleles; - Collection genotype; - - public AlleleAndGenotype(Collection alleles, Genotype genotype) { - this.alleles = alleles; - this.genotype = new HashSet(); - this.genotype.add(genotype); - } - } - - private AlleleAndGenotype parseAlleles(String ref, String consensusGenotype, String line) { - /* A Adenine - C Cytosine - G Guanine - T (or U) Thymine (or Uracil) - R A or G - Y C or T - S G or C - W A or T - K G or T - M A or C - B C or G or T - D A or G or T - H A or C or T - V A or C or G - N any base - . 
or - gap - */ - if ( ref.equals(consensusGenotype) ) - throw new TribbleException.InternalCodecException("Ref base and consensus genotype are the same " + ref); - - Allele refAllele = Allele.create(ref, true); - List genotypeAlleles = null; - - char base = consensusGenotype.charAt(0); - - switch ( base ) { - case 'A': case 'C': case 'G': case 'T': - Allele a = Allele.create(consensusGenotype); - genotypeAlleles = Arrays.asList(a, a); - break; - case 'R': case 'Y': case 'S': case 'W': case 'K': case 'M': - genotypeAlleles = determineAlt(refAllele, ref.charAt(0), base); - break; - default: - throw new TribbleException("Unexpected consensus genotype " + consensusGenotype + " at line = " + line); - } - - - Collection alleles = new HashSet(genotypeAlleles); - alleles.add(refAllele); - Genotype genotype = new Genotype("unknown", genotypeAlleles); // todo -- probably should include genotype quality - - return new AlleleAndGenotype( alleles, genotype ); - } - - private static final Map IUPAC_SNPS = new HashMap(); - static { - IUPAC_SNPS.put('R', "AG"); - IUPAC_SNPS.put('Y', "CT"); - IUPAC_SNPS.put('S', "GC"); - IUPAC_SNPS.put('W', "AT"); - IUPAC_SNPS.put('K', "GT"); - IUPAC_SNPS.put('M', "AC"); - } - - private List determineAlt(Allele ref, char refbase, char alt) { - String alts = IUPAC_SNPS.get(alt); - if ( alts == null ) - throw new IllegalStateException("BUG: unexpected consensus genotype " + alt); - - Allele a1 = alts.charAt(0) == refbase ? ref : Allele.create((byte)alts.charAt(0)); - Allele a2 = alts.charAt(1) == refbase ? 
ref : Allele.create((byte)alts.charAt(1)); - - //if ( a1 != ref && a2 != ref ) - // throw new IllegalStateException("BUG: unexpected consensus genotype " + alt + " does not contain the reference base " + ref); - - return Arrays.asList(a1, a2); - } - - /** - * @return VariantContext - */ - public Class getFeatureType() { - return VariantContext.class; - } - - public Object readHeader(LineReader reader) { - - return null; // we don't have a meaningful header - } - - /** - * get the name of this codec - * @return our set name - */ - public String getName() { - return name; - } - - /** - * set the name of this codec - * @param name new name - */ - public void setName(String name) { - this.name = name; - } - - public static void main(String[] args) { - System.out.printf("Testing " + args[0]); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java similarity index 73% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java index 745ccdd9f..fdcc8ed10 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.refdata.features.table; +package org.broadinstitute.sting.utils.codecs.table; import org.broad.tribble.Feature; import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; @@ -6,14 +6,19 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; import java.util.Arrays; /** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 3/28/11 - * Time: 2:47 PM - * To change this template use File | Settings | File Templates. 
- */ -/** - * The standard table codec with a slightly different parsing convention (expects loci as contig start stop, not contig:start-stop) + * The standard table codec that expects loci as contig start stop, not contig:start-stop + * + *

    + * The standard table codec with a slightly different parsing convention + * (expects loci as contig start stop, not contig:start-stop) + *

    + * + *

    + * See also: TableCodec + *

    + * + * @author Chris Hartl + * @since 2010 */ public class BedTableCodec extends TableCodec implements ReferenceDependentFeatureCodec { diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java similarity index 64% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java index ab1ac59d8..1919ccbf0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.refdata.features.table; +package org.broadinstitute.sting.utils.codecs.table; import org.broad.tribble.Feature; import org.broad.tribble.readers.LineReader; @@ -11,13 +11,40 @@ import java.util.ArrayList; import java.util.Arrays; /** - * implementation of a simple table (tab or comma delimited format) input files + * Reads tab deliminated tabular text files + * + *

    + *

      + *
    • Header: must begin with line HEADER or track (for IGV), followed by any number of column names, + * separated by whitespace.
    • + *
    • Comment lines starting with # are ignored
    • + *
    • Each non-header and non-comment line is split into parts by whitespace, + * and these parts are assigned as a map to their corresponding column name in the header. + * Note that the first element (corresponding to the HEADER column) must be a valid genome loc + * such as 1, 1:1 or 1:1-10, which is the position of the Table element on the genome. TableCodec + * requires that there be one value for each column in the header, and no more, on all lines.
    • + *
    + *

    + * + *

    + * + *

    File format example

    + *
    + *     HEADER a b c
    + *     1:1  1   2   3
    + *     1:2  4   5   6
    + *     1:3  7   8   9
    + * 
    + * + * @author Mark DePristo + * @since 2009 */ public class TableCodec implements ReferenceDependentFeatureCodec { - protected String delimiterRegex = "\\s+"; - protected String headerDelimiter = "HEADER"; - protected String igvHeaderDelimiter = "track"; - protected String commentDelimiter = "#"; + final static protected String delimiterRegex = "\\s+"; + final static protected String headerDelimiter = "HEADER"; + final static protected String igvHeaderDelimiter = "track"; + final static protected String commentDelimiter = "#"; + protected ArrayList header = new ArrayList(); /** @@ -51,7 +78,7 @@ public class TableCodec implements ReferenceDependentFeatureCodec { } @Override - public Class getFeatureType() { + public Class getFeatureType() { return TableFeature.class; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java similarity index 96% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java index ca73ee960..a85849f0b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.refdata.features.table; +package org.broadinstitute.sting.utils.codecs.table; import org.broad.tribble.Feature; import org.broadinstitute.sting.utils.GenomeLoc; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index cb505c717..a3100030e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -14,10 +14,9 
@@ import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; +import java.io.*; import java.util.*; +import java.util.zip.GZIPInputStream; public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, VCFParser, SelfScopingFeatureCodec { @@ -155,9 +154,45 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, * @return a feature, (not guaranteed complete) that has the correct start and stop */ public Feature decodeLoc(String line) { - return reallyDecode(line); + String[] locParts = new String[6]; + ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true); + + // get our alleles (because the end position depends on them) + String ref = getCachedString(locParts[3].toUpperCase()); + String alts = getCachedString(locParts[4].toUpperCase()); + List alleles = parseAlleles(ref, alts, lineNo); + + // find out our location + int start = Integer.valueOf(locParts[1]); + int stop = start; + + // ref alleles don't need to be single bases for monomorphic sites + if ( alleles.size() == 1 ) { + stop = start + alleles.get(0).length() - 1; + } else if ( !isSingleNucleotideEvent(alleles) ) { + stop = clipAlleles(start, ref, alleles, null, lineNo); + } + + return new VCFLocFeature(locParts[0], start, stop); } + private final static class VCFLocFeature implements Feature { + + final String chr; + final int start, stop; + + private VCFLocFeature(String chr, int start, int stop) { + this.chr = chr; + this.start = start; + this.stop = stop; + } + + public String getChr() { return chr; } + public int getStart() { return start; } + public int getEnd() { return stop; } + } + + /** * decode the line into a feature (VariantContext) * @param line the line @@ -208,7 +243,7 @@ public abstract class AbstractVCFCodec 
implements FeatureCodec, NameAwareCodec, // parse out the required fields String contig = getCachedString(parts[0]); - long pos = Long.valueOf(parts[1]); + int pos = Integer.valueOf(parts[1]); String id = null; if ( parts[2].length() == 0 ) generateException("The VCF specification requires a valid ID field"); @@ -228,7 +263,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, Map attributes = parseInfo(info, id); // find out our current location, and clip the alleles down to their minimum length - long loc = pos; + int loc = pos; // ref alleles don't need to be single bases for monomorphic sites if ( alleles.size() == 1 ) { loc = pos + alleles.get(0).length() - 1; @@ -263,7 +298,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, * * @return the type of record */ - public Class getFeatureType() { + public Class getFeatureType() { return VariantContext.class; } @@ -507,9 +542,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, * @param ref the reference string * @param unclippedAlleles the list of unclipped alleles * @param clippedAlleles output list of clipped alleles - * @return a list of alleles, clipped to the reference + * @return the new reference end position of this event */ - protected static long clipAlleles(long position, String ref, List unclippedAlleles, List clippedAlleles, int lineNo) { + protected static int clipAlleles(int position, String ref, List unclippedAlleles, List clippedAlleles, int lineNo) { // Note that the computation of forward clipping here is meant only to see whether there is a common // base to all alleles, and to correctly compute reverse clipping, @@ -535,11 +570,13 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, if (clipping) reverseClipped++; } - for (Allele a : unclippedAlleles) { - if (a.isSymbolic()) { - clippedAlleles.add(a); - } else { - 
clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(),0,a.getBases().length-reverseClipped),a.isReference())); + if ( clippedAlleles != null ) { + for ( Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) { + clippedAlleles.add(a); + } else { + clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(),0,a.getBases().length-reverseClipped),a.isReference())); + } } } @@ -623,9 +660,21 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, public final static boolean canDecodeFile(final File potentialInput, final String MAGIC_HEADER_LINE) { try { - char[] buff = new char[MAGIC_HEADER_LINE.length()]; - new FileReader(potentialInput).read(buff, 0, MAGIC_HEADER_LINE.length()); + return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) || + isVCFStream(new GZIPInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE); + } catch ( FileNotFoundException e ) { + return false; + } catch ( IOException e ) { + return false; + } + } + + private final static boolean isVCFStream(final InputStream stream, final String MAGIC_HEADER_LINE) { + try { + byte[] buff = new byte[MAGIC_HEADER_LINE.length()]; + stream.read(buff, 0, MAGIC_HEADER_LINE.length()); String firstLine = new String(buff); + stream.close(); return firstLine.startsWith(MAGIC_HEADER_LINE); } catch ( IOException e ) { return false; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java index ea16595bb..e5b1a2de5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java @@ -14,8 +14,20 @@ import java.util.*; /** - * a feature codec for the VCF 3 specification. 
Our aim is to read in the records and convert to VariantContext as - * quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. + * A feature codec for the VCF3 specification, to read older VCF files. VCF3 has been + * depreciated in favor of VCF4 (See VCF codec for the latest information) + * + *

    + * Reads historical VCF3 encoded files (1000 Genomes Pilot results, for example) + *

    + * + *

    + * See also: @see VCF specification
    + * See also: @see VCF spec. publication + *

    + * + * @author Mark DePristo + * @since 2010 */ public class VCF3Codec extends AbstractVCFCodec { public final static String VCF3_MAGIC_HEADER = "##fileformat=VCFv3"; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 55a0eb3f9..fa030ef5f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -12,12 +12,46 @@ import java.io.FileReader; import java.io.IOException; import java.util.*; - /** - * a feature codec for the VCF 4 specification. Our aim is to read in the records and convert to VariantContext as - * quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. + * A feature codec for the VCF 4 specification + * + *

    + * VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a + * header line, and then data lines each containing information about a position in the genome. + *

    + *

    One of the main uses of next-generation sequencing is to discover variation amongst large populations + * of related samples. Recently the format for storing next-generation read alignments has been + * standardised by the SAM/BAM file format specification. This has significantly improved the + * interoperability of next-generation tools for alignment, visualisation, and variant calling. + * We propose the Variant Call Format (VCF) as a standarised format for storing the most prevalent + * types of sequence variation, including SNPs, indels and larger structural variants, together + * with rich annotations. VCF is usually stored in a compressed manner and can be indexed for + * fast data retrieval of variants from a range of positions on the reference genome. + * The format was developed for the 1000 Genomes Project, and has also been adopted by other projects + * such as UK10K, dbSNP, or the NHLBI Exome Project. VCFtools is a software suite that implements + * various utilities for processing VCF files, including validation, merging and comparing, + * and also provides a general Perl and Python API. + * The VCF specification and VCFtools are available from http://vcftools.sourceforge.net.

    + * + *

    + * See also: @see VCF specification
    + * See also: @see VCF spec. publication + *

    + * + *

    File format example

    + *
    + *     ##fileformat=VCFv4.0
    + *     #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA12878
    + *     chr1    109     .       A       T       0       PASS  AC=1    GT:AD:DP:GL:GQ  0/1:610,327:308:-316.30,-95.47,-803.03:99
    + *     chr1    147     .       C       A       0       PASS  AC=1    GT:AD:DP:GL:GQ  0/1:294,49:118:-57.87,-34.96,-338.46:99
    + * 
    + * + * @author Mark DePristo + * @since 2010 */ public class VCFCodec extends AbstractVCFCodec { + // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. + public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; /** diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index f43891e77..c0a04c81f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -180,4 +180,19 @@ public class VCFUtils { return new HashSet(map.values()); } + + public static String rsIDOfFirstRealVariant(List VCs, VariantContext.Type type) { + if ( VCs == null ) + return null; + + String rsID = null; + for ( VariantContext vc : VCs ) { + if ( vc.getType() == type ) { + rsID = vc.getID(); + break; + } + } + + return rsID; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java index 710503ca8..5bbe3f91e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java @@ -36,9 +36,12 @@ import java.lang.annotation.*; @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.TYPE) public @interface DocumentedGATKFeature { + /** Should we actually document this feature, even through it's annotated? 
*/ public boolean enable() default true; + /** The overall group name (walkers, readfilters) this feature is associated with */ public String groupName(); + /** A human readable summary of the purpose of this group of features */ public String summary() default ""; - public Class handler() default GenericDocumentationHandler.class; + /** Are there links to other docs that we should include? CommandLineGATK.class for walkers, for example? */ public Class[] extraDocs() default {}; } diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java index ce03c8093..87926d2e3 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java @@ -92,9 +92,7 @@ public abstract class DocumentedGATKFeatureHandler { * * toProcess.setHandlerContent(summary, rootMap); * - * @param rootDoc * @param toProcess - * @param all */ - public abstract void processOne(RootDoc rootDoc, GATKDocWorkUnit toProcess, Set all); + public abstract void processOne(GATKDocWorkUnit toProcess); } diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java new file mode 100644 index 000000000..6c8b0a475 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit 
persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.help; + +/** + * Documentation unit. Effectively a class version of the DocumentedGATKFeature. + * Immutable data structure. + * + * @author depristo + */ +class DocumentedGATKFeatureObject { + /** Which class are we documenting. Specific to each class being documented */ + private final Class classToDoc; + /** Are we enabled? 
*/ + private final boolean enable; + private final String groupName, summary; + private final Class[] extraDocs; + + public DocumentedGATKFeatureObject(Class classToDoc, final boolean enable, final String groupName, final String summary, final Class[] extraDocs) { + this.classToDoc = classToDoc; + this.enable = enable; + this.groupName = groupName; + this.summary = summary; + this.extraDocs = extraDocs; + } + + public DocumentedGATKFeatureObject(Class classToDoc, final String groupName, final String summary) { + this(classToDoc, true, groupName, summary, new Class[]{}); + } + + public Class getClassToDoc() { return classToDoc; } + public boolean enable() { return enable; } + public String groupName() { return groupName; } + public String summary() { return summary; } + public Class[] extraDocs() { return extraDocs; } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java index 8efeecd7b..cd645943b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java @@ -24,25 +24,40 @@ package org.broadinstitute.sting.utils.help; -/** - * @author depristo - * @since 8/8/11 - */ public class GATKDocUtils { - private final static String URL_ROOT_FOR_RELEASE_GATKDOCS = "http://www.broadinstitute.org/gsa/gatkdocs/release/"; - private final static String URL_ROOT_FOR_STABLE_GATKDOCS = "http://iwww.broadinstitute.org/gsa/gatkdocs/stable/"; - private final static String URL_ROOT_FOR_UNSTABLE_GATKDOCS = "http://iwww.broadinstitute.org/gsa/gatkdocs/unstable/"; + /** The URL root for RELEASED GATKDOC units */ + public final static String URL_ROOT_FOR_RELEASE_GATKDOCS = "http://www.broadinstitute.org/gsa/gatkdocs/release/"; + /** The URL root for STABLE GATKDOC units */ + public final static String URL_ROOT_FOR_STABLE_GATKDOCS = "http://iwww.broadinstitute.org/gsa/gatkdocs/stable/"; + 
/** The URL root for UNSTABLE GATKDOC units */ + public final static String URL_ROOT_FOR_UNSTABLE_GATKDOCS = "http://iwww.broadinstitute.org/gsa/gatkdocs/unstable/"; + /** + * Return the filename of the GATKDoc HTML that would be generated for Class. This + * does not guarantee that the docs exist, or that docs would actually be generated + * for class (might not be annotated for documentation, for example). But if + * this class is documented, GATKDocs will write the docs to a file named as returned + * by this function. + * + * @param c + * @return + */ public static String htmlFilenameForClass(Class c) { return c.getName().replace(".", "_") + ".html"; } + /** + * Returns a full URL http://etc/ linking to the documentation for class (assuming it + * exists). Currently points to the RELEASE doc path only. + * @param c + * @return + */ public static String helpLinksToGATKDocs(Class c) { String classPath = htmlFilenameForClass(c); StringBuilder b = new StringBuilder(); - b.append("release version: ").append(URL_ROOT_FOR_RELEASE_GATKDOCS).append(classPath).append("\n"); - b.append("stable version: ").append(URL_ROOT_FOR_STABLE_GATKDOCS).append(classPath).append("\n"); - b.append("unstable version: ").append(URL_ROOT_FOR_UNSTABLE_GATKDOCS).append(classPath).append("\n"); + b.append(URL_ROOT_FOR_RELEASE_GATKDOCS).append(classPath); + //b.append("stable version: ").append(URL_ROOT_FOR_STABLE_GATKDOCS).append(classPath).append("\n"); + //b.append("unstable version: ").append(URL_ROOT_FOR_UNSTABLE_GATKDOCS).append(classPath).append("\n"); return b.toString(); } -} +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java index 1f6db2757..41c855329 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java @@ -51,7 +51,7 @@ class 
GATKDocWorkUnit implements Comparable { /** The javadoc documentation for clazz */ final ClassDoc classDoc; /** The annotation that lead to this Class being in GATKDoc */ - final DocumentedGATKFeature annotation; + final DocumentedGATKFeatureObject annotation; /** When was this walker built, and what's the absolute version number */ final String buildTimestamp, absoluteVersion; @@ -60,7 +60,7 @@ class GATKDocWorkUnit implements Comparable { Map forTemplate; public GATKDocWorkUnit(String name, String filename, String group, - DocumentedGATKFeature annotation, DocumentedGATKFeatureHandler handler, + DocumentedGATKFeatureObject annotation, DocumentedGATKFeatureHandler handler, ClassDoc classDoc, Class clazz, String buildTimestamp, String absoluteVersion) { this.annotation = annotation; diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java index d071be105..7f26f22f5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java @@ -33,23 +33,83 @@ import freemarker.template.TemplateException; import org.apache.commons.io.FileUtils; import org.apache.log4j.Level; import org.apache.log4j.Logger; +import org.broad.tribble.FeatureCodec; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.walkers.qc.DocumentationTest; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.*; import java.util.*; /** + * Javadoc Doclet that combines javadoc, GATK ParsingEngine annotations, and FreeMarker + * templates to produce html formatted GATKDocs for walkers + * and other classes. 
* + * This document has the following workflow: + * + * 1 -- walk the javadoc heirarchy, looking for class that have the + * DocumentedGATKFeature annotation or are in the type heirarchy in the + * static list of things to document, and are to be documented + * 2 -- construct for each a GATKDocWorkUnit, resulting in the complete + * set of things to document + * 3 -- for each unit, actually generate an html page documenting it + * as well as links to related features via their units. Writing + * of a specific class HTML is accomplished by a generate DocumentationHandler + * 4 -- write out an index of all units, organized by group + * + * The documented classes are restricted to only those with @DocumentedGATKFeature + * annotation or are in the STATIC_DOCS class. */ public class GATKDoclet { - final protected static File SETTINGS_DIR = new File("settings/helpTemplates"); - final protected static File DESTINATION_DIR = new File("gatkdocs"); final protected static Logger logger = Logger.getLogger(GATKDoclet.class); + + /** Where we find the help FreeMarker templates */ + final protected static File SETTINGS_DIR = new File("settings/helpTemplates"); + + /** Where we write the GATKDoc html directory */ + final protected static File DESTINATION_DIR = new File("gatkdocs"); + + // ---------------------------------------------------------------------- + // + // Global variables that are set on the command line by javadoc + // + // ---------------------------------------------------------------------- protected static String buildTimestamp = null, absoluteVersion = null; protected static boolean showHiddenFeatures = false; + protected static boolean testOnly = false; + + /** + * Any class that's in this list will be included in the documentation + * when the -test argument is provided. Useful for debugging. 
+ */ + private static final List> testOnlyKeepers = Arrays.asList( + DocumentationTest.class, CommandLineGATK.class, UserException.class); + + /** The javadoc root doc */ RootDoc rootDoc; + /** The set of all things we are going to document */ + Set myWorkUnits; + + /** + * A static list of DocumentedGATKFeatureObjects. Any class that is as or extends + * one of the DocumentedGATKFeatureObjects.clazz of this collection will also + * be documented, even if it doesn't have the @DocumentedGATKFeature annotation. Useful + * when you want to document things that implement an interface (annotations on java + * interfaces aren't inherited) or whose base class isn't under your control (tribble + * codecs). + */ + final static Collection STATIC_DOCS = new ArrayList(); + static { + STATIC_DOCS.add(new DocumentedGATKFeatureObject(FeatureCodec.class, + "Reference ordered data (ROD) codecs", + "Tribble codecs for reading reference ordered data such as VCF or BED files")); + } + + /** * Extracts the contents of certain types of javadoc and adds them to an XML file. * @param rootDoc The documentation root. @@ -57,7 +117,8 @@ public class GATKDoclet { * @throws java.io.IOException if output can't be written. */ public static boolean start(RootDoc rootDoc) throws IOException { - logger.setLevel(Level.DEBUG); + logger.setLevel(Level.INFO); + // load arguments for(String[] options: rootDoc.options()) { if(options[0].equals("-build-timestamp")) @@ -66,10 +127,13 @@ public class GATKDoclet { absoluteVersion = options[1]; if (options[0].equals("-include-hidden")) showHiddenFeatures = true; + if (options[0].equals("-test")) + testOnly = true; } - GATKDoclet doclet = new GATKDoclet(); - doclet.processDocs(rootDoc); + // process the docs + new GATKDoclet().processDocs(rootDoc); + return true; } @@ -79,43 +143,29 @@ public class GATKDoclet { * @return Number of potential parameters; 0 if not supported. 
*/ public static int optionLength(String option) { - if(option.equals("-build-timestamp") || option.equals("-absolute-version") || option.equals("-include-hidden")) { + if(option.equals("-build-timestamp") || + option.equals("-absolute-version") || + option.equals("-include-hidden")) { return 2; - } - return 0; + } else if ( option.equals("-test") ) + return 1; + else + return 0; } + /** + * Are we supposed to include @Hidden annotations in our documented output? + * @return + */ public boolean showHiddenFeatures() { return showHiddenFeatures; } - public Set workUnits() { - TreeSet m = new TreeSet(); - - for ( ClassDoc doc : rootDoc.classes() ) { - //logger.debug("Considering " + doc); - Class clazz = getClassForClassDoc(doc); - - if ( clazz != null && clazz.getName().equals("org.broadinstitute.sting.gatk.walkers.annotator.AlleleBalance")) - logger.debug("foo"); - - DocumentedGATKFeature feature = getFeatureForClassDoc(doc); - DocumentedGATKFeatureHandler handler = createHandler(doc, feature); - if ( handler != null && handler.includeInDocs(doc) ) { - logger.info("Generating documentation for class " + doc); - String filename = handler.getDestinationFilename(doc, clazz); - GATKDocWorkUnit unit = new GATKDocWorkUnit(doc.name(), - filename, feature.groupName(), - feature, handler, doc, clazz, - buildTimestamp, absoluteVersion); - m.add(unit); - } - } - - return m; - } - - protected void processDocs(RootDoc rootDoc) { + /** + * + * @param rootDoc + */ + private void processDocs(RootDoc rootDoc) { // setup the global access to the root this.rootDoc = rootDoc; @@ -133,9 +183,9 @@ public class GATKDoclet { // Specify how templates will see the data-model. This is an advanced topic... 
cfg.setObjectWrapper(new DefaultObjectWrapper()); - Set myWorkUnits = workUnits(); + myWorkUnits = computeWorkUnits(); for ( GATKDocWorkUnit workUnit : myWorkUnits ) { - processDocWorkUnit(cfg, workUnit, myWorkUnits); + processDocWorkUnit(cfg, workUnit); } processIndex(cfg, new ArrayList(myWorkUnits)); @@ -146,36 +196,92 @@ public class GATKDoclet { } } - private DocumentedGATKFeatureHandler createHandler(ClassDoc doc, DocumentedGATKFeature feature) { - try { - if ( feature != null ) { - if ( feature.enable() ) { - DocumentedGATKFeatureHandler handler = feature.handler().newInstance(); - handler.setDoclet(this); - return handler; - } else { - logger.info("Skipping disabled Documentation for " + doc); - } + /** + * Returns the set of all GATKDocWorkUnits that we are going to generate docs for. + * @return + */ + private Set computeWorkUnits() { + TreeSet m = new TreeSet(); + + for ( ClassDoc doc : rootDoc.classes() ) { + //logger.debug("Considering " + doc); + Class clazz = getClassForClassDoc(doc); + + // don't add anything that's not DocumentationTest if we are in test mode + if ( clazz != null && testOnly && ! testOnlyKeepers.contains(clazz) ) + continue; + + //if ( clazz != null && clazz.getName().equals("org.broadinstitute.sting.gatk.walkers.annotator.AlleleBalance")) + // logger.debug("foo"); + + DocumentedGATKFeatureObject feature = getFeatureForClassDoc(doc); + DocumentedGATKFeatureHandler handler = createHandler(doc, feature); + if ( handler != null && handler.includeInDocs(doc) ) { + logger.info("Generating documentation for class " + doc); + String filename = handler.getDestinationFilename(doc, clazz); + GATKDocWorkUnit unit = new GATKDocWorkUnit(doc.name(), + filename, feature.groupName(), + feature, handler, doc, clazz, + buildTimestamp, absoluteVersion); + m.add(unit); + } + } + + return m; + } + + /** + * Create a handler capable of documenting the class doc according to feature. 
Returns + * null if no appropriate handler is found or doc shouldn't be documented at all. + * @param doc + * @param feature + * @return + */ + private DocumentedGATKFeatureHandler createHandler(ClassDoc doc, DocumentedGATKFeatureObject feature) { + if ( feature != null ) { + if ( feature.enable() ) { + DocumentedGATKFeatureHandler handler = new GenericDocumentationHandler(); + handler.setDoclet(this); + return handler; + } else { + logger.info("Skipping disabled Documentation for " + doc); } - } catch ( IllegalAccessException e) { - throw new RuntimeException(e); // the constructor is now private -- this is an error - } catch ( InstantiationException e) { - throw new RuntimeException(e); // the constructor is now private -- this is an error } return null; } - private DocumentedGATKFeature getFeatureForClassDoc(ClassDoc doc) { - // todo -- what do I need the ? extends Object to pass the compiler? + /** + * Returns the instantiated DocumentedGATKFeatureObject that describes the GATKDoc + * structure we will apply to Doc. 
+ * + * @param doc + * @return null if this proves inappropriate or doc shouldn't be documented + */ + private DocumentedGATKFeatureObject getFeatureForClassDoc(ClassDoc doc) { Class docClass = getClassForClassDoc(doc); - if ( docClass != null && docClass.isAnnotationPresent(DocumentedGATKFeature.class) ) { - return docClass.getAnnotation(DocumentedGATKFeature.class); - } else { + + if ( docClass == null ) return null; // not annotated so it shouldn't be documented + + if ( docClass.isAnnotationPresent(DocumentedGATKFeature.class) ) { + DocumentedGATKFeature f = docClass.getAnnotation(DocumentedGATKFeature.class); + return new DocumentedGATKFeatureObject(docClass, f.enable(), f.groupName(), f.summary(), f.extraDocs()); + } else { + for ( DocumentedGATKFeatureObject staticDocs : STATIC_DOCS ) { + if ( staticDocs.getClassToDoc().isAssignableFrom(docClass) ) { + return new DocumentedGATKFeatureObject(docClass, staticDocs.enable(), staticDocs.groupName(), staticDocs.summary(), staticDocs.extraDocs()); + } + } + return null; } } + /** + * Return the Java class described by the ClassDoc doc + * @param doc + * @return + */ private Class getClassForClassDoc(ClassDoc doc) { try { // todo -- what do I need the ? extends Object to pass the compiler? @@ -191,10 +297,12 @@ public class GATKDoclet { } } - public static ClassDoc getClassDocForClass(RootDoc rootDoc, Class clazz) { - return rootDoc.classNamed(clazz.getName()); - } - + /** + * Create the html index listing all of the GATKDocs features + * @param cfg + * @param indexData + * @throws IOException + */ private void processIndex(Configuration cfg, List indexData) throws IOException { /* Get or create a template */ Template temp = cfg.getTemplate("generic.index.template.html"); @@ -209,6 +317,12 @@ public class GATKDoclet { } } + /** + * Helpful function to create the html index. Given all of the already run GATKDocWorkUnits, + * create the high-level grouping data listing individual features by group. 
+ * @param indexData + * @return + */ private Map groupIndexData(List indexData) { // // root -> data -> { summary -> y, filename -> z }, etc @@ -217,16 +331,15 @@ public class GATKDoclet { Collections.sort(indexData); - Set docFeatures = new HashSet(); + List> groups = new ArrayList>(); + Set seenDocumentationFeatures = new HashSet(); List> data = new ArrayList>(); for ( GATKDocWorkUnit workUnit : indexData ) { data.add(workUnit.indexDataMap()); - docFeatures.add(workUnit.annotation); - } - - List> groups = new ArrayList>(); - for ( DocumentedGATKFeature feature : docFeatures ) { - groups.add(toMap(feature)); + if ( ! seenDocumentationFeatures.contains(workUnit.annotation.groupName()) ) { + groups.add(toMap(workUnit.annotation)); + seenDocumentationFeatures.add(workUnit.annotation.groupName()); + } } root.put("data", data); @@ -237,25 +350,51 @@ public class GATKDoclet { return root; } - private static final Map toMap(DocumentedGATKFeature annotation) { + /** + * Trivial helper routine that returns the map of name and summary given the annotation + * @param annotation + * @return + */ + private static final Map toMap(DocumentedGATKFeatureObject annotation) { Map root = new HashMap(); root.put("name", annotation.groupName()); root.put("summary", annotation.summary()); return root; } - public final static GATKDocWorkUnit findWorkUnitForClass(Class c, Set all) { - for ( final GATKDocWorkUnit unit : all ) + /** + * Helper function that finding the GATKDocWorkUnit associated with class from among all of the work units + * @param c the class we are looking for + * @return the GATKDocWorkUnit whose .clazz.equals(c), or null if none could be found + */ + public final GATKDocWorkUnit findWorkUnitForClass(Class c) { + for ( final GATKDocWorkUnit unit : this.myWorkUnits ) if ( unit.clazz.equals(c) ) return unit; return null; } - private void processDocWorkUnit(Configuration cfg, GATKDocWorkUnit unit, Set all) + /** + * Return the ClassDoc associated with clazz + * @param 
clazz + * @return + */ + public ClassDoc getClassDocForClass(Class clazz) { + return rootDoc.classNamed(clazz.getName()); + } + + /** + * High-level function that processes a single DocWorkUnit unit using its handler + * + * @param cfg + * @param unit + * @throws IOException + */ + private void processDocWorkUnit(Configuration cfg, GATKDocWorkUnit unit) throws IOException { //System.out.printf("Processing documentation for class %s%n", unit.classDoc); - unit.handler.processOne(rootDoc, unit, all); + unit.handler.processOne(unit); // Get or create a template Template temp = cfg.getTemplate(unit.handler.getTemplateName(unit.classDoc)); diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java index 6ddf8a157..4f1e95499 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java @@ -24,20 +24,27 @@ package org.broadinstitute.sting.utils.help; +import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import com.sun.javadoc.ClassDoc; import com.sun.javadoc.FieldDoc; import com.sun.javadoc.RootDoc; import com.sun.javadoc.Tag; import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broad.tribble.bed.FullBEDFeature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.JVMUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.StingException; import java.io.*; 
-import java.lang.reflect.Field; +import java.lang.reflect.*; import java.util.*; /** @@ -45,20 +52,24 @@ import java.util.*; */ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { private static Logger logger = Logger.getLogger(GenericDocumentationHandler.class); - GATKDocWorkUnit toProcess; - ClassDoc classdoc; - Set all; - RootDoc rootDoc; + + /** + * The max. length of the longest of --fullName -shortName argument name + * before we prefer the shorter option. + */ + private static final int MAX_DISPLAY_NAME = 30; + + /** The Class we are documenting */ + private GATKDocWorkUnit toProcess; @Override public boolean includeInDocs(ClassDoc doc) { - return true; -// try { -// Class type = HelpUtils.getClassForDoc(doc); -// return JVMUtils.isConcrete(type); -// } catch ( ClassNotFoundException e ) { -// return false; -// } + try { + Class type = HelpUtils.getClassForDoc(doc); + return JVMUtils.isConcrete(type); + } catch ( ClassNotFoundException e ) { + return false; + } } @@ -68,11 +79,8 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { } @Override - public void processOne(RootDoc rootDoc, GATKDocWorkUnit toProcessArg, Set allArg) { - this.rootDoc = rootDoc; + public void processOne(GATKDocWorkUnit toProcessArg) { this.toProcess = toProcessArg; - this.all = allArg; - this.classdoc = toProcess.classDoc; //System.out.printf("%s class %s%n", toProcess.group, toProcess.classDoc); Map root = new HashMap(); @@ -84,95 +92,125 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { toProcess.setHandlerContent((String)root.get("summary"), root); } + /** + * Add high-level summary information about toProcess to root, such as its + * name, summary, description, version, etc. + * + * @param root + */ protected void addHighLevelBindings(Map root) { - root.put("name", classdoc.name()); + root.put("name", toProcess.classDoc.name()); // Extract overrides from the doc tags. 
StringBuilder summaryBuilder = new StringBuilder(); - for(Tag tag: classdoc.firstSentenceTags()) + for(Tag tag: toProcess.classDoc.firstSentenceTags()) summaryBuilder.append(tag.text()); root.put("summary", summaryBuilder.toString()); - root.put("description", classdoc.commentText().substring(summaryBuilder.toString().length())); + root.put("description", toProcess.classDoc.commentText().substring(summaryBuilder.toString().length())); root.put("timestamp", toProcess.buildTimestamp); root.put("version", toProcess.absoluteVersion); - for(Tag tag: classdoc.tags()) { + for(Tag tag: toProcess.classDoc.tags()) { root.put(tag.name(), tag.text()); } } + /** + * Add bindings describing related GATK capabilites to toProcess + * @param root + */ + protected void addRelatedBindings(Map root) { + List> extraDocsData = new ArrayList>(); + + // add in all of the explicitly related items + for ( final Class extraDocClass : toProcess.annotation.extraDocs() ) { + final GATKDocWorkUnit otherUnit = getDoclet().findWorkUnitForClass(extraDocClass); + if ( otherUnit == null ) + throw new ReviewedStingException("Requested extraDocs for class without any documentation: " + extraDocClass); + extraDocsData.add( + new HashMap(){{ + put("filename", otherUnit.filename); + put("name", otherUnit.name);}}); + + } + root.put("extradocs", extraDocsData); + } + + /** + * Add information about all of the arguments available to toProcess to root + * + * @param root + */ protected void addArgumentBindings(Map root) { ParsingEngine parsingEngine = createStandardGATKParsingEngine(); - // attempt to instantiate the class - Object instance = makeInstanceIfPossible(toProcess.clazz); - - Map> args = new HashMap>(); + Map>> args = createArgumentMap(); root.put("arguments", args); - args.put("all", new ArrayList()); - args.put("required", new ArrayList()); - args.put("optional", new ArrayList()); - args.put("hidden", new ArrayList()); - args.put("depreciated", new ArrayList()); try { - for ( ArgumentSource 
argumentSource : parsingEngine.extractArgumentSources(HelpUtils.getClassForDoc(classdoc)) ) { + // loop over all of the arguments according to the parsing engine + for ( final ArgumentSource argumentSource : parsingEngine.extractArgumentSources(HelpUtils.getClassForDoc(toProcess.classDoc)) ) { + // todo -- why can you have multiple ones? ArgumentDefinition argDef = argumentSource.createArgumentDefinitions().get(0); - FieldDoc fieldDoc = getFieldDoc(classdoc, argumentSource.field.getName()); - Map argBindings = docForArgument(fieldDoc, argumentSource, argDef); // todo -- why can you have multiple ones? + FieldDoc fieldDoc = getFieldDoc(toProcess.classDoc, argumentSource.field.getName()); + Map argBindings = docForArgument(fieldDoc, argumentSource, argDef); if ( ! argumentSource.isHidden() || getDoclet().showHiddenFeatures() ) { - logger.debug(String.format("Processing %s", argumentSource)); - String kind = "optional"; - if ( argumentSource.isRequired() ) kind = "required"; - else if ( argumentSource.isHidden() ) kind = "hidden"; - else if ( argumentSource.isDeprecated() ) kind = "depreciated"; + final String kind = docKindOfArg(argumentSource); - // get the value of the field - if ( instance != null ) { - Object value = getFieldValue(toProcess.clazz, instance, fieldDoc.name()); - - if ( value == null && argumentSource.createsTypeDefault() ) { - // handle the case where there's an implicit default - try { - value = argumentSource.typeDefaultDocString(); - } catch (ReviewedStingException e) { - ; // failed to create type default, don't worry about it - } - } - - if ( value != null ) - argBindings.put("defaultValue", prettyPrintValueString(value)); - } + final Object value = argumentValue(toProcess.clazz, argumentSource); + if ( value != null ) + argBindings.put("defaultValue", prettyPrintValueString(value)); args.get(kind).add(argBindings); args.get("all").add(argBindings); - } else { - logger.debug(String.format("Skipping hidden feature %s", argumentSource)); } } + + 
// sort the arguments + for (Map.Entry>> entry : args.entrySet()) { + entry.setValue(sortArguments(entry.getValue())); + } } catch ( ClassNotFoundException e ) { throw new RuntimeException(e); } } - private Object getFieldValue(Class c, Object instance, String fieldName) { - Field field = JVMUtils.findField(c, fieldName); - if ( field != null ) { - Object value = JVMUtils.getFieldValue(field, instance); - //System.out.printf("Fetched value of field %s in class %s: %s%n", fieldName, c, value); - return value; - } else { - return findFieldValueInArgumentCollections(c, instance, fieldName); - } + /** + * Return the argument kind (required, advanced, hidden, etc) of this argumentSource + * @param argumentSource + * @return + */ + @Requires("argumentSource != null") + @Ensures("result != null") + private String docKindOfArg(ArgumentSource argumentSource) { + if ( argumentSource.isRequired() ) return "required"; + else if ( argumentSource.isAdvanced() ) return "advanced"; + else if ( argumentSource.isHidden() ) return "hidden"; + else if ( argumentSource.isDeprecated() ) return "depreciated"; + else return "optional"; } - private Object findFieldValueInArgumentCollections(Class c, Object instance, String fieldName) { - for ( Field field : JVMUtils.getAllFields(c) ) { - if ( field.isAnnotationPresent(ArgumentCollection.class) ) { - //System.out.printf("Searching for %s in argument collection field %s%n", fieldName, field); - Object fieldValue = JVMUtils.getFieldValue(field, instance); - Object value = getFieldValue(fieldValue.getClass(), fieldValue, fieldName); - if ( value != null ) - return value; + /** + * Attempts to determine the value of argumentSource in an instantiated version of c + * @param c + * @param argumentSource + * @return value of argumentSource, or null if this isn't possible + */ + @Requires({"c != null", "argumentSource != null"}) + private Object argumentValue(Class c, ArgumentSource argumentSource) { + // get the value of the field + // attempt to 
instantiate the class + final Object instance = makeInstanceIfPossible(toProcess.clazz); + if ( instance != null ) { + final Object value = getFieldValue(instance, argumentSource.field.getName()); + if ( value != null ) + return value; + + if ( argumentSource.createsTypeDefault() ) { + try { // handle the case where there's an implicit default + return argumentSource.typeDefaultDocString(); + } catch (ReviewedStingException e) { + ; // failed to create type default, don't worry about it + } } } @@ -180,6 +218,85 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { } /** + * Create the argument map for holding class arguments + * @return + */ + private Map>> createArgumentMap() { + Map>> args = new HashMap>>(); + args.put("all", new ArrayList>()); + args.put("required", new ArrayList>()); + args.put("optional", new ArrayList>()); + args.put("advanced", new ArrayList>()); + args.put("hidden", new ArrayList>()); + args.put("depreciated", new ArrayList>()); + return args; + } + + + /** + * Sorts the individual argument list in unsorted according to CompareArgumentsByName + * @param unsorted + * @return + */ + private List> sortArguments(List> unsorted) { + Collections.sort(unsorted, new CompareArgumentsByName()); + return unsorted; + } + + /** + * Sort arguments by case-insensitive comparison ignoring the -- and - prefixes + */ + private class CompareArgumentsByName implements Comparator> { + public int compare(Map x, Map y) { + return elt(x).compareTo(elt(y)); + } + + private String elt(Map m) { + String v = m.get("name").toString().toLowerCase(); + if ( v.startsWith("--") ) + return v.substring(2); + else if ( v.startsWith("-") ) + return v.substring(1); + else + throw new RuntimeException("Expect to see arguments beginning with at least one -, but found " + v); + } + } + + /** + * Utility function that finds the value of fieldName in any fields of ArgumentCollection fields in + * instance of class c. 
+ * + * @param instance the object to query for the field value + * @param fieldName the name of the field we are looking for in instance + * @return The value assigned to field in the ArgumentCollection, otherwise null + */ + private Object getFieldValue(Object instance, String fieldName) { + // + // subtle note. If you have a field named X that is an ArgumentCollection that + // contains a field X as well, you need only consider fields in the argumentCollection, not + // matching the argument itself. + // + // @ArgumentCollection + // protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + // + for ( Field field : JVMUtils.getAllFields(instance.getClass()) ) { + if ( field.isAnnotationPresent(ArgumentCollection.class) ) { + //System.out.printf("Searching for %s in argument collection field %s%n", fieldName, field); + Object fieldValue = JVMUtils.getFieldValue(field, instance); + Object value = getFieldValue(fieldValue, fieldName); + if ( value != null ) + return value; + } else if ( field.getName().equals(fieldName) ) { + return JVMUtils.getFieldValue(field, instance); + } + } + + return null; + } + + /** + * Pretty prints value + * * Assumes value != null * @param value * @return @@ -214,6 +331,11 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { return value.toString(); } + /** + * Attempt to instantiate class c, if possible. Returns null if this proves impossible. 
+ * @param c + * @return + */ private Object makeInstanceIfPossible(Class c) { Object instance = null; try { @@ -233,61 +355,16 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { // this last one is super dangerous, but some of these methods catch ClassNotFoundExceptions // and rethrow then as RuntimeExceptions catch (RuntimeException e) {} -// finally { -// if ( instance == null ) -// logger.warn(String.format("Unable to create instance of class %s => %s", c, instance)); -// } return instance; } - protected void addRelatedBindings(Map root) { - List> extraDocsData = new ArrayList>(); - // add in all of the explicitly related items - for ( final Class extraDocClass : toProcess.annotation.extraDocs() ) { - final GATKDocWorkUnit otherUnit = GATKDoclet.findWorkUnitForClass(extraDocClass, all); - if ( otherUnit == null ) - throw new ReviewedStingException("Requested extraDocs for class without any documentation: " + extraDocClass); - extraDocsData.add( - new HashMap(){{ - put("filename", otherUnit.filename); - put("name", otherUnit.name);}}); - - } - - List> hierarchyDocs = new ArrayList>(); - for (final GATKDocWorkUnit other : all ) { - final String relation = classRelationship(toProcess.clazz, other.clazz); - if ( relation != null ) - hierarchyDocs.add( - new HashMap(){{ - put("filename", other.filename); - put("relation", relation); - put("name", other.name);}}); - - } - - root.put("relatedDocs", hierarchyDocs); - root.put("extradocs", extraDocsData); - } - - private static final String classRelationship(Class me, Class other) { - if ( other.equals(me) ) - // no circular references - return null; - else if ( other.isAssignableFrom(me) ) - // toProcess is a superclass of other.clazz - return "superclass"; - else if ( me.isAssignableFrom(other) ) - // toProcess inherits from other.clazz - return "subclass"; - else - return null; - - } - - protected ParsingEngine createStandardGATKParsingEngine() { + /** + * Create an instance of the 
GATK parsing engine, for argument processing with GATKDoclet + * @return + */ + private ParsingEngine createStandardGATKParsingEngine() { CommandLineProgram clp = new CommandLineGATK(); try { CommandLineProgram.start(clp, new String[]{}, true); @@ -297,10 +374,25 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { } } + /** + * Gets the javadocs associated with field name in classDoc. Throws a + * runtime exception if this proves impossible. + * + * @param classDoc + * @param name + * @return + */ private FieldDoc getFieldDoc(ClassDoc classDoc, String name) { return getFieldDoc(classDoc, name, true); } + /** + * Recursive helper routine to getFieldDoc() + * @param classDoc + * @param name + * @param primary + * @return + */ private FieldDoc getFieldDoc(ClassDoc classDoc, String name, boolean primary) { //System.out.printf("Looking for %s in %s%n", name, classDoc.name()); for ( FieldDoc fieldDoc : classDoc.fields(false) ) { @@ -309,6 +401,8 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { return fieldDoc; Field field = HelpUtils.getFieldForFieldDoc(fieldDoc); + if ( field == null ) + throw new RuntimeException("Could not find the field corresponding to " + fieldDoc + ", presumably because the field is inaccessible"); if ( field.isAnnotationPresent(ArgumentCollection.class) ) { ClassDoc typeDoc = getRootDoc().classNamed(fieldDoc.type().qualifiedTypeName()); if ( typeDoc == null ) @@ -333,41 +427,141 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { return null; } + /** + * Returns a Pair of (main, synonym) names for argument with fullName s1 and + * shortName s2. The main is selected to be the longest of the two, provided + * it doesn't exceed MAX_DISPLAY_NAME, in which case the shorter is taken. 
+ * @param s1 + * @param s2 + * @return + */ + Pair displayNames(String s1, String s2) { + if ( s1 == null ) return new Pair(s2, null); + if ( s2 == null ) return new Pair(s1, null); + + String l = s1.length() > s2.length() ? s1 : s2; + String s = s1.length() > s2.length() ? s2 : s1; + + if ( l.length() > MAX_DISPLAY_NAME ) + return new Pair(s, l); + else + return new Pair(l, s); + } + + /** + * Returns a human readable string that describes the Type type of a GATK argument. + * + * This will include parameterized types, so that Set{T} shows up as Set(T) and not + * just Set in the docs. + * + * @param type + * @return + */ + protected String argumentTypeString(Type type) { + if (type instanceof ParameterizedType) { + ParameterizedType parameterizedType = (ParameterizedType)type; + List subs = new ArrayList(); + for (Type actualType: parameterizedType.getActualTypeArguments()) + subs.add(argumentTypeString(actualType)); + return argumentTypeString(((ParameterizedType)type).getRawType()) + "[" + Utils.join(",", subs) + "]"; + } else if (type instanceof GenericArrayType) { + return argumentTypeString(((GenericArrayType)type).getGenericComponentType()) + "[]"; + } else if (type instanceof WildcardType) { + throw new RuntimeException("We don't support wildcards in arguments: " + type); + } else if (type instanceof Class) { + return ((Class) type).getSimpleName(); + } else { + throw new StingException("Unknown type: " + type); + } + } + + /** + * Helper routine that returns the Feature.class required by a RodBinding, + * either T for RodBinding{T} or List{RodBinding{T}}. Returns null if + * the Type doesn't fit either model. 
+ * @param type + * @return + */ + protected Class getFeatureTypeIfPossible(Type type) { + if ( type instanceof ParameterizedType) { + ParameterizedType paramType = (ParameterizedType)type; + if ( RodBinding.class.isAssignableFrom((Class)paramType.getRawType()) ) { + return (Class)JVMUtils.getParameterizedTypeClass(type); + } else { + for ( Type paramtype : paramType.getActualTypeArguments() ) { + Class x = getFeatureTypeIfPossible(paramtype); + if ( x != null ) + return x; + } + } + } + + return null; + } + + /** + * High-level entry point for creating a FreeMarker map describing the GATK argument + * source with definition def, with associated javadoc fieldDoc. + * @param fieldDoc + * @param source + * @param def + * @return a non-null Map binding argument keys with their values + */ protected Map docForArgument(FieldDoc fieldDoc, ArgumentSource source, ArgumentDefinition def) { Map root = new HashMap(); - root.put("name", def.shortName != null ? "-" + def.shortName : "--" + def.fullName ); + Pair names = displayNames("-" + def.shortName, "--" + def.fullName); - if ( def.shortName != null && def.fullName != null ) - root.put("synonyms", "--" + def.fullName); + root.put("name", names.getFirst() ); + + if ( names.getSecond() != null ) + root.put("synonyms", names.getSecond()); root.put("required", def.required ? 
"yes" : "no"); - root.put("type", def.argumentType.getSimpleName()); + + // type of the field + root.put("type", argumentTypeString(source.field.getGenericType())); + + Class featureClass = getFeatureTypeIfPossible(source.field.getGenericType()); + if ( featureClass != null ) { + // deal with the allowable types + FeatureManager manager = new FeatureManager(); + List rodTypes = new ArrayList(); + for (FeatureManager.FeatureDescriptor descriptor : manager.getByFeature(featureClass) ) { + rodTypes.add(String.format("%s", + GATKDocUtils.htmlFilenameForClass(descriptor.getCodecClass()), + descriptor.getName())); + } + + root.put("rodTypes", Utils.join(", ", rodTypes)); + } // summary and fulltext root.put("summary", def.doc != null ? def.doc : ""); root.put("fulltext", fieldDoc.commentText()); + // What are our enum options? + if ( def.validOptions != null ) + root.put("options", docForEnumArgument(source.field.getType())); + + // general attributes List attributes = new ArrayList(); - // this one below is just too much. - //attributes.add(def.ioType.annotationClass.getSimpleName()); if ( def.required ) attributes.add("required"); - // flag is just boolean, not interesting - //if ( def.isFlag ) attributes.add("flag"); - if ( def.isHidden ) attributes.add("hidden"); if ( source.isDeprecated() ) attributes.add("depreciated"); if ( attributes.size() > 0 ) root.put("attributes", Utils.join(", ", attributes)); - if ( def.validOptions != null ) { - root.put("options", docForEnumArgument(source.field.getType())); - } - return root; } + /** + * Helper routine that provides a FreeMarker map for an enumClass, grabbing the + * values of the enum and their associated javadoc documentation. + * @param enumClass + * @return + */ @Requires("enumClass.isEnum()") private List> docForEnumArgument(Class enumClass) { - ClassDoc doc = GATKDoclet.getClassDocForClass(rootDoc, enumClass); + ClassDoc doc = this.getDoclet().getClassDocForClass(enumClass); if ( doc == null ) // || ! 
doc.isEnum() ) throw new RuntimeException("Tried to get docs for enum " + enumClass + " but got instead: " + doc); @@ -381,5 +575,4 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { return bindings; } - } diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java index 4527c6afe..645ab34c1 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java @@ -32,13 +32,6 @@ import org.broadinstitute.sting.utils.classloader.JVMUtils; import java.lang.reflect.Field; public class HelpUtils { - protected static boolean implementsInterface(ProgramElementDoc classDoc, Class... interfaceClasses) { - for (Class interfaceClass : interfaceClasses) - if (assignableToClass(classDoc, interfaceClass, false)) - return true; - return false; - } - protected static boolean assignableToClass(ProgramElementDoc classDoc, Class lhsClass, boolean requireConcrete) { try { Class type = getClassForDoc(classDoc); @@ -74,4 +67,5 @@ public class HelpUtils { String.format("%s.%s", containingPackage.name(), doc.name()) : String.format("%s", doc.name()); } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java index 3718345a4..70417889b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java @@ -135,6 +135,11 @@ public class NWaySAMFileWriter implements SAMFileWriter { public void addAlignment(SAMRecord samRecord) { final SAMReaderID id = toolkit.getReaderIDForRead(samRecord); + String rg = samRecord.getStringAttribute("RG"); + if ( rg != null ) { + String rg_orig = toolkit.getReadsDataSource().getOriginalReadGroupId(rg); + 
samRecord.setAttribute("RG",rg_orig); + } writerMap.get(id).addAlignment(samRecord); } diff --git a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java index 61d53679a..9d4b23a8b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java @@ -156,12 +156,13 @@ public class ListFileUtils { FeatureManager.FeatureDescriptor descriptor = builderForValidation.getByTriplet(triplet); if ( descriptor == null ) throw new UserException.UnknownTribbleType(rodBinding.getTribbleType(), - String.format("Field %s had provided type %s but there's no such Tribble type. Available types are %s", - rodBinding.getName(), rodBinding.getTribbleType(), builderForValidation.userFriendlyListOfAvailableFeatures())); + String.format("Field %s had provided type %s but there's no such Tribble type. The compatible types are: %n%s", + rodBinding.getName(), rodBinding.getTribbleType(), builderForValidation.userFriendlyListOfAvailableFeatures(rodBinding.getType()))); if ( ! rodBinding.getType().isAssignableFrom(descriptor.getFeatureClass()) ) throw new UserException.BadArgumentValue(rodBinding.getName(), - String.format("Field %s expected type %s, but the type of the input file provided on the command line was %s. Please make sure that you have provided the correct file type and/or that you are not binding your rod to a name matching one of the available types.", - rodBinding.getName(), rodBinding.getType(), descriptor.getName())); + String.format("Field %s expects Features of type %s, but the input file produces Features of type %s. 
The compatible types are: %n%s", + rodBinding.getName(), rodBinding.getType().getSimpleName(), descriptor.getSimpleFeatureName(), + builderForValidation.userFriendlyListOfAvailableFeatures(rodBinding.getType()))); rodBindings.add(triplet); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index ca3399c78..888dc1e98 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -583,24 +583,24 @@ public class VariantContext implements Feature { // to enable tribble intergrati /** * @return true if the alleles indicate a simple insertion (i.e., the reference allele is Null) */ - public boolean isInsertion() { - // can't just call !isDeletion() because of complex indels - return getType() == Type.INDEL && getReference().isNull(); + public boolean isSimpleInsertion() { + // can't just call !isSimpleDeletion() because of complex indels + return getType() == Type.INDEL && getReference().isNull() && isBiallelic(); } /** * @return true if the alleles indicate a simple deletion (i.e., a single alt allele that is Null) */ - public boolean isDeletion() { - // can't just call !isInsertion() because of complex indels - return getType() == Type.INDEL && getAlternateAllele(0).isNull(); + public boolean isSimpleDeletion() { + // can't just call !isSimpleInsertion() because of complex indels + return getType() == Type.INDEL && getAlternateAllele(0).isNull() && isBiallelic(); } /** * @return true if the alleles indicate neither a simple deletion nor a simple insertion */ public boolean isComplexIndel() { - return isIndel() && !isDeletion() && !isInsertion(); + return isIndel() && !isSimpleDeletion() && !isSimpleInsertion(); } public boolean isSymbolic() { @@ -822,8 +822,11 @@ public class VariantContext implements Feature { // to 
enable tribble intergrati // --------------------------------------------------------------------------------------------------------- private void loadGenotypes() { - if ( !hasAttribute(UNPARSED_GENOTYPE_MAP_KEY) ) + if ( !hasAttribute(UNPARSED_GENOTYPE_MAP_KEY) ) { + if ( genotypes == null ) + genotypes = NO_GENOTYPES; return; + } Object parserObj = getAttribute(UNPARSED_GENOTYPE_PARSER_KEY); if ( parserObj == null || !(parserObj instanceof VCFParser) ) @@ -1080,8 +1083,8 @@ public class VariantContext implements Feature { // to enable tribble intergrati } public void validateReferenceBases(Allele reference, Byte paddedRefBase) { - // don't validate if we're an insertion - if ( !reference.isNull() && !reference.basesMatch(getReference()) ) { + // don't validate if we're an insertion or complex event + if ( !reference.isNull() && getReference().length() == 1 && !reference.basesMatch(getReference()) ) { throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, %s vs. %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString())); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index fa039b42e..834ad0917 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -345,11 +345,33 @@ public class VariantContextUtils { } public enum GenotypeMergeType { - UNIQUIFY, PRIORITIZE, UNSORTED, REQUIRE_UNIQUE + /** + * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD. + */ + UNIQUIFY, + /** + * Take genotypes in priority order (see the priority argument). + */ + PRIORITIZE, + /** + * Take the genotypes in any order. 
+ */ + UNSORTED, + /** + * Require that all samples/genotypes be unique between all inputs. + */ + REQUIRE_UNIQUE } public enum FilteredRecordMergeType { - KEEP_IF_ANY_UNFILTERED, KEEP_IF_ALL_UNFILTERED + /** + * Union - leaves the record if any record is unfiltered. + */ + KEEP_IF_ANY_UNFILTERED, + /** + * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. + */ + KEEP_IF_ALL_UNFILTERED } /** diff --git a/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java index 79e9172dd..013a37a88 100755 --- a/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.commandline; import org.broad.tribble.Feature; -import org.broadinstitute.sting.gatk.refdata.features.beagle.BeagleFeature; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.testng.Assert; @@ -35,7 +34,6 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; -import javax.script.Bindings; import java.util.List; import java.util.EnumSet; /** diff --git a/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java index f3e868474..3a242cb13 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java @@ -88,14 +88,6 @@ public class GATKArgumentCollectionUnitTest extends BaseTest { 
collect.intervals.add("intervals".toLowerCase()); collect.excludeIntervals = new ArrayList(); collect.numberOfThreads = 1; - - // make some rod bindings up - ArrayList fakeBindings = new ArrayList(); - fakeBindings.add("Bind1"); - fakeBindings.add("Bind2"); - fakeBindings.add("Bind3"); - - collect.RODBindings = fakeBindings; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java index f782580e2..85ae1e1f7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -10,7 +10,7 @@ import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; +import org.broadinstitute.sting.utils.codecs.table.TableFeature; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java index bd4f93d24..d45f6e667 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java @@ -4,7 +4,7 @@ import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.testng.Assert; import 
org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; +import org.broadinstitute.sting.utils.codecs.table.TableFeature; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType; diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java index fbd30bc8a..1e39fd26f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java @@ -31,7 +31,7 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; +import org.broadinstitute.sting.utils.codecs.table.TableFeature; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java index 5d662ffed..bae8e99ed 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java @@ -29,8 +29,8 @@ import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broad.tribble.Feature; import org.broad.tribble.FeatureCodec; import org.broadinstitute.sting.BaseTest; -import 
org.broadinstitute.sting.gatk.refdata.features.table.BedTableCodec; -import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; +import org.broadinstitute.sting.utils.codecs.table.BedTableCodec; +import org.broadinstitute.sting.utils.codecs.table.TableFeature; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; @@ -54,7 +54,8 @@ import java.util.*; public class FeatureManagerUnitTest extends BaseTest { private static final File RANDOM_FILE = new File(validationDataLocation + "exampleGATKReport.eval"); private static final File VCF3_FILE = new File(validationDataLocation + "vcfexample3.vcf"); - private static final File VCF4_FILE = new File(validationDataLocation + "vcf4.1.example.vcf"); + private static final File VCF4_FILE = new File(testDir + "HiSeq.10000.vcf"); + private static final File VCF4_FILE_GZ = new File(testDir + "HiSeq.10000.vcf.gz"); private FeatureManager manager; private GenomeLocParser genomeLocParser; @@ -98,7 +99,8 @@ public class FeatureManagerUnitTest extends BaseTest { } public String toString() { - return String.format("FMTest name=%s codec=%s feature=%s file=%s", name, codec, feature, associatedFile); + return String.format("FMTest name=%s codec=%s feature=%s file=%s", + name, codec.getSimpleName(), feature.getSimpleName(), associatedFile); } } @@ -106,6 +108,7 @@ public class FeatureManagerUnitTest extends BaseTest { public Object[][] createTests() { new FMTest(VariantContext.class, VCF3Codec.class, "VCF3", VCF3_FILE); new FMTest(VariantContext.class, VCFCodec.class, "VCF", VCF4_FILE); + new FMTest(VariantContext.class, VCFCodec.class, "VCF", VCF4_FILE_GZ); new FMTest(TableFeature.class, BedTableCodec.class, "bedtable", null); return FMTest.getTests(FMTest.class); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java index a129f8adf..1565c419b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java @@ -37,8 +37,8 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest { "-R " + hg18Reference + " -T ClipReads " + "-I " + validationDataLocation + "clippingReadsTest.bam " + - "-o %s " + - "-ob %s " + args, + "-os %s " + + "-o %s " + args, 2, // just one output file Arrays.asList("tmp", "bam"), Arrays.asList(md51, md52)); @@ -72,9 +72,9 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest { " -I " + validationDataLocation + "originalQuals.chr1.1-1K.bam" + " -L chr1:1-1,000" + " -OQ -QT 4 -CR WRITE_Q0S" + - " -o %s -ob %s", + " -o %s -os %s", 2, - Arrays.asList("55c01ccc2e84481b22d3632cdb06c8ba", "22db22749f811d30216215e047461621")); + Arrays.asList("22db22749f811d30216215e047461621", "55c01ccc2e84481b22d3632cdb06c8ba")); executeTest("clipOriginalQuals", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index f54bfa40c..832079807 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -133,7 +133,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { validationDataLocation + "1000G.exomes.vcf --snpEffFile " + validationDataLocation + "snpEff_1.9.6_1000G.exomes.vcf_hg37.61.out -L 1:26,000,000-26,500,000", 1, - Arrays.asList("c08648a078368c80530bff004b3157f1") + Arrays.asList("03eae1dab19a9358250890594bf53607") ); executeTest("Testing SnpEff annotations", spec); } 
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 1de9a72d8..3503a2353 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -29,7 +29,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("bced1842c78fbabb089dd12b7087050d") + Arrays.asList("1fefd6cf9c2554d5f886c3998defd4d0") ); executeTest("testFundamentalsCountVariantsSNPsandIndels", spec); } @@ -50,7 +50,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("06510bd37ffaa39e817ca0dcaf8f8ac2") + Arrays.asList("d470e00a368b5a0468012818994c6a89") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNovelty", spec); } @@ -72,7 +72,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("19c5b1b6396921c5b1059a2849ae4fcc") + Arrays.asList("12856e52c2682328f91594089328596c") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNoveltyAndFilter", spec); } @@ -93,7 +93,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("a71f8d81cf166cd97ac628092650964a") + Arrays.asList("91610b9240f64e0eb03cfd2602cf57af") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithCpG", spec); } @@ -114,7 +114,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("4dabe0658232f6174188515db6dfe112") + Arrays.asList("e40b77e7ed6581328e373a24b93cd170") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithFunctionalClass", spec); } @@ -135,7 +135,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("3340587f10ceff83e5567ddfd1a9a60e") + 
Arrays.asList("15beaf3823c131cabc5fb0445239f978") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithDegeneracy", spec); } @@ -156,7 +156,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("c730c7ee31c8138cef6efd8dd04fbbfc") + Arrays.asList("7ddd4ee74938d229ce5cb7b9b9104abe") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithSample", spec); } @@ -179,7 +179,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("2559ca8f454b03e81561f6947f79df18") + Arrays.asList("a90f33906a732ef5eb346e559c96ccc1") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithJexlExpression", spec); } @@ -204,7 +204,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("23aa5f97641d2fd033095f21c51d2f37") + Arrays.asList("2567f90d3d7354850c5a59730ecc6e4f") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithMultipleJexlExpressions", spec); } @@ -223,7 +223,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("a69dd3f06903b3f374c6d6f010c653e0") + Arrays.asList("fa091aa8967893389c51102fd9f0bebb") ); executeTest("testFundamentalsCountVariantsNoCompRod", spec); } @@ -236,7 +236,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + " --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf"; WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s", - 1, Arrays.asList("125fe0a04b5d933cc14016598b2791cd")); + 1, Arrays.asList("2df4f8911ffc3c8d042298723ed465f8")); executeTestParallel("testSelect1", spec); } @@ -253,7 +253,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testCompVsEvalAC() { String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG 
-EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("d1932be3748fcf6da77dc51aec323710")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("ed54aa127b173d8ad8b6482f2a929a42")); executeTestParallel("testCompVsEvalAC",spec); } @@ -283,7 +283,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --dbsnp " + b37dbSNP132 + " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("cbea5f9f8c046d4c014d261db352c43b")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("18c44636e36d6657110bf984f8eac181")); executeTestParallel("testEvalTrackWithoutGenotypes",spec); } @@ -295,7 +295,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("d07a246963ae609643620c839b20cd1e")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("1b8ae4fd10de0888bd843f833859d990")); executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec); } @@ -373,7 +373,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("44464fe7c89a56cf128a932ef640f7da") + Arrays.asList("da65fc8f0d0eeaf0a0b06a07f444bb8e") ); executeTest("testAlleleCountStrat", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 4abf0a102..3267173a7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -38,21 +38,21 @@ public class CombineVariantsIntegrationTest extends WalkerTest { return "-T CombineVariants -NO_HEADER -L 1:1-50,000,000 -o %s -R " + b36KGReference + args; } - public void test1InOut(String file, String md5, boolean vcf3) { - test1InOut(file, md5, "", vcf3); + public void test1InOut(String file, String md5) { + test1InOut(file, md5, ""); } - public void test1InOut(String file, String md5, String args, boolean vcf3) { + public void test1InOut(String file, String md5, String args) { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -priority v1 -V:v1,VCF" + (vcf3 ? "3 " : " ") + validationDataLocation + file + args), + baseTestString(" -priority v1 -V:v1 " + validationDataLocation + file + args), 1, Arrays.asList(md5)); executeTest("testInOut1--" + file, spec); } - public void combine2(String file1, String file2, String args, String md5, boolean vcf3) { + public void combine2(String file1, String file2, String args, String md5) { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -priority v1,v2 -V:v1,VCF" + (vcf3 ? "3 " : " ") + validationDataLocation + file1 + " -V:v2,VCF" + (vcf3 ? 
"3 " : " ") + validationDataLocation + file2 + args), + baseTestString(" -priority v1,v2 -V:v1 " + validationDataLocation + file1 + " -V:v2 "+ validationDataLocation + file2 + args), 1, Arrays.asList(md5)); executeTest("combine2 1:" + new File(file1).getName() + " 2:" + new File(file2).getName(), spec); @@ -78,23 +78,23 @@ public class CombineVariantsIntegrationTest extends WalkerTest { executeTest("combine PLs 1:" + new File(file1).getName() + " 2:" + new File(file2).getName(), spec); } - @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "c608b9fc1e36dba6cebb4f259883f9f0", true); } - @Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "20caad94411d6ab48153b214de916df8", " -setKey foo", true); } - @Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "004f3065cb1bc2ce2f9afd695caf0b48", " -setKey null", true); } - @Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "c9c901ff9ef2a982624b203a8086dff0", false); } // official project VCF files in tabix format + @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "c608b9fc1e36dba6cebb4f259883f9f0"); } + @Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "20caad94411d6ab48153b214de916df8", " -setKey foo"); } + @Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "004f3065cb1bc2ce2f9afd695caf0b48", " -setKey null"); } + @Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "c9c901ff9ef2a982624b203a8086dff0"); } // official project VCF files in tabix format - @Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "7593be578d4274d672fc22fced38012b", false); } - @Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "1cd467863c4e948fadd970681552d57e", false); } + @Test public void test1Indel1() { 
test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "7593be578d4274d672fc22fced38012b"); } + @Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "1cd467863c4e948fadd970681552d57e"); } @Test public void combineWithPLs() { combinePLs("combine.3.vcf", "combine.4.vcf", "0f873fed02aa99db5b140bcd6282c10a"); } - @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "1d5a021387a8a86554db45a29f66140f", false); } // official project VCF files in tabix format - @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "20163d60f18a46496f6da744ab5cc0f9", false); } // official project VCF files in tabix format - @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "f1cf095c2fe9641b7ca1f8ee2c46fd4a", false); } + @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "1d5a021387a8a86554db45a29f66140f"); } // official project VCF files in tabix format + @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "20163d60f18a46496f6da744ab5cc0f9"); } // official project VCF files in tabix format + @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "f1cf095c2fe9641b7ca1f8ee2c46fd4a"); } - @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e144b6283765494bfe8189ac59965083", false); } + @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", 
"CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e144b6283765494bfe8189ac59965083"); } - @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "89f55abea8f59e39d1effb908440548c", true); } + @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "89f55abea8f59e39d1effb908440548c"); } @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "4836086891f6cbdd40eebef3076d215a"); } @Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "6a34b5d743efda8b2f3b639f3a2f5de8"); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index bec0d5dd4..cec377f5f 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -7,7 +7,7 @@ import java.util.Arrays; public class SelectVariantsIntegrationTest extends WalkerTest { public static String baseTestString(String args) { - return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s" + args; + return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s -NO_HEADER" + args; } @Test @@ -16,7 +16,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant:VCF3 " + testfile + " -NO_HEADER"), + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' 
--variant:VCF3 " + testfile), 1, Arrays.asList("d18516c1963802e92cb9e425c0b75fd6") ); @@ -24,12 +24,26 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testComplexSelection--" + testfile, spec); } + @Test + public void testSampleExclusion() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s -NO_HEADER -xl_sn A -xl_sf " + samplesFile + " --variant:VCF3 " + testfile, + 1, + Arrays.asList("730f021fd6ecf1d195dabbee2e233bfd") + ); + + executeTest("testSampleExclusion--" + testfile, spec); + } + @Test public void testRepeatedLineSelection() { String testfile = validationDataLocation + "test.dup.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -sn B -sn C --variant:VCF3 " + testfile + " -NO_HEADER"), + baseTestString(" -sn A -sn B -sn C --variant:VCF3 " + testfile), 1, Arrays.asList("b74038779fe6485dbb8734ae48178356") ); diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java new file mode 100644 index 000000000..3dfd0550d --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The 
above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.jna.drmaa.v1_0; + +import org.apache.commons.io.FileUtils; +import org.broadinstitute.sting.BaseTest; +import org.ggf.drmaa.*; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +public class JnaSessionIntegrationTest extends BaseTest { + private static final SessionFactory factory = new JnaSessionFactory(); + + @Test + public void testDrmaa() throws Exception { + Session session = factory.getSession(); + Version version = session.getVersion(); + System.out.println(String.format("DRMAA version: %d.%d", version.getMajor(), version.getMinor())); + System.out.println(String.format("DRMAA contact(s): %s", session.getContact())); + System.out.println(String.format("DRM system(s): %s", session.getDrmSystem())); + System.out.println(String.format("DRMAA implementation(s): %s", session.getDrmaaImplementation())); + } + + @Test + public void testSubmitEcho() throws Exception { + File outFile = createNetworkTempFile("JnaSessionIntegrationTest-", ".out"); + Session session = factory.getSession(); + session.init(null); + try { + JobTemplate template = session.createJobTemplate(); + template.setRemoteCommand("sh"); + template.setOutputPath(":" + outFile.getAbsolutePath()); + template.setJoinFiles(true); + template.setArgs(Arrays.asList("-c", "echo \"Hello world.\"")); + 
+ String jobId = session.runJob(template); + System.out.println(String.format("Job id %s", jobId)); + session.deleteJobTemplate(template); + + System.out.println("Waiting for job to run: " + jobId); + int remotePs = Session.QUEUED_ACTIVE; + + List runningStatuses = Arrays.asList(Session.QUEUED_ACTIVE, Session.RUNNING); + + while (runningStatuses.contains(remotePs)) { + Thread.sleep(30 * 1000L); + remotePs = session.getJobProgramStatus(jobId); + } + + Assert.assertEquals(remotePs, Session.DONE, "Job status is not DONE."); + + JobInfo jobInfo = session.wait(jobId, Session.TIMEOUT_NO_WAIT); + + Assert.assertTrue(jobInfo.hasExited(), String.format("Job did not exit cleanly: %s", jobId)); + Assert.assertEquals(jobInfo.getExitStatus(), 0, String.format("Exit status for jobId %s is non-zero", jobId)); + if (jobInfo.hasSignaled()) + Assert.fail(String.format("JobId %s exited with signal %s and core dump flag %s", jobId, jobInfo.getTerminatingSignal(), jobInfo.hasCoreDump())); + Assert.assertFalse(jobInfo.wasAborted(), String.format("Job was aborted: %s", jobId)); + } finally { + session.exit(); + } + + Assert.assertTrue(FileUtils.waitFor(outFile, 120), "File not found: " + outFile.getAbsolutePath()); + System.out.println("--- output ---"); + System.out.println(FileUtils.readFileToString(outFile)); + System.out.println("--- output ---"); + Assert.assertTrue(outFile.delete(), "Unable to delete " + outFile.getAbsolutePath()); + System.out.println("Validating that we reached the end of the test without exit."); + } + + @Test + public void testCollectionConversions() { + Collection list = Arrays.asList("a=1", "foo=bar", "empty="); + Map map = new LinkedHashMap(); + map.put("a", "1"); + map.put("foo", "bar"); + map.put("empty", ""); + + Assert.assertEquals(JnaSession.collectionToMap(list), map); + Assert.assertEquals(JnaSession.mapToCollection(map), list); + } + + @Test + public void testLimitConversions() { + Assert.assertEquals(JnaSession.formatLimit(0), "0:00:00"); + 
Assert.assertEquals(JnaSession.formatLimit(59), "0:00:59"); + Assert.assertEquals(JnaSession.formatLimit(60), "0:01:00"); + Assert.assertEquals(JnaSession.formatLimit(3540), "0:59:00"); + Assert.assertEquals(JnaSession.formatLimit(3599), "0:59:59"); + Assert.assertEquals(JnaSession.formatLimit(7200), "2:00:00"); + Assert.assertEquals(JnaSession.formatLimit(7260), "2:01:00"); + Assert.assertEquals(JnaSession.formatLimit(7261), "2:01:01"); + + Assert.assertEquals(JnaSession.parseLimit("0"), 0); + Assert.assertEquals(JnaSession.parseLimit("00"), 0); + Assert.assertEquals(JnaSession.parseLimit("0:00"), 0); + Assert.assertEquals(JnaSession.parseLimit("00:00"), 0); + Assert.assertEquals(JnaSession.parseLimit("0:00:00"), 0); + + Assert.assertEquals(JnaSession.parseLimit("1"), 1); + Assert.assertEquals(JnaSession.parseLimit("01"), 1); + Assert.assertEquals(JnaSession.parseLimit("0:01"), 1); + Assert.assertEquals(JnaSession.parseLimit("00:01"), 1); + Assert.assertEquals(JnaSession.parseLimit("0:00:01"), 1); + + Assert.assertEquals(JnaSession.parseLimit("10"), 10); + Assert.assertEquals(JnaSession.parseLimit("0:10"), 10); + Assert.assertEquals(JnaSession.parseLimit("00:10"), 10); + Assert.assertEquals(JnaSession.parseLimit("0:00:10"), 10); + + Assert.assertEquals(JnaSession.parseLimit("1:0"), 60); + Assert.assertEquals(JnaSession.parseLimit("1:00"), 60); + Assert.assertEquals(JnaSession.parseLimit("01:00"), 60); + Assert.assertEquals(JnaSession.parseLimit("0:01:00"), 60); + + Assert.assertEquals(JnaSession.parseLimit("1:00:00"), 3600); + + Assert.assertEquals(JnaSession.parseLimit("1:02:03"), 3723); + } +} diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java new file mode 100644 index 000000000..ac2064640 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java @@ -0,0 +1,236 @@ +/* + * 
Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.jna.drmaa.v1_0; + +import com.sun.jna.Memory; +import com.sun.jna.NativeLong; +import com.sun.jna.Pointer; +import com.sun.jna.StringArray; +import com.sun.jna.ptr.IntByReference; +import com.sun.jna.ptr.PointerByReference; +import org.apache.commons.io.FileUtils; +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.List; + +public class LibDrmaaIntegrationTest extends BaseTest { + + @Test + public void testDrmaa() throws Exception { + Memory error = new Memory(LibDrmaa.DRMAA_ERROR_STRING_BUFFER); + int errnum; + + IntByReference major = new IntByReference(); + IntByReference minor = new IntByReference(); + Memory contact = new Memory(LibDrmaa.DRMAA_CONTACT_BUFFER); + Memory drmSystem = new Memory(LibDrmaa.DRMAA_DRM_SYSTEM_BUFFER); + Memory drmaaImplementation = new Memory(LibDrmaa.DRMAA_DRMAA_IMPLEMENTATION_BUFFER); + + errnum = LibDrmaa.drmaa_version(major, minor, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not get version from the DRMAA library: %s", error.getString(0))); + + System.out.println(String.format("DRMAA version: %d.%d", major.getValue(), minor.getValue())); + + errnum = LibDrmaa.drmaa_get_contact(contact, LibDrmaa.DRMAA_CONTACT_BUFFER_LEN, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not get contacts from the DRMAA library: %s", error.getString(0))); + + System.out.println(String.format("DRMAA contact(s): %s", contact.getString(0))); + + errnum = LibDrmaa.drmaa_get_DRM_system(drmSystem, LibDrmaa.DRMAA_DRM_SYSTEM_BUFFER_LEN, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not get DRM system from the DRMAA 
library: %s", error.getString(0))); + + System.out.println(String.format("DRM system(s): %s", drmSystem.getString(0))); + + errnum = LibDrmaa.drmaa_get_DRMAA_implementation(drmaaImplementation, LibDrmaa.DRMAA_DRMAA_IMPLEMENTATION_BUFFER_LEN, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not get DRMAA implementation from the DRMAA library: %s", error.getString(0))); + + System.out.println(String.format("DRMAA implementation(s): %s", drmaaImplementation.getString(0))); + } + + @Test + public void testSubmitEcho() throws Exception { + Memory error = new Memory(LibDrmaa.DRMAA_ERROR_STRING_BUFFER); + int errnum; + + File outFile = createNetworkTempFile("LibDrmaaIntegrationTest-", ".out"); + + errnum = LibDrmaa.drmaa_init(null, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not initialize the DRMAA library: %s", error.getString(0))); + + try { + PointerByReference jtRef = new PointerByReference(); + Pointer jt; + Memory jobIdMem = new Memory(LibDrmaa.DRMAA_JOBNAME_BUFFER); + String jobId; + IntByReference remotePs = new IntByReference(); + IntByReference stat = new IntByReference(); + PointerByReference rusage = new PointerByReference(); + + errnum = LibDrmaa.drmaa_allocate_job_template(jtRef, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not create job template: %s", error.getString(0))); + + jt = jtRef.getValue(); + + errnum = LibDrmaa.drmaa_set_attribute(jt, LibDrmaa.DRMAA_REMOTE_COMMAND, "sh", error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not set attribute \"%s\": %s", LibDrmaa.DRMAA_REMOTE_COMMAND, error.getString(0))); + + errnum = LibDrmaa.drmaa_set_attribute(jt, 
LibDrmaa.DRMAA_OUTPUT_PATH, ":" + outFile.getAbsolutePath(), error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not set attribute \"%s\": %s", LibDrmaa.DRMAA_OUTPUT_PATH, error.getString(0))); + + errnum = LibDrmaa.drmaa_set_attribute(jt, LibDrmaa.DRMAA_JOIN_FILES, "y", error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not set attribute \"%s\": %s", LibDrmaa.DRMAA_JOIN_FILES, error.getString(0))); + + StringArray args = new StringArray(new String[] { "-c", "echo \"Hello world.\"" }); + + errnum = LibDrmaa.drmaa_set_vector_attribute(jt, LibDrmaa.DRMAA_V_ARGV, args, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not set attribute \"%s\": %s", LibDrmaa.DRMAA_REMOTE_COMMAND, error.getString(0))); + + errnum = LibDrmaa.drmaa_run_job(jobIdMem, LibDrmaa.DRMAA_JOBNAME_BUFFER_LEN, jt, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not submit job: %s", error.getString(0))); + + jobId = jobIdMem.getString(0); + + System.out.println(String.format("Job id %s", jobId)); + + errnum = LibDrmaa.drmaa_delete_job_template(jt, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not delete job template: %s", error.getString(0))); + + System.out.println("Waiting for job to run: " + jobId); + remotePs.setValue(LibDrmaa.DRMAA_PS.DRMAA_PS_QUEUED_ACTIVE); + + List runningStatuses = Arrays.asList( + LibDrmaa.DRMAA_PS.DRMAA_PS_QUEUED_ACTIVE, LibDrmaa.DRMAA_PS.DRMAA_PS_RUNNING); + + while (runningStatuses.contains(remotePs.getValue())) { + Thread.sleep(30 * 1000L); + + errnum = LibDrmaa.drmaa_job_ps(jobId, remotePs, error, 
LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not get status for jobId %s: %s", jobId, error.getString(0))); + } + + Assert.assertEquals(remotePs.getValue(), LibDrmaa.DRMAA_PS.DRMAA_PS_DONE, "Job status is not DONE."); + + errnum = LibDrmaa.drmaa_wait(jobId, Pointer.NULL, new NativeLong(0), stat, LibDrmaa.DRMAA_TIMEOUT_NO_WAIT, + rusage, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Wait failed for jobId %s: %s", jobId, error.getString(0))); + + IntByReference exited = new IntByReference(); + IntByReference exitStatus = new IntByReference(); + IntByReference signaled = new IntByReference(); + Memory signal = new Memory(LibDrmaa.DRMAA_SIGNAL_BUFFER); + IntByReference coreDumped = new IntByReference(); + IntByReference aborted = new IntByReference(); + + errnum = LibDrmaa.drmaa_wifexited(exited, stat.getValue(), error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Exit check failed for jobId %s: %s", jobId, error.getString(0))); + + Assert.assertTrue(exited.getValue() != 0, String.format("Job did not exit cleanly: %s", jobId)); + + errnum = LibDrmaa.drmaa_wexitstatus(exitStatus, stat.getValue(), error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Exit status failed for jobId %s: %s", jobId, error.getString(0))); + + Assert.assertEquals(exitStatus.getValue(), 0, String.format("Exit status for jobId %s is non-zero", jobId)); + + errnum = LibDrmaa.drmaa_wifsignaled(signaled, stat.getValue(), error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Signaled check failed for jobId %s: %s", jobId, error.getString(0))); + + if (signaled.getValue() != 
0) { + errnum = LibDrmaa.drmaa_wtermsig(signal, LibDrmaa.DRMAA_SIGNAL_BUFFER_LEN, stat.getValue(), error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Signal lookup failed for jobId %s: %s", jobId, error.getString(0))); + + errnum = LibDrmaa.drmaa_wcoredump(coreDumped, stat.getValue(), error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Core dump check failed for jobId %s: %s", jobId, error.getString(0))); + + Assert.fail(String.format("JobId %s exited with signal %s and core dump flag %d", jobId, signal.getString(0), coreDumped.getValue())); + } + + errnum = LibDrmaa.drmaa_wifaborted(aborted, stat.getValue(), error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Aborted check failed for jobId %s: %s", jobId, error.getString(0))); + + Assert.assertTrue(aborted.getValue() == 0, String.format("Job was aborted: %s", jobId)); + + } finally { + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) { + LibDrmaa.drmaa_exit(error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + } else { + errnum = LibDrmaa.drmaa_exit(error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not shut down the DRMAA library: %s", error.getString(0))); + } + } + + Assert.assertTrue(FileUtils.waitFor(outFile, 120), "File not found: " + outFile.getAbsolutePath()); + System.out.println("--- output ---"); + System.out.println(FileUtils.readFileToString(outFile)); + System.out.println("--- output ---"); + Assert.assertTrue(outFile.delete(), "Unable to delete " + outFile.getAbsolutePath()); + System.out.println("Validating that we reached the end of the test without exit."); + } +} diff --git 
a/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java index 77db34cbc..b4fb5cfa3 100644 --- a/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java @@ -91,7 +91,7 @@ public class LibBatIntegrationTest extends BaseTest { } @Test - public void testSubmitEcho() throws InterruptedException { + public void testSubmitEcho() throws Exception { String queue = "hour"; File outFile = createNetworkTempFile("LibBatIntegrationTest-", ".out"); @@ -114,6 +114,10 @@ public class LibBatIntegrationTest extends BaseTest { req.command = "echo \"Hello world.\""; + String[] argv = {"", "-a", "tv"}; + int setOptionResult = LibBat.setOption_(argv.length, new StringArray(argv), "a:", req, ~0, ~0, ~0, null); + Assert.assertTrue(setOptionResult != -1, "setOption_ returned -1"); + submitReply reply = new submitReply(); long jobId = LibBat.lsb_submit(req, reply); @@ -142,6 +146,9 @@ public class LibBatIntegrationTest extends BaseTest { Assert.assertTrue(Utils.isFlagSet(jobStatus, LibBat.JOB_STAT_DONE), String.format("Unexpected job status: 0x%02x", jobStatus)); Assert.assertTrue(FileUtils.waitFor(outFile, 120), "File not found: " + outFile.getAbsolutePath()); + System.out.println("--- output ---"); + System.out.println(FileUtils.readFileToString(outFile)); + System.out.println("--- output ---"); Assert.assertTrue(outFile.delete(), "Unable to delete " + outFile.getAbsolutePath()); Assert.assertEquals(reply.queue, req.queue, "LSF reply queue does not match requested queue."); System.out.println("Validating that we reached the end of the test without exit."); diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java index 
d8fa0eae4..f8e6da20a 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java @@ -146,8 +146,8 @@ public class VariantContextUnitTest { Assert.assertEquals(vc.getType(), VariantContext.Type.SNP); Assert.assertTrue(vc.isSNP()); Assert.assertFalse(vc.isIndel()); - Assert.assertFalse(vc.isInsertion()); - Assert.assertFalse(vc.isDeletion()); + Assert.assertFalse(vc.isSimpleInsertion()); + Assert.assertFalse(vc.isSimpleDeletion()); Assert.assertFalse(vc.isMixed()); Assert.assertTrue(vc.isBiallelic()); Assert.assertEquals(vc.getNAlleles(), 2); @@ -173,8 +173,8 @@ public class VariantContextUnitTest { Assert.assertEquals(VariantContext.Type.NO_VARIATION, vc.getType()); Assert.assertFalse(vc.isSNP()); Assert.assertFalse(vc.isIndel()); - Assert.assertFalse(vc.isInsertion()); - Assert.assertFalse(vc.isDeletion()); + Assert.assertFalse(vc.isSimpleInsertion()); + Assert.assertFalse(vc.isSimpleDeletion()); Assert.assertFalse(vc.isMixed()); Assert.assertFalse(vc.isBiallelic()); Assert.assertEquals(vc.getNAlleles(), 1); @@ -199,8 +199,8 @@ public class VariantContextUnitTest { Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); Assert.assertFalse(vc.isSNP()); Assert.assertTrue(vc.isIndel()); - Assert.assertFalse(vc.isInsertion()); - Assert.assertTrue(vc.isDeletion()); + Assert.assertFalse(vc.isSimpleInsertion()); + Assert.assertTrue(vc.isSimpleDeletion()); Assert.assertFalse(vc.isMixed()); Assert.assertTrue(vc.isBiallelic()); Assert.assertEquals(vc.getNAlleles(), 2); @@ -226,8 +226,8 @@ public class VariantContextUnitTest { Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); Assert.assertFalse(vc.isSNP()); Assert.assertTrue(vc.isIndel()); - Assert.assertTrue(vc.isInsertion()); - Assert.assertFalse(vc.isDeletion()); + Assert.assertTrue(vc.isSimpleInsertion()); + Assert.assertFalse(vc.isSimpleDeletion()); 
Assert.assertFalse(vc.isMixed()); Assert.assertTrue(vc.isBiallelic()); Assert.assertEquals(vc.getNAlleles(), 2); @@ -433,7 +433,7 @@ public class VariantContextUnitTest { Assert.assertFalse(vc14.isBiallelic()); Assert.assertTrue(vc5.isIndel()); - Assert.assertTrue(vc5.isDeletion()); + Assert.assertTrue(vc5.isSimpleDeletion()); Assert.assertTrue(vc5.isVariant()); Assert.assertTrue(vc5.isBiallelic()); diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index 47ba0220f..724518142 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -84,12 +84,6 @@ class DataProcessingPipeline extends QScript { var nContigs: Int = 0 // Use the number of contigs for scatter gathering jobs var cleanModelEnum: ConsensusDeterminationModel = ConsensusDeterminationModel.USE_READS - if (cleaningModel == "KNOWNS_ONLY") { - cleanModelEnum = ConsensusDeterminationModel.KNOWNS_ONLY - } - else if (cleaningModel == "USE_SW") { - cleanModelEnum = ConsensusDeterminationModel.USE_SW - } @@ -148,9 +142,9 @@ class DataProcessingPipeline extends QScript { println (f) println() - val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".bam") + val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list") sampleBamFiles(sample) = sampleFileName - add(joinBams(flist, sampleFileName)) + add(writeList(flist, sampleFileName)) } println("*** INPUT FILES ***\n\n") @@ -176,18 +170,20 @@ class DataProcessingPipeline extends QScript { var realignedBams: List[File] = List() var index = 1 for (bam <- bams) { - val readSortedBam = swapExt(bam, ".bam", "." 
+ index + ".sorted.bam" ) + // first revert the BAM file to the original qualities + val revertedBAM = revertBAM(bam) + val readSortedBam = swapExt(revertedBAM, ".bam", "." + index + ".sorted.bam" ) val saiFile1 = swapExt(bam, ".bam", "." + index + ".1.sai") val saiFile2 = swapExt(bam, ".bam", "." + index + ".2.sai") val realignedSamFile = swapExt(bam, ".bam", "." + index + ".realigned.sam") val realignedBamFile = swapExt(bam, ".bam", "." + index + ".realigned.bam") val rgRealignedBamFile = swapExt(bam, ".bam", "." + index + ".realigned.rg.bam") if (useBWAse) { - add(bwa_aln_se(bam, saiFile1), - bwa_sam_se(bam, saiFile1, realignedSamFile)) + add(bwa_aln_se(revertedBAM, saiFile1), + bwa_sam_se(revertedBAM, saiFile1, realignedSamFile)) } else { - add(sortSam(bam, readSortedBam, SortOrder.queryname), + add(sortSam(revertedBAM, readSortedBam, SortOrder.queryname), bwa_aln_pe(readSortedBam, saiFile1, 1), bwa_aln_pe(readSortedBam, saiFile2, 2), bwa_sam_pe(readSortedBam, saiFile1, saiFile2, realignedSamFile)) @@ -200,6 +196,27 @@ class DataProcessingPipeline extends QScript { return realignedBams } + def getIndelCleaningModel(): ConsensusDeterminationModel = { + if (cleaningModel == "KNOWNS_ONLY") + ConsensusDeterminationModel.KNOWNS_ONLY + else if (cleaningModel == "USE_SW") + ConsensusDeterminationModel.USE_SW + else + ConsensusDeterminationModel.USE_READS + } + + def revertBams(bams: List[File]): List[File] = { + var revertedBAMList: List[File] = List() + for (bam <- bams) + revertedBAMList :+= revertBAM(bam) + return revertedBAMList + } + + def revertBAM(bam: File): File = { + val revertedBAM = swapExt(bam, ".bam", ".reverted.bam") + add(revert(bam, revertedBAM)) + return revertedBAM + } /**************************************************************************** * Main script @@ -208,21 +225,23 @@ class DataProcessingPipeline extends QScript { def script = { + cleanModelEnum = getIndelCleaningModel() + // keep a record of the number of contigs in the first bam file 
in the list val bams = QScriptUtils.createListFromFile(input) nContigs = QScriptUtils.getNumberOfContigs(bams(0)) - val realignedBams = if (useBWApe || useBWAse) {performAlignment(bams)} else {bams} + val realignedBAMs = if (useBWApe || useBWAse) {performAlignment(bams)} else {revertBams(bams)} // Generate a BAM file per sample joining all per lane files if necessary - val sampleBamFiles: Map[String, File] = createSampleFiles(bams, realignedBams) + val sampleBAMFiles: Map[String, File] = createSampleFiles(bams, realignedBAMs) // Final output list of processed bam files var cohortList: List[File] = List() // Simple progress report println("\nFound the following samples: ") - for ((sample, file) <- sampleBamFiles) + for ((sample, file) <- sampleBAMFiles) println("\t" + sample + " -> " + file) println("\n") @@ -232,7 +251,8 @@ class DataProcessingPipeline extends QScript { add(target(null, globalIntervals)) // Put each sample through the pipeline - for ((sample, bam) <- sampleBamFiles) { + for ((sample, sampleFile) <- sampleBAMFiles) { + val bam = if (sampleFile.endsWith(".list")) {swapExt(sampleFile, ".list", ".bam")} else {sampleFile} // BAM files generated by the pipeline val cleanedBam = swapExt(bam, ".bam", ".clean.bam") @@ -249,17 +269,18 @@ class DataProcessingPipeline extends QScript { val preValidateLog = swapExt(bam, ".bam", ".pre.validation") val postValidateLog = swapExt(bam, ".bam", ".post.validation") + // Validation is an optional step for the BAM file generated after // alignment and the final bam file of the pipeline. 
- if (!noValidation) { - add(validate(bam, preValidateLog), + if (!noValidation && sampleFile.endsWith(".bam")) { // todo -- implement validation for .list BAM files + add(validate(sampleFile, preValidateLog), validate(recalBam, postValidateLog)) } if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY) - add(target(bam, targetIntervals)) + add(target(sampleFile, targetIntervals)) - add(clean(bam, targetIntervals, cleanedBam), + add(clean(sampleFile, targetIntervals, cleanedBam), dedup(cleanedBam, dedupedBam, metricsFile), cov(dedupedBam, preRecalFile), recal(dedupedBam, preRecalFile, recalBam), @@ -300,27 +321,26 @@ class DataProcessingPipeline extends QScript { } case class target (inBams: File, outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs { - if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY) + if (cleanModelEnum != ConsensusDeterminationModel.KNOWNS_ONLY) this.input_file :+= inBams this.out = outIntervals this.mismatchFraction = 0.0 - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + this.known :+= qscript.dbSNP if (indels != null) - this.rodBind :+= RodBind("indels", "VCF", indels) + this.known :+= qscript.indels this.scatterCount = nContigs this.analysisName = queueLogDir + outIntervals + ".target" this.jobName = queueLogDir + outIntervals + ".target" } case class clean (inBams: File, tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs { - @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") this.input_file :+= inBams this.targetIntervals = tIntervals this.out = outBam - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + this.known :+= qscript.dbSNP if (qscript.indels != null) - this.rodBind :+= RodBind("indels", "VCF", qscript.indels) - this.consensusDeterminationModel = consensusDeterminationModel + this.known :+= qscript.indels + this.consensusDeterminationModel = cleanModelEnum this.compress = 0 this.scatterCount = nContigs this.analysisName = queueLogDir + 
outBam + ".clean" @@ -328,7 +348,7 @@ class DataProcessingPipeline extends QScript { } case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + this.knownSites :+= qscript.dbSNP this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") this.input_file :+= inBam this.recal_file = outRecalFile @@ -368,16 +388,15 @@ class DataProcessingPipeline extends QScript { } case class dedup (inBam: File, outBam: File, metricsFile: File) extends MarkDuplicates with ExternalCommonArgs { - @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") - this.input = List(inBam) + this.input :+= inBam this.output = outBam this.metrics = metricsFile + this.memoryLimit = 16 this.analysisName = queueLogDir + outBam + ".dedup" this.jobName = queueLogDir + outBam + ".dedup" } case class joinBams (inBams: List[File], outBam: File) extends MergeSamFiles with ExternalCommonArgs { - @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") this.input = inBams this.output = outBam this.analysisName = queueLogDir + outBam + ".joinBams" @@ -385,8 +404,7 @@ class DataProcessingPipeline extends QScript { } case class sortSam (inSam: File, outBam: File, sortOrderP: SortOrder) extends SortSam with ExternalCommonArgs { - @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") - this.input = List(inSam) + this.input :+= inSam this.output = outBam this.sortOrder = sortOrderP this.analysisName = queueLogDir + outBam + ".sortSam" @@ -394,7 +412,7 @@ class DataProcessingPipeline extends QScript { } case class validate (inBam: File, outLog: File) extends ValidateSamFile with ExternalCommonArgs { - this.input = List(inBam) + this.input :+= inBam this.output = outLog this.REFERENCE_SEQUENCE = qscript.reference this.isIntermediate = false @@ -404,8 +422,7 @@ class DataProcessingPipeline extends QScript { case class 
addReadGroup (inBam: File, outBam: File, readGroup: ReadGroup) extends AddOrReplaceReadGroups with ExternalCommonArgs { - @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") - this.input = List(inBam) + this.input :+= inBam this.output = outBam this.RGID = readGroup.id this.RGCN = readGroup.cn @@ -418,6 +435,14 @@ class DataProcessingPipeline extends QScript { this.jobName = queueLogDir + outBam + ".rg" } + case class revert (inBam: File, outBam: File) extends RevertSam with ExternalCommonArgs { + this.output = outBam + this.input :+= inBam + this.analysisName = queueLogDir + outBam + "revert" + this.jobName = queueLogDir + outBam + ".revert" + + } + case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with ExternalCommonArgs { @Input(doc="bam file to be aligned") var bam = inBam @Output(doc="output sai file") var sai = outSai diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala index a961beca1..3c9a3fbcb 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala @@ -5,17 +5,6 @@ import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.gatk.phonehome.GATKRunReport - - // ToDos: - // reduce the scope of the datasets so the script is more nimble - // create gold standard BAQ'd bam files, no reason to always do it on the fly - - // Analysis to add at the end of the script: - // auto generation of the cluster plots - // spike in NA12878 to the exomes and to the lowpass, analysis of how much of her variants are being recovered compared to single sample exome or HiSeq calls - // produce Kiran's Venn plots based on comparison 
between new VCF and gold standard produced VCF - - class MethodsDevelopmentCallingPipeline extends QScript { qscript => @@ -28,15 +17,12 @@ class MethodsDevelopmentCallingPipeline extends QScript { @Argument(shortName="dataset", doc="selects the datasets to run. If not provided, all datasets will be used", required=false) var datasets: List[String] = Nil - @Argument(shortName="skipGoldStandard", doc="doesn't run the pipeline with the goldstandard VCF files for comparison", required=false) - var skipGoldStandard: Boolean = false + @Argument(shortName="runGoldStandard", doc="run the pipeline with the goldstandard VCF files for comparison", required=false) + var runGoldStandard: Boolean = false @Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false) var noBAQ: Boolean = false - @Argument(shortName="eval", doc="adds the VariantEval walker to the pipeline", required=false) - var eval: Boolean = false - @Argument(shortName="indels", doc="calls indels with the Unified Genotyper", required=false) var callIndels: Boolean = false @@ -52,8 +38,6 @@ class MethodsDevelopmentCallingPipeline extends QScript { @Argument(shortName="sample", doc="Samples to include in Variant Eval", required=false) var samples: List[String] = Nil - - class Target( val baseName: String, val reference: File, @@ -65,7 +49,9 @@ class MethodsDevelopmentCallingPipeline extends QScript { val intervals: String, val titvTarget: Double, val trancheTarget: Double, - val isLowpass: Boolean) { + val isLowpass: Boolean, + val isExome: Boolean, + val nSamples: Int) { val name = qscript.outputDir + baseName val clusterFile = new File(name + ".clusters") val rawVCF = new File(name + ".raw.vcf") @@ -89,9 +75,8 @@ class MethodsDevelopmentCallingPipeline extends QScript { val b36 = new File("/humgen/1kg/reference/human_b36_both.fasta") val b37 = new File("/humgen/1kg/reference/human_g1k_v37.fasta") val dbSNP_hg18_129 = 
"/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_hg18.rod" // Special case for NA12878 collections that can't use 132 because they are part of it. - val dbSNP_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b36.rod" - val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf" - val dbSNP_b37_129 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf" // Special case for NA12878 collections that can't use 132 because they are part of it. + val dbSNP_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132.b36.excluding_sites_after_129.vcf" + val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf" // Special case for NA12878 collections that can't use 132 because they are part of it. val hapmap_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.hg18_fwd.vcf" val hapmap_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b36_fwd.vcf" val hapmap_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" @@ -100,55 +85,61 @@ class MethodsDevelopmentCallingPipeline extends QScript { val omni_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" val indelMask_b36 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b36.bed" val indelMask_b37 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b37.bed" + val training_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/phase1.wgs.projectConsensus.v2b.recal.highQuality.vcf" + val badSites_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/phase1.wgs.projectConsensus.v2b.recal.terrible.vcf" + val projectConsensus_1000G = 
"/humgen/1kg/processing/official_release/phase1/projectConsensus/ALL.wgs.projectConsensus_v2b.20101123.snps.sites.vcf" val lowPass: Boolean = true + val exome: Boolean = true val indels: Boolean = true val queueLogDir = ".qlog/" + // BUGBUG: We no longer support b36/hg18 because several of the necessary files aren't available aligned to those references + val targetDataSets: Map[String, Target] = Map( "HiSeq" -> new Target("NA12878.HiSeq", hg18, dbSNP_hg18_129, hapmap_hg18, "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask", new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"), new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.07, 99.0, !lowPass), - "HiSeq19" -> new Target("NA12878.HiSeq19", hg19, dbSNP_b37_129, hapmap_b37, indelMask_b37, + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.14, 99.0, !lowPass, !exome, 1), + "HiSeq19" -> new Target("NA12878.HiSeq19", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.HiSeq19.filtered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass), - "GA2hg19" -> new Target("NA12878.GA2.hg19", hg19, dbSNP_b37_129, hapmap_b37, indelMask_b37, + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 2.14, 99.0, !lowPass, !exome, 1), + "GA2hg19" -> new Target("NA12878.GA2.hg19", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.GA2.WGS.bwa.cleaned.hg19.bam"), new 
File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.GA2.hg19.filtered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.14, 99.0, !lowPass, !exome, 1), "WEx" -> new Target("NA12878.WEx", hg18, dbSNP_hg18_129, hapmap_hg18, "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask", new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"), new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 2.6, 97.0, !lowPass), - "WExTrio" -> new Target("CEUTrio.WEx", hg19, dbSNP_b37_129, hapmap_b37, indelMask_b37, + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 3.3, 98.0, !lowPass, exome, 1), + "WExTrio" -> new Target("CEUTrio.WEx", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.bwa.cleaned.recal.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, 97.0, !lowPass), - "WGSTrio" -> new Target("CEUTrio.WGS", hg19, dbSNP_b37_129, hapmap_b37, indelMask_b37, + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 3.3, 98.0, !lowPass, exome, 3), + "WGSTrio" -> new Target("CEUTrio.WGS", hg19, 
dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.bwa.cleaned.recal.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass, !exome, 3), "FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"), new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 99.0, lowPass), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 99.0, lowPass, !exome, 79), "TGPWExGdA" -> new Target("1000G.WEx.GdA", b37, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/1kg/processing/pipeline_test_bams/Barcoded_1000G_WEx_Reduced_Plate_1.cleaned.list"), // BUGBUG: reduce from 60 to 20 people new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, 99.0, !lowPass), + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, 99.0, !lowPass, exome, 96), "LowPassN60" -> new Target("lowpass.N60", b36, dbSNP_b36, hapmap_b36, indelMask_b36, new 
File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/lowpass.chr20.cleaned.matefixed.bam"), // the bam list to call from new File("/home/radon01/depristo/work/oneOffProjects/VQSRCutByNRS/lowpass.N60.chr20.filtered.vcf"), // the gold standard VCF file to run through the VQSR - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, 99.0, lowPass), // chunked interval list to use with Queue's scatter/gather functionality + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, 99.0, lowPass, !exome, 60), // chunked interval list to use with Queue's scatter/gather functionality "LowPassEUR363Nov" -> new Target("EUR.nov2010", b37, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/1kg/processing/pipeline_test_bams/EUR.363sample.Nov2010.chr20.bam"), new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 99.0, lowPass) + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 99.0, lowPass, !exome, 363) ) @@ -170,9 +161,9 @@ class MethodsDevelopmentCallingPipeline extends QScript { add(new snpCall(target)) add(new VQSR(target, !goldStandard)) add(new applyVQSR(target, !goldStandard)) - if (eval) add(new snpEvaluation(target)) + add(new snpEvaluation(target)) } - if ( !skipGoldStandard ) { + if ( runGoldStandard ) { add(new VQSR(target, goldStandard)) add(new applyVQSR(target, goldStandard)) } @@ -187,22 +178,19 @@ class MethodsDevelopmentCallingPipeline extends QScript { } def bai(bam: File) = new File(bam + ".bai") - val FiltersToIgnore = List("DPFilter", "ABFilter", "ESPStandard", "QualByDepth", "StrandBias", "HomopolymerRun") // 1.) 
Unified Genotyper Base class GenotyperBase (t: Target) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS { this.memoryLimit = 3 this.reference_sequence = t.reference this.intervalsString ++= List(t.intervals) - this.scatterCount = 63 // the smallest interval list has 63 intervals, one for each Mb on chr20 + this.scatterCount = 140 + this.nt = 2 this.dcov = if ( t.isLowpass ) { 50 } else { 250 } this.stand_call_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 } this.stand_emit_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 } this.input_file :+= t.bamList - if (t.dbsnpFile.endsWith(".rod")) - this.DBSNP = new File(t.dbsnpFile) - else if (t.dbsnpFile.endsWith(".vcf")) - this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile) + this.D = new File(t.dbsnpFile) } // 1a.) Call SNPs with UG @@ -216,7 +204,6 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.baq = if (noBAQ) {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF} else {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY} this.analysisName = t.name + "_UGs" this.jobName = queueLogDir + t.name + ".snpcall" - this.A ++= List("FisherStrand") } // 1b.) 
Call Indels with UG @@ -234,15 +221,14 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.reference_sequence = t.reference this.intervalsString ++= List(t.intervals) this.scatterCount = 10 - this.filterName ++= List("HARD_TO_VALIDATE") - this.filterExpression ++= List("\"MQ0 >= 4 && (MQ0 / (1.0 * DP)) > 0.1\"") - this.variantVCF = t.rawIndelVCF + this.V = t.rawIndelVCF this.out = t.filteredIndelVCF - this.filterName ++= List("LowQual", "StrandBias", "QualByDepth", "HomopolymerRun") - if (t.isLowpass) - this.filterExpression ++= List("\"QUAL<30.0\"", "\"SB>=-1.0\"", "\"QD<1.0\"", "\"HRun>=15\"") - else - this.filterExpression ++= List("\"QUAL<50.0\"", "\"SB>=-1.0\"", "\"QD<5.0\"", "\"HRun>=15\"") + this.filterName ++= List("IndelQD", "IndelReadPosRankSum", "IndelFS") + this.filterExpression ++= List("\"QD < 2.0\"", "\"ReadPosRankSum < -20.0\"", "\"FS > 200.0\"") + if (t.nSamples >= 10) { + this.filterName ++= List("IndelInbreedingCoeff") + this.filterExpression ++= List("\"InbreedingCoeff < -0.8\"") + } this.analysisName = t.name + "_VF" this.jobName = queueLogDir + t.name + ".indelfilter" } @@ -250,70 +236,74 @@ class MethodsDevelopmentCallingPipeline extends QScript { // 3.) 
Variant Quality Score Recalibration - Generate Recalibration table class VQSR(t: Target, goldStandard: Boolean) extends VariantRecalibrator with UNIVERSAL_GATK_ARGS { this.memoryLimit = 4 + this.nt = 2 this.reference_sequence = t.reference this.intervalsString ++= List(t.intervals) - this.rodBind :+= RodBind("input", "VCF", if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } ) - this.rodBind :+= RodBind("hapmap", "VCF", t.hapmapFile, "known=false,training=true,truth=true,prior=15.0") - if( t.hapmapFile.contains("b37") ) - this.rodBind :+= RodBind("omni", "VCF", omni_b37, "known=false,training=true,truth=true,prior=12.0") - else if( t.hapmapFile.contains("b36") ) - this.rodBind :+= RodBind("omni", "VCF", omni_b36, "known=false,training=true,truth=true,prior=12.0") - if (t.dbsnpFile.endsWith(".rod")) - this.rodBind :+= RodBind("dbsnp", "DBSNP", t.dbsnpFile, "known=true,training=false,truth=false,prior=10.0") - else if (t.dbsnpFile.endsWith(".vcf")) - this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile, "known=true,training=false,truth=false,prior=10.0") - this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "HRun", "FS") + this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } ) + this.training :+= new TaggedFile( t.hapmapFile, "prior=15.0") + this.truth :+= new TaggedFile( t.hapmapFile, "prior=15.0") + this.training :+= new TaggedFile( omni_b37, "prior=12.0") + this.truth :+= new TaggedFile( omni_b37, "prior=12.0") + this.training :+= new TaggedFile( training_1000G, "prior=10.0" ) + this.known :+= new TaggedFile( t.dbsnpFile, "prior=2.0" ) + this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=8.0" ) + this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "MQ", "FS") + if(t.nSamples >= 10) { + this.use_annotation ++= List("InbreedingCoeff") + } + if(!t.isExome) { + this.use_annotation ++= List("DP") + } else { + this.mG = 6 + } this.tranches_file = if ( 
goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesFile } this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalFile } this.allPoly = true this.tranche ++= List("100.0", "99.9", "99.5", "99.3", "99.0", "98.9", "98.8", "98.5", "98.4", "98.3", "98.2", "98.1", "98.0", "97.9", "97.8", "97.5", "97.0", "95.0", "90.0") this.rscript_file = t.vqsrRscript this.analysisName = t.name + "_VQSR" - this.jobName = queueLogDir + t.name + ".VQSR" + this.jobName = queueLogDir + t.name + ".VQSR" } // 4.) Apply the recalibration table to the appropriate tranches class applyVQSR (t: Target, goldStandard: Boolean) extends ApplyRecalibration with UNIVERSAL_GATK_ARGS { - this.memoryLimit = 4 + this.memoryLimit = 6 this.reference_sequence = t.reference this.intervalsString ++= List(t.intervals) - this.rodBind :+= RodBind("input", "VCF", if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } ) + this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } ) this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesFile} this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalFile } this.ts_filter_level = t.trancheTarget this.out = t.recalibratedVCF this.analysisName = t.name + "_AVQSR" - this.jobName = queueLogDir + t.name + ".applyVQSR" + this.jobName = queueLogDir + t.name + ".applyVQSR" } // 5.) Variant Evaluation Base(OPTIONAL) class EvalBase(t: Target) extends VariantEval with UNIVERSAL_GATK_ARGS { this.memoryLimit = 3 this.reference_sequence = t.reference - this.rodBind :+= RodBind("comphapmap", "VCF", t.hapmapFile) + this.comp :+= new TaggedFile(t.hapmapFile, "hapmap" ) this.intervalsString ++= List(t.intervals) - if (t.dbsnpFile.endsWith(".rod")) - this.DBSNP = new File(t.dbsnpFile) - else if (t.dbsnpFile.endsWith(".vcf")) - this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile) + this.D = new File(t.dbsnpFile) this.sample = samples } // 5a.) 
SNP Evaluation (OPTIONAL) based on the cut vcf class snpEvaluation(t: Target) extends EvalBase(t) { - if (t.reference == b37 || t.reference == hg19) this.rodBind :+= RodBind("compomni", "VCF", omni_b37) - this.rodBind :+= RodBind("eval", "VCF", t.recalibratedVCF ) + if (t.reference == b37 || t.reference == hg19) this.comp :+= new TaggedFile( omni_b37, "omni" ) + this.eval :+= t.recalibratedVCF this.out = t.evalFile this.analysisName = t.name + "_VEs" - this.jobName = queueLogDir + t.name + ".snp.eval" + this.jobName = queueLogDir + t.name + ".snp.eval" } // 5b.) Indel Evaluation (OPTIONAL) class indelEvaluation(t: Target) extends EvalBase(t) { - this.rodBind :+= RodBind("eval", "VCF", t.filteredIndelVCF) + this.eval :+= t.filteredIndelVCF this.evalModule :+= "IndelStatistics" this.out = t.evalIndelFile this.analysisName = t.name + "_VEi" - this.jobName = queueLogDir + queueLogDir + t.name + ".indel.eval" + this.jobName = queueLogDir + queueLogDir + t.name + ".indel.eval" } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala index 05c1a1775..648f9ffef 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala @@ -41,12 +41,27 @@ class QSettings { @Argument(fullName="job_queue", shortName="jobQueue", doc="Default queue for compute farm jobs.", required=false) var jobQueue: String = _ - @Argument(fullName="job_priority", shortName="jobPriority", doc="Default priority for jobs.", required=false) + @Argument(fullName="job_priority", shortName="jobPriority", doc="Default priority for jobs. 
Min = 0, Max = 100", required=false) var jobPriority: Option[Int] = None - @Argument(fullName="default_memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes.", required=false) + @Argument(fullName="job_native_arg", shortName="jobNative", doc="Native arguments to pass to the job runner.", required=false) + var jobNativeArgs: List[String] = Nil + + @Argument(fullName="job_resource_request", shortName="jobResReq", doc="Resource requests to pass to the job runner.", required=false) + var jobResourceRequests: List[String] = Nil + + @Argument(fullName="job_environment_name", shortName="jobEnv", doc="Environment names for the job runner.", required=false) + var jobEnvironmentNames: List[String] = Nil + + @Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes.", required=false) var memoryLimit: Option[Double] = None + @Argument(fullName="resident_memory_limit", shortName="resMemLimit", doc="Default resident memory limit for jobs, in gigabytes.", required=false) + var residentLimit: Option[Double] = None + + @Argument(fullName="resident_memory_request", shortName="resMemReq", doc="Default resident memory request for jobs, in gigabytes.", required=false) + var residentRequest: Option[Double] = None + @Argument(fullName="run_directory", shortName="runDir", doc="Root directory to run functions from.", required=false) var runDirectory = new File(".") diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala index 2e3108136..2c960d8f6 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala @@ -51,10 +51,21 @@ trait CommandLineJobRunner extends JobRunner[CommandLineFunction] with Logging { /** The last time the status was updated */ protected var 
lastStatusUpdate: Long = _ - final override def status = this.lastStatus + /** The runner specific priority for a minimum priority job */ + protected val minRunnerPriority = 0 - def residentRequestMB: Option[Double] = function.memoryLimit.map(_ * 1024) - def residentLimitMB: Option[Double] = residentRequestMB.map( _ * 1.2 ) + /** The runner specific priority for a maximum priority job */ + protected val maxRunnerPriority = 0 + + /** The priority of the function in the range defined by the runner */ + protected def functionPriority = { + function.jobPriority.map { priority => + (((priority / 100D) * (maxRunnerPriority - minRunnerPriority)) + minRunnerPriority). + round.intValue() min maxRunnerPriority max minRunnerPriority + } + } + + final override def status = this.lastStatus override def init() { super.init() diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala index 30187f7e2..9aeb3a8ee 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala @@ -30,6 +30,9 @@ import org.broadinstitute.sting.queue.function.QFunction * Creates and stops JobRunners */ trait JobManager[TFunction <: QFunction, TRunner <: JobRunner[TFunction]] { + def init() {} + def exit() {} + /** The class type of the runner. Available at runtime even after erasure. */ def functionType: Class[TFunction] @@ -52,6 +55,5 @@ trait JobManager[TFunction <: QFunction, TRunner <: JobRunner[TFunction]] { * Stops a list of functions. * @param runners Runners to stop. 
*/ - def tryStop(runners: Set[TRunner]) { - } + def tryStop(runners: Set[TRunner]) {} } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala index a52e9c561..766d9db94 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala @@ -361,6 +361,13 @@ class QGraph extends Logging { settings.jobRunner = "Shell" commandLineManager = commandLinePluginManager.createByName(settings.jobRunner) + for (mgr <- managers) { + if (mgr != null) { + val manager = mgr.asInstanceOf[JobManager[QFunction,JobRunner[QFunction]]] + manager.init() + } + } + if (settings.startFromScratch) logger.info("Removing outputs from previous runs.") @@ -1034,18 +1041,26 @@ class QGraph extends Logging { for (mgr <- managers) { if (mgr != null) { val manager = mgr.asInstanceOf[JobManager[QFunction,JobRunner[QFunction]]] - val managerRunners = runners - .filter(runner => manager.runnerType.isAssignableFrom(runner.getClass)) - .asInstanceOf[Set[JobRunner[QFunction]]] - if (managerRunners.size > 0) - try { - manager.tryStop(managerRunners) - } catch { - case e => /* ignore */ + try { + val managerRunners = runners + .filter(runner => manager.runnerType.isAssignableFrom(runner.getClass)) + .asInstanceOf[Set[JobRunner[QFunction]]] + if (managerRunners.size > 0) + try { + manager.tryStop(managerRunners) + } catch { + case e => /* ignore */ + } + for (runner <- managerRunners) { + try { + runner.cleanup() + } catch { + case e => /* ignore */ + } } - for (runner <- managerRunners) { + } finally { try { - runner.cleanup() + manager.exit() } catch { case e => /* ignore */ } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobManager.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobManager.scala new file mode 100644 index 000000000..4c9cc1890 --- /dev/null +++ 
b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobManager.scala @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.queue.engine.drmaa + +import org.broadinstitute.sting.queue.function.CommandLineFunction +import org.broadinstitute.sting.queue.engine.CommandLineJobManager +import org.broadinstitute.sting.jna.drmaa.v1_0.JnaSessionFactory +import org.ggf.drmaa.Session + +/** + * Runs jobs using DRMAA + */ +class DrmaaJobManager extends CommandLineJobManager[DrmaaJobRunner] { + protected var session: Session = _ + + protected def newSession() = new JnaSessionFactory().getSession + protected def contact = null + + override def init() { + session = newSession() + session.init(contact) + } + + override def exit() { + session.exit() + } + + def runnerType = classOf[DrmaaJobRunner] + def create(function: CommandLineFunction) = new DrmaaJobRunner(session, function) + + override def updateStatus(runners: Set[DrmaaJobRunner]) = { + var updatedRunners = Set.empty[DrmaaJobRunner] + runners.foreach(runner => if (runner.updateJobStatus()) {updatedRunners += runner}) + updatedRunners + } + override def tryStop(runners: Set[DrmaaJobRunner]) { + runners.filterNot(_.jobId == null).foreach(_.tryStop()) + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala new file mode 100644 index 000000000..b48dcd2a9 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + 
* + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.engine.drmaa + +import org.broadinstitute.sting.queue.QException +import org.broadinstitute.sting.queue.util.{Logging,Retry} +import org.broadinstitute.sting.queue.function.CommandLineFunction +import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner} +import java.util.Collections +import org.ggf.drmaa._ + +/** + * Runs jobs using DRMAA. + */ +class DrmaaJobRunner(val session: Session, val function: CommandLineFunction) extends CommandLineJobRunner with Logging { + /** Job Id of the currently executing job. */ + var jobId: String = _ + override def jobIdString = jobId + + // Set the display name to < 512 characters of the description + // NOTE: Not sure if this is configuration specific? 
+ protected val jobNameLength = 500 + protected val jobNameFilter = """[^A-Za-z0-9_]""" + protected def functionNativeSpec = function.jobNativeArgs.mkString(" ") + + def start() { + session.synchronized { + val drmaaJob: JobTemplate = session.createJobTemplate + + drmaaJob.setJobName(function.description.take(jobNameLength).replaceAll(jobNameFilter, "_")) + + // Set the current working directory + drmaaJob.setWorkingDirectory(function.commandDirectory.getPath) + + // Set the output file for stdout + drmaaJob.setOutputPath(":" + function.jobOutputFile.getPath) + + // If the error file is set specify the separate output for stderr + // Otherwise join with stdout + if (function.jobErrorFile != null) { + drmaaJob.setErrorPath(":" + function.jobErrorFile.getPath) + } else { + drmaaJob.setJoinFiles(true) + } + + drmaaJob.setNativeSpecification(functionNativeSpec) + + // Instead of running the function.commandLine, run "sh " + drmaaJob.setRemoteCommand("sh") + drmaaJob.setArgs(Collections.singletonList(jobScript.toString)) + + // Allow advanced users to update the request via QFunction.updateJobRun() + updateJobRun(drmaaJob) + + updateStatus(RunnerStatus.RUNNING) + + // Start the job and store the id so it can be killed in tryStop + try { + Retry.attempt(() => { + try { + jobId = session.runJob(drmaaJob) + } catch { + case de: DrmaaException => throw new QException("Unable to submit job: " + de.getLocalizedMessage) + } + }, 1, 5, 10) + } finally { + // Prevent memory leaks + session.deleteJobTemplate(drmaaJob) + } + logger.info("Submitted job id: " + jobId) + } + } + + def updateJobStatus() = { + session.synchronized { + var returnStatus: RunnerStatus.Value = null + + try { + val jobStatus = session.getJobProgramStatus(jobId); + jobStatus match { + case Session.QUEUED_ACTIVE => returnStatus = RunnerStatus.RUNNING + case Session.DONE => + val jobInfo: JobInfo = session.wait(jobId, Session.TIMEOUT_NO_WAIT) + if ((jobInfo.hasExited && jobInfo.getExitStatus != 0) + || 
jobInfo.hasSignaled + || jobInfo.wasAborted) + returnStatus = RunnerStatus.FAILED + else + returnStatus = RunnerStatus.DONE + case Session.FAILED => returnStatus = RunnerStatus.FAILED + case Session.UNDETERMINED => logger.warn("Unable to determine status of job id " + jobId) + case _ => returnStatus = RunnerStatus.RUNNING + } + } catch { + // getJobProgramStatus will throw an exception once wait has run, as the + // job will be reaped. If the status is currently DONE or FAILED, return + // the status. + case de: DrmaaException => + if (lastStatus == RunnerStatus.DONE || lastStatus == RunnerStatus.FAILED) + returnStatus = lastStatus + else + logger.warn("Unable to determine status of job id " + jobId, de) + } + + if (returnStatus != null) { + updateStatus(returnStatus) + true + } else { + false + } + } + } + + def tryStop() { + session.synchronized { + try { + // Stop runners. SIGTERM(15) is preferred to SIGKILL(9). + // Only way to send SIGTERM is for the Sys Admin set the terminate_method + // resource of the designated queue to SIGTERM + session.control(jobId, Session.TERMINATE) + } catch { + case e => + logger.error("Unable to kill job " + jobId, e) + } + } + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala index 78bd2cc78..7299036ed 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala @@ -24,13 +24,9 @@ package org.broadinstitute.sting.queue.engine.gridengine -import org.broadinstitute.sting.queue.engine.CommandLineJobManager import org.broadinstitute.sting.queue.function.CommandLineFunction +import org.broadinstitute.sting.queue.engine.drmaa.DrmaaJobManager -class GridEngineJobManager extends CommandLineJobManager[GridEngineJobRunner] { - def runnerType = 
classOf[GridEngineJobRunner] - def create(function: CommandLineFunction) = new GridEngineJobRunner(function) - - override def updateStatus(runners: Set[GridEngineJobRunner]) = { GridEngineJobRunner.updateStatus(runners) } - override def tryStop(runners: Set[GridEngineJobRunner]) { GridEngineJobRunner.tryStop(runners) } +class GridEngineJobManager extends DrmaaJobManager { + override def create(function: CommandLineFunction) = new GridEngineJobRunner(session, function) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala index 8c639b5bb..96e3ffd95 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala @@ -24,203 +24,52 @@ package org.broadinstitute.sting.queue.engine.gridengine -import org.broadinstitute.sting.queue.QException -import org.broadinstitute.sting.queue.util.{Logging,Retry} +import org.broadinstitute.sting.queue.util.Logging import org.broadinstitute.sting.queue.function.CommandLineFunction -import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner} -import org.ggf.drmaa.{DrmaaException,JobInfo,JobTemplate,Session,SessionFactory} -import java.util.Collections +import org.broadinstitute.sting.queue.engine.drmaa.DrmaaJobRunner +import org.ggf.drmaa.Session /** * Runs jobs on a Grid Engine compute cluster. */ -class GridEngineJobRunner(val function: CommandLineFunction) extends CommandLineJobRunner with Logging { - // Run the static initializer for GridEngineJobRunner - GridEngineJobRunner - - /** Job Id of the currently executing job. 
*/ - private var jobId: String = _ - override def jobIdString = jobId - - def start() { - GridEngineJobRunner.gridEngineSession.synchronized { - val gridEngineJob: JobTemplate = GridEngineJobRunner.gridEngineSession.createJobTemplate - - // Force the remote environment to inherit local environment settings - var nativeSpecString: String = "-V" - - // Set the display name to < 512 characters of the description - // NOTE: Not sure if this is configuration specific? - gridEngineJob.setJobName(GridEngineJobRunner.toJobName(function.description.take(500))) - - // Set the output file for stdout - gridEngineJob.setOutputPath(":" + function.jobOutputFile.getPath) - - // Set the current working directory - gridEngineJob.setWorkingDirectory(function.commandDirectory.getPath) - - // If the error file is set specify the separate output for stderr - // Otherwise join with stdout - if (Option(function.jobErrorFile) != None) { - gridEngineJob.setErrorPath(":" + function.jobErrorFile.getPath) - } else { - gridEngineJob.setJoinFiles(true) - } - - // If a project name is set specify the project name - if (Option(function.jobProject) != None) { - nativeSpecString += " -P " + function.jobProject - } - - // If the job queue is set specify the job queue - if (Option(function.jobQueue) != None) { - nativeSpecString += " -q " + function.jobQueue - } - - // If the resident set size is requested pass on the memory request - if (residentRequestMB.isDefined) { - nativeSpecString += " -l mem_free=%dM".format(residentRequestMB.get.ceil.toInt) - } - - // If the resident set size limit is defined specify the memory limit - if (residentLimitMB.isDefined) { - nativeSpecString += " -l h_rss=%dM".format(residentLimitMB.get.ceil.toInt) - } - - // If the priority is set (user specified Int) specify the priority - if (function.jobPriority.isDefined) { - nativeSpecString += " -p " + function.jobPriority.get - } - - gridEngineJob.setNativeSpecification(nativeSpecString) - - // Instead of running the 
function.commandLine, run "sh " - gridEngineJob.setRemoteCommand("sh") - gridEngineJob.setArgs(Collections.singletonList(jobScript.toString)) - - // Allow advanced users to update the request via QFunction.updateJobRun() - updateJobRun(gridEngineJob) - - updateStatus(RunnerStatus.RUNNING) - - // Start the job and store the id so it can be killed in tryStop - try { - Retry.attempt(() => { - try { - jobId = GridEngineJobRunner.gridEngineSession.runJob(gridEngineJob) - } catch { - case de: DrmaaException => throw new QException("Unable to submit job: " + de.getLocalizedMessage) - } - }, 1, 5, 10) - } finally { - // Prevent memory leaks - GridEngineJobRunner.gridEngineSession.deleteJobTemplate(gridEngineJob) - } - logger.info("Submitted Grid Engine job id: " + jobId) - } - } -} - -object GridEngineJobRunner extends Logging { - private val gridEngineSession = SessionFactory.getFactory.getSession - - initGridEngine() - - /** - * Initialize the Grid Engine library. - */ - private def initGridEngine() { - gridEngineSession.synchronized { - try { - gridEngineSession.init("") - } catch { - case de: DrmaaException => - logger.error("Issue initializing Grid Engine", de) - throw new QException("init() failed", de) - } - } - } - - /** - * Updates the status of a list of jobs. - * @param runners Runners to update. - * @return runners which were updated. - */ - def updateStatus(runners: Set[GridEngineJobRunner]) = { - var updatedRunners = Set.empty[GridEngineJobRunner] - gridEngineSession.synchronized { - runners.foreach(runner => if (updateRunnerStatus(runner)) {updatedRunners += runner}) - } - updatedRunners - } - - /** - * Tries to stop any running jobs. - * @param runners Runners to stop. - */ - def tryStop(runners: Set[GridEngineJobRunner]) { - // Stop runners. SIGTERM(15) is preferred to SIGKILL(9). 
- // Only way to send SIGTERM is for the Sys Admin set the terminate_method - // resource of the designated queue to SIGTERM - gridEngineSession.synchronized { - for (runner <- runners.filterNot(runner => Option(runner.jobId) == None)) { - try { - gridEngineSession.control(runner.jobId, Session.TERMINATE) - } catch { - case e => - logger.error("Unable to kill job " + runner.jobId, e) - } - } - gridEngineSession.exit() - } - } - - private def updateRunnerStatus(runner: GridEngineJobRunner): Boolean = { - var returnStatus: RunnerStatus.Value = null - - try { - val jobStatus = gridEngineSession.getJobProgramStatus(runner.jobId); - jobStatus match { - case Session.QUEUED_ACTIVE => returnStatus = RunnerStatus.RUNNING - case Session.DONE => - val jobInfo: JobInfo = gridEngineSession.wait(runner.jobId, Session.TIMEOUT_NO_WAIT) - if ((jobInfo.hasExited && jobInfo.getExitStatus > 0) - || jobInfo.hasSignaled - || jobInfo.wasAborted) - returnStatus = RunnerStatus.FAILED - else - returnStatus = RunnerStatus.DONE - case Session.FAILED => returnStatus = RunnerStatus.FAILED - case Session.UNDETERMINED => logger.warn("Unable to determine status of Grid Engine job id " + runner.jobId) - case _ => returnStatus = RunnerStatus.RUNNING - } - } catch { - // getJobProgramStatus will throw an exception once wait has run, as the - // job will be reaped. If the status is currently DONE or FAILED, return - // the status. 
- case de: DrmaaException => - if (runner.lastStatus == RunnerStatus.DONE || runner.lastStatus == RunnerStatus.FAILED) - returnStatus = runner.lastStatus - else - logger.warn("Unable to determine status of Grid Engine job id " + runner.jobId, de) - } - - if (returnStatus != null) { - runner.updateStatus(returnStatus) - true - } else { - false - } - } - - // Reap what we've sown - override def finalize() { - gridEngineSession.exit() - } - +class GridEngineJobRunner(session: Session, function: CommandLineFunction) extends DrmaaJobRunner(session, function) with Logging { // Grid Engine disallows certain characters from being in job names. // This replaces all illegal characters with underscores - private def toJobName(name: String): String = { - name.replaceAll("""[\n\t\r/:@\\*?]""", "_") + protected override val jobNameFilter = """[\n\t\r/:@\\*?]""" + protected override val minRunnerPriority = -1023 + protected override val maxRunnerPriority = 0 + + override protected def functionNativeSpec = { + // Force the remote environment to inherit local environment settings + var nativeSpec: String = "-V" + + // If a project name is set specify the project name + if (function.jobProject != null) + nativeSpec += " -P " + function.jobProject + + // If the job queue is set specify the job queue + if (function.jobQueue != null) + nativeSpec += " -q " + function.jobQueue + + // If the resident set size is requested pass on the memory request + if (function.residentRequest.isDefined) + nativeSpec += " -l mem_free=%dM".format(function.residentRequest.map(_ * 1024).get.ceil.toInt) + + // If the resident set size limit is defined specify the memory limit + if (function.residentLimit.isDefined) + nativeSpec += " -l h_rss=%dM".format(function.residentLimit.map(_ * 1024).get.ceil.toInt) + + // Pass on any job resource requests + nativeSpec += function.jobResourceRequests.map(" -l " + _).mkString + + // Pass on any job environment names + nativeSpec += function.jobEnvironmentNames.map(" 
-pe " + _).mkString + + // If the priority is set specify the priority + val priority = functionPriority + if (priority.isDefined) + nativeSpec += " -p " + priority.get + + (nativeSpec + " " + super.functionNativeSpec).trim() } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala index 46dd08332..bb711344c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala @@ -34,6 +34,8 @@ import org.broadinstitute.sting.jna.lsf.v7_0_6.LibBat.{submitReply, submit} import com.sun.jna.ptr.IntByReference import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner} import com.sun.jna.{Structure, StringArray, NativeLong} +import java.util.regex.Pattern +import java.lang.StringBuffer /** * Runs jobs on an LSF compute cluster. @@ -47,12 +49,22 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR private var jobId = -1L override def jobIdString = jobId.toString + protected override val minRunnerPriority = 1 + protected override val maxRunnerPriority = Lsf706JobRunner.maxUserPriority + + private val selectString = new StringBuffer() + private val usageString = new StringBuffer() + private val requestString = new StringBuffer() + /** * Dispatches the function on the LSF cluster. * @param function Command to run. 
*/ def start() { Lsf706JobRunner.lsfLibLock.synchronized { + + parseResourceRequest() + val request = new submit for (i <- 0 until LibLsf.LSF_RLIM_NLIMITS) request.rLimits(i) = LibLsf.DEFAULT_RLIMIT; @@ -81,28 +93,45 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR } // If the resident set size is requested pass on the memory request - if (residentRequestMB.isDefined) { - val memInUnits = Lsf706JobRunner.convertUnits(residentRequestMB.get) - request.resReq = "select[mem>%1$d] rusage[mem=%1$d]".format(memInUnits) + if (function.residentRequest.isDefined) { + val memInUnits = Lsf706JobRunner.convertUnits(function.residentRequest.get) + appendRequest("select", selectString, "&&", "mem>%d".format(memInUnits)) + appendRequest("rusage", usageString, ",", "mem=%d".format(memInUnits)) + } + + val resReq = getResourceRequest + if (resReq.length > 0) { + request.resReq = resReq request.options |= LibBat.SUB_RES_REQ } // If the resident set size limit is defined specify the memory limit - if (residentLimitMB.isDefined) { - val memInUnits = Lsf706JobRunner.convertUnits(residentLimitMB.get) + if (function.residentLimit.isDefined) { + val memInUnits = Lsf706JobRunner.convertUnits(function.residentLimit.get) request.rLimits(LibLsf.LSF_RLIMIT_RSS) = memInUnits } // If the priority is set (user specified Int) specify the priority - if (function.jobPriority.isDefined) { - request.userPriority = function.jobPriority.get + val priority = functionPriority + if (priority.isDefined) { + request.userPriority = priority.get request.options2 |= LibBat.SUB2_JOB_PRIORITY } - // Broad specific requirement, our esub requires there be a project - // else it will spit out a warning to stdout. 
see $LSF_SERVERDIR/esub - request.projectName = if (function.jobProject != null) function.jobProject else "Queue" - request.options |= LibBat.SUB_PROJECT_NAME + // Set the project to either the function or LSF default + val project = if (function.jobProject != null) function.jobProject else Lsf706JobRunner.defaultProject + if (project != null) { + request.projectName = project + request.options |= LibBat.SUB_PROJECT_NAME + } + + // Set the esub names based on the job envorinment names + if (!function.jobEnvironmentNames.isEmpty) { + val argv = Array("", "-a", function.jobEnvironmentNames.mkString(" ")) + val setOptionResult = LibBat.setOption_(argv.length, new StringArray(argv), "a:", request, ~0, ~0, ~0, null); + if (setOptionResult == -1) + throw new QException("setOption_() returned -1 while setting esub"); + } // LSF specific: get the max runtime for the jobQueue and pass it for this job request.rLimits(LibLsf.LSF_RLIMIT_RUN) = Lsf706JobRunner.getRlimitRun(function.jobQueue) @@ -132,6 +161,41 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR logger.debug("Job Id %s status / exitStatus / exitInfo: ??? / ??? 
/ ???".format(jobId)) super.checkUnknownStatus() } + + private def parseResourceRequest() { + requestString.setLength(0) + selectString.setLength(0) + usageString.setLength(0) + + requestString.append(function.jobResourceRequests.mkString(" ")) + extractSection(requestString, "select", selectString) + extractSection(requestString, "rusage", usageString) + } + + private def extractSection(requestString: StringBuffer, section: String, sectionString: StringBuffer) { + val pattern = Pattern.compile(section + "\\s*\\[[^\\]]+\\]\\s*"); + val matcher = pattern.matcher(requestString.toString) + if (matcher.find()) { + sectionString.setLength(0) + sectionString.append(matcher.group().trim()) + + val sb = new StringBuffer + matcher.appendReplacement(sb, "") + matcher.appendTail(sb) + + requestString.setLength(0) + requestString.append(sb) + } + } + + private def appendRequest(section: String, sectionString: StringBuffer, separator: String, request: String) { + if (sectionString.length() == 0) + sectionString.append(section).append("[").append(request).append("]") + else + sectionString.insert(sectionString.length() - 1, separator + request) + } + + private def getResourceRequest = "%s %s %s".format(selectString, usageString, requestString).trim() } object Lsf706JobRunner extends Logging { @@ -141,15 +205,23 @@ object Lsf706JobRunner extends Logging { /** Number of seconds for a non-normal exit status before we give up on expecting LSF to retry the function. */ private val retryExpiredSeconds = 5 * 60 - initLsf() - /** * Initialize the Lsf library. 
*/ - private def initLsf() { + private val (defaultQueue, defaultProject, maxUserPriority) = { lsfLibLock.synchronized { if (LibBat.lsb_init("Queue") < 0) throw new QException(LibBat.lsb_sperror("lsb_init() failed")) + + val parameterInfo = LibBat.lsb_parameterinfo(null, null, 0); + var defaultQueue: String = parameterInfo.defaultQueues + val defaultProject = parameterInfo.defaultProject + val maxUserPriority = parameterInfo.maxUserPriority + + if (defaultQueue != null && defaultQueue.indexOf(' ') > 0) + defaultQueue = defaultQueue.split(" ")(0) + + (defaultQueue, defaultProject, maxUserPriority) } } @@ -249,17 +321,6 @@ object Lsf706JobRunner extends Logging { } } - /** The name of the default queue. */ - private lazy val defaultQueue: String = { - lsfLibLock.synchronized { - val numQueues = new IntByReference(1) - val queueInfo = LibBat.lsb_queueinfo(null, numQueues, null, null, 0) - if (queueInfo == null) - throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for the default queue")) - queueInfo.queue - } - } - /** The run limits for each queue. 
*/ private var queueRlimitRun = Map.empty[String,Int] @@ -299,15 +360,15 @@ object Lsf706JobRunner extends Logging { Structure.autoRead(unitsParam.asInstanceOf[Array[Structure]]) unitsParam(0).paramValue match { - case "MB" => 1D - case "GB" => 1024D - case "TB" => 1024D * 1024 - case "PB" => 1024D * 1024 * 1024 - case "EB" => 1024D * 1024 * 1024 * 1024 - case null => 1D + case "MB" => 1 / 1024D + case "GB" => 1D + case "TB" => 1024D + case "PB" => 1024D * 1024 + case "EB" => 1024D * 1024 * 1024 + case null => 1 / 1024D } } } - private def convertUnits(mb: Double) = (mb / unitDivisor).ceil.toInt + private def convertUnits(gb: Double) = (gb / unitDivisor).ceil.toInt } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala index 2508f5776..5456ed02c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala @@ -15,13 +15,13 @@ class AddOrReplaceReadGroups extends org.broadinstitute.sting.queue.function.Jav javaMainClass = "net.sf.picard.sam.AddOrReplaceReadGroups" @Input(doc="The input SAM or BAM files to analyze. 
Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) - var input: List[File] = _ + var input: List[File] = Nil @Output(doc="The output BAM file with the modified/added read groups", shortName = "output", fullName = "output_bam_file", required = true) var output: File = _ @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false) - var outputIndex: File = new File(output + ".bai") + var outputIndex: File = _ @Argument(doc="Read group ID", shortName = "id", fullName = "read_group_id", required = true) var RGID: String = _ @@ -44,6 +44,12 @@ class AddOrReplaceReadGroups extends org.broadinstitute.sting.queue.function.Jav @Argument(doc = "Read group description", shortName = "ds", fullName = "read_group_description", required = false) var RGDS: String = "" + override def freezeFieldValues() { + super.freezeFieldValues() + if (outputIndex == null && output != null) + outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") + } + override def inputBams = input override def outputBam = output diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala index 6f006ffad..d44d5e004 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala @@ -15,13 +15,13 @@ class MarkDuplicates extends org.broadinstitute.sting.queue.function.JavaCommand javaMainClass = "net.sf.picard.sam.MarkDuplicates" @Input(doc="The input SAM or BAM files to analyze. 
Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) - var input: List[File] = _ + var input: List[File] = Nil @Output(doc="The output file to write marked records to", shortName = "output", fullName = "output_bam_file", required = true) var output: File = _ @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false) - var outputIndex: File = new File(output + ".bai") + var outputIndex: File = _ @Output(doc="File to write duplication metrics to", shortName = "out_metrics", fullName = "output_metrics_file", required = false) var metrics: File = new File(output + ".metrics") @@ -35,6 +35,13 @@ class MarkDuplicates extends org.broadinstitute.sting.queue.function.JavaCommand @Argument(doc = "This number, plus the maximum RAM available to the JVM, determine the memory footprint used by some of the sorting collections. If you are running out of memory, try reducing this number.", shortName = "sorting_ratio", fullName = "sorting_collection_size_ratio", required = false) var SORTING_COLLECTION_SIZE_RATIO: Double = -1 + override def freezeFieldValues() { + super.freezeFieldValues() + if (outputIndex == null && output != null) + outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") + } + + override def inputBams = input override def outputBam = output this.sortOrder = null diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala index a7e74e1b5..fd107890e 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala @@ -3,6 +3,7 @@ package org.broadinstitute.sting.queue.extensions.picard import org.broadinstitute.sting.commandline._ import java.io.File +import org.broadinstitute.sting.queue.QScript._ /* * 
Created by IntelliJ IDEA. @@ -15,13 +16,13 @@ class MergeSamFiles extends org.broadinstitute.sting.queue.function.JavaCommandL javaMainClass = "net.sf.picard.sam.MergeSamFiles" @Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) - var input: List[File] = _ + var input: List[File] = Nil @Output(doc="The output merged BAM file", shortName = "output", fullName = "output_bam_file", required = true) var output: File = _ @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false) - var outputIndex: File = new File(output + ".bai") + var outputIndex: File = _ @Argument(doc="Merge the seqeunce dictionaries Default value: false. This option can be set to 'null' to clear the default value.", shortName = "merge_dict", fullName = "merge_sequence_dictionaries", required = false) var MERGE_SEQUENCE_DICTIONARIES: Boolean = false @@ -32,6 +33,13 @@ class MergeSamFiles extends org.broadinstitute.sting.queue.function.JavaCommandL @Argument(doc = "Comments to include in the merged output file's header.", shortName = "com", fullName = "comments", required = false) var COMMENT: String = "" + override def freezeFieldValues() { + super.freezeFieldValues() + if (outputIndex == null && output != null) + outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") + } + + override def inputBams = input override def outputBam = output this.createIndex = Some(true) diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala new file mode 100644 index 000000000..746ce609e --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala @@ -0,0 +1,61 @@ +package org.broadinstitute.sting.queue.extensions.picard + +import org.broadinstitute.sting.commandline._ + +import java.io.File + +/* 
+ * Created by IntelliJ IDEA. + * User: carneiro + * Date: 6/22/11 + * Time: 10:35 AM + */ +class RevertSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction { + analysisName = "RevertSam" + javaMainClass = "net.sf.picard.sam.RevertSam" + + @Input(shortName = "input", fullName = "input_bam_files", required = true, doc = "The input SAM or BAM files to revert.") + var input: List[File] = Nil + + @Output(shortName = "output", fullName = "output_bam_file", required = true, doc = "The reverted BAM or SAM output file.") + var output: File = _ + + @Output(shortName = "out_index", fullName = "output_bam_index_file", required = false, doc = "The output bam index") + var outputIndex: File = _ + + @Argument(shortName = "roq", fullName = "restore_original_qualities", required = false, doc = "True to restore original qualities from the OQ field to the QUAL field if available.") + var restoreOriginalQualities: Boolean = true + + @Argument(shortName = "rdi", fullName = "remove_duplicate_information", required = false, doc = "Remove duplicate read flags from all reads. Note that if this is true and REMOVE_ALIGNMENT_INFORMATION==false, the output may have the unusual but sometimes desirable trait of having unmapped reads that are marked as duplicates.") + var removeDuplicateInformation: Boolean = true + + @Argument(shortName = "rai", fullName = "remove_alignment_information", required = false, doc = "Remove all alignment information from the file.") + var removeAlignmentInformation: Boolean = true + + @Argument(shortName = "atc", fullName = "attributes_to_clear", required = false, doc = "When removing alignment information, the set of optional tags to remove.") + var attributesToClear: List[String] = Nil + + @Argument(shortName = "sa", fullName = "sample_alias", required = false, doc = "The sample alias to use in the reverted output file. 
This will override the existing sample alias in the file and is used only if all the read groups in the input file have the same sample alias.") + var sampleAlias: String = null + + @Argument(shortName = "ln", fullName = "library_name", required = false, doc = "The library name to use in the reverted output file. This will override the existing library name in the file and is used only if all the read groups in the input file have the same library name.") + var libraryName: String = null + + override def freezeFieldValues() { + super.freezeFieldValues() + if (outputIndex == null && output != null) + outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") + } + + + override def inputBams = input + override def outputBam = output + this.createIndex = Some(true) + override def commandLine = super.commandLine + + conditionalParameter(!restoreOriginalQualities, " RESTORE_ORIGINAL_QUALITIES=false") + + conditionalParameter(!removeDuplicateInformation, " REMOVE_DUPLICATE_INFORMATION=false") + + conditionalParameter(!removeAlignmentInformation, " REMOVE_ALIGNMENT_INFORMATION=false") + + conditionalParameter(!attributesToClear.isEmpty, repeat(" ATTRIBUTE_TO_CLEAR=", attributesToClear)) + + conditionalParameter(sampleAlias != null, " SAMPLE_ALIAS=" + sampleAlias) + + conditionalParameter(libraryName != null, " LIBRARY_NAME=" + libraryName) +} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala index cc26f7471..a56093be8 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala @@ -3,6 +3,7 @@ package org.broadinstitute.sting.queue.extensions.picard import org.broadinstitute.sting.commandline._ import java.io.File +import org.broadinstitute.sting.queue.QScript._ /* * Created by IntelliJ IDEA. 
@@ -15,13 +16,21 @@ class SortSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFun javaMainClass = "net.sf.picard.sam.SortSam" @Input(doc="The input SAM or BAM files to sort.", shortName = "input", fullName = "input_bam_files", required = true) - var input: List[File] = _ + var input: List[File] = Nil @Output(doc="The sorted BAM or SAM output file.", shortName = "output", fullName = "output_bam_file", required = true) var output: File = _ @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false) - var outputIndex: File = new File(output + ".bai") + var outputIndex: File = _ + + override def freezeFieldValues() { + super.freezeFieldValues() + if (outputIndex == null && output != null) + outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") + } + + override def inputBams = input override def outputBam = output diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala index 726682b89..2c8fbc6d9 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala @@ -17,7 +17,7 @@ class ValidateSamFile extends org.broadinstitute.sting.queue.function.JavaComman javaMainClass = "net.sf.picard.sam.ValidateSamFile" @Input(doc="The input SAM or BAM files to analyze. 
Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) - var input: List[File] = _ + var input: List[File] = Nil @Output(doc="Send output to a file instead of stdout", shortName = "output", fullName = "output_file", required = false) var output: File = _ @@ -26,7 +26,7 @@ class ValidateSamFile extends org.broadinstitute.sting.queue.function.JavaComman var MODE: Mode = Mode.VERBOSE @Argument(doc="List of validation error types to ignore.", shortName = "ignore", fullName = "ignore_error_types", required = false) - var IGNORE: List[String] = _ + var IGNORE: List[String] = Nil @Argument(doc = "The maximum number of lines output in verbose mode.", shortName = "max", fullName = "max_output", required = false) var MAX_OUTPUT: Int = 100 diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala index c62fdcd7c..ff77503ac 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala @@ -11,12 +11,27 @@ trait CommandLineFunction extends QFunction with Logging { /** Upper memory limit */ var memoryLimit: Option[Double] = None + /** Resident memory limit */ + var residentLimit: Option[Double] = None + + /** Resident memory request */ + var residentRequest: Option[Double] = None + /** Job project to run the command */ var jobProject: String = _ /** Job queue to run the command */ var jobQueue: String = _ + /** Native arguments to pass to the job runner */ + var jobNativeArgs: List[String] = Nil + + /** Native arguments to pass to the job runner */ + var jobResourceRequests: List[String] = Nil + + /** Environment names to pass to the job runner */ + var jobEnvironmentNames: List[String] = Nil + override def copySettingsTo(function: QFunction) { super.copySettingsTo(function) function match { @@ 
-24,13 +39,27 @@ trait CommandLineFunction extends QFunction with Logging { if (commandLineFunction.memoryLimit.isEmpty) commandLineFunction.memoryLimit = this.memoryLimit + if (commandLineFunction.residentLimit.isEmpty) + commandLineFunction.residentLimit = this.residentLimit + + if (commandLineFunction.residentRequest.isEmpty) + commandLineFunction.residentRequest = this.residentRequest + if (commandLineFunction.jobProject == null) commandLineFunction.jobProject = this.jobProject if (commandLineFunction.jobQueue == null) commandLineFunction.jobQueue = this.jobQueue - commandLineFunction.jobQueue = this.jobQueue + if (commandLineFunction.jobNativeArgs.isEmpty) + commandLineFunction.jobNativeArgs = this.jobNativeArgs + + if (commandLineFunction.jobResourceRequests.isEmpty) + commandLineFunction.jobResourceRequests = this.jobResourceRequests + + if (commandLineFunction.jobEnvironmentNames.isEmpty) + commandLineFunction.jobEnvironmentNames = this.jobEnvironmentNames + case _ => /* ignore */ } } @@ -53,9 +82,30 @@ trait CommandLineFunction extends QFunction with Logging { if (jobProject == null) jobProject = qSettings.jobProject + if (jobNativeArgs.isEmpty) + jobNativeArgs = qSettings.jobNativeArgs + + if (jobResourceRequests.isEmpty) + jobResourceRequests = qSettings.jobResourceRequests + + if (jobEnvironmentNames.isEmpty) + jobEnvironmentNames = qSettings.jobEnvironmentNames + if (memoryLimit.isEmpty) memoryLimit = qSettings.memoryLimit + if (residentLimit.isEmpty) + residentLimit = qSettings.residentLimit + + if (residentRequest.isEmpty) + residentRequest = qSettings.residentRequest + + if (residentRequest.isEmpty) + residentRequest = memoryLimit + + if (residentLimit.isEmpty) + residentLimit = residentRequest.map( _ * 1.2 ) + super.freezeFieldValues() } diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala index 27ac559c5..5de474340 100644 --- 
a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala @@ -43,13 +43,15 @@ object PipelineTest extends BaseTest with Logging { private val validationReportsDataLocation = "/humgen/gsa-hpprojects/GATK/validationreports/submitted/" - val run = System.getProperty("pipeline.run") == "run" + final val run = System.getProperty("pipeline.run") == "run" - private val jobRunners = { + final val allJobRunners = { val commandLinePluginManager = new CommandLinePluginManager - commandLinePluginManager.getPlugins.map(commandLinePluginManager.getName(_)).filterNot(_ == "Shell") + commandLinePluginManager.getPlugins.map(commandLinePluginManager.getName(_)).toList } + final val defaultJobRunners = List("Lsf706", "GridEngine") + /** * Returns the top level output path to this test. * @param testName The name of the test passed to PipelineTest.executeTest() @@ -79,9 +81,12 @@ object PipelineTest extends BaseTest with Logging { * @param pipelineTest test to run. */ def executeTest(pipelineTest: PipelineTestSpec) { + var jobRunners = pipelineTest.jobRunners + if (jobRunners == null) + jobRunners = defaultJobRunners; jobRunners.foreach(executeTest(pipelineTest, _)) } - + /** * Runs the pipelineTest. * @param pipelineTest test to run. 
diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala index f26689383..a7b3f3a47 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala @@ -1,7 +1,5 @@ package org.broadinstitute.sting.queue.pipeline -import java.io.File - class PipelineTestSpec(var name: String = null) { /** The arguments to pass to the Queue test, ex: "-S scala/qscript/examples/HelloWorld.scala" */ @@ -10,6 +8,9 @@ class PipelineTestSpec(var name: String = null) { /** Job Queue to run the test. Default is null which means use hour. */ var jobQueue: String = _ + /** Job runners to run the test. Default is null which means use the default. */ + var jobRunners: List[String] = _ + /** Expected MD5 results for each file path. */ var fileMD5s = Map.empty[String, String] diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala index 7c76823da..f320cb3a6 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala @@ -33,6 +33,7 @@ class HelloWorldPipelineTest { val spec = new PipelineTestSpec spec.name = "HelloWorld" spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + spec.jobRunners = PipelineTest.allJobRunners PipelineTest.executeTest(spec) } @@ -40,23 +41,89 @@ class HelloWorldPipelineTest { def testHelloWorldWithPrefix() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithPrefix" - spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -jobPrefix 
HelloWorld" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobPrefix HelloWorld" + spec.jobRunners = PipelineTest.allJobRunners PipelineTest.executeTest(spec) } @Test def testHelloWorldWithMemoryLimit() { val spec = new PipelineTestSpec - spec.name = "HelloWorldWithPrefix" - spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -memLimit 1.25" + spec.name = "HelloWorldMemoryLimit" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -memLimit 1.25" + spec.jobRunners = PipelineTest.allJobRunners PipelineTest.executeTest(spec) } - @Test(enabled=false) + @Test def testHelloWorldWithPriority() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithPriority" - spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -jobPriority 100" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobPriority 100" + spec.jobRunners = PipelineTest.allJobRunners + PipelineTest.executeTest(spec) + } + + @Test + def testHelloWorldWithLsfResource() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithLsfResource" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobResReq rusage[iodine_io=1] -jobResReq select[swp>0] -jobResReq order[swp]" + spec.jobRunners = List("Lsf706") + PipelineTest.executeTest(spec) + } + + @Test + def testHelloWorldWithLsfResourceAndMemoryLimit() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithLsfResourceAndMemoryLimit" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -memLimit 1.25 -jobResReq rusage[iodine_io=1] -jobResReq select[swp>0] -jobResReq order[swp]" + spec.jobRunners = List("Lsf706") + PipelineTest.executeTest(spec) + } 
+ + @Test + def testHelloWorldWithLsfEnvironment() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithLsfEnvironment" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobEnv tv" + spec.jobRunners = List("Lsf706") + PipelineTest.executeTest(spec) + } + + @Test + def testHelloWorldWithGridEngineResource() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithGridEngineResource" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobResReq s_core=1000M" + spec.jobRunners = List("GridEngine") + PipelineTest.executeTest(spec) + } + + @Test + def testHelloWorldWithGridEngineResourceAndMemoryLimit() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithGridEngineResourceAndMemoryLimit" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -memLimit 1.25 -jobResReq s_core=1000M" + spec.jobRunners = List("GridEngine") + PipelineTest.executeTest(spec) + } + + @Test + def testHelloWorldWithGridEngineEnvironment() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithGridEngineEnvironment" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobEnv \"make 1\"" + spec.jobRunners = List("GridEngine") PipelineTest.executeTest(spec) } } diff --git a/public/testdata/HiSeq.10000.vcf.gz b/public/testdata/HiSeq.10000.vcf.gz new file mode 100644 index 000000000..15e91010c Binary files /dev/null and b/public/testdata/HiSeq.10000.vcf.gz differ diff --git a/settings/helpTemplates/generic.template.html b/settings/helpTemplates/generic.template.html index c11200121..7fc8cd7bd 100644 --- a/settings/helpTemplates/generic.template.html +++ b/settings/helpTemplates/generic.template.html @@ -20,16 +20,21 @@ <#macro argumentDetails arg>

    ${arg.name}<#if arg.synonyms??> / ${arg.synonyms} (<#if arg.attributes??>${arg.attributes} ${arg.type}<#if arg.defaultValue??> with default value ${arg.defaultValue})

    - ${arg.summary}. ${arg.fulltext}
    +

    + ${arg.summary}. ${arg.fulltext} + <#if arg.rodTypes??>${arg.name} binds reference ordered data. This argument supports ROD files of the + following types: ${arg.rodTypes} <#if arg.options??> -

    The ${arg.name} argument is an enumerated type (${arg.type}), which can have one of the following values:

    +
    + The ${arg.name} argument is an enumerated type (${arg.type}), which can have one of the following values:
    <#list arg.options as option> -
    ${option.name} -
    ${option.summary} +
    ${option.name}
    +
    ${option.summary}
    +

    <#macro relatedByType name type> @@ -77,6 +82,7 @@ <@argumentlist name="Required" myargs=arguments.required/> <@argumentlist name="Optional" myargs=arguments.optional/> + <@argumentlist name="Advanced" myargs=arguments.advanced/> <@argumentlist name="Hidden" myargs=arguments.hidden/> <@argumentlist name="Depreciated" myargs=arguments.depreciated/> @@ -98,7 +104,7 @@ <#-- This class is related to other documented classes via sub/super relationships --> - <#if relatedDocs?size != 0> + <#if relatedDocs?? && relatedDocs?size != 0>

    Related capabilities

    <@relatedByType name="Superclasses" type="superclass"/> diff --git a/settings/helpTemplates/style.css b/settings/helpTemplates/style.css index 1d7bcc576..297cd49ef 100644 --- a/settings/helpTemplates/style.css +++ b/settings/helpTemplates/style.css @@ -42,6 +42,10 @@ p.version text-align: center; } +p.args +{ + margin-left: 3em; +} h1, h2, h3, h4 { @@ -82,19 +86,33 @@ hr */ dl { - border: 1px solid #ccc; + margin-left: 3em; } -dt { +dl.enum { + margin-left: 3em; + border: 1px dashed #ccc; +} + +dt, dt.enum { font-weight: bold; text-decoration: underline; } -dd { - margin: 0; +/* +dt, dd.enum { padding: 0 0 0.5em 0; } +*/ +pre { + border: thin solid lightgray; + margin-left: 1em; + margin-right: 4em; +/* + background-color: #e0fdff; +*/ +} /* * clean table layouts */ @@ -128,6 +146,48 @@ dd { } th#row-divider +{ + font-weight: bolder; + font-size: larger; +} + + +/* + * Table design for input/ouptut description + */ + +#description-table +{ + font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; + font-size: 12px; + background: #fff; + margin: 5px; + border-collapse: collapse; + text-align: left; +} +#description-table th +{ + font-size: 16px; + font-weight: bold; + background-color: lightgray; + color: #039; + text-align: center; + padding: 10px 8px; + border-bottom: 2px solid #6678b1; +} +#description-table td +{ + border-bottom: 1px solid #ccc; + color: #669; + padding: 6px 8px; + text-align: right; +} +#description-table tbody tr:hover td +{ + color: #009; +} + +th#row-divider { font-weight: bolder; font-size: larger;